rbd: remove snapshot existence validation code
drivers/block/rbd.c (linux-block.git)
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/striper.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Increment the given counter and return its updated value.
 * If the counter is already 0 it will not be incremented.
 * If the counter is already at its maximum value returns
 * -EINVAL without updating it.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
	unsigned int counter;

	counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
	if (counter <= (unsigned int)INT_MAX)
		return (int)counter;

	atomic_dec(v);

	return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
	int counter;

	counter = atomic_dec_return(v);
	if (counter >= 0)
		return counter;

	atomic_inc(v);

	return -EINVAL;
}
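
/*
 * Illustrative sketch (not part of the original driver): these two
 * helpers implement a saturating refcount in which 0 is a terminal
 * state.  A get/put pair built on them looks roughly like
 *
 *	if (atomic_inc_return_safe(&ref) <= 0)	// 0 stays 0; overflow fails
 *		return false;			// no reference taken
 *	...
 *	if (atomic_dec_return_safe(&ref) < 0)	// underflow is reported
 *		warn();
 *
 * rbd_dev_parent_get()/rbd_dev_parent_put() below follow this pattern
 * for rbd_dev->parent_ref.
 */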

#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN \
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
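/*
 * NAME_MAX is 255 and the "snap_" prefix is 5 bytes, so snapshot
 * names are effectively capped at 250 bytes here.
 */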

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_OBJECT_MAP		(1ULL<<3)
#define RBD_FEATURE_FAST_DIFF		(1ULL<<4)
#define RBD_FEATURE_DEEP_FLATTEN	(1ULL<<5)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)

#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_OBJECT_MAP |	\
				 RBD_FEATURE_FAST_DIFF |	\
				 RBD_FEATURE_DEEP_FLATTEN |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
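
/*
 * Usage sketch (assumed from later code in this file): when an image
 * is probed, its feature mask is checked roughly as
 *
 *	if (features & ~RBD_FEATURES_SUPPORTED)
 *		return -ENXIO;	// image uses unsupported features
 *
 * so an image with any feature bit unknown to this client is refused.
 */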

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These six fields never change for a given rbd image */
	char *object_prefix;
	__u8 obj_order;
	u64 stripe_unit;
	u64 stripe_count;
	s64 data_pool_id;
	u64 features;		/* Might be changeable someday? */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* format 1 only */
	u64 *snap_sizes;	/* format 1 only */
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	const char	*pool_name;
	const char	*pool_ns;	/* NULL if default, never "" */

	const char	*image_id;
	const char	*image_name;

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;
};
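
/*
 * Illustration (hypothetical values): mapping "rbd/ns1/foo@snap1"
 * yields a spec with pool_name "rbd", pool_ns "ns1", image_name "foo"
 * and snap_name "snap1"; the corresponding pool_id, image_id and
 * snap_id are then looked up.
 */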

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct pending_result {
	int			result;		/* first nonzero result */
	int			num_pending;
};

struct rbd_img_request;

enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
};

enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
	OBJ_OP_ZEROOUT,
};

#define RBD_OBJ_FLAG_DELETION			(1U << 0)
#define RBD_OBJ_FLAG_COPYUP_ENABLED		(1U << 1)
#define RBD_OBJ_FLAG_COPYUP_ZEROS		(1U << 2)
#define RBD_OBJ_FLAG_MAY_EXIST			(1U << 3)
#define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT	(1U << 4)

enum rbd_obj_read_state {
	RBD_OBJ_READ_START = 1,
	RBD_OBJ_READ_OBJECT,
	RBD_OBJ_READ_PARENT,
};

/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *            . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . .
 *            .                 |                                    .
 *            .                 v                                    .
 *            .    RBD_OBJ_WRITE_READ_FROM_PARENT. . .               .
 *            .                 |                    .               .
 *            .                 v                    v (deep-copyup  .
 *    (image  .   RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC   .  not needed)  .
 *    flattened) v              |                    .               .
 *            .  v              v                    .               .
 *            . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . .      (copyup  .
 *                              |                        not needed) v
 *                              v                                    .
 *                            done . . . . . . . . . . . . . . . . . .
 *                              ^
 *                              |
 *                     RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * assert_exists guard is needed or not (in some cases it's not needed
 * even if there is a parent).
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_START = 1,
	RBD_OBJ_WRITE_PRE_OBJECT_MAP,
	RBD_OBJ_WRITE_OBJECT,
	__RBD_OBJ_WRITE_COPYUP,
	RBD_OBJ_WRITE_COPYUP,
	RBD_OBJ_WRITE_POST_OBJECT_MAP,
};

enum rbd_obj_copyup_state {
	RBD_OBJ_COPYUP_START = 1,
	RBD_OBJ_COPYUP_READ_PARENT,
	__RBD_OBJ_COPYUP_OBJECT_MAPS,
	RBD_OBJ_COPYUP_OBJECT_MAPS,
	__RBD_OBJ_COPYUP_WRITE_OBJECT,
	RBD_OBJ_COPYUP_WRITE_OBJECT,
};

struct rbd_obj_request {
	struct ceph_object_extent ex;
	unsigned int		flags;	/* RBD_OBJ_FLAG_* */
	union {
		enum rbd_obj_read_state	 read_state;	/* for reads */
		enum rbd_obj_write_state write_state;	/* for writes */
	};

	struct rbd_img_request	*img_request;
	struct ceph_file_extent	*img_extents;
	u32			num_img_extents;

	union {
		struct ceph_bio_iter	bio_pos;
		struct {
			struct ceph_bvec_iter	bvec_pos;
			u32			bvec_count;
			u32			bvec_idx;
		};
	};

	enum rbd_obj_copyup_state copyup_state;
	struct bio_vec		*copyup_bvecs;
	u32			copyup_bvec_count;

	struct list_head	osd_reqs;	/* w/ r_private_item */

	struct mutex		state_mutex;
	struct pending_result	pending;
	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

enum rbd_img_state {
	RBD_IMG_START = 1,
	RBD_IMG_EXCLUSIVE_LOCK,
	__RBD_IMG_OBJECT_REQUESTS,
	RBD_IMG_OBJECT_REQUESTS,
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	enum obj_operation_type	op_type;
	enum obj_request_type	data_type;
	unsigned long		flags;
	enum rbd_img_state	state;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};

	struct list_head	lock_item;
	struct list_head	object_extents;	/* obj_req.ex structs */

	struct mutex		state_mutex;
	struct pending_result	pending;
	struct work_struct	work;
	int			work_result;
	struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)

enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};

enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};

/* WatchNotify::ClientId */
struct rbd_client_id {
	u64 gid;
	u64 handle;
};

struct rbd_mapping {
	u64			size;
	u64			features;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
	char			*config_info;	/* add{,_single_major} string */

	struct ceph_object_id	header_oid;
	struct ceph_object_locator header_oloc;

	struct ceph_file_layout	layout;		/* used for all rbd requests */

	struct mutex		watch_mutex;
	enum rbd_watch_state	watch_state;
	struct ceph_osd_linger_request *watch_handle;
	u64			watch_cookie;
	struct delayed_work	watch_dwork;

	struct rw_semaphore	lock_rwsem;
	enum rbd_lock_state	lock_state;
	char			lock_cookie[32];
	struct rbd_client_id	owner_cid;
	struct work_struct	acquired_lock_work;
	struct work_struct	released_lock_work;
	struct delayed_work	lock_dwork;
	struct work_struct	unlock_work;
	spinlock_t		lock_lists_lock;
	struct list_head	acquiring_list;
	struct list_head	running_list;
	struct completion	acquire_wait;
	int			acquire_err;
	struct completion	releasing_wait;

	spinlock_t		object_map_lock;
	u8			*object_map;
	u64			object_map_size;	/* in objects */
	u64			object_map_flags;

	struct workqueue_struct	*task_wq;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* Block layer tags. */
	struct blk_mq_tag_set	tag_set;

	/* protects updating the header */
	struct rw_semaphore	header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* rbd_dev_device_setup() ran */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
	RBD_DEV_FLAG_READONLY,	/* -o ro or snapshot */
};

static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);		/* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);	/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

static struct ceph_snap_context rbd_empty_snapc = {
	.nref = REFCOUNT_INIT(1),
};

/*
 * single-major requires >= 0.75 version of userspace rbd utility.
 */
static bool single_major = true;
module_param(single_major, bool, 0444);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");

static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
static ssize_t remove_store(struct bus_type *bus, const char *buf,
			    size_t count);
static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
				      size_t count);
static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
					 size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);

static int rbd_dev_id_to_minor(int dev_id)
{
	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}
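
/*
 * Worked example: with RBD_SINGLE_MAJOR_PART_SHIFT == 4, dev_id 1 maps
 * to minor 16 and dev_id 2 to minor 32, leaving the 16 minors in
 * between (17-31, resp. 33-47) for partitions of the same device;
 * minor_to_rbd_dev_id() simply discards those low 4 partition bits.
 */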

static bool rbd_is_ro(struct rbd_device *rbd_dev)
{
	return test_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);
}

static bool rbd_is_snap(struct rbd_device *rbd_dev)
{
	return rbd_dev->spec->snap_id != CEPH_NOSNAP;
}

static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	lockdep_assert_held(&rbd_dev->lock_rwsem);

	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	bool is_lock_owner;

	down_read(&rbd_dev->lock_rwsem);
	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	return is_lock_owner;
}

static ssize_t supported_features_show(struct bus_type *bus, char *buf)
{
	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}

static BUS_ATTR_WO(add);
static BUS_ATTR_WO(remove);
static BUS_ATTR_WO(add_single_major);
static BUS_ATTR_WO(remove_single_major);
static BUS_ATTR_RO(supported_features);

static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	&bus_attr_supported_features.attr,
	NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
				  struct attribute *attr, int index)
{
	if (!single_major &&
	    (attr == &bus_attr_add_single_major.attr ||
	     attr == &bus_attr_remove_single_major.attr))
		return 0;

	return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
		u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features);
static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev);

static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);

/*
 * Return true if nothing else is pending.
 */
static bool pending_result_dec(struct pending_result *pending, int *result)
{
	rbd_assert(pending->num_pending > 0);

	if (*result && !pending->result)
		pending->result = *result;
	if (--pending->num_pending)
		return false;

	*result = pending->result;
	return true;
}
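
/*
 * Usage sketch: a caller that fans out N child requests sets
 * pending->num_pending = N, and each completion calls
 * pending_result_dec(pending, &result).  Only the final call returns
 * true, with *result holding the first nonzero (error) result seen
 * across all completions.
 */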

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	(void) get_device(&rbd_dev->dev);

	return 0;
}

static void rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	put_device(&rbd_dev->dev);
}

static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
{
	int ro;

	if (get_user(ro, (int __user *)arg))
		return -EFAULT;

	/*
	 * Neither images mapped read-only nor snapshots can be
	 * marked read-write.
	 */
	if (!ro) {
		if (rbd_is_ro(rbd_dev))
			return -EROFS;

		rbd_assert(!rbd_is_snap(rbd_dev));
	}

	/* Let blkdev_roset() handle it */
	return -ENOTTY;
}

static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	int ret;

	switch (cmd) {
	case BLKROSET:
		ret = rbd_ioctl_set_ro(rbd_dev, arg);
		break;
	default:
		ret = -ENOTTY;
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
	.ioctl			= rbd_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl		= rbd_compat_ioctl,
#endif
};

/*
 * Initialize an rbd client instance.  Success or not, this function
 * consumes ceph_opts.  Caller holds client_mutex.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	rbdc->client = ceph_create_client(ceph_opts, rbdc);
	if (IS_ERR(rbdc->client))
		goto out_rbdc;
	ceph_opts = NULL;	/* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_client;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;
out_client:
	ceph_destroy_client(rbdc->client);
out_rbdc:
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * (Per device) rbd map options
 */
enum {
	Opt_queue_depth,
	Opt_alloc_size,
	Opt_lock_timeout,
	Opt_last_int,
	/* int args above */
	Opt_pool_ns,
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_notrim,
	Opt_err
};

static match_table_t rbd_opts_tokens = {
	{Opt_queue_depth, "queue_depth=%d"},
	{Opt_alloc_size, "alloc_size=%d"},
	{Opt_lock_timeout, "lock_timeout=%d"},
	/* int args above */
	{Opt_pool_ns, "_pool_ns=%s"},
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	{Opt_lock_on_read, "lock_on_read"},
	{Opt_exclusive, "exclusive"},
	{Opt_notrim, "notrim"},
	{Opt_err, NULL}
};

struct rbd_options {
	int	queue_depth;
	int	alloc_size;
	unsigned long	lock_timeout;
	bool	read_only;
	bool	lock_on_read;
	bool	exclusive;
	bool	trim;
};

#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_ALLOC_SIZE_DEFAULT	(64 * 1024)
#define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false
#define RBD_EXCLUSIVE_DEFAULT	false
#define RBD_TRIM_DEFAULT	true

struct parse_rbd_opts_ctx {
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
};

static int parse_rbd_opts_token(char *c, void *private)
{
	struct parse_rbd_opts_ctx *pctx = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad option arg (not int) at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token, argstr[0].from);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_queue_depth:
		if (intval < 1) {
			pr_err("queue_depth out of range\n");
			return -EINVAL;
		}
		pctx->opts->queue_depth = intval;
		break;
	case Opt_alloc_size:
		if (intval < SECTOR_SIZE) {
			pr_err("alloc_size out of range\n");
			return -EINVAL;
		}
		if (!is_power_of_2(intval)) {
			pr_err("alloc_size must be a power of 2\n");
			return -EINVAL;
		}
		pctx->opts->alloc_size = intval;
		break;
	case Opt_lock_timeout:
		/* 0 is "wait forever" (i.e. infinite timeout) */
		if (intval < 0 || intval > INT_MAX / 1000) {
			pr_err("lock_timeout out of range\n");
			return -EINVAL;
		}
		pctx->opts->lock_timeout = msecs_to_jiffies(intval * 1000);
		break;
	case Opt_pool_ns:
		kfree(pctx->spec->pool_ns);
		pctx->spec->pool_ns = match_strdup(argstr);
		if (!pctx->spec->pool_ns)
			return -ENOMEM;
		break;
	case Opt_read_only:
		pctx->opts->read_only = true;
		break;
	case Opt_read_write:
		pctx->opts->read_only = false;
		break;
	case Opt_lock_on_read:
		pctx->opts->lock_on_read = true;
		break;
	case Opt_exclusive:
		pctx->opts->exclusive = true;
		break;
	case Opt_notrim:
		pctx->opts->trim = false;
		break;
	default:
		/* libceph prints "bad option" msg */
		return -EINVAL;
	}

	return 0;
}
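
/*
 * Example map options string (illustrative):
 * "queue_depth=128,alloc_size=65536,lock_on_read,ro".  Integer options
 * are range-checked above; anything unrecognized falls through to the
 * default case, with libceph printing the "bad option" message.
 */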

static char *obj_op_name(enum obj_operation_type op_type)
{
	switch (op_type) {
	case OBJ_OP_READ:
		return "read";
	case OBJ_OP_WRITE:
		return "write";
	case OBJ_OP_DISCARD:
		return "discard";
	case OBJ_OP_ZEROOUT:
		return "zeroout";
	default:
		return "???";
	}
}

/*
 * Destroy ceph client
 *
 * Takes and releases rbd_client_list_lock itself to remove the client
 * from the client list, so the caller must not hold that lock.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret;

	mutex_lock(&client_mutex);
	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		ceph_destroy_options(ceph_opts);

		/*
		 * Using an existing client.  Make sure ->pg_pools is up to
		 * date before we look up the pool id in do_rbd_add().
		 */
		ret = ceph_wait_for_latest_osdmap(rbdc->client,
					rbdc->client->options->mount_timeout);
		if (ret) {
			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
			rbd_put_client(rbdc);
			rbdc = ERR_PTR(ret);
		}
	} else {
		rbdc = rbd_client_create(ceph_opts);
	}
	mutex_unlock(&client_mutex);

	return rbdc;
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}

/*
 * returns the size of an object in the image
 */
static u32 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1U << header->obj_order;
}

static void rbd_init_layout(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.stripe_unit == 0 ||
	    rbd_dev->header.stripe_count == 0) {
		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
		rbd_dev->header.stripe_count = 1;
	}

	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}

/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				 struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	int ret = -ENOMEM;
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		object_prefix = kstrndup(ondisk->object_prefix,
					 sizeof(ondisk->object_prefix),
					 GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */
		snap_sizes = kmalloc_array(snap_count,
					   sizeof(*header->snap_sizes),
					   GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;

		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, so this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		rbd_init_layout(rbd_dev);
	} else {
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}
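
/*
 * Layout note (inferred from the code above): in a format 1 header the
 * snapshot id array ondisk->snaps[] is followed immediately by
 * snap_names_len bytes of NUL-terminated names packed back to back,
 * which is why all the names can be copied with a single memcpy()
 * from &ondisk->snaps[snap_count] and walked with strlen() + 1 below.
 */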

static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	const char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return kstrdup(snap_name, GFP_KERNEL);
}

/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * Note that result is for snapshots in *descending* order.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
	u64 snap_id1 = *(u64 *)s1;
	u64 snap_id2 = *(u64 *)s2;

	if (snap_id1 < snap_id2)
		return 1;
	return snap_id1 == snap_id2 ? 0 : -1;
}

/*
 * Search a snapshot context to see if the given snapshot id is
 * present.
 *
 * Returns the position of the snapshot id in the array if it's found,
 * or BAD_SNAP_INDEX otherwise.
 *
 * Note: The snapshot array is kept sorted (by the osd) in
 * reverse order, highest snapshot id first.
 */
static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u64 *found;

	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
				sizeof (snap_id), snapid_compare_reverse);

	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
}
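
/*
 * Worked example: with snapc->snaps[] = { 12, 7, 3 } (descending, as
 * kept by the OSDs), looking up snap_id 7 makes bsearch() return
 * &snaps[1] via snapid_compare_reverse(), so the index is 1; looking
 * up snap_id 5 returns NULL and hence BAD_SNAP_INDEX.
 */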

static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	u32 which;
	const char *snap_name;

	which = rbd_dev_snap_index(rbd_dev, snap_id);
	if (which == BAD_SNAP_INDEX)
		return ERR_PTR(-ENOENT);

	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
}

static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (rbd_dev->image_format == 1)
		return rbd_dev_v1_snap_name(rbd_dev, snap_id);

	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
}

static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_size)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_size = rbd_dev->header.image_size;
	} else if (rbd_dev->image_format == 1) {
		u32 which;

		which = rbd_dev_snap_index(rbd_dev, snap_id);
		if (which == BAD_SNAP_INDEX)
			return -ENOENT;

		*snap_size = rbd_dev->header.snap_sizes[which];
	} else {
		u64 size = 0;
		int ret;

		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
		if (ret)
			return ret;

		*snap_size = size;
	}
	return 0;
}

static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
			u64 *snap_features)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_features = rbd_dev->header.features;
	} else if (rbd_dev->image_format == 1) {
		*snap_features = 0;	/* No features for format 1 */
	} else {
		u64 features = 0;
		int ret;

		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
		if (ret)
			return ret;

		*snap_features = features;
	}
	return 0;
}

static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
{
	u64 snap_id = rbd_dev->spec->snap_id;
	u64 size = 0;
	u64 features = 0;
	int ret;

	ret = rbd_snap_size(rbd_dev, snap_id, &size);
	if (ret)
		return ret;
	ret = rbd_snap_features(rbd_dev, snap_id, &features);
	if (ret)
		return ret;

	rbd_dev->mapping.size = size;
	rbd_dev->mapping.features = features;

	return 0;
}

static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
{
	rbd_dev->mapping.size = 0;
	rbd_dev->mapping.features = 0;
}

static void zero_bvec(struct bio_vec *bv)
{
	void *buf;
	unsigned long flags;

	buf = bvec_kmap_irq(bv, &flags);
	memset(buf, 0, bv->bv_len);
	flush_dcache_page(bv->bv_page);
	bvec_kunmap_irq(buf, &flags);
}

static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
{
	struct ceph_bio_iter it = *bio_pos;

	ceph_bio_iter_advance(&it, off);
	ceph_bio_iter_advance_step(&it, bytes, ({
		zero_bvec(&bv);
	}));
}

static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
{
	struct ceph_bvec_iter it = *bvec_pos;

	ceph_bvec_iter_advance(&it, off);
	ceph_bvec_iter_advance_step(&it, bytes, ({
		zero_bvec(&bv);
	}));
}

/*
 * Zero a range in @obj_req data buffer defined by a bio (list) or
 * (private) bio_vec array.
 *
 * @off is relative to the start of the data buffer.
 */
static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
			       u32 bytes)
{
	dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);

	switch (obj_req->img_request->data_type) {
	case OBJ_REQUEST_BIO:
		zero_bios(&obj_req->bio_pos, off, bytes);
		break;
	case OBJ_REQUEST_BVECS:
	case OBJ_REQUEST_OWN_BVECS:
		zero_bvecs(&obj_req->bvec_pos, off, bytes);
		break;
	default:
		BUG();
	}
}

static void rbd_obj_request_destroy(struct kref *kref);
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		kref_read(&obj_request->kref));
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}

static void rbd_img_request_destroy(struct kref *kref);
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	dout("%s: img %p (was %d)\n", __func__, img_request,
		kref_read(&img_request->kref));
	kref_put(&img_request->kref, rbd_img_request_destroy);
}

static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->img_request == NULL);

	/* Image request now owns object's original reference */
	obj_request->img_request = img_request;
	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
}

static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
	list_del(&obj_request->ex.oe_item);
	rbd_assert(obj_request->img_request == img_request);
	rbd_obj_request_put(obj_request);
}

static void rbd_osd_submit(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;

	dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
	     __func__, osd_req, obj_req, obj_req->ex.oe_objno,
	     obj_req->ex.oe_off, obj_req->ex.oe_len);
	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
}

/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time
 * and currently never changes thereafter.
 */
static void img_request_layered_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_LAYERED, &img_request->flags);
	smp_mb();
}

static void img_request_layered_clear(struct rbd_img_request *img_request)
{
	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
	smp_mb();
}

static bool img_request_layered_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
}

static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

	return !obj_req->ex.oe_off &&
	       obj_req->ex.oe_len == rbd_dev->layout.object_size;
}

static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

	return obj_req->ex.oe_off + obj_req->ex.oe_len ==
					rbd_dev->layout.object_size;
}

/*
 * Must be called after rbd_obj_calc_img_extents().
 */
static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req)
{
	if (!obj_req->num_img_extents ||
	    (rbd_obj_is_entire(obj_req) &&
	     !obj_req->img_request->snapc->num_snaps))
		return false;

	return true;
}

static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
{
	return ceph_file_extents_bytes(obj_req->img_extents,
				       obj_req->num_img_extents);
}

static bool rbd_img_is_write(struct rbd_img_request *img_req)
{
	switch (img_req->op_type) {
	case OBJ_OP_READ:
		return false;
	case OBJ_OP_WRITE:
	case OBJ_OP_DISCARD:
	case OBJ_OP_ZEROOUT:
		return true;
	default:
		BUG();
	}
}

static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;
	int result;

	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
	     osd_req->r_result, obj_req);

	/*
	 * Writes aren't allowed to return a data payload.  In some
	 * guarded write cases (e.g. stat + zero on an empty object)
	 * a stat response makes it through, but we don't care.
	 */
	if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
		result = 0;
	else
		result = osd_req->r_result;

	rbd_obj_handle_request(obj_req, result);
}

static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;

	osd_req->r_flags = CEPH_OSD_FLAG_READ;
	osd_req->r_snapid = obj_request->img_request->snap_id;
}

static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;

	osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
	ktime_get_real_ts64(&osd_req->r_mtime);
	osd_req->r_data_offset = obj_request->ex.oe_off;
}

static struct ceph_osd_request *
__rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
			  struct ceph_snap_context *snapc, int num_ops)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;
	const char *name_format = rbd_dev->image_format == 1 ?
				      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
	int ret;

	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
	if (!req)
		return ERR_PTR(-ENOMEM);

	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
	req->r_callback = rbd_osd_req_callback;
	req->r_priv = obj_req;

	/*
	 * Data objects may be stored in a separate pool, but always in
	 * the same namespace in that pool as the header in its pool.
	 */
	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
	req->r_base_oloc.pool = rbd_dev->layout.pool_id;

	ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
			       rbd_dev->header.object_prefix,
			       obj_req->ex.oe_objno);
	if (ret)
		return ERR_PTR(ret);

	return req;
}
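
/*
 * Object naming sketch (formats defined in rbd_types.h; shown here
 * for illustration): a format 2 image whose object_prefix is, say,
 * "rbd_data.abc123" would get data objects named along the lines of
 * "rbd_data.abc123.0000000000000005" for object number 5, while
 * format 1 images use a shorter hex suffix via RBD_V1_DATA_FORMAT.
 */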

static struct ceph_osd_request *
rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
{
	return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
					 num_ops);
}

static struct rbd_obj_request *rbd_obj_request_create(void)
{
	struct rbd_obj_request *obj_request;

	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
	if (!obj_request)
		return NULL;

	ceph_object_extent_init(&obj_request->ex);
	INIT_LIST_HEAD(&obj_request->osd_reqs);
	mutex_init(&obj_request->state_mutex);
	kref_init(&obj_request->kref);

	dout("%s %p\n", __func__, obj_request);
	return obj_request;
}

static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;
	struct ceph_osd_request *osd_req;
	u32 i;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	dout("%s: obj %p\n", __func__, obj_request);

	while (!list_empty(&obj_request->osd_reqs)) {
		osd_req = list_first_entry(&obj_request->osd_reqs,
				    struct ceph_osd_request, r_private_item);
		list_del_init(&osd_req->r_private_item);
		ceph_osdc_put_request(osd_req);
	}

	switch (obj_request->img_request->data_type) {
	case OBJ_REQUEST_NODATA:
	case OBJ_REQUEST_BIO:
	case OBJ_REQUEST_BVECS:
		break;		/* Nothing to do */
	case OBJ_REQUEST_OWN_BVECS:
		kfree(obj_request->bvec_pos.bvecs);
		break;
	default:
		BUG();
	}

	kfree(obj_request->img_extents);
	if (obj_request->copyup_bvecs) {
		for (i = 0; i < obj_request->copyup_bvec_count; i++) {
			if (obj_request->copyup_bvecs[i].bv_page)
				__free_page(obj_request->copyup_bvecs[i].bv_page);
		}
		kfree(obj_request->copyup_bvecs);
	}

	kmem_cache_free(rbd_obj_request_cache, obj_request);
}

/* It's OK to call this for a device with no parent */

static void rbd_spec_put(struct rbd_spec *spec);
static void rbd_dev_unparent(struct rbd_device *rbd_dev)
{
	rbd_dev_remove_parent(rbd_dev);
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;
}

/*
 * Parent image reference counting is used to determine when an
 * image's parent fields can be safely torn down--after there are no
 * more in-flight requests to the parent image.  When the last
 * reference is dropped, cleaning them up is safe.
 */
static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
{
	int counter;

	if (!rbd_dev->parent_spec)
		return;

	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
	if (counter > 0)
		return;

	/* Last reference; clean up parent data structures */

	if (!counter)
		rbd_dev_unparent(rbd_dev);
	else
		rbd_warn(rbd_dev, "parent reference underflow");
}
1718/*
1719 * If an image has a non-zero parent overlap, get a reference to its
1720 * parent.
1721 *
1722 * Returns true if the rbd device has a parent with a non-zero
1723 * overlap and a reference for it was successfully taken, or
1724 * false otherwise.
1725 */
1726static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1727{
ae43e9d0 1728 int counter = 0;
a2acd00e
AE
1729
1730 if (!rbd_dev->parent_spec)
1731 return false;
1732
ae43e9d0
ID
1733 down_read(&rbd_dev->header_rwsem);
1734 if (rbd_dev->parent_overlap)
1735 counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1736 up_read(&rbd_dev->header_rwsem);
a2acd00e
AE
1737
1738 if (counter < 0)
9584d508 1739 rbd_warn(rbd_dev, "parent reference overflow");
a2acd00e 1740
ae43e9d0 1741 return counter > 0;
a2acd00e
AE
1742}

/*
 * Caller is responsible for filling in the list of object requests
 * that comprises the image request, and the Linux request pointer
 * (if there is one).
 */
static struct rbd_img_request *rbd_img_request_create(
                                        struct rbd_device *rbd_dev,
                                        enum obj_operation_type op_type,
                                        struct ceph_snap_context *snapc)
{
        struct rbd_img_request *img_request;

        img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
        if (!img_request)
                return NULL;

        img_request->rbd_dev = rbd_dev;
        img_request->op_type = op_type;
        if (!rbd_img_is_write(img_request))
                img_request->snap_id = rbd_dev->spec->snap_id;
        else
                img_request->snapc = snapc;

        if (rbd_dev_parent_get(rbd_dev))
                img_request_layered_set(img_request);

        INIT_LIST_HEAD(&img_request->lock_item);
        INIT_LIST_HEAD(&img_request->object_extents);
        mutex_init(&img_request->state_mutex);
        kref_init(&img_request->kref);

        return img_request;
}

static void rbd_img_request_destroy(struct kref *kref)
{
        struct rbd_img_request *img_request;
        struct rbd_obj_request *obj_request;
        struct rbd_obj_request *next_obj_request;

        img_request = container_of(kref, struct rbd_img_request, kref);

        dout("%s: img %p\n", __func__, img_request);

        WARN_ON(!list_empty(&img_request->lock_item));
        for_each_obj_request_safe(img_request, obj_request, next_obj_request)
                rbd_img_obj_request_del(img_request, obj_request);

        if (img_request_layered_test(img_request)) {
                img_request_layered_clear(img_request);
                rbd_dev_parent_put(img_request->rbd_dev);
        }

        if (rbd_img_is_write(img_request))
                ceph_put_snap_context(img_request->snapc);

        kmem_cache_free(rbd_img_request_cache, img_request);
}

#define BITS_PER_OBJ    2
#define OBJS_PER_BYTE   (BITS_PER_BYTE / BITS_PER_OBJ)
#define OBJ_MASK        ((1 << BITS_PER_OBJ) - 1)

static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno,
                                   u64 *index, u8 *shift)
{
        u32 off;

        rbd_assert(objno < rbd_dev->object_map_size);
        *index = div_u64_rem(objno, OBJS_PER_BYTE, &off);
        *shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ;
}
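
/*
 * Worked example (illustrative, derived from the definitions above):
 * with BITS_PER_OBJ == 2 each object map byte holds the states of four
 * objects, most significant bit pair first.  For objno == 5:
 * index = 5 / 4 = 1, off = 1 and shift = (4 - 1 - 1) * 2 = 4, i.e. the
 * state sits in bits 5..4 of object_map[1].
 */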

static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
{
        u64 index;
        u8 shift;

        lockdep_assert_held(&rbd_dev->object_map_lock);
        __rbd_object_map_index(rbd_dev, objno, &index, &shift);
        return (rbd_dev->object_map[index] >> shift) & OBJ_MASK;
}

static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val)
{
        u64 index;
        u8 shift;
        u8 *p;

        lockdep_assert_held(&rbd_dev->object_map_lock);
        rbd_assert(!(val & ~OBJ_MASK));

        __rbd_object_map_index(rbd_dev, objno, &index, &shift);
        p = &rbd_dev->object_map[index];
        *p = (*p & ~(OBJ_MASK << shift)) | (val << shift);
}

static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
{
        u8 state;

        spin_lock(&rbd_dev->object_map_lock);
        state = __rbd_object_map_get(rbd_dev, objno);
        spin_unlock(&rbd_dev->object_map_lock);
        return state;
}

static bool use_object_map(struct rbd_device *rbd_dev)
{
        /*
         * An image mapped read-only can't use the object map -- it isn't
         * loaded because the header lock isn't acquired.  Someone else can
         * write to the image and update the object map behind our back.
         *
         * A snapshot can't be written to, so using the object map is always
         * safe.
         */
        if (!rbd_is_snap(rbd_dev) && rbd_is_ro(rbd_dev))
                return false;

        return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) &&
                !(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID));
}

static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno)
{
        u8 state;

        /* fall back to default logic if object map is disabled or invalid */
        if (!use_object_map(rbd_dev))
                return true;

        state = rbd_object_map_get(rbd_dev, objno);
        return state != OBJECT_NONEXISTENT;
}

static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id,
                                struct ceph_object_id *oid)
{
        if (snap_id == CEPH_NOSNAP)
                ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX,
                                rbd_dev->spec->image_id);
        else
                ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX,
                                rbd_dev->spec->image_id, snap_id);
}
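
/*
 * For example (derived from the formats above), an image with id "abc"
 * gets "<RBD_OBJECT_MAP_PREFIX>abc" for the HEAD object map and
 * "<RBD_OBJECT_MAP_PREFIX>abc.0000000000000004" for the object map of
 * snapshot 4.
 */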

static int rbd_object_map_lock(struct rbd_device *rbd_dev)
{
        struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
        CEPH_DEFINE_OID_ONSTACK(oid);
        u8 lock_type;
        char *lock_tag;
        struct ceph_locker *lockers;
        u32 num_lockers;
        bool broke_lock = false;
        int ret;

        rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);

again:
        ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
                            CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0);
        if (ret != -EBUSY || broke_lock) {
                if (ret == -EEXIST)
                        ret = 0; /* already locked by myself */
                if (ret)
                        rbd_warn(rbd_dev, "failed to lock object map: %d", ret);
                return ret;
        }

        ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc,
                                 RBD_LOCK_NAME, &lock_type, &lock_tag,
                                 &lockers, &num_lockers);
        if (ret) {
                if (ret == -ENOENT)
                        goto again;

                rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret);
                return ret;
        }

        kfree(lock_tag);
        if (num_lockers == 0)
                goto again;

        rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu",
                 ENTITY_NAME(lockers[0].id.name));

        ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc,
                                  RBD_LOCK_NAME, lockers[0].id.cookie,
                                  &lockers[0].id.name);
        ceph_free_lockers(lockers, num_lockers);
        if (ret) {
                if (ret == -ENOENT)
                        goto again;

                rbd_warn(rbd_dev, "failed to break object map lock: %d", ret);
                return ret;
        }

        broke_lock = true;
        goto again;
}

static void rbd_object_map_unlock(struct rbd_device *rbd_dev)
{
        struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
        CEPH_DEFINE_OID_ONSTACK(oid);
        int ret;

        rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);

        ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
                              "");
        if (ret && ret != -ENOENT)
                rbd_warn(rbd_dev, "failed to unlock object map: %d", ret);
}

static int decode_object_map_header(void **p, void *end, u64 *object_map_size)
{
        u8 struct_v;
        u32 struct_len;
        u32 header_len;
        void *header_end;
        int ret;

        ceph_decode_32_safe(p, end, header_len, e_inval);
        header_end = *p + header_len;

        ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v,
                                  &struct_len);
        if (ret)
                return ret;

        ceph_decode_64_safe(p, end, *object_map_size, e_inval);

        *p = header_end;
        return 0;

e_inval:
        return -EINVAL;
}
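
/*
 * A sketch of the encoding consumed above (assumed from the decode
 * calls, not spelled out in the original source):
 *
 *   le32 header_len
 *   u8 struct_v, u8 struct_compat, le32 struct_len  (ceph_start_decoding())
 *   le64 object_map_size                            (number of objects)
 *
 * *p is left at header_end, where the bit vector data begins.
 */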

static int __rbd_object_map_load(struct rbd_device *rbd_dev)
{
        struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
        CEPH_DEFINE_OID_ONSTACK(oid);
        struct page **pages;
        void *p, *end;
        size_t reply_len;
        u64 num_objects;
        u64 object_map_bytes;
        u64 object_map_size;
        int num_pages;
        int ret;

        rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size);

        num_objects = ceph_get_num_objects(&rbd_dev->layout,
                                           rbd_dev->mapping.size);
        object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ,
                                            BITS_PER_BYTE);
        num_pages = calc_pages_for(0, object_map_bytes) + 1;
        pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
        if (IS_ERR(pages))
                return PTR_ERR(pages);

        reply_len = num_pages * PAGE_SIZE;
        rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid);
        ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc,
                             "rbd", "object_map_load", CEPH_OSD_FLAG_READ,
                             NULL, 0, pages, &reply_len);
        if (ret)
                goto out;

        p = page_address(pages[0]);
        end = p + min(reply_len, (size_t)PAGE_SIZE);
        ret = decode_object_map_header(&p, end, &object_map_size);
        if (ret)
                goto out;

        if (object_map_size != num_objects) {
                rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu",
                         object_map_size, num_objects);
                ret = -EINVAL;
                goto out;
        }

        if (offset_in_page(p) + object_map_bytes > reply_len) {
                ret = -EINVAL;
                goto out;
        }

        rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL);
        if (!rbd_dev->object_map) {
                ret = -ENOMEM;
                goto out;
        }

        rbd_dev->object_map_size = object_map_size;
        ceph_copy_from_page_vector(pages, rbd_dev->object_map,
                                   offset_in_page(p), object_map_bytes);

out:
        ceph_release_page_vector(pages, num_pages);
        return ret;
}
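
/*
 * Sizing example (illustrative): a 1 GiB mapping with 4 MiB objects has
 * num_objects = 256, so object_map_bytes = DIV_ROUND_UP(256 * 2, 8) = 64
 * and num_pages = calc_pages_for(0, 64) + 1 = 2 -- the extra page absorbs
 * the BitVector header that precedes the data in the reply.
 */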

static void rbd_object_map_free(struct rbd_device *rbd_dev)
{
        kvfree(rbd_dev->object_map);
        rbd_dev->object_map = NULL;
        rbd_dev->object_map_size = 0;
}

static int rbd_object_map_load(struct rbd_device *rbd_dev)
{
        int ret;

        ret = __rbd_object_map_load(rbd_dev);
        if (ret)
                return ret;

        ret = rbd_dev_v2_get_flags(rbd_dev);
        if (ret) {
                rbd_object_map_free(rbd_dev);
                return ret;
        }

        if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)
                rbd_warn(rbd_dev, "object map is invalid");

        return 0;
}

static int rbd_object_map_open(struct rbd_device *rbd_dev)
{
        int ret;

        ret = rbd_object_map_lock(rbd_dev);
        if (ret)
                return ret;

        ret = rbd_object_map_load(rbd_dev);
        if (ret) {
                rbd_object_map_unlock(rbd_dev);
                return ret;
        }

        return 0;
}

static void rbd_object_map_close(struct rbd_device *rbd_dev)
{
        rbd_object_map_free(rbd_dev);
        rbd_object_map_unlock(rbd_dev);
}

/*
 * This function needs snap_id (or more precisely just something to
 * distinguish between HEAD and snapshot object maps), new_state and
 * current_state that were passed to rbd_object_map_update().
 *
 * To avoid allocating and stashing a context we piggyback on the OSD
 * request.  A HEAD update has two ops (the first is assert_locked).
 * For new_state and current_state we decode our own object_map_update
 * op, encoded in rbd_cls_object_map_update().
 */
static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req,
                                        struct ceph_osd_request *osd_req)
{
        struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
        struct ceph_osd_data *osd_data;
        u64 objno;
        u8 state, new_state, uninitialized_var(current_state);
        bool has_current_state;
        void *p;

        if (osd_req->r_result)
                return osd_req->r_result;

        /*
         * Nothing to do for a snapshot object map.
         */
        if (osd_req->r_num_ops == 1)
                return 0;

        /*
         * Update in-memory HEAD object map.
         */
        rbd_assert(osd_req->r_num_ops == 2);
        osd_data = osd_req_op_data(osd_req, 1, cls, request_data);
        rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES);

        p = page_address(osd_data->pages[0]);
        objno = ceph_decode_64(&p);
        rbd_assert(objno == obj_req->ex.oe_objno);
        rbd_assert(ceph_decode_64(&p) == objno + 1);
        new_state = ceph_decode_8(&p);
        has_current_state = ceph_decode_8(&p);
        if (has_current_state)
                current_state = ceph_decode_8(&p);

        spin_lock(&rbd_dev->object_map_lock);
        state = __rbd_object_map_get(rbd_dev, objno);
        if (!has_current_state || current_state == state ||
            (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN))
                __rbd_object_map_set(rbd_dev, objno, new_state);
        spin_unlock(&rbd_dev->object_map_lock);

        return 0;
}

static void rbd_object_map_callback(struct ceph_osd_request *osd_req)
{
        struct rbd_obj_request *obj_req = osd_req->r_priv;
        int result;

        dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
             osd_req->r_result, obj_req);

        result = rbd_object_map_update_finish(obj_req, osd_req);
        rbd_obj_handle_request(obj_req, result);
}

static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state)
{
        u8 state = rbd_object_map_get(rbd_dev, objno);

        if (state == new_state ||
            (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
            (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING))
                return false;

        return true;
}
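
/*
 * Illustration (an editorial reading of the conditions above): besides
 * same-state updates, NONEXISTENT -> PENDING is skipped because there
 * is nothing to delete, and a transition to NONEXISTENT is allowed only
 * from PENDING -- it completes a PENDING marking made by the pre-write
 * object map update.
 */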

static int rbd_cls_object_map_update(struct ceph_osd_request *req,
                                     int which, u64 objno, u8 new_state,
                                     const u8 *current_state)
{
        struct page **pages;
        void *p, *start;
        int ret;

        ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update");
        if (ret)
                return ret;

        pages = ceph_alloc_page_vector(1, GFP_NOIO);
        if (IS_ERR(pages))
                return PTR_ERR(pages);

        p = start = page_address(pages[0]);
        ceph_encode_64(&p, objno);
        ceph_encode_64(&p, objno + 1);
        ceph_encode_8(&p, new_state);
        if (current_state) {
                ceph_encode_8(&p, 1);
                ceph_encode_8(&p, *current_state);
        } else {
                ceph_encode_8(&p, 0);
        }

        osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0,
                                          false, true);
        return 0;
}
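
/*
 * The request_data payload assembled above is, on the wire:
 *
 *   le64 start_objno, le64 end_objno (start_objno + 1: a single object)
 *   u8 new_state
 *   u8 has_current_state [, u8 current_state]
 *
 * rbd_object_map_update_finish() decodes this same buffer to recover
 * what was requested.
 */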

/*
 * Return:
 *   0 - object map update sent
 *   1 - object map update isn't needed
 *  <0 - error
 */
static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
                                 u8 new_state, const u8 *current_state)
{
        struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
        struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
        struct ceph_osd_request *req;
        int num_ops = 1;
        int which = 0;
        int ret;

        if (snap_id == CEPH_NOSNAP) {
                if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state))
                        return 1;

                num_ops++; /* assert_locked */
        }

        req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO);
        if (!req)
                return -ENOMEM;

        list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
        req->r_callback = rbd_object_map_callback;
        req->r_priv = obj_req;

        rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid);
        ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
        req->r_flags = CEPH_OSD_FLAG_WRITE;
        ktime_get_real_ts64(&req->r_mtime);

        if (snap_id == CEPH_NOSNAP) {
                /*
                 * Protect against possible race conditions during lock
                 * ownership transitions.
                 */
                ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME,
                                             CEPH_CLS_LOCK_EXCLUSIVE, "", "");
                if (ret)
                        return ret;
        }

        ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno,
                                        new_state, current_state);
        if (ret)
                return ret;

        ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
        if (ret)
                return ret;

        ceph_osdc_start_request(osdc, req, false);
        return 0;
}

static void prune_extents(struct ceph_file_extent *img_extents,
                          u32 *num_img_extents, u64 overlap)
{
        u32 cnt = *num_img_extents;

        /* drop extents completely beyond the overlap */
        while (cnt && img_extents[cnt - 1].fe_off >= overlap)
                cnt--;

        if (cnt) {
                struct ceph_file_extent *ex = &img_extents[cnt - 1];

                /* trim final overlapping extent */
                if (ex->fe_off + ex->fe_len > overlap)
                        ex->fe_len = overlap - ex->fe_off;
        }

        *num_img_extents = cnt;
}
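
/*
 * Worked example (illustrative): with overlap 4096, the extents
 * {0~3000, 3500~1000, 5000~200} become {0~3000, 3500~596} -- the last
 * extent starts beyond the overlap and is dropped, the second is
 * trimmed to end exactly at the overlap.
 */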

/*
 * Determine the byte range(s) covered by either just the object extent
 * or the entire object in the parent image.
 */
static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
                                    bool entire)
{
        struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
        int ret;

        if (!rbd_dev->parent_overlap)
                return 0;

        ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
                                  entire ? 0 : obj_req->ex.oe_off,
                                  entire ? rbd_dev->layout.object_size :
                                           obj_req->ex.oe_len,
                                  &obj_req->img_extents,
                                  &obj_req->num_img_extents);
        if (ret)
                return ret;

        prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
                      rbd_dev->parent_overlap);
        return 0;
}

static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which)
{
        struct rbd_obj_request *obj_req = osd_req->r_priv;

        switch (obj_req->img_request->data_type) {
        case OBJ_REQUEST_BIO:
                osd_req_op_extent_osd_data_bio(osd_req, which,
                                               &obj_req->bio_pos,
                                               obj_req->ex.oe_len);
                break;
        case OBJ_REQUEST_BVECS:
        case OBJ_REQUEST_OWN_BVECS:
                rbd_assert(obj_req->bvec_pos.iter.bi_size ==
                                                        obj_req->ex.oe_len);
                rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
                osd_req_op_extent_osd_data_bvec_pos(osd_req, which,
                                                    &obj_req->bvec_pos);
                break;
        default:
                BUG();
        }
}

static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which)
{
        struct page **pages;

        /*
         * The response data for a STAT call consists of:
         *     le64 length;
         *     struct {
         *         le32 tv_sec;
         *         le32 tv_nsec;
         *     } mtime;
         */
        pages = ceph_alloc_page_vector(1, GFP_NOIO);
        if (IS_ERR(pages))
                return PTR_ERR(pages);

        osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0);
        osd_req_op_raw_data_in_pages(osd_req, which, pages,
                                     8 + sizeof(struct ceph_timespec),
                                     0, false, true);
        return 0;
}

static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which,
                                u32 bytes)
{
        struct rbd_obj_request *obj_req = osd_req->r_priv;
        int ret;

        ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup");
        if (ret)
                return ret;

        osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs,
                                          obj_req->copyup_bvec_count, bytes);
        return 0;
}

static int rbd_obj_init_read(struct rbd_obj_request *obj_req)
{
        obj_req->read_state = RBD_OBJ_READ_START;
        return 0;
}

static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
                                      int which)
{
        struct rbd_obj_request *obj_req = osd_req->r_priv;
        struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
        u16 opcode;

        if (!use_object_map(rbd_dev) ||
            !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) {
                osd_req_op_alloc_hint_init(osd_req, which++,
                                           rbd_dev->layout.object_size,
                                           rbd_dev->layout.object_size);
        }

        if (rbd_obj_is_entire(obj_req))
                opcode = CEPH_OSD_OP_WRITEFULL;
        else
                opcode = CEPH_OSD_OP_WRITE;

        osd_req_op_extent_init(osd_req, which, opcode,
                               obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
        rbd_osd_setup_data(osd_req, which);
}

static int rbd_obj_init_write(struct rbd_obj_request *obj_req)
{
        int ret;

        /* reverse map the entire object onto the parent */
        ret = rbd_obj_calc_img_extents(obj_req, true);
        if (ret)
                return ret;

        if (rbd_obj_copyup_enabled(obj_req))
                obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;

        obj_req->write_state = RBD_OBJ_WRITE_START;
        return 0;
}

static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
{
        return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
                                          CEPH_OSD_OP_ZERO;
}

static void __rbd_osd_setup_discard_ops(struct ceph_osd_request *osd_req,
                                        int which)
{
        struct rbd_obj_request *obj_req = osd_req->r_priv;

        if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
                rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
                osd_req_op_init(osd_req, which, CEPH_OSD_OP_DELETE, 0);
        } else {
                osd_req_op_extent_init(osd_req, which,
                                       truncate_or_zero_opcode(obj_req),
                                       obj_req->ex.oe_off, obj_req->ex.oe_len,
                                       0, 0);
        }
}

static int rbd_obj_init_discard(struct rbd_obj_request *obj_req)
{
        struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
        u64 off, next_off;
        int ret;

        /*
         * Align the range to alloc_size boundary and punt on discards
         * that are too small to free up any space.
         *
         * alloc_size == object_size && is_tail() is a special case for
         * filestore with filestore_punch_hole = false, needed to allow
         * truncate (in addition to delete).
         */
        if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
            !rbd_obj_is_tail(obj_req)) {
                off = round_up(obj_req->ex.oe_off, rbd_dev->opts->alloc_size);
                next_off = round_down(obj_req->ex.oe_off + obj_req->ex.oe_len,
                                      rbd_dev->opts->alloc_size);
                if (off >= next_off)
                        return 1;

                dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
                     obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
                     off, next_off - off);
                obj_req->ex.oe_off = off;
                obj_req->ex.oe_len = next_off - off;
        }

        /* reverse map the entire object onto the parent */
        ret = rbd_obj_calc_img_extents(obj_req, true);
        if (ret)
                return ret;

        obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
        if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
                obj_req->flags |= RBD_OBJ_FLAG_DELETION;

        obj_req->write_state = RBD_OBJ_WRITE_START;
        return 0;
}
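
/*
 * Alignment example (illustrative): with alloc_size 4096, a discard of
 * 1000~8000 is narrowed to 4096~4096 (off = 4096, next_off = 8192),
 * while a discard of 1000~2000 rounds to an empty range and is punted
 * (returning 1 drops the object request).
 */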

static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req,
                                        int which)
{
        struct rbd_obj_request *obj_req = osd_req->r_priv;
        u16 opcode;

        if (rbd_obj_is_entire(obj_req)) {
                if (obj_req->num_img_extents) {
                        if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
                                osd_req_op_init(osd_req, which++,
                                                CEPH_OSD_OP_CREATE, 0);
                        opcode = CEPH_OSD_OP_TRUNCATE;
                } else {
                        rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
                        osd_req_op_init(osd_req, which++,
                                        CEPH_OSD_OP_DELETE, 0);
                        opcode = 0;
                }
        } else {
                opcode = truncate_or_zero_opcode(obj_req);
        }

        if (opcode)
                osd_req_op_extent_init(osd_req, which, opcode,
                                       obj_req->ex.oe_off, obj_req->ex.oe_len,
                                       0, 0);
}

static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
{
        int ret;

        /* reverse map the entire object onto the parent */
        ret = rbd_obj_calc_img_extents(obj_req, true);
        if (ret)
                return ret;

        if (rbd_obj_copyup_enabled(obj_req))
                obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
        if (!obj_req->num_img_extents) {
                obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
                if (rbd_obj_is_entire(obj_req))
                        obj_req->flags |= RBD_OBJ_FLAG_DELETION;
        }

        obj_req->write_state = RBD_OBJ_WRITE_START;
        return 0;
}

static int count_write_ops(struct rbd_obj_request *obj_req)
{
        struct rbd_img_request *img_req = obj_req->img_request;

        switch (img_req->op_type) {
        case OBJ_OP_WRITE:
                if (!use_object_map(img_req->rbd_dev) ||
                    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST))
                        return 2; /* setallochint + write/writefull */

                return 1; /* write/writefull */
        case OBJ_OP_DISCARD:
                return 1; /* delete/truncate/zero */
        case OBJ_OP_ZEROOUT:
                if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
                    !(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
                        return 2; /* create + truncate */

                return 1; /* delete/truncate/zero */
        default:
                BUG();
        }
}
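
/*
 * For instance (following the cases above), a plain write to an object
 * that may already exist needs a single write/writefull op, while a
 * write to an object the object map says doesn't exist also carries a
 * setallochint op.  count_write_ops() and rbd_osd_setup_write_ops()
 * below must agree on this count.
 */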

static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
                                    int which)
{
        struct rbd_obj_request *obj_req = osd_req->r_priv;

        switch (obj_req->img_request->op_type) {
        case OBJ_OP_WRITE:
                __rbd_osd_setup_write_ops(osd_req, which);
                break;
        case OBJ_OP_DISCARD:
                __rbd_osd_setup_discard_ops(osd_req, which);
                break;
        case OBJ_OP_ZEROOUT:
                __rbd_osd_setup_zeroout_ops(osd_req, which);
                break;
        default:
                BUG();
        }
}

/*
 * Prune the list of object requests (adjust offset and/or length, drop
 * redundant requests).  Prepare object request state machines and image
 * request state machine for execution.
 */
static int __rbd_img_fill_request(struct rbd_img_request *img_req)
{
        struct rbd_obj_request *obj_req, *next_obj_req;
        int ret;

        for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
                switch (img_req->op_type) {
                case OBJ_OP_READ:
                        ret = rbd_obj_init_read(obj_req);
                        break;
                case OBJ_OP_WRITE:
                        ret = rbd_obj_init_write(obj_req);
                        break;
                case OBJ_OP_DISCARD:
                        ret = rbd_obj_init_discard(obj_req);
                        break;
                case OBJ_OP_ZEROOUT:
                        ret = rbd_obj_init_zeroout(obj_req);
                        break;
                default:
                        BUG();
                }
                if (ret < 0)
                        return ret;
                if (ret > 0) {
                        rbd_img_obj_request_del(img_req, obj_req);
                        continue;
                }
        }

        img_req->state = RBD_IMG_START;
        return 0;
}

union rbd_img_fill_iter {
        struct ceph_bio_iter    bio_iter;
        struct ceph_bvec_iter   bvec_iter;
};

struct rbd_img_fill_ctx {
        enum obj_request_type   pos_type;
        union rbd_img_fill_iter *pos;
        union rbd_img_fill_iter iter;
        ceph_object_extent_fn_t set_pos_fn;
        ceph_object_extent_fn_t count_fn;
        ceph_object_extent_fn_t copy_fn;
};

static struct ceph_object_extent *alloc_object_extent(void *arg)
{
        struct rbd_img_request *img_req = arg;
        struct rbd_obj_request *obj_req;

        obj_req = rbd_obj_request_create();
        if (!obj_req)
                return NULL;

        rbd_img_obj_request_add(img_req, obj_req);
        return &obj_req->ex;
}

/*
 * While su != os && sc == 1 is technically not fancy (it's the same
 * layout as su == os && sc == 1), we can't use the nocopy path for it
 * because ->set_pos_fn() should be called only once per object.
 * ceph_file_to_extents() invokes action_fn once per stripe unit, so
 * treat su != os && sc == 1 as fancy.
 */
static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
{
        return l->stripe_unit != l->object_size;
}
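
/*
 * Example (illustrative): the common layout of 4M objects with a 4M
 * stripe unit is not fancy and takes the nocopy path; 4M objects with a
 * 64K stripe unit are fancy and force the bio_vec copy path below.
 */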

static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
                                       struct ceph_file_extent *img_extents,
                                       u32 num_img_extents,
                                       struct rbd_img_fill_ctx *fctx)
{
        u32 i;
        int ret;

        img_req->data_type = fctx->pos_type;

        /*
         * Create object requests and set each object request's starting
         * position in the provided bio (list) or bio_vec array.
         */
        fctx->iter = *fctx->pos;
        for (i = 0; i < num_img_extents; i++) {
                ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
                                           img_extents[i].fe_off,
                                           img_extents[i].fe_len,
                                           &img_req->object_extents,
                                           alloc_object_extent, img_req,
                                           fctx->set_pos_fn, &fctx->iter);
                if (ret)
                        return ret;
        }

        return __rbd_img_fill_request(img_req);
}

/*
 * Map a list of image extents to a list of object extents, create the
 * corresponding object requests (normally each to a different object,
 * but not always) and add them to @img_req.  For each object request,
 * set up its data descriptor to point to the corresponding chunk(s) of
 * @fctx->pos data buffer.
 *
 * Because ceph_file_to_extents() will merge adjacent object extents
 * together, each object request's data descriptor may point to multiple
 * different chunks of @fctx->pos data buffer.
 *
 * @fctx->pos data buffer is assumed to be large enough.
 */
static int rbd_img_fill_request(struct rbd_img_request *img_req,
                                struct ceph_file_extent *img_extents,
                                u32 num_img_extents,
                                struct rbd_img_fill_ctx *fctx)
{
        struct rbd_device *rbd_dev = img_req->rbd_dev;
        struct rbd_obj_request *obj_req;
        u32 i;
        int ret;

        if (fctx->pos_type == OBJ_REQUEST_NODATA ||
            !rbd_layout_is_fancy(&rbd_dev->layout))
                return rbd_img_fill_request_nocopy(img_req, img_extents,
                                                   num_img_extents, fctx);

        img_req->data_type = OBJ_REQUEST_OWN_BVECS;

        /*
         * Create object requests and determine ->bvec_count for each object
         * request.  Note that ->bvec_count sum over all object requests may
         * be greater than the number of bio_vecs in the provided bio (list)
         * or bio_vec array because when mapped, those bio_vecs can straddle
         * stripe unit boundaries.
         */
        fctx->iter = *fctx->pos;
        for (i = 0; i < num_img_extents; i++) {
                ret = ceph_file_to_extents(&rbd_dev->layout,
                                           img_extents[i].fe_off,
                                           img_extents[i].fe_len,
                                           &img_req->object_extents,
                                           alloc_object_extent, img_req,
                                           fctx->count_fn, &fctx->iter);
                if (ret)
                        return ret;
        }

        for_each_obj_request(img_req, obj_req) {
                obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
                                              sizeof(*obj_req->bvec_pos.bvecs),
                                              GFP_NOIO);
                if (!obj_req->bvec_pos.bvecs)
                        return -ENOMEM;
        }

        /*
         * Fill in each object request's private bio_vec array, splitting and
         * rearranging the provided bio_vecs in stripe unit chunks as needed.
         */
        fctx->iter = *fctx->pos;
        for (i = 0; i < num_img_extents; i++) {
                ret = ceph_iterate_extents(&rbd_dev->layout,
                                           img_extents[i].fe_off,
                                           img_extents[i].fe_len,
                                           &img_req->object_extents,
                                           fctx->copy_fn, &fctx->iter);
                if (ret)
                        return ret;
        }

        return __rbd_img_fill_request(img_req);
}

static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
                               u64 off, u64 len)
{
        struct ceph_file_extent ex = { off, len };
        union rbd_img_fill_iter dummy;
        struct rbd_img_fill_ctx fctx = {
                .pos_type = OBJ_REQUEST_NODATA,
                .pos = &dummy,
        };

        return rbd_img_fill_request(img_req, &ex, 1, &fctx);
}

static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
        struct rbd_obj_request *obj_req =
            container_of(ex, struct rbd_obj_request, ex);
        struct ceph_bio_iter *it = arg;

        dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
        obj_req->bio_pos = *it;
        ceph_bio_iter_advance(it, bytes);
}

static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
        struct rbd_obj_request *obj_req =
            container_of(ex, struct rbd_obj_request, ex);
        struct ceph_bio_iter *it = arg;

        dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
        ceph_bio_iter_advance_step(it, bytes, ({
                obj_req->bvec_count++;
        }));
}

static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
        struct rbd_obj_request *obj_req =
            container_of(ex, struct rbd_obj_request, ex);
        struct ceph_bio_iter *it = arg;

        dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
        ceph_bio_iter_advance_step(it, bytes, ({
                obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
                obj_req->bvec_pos.iter.bi_size += bv.bv_len;
        }));
}

static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
                                   struct ceph_file_extent *img_extents,
                                   u32 num_img_extents,
                                   struct ceph_bio_iter *bio_pos)
{
        struct rbd_img_fill_ctx fctx = {
                .pos_type = OBJ_REQUEST_BIO,
                .pos = (union rbd_img_fill_iter *)bio_pos,
                .set_pos_fn = set_bio_pos,
                .count_fn = count_bio_bvecs,
                .copy_fn = copy_bio_bvecs,
        };

        return rbd_img_fill_request(img_req, img_extents, num_img_extents,
                                    &fctx);
}

static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
                                 u64 off, u64 len, struct bio *bio)
{
        struct ceph_file_extent ex = { off, len };
        struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };

        return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
}

static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
        struct rbd_obj_request *obj_req =
            container_of(ex, struct rbd_obj_request, ex);
        struct ceph_bvec_iter *it = arg;

        obj_req->bvec_pos = *it;
        ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
        ceph_bvec_iter_advance(it, bytes);
}

static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
        struct rbd_obj_request *obj_req =
            container_of(ex, struct rbd_obj_request, ex);
        struct ceph_bvec_iter *it = arg;

        ceph_bvec_iter_advance_step(it, bytes, ({
                obj_req->bvec_count++;
        }));
}

static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
        struct rbd_obj_request *obj_req =
            container_of(ex, struct rbd_obj_request, ex);
        struct ceph_bvec_iter *it = arg;

        ceph_bvec_iter_advance_step(it, bytes, ({
                obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
                obj_req->bvec_pos.iter.bi_size += bv.bv_len;
        }));
}

static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
                                     struct ceph_file_extent *img_extents,
                                     u32 num_img_extents,
                                     struct ceph_bvec_iter *bvec_pos)
{
        struct rbd_img_fill_ctx fctx = {
                .pos_type = OBJ_REQUEST_BVECS,
                .pos = (union rbd_img_fill_iter *)bvec_pos,
                .set_pos_fn = set_bvec_pos,
                .count_fn = count_bvecs,
                .copy_fn = copy_bvecs,
        };

        return rbd_img_fill_request(img_req, img_extents, num_img_extents,
                                    &fctx);
}

static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
                                   struct ceph_file_extent *img_extents,
                                   u32 num_img_extents,
                                   struct bio_vec *bvecs)
{
        struct ceph_bvec_iter it = {
                .bvecs = bvecs,
                .iter = { .bi_size = ceph_file_extents_bytes(img_extents,
                                                             num_img_extents) },
        };

        return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
                                         &it);
}

static void rbd_img_handle_request_work(struct work_struct *work)
{
        struct rbd_img_request *img_req =
            container_of(work, struct rbd_img_request, work);

        rbd_img_handle_request(img_req, img_req->work_result);
}

static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
{
        INIT_WORK(&img_req->work, rbd_img_handle_request_work);
        img_req->work_result = result;
        queue_work(rbd_wq, &img_req->work);
}

static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req)
{
        struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

        if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) {
                obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
                return true;
        }

        dout("%s %p objno %llu assuming dne\n", __func__, obj_req,
             obj_req->ex.oe_objno);
        return false;
}

static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
{
        struct ceph_osd_request *osd_req;
        int ret;

        osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1);
        if (IS_ERR(osd_req))
                return PTR_ERR(osd_req);

        osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ,
                               obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
        rbd_osd_setup_data(osd_req, 0);
        rbd_osd_format_read(osd_req);

        ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
        if (ret)
                return ret;

        rbd_osd_submit(osd_req);
        return 0;
}

static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
{
        struct rbd_img_request *img_req = obj_req->img_request;
        struct rbd_img_request *child_img_req;
        int ret;

        child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
                                               OBJ_OP_READ, NULL);
        if (!child_img_req)
                return -ENOMEM;

        __set_bit(IMG_REQ_CHILD, &child_img_req->flags);
        child_img_req->obj_request = obj_req;

        dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
             obj_req);

        if (!rbd_img_is_write(img_req)) {
                switch (img_req->data_type) {
                case OBJ_REQUEST_BIO:
                        ret = __rbd_img_fill_from_bio(child_img_req,
                                                      obj_req->img_extents,
                                                      obj_req->num_img_extents,
                                                      &obj_req->bio_pos);
                        break;
                case OBJ_REQUEST_BVECS:
                case OBJ_REQUEST_OWN_BVECS:
                        ret = __rbd_img_fill_from_bvecs(child_img_req,
                                                      obj_req->img_extents,
                                                      obj_req->num_img_extents,
                                                      &obj_req->bvec_pos);
                        break;
                default:
                        BUG();
                }
        } else {
                ret = rbd_img_fill_from_bvecs(child_img_req,
                                              obj_req->img_extents,
                                              obj_req->num_img_extents,
                                              obj_req->copyup_bvecs);
        }
        if (ret) {
                rbd_img_request_put(child_img_req);
                return ret;
        }

        /* avoid parent chain recursion */
        rbd_img_schedule(child_img_req, 0);
        return 0;
}

static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
{
        struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
        int ret;

again:
        switch (obj_req->read_state) {
        case RBD_OBJ_READ_START:
                rbd_assert(!*result);

                if (!rbd_obj_may_exist(obj_req)) {
                        *result = -ENOENT;
                        obj_req->read_state = RBD_OBJ_READ_OBJECT;
                        goto again;
                }

                ret = rbd_obj_read_object(obj_req);
                if (ret) {
                        *result = ret;
                        return true;
                }
                obj_req->read_state = RBD_OBJ_READ_OBJECT;
                return false;
        case RBD_OBJ_READ_OBJECT:
                if (*result == -ENOENT && rbd_dev->parent_overlap) {
                        /* reverse map this object extent onto the parent */
                        ret = rbd_obj_calc_img_extents(obj_req, false);
                        if (ret) {
                                *result = ret;
                                return true;
                        }
                        if (obj_req->num_img_extents) {
                                ret = rbd_obj_read_from_parent(obj_req);
                                if (ret) {
                                        *result = ret;
                                        return true;
                                }
                                obj_req->read_state = RBD_OBJ_READ_PARENT;
                                return false;
                        }
                }

                /*
                 * -ENOENT means a hole in the image -- zero-fill the entire
                 * length of the request.  A short read also implies zero-fill
                 * to the end of the request.
                 */
                if (*result == -ENOENT) {
                        rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len);
                        *result = 0;
                } else if (*result >= 0) {
                        if (*result < obj_req->ex.oe_len)
                                rbd_obj_zero_range(obj_req, *result,
                                            obj_req->ex.oe_len - *result);
                        else
                                rbd_assert(*result == obj_req->ex.oe_len);
                        *result = 0;
                }
                return true;
        case RBD_OBJ_READ_PARENT:
                /*
                 * The parent image is read only up to the overlap -- zero-fill
                 * from the overlap to the end of the request.
                 */
                if (!*result) {
                        u32 obj_overlap = rbd_obj_img_extents_bytes(obj_req);

                        if (obj_overlap < obj_req->ex.oe_len)
                                rbd_obj_zero_range(obj_req, obj_overlap,
                                            obj_req->ex.oe_len - obj_overlap);
                }
                return true;
        default:
                BUG();
        }
}

static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req)
{
        struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

        if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno))
                obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;

        if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) &&
            (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) {
                dout("%s %p noop for nonexistent\n", __func__, obj_req);
                return true;
        }

        return false;
}

/*
 * Return:
 *   0 - object map update sent
 *   1 - object map update isn't needed
 *  <0 - error
 */
static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req)
{
        struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
        u8 new_state;

        if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
                return 1;

        if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
                new_state = OBJECT_PENDING;
        else
                new_state = OBJECT_EXISTS;

        return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL);
}

static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
{
        struct ceph_osd_request *osd_req;
        int num_ops = count_write_ops(obj_req);
        int which = 0;
        int ret;

        if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
                num_ops++; /* stat */

        osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
        if (IS_ERR(osd_req))
                return PTR_ERR(osd_req);

        if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
                ret = rbd_osd_setup_stat(osd_req, which++);
                if (ret)
                        return ret;
        }

        rbd_osd_setup_write_ops(osd_req, which);
        rbd_osd_format_write(osd_req);

        ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
        if (ret)
                return ret;

        rbd_osd_submit(osd_req);
        return 0;
}

/*
 * copyup_bvecs pages are never highmem pages
 */
static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
{
        struct ceph_bvec_iter it = {
                .bvecs = bvecs,
                .iter = { .bi_size = bytes },
        };

        ceph_bvec_iter_advance_step(&it, bytes, ({
                if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
                               bv.bv_len))
                        return false;
        }));
        return true;
}

#define MODS_ONLY       U32_MAX

static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req,
                                      u32 bytes)
{
        struct ceph_osd_request *osd_req;
        int ret;

        dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
        rbd_assert(bytes > 0 && bytes != MODS_ONLY);

        osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1);
        if (IS_ERR(osd_req))
                return PTR_ERR(osd_req);

        ret = rbd_osd_setup_copyup(osd_req, 0, bytes);
        if (ret)
                return ret;

        rbd_osd_format_write(osd_req);

        ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
        if (ret)
                return ret;

        rbd_osd_submit(osd_req);
        return 0;
}

static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req,
                                        u32 bytes)
{
        struct ceph_osd_request *osd_req;
        int num_ops = count_write_ops(obj_req);
        int which = 0;
        int ret;

        dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);

        if (bytes != MODS_ONLY)
                num_ops++; /* copyup */

        osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
        if (IS_ERR(osd_req))
                return PTR_ERR(osd_req);

        if (bytes != MODS_ONLY) {
                ret = rbd_osd_setup_copyup(osd_req, which++, bytes);
                if (ret)
                        return ret;
        }

        rbd_osd_setup_write_ops(osd_req, which);
        rbd_osd_format_write(osd_req);

        ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
        if (ret)
                return ret;

        rbd_osd_submit(osd_req);
        return 0;
}

static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
{
        u32 i;

        rbd_assert(!obj_req->copyup_bvecs);
        obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
        obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
                                        sizeof(*obj_req->copyup_bvecs),
                                        GFP_NOIO);
        if (!obj_req->copyup_bvecs)
                return -ENOMEM;

        for (i = 0; i < obj_req->copyup_bvec_count; i++) {
                unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);

                obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
                if (!obj_req->copyup_bvecs[i].bv_page)
                        return -ENOMEM;

                obj_req->copyup_bvecs[i].bv_offset = 0;
                obj_req->copyup_bvecs[i].bv_len = len;
                obj_overlap -= len;
        }

        rbd_assert(!obj_overlap);
        return 0;
}
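
/*
 * Sizing example (illustrative): a 4 MiB object overlap on a system
 * with 4K pages allocates calc_pages_for(0, 4M) = 1024 single-page
 * bio_vecs; only the last one may be shorter, when the overlap isn't
 * page-aligned.
 */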

/*
 * The target object doesn't exist.  Read the data for the entire
 * target object up to the overlap point (if any) from the parent,
 * so we can use it for a copyup.
 */
static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req)
{
        struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
        int ret;

        rbd_assert(obj_req->num_img_extents);
        prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
                      rbd_dev->parent_overlap);
        if (!obj_req->num_img_extents) {
                /*
                 * The overlap has become 0 (most likely because the
                 * image has been flattened).  Re-submit the original write
                 * request -- pass MODS_ONLY since the copyup isn't needed
                 * anymore.
                 */
                return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY);
        }

        ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
        if (ret)
                return ret;

        return rbd_obj_read_from_parent(obj_req);
}

static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req)
{
        struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
        struct ceph_snap_context *snapc = obj_req->img_request->snapc;
        u8 new_state;
        u32 i;
        int ret;

        rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);

        if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
                return;

        if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
                return;

        for (i = 0; i < snapc->num_snaps; i++) {
                if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) &&
                    i + 1 < snapc->num_snaps)
                        new_state = OBJECT_EXISTS_CLEAN;
                else
                        new_state = OBJECT_EXISTS;

                ret = rbd_object_map_update(obj_req, snapc->snaps[i],
                                            new_state, NULL);
                if (ret < 0) {
                        obj_req->pending.result = ret;
                        return;
                }

                rbd_assert(!ret);
                obj_req->pending.num_pending++;
        }
}

static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
{
        u32 bytes = rbd_obj_img_extents_bytes(obj_req);
        int ret;

        rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);

        /*
         * Only send non-zero copyup data to save some I/O and network
         * bandwidth -- zero copyup data is equivalent to the object not
         * existing.
         */
        if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
                bytes = 0;

        if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
                /*
                 * Send a copyup request with an empty snapshot context to
                 * deep-copyup the object through all existing snapshots.
                 * A second request with the current snapshot context will be
                 * sent for the actual modification.
                 */
                ret = rbd_obj_copyup_empty_snapc(obj_req, bytes);
                if (ret) {
                        obj_req->pending.result = ret;
                        return;
                }

                obj_req->pending.num_pending++;
                bytes = MODS_ONLY;
        }

        ret = rbd_obj_copyup_current_snapc(obj_req, bytes);
        if (ret) {
                obj_req->pending.result = ret;
                return;
        }

        obj_req->pending.num_pending++;
}

static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result)
{
        struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
        int ret;

again:
        switch (obj_req->copyup_state) {
        case RBD_OBJ_COPYUP_START:
                rbd_assert(!*result);

                ret = rbd_obj_copyup_read_parent(obj_req);
                if (ret) {
                        *result = ret;
                        return true;
                }
                if (obj_req->num_img_extents)
                        obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT;
                else
                        obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
                return false;
        case RBD_OBJ_COPYUP_READ_PARENT:
                if (*result)
                        return true;

                if (is_zero_bvecs(obj_req->copyup_bvecs,
                                  rbd_obj_img_extents_bytes(obj_req))) {
                        dout("%s %p detected zeros\n", __func__, obj_req);
                        obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS;
                }

                rbd_obj_copyup_object_maps(obj_req);
                if (!obj_req->pending.num_pending) {
                        *result = obj_req->pending.result;
                        obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS;
                        goto again;
                }
                obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS;
                return false;
        case __RBD_OBJ_COPYUP_OBJECT_MAPS:
                if (!pending_result_dec(&obj_req->pending, result))
                        return false;
                /* fall through */
        case RBD_OBJ_COPYUP_OBJECT_MAPS:
                if (*result) {
                        rbd_warn(rbd_dev, "snap object map update failed: %d",
                                 *result);
                        return true;
                }

                rbd_obj_copyup_write_object(obj_req);
                if (!obj_req->pending.num_pending) {
                        *result = obj_req->pending.result;
                        obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
                        goto again;
                }
                obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT;
                return false;
        case __RBD_OBJ_COPYUP_WRITE_OBJECT:
                if (!pending_result_dec(&obj_req->pending, result))
                        return false;
                /* fall through */
        case RBD_OBJ_COPYUP_WRITE_OBJECT:
                return true;
        default:
                BUG();
        }
}

/*
 * Return:
 *   0 - object map update sent
 *   1 - object map update isn't needed
 *  <0 - error
 */
static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req)
{
        struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
        u8 current_state = OBJECT_PENDING;

        if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
                return 1;

        if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION))
                return 1;

        return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT,
                                     &current_state);
}
3460
85b5e6d1 3461static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
8b3e1a56 3462{
793333a3 3463 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3da691bf 3464 int ret;
8b3e1a56 3465
793333a3 3466again:
3da691bf 3467 switch (obj_req->write_state) {
85b5e6d1
ID
3468 case RBD_OBJ_WRITE_START:
3469 rbd_assert(!*result);
3470
22e8bd51
ID
3471 if (rbd_obj_write_is_noop(obj_req))
3472 return true;
3473
3474 ret = rbd_obj_write_pre_object_map(obj_req);
3475 if (ret < 0) {
3476 *result = ret;
3477 return true;
3478 }
3479 obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP;
3480 if (ret > 0)
3481 goto again;
3482 return false;
3483 case RBD_OBJ_WRITE_PRE_OBJECT_MAP:
3484 if (*result) {
3485 rbd_warn(rbd_dev, "pre object map update failed: %d",
3486 *result);
3487 return true;
3488 }
85b5e6d1
ID
3489 ret = rbd_obj_write_object(obj_req);
3490 if (ret) {
3491 *result = ret;
3492 return true;
3493 }
3494 obj_req->write_state = RBD_OBJ_WRITE_OBJECT;
3495 return false;
0ad5d953 3496 case RBD_OBJ_WRITE_OBJECT:
54ab3b24 3497 if (*result == -ENOENT) {
0ad5d953 3498 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
793333a3
ID
3499 *result = 0;
3500 obj_req->copyup_state = RBD_OBJ_COPYUP_START;
3501 obj_req->write_state = __RBD_OBJ_WRITE_COPYUP;
3502 goto again;
0ad5d953 3503 }
3da691bf 3504 /*
0ad5d953
ID
 3505 * On a non-existent object:
 3506 * delete returns -ENOENT, truncate/zero returns 0
3da691bf 3507 */
0ad5d953
ID
3508 if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
3509 *result = 0;
3da691bf 3510 }
a9b67e69 3511 if (*result)
3a482501 3512 return true;
8b3e1a56 3513
793333a3
ID
3514 obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
3515 goto again;
3516 case __RBD_OBJ_WRITE_COPYUP:
3517 if (!rbd_obj_advance_copyup(obj_req, result))
3518 return false;
3519 /* fall through */
3520 case RBD_OBJ_WRITE_COPYUP:
22e8bd51 3521 if (*result) {
793333a3 3522 rbd_warn(rbd_dev, "copyup failed: %d", *result);
22e8bd51
ID
3523 return true;
3524 }
3525 ret = rbd_obj_write_post_object_map(obj_req);
3526 if (ret < 0) {
3527 *result = ret;
3528 return true;
3529 }
3530 obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP;
3531 if (ret > 0)
3532 goto again;
3533 return false;
3534 case RBD_OBJ_WRITE_POST_OBJECT_MAP:
3535 if (*result)
3536 rbd_warn(rbd_dev, "post object map update failed: %d",
3537 *result);
793333a3 3538 return true;
3da691bf 3539 default:
c6244b3b 3540 BUG();
3da691bf
ID
3541 }
3542}
02c74fba 3543
3da691bf 3544/*
0ad5d953 3545 * Return true if @obj_req is completed.
3da691bf 3546 */
54ab3b24
ID
3547static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req,
3548 int *result)
3da691bf 3549{
0ad5d953 3550 struct rbd_img_request *img_req = obj_req->img_request;
0192ce2e 3551 struct rbd_device *rbd_dev = img_req->rbd_dev;
0ad5d953
ID
3552 bool done;
3553
85b5e6d1 3554 mutex_lock(&obj_req->state_mutex);
0ad5d953 3555 if (!rbd_img_is_write(img_req))
85b5e6d1 3556 done = rbd_obj_advance_read(obj_req, result);
0ad5d953 3557 else
85b5e6d1
ID
3558 done = rbd_obj_advance_write(obj_req, result);
3559 mutex_unlock(&obj_req->state_mutex);
0ad5d953 3560
0192ce2e
ID
3561 if (done && *result) {
3562 rbd_assert(*result < 0);
3563 rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d",
3564 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
3565 obj_req->ex.oe_off, obj_req->ex.oe_len, *result);
3566 }
0ad5d953 3567 return done;
3da691bf 3568}
02c74fba 3569
0192ce2e
ID
3570/*
3571 * This is open-coded in rbd_img_handle_request() to avoid parent chain
3572 * recursion.
3573 */
3574static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result)
3575{
3576 if (__rbd_obj_handle_request(obj_req, &result))
3577 rbd_img_handle_request(obj_req->img_request, result);
3578}
3579
e1fddc8f
ID
3580static bool need_exclusive_lock(struct rbd_img_request *img_req)
3581{
3582 struct rbd_device *rbd_dev = img_req->rbd_dev;
3583
3584 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK))
3585 return false;
3586
3fe69921 3587 if (rbd_is_ro(rbd_dev))
e1fddc8f
ID
3588 return false;
3589
3590 rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
22e8bd51
ID
3591 if (rbd_dev->opts->lock_on_read ||
3592 (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
e1fddc8f
ID
3593 return true;
3594
3595 return rbd_img_is_write(img_req);
3596}
3597
637cd060 3598static bool rbd_lock_add_request(struct rbd_img_request *img_req)
e1fddc8f
ID
3599{
3600 struct rbd_device *rbd_dev = img_req->rbd_dev;
637cd060 3601 bool locked;
e1fddc8f
ID
3602
3603 lockdep_assert_held(&rbd_dev->lock_rwsem);
637cd060 3604 locked = rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED;
e1fddc8f
ID
3605 spin_lock(&rbd_dev->lock_lists_lock);
3606 rbd_assert(list_empty(&img_req->lock_item));
637cd060
ID
3607 if (!locked)
3608 list_add_tail(&img_req->lock_item, &rbd_dev->acquiring_list);
3609 else
3610 list_add_tail(&img_req->lock_item, &rbd_dev->running_list);
e1fddc8f 3611 spin_unlock(&rbd_dev->lock_lists_lock);
637cd060 3612 return locked;
e1fddc8f
ID
3613}
3614
3615static void rbd_lock_del_request(struct rbd_img_request *img_req)
3616{
3617 struct rbd_device *rbd_dev = img_req->rbd_dev;
3618 bool need_wakeup;
3619
3620 lockdep_assert_held(&rbd_dev->lock_rwsem);
3621 spin_lock(&rbd_dev->lock_lists_lock);
3622 rbd_assert(!list_empty(&img_req->lock_item));
3623 list_del_init(&img_req->lock_item);
3624 need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING &&
3625 list_empty(&rbd_dev->running_list));
3626 spin_unlock(&rbd_dev->lock_lists_lock);
3627 if (need_wakeup)
3628 complete(&rbd_dev->releasing_wait);
3629}
3630
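/*
 * Return:
 *   1 - exclusive lock isn't needed or is already held, proceed
 *   0 - request added to acquiring_list, lock acquisition scheduled
 *  <0 - error (-EROFS if mapped with the exclusive option and the
 *       lock was unexpectedly released)
 */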
637cd060
ID
3631static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
3632{
3633 struct rbd_device *rbd_dev = img_req->rbd_dev;
3634
3635 if (!need_exclusive_lock(img_req))
3636 return 1;
3637
3638 if (rbd_lock_add_request(img_req))
3639 return 1;
3640
3641 if (rbd_dev->opts->exclusive) {
3642 WARN_ON(1); /* lock got released? */
3643 return -EROFS;
3644 }
3645
3646 /*
3647 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3648 * and cancel_delayed_work() in wake_lock_waiters().
3649 */
3650 dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3651 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3652 return 0;
3653}
3654
0192ce2e 3655static void rbd_img_object_requests(struct rbd_img_request *img_req)
7114edac 3656{
0192ce2e 3657 struct rbd_obj_request *obj_req;
7114edac 3658
0192ce2e
ID
3659 rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);
3660
3661 for_each_obj_request(img_req, obj_req) {
3662 int result = 0;
a9e8ba2c 3663
0192ce2e
ID
3664 if (__rbd_obj_handle_request(obj_req, &result)) {
3665 if (result) {
3666 img_req->pending.result = result;
3667 return;
3668 }
3669 } else {
3670 img_req->pending.num_pending++;
3671 }
3672 }
8b3e1a56
AE
3673}
3674
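/*
 * Image request state machine:
 *
 *   RBD_IMG_START
 *     -> RBD_IMG_EXCLUSIVE_LOCK        (waits for the exclusive lock,
 *                                       if one is needed)
 *     -> [__]RBD_IMG_OBJECT_REQUESTS   (object requests in flight)
 *
 * __RBD_IMG_OBJECT_REQUESTS folds into RBD_IMG_OBJECT_REQUESTS via
 * pending_result_dec() once the last object request completes.
 */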
0192ce2e 3675static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
8b3e1a56 3676{
637cd060 3677 struct rbd_device *rbd_dev = img_req->rbd_dev;
3da691bf 3678 int ret;
8b3e1a56 3679
0192ce2e
ID
3680again:
3681 switch (img_req->state) {
3682 case RBD_IMG_START:
3683 rbd_assert(!*result);
8b3e1a56 3684
637cd060
ID
3685 ret = rbd_img_exclusive_lock(img_req);
3686 if (ret < 0) {
3687 *result = ret;
3da691bf
ID
3688 return true;
3689 }
637cd060
ID
3690 img_req->state = RBD_IMG_EXCLUSIVE_LOCK;
3691 if (ret > 0)
3692 goto again;
3da691bf 3693 return false;
637cd060
ID
3694 case RBD_IMG_EXCLUSIVE_LOCK:
3695 if (*result)
89a59c1c
ID
3696 return true;
3697
637cd060
ID
3698 rbd_assert(!need_exclusive_lock(img_req) ||
3699 __rbd_is_lock_owner(rbd_dev));
3700
0192ce2e
ID
3701 rbd_img_object_requests(img_req);
3702 if (!img_req->pending.num_pending) {
3703 *result = img_req->pending.result;
3704 img_req->state = RBD_IMG_OBJECT_REQUESTS;
3705 goto again;
3da691bf 3706 }
0192ce2e 3707 img_req->state = __RBD_IMG_OBJECT_REQUESTS;
3da691bf 3708 return false;
0192ce2e
ID
3709 case __RBD_IMG_OBJECT_REQUESTS:
3710 if (!pending_result_dec(&img_req->pending, result))
3711 return false;
3712 /* fall through */
3713 case RBD_IMG_OBJECT_REQUESTS:
3714 return true;
3da691bf 3715 default:
c6244b3b 3716 BUG();
3da691bf
ID
3717 }
3718}
02c74fba 3719
3da691bf 3720/*
0192ce2e 3721 * Return true if @img_req is completed.
3da691bf 3722 */
0192ce2e
ID
3723static bool __rbd_img_handle_request(struct rbd_img_request *img_req,
3724 int *result)
7114edac 3725{
0192ce2e
ID
3726 struct rbd_device *rbd_dev = img_req->rbd_dev;
3727 bool done;
7114edac 3728
e1fddc8f
ID
3729 if (need_exclusive_lock(img_req)) {
3730 down_read(&rbd_dev->lock_rwsem);
3731 mutex_lock(&img_req->state_mutex);
3732 done = rbd_img_advance(img_req, result);
3733 if (done)
3734 rbd_lock_del_request(img_req);
3735 mutex_unlock(&img_req->state_mutex);
3736 up_read(&rbd_dev->lock_rwsem);
3737 } else {
3738 mutex_lock(&img_req->state_mutex);
3739 done = rbd_img_advance(img_req, result);
3740 mutex_unlock(&img_req->state_mutex);
02c74fba 3741 }
a9e8ba2c 3742
0192ce2e
ID
3743 if (done && *result) {
3744 rbd_assert(*result < 0);
3745 rbd_warn(rbd_dev, "%s%s result %d",
3746 test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "",
3747 obj_op_name(img_req->op_type), *result);
7114edac 3748 }
0192ce2e 3749 return done;
7114edac 3750}
a9e8ba2c 3751
0192ce2e 3752static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
3da691bf 3753{
7114edac 3754again:
0192ce2e 3755 if (!__rbd_img_handle_request(img_req, &result))
7114edac 3756 return;
8b3e1a56 3757
7114edac 3758 if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
0192ce2e
ID
3759 struct rbd_obj_request *obj_req = img_req->obj_request;
3760
54ab3b24 3761 rbd_img_request_put(img_req);
0192ce2e
ID
3762 if (__rbd_obj_handle_request(obj_req, &result)) {
3763 img_req = obj_req->img_request;
3764 goto again;
3765 }
3766 } else {
3767 struct request *rq = img_req->rq;
3768
3769 rbd_img_request_put(img_req);
3770 blk_mq_end_request(rq, errno_to_blk_status(result));
7114edac 3771 }
8b3e1a56 3772}
bf0d5f50 3773
ed95b21a 3774static const struct rbd_client_id rbd_empty_cid;
b8d70035 3775
ed95b21a
ID
3776static bool rbd_cid_equal(const struct rbd_client_id *lhs,
3777 const struct rbd_client_id *rhs)
3778{
3779 return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
3780}
3781
3782static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
3783{
3784 struct rbd_client_id cid;
3785
3786 mutex_lock(&rbd_dev->watch_mutex);
3787 cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
3788 cid.handle = rbd_dev->watch_cookie;
3789 mutex_unlock(&rbd_dev->watch_mutex);
3790 return cid;
3791}
3792
3793/*
3794 * lock_rwsem must be held for write
3795 */
3796static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
3797 const struct rbd_client_id *cid)
3798{
3799 dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
3800 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
3801 cid->gid, cid->handle);
3802 rbd_dev->owner_cid = *cid; /* struct */
3803}
3804
3805static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
3806{
3807 mutex_lock(&rbd_dev->watch_mutex);
3808 sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
3809 mutex_unlock(&rbd_dev->watch_mutex);
3810}
3811
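/*
 * The lock cookie ties the exclusive lock to our watch: it is
 * RBD_LOCK_COOKIE_PREFIX, a space and the current watch cookie.
 * find_watcher() relies on this format to map a lock holder back to
 * a live watcher.
 */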
edd8ca80
FM
3812static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
3813{
3814 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3815
a2b1da09 3816 rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
edd8ca80
FM
3817 strcpy(rbd_dev->lock_cookie, cookie);
3818 rbd_set_owner_cid(rbd_dev, &cid);
3819 queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
3820}
3821
ed95b21a
ID
3822/*
3823 * lock_rwsem must be held for write
3824 */
3825static int rbd_lock(struct rbd_device *rbd_dev)
b8d70035 3826{
922dab61 3827 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
ed95b21a 3828 char cookie[32];
e627db08 3829 int ret;
b8d70035 3830
cbbfb0ff
ID
3831 WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
3832 rbd_dev->lock_cookie[0] != '\0');
52bb1f9b 3833
ed95b21a
ID
3834 format_lock_cookie(rbd_dev, cookie);
3835 ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3836 RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
3837 RBD_LOCK_TAG, "", 0);
e627db08 3838 if (ret)
ed95b21a 3839 return ret;
b8d70035 3840
edd8ca80 3841 __rbd_lock(rbd_dev, cookie);
ed95b21a 3842 return 0;
b8d70035
AE
3843}
3844
ed95b21a
ID
3845/*
3846 * lock_rwsem must be held for write
3847 */
bbead745 3848static void rbd_unlock(struct rbd_device *rbd_dev)
bb040aa0 3849{
922dab61 3850 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
bb040aa0
ID
3851 int ret;
3852
cbbfb0ff
ID
3853 WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
3854 rbd_dev->lock_cookie[0] == '\0');
bb040aa0 3855
ed95b21a 3856 ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
cbbfb0ff 3857 RBD_LOCK_NAME, rbd_dev->lock_cookie);
bbead745 3858 if (ret && ret != -ENOENT)
637cd060 3859 rbd_warn(rbd_dev, "failed to unlock header: %d", ret);
bb040aa0 3860
bbead745
ID
3861 /* treat errors as the image is unlocked */
3862 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
cbbfb0ff 3863 rbd_dev->lock_cookie[0] = '\0';
ed95b21a
ID
3864 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3865 queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
bb040aa0
ID
3866}
3867
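/*
 * *LockPayload NotifyMessage wire format, as encoded below:
 *
 *   u8   struct_v (2)
 *   u8   struct_compat (1)
 *   le32 payload length
 *   le32 notify_op
 *   le64 cid.gid
 *   le64 cid.handle
 */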
ed95b21a
ID
3868static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
3869 enum rbd_notify_op notify_op,
3870 struct page ***preply_pages,
3871 size_t *preply_len)
9969ebc5
AE
3872{
3873 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
ed95b21a 3874 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
08a79102
KS
3875 char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
3876 int buf_size = sizeof(buf);
ed95b21a 3877 void *p = buf;
9969ebc5 3878
ed95b21a 3879 dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
9969ebc5 3880
ed95b21a
ID
3881 /* encode *LockPayload NotifyMessage (op + ClientId) */
3882 ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
3883 ceph_encode_32(&p, notify_op);
3884 ceph_encode_64(&p, cid.gid);
3885 ceph_encode_64(&p, cid.handle);
8eb87565 3886
ed95b21a
ID
3887 return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
3888 &rbd_dev->header_oloc, buf, buf_size,
3889 RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
b30a01f2
ID
3890}
3891
ed95b21a
ID
3892static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
3893 enum rbd_notify_op notify_op)
b30a01f2 3894{
ed95b21a
ID
3895 struct page **reply_pages;
3896 size_t reply_len;
b30a01f2 3897
ed95b21a
ID
3898 __rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
3899 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3900}
b30a01f2 3901
ed95b21a
ID
3902static void rbd_notify_acquired_lock(struct work_struct *work)
3903{
3904 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3905 acquired_lock_work);
76756a51 3906
ed95b21a 3907 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
c525f036
ID
3908}
3909
ed95b21a 3910static void rbd_notify_released_lock(struct work_struct *work)
c525f036 3911{
ed95b21a
ID
3912 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3913 released_lock_work);
811c6688 3914
ed95b21a 3915 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
fca27065
ID
3916}
3917
ed95b21a 3918static int rbd_request_lock(struct rbd_device *rbd_dev)
36be9a76 3919{
ed95b21a
ID
3920 struct page **reply_pages;
3921 size_t reply_len;
3922 bool lock_owner_responded = false;
36be9a76
AE
3923 int ret;
3924
ed95b21a 3925 dout("%s rbd_dev %p\n", __func__, rbd_dev);
36be9a76 3926
ed95b21a
ID
3927 ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
3928 &reply_pages, &reply_len);
3929 if (ret && ret != -ETIMEDOUT) {
3930 rbd_warn(rbd_dev, "failed to request lock: %d", ret);
36be9a76 3931 goto out;
ed95b21a 3932 }
36be9a76 3933
ed95b21a
ID
3934 if (reply_len > 0 && reply_len <= PAGE_SIZE) {
3935 void *p = page_address(reply_pages[0]);
3936 void *const end = p + reply_len;
3937 u32 n;
36be9a76 3938
ed95b21a
ID
3939 ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
3940 while (n--) {
3941 u8 struct_v;
3942 u32 len;
36be9a76 3943
ed95b21a
ID
3944 ceph_decode_need(&p, end, 8 + 8, e_inval);
3945 p += 8 + 8; /* skip gid and cookie */
04017e29 3946
ed95b21a
ID
3947 ceph_decode_32_safe(&p, end, len, e_inval);
3948 if (!len)
3949 continue;
3950
3951 if (lock_owner_responded) {
3952 rbd_warn(rbd_dev,
3953 "duplicate lock owners detected");
3954 ret = -EIO;
3955 goto out;
3956 }
3957
3958 lock_owner_responded = true;
3959 ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
3960 &struct_v, &len);
3961 if (ret) {
3962 rbd_warn(rbd_dev,
3963 "failed to decode ResponseMessage: %d",
3964 ret);
3965 goto e_inval;
3966 }
3967
3968 ret = ceph_decode_32(&p);
3969 }
3970 }
3971
3972 if (!lock_owner_responded) {
3973 rbd_warn(rbd_dev, "no lock owners detected");
3974 ret = -ETIMEDOUT;
3975 }
3976
3977out:
3978 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3979 return ret;
3980
3981e_inval:
3982 ret = -EINVAL;
3983 goto out;
3984}
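/*
 * In rbd_request_lock() above, only the lock owner replies with a
 * non-empty payload: ResponseMessage(0) means the owner will release
 * the lock, -EROFS means it refuses to (it was mapped with the
 * exclusive option).  Empty-payload acks from other watchers are
 * skipped, and two non-empty replies indicate duplicate lock owners.
 */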
3985
637cd060
ID
3986/*
 3987 * Wake everything that is waiting for the lock: either image
 3988 * request state machine(s) or rbd_add_acquire_lock() (i.e. "rbd map").
3989 */
3990static void wake_lock_waiters(struct rbd_device *rbd_dev, int result)
ed95b21a 3991{
637cd060
ID
3992 struct rbd_img_request *img_req;
3993
3994 dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
d9b9c893 3995 lockdep_assert_held_write(&rbd_dev->lock_rwsem);
ed95b21a
ID
3996
3997 cancel_delayed_work(&rbd_dev->lock_dwork);
637cd060
ID
3998 if (!completion_done(&rbd_dev->acquire_wait)) {
3999 rbd_assert(list_empty(&rbd_dev->acquiring_list) &&
4000 list_empty(&rbd_dev->running_list));
4001 rbd_dev->acquire_err = result;
4002 complete_all(&rbd_dev->acquire_wait);
4003 return;
4004 }
4005
4006 list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) {
4007 mutex_lock(&img_req->state_mutex);
4008 rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK);
4009 rbd_img_schedule(img_req, result);
4010 mutex_unlock(&img_req->state_mutex);
4011 }
4012
4013 list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list);
ed95b21a
ID
4014}
4015
4016static int get_lock_owner_info(struct rbd_device *rbd_dev,
4017 struct ceph_locker **lockers, u32 *num_lockers)
4018{
4019 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4020 u8 lock_type;
4021 char *lock_tag;
4022 int ret;
4023
4024 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4025
4026 ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
4027 &rbd_dev->header_oloc, RBD_LOCK_NAME,
4028 &lock_type, &lock_tag, lockers, num_lockers);
4029 if (ret)
4030 return ret;
4031
4032 if (*num_lockers == 0) {
4033 dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
4034 goto out;
4035 }
4036
4037 if (strcmp(lock_tag, RBD_LOCK_TAG)) {
4038 rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
4039 lock_tag);
4040 ret = -EBUSY;
4041 goto out;
4042 }
4043
4044 if (lock_type == CEPH_CLS_LOCK_SHARED) {
4045 rbd_warn(rbd_dev, "shared lock type detected");
4046 ret = -EBUSY;
4047 goto out;
4048 }
4049
4050 if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
4051 strlen(RBD_LOCK_COOKIE_PREFIX))) {
4052 rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
4053 (*lockers)[0].id.cookie);
4054 ret = -EBUSY;
4055 goto out;
4056 }
4057
4058out:
4059 kfree(lock_tag);
4060 return ret;
4061}
4062
4063static int find_watcher(struct rbd_device *rbd_dev,
4064 const struct ceph_locker *locker)
4065{
4066 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4067 struct ceph_watch_item *watchers;
4068 u32 num_watchers;
4069 u64 cookie;
4070 int i;
4071 int ret;
4072
4073 ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
4074 &rbd_dev->header_oloc, &watchers,
4075 &num_watchers);
4076 if (ret)
4077 return ret;
4078
4079 sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
4080 for (i = 0; i < num_watchers; i++) {
4081 if (!memcmp(&watchers[i].addr, &locker->info.addr,
4082 sizeof(locker->info.addr)) &&
4083 watchers[i].cookie == cookie) {
4084 struct rbd_client_id cid = {
4085 .gid = le64_to_cpu(watchers[i].name.num),
4086 .handle = cookie,
4087 };
4088
4089 dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
4090 rbd_dev, cid.gid, cid.handle);
4091 rbd_set_owner_cid(rbd_dev, &cid);
4092 ret = 1;
4093 goto out;
4094 }
4095 }
4096
4097 dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
4098 ret = 0;
4099out:
4100 kfree(watchers);
4101 return ret;
4102}
4103
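/*
 * rbd_try_lock() below loops until the lock is acquired or a hard
 * error occurs.  If the current holder has no watch established
 * (find_watcher() returns 0), it is presumed dead: its address is
 * blacklisted and the lock is broken before retrying.
 */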
4104/*
4105 * lock_rwsem must be held for write
4106 */
4107static int rbd_try_lock(struct rbd_device *rbd_dev)
4108{
4109 struct ceph_client *client = rbd_dev->rbd_client->client;
4110 struct ceph_locker *lockers;
4111 u32 num_lockers;
4112 int ret;
4113
4114 for (;;) {
4115 ret = rbd_lock(rbd_dev);
4116 if (ret != -EBUSY)
4117 return ret;
4118
4119 /* determine if the current lock holder is still alive */
4120 ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
4121 if (ret)
4122 return ret;
4123
4124 if (num_lockers == 0)
4125 goto again;
4126
4127 ret = find_watcher(rbd_dev, lockers);
637cd060
ID
4128 if (ret)
4129 goto out; /* request lock or error */
ed95b21a 4130
22e8bd51 4131 rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
ed95b21a
ID
4132 ENTITY_NAME(lockers[0].id.name));
4133
4134 ret = ceph_monc_blacklist_add(&client->monc,
4135 &lockers[0].info.addr);
4136 if (ret) {
4137 rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
4138 ENTITY_NAME(lockers[0].id.name), ret);
4139 goto out;
4140 }
4141
4142 ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
4143 &rbd_dev->header_oloc, RBD_LOCK_NAME,
4144 lockers[0].id.cookie,
4145 &lockers[0].id.name);
4146 if (ret && ret != -ENOENT)
4147 goto out;
4148
4149again:
4150 ceph_free_lockers(lockers, num_lockers);
4151 }
4152
4153out:
4154 ceph_free_lockers(lockers, num_lockers);
4155 return ret;
4156}
4157
22e8bd51
ID
4158static int rbd_post_acquire_action(struct rbd_device *rbd_dev)
4159{
4160 int ret;
4161
4162 if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) {
4163 ret = rbd_object_map_open(rbd_dev);
4164 if (ret)
4165 return ret;
4166 }
4167
4168 return 0;
4169}
4170
ed95b21a 4171/*
637cd060
ID
4172 * Return:
4173 * 0 - lock acquired
4174 * 1 - caller should call rbd_request_lock()
4175 * <0 - error
ed95b21a 4176 */
637cd060 4177static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
ed95b21a 4178{
637cd060 4179 int ret;
ed95b21a
ID
4180
4181 down_read(&rbd_dev->lock_rwsem);
4182 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
4183 rbd_dev->lock_state);
4184 if (__rbd_is_lock_owner(rbd_dev)) {
ed95b21a 4185 up_read(&rbd_dev->lock_rwsem);
637cd060 4186 return 0;
ed95b21a
ID
4187 }
4188
4189 up_read(&rbd_dev->lock_rwsem);
4190 down_write(&rbd_dev->lock_rwsem);
4191 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
4192 rbd_dev->lock_state);
637cd060
ID
4193 if (__rbd_is_lock_owner(rbd_dev)) {
4194 up_write(&rbd_dev->lock_rwsem);
4195 return 0;
ed95b21a
ID
4196 }
4197
637cd060
ID
4198 ret = rbd_try_lock(rbd_dev);
4199 if (ret < 0) {
4200 rbd_warn(rbd_dev, "failed to lock header: %d", ret);
4201 if (ret == -EBLACKLISTED)
4202 goto out;
4203
4204 ret = 1; /* request lock anyway */
4205 }
4206 if (ret > 0) {
4207 up_write(&rbd_dev->lock_rwsem);
4208 return ret;
4209 }
4210
4211 rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED);
4212 rbd_assert(list_empty(&rbd_dev->running_list));
4213
22e8bd51
ID
4214 ret = rbd_post_acquire_action(rbd_dev);
4215 if (ret) {
4216 rbd_warn(rbd_dev, "post-acquire action failed: %d", ret);
4217 /*
4218 * Can't stay in RBD_LOCK_STATE_LOCKED because
4219 * rbd_lock_add_request() would let the request through,
4220 * assuming that e.g. object map is locked and loaded.
4221 */
4222 rbd_unlock(rbd_dev);
ed95b21a
ID
4223 }
4224
637cd060
ID
4225out:
4226 wake_lock_waiters(rbd_dev, ret);
ed95b21a 4227 up_write(&rbd_dev->lock_rwsem);
637cd060 4228 return ret;
ed95b21a
ID
4229}
4230
4231static void rbd_acquire_lock(struct work_struct *work)
4232{
4233 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
4234 struct rbd_device, lock_dwork);
637cd060 4235 int ret;
ed95b21a
ID
4236
4237 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4238again:
637cd060
ID
4239 ret = rbd_try_acquire_lock(rbd_dev);
4240 if (ret <= 0) {
4241 dout("%s rbd_dev %p ret %d - done\n", __func__, rbd_dev, ret);
ed95b21a
ID
4242 return;
4243 }
4244
4245 ret = rbd_request_lock(rbd_dev);
4246 if (ret == -ETIMEDOUT) {
4247 goto again; /* treat this as a dead client */
e010dd0a
ID
4248 } else if (ret == -EROFS) {
4249 rbd_warn(rbd_dev, "peer will not release lock");
637cd060
ID
4250 down_write(&rbd_dev->lock_rwsem);
4251 wake_lock_waiters(rbd_dev, ret);
4252 up_write(&rbd_dev->lock_rwsem);
ed95b21a
ID
4253 } else if (ret < 0) {
4254 rbd_warn(rbd_dev, "error requesting lock: %d", ret);
4255 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4256 RBD_RETRY_DELAY);
4257 } else {
4258 /*
4259 * lock owner acked, but resend if we don't see them
4260 * release the lock
4261 */
6b0a8774 4262 dout("%s rbd_dev %p requeuing lock_dwork\n", __func__,
ed95b21a
ID
4263 rbd_dev);
4264 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4265 msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
4266 }
4267}
4268
a2b1da09 4269static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
ed95b21a 4270{
e1fddc8f
ID
4271 bool need_wait;
4272
a2b1da09 4273 dout("%s rbd_dev %p\n", __func__, rbd_dev);
d9b9c893 4274 lockdep_assert_held_write(&rbd_dev->lock_rwsem);
a2b1da09 4275
ed95b21a
ID
4276 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
4277 return false;
4278
52bb1f9b 4279 /*
ed95b21a 4280 * Ensure that all in-flight IO is flushed.
52bb1f9b 4281 */
e1fddc8f
ID
4282 rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
4283 rbd_assert(!completion_done(&rbd_dev->releasing_wait));
4284 need_wait = !list_empty(&rbd_dev->running_list);
4285 downgrade_write(&rbd_dev->lock_rwsem);
4286 if (need_wait)
4287 wait_for_completion(&rbd_dev->releasing_wait);
ed95b21a
ID
4288 up_read(&rbd_dev->lock_rwsem);
4289
4290 down_write(&rbd_dev->lock_rwsem);
ed95b21a
ID
4291 if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
4292 return false;
4293
e1fddc8f 4294 rbd_assert(list_empty(&rbd_dev->running_list));
a2b1da09
ID
4295 return true;
4296}
4297
22e8bd51
ID
4298static void rbd_pre_release_action(struct rbd_device *rbd_dev)
4299{
4300 if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)
4301 rbd_object_map_close(rbd_dev);
4302}
4303
e1fddc8f
ID
4304static void __rbd_release_lock(struct rbd_device *rbd_dev)
4305{
4306 rbd_assert(list_empty(&rbd_dev->running_list));
4307
22e8bd51 4308 rbd_pre_release_action(rbd_dev);
bbead745 4309 rbd_unlock(rbd_dev);
e1fddc8f
ID
4310}
4311
a2b1da09
ID
4312/*
4313 * lock_rwsem must be held for write
4314 */
4315static void rbd_release_lock(struct rbd_device *rbd_dev)
4316{
4317 if (!rbd_quiesce_lock(rbd_dev))
4318 return;
4319
e1fddc8f 4320 __rbd_release_lock(rbd_dev);
a2b1da09 4321
bbead745
ID
4322 /*
4323 * Give others a chance to grab the lock - we would re-acquire
637cd060
ID
4324 * almost immediately if we got new IO while draining the running
4325 * list otherwise. We need to ack our own notifications, so this
4326 * lock_dwork will be requeued from rbd_handle_released_lock() by
4327 * way of maybe_kick_acquire().
bbead745
ID
4328 */
4329 cancel_delayed_work(&rbd_dev->lock_dwork);
ed95b21a
ID
4330}
4331
4332static void rbd_release_lock_work(struct work_struct *work)
4333{
4334 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
4335 unlock_work);
4336
4337 down_write(&rbd_dev->lock_rwsem);
4338 rbd_release_lock(rbd_dev);
4339 up_write(&rbd_dev->lock_rwsem);
4340}
4341
637cd060
ID
4342static void maybe_kick_acquire(struct rbd_device *rbd_dev)
4343{
4344 bool have_requests;
4345
4346 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4347 if (__rbd_is_lock_owner(rbd_dev))
4348 return;
4349
4350 spin_lock(&rbd_dev->lock_lists_lock);
4351 have_requests = !list_empty(&rbd_dev->acquiring_list);
4352 spin_unlock(&rbd_dev->lock_lists_lock);
4353 if (have_requests || delayed_work_pending(&rbd_dev->lock_dwork)) {
4354 dout("%s rbd_dev %p kicking lock_dwork\n", __func__, rbd_dev);
4355 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
4356 }
4357}
4358
ed95b21a
ID
4359static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
4360 void **p)
4361{
4362 struct rbd_client_id cid = { 0 };
4363
4364 if (struct_v >= 2) {
4365 cid.gid = ceph_decode_64(p);
4366 cid.handle = ceph_decode_64(p);
4367 }
4368
4369 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4370 cid.handle);
4371 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
4372 down_write(&rbd_dev->lock_rwsem);
4373 if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
4374 /*
4375 * we already know that the remote client is
4376 * the owner
4377 */
4378 up_write(&rbd_dev->lock_rwsem);
4379 return;
4380 }
4381
4382 rbd_set_owner_cid(rbd_dev, &cid);
4383 downgrade_write(&rbd_dev->lock_rwsem);
4384 } else {
4385 down_read(&rbd_dev->lock_rwsem);
4386 }
4387
637cd060 4388 maybe_kick_acquire(rbd_dev);
ed95b21a
ID
4389 up_read(&rbd_dev->lock_rwsem);
4390}
4391
4392static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
4393 void **p)
4394{
4395 struct rbd_client_id cid = { 0 };
4396
4397 if (struct_v >= 2) {
4398 cid.gid = ceph_decode_64(p);
4399 cid.handle = ceph_decode_64(p);
4400 }
4401
4402 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4403 cid.handle);
4404 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
4405 down_write(&rbd_dev->lock_rwsem);
4406 if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
4407 dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
4408 __func__, rbd_dev, cid.gid, cid.handle,
4409 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
4410 up_write(&rbd_dev->lock_rwsem);
4411 return;
4412 }
4413
4414 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
4415 downgrade_write(&rbd_dev->lock_rwsem);
4416 } else {
4417 down_read(&rbd_dev->lock_rwsem);
4418 }
4419
637cd060 4420 maybe_kick_acquire(rbd_dev);
ed95b21a
ID
4421 up_read(&rbd_dev->lock_rwsem);
4422}
4423
3b77faa0
ID
4424/*
4425 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
4426 * ResponseMessage is needed.
4427 */
4428static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
4429 void **p)
ed95b21a
ID
4430{
4431 struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
4432 struct rbd_client_id cid = { 0 };
3b77faa0 4433 int result = 1;
ed95b21a
ID
4434
4435 if (struct_v >= 2) {
4436 cid.gid = ceph_decode_64(p);
4437 cid.handle = ceph_decode_64(p);
4438 }
4439
4440 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4441 cid.handle);
4442 if (rbd_cid_equal(&cid, &my_cid))
3b77faa0 4443 return result;
ed95b21a
ID
4444
4445 down_read(&rbd_dev->lock_rwsem);
3b77faa0
ID
4446 if (__rbd_is_lock_owner(rbd_dev)) {
4447 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
4448 rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
4449 goto out_unlock;
4450
4451 /*
4452 * encode ResponseMessage(0) so the peer can detect
4453 * a missing owner
4454 */
4455 result = 0;
4456
4457 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
e010dd0a
ID
4458 if (!rbd_dev->opts->exclusive) {
4459 dout("%s rbd_dev %p queueing unlock_work\n",
4460 __func__, rbd_dev);
4461 queue_work(rbd_dev->task_wq,
4462 &rbd_dev->unlock_work);
4463 } else {
4464 /* refuse to release the lock */
4465 result = -EROFS;
4466 }
ed95b21a
ID
4467 }
4468 }
3b77faa0
ID
4469
4470out_unlock:
ed95b21a 4471 up_read(&rbd_dev->lock_rwsem);
3b77faa0 4472 return result;
ed95b21a
ID
4473}
4474
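/*
 * ResponseMessage wire format, as encoded below when a result is
 * supplied:
 *
 *   u8   struct_v (1)
 *   u8   struct_compat (1)
 *   le32 payload length
 *   le32 result
 *
 * A NULL result turns the notify_ack into a bare ack with an empty
 * payload.
 */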
4475static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
4476 u64 notify_id, u64 cookie, s32 *result)
4477{
4478 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
08a79102
KS
4479 char buf[4 + CEPH_ENCODING_START_BLK_LEN];
4480 int buf_size = sizeof(buf);
ed95b21a
ID
4481 int ret;
4482
4483 if (result) {
4484 void *p = buf;
4485
4486 /* encode ResponseMessage */
4487 ceph_start_encoding(&p, 1, 1,
4488 buf_size - CEPH_ENCODING_START_BLK_LEN);
4489 ceph_encode_32(&p, *result);
4490 } else {
4491 buf_size = 0;
4492 }
b8d70035 4493
922dab61
ID
4494 ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
4495 &rbd_dev->header_oloc, notify_id, cookie,
ed95b21a 4496 buf, buf_size);
52bb1f9b 4497 if (ret)
ed95b21a
ID
4498 rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
4499}
4500
4501static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
4502 u64 cookie)
4503{
4504 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4505 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
4506}
4507
4508static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
4509 u64 notify_id, u64 cookie, s32 result)
4510{
4511 dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
4512 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
4513}
4514
4515static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
4516 u64 notifier_id, void *data, size_t data_len)
4517{
4518 struct rbd_device *rbd_dev = arg;
4519 void *p = data;
4520 void *const end = p + data_len;
d4c2269b 4521 u8 struct_v = 0;
ed95b21a
ID
4522 u32 len;
4523 u32 notify_op;
4524 int ret;
4525
4526 dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
4527 __func__, rbd_dev, cookie, notify_id, data_len);
4528 if (data_len) {
4529 ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
4530 &struct_v, &len);
4531 if (ret) {
4532 rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
4533 ret);
4534 return;
4535 }
4536
4537 notify_op = ceph_decode_32(&p);
4538 } else {
4539 /* legacy notification for header updates */
4540 notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
4541 len = 0;
4542 }
4543
4544 dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
4545 switch (notify_op) {
4546 case RBD_NOTIFY_OP_ACQUIRED_LOCK:
4547 rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
4548 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4549 break;
4550 case RBD_NOTIFY_OP_RELEASED_LOCK:
4551 rbd_handle_released_lock(rbd_dev, struct_v, &p);
4552 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4553 break;
4554 case RBD_NOTIFY_OP_REQUEST_LOCK:
3b77faa0
ID
4555 ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
4556 if (ret <= 0)
ed95b21a 4557 rbd_acknowledge_notify_result(rbd_dev, notify_id,
3b77faa0 4558 cookie, ret);
ed95b21a
ID
4559 else
4560 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4561 break;
4562 case RBD_NOTIFY_OP_HEADER_UPDATE:
4563 ret = rbd_dev_refresh(rbd_dev);
4564 if (ret)
4565 rbd_warn(rbd_dev, "refresh failed: %d", ret);
4566
4567 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4568 break;
4569 default:
4570 if (rbd_is_lock_owner(rbd_dev))
4571 rbd_acknowledge_notify_result(rbd_dev, notify_id,
4572 cookie, -EOPNOTSUPP);
4573 else
4574 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4575 break;
4576 }
b8d70035
AE
4577}
4578
99d16943
ID
4579static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
4580
922dab61 4581static void rbd_watch_errcb(void *arg, u64 cookie, int err)
bb040aa0 4582{
922dab61 4583 struct rbd_device *rbd_dev = arg;
bb040aa0 4584
922dab61 4585 rbd_warn(rbd_dev, "encountered watch error: %d", err);
bb040aa0 4586
ed95b21a
ID
4587 down_write(&rbd_dev->lock_rwsem);
4588 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
4589 up_write(&rbd_dev->lock_rwsem);
4590
99d16943
ID
4591 mutex_lock(&rbd_dev->watch_mutex);
4592 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
4593 __rbd_unregister_watch(rbd_dev);
4594 rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
bb040aa0 4595
99d16943 4596 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
bb040aa0 4597 }
99d16943 4598 mutex_unlock(&rbd_dev->watch_mutex);
bb040aa0
ID
4599}
4600
9969ebc5 4601/*
99d16943 4602 * watch_mutex must be locked
9969ebc5 4603 */
99d16943 4604static int __rbd_register_watch(struct rbd_device *rbd_dev)
9969ebc5
AE
4605{
4606 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
922dab61 4607 struct ceph_osd_linger_request *handle;
9969ebc5 4608
922dab61 4609 rbd_assert(!rbd_dev->watch_handle);
99d16943 4610 dout("%s rbd_dev %p\n", __func__, rbd_dev);
9969ebc5 4611
922dab61
ID
4612 handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
4613 &rbd_dev->header_oloc, rbd_watch_cb,
4614 rbd_watch_errcb, rbd_dev);
4615 if (IS_ERR(handle))
4616 return PTR_ERR(handle);
8eb87565 4617
922dab61 4618 rbd_dev->watch_handle = handle;
b30a01f2 4619 return 0;
b30a01f2
ID
4620}
4621
99d16943
ID
4622/*
4623 * watch_mutex must be locked
4624 */
4625static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
b30a01f2 4626{
922dab61
ID
4627 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4628 int ret;
b30a01f2 4629
99d16943
ID
4630 rbd_assert(rbd_dev->watch_handle);
4631 dout("%s rbd_dev %p\n", __func__, rbd_dev);
b30a01f2 4632
922dab61
ID
4633 ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
4634 if (ret)
4635 rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
76756a51 4636
922dab61 4637 rbd_dev->watch_handle = NULL;
c525f036
ID
4638}
4639
99d16943
ID
4640static int rbd_register_watch(struct rbd_device *rbd_dev)
4641{
4642 int ret;
4643
4644 mutex_lock(&rbd_dev->watch_mutex);
4645 rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
4646 ret = __rbd_register_watch(rbd_dev);
4647 if (ret)
4648 goto out;
4649
4650 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
4651 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
4652
4653out:
4654 mutex_unlock(&rbd_dev->watch_mutex);
4655 return ret;
4656}
4657
4658static void cancel_tasks_sync(struct rbd_device *rbd_dev)
c525f036 4659{
99d16943
ID
4660 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4661
ed95b21a
ID
4662 cancel_work_sync(&rbd_dev->acquired_lock_work);
4663 cancel_work_sync(&rbd_dev->released_lock_work);
4664 cancel_delayed_work_sync(&rbd_dev->lock_dwork);
4665 cancel_work_sync(&rbd_dev->unlock_work);
99d16943
ID
4666}
4667
4668static void rbd_unregister_watch(struct rbd_device *rbd_dev)
4669{
4670 cancel_tasks_sync(rbd_dev);
4671
4672 mutex_lock(&rbd_dev->watch_mutex);
4673 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
4674 __rbd_unregister_watch(rbd_dev);
4675 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
4676 mutex_unlock(&rbd_dev->watch_mutex);
811c6688 4677
23edca86 4678 cancel_delayed_work_sync(&rbd_dev->watch_dwork);
811c6688 4679 ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
fca27065
ID
4680}
4681
14bb211d
ID
4682/*
4683 * lock_rwsem must be held for write
4684 */
4685static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
4686{
4687 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4688 char cookie[32];
4689 int ret;
4690
a2b1da09
ID
4691 if (!rbd_quiesce_lock(rbd_dev))
4692 return;
14bb211d
ID
4693
4694 format_lock_cookie(rbd_dev, cookie);
4695 ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
4696 &rbd_dev->header_oloc, RBD_LOCK_NAME,
4697 CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
4698 RBD_LOCK_TAG, cookie);
4699 if (ret) {
4700 if (ret != -EOPNOTSUPP)
4701 rbd_warn(rbd_dev, "failed to update lock cookie: %d",
4702 ret);
4703
4704 /*
4705 * Lock cookie cannot be updated on older OSDs, so do
4706 * a manual release and queue an acquire.
4707 */
e1fddc8f 4708 __rbd_release_lock(rbd_dev);
a2b1da09 4709 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
14bb211d 4710 } else {
edd8ca80 4711 __rbd_lock(rbd_dev, cookie);
637cd060 4712 wake_lock_waiters(rbd_dev, 0);
14bb211d
ID
4713 }
4714}
4715
99d16943
ID
4716static void rbd_reregister_watch(struct work_struct *work)
4717{
4718 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
4719 struct rbd_device, watch_dwork);
4720 int ret;
4721
4722 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4723
4724 mutex_lock(&rbd_dev->watch_mutex);
87c0fded
ID
4725 if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
4726 mutex_unlock(&rbd_dev->watch_mutex);
14bb211d 4727 return;
87c0fded 4728 }
99d16943
ID
4729
4730 ret = __rbd_register_watch(rbd_dev);
4731 if (ret) {
4732 rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
637cd060 4733 if (ret != -EBLACKLISTED && ret != -ENOENT) {
99d16943
ID
4734 queue_delayed_work(rbd_dev->task_wq,
4735 &rbd_dev->watch_dwork,
4736 RBD_RETRY_DELAY);
637cd060
ID
4737 mutex_unlock(&rbd_dev->watch_mutex);
4738 return;
87c0fded 4739 }
637cd060 4740
87c0fded 4741 mutex_unlock(&rbd_dev->watch_mutex);
637cd060
ID
4742 down_write(&rbd_dev->lock_rwsem);
4743 wake_lock_waiters(rbd_dev, ret);
4744 up_write(&rbd_dev->lock_rwsem);
14bb211d 4745 return;
99d16943
ID
4746 }
4747
4748 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
4749 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
4750 mutex_unlock(&rbd_dev->watch_mutex);
4751
14bb211d
ID
4752 down_write(&rbd_dev->lock_rwsem);
4753 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
4754 rbd_reacquire_lock(rbd_dev);
4755 up_write(&rbd_dev->lock_rwsem);
4756
99d16943
ID
4757 ret = rbd_dev_refresh(rbd_dev);
4758 if (ret)
f6870cc9 4759 rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
99d16943
ID
4760}
4761
36be9a76 4762/*
f40eb349
AE
 4763 * Synchronous osd object method call. Returns the number of bytes
 4764 * returned in the inbound buffer, or a negative error code.
36be9a76
AE
4765 */
4766static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
ecd4a68a
ID
4767 struct ceph_object_id *oid,
4768 struct ceph_object_locator *oloc,
36be9a76 4769 const char *method_name,
4157976b 4770 const void *outbound,
36be9a76 4771 size_t outbound_size,
4157976b 4772 void *inbound,
e2a58ee5 4773 size_t inbound_size)
36be9a76 4774{
ecd4a68a
ID
4775 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4776 struct page *req_page = NULL;
4777 struct page *reply_page;
36be9a76
AE
4778 int ret;
4779
4780 /*
6010a451
AE
 4781 * Method calls are ultimately read operations. The result
 4782 * should be placed into the inbound buffer provided. They
 4783 * may also supply outbound data--parameters for the object
 4784 * method. Currently, if this is present, it will be a
 4785 * snapshot id.
36be9a76 4786 */
ecd4a68a
ID
4787 if (outbound) {
4788 if (outbound_size > PAGE_SIZE)
4789 return -E2BIG;
36be9a76 4790
ecd4a68a
ID
4791 req_page = alloc_page(GFP_KERNEL);
4792 if (!req_page)
4793 return -ENOMEM;
04017e29 4794
ecd4a68a 4795 memcpy(page_address(req_page), outbound, outbound_size);
04017e29 4796 }
36be9a76 4797
ecd4a68a
ID
4798 reply_page = alloc_page(GFP_KERNEL);
4799 if (!reply_page) {
4800 if (req_page)
4801 __free_page(req_page);
4802 return -ENOMEM;
4803 }
57385b51 4804
ecd4a68a
ID
4805 ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
4806 CEPH_OSD_FLAG_READ, req_page, outbound_size,
68ada915 4807 &reply_page, &inbound_size);
ecd4a68a
ID
4808 if (!ret) {
4809 memcpy(inbound, page_address(reply_page), inbound_size);
4810 ret = inbound_size;
4811 }
36be9a76 4812
ecd4a68a
ID
4813 if (req_page)
4814 __free_page(req_page);
4815 __free_page(reply_page);
36be9a76
AE
4816 return ret;
4817}
4818
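/*
 * Illustrative use of rbd_obj_method_sync() (a sketch -- the method
 * name and reply size are assumptions for the example, not taken
 * from this file): fetching an 8-byte value keyed by snapshot id
 * could look like
 *
 *	__le64 snapid = cpu_to_le64(CEPH_NOSNAP);
 *	__le64 size;
 *	int ret;
 *
 *	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
 *				  &rbd_dev->header_oloc, "get_size",
 *				  &snapid, sizeof(snapid),
 *				  &size, sizeof(size));
 *	if (ret < 0)
 *		return ret;
 *	if (ret < (int)sizeof(size))
 *		return -ERANGE;
 */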
7ad18afa 4819static void rbd_queue_workfn(struct work_struct *work)
bf0d5f50 4820{
7ad18afa
CH
4821 struct request *rq = blk_mq_rq_from_pdu(work);
4822 struct rbd_device *rbd_dev = rq->q->queuedata;
bc1ecc65 4823 struct rbd_img_request *img_request;
4e752f0a 4824 struct ceph_snap_context *snapc = NULL;
bc1ecc65
ID
4825 u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
4826 u64 length = blk_rq_bytes(rq);
6d2940c8 4827 enum obj_operation_type op_type;
4e752f0a 4828 u64 mapping_size;
bf0d5f50
AE
4829 int result;
4830
aebf526b
CH
4831 switch (req_op(rq)) {
4832 case REQ_OP_DISCARD:
90e98c52 4833 op_type = OBJ_OP_DISCARD;
aebf526b 4834 break;
6484cbe9
ID
4835 case REQ_OP_WRITE_ZEROES:
4836 op_type = OBJ_OP_ZEROOUT;
4837 break;
aebf526b 4838 case REQ_OP_WRITE:
6d2940c8 4839 op_type = OBJ_OP_WRITE;
aebf526b
CH
4840 break;
4841 case REQ_OP_READ:
6d2940c8 4842 op_type = OBJ_OP_READ;
aebf526b
CH
4843 break;
4844 default:
4845 dout("%s: non-fs request type %d\n", __func__, req_op(rq));
4846 result = -EIO;
4847 goto err;
4848 }
6d2940c8 4849
bc1ecc65 4850 /* Ignore/skip any zero-length requests */
bf0d5f50 4851
bc1ecc65
ID
4852 if (!length) {
4853 dout("%s: zero-length request\n", __func__);
4854 result = 0;
4855 goto err_rq;
4856 }
bf0d5f50 4857
b948ad78
ID
4858 if (op_type != OBJ_OP_READ) {
4859 if (rbd_is_ro(rbd_dev)) {
4860 rbd_warn(rbd_dev, "%s on read-only mapping",
4861 obj_op_name(op_type));
4862 result = -EIO;
4863 goto err;
4864 }
4865 rbd_assert(!rbd_is_snap(rbd_dev));
b91a7bdc 4866 }
4dda41d3 4867
bc1ecc65
ID
4868 if (offset && length > U64_MAX - offset + 1) {
4869 rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
4870 length);
4871 result = -EINVAL;
4872 goto err_rq; /* Shouldn't happen */
4873 }
4dda41d3 4874
7ad18afa
CH
4875 blk_mq_start_request(rq);
4876
4e752f0a
JD
4877 down_read(&rbd_dev->header_rwsem);
4878 mapping_size = rbd_dev->mapping.size;
6d2940c8 4879 if (op_type != OBJ_OP_READ) {
4e752f0a
JD
4880 snapc = rbd_dev->header.snapc;
4881 ceph_get_snap_context(snapc);
4882 }
4883 up_read(&rbd_dev->header_rwsem);
4884
4885 if (offset + length > mapping_size) {
bc1ecc65 4886 rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
4e752f0a 4887 length, mapping_size);
bc1ecc65
ID
4888 result = -EIO;
4889 goto err_rq;
4890 }
bf0d5f50 4891
dfd9875f 4892 img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
bc1ecc65
ID
4893 if (!img_request) {
4894 result = -ENOMEM;
637cd060 4895 goto err_rq;
bc1ecc65
ID
4896 }
4897 img_request->rq = rq;
70b16db8 4898 snapc = NULL; /* img_request consumes a ref */
bf0d5f50 4899
21ed05a8
ID
4900 dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
4901 img_request, obj_op_name(op_type), offset, length);
4902
6484cbe9 4903 if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
5a237819 4904 result = rbd_img_fill_nodata(img_request, offset, length);
90e98c52 4905 else
5a237819
ID
4906 result = rbd_img_fill_from_bio(img_request, offset, length,
4907 rq->bio);
0192ce2e 4908 if (result)
bc1ecc65 4909 goto err_img_request;
bf0d5f50 4910
e1fddc8f 4911 rbd_img_handle_request(img_request, 0);
bc1ecc65 4912 return;
bf0d5f50 4913
bc1ecc65
ID
4914err_img_request:
4915 rbd_img_request_put(img_request);
4916err_rq:
4917 if (result)
4918 rbd_warn(rbd_dev, "%s %llx at %llx result %d",
6d2940c8 4919 obj_op_name(op_type), length, offset, result);
e96a650a 4920 ceph_put_snap_context(snapc);
7ad18afa 4921err:
2a842aca 4922 blk_mq_end_request(rq, errno_to_blk_status(result));
bc1ecc65 4923}
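/*
 * blk-mq plumbing: rbd_init_request() (below) embeds a work_struct
 * in each request PDU, so rbd_queue_rq() only needs to bounce the
 * request to rbd_wq, where rbd_queue_workfn() above does the real
 * work.
 */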
bf0d5f50 4924
fc17b653 4925static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
7ad18afa 4926 const struct blk_mq_queue_data *bd)
bc1ecc65 4927{
7ad18afa
CH
4928 struct request *rq = bd->rq;
4929 struct work_struct *work = blk_mq_rq_to_pdu(rq);
bf0d5f50 4930
7ad18afa 4931 queue_work(rbd_wq, work);
fc17b653 4932 return BLK_STS_OK;
bf0d5f50
AE
4933}
4934
602adf40
YS
4935static void rbd_free_disk(struct rbd_device *rbd_dev)
4936{
5769ed0c
ID
4937 blk_cleanup_queue(rbd_dev->disk->queue);
4938 blk_mq_free_tag_set(&rbd_dev->tag_set);
4939 put_disk(rbd_dev->disk);
a0cab924 4940 rbd_dev->disk = NULL;
602adf40
YS
4941}
4942
788e2df3 4943static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
fe5478e0
ID
4944 struct ceph_object_id *oid,
4945 struct ceph_object_locator *oloc,
4946 void *buf, int buf_len)
788e2df3
AE
4947
4948{
fe5478e0
ID
4949 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4950 struct ceph_osd_request *req;
4951 struct page **pages;
4952 int num_pages = calc_pages_for(0, buf_len);
788e2df3
AE
4953 int ret;
4954
fe5478e0
ID
4955 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
4956 if (!req)
4957 return -ENOMEM;
788e2df3 4958
fe5478e0
ID
4959 ceph_oid_copy(&req->r_base_oid, oid);
4960 ceph_oloc_copy(&req->r_base_oloc, oloc);
4961 req->r_flags = CEPH_OSD_FLAG_READ;
430c28c3 4962
fe5478e0
ID
4963 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
4964 if (IS_ERR(pages)) {
4965 ret = PTR_ERR(pages);
4966 goto out_req;
4967 }
1ceae7ef 4968
fe5478e0
ID
4969 osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
4970 osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
4971 true);
4972
26f887e0
ID
4973 ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
4974 if (ret)
4975 goto out_req;
4976
fe5478e0
ID
4977 ceph_osdc_start_request(osdc, req, false);
4978 ret = ceph_osdc_wait_request(osdc, req);
4979 if (ret >= 0)
4980 ceph_copy_from_page_vector(pages, buf, 0, ret);
788e2df3 4981
fe5478e0
ID
4982out_req:
4983 ceph_osdc_put_request(req);
788e2df3
AE
4984 return ret;
4985}
4986
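/*
 * rbd_obj_read_sync() is used for reading the v1 image header (see
 * rbd_dev_v1_header_info() below), which is why it is a simple
 * synchronous, page-vector based read rather than going through the
 * image request machinery.
 */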
602adf40 4987/*
662518b1
AE
4988 * Read the complete header for the given rbd device. On successful
4989 * return, the rbd_dev->header field will contain up-to-date
4990 * information about the image.
602adf40 4991 */
99a41ebc 4992static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
602adf40 4993{
4156d998 4994 struct rbd_image_header_ondisk *ondisk = NULL;
50f7c4c9 4995 u32 snap_count = 0;
4156d998
AE
4996 u64 names_size = 0;
4997 u32 want_count;
4998 int ret;
602adf40 4999
00f1f36f 5000 /*
4156d998
AE
5001 * The complete header will include an array of its 64-bit
5002 * snapshot ids, followed by the names of those snapshots as
5003 * a contiguous block of NUL-terminated strings. Note that
5004 * the number of snapshots could change by the time we read
5005 * it in, in which case we re-read it.
00f1f36f 5006 */
4156d998
AE
5007 do {
5008 size_t size;
5009
5010 kfree(ondisk);
5011
5012 size = sizeof (*ondisk);
5013 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
5014 size += names_size;
5015 ondisk = kmalloc(size, GFP_KERNEL);
5016 if (!ondisk)
662518b1 5017 return -ENOMEM;
4156d998 5018
fe5478e0
ID
5019 ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
5020 &rbd_dev->header_oloc, ondisk, size);
4156d998 5021 if (ret < 0)
662518b1 5022 goto out;
c0cd10db 5023 if ((size_t)ret < size) {
4156d998 5024 ret = -ENXIO;
06ecc6cb
AE
5025 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
5026 size, ret);
662518b1 5027 goto out;
4156d998
AE
5028 }
5029 if (!rbd_dev_ondisk_valid(ondisk)) {
5030 ret = -ENXIO;
06ecc6cb 5031 rbd_warn(rbd_dev, "invalid header");
662518b1 5032 goto out;
81e759fb 5033 }
602adf40 5034
4156d998
AE
5035 names_size = le64_to_cpu(ondisk->snap_names_len);
5036 want_count = snap_count;
5037 snap_count = le32_to_cpu(ondisk->snap_count);
5038 } while (snap_count != want_count);
00f1f36f 5039
662518b1
AE
5040 ret = rbd_header_from_disk(rbd_dev, ondisk);
5041out:
4156d998
AE
5042 kfree(ondisk);
5043
5044 return ret;
602adf40
YS
5045}
5046
9875201e
JD
5047static void rbd_dev_update_size(struct rbd_device *rbd_dev)
5048{
5049 sector_t size;
9875201e
JD
5050
5051 /*
811c6688
ID
5052 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
5053 * try to update its size. If REMOVING is set, updating size
5054 * is just useless work since the device can't be opened.
9875201e 5055 */
811c6688
ID
5056 if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
5057 !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
9875201e
JD
5058 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
5059 dout("setting size to %llu sectors", (unsigned long long)size);
5060 set_capacity(rbd_dev->disk, size);
5061 revalidate_disk(rbd_dev->disk);
5062 }
5063}
5064
cc4a38bd 5065static int rbd_dev_refresh(struct rbd_device *rbd_dev)
1fe5e993 5066{
e627db08 5067 u64 mapping_size;
1fe5e993
AE
5068 int ret;
5069
cfbf6377 5070 down_write(&rbd_dev->header_rwsem);
3b5cf2a2 5071 mapping_size = rbd_dev->mapping.size;
a720ae09
ID
5072
5073 ret = rbd_dev_header_info(rbd_dev);
52bb1f9b 5074 if (ret)
73e39e4d 5075 goto out;
15228ede 5076
e8f59b59
ID
5077 /*
5078 * If there is a parent, see if it has disappeared due to the
5079 * mapped image getting flattened.
5080 */
5081 if (rbd_dev->parent) {
5082 ret = rbd_dev_v2_parent_info(rbd_dev);
5083 if (ret)
73e39e4d 5084 goto out;
e8f59b59
ID
5085 }
5086
686238b7
ID
5087 rbd_assert(!rbd_is_snap(rbd_dev));
5088 rbd_dev->mapping.size = rbd_dev->header.image_size;
15228ede 5089
73e39e4d 5090out:
cfbf6377 5091 up_write(&rbd_dev->header_rwsem);
73e39e4d 5092 if (!ret && mapping_size != rbd_dev->mapping.size)
9875201e 5093 rbd_dev_update_size(rbd_dev);
1fe5e993 5094
73e39e4d 5095 return ret;
1fe5e993
AE
5096}
5097
d6296d39
CH
5098static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
5099 unsigned int hctx_idx, unsigned int numa_node)
7ad18afa
CH
5100{
5101 struct work_struct *work = blk_mq_rq_to_pdu(rq);
5102
5103 INIT_WORK(work, rbd_queue_workfn);
5104 return 0;
5105}
5106
f363b089 5107static const struct blk_mq_ops rbd_mq_ops = {
7ad18afa 5108 .queue_rq = rbd_queue_rq,
7ad18afa
CH
5109 .init_request = rbd_init_request,
5110};
5111
602adf40
YS
5112static int rbd_init_disk(struct rbd_device *rbd_dev)
5113{
5114 struct gendisk *disk;
5115 struct request_queue *q;
420efbdf
ID
5116 unsigned int objset_bytes =
5117 rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
7ad18afa 5118 int err;
602adf40 5119
602adf40 5120 /* create gendisk info */
7e513d43
ID
5121 disk = alloc_disk(single_major ?
5122 (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
5123 RBD_MINORS_PER_MAJOR);
602adf40 5124 if (!disk)
1fcdb8aa 5125 return -ENOMEM;
602adf40 5126
f0f8cef5 5127 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
de71a297 5128 rbd_dev->dev_id);
602adf40 5129 disk->major = rbd_dev->major;
dd82fff1 5130 disk->first_minor = rbd_dev->minor;
7e513d43
ID
5131 if (single_major)
5132 disk->flags |= GENHD_FL_EXT_DEVT;
602adf40
YS
5133 disk->fops = &rbd_bd_ops;
5134 disk->private_data = rbd_dev;
5135
7ad18afa
CH
5136 memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
5137 rbd_dev->tag_set.ops = &rbd_mq_ops;
b5584180 5138 rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
7ad18afa 5139 rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
56d18f62 5140 rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
7ad18afa
CH
5141 rbd_dev->tag_set.nr_hw_queues = 1;
5142 rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
5143
5144 err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
5145 if (err)
602adf40 5146 goto out_disk;
029bcbd8 5147
7ad18afa
CH
5148 q = blk_mq_init_queue(&rbd_dev->tag_set);
5149 if (IS_ERR(q)) {
5150 err = PTR_ERR(q);
5151 goto out_tag_set;
5152 }
5153
8b904b5b 5154 blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
d8a2c89c 5155 /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
593a9e7b 5156
420efbdf 5157 blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
0d9fde4f 5158 q->limits.max_sectors = queue_max_hw_sectors(q);
21acdf45 5159 blk_queue_max_segments(q, USHRT_MAX);
24f1df60 5160 blk_queue_max_segment_size(q, UINT_MAX);
16d80c54
ID
5161 blk_queue_io_min(q, rbd_dev->opts->alloc_size);
5162 blk_queue_io_opt(q, rbd_dev->opts->alloc_size);
029bcbd8 5163
d9360540
ID
5164 if (rbd_dev->opts->trim) {
5165 blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
16d80c54 5166 q->limits.discard_granularity = rbd_dev->opts->alloc_size;
d9360540
ID
5167 blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
5168 blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
5169 }
90e98c52 5170
bae818ee 5171 if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
dc3b17cc 5172 q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
bae818ee 5173
5769ed0c
ID
5174 /*
5175 * disk_release() expects a queue ref from add_disk() and will
5176 * put it. Hold an extra ref until add_disk() is called.
5177 */
5178 WARN_ON(!blk_get_queue(q));
602adf40 5179 disk->queue = q;
602adf40
YS
5180 q->queuedata = rbd_dev;
5181
5182 rbd_dev->disk = disk;
602adf40 5183
602adf40 5184 return 0;
7ad18afa
CH
5185out_tag_set:
5186 blk_mq_free_tag_set(&rbd_dev->tag_set);
602adf40
YS
5187out_disk:
5188 put_disk(disk);
7ad18afa 5189 return err;
602adf40
YS
5190}
5191
dfc5606d
YS
5192/*
5193 sysfs
5194*/
5195
593a9e7b
AE
5196static struct rbd_device *dev_to_rbd_dev(struct device *dev)
5197{
5198 return container_of(dev, struct rbd_device, dev);
5199}
5200
dfc5606d
YS
5201static ssize_t rbd_size_show(struct device *dev,
5202 struct device_attribute *attr, char *buf)
5203{
593a9e7b 5204 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
a51aa0c0 5205
fc71d833
AE
5206 return sprintf(buf, "%llu\n",
5207 (unsigned long long)rbd_dev->mapping.size);
dfc5606d
YS
5208}
5209
34b13184
AE
5210/*
5211 * Note this shows the features for whatever's mapped, which is not
5212 * necessarily the base image.
5213 */
5214static ssize_t rbd_features_show(struct device *dev,
5215 struct device_attribute *attr, char *buf)
5216{
5217 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5218
5219 return sprintf(buf, "0x%016llx\n",
fc71d833 5220 (unsigned long long)rbd_dev->mapping.features);
34b13184
AE
5221}
5222
dfc5606d
YS
5223static ssize_t rbd_major_show(struct device *dev,
5224 struct device_attribute *attr, char *buf)
5225{
593a9e7b 5226 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 5227
fc71d833
AE
5228 if (rbd_dev->major)
5229 return sprintf(buf, "%d\n", rbd_dev->major);
5230
5231 return sprintf(buf, "(none)\n");
dd82fff1
ID
5232}
5233
5234static ssize_t rbd_minor_show(struct device *dev,
5235 struct device_attribute *attr, char *buf)
5236{
5237 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
fc71d833 5238
dd82fff1 5239 return sprintf(buf, "%d\n", rbd_dev->minor);
dfc5606d
YS
5240}
5241
005a07bf
ID
5242static ssize_t rbd_client_addr_show(struct device *dev,
5243 struct device_attribute *attr, char *buf)
5244{
5245 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5246 struct ceph_entity_addr *client_addr =
5247 ceph_client_addr(rbd_dev->rbd_client->client);
5248
5249 return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
5250 le32_to_cpu(client_addr->nonce));
5251}
5252
dfc5606d
YS
5253static ssize_t rbd_client_id_show(struct device *dev,
5254 struct device_attribute *attr, char *buf)
602adf40 5255{
593a9e7b 5256 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 5257
1dbb4399 5258 return sprintf(buf, "client%lld\n",
033268a5 5259 ceph_client_gid(rbd_dev->rbd_client->client));
602adf40
YS
5260}
5261
267fb90b
MC
5262static ssize_t rbd_cluster_fsid_show(struct device *dev,
5263 struct device_attribute *attr, char *buf)
5264{
5265 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5266
5267 return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
5268}
5269
0d6d1e9c
MC
5270static ssize_t rbd_config_info_show(struct device *dev,
5271 struct device_attribute *attr, char *buf)
5272{
5273 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5274
5275 return sprintf(buf, "%s\n", rbd_dev->config_info);
602adf40
YS
5276}
5277
dfc5606d
YS
5278static ssize_t rbd_pool_show(struct device *dev,
5279 struct device_attribute *attr, char *buf)
602adf40 5280{
593a9e7b 5281 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 5282
0d7dbfce 5283 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
dfc5606d
YS
5284}
5285
9bb2f334
AE
5286static ssize_t rbd_pool_id_show(struct device *dev,
5287 struct device_attribute *attr, char *buf)
5288{
5289 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5290
0d7dbfce 5291 return sprintf(buf, "%llu\n",
fc71d833 5292 (unsigned long long) rbd_dev->spec->pool_id);
9bb2f334
AE
5293}
5294
b26c047b
ID
5295static ssize_t rbd_pool_ns_show(struct device *dev,
5296 struct device_attribute *attr, char *buf)
5297{
5298 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5299
5300 return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
5301}
5302
dfc5606d
YS
5303static ssize_t rbd_name_show(struct device *dev,
5304 struct device_attribute *attr, char *buf)
5305{
593a9e7b 5306 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 5307
a92ffdf8
AE
5308 if (rbd_dev->spec->image_name)
5309 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
5310
5311 return sprintf(buf, "(unknown)\n");
dfc5606d
YS
5312}
5313
589d30e0
AE
5314static ssize_t rbd_image_id_show(struct device *dev,
5315 struct device_attribute *attr, char *buf)
5316{
5317 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5318
0d7dbfce 5319 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
589d30e0
AE
5320}
5321
34b13184
AE
5322/*
5323 * Shows the name of the currently-mapped snapshot (or
5324 * RBD_SNAP_HEAD_NAME for the base image).
5325 */
dfc5606d
YS
5326static ssize_t rbd_snap_show(struct device *dev,
5327 struct device_attribute *attr,
5328 char *buf)
5329{
593a9e7b 5330 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 5331
0d7dbfce 5332 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
dfc5606d
YS
5333}
5334
92a58671
MC
5335static ssize_t rbd_snap_id_show(struct device *dev,
5336 struct device_attribute *attr, char *buf)
5337{
5338 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5339
5340 return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
5341}
5342
86b00e0d 5343/*
ff96128f
ID
5344 * For a v2 image, shows the chain of parent images, separated by empty
5345 * lines. For v1 images or if there is no parent, shows "(no parent
5346 * image)".
86b00e0d
AE
5347 */
5348static ssize_t rbd_parent_show(struct device *dev,
ff96128f
ID
5349 struct device_attribute *attr,
5350 char *buf)
86b00e0d
AE
5351{
5352 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
ff96128f 5353 ssize_t count = 0;
86b00e0d 5354
ff96128f 5355 if (!rbd_dev->parent)
86b00e0d
AE
5356 return sprintf(buf, "(no parent image)\n");
5357
ff96128f
ID
5358 for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
5359 struct rbd_spec *spec = rbd_dev->parent_spec;
5360
5361 count += sprintf(&buf[count], "%s"
5362 "pool_id %llu\npool_name %s\n"
e92c0eaf 5363 "pool_ns %s\n"
ff96128f
ID
5364 "image_id %s\nimage_name %s\n"
5365 "snap_id %llu\nsnap_name %s\n"
5366 "overlap %llu\n",
5367 !count ? "" : "\n", /* first? */
5368 spec->pool_id, spec->pool_name,
e92c0eaf 5369 spec->pool_ns ?: "",
ff96128f
ID
5370 spec->image_id, spec->image_name ?: "(unknown)",
5371 spec->snap_id, spec->snap_name,
5372 rbd_dev->parent_overlap);
5373 }
5374
5375 return count;
86b00e0d
AE
5376}
5377
dfc5606d
YS
5378static ssize_t rbd_image_refresh(struct device *dev,
5379 struct device_attribute *attr,
5380 const char *buf,
5381 size_t size)
5382{
593a9e7b 5383 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 5384 int ret;
602adf40 5385
cc4a38bd 5386 ret = rbd_dev_refresh(rbd_dev);
e627db08 5387 if (ret)
52bb1f9b 5388 return ret;
b813623a 5389
52bb1f9b 5390 return size;
dfc5606d 5391}
602adf40 5392
5657a819
JP
5393static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
5394static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
5395static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
5396static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
5397static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
5398static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
5399static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
5400static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
5401static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
5402static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
b26c047b 5403static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
5657a819
JP
5404static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
5405static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
5406static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
5407static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
5408static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
5409static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
dfc5606d
YS
5410
5411static struct attribute *rbd_attrs[] = {
5412 &dev_attr_size.attr,
34b13184 5413 &dev_attr_features.attr,
dfc5606d 5414 &dev_attr_major.attr,
dd82fff1 5415 &dev_attr_minor.attr,
005a07bf 5416 &dev_attr_client_addr.attr,
dfc5606d 5417 &dev_attr_client_id.attr,
267fb90b 5418 &dev_attr_cluster_fsid.attr,
0d6d1e9c 5419 &dev_attr_config_info.attr,
dfc5606d 5420 &dev_attr_pool.attr,
9bb2f334 5421 &dev_attr_pool_id.attr,
b26c047b 5422 &dev_attr_pool_ns.attr,
dfc5606d 5423 &dev_attr_name.attr,
589d30e0 5424 &dev_attr_image_id.attr,
dfc5606d 5425 &dev_attr_current_snap.attr,
92a58671 5426 &dev_attr_snap_id.attr,
86b00e0d 5427 &dev_attr_parent.attr,
dfc5606d 5428 &dev_attr_refresh.attr,
dfc5606d
YS
5429 NULL
5430};
5431
5432static struct attribute_group rbd_attr_group = {
5433 .attrs = rbd_attrs,
5434};
5435
5436static const struct attribute_group *rbd_attr_groups[] = {
5437 &rbd_attr_group,
5438 NULL
5439};
5440
6cac4695 5441static void rbd_dev_release(struct device *dev);
dfc5606d 5442
b9942bc9 5443static const struct device_type rbd_device_type = {
dfc5606d
YS
5444 .name = "rbd",
5445 .groups = rbd_attr_groups,
6cac4695 5446 .release = rbd_dev_release,
dfc5606d
YS
5447};
5448
8b8fb99c
AE
5449static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
5450{
5451 kref_get(&spec->kref);
5452
5453 return spec;
5454}
5455
5456static void rbd_spec_free(struct kref *kref);
5457static void rbd_spec_put(struct rbd_spec *spec)
5458{
5459 if (spec)
5460 kref_put(&spec->kref, rbd_spec_free);
5461}
5462
5463static struct rbd_spec *rbd_spec_alloc(void)
5464{
5465 struct rbd_spec *spec;
5466
5467 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
5468 if (!spec)
5469 return NULL;
04077599
ID
5470
5471 spec->pool_id = CEPH_NOPOOL;
5472 spec->snap_id = CEPH_NOSNAP;
8b8fb99c
AE
5473 kref_init(&spec->kref);
5474
8b8fb99c
AE
5475 return spec;
5476}
5477
5478static void rbd_spec_free(struct kref *kref)
5479{
5480 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
5481
5482 kfree(spec->pool_name);
b26c047b 5483 kfree(spec->pool_ns);
8b8fb99c
AE
5484 kfree(spec->image_id);
5485 kfree(spec->image_name);
5486 kfree(spec->snap_name);
5487 kfree(spec);
5488}
5489
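rbd_spec objects are shared between a mapping and its parent images, so they are reference counted: kref_init() starts the count at 1, rbd_spec_get() bumps it, and the last rbd_spec_put() runs rbd_spec_free(). A hedged, single-threaded userspace sketch of that get/put discipline (a plain int stands in for the atomic struct kref):

/*
 * Single-threaded sketch of the kref discipline above; a plain
 * int stands in for the atomic struct kref.
 */
#include <stdlib.h>

struct spec {
	int refcount;			/* kref_init() starts at 1 */
	char *name;
};

static struct spec *spec_get(struct spec *s)
{
	s->refcount++;
	return s;
}

static void spec_put(struct spec *s)
{
	if (s && --s->refcount == 0) {	/* last ref runs the release fn */
		free(s->name);
		free(s);
	}
}

int main(void)
{
	struct spec *s = calloc(1, sizeof(*s));

	if (!s)
		return 1;
	s->refcount = 1;		/* as kref_init() would */
	spec_get(s);			/* a second owner appears */
	spec_put(s);			/* first owner drops out */
	spec_put(s);			/* last put: release runs */
	return 0;
}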
1643dfa4 5490static void rbd_dev_free(struct rbd_device *rbd_dev)
dd5ac32d 5491{
99d16943 5492 WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
ed95b21a 5493 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
dd5ac32d 5494
c41d13a3 5495 ceph_oid_destroy(&rbd_dev->header_oid);
6b6dddbe 5496 ceph_oloc_destroy(&rbd_dev->header_oloc);
0d6d1e9c 5497 kfree(rbd_dev->config_info);
c41d13a3 5498
dd5ac32d
ID
5499 rbd_put_client(rbd_dev->rbd_client);
5500 rbd_spec_put(rbd_dev->spec);
5501 kfree(rbd_dev->opts);
5502 kfree(rbd_dev);
1643dfa4
ID
5503}
5504
5505static void rbd_dev_release(struct device *dev)
5506{
5507 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5508 bool need_put = !!rbd_dev->opts;
5509
5510 if (need_put) {
5511 destroy_workqueue(rbd_dev->task_wq);
5512 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
5513 }
5514
5515 rbd_dev_free(rbd_dev);
dd5ac32d
ID
5516
5517 /*
5518 * This is racy, but way better than dropping the module reference outside of
5519 * the release callback. The race window is pretty small, so
5520 * doing something similar to dm (dm-builtin.c) is overkill.
5521 */
5522 if (need_put)
5523 module_put(THIS_MODULE);
5524}
5525
1643dfa4
ID
5526static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
5527 struct rbd_spec *spec)
c53d5893
AE
5528{
5529 struct rbd_device *rbd_dev;
5530
1643dfa4 5531 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
c53d5893
AE
5532 if (!rbd_dev)
5533 return NULL;
5534
5535 spin_lock_init(&rbd_dev->lock);
5536 INIT_LIST_HEAD(&rbd_dev->node);
c53d5893
AE
5537 init_rwsem(&rbd_dev->header_rwsem);
5538
7e97332e 5539 rbd_dev->header.data_pool_id = CEPH_NOPOOL;
c41d13a3 5540 ceph_oid_init(&rbd_dev->header_oid);
431a02cd 5541 rbd_dev->header_oloc.pool = spec->pool_id;
b26c047b
ID
5542 if (spec->pool_ns) {
5543 WARN_ON(!*spec->pool_ns);
5544 rbd_dev->header_oloc.pool_ns =
5545 ceph_find_or_create_string(spec->pool_ns,
5546 strlen(spec->pool_ns));
5547 }
c41d13a3 5548
99d16943
ID
5549 mutex_init(&rbd_dev->watch_mutex);
5550 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
5551 INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
5552
ed95b21a
ID
5553 init_rwsem(&rbd_dev->lock_rwsem);
5554 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
5555 INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
5556 INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
5557 INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
5558 INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
e1fddc8f 5559 spin_lock_init(&rbd_dev->lock_lists_lock);
637cd060 5560 INIT_LIST_HEAD(&rbd_dev->acquiring_list);
e1fddc8f 5561 INIT_LIST_HEAD(&rbd_dev->running_list);
637cd060 5562 init_completion(&rbd_dev->acquire_wait);
e1fddc8f 5563 init_completion(&rbd_dev->releasing_wait);
ed95b21a 5564
22e8bd51 5565 spin_lock_init(&rbd_dev->object_map_lock);
ed95b21a 5566
dd5ac32d
ID
5567 rbd_dev->dev.bus = &rbd_bus_type;
5568 rbd_dev->dev.type = &rbd_device_type;
5569 rbd_dev->dev.parent = &rbd_root_dev;
dd5ac32d
ID
5570 device_initialize(&rbd_dev->dev);
5571
c53d5893 5572 rbd_dev->rbd_client = rbdc;
d147543d 5573 rbd_dev->spec = spec;
0903e875 5574
1643dfa4
ID
5575 return rbd_dev;
5576}
5577
5578/*
5579 * Create a mapping rbd_dev.
5580 */
5581static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
5582 struct rbd_spec *spec,
5583 struct rbd_options *opts)
5584{
5585 struct rbd_device *rbd_dev;
5586
5587 rbd_dev = __rbd_dev_create(rbdc, spec);
5588 if (!rbd_dev)
5589 return NULL;
5590
5591 rbd_dev->opts = opts;
5592
5593 /* get an id and fill in device name */
5594 rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
5595 minor_to_rbd_dev_id(1 << MINORBITS),
5596 GFP_KERNEL);
5597 if (rbd_dev->dev_id < 0)
5598 goto fail_rbd_dev;
5599
5600 sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
5601 rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
5602 rbd_dev->name);
5603 if (!rbd_dev->task_wq)
5604 goto fail_dev_id;
dd5ac32d 5605
1643dfa4
ID
5606 /* we have a ref from do_rbd_add() */
5607 __module_get(THIS_MODULE);
dd5ac32d 5608
1643dfa4 5609 dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
c53d5893 5610 return rbd_dev;
1643dfa4
ID
5611
5612fail_dev_id:
5613 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
5614fail_rbd_dev:
5615 rbd_dev_free(rbd_dev);
5616 return NULL;
c53d5893
AE
5617}
5618
5619static void rbd_dev_destroy(struct rbd_device *rbd_dev)
5620{
dd5ac32d
ID
5621 if (rbd_dev)
5622 put_device(&rbd_dev->dev);
c53d5893
AE
5623}
5624
9d475de5
AE
5625/*
5626 * Get the size and object order for an image snapshot, or if
5627 * snap_id is CEPH_NOSNAP, gets this information for the base
5628 * image.
5629 */
5630static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
5631 u8 *order, u64 *snap_size)
5632{
5633 __le64 snapid = cpu_to_le64(snap_id);
5634 int ret;
5635 struct {
5636 u8 order;
5637 __le64 size;
5638 } __attribute__ ((packed)) size_buf = { 0 };
5639
ecd4a68a
ID
5640 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5641 &rbd_dev->header_oloc, "get_size",
5642 &snapid, sizeof(snapid),
5643 &size_buf, sizeof(size_buf));
36be9a76 5644 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
9d475de5
AE
5645 if (ret < 0)
5646 return ret;
57385b51
AE
5647 if (ret < sizeof (size_buf))
5648 return -ERANGE;
9d475de5 5649
c3545579 5650 if (order) {
c86f86e9 5651 *order = size_buf.order;
c3545579
JD
5652 dout(" order %u", (unsigned int)*order);
5653 }
9d475de5
AE
5654 *snap_size = le64_to_cpu(size_buf.size);
5655
c3545579
JD
5656 dout(" snap_id 0x%016llx snap_size = %llu\n",
5657 (unsigned long long)snap_id,
57385b51 5658 (unsigned long long)*snap_size);
9d475de5
AE
5659
5660 return 0;
5661}
5662
5663static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
5664{
5665 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
5666 &rbd_dev->header.obj_order,
5667 &rbd_dev->header.image_size);
5668}
5669
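The get_size reply decoded above is a packed wire struct: one u8 object order followed by a little-endian u64 size. A small userspace sketch of pulling those fields out of a raw byte buffer (the sample values are made up):

/*
 * Sketch of decoding the packed get_size reply: a u8 object order
 * followed by a little-endian u64 size.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t get_le64(const uint8_t *p)
{
	uint64_t v = 0;
	int i;

	for (i = 7; i >= 0; i--)
		v = (v << 8) | p[i];
	return v;
}

int main(void)
{
	/* order 22 (4 MiB objects), size 0x40000000 (1 GiB) */
	uint8_t reply[9] = { 22, 0x00, 0x00, 0x00, 0x40, 0, 0, 0, 0 };
	uint8_t order = reply[0];
	uint64_t size = get_le64(reply + 1);

	printf("order %u -> %llu-byte objects, image size %llu\n",
	       (unsigned)order, 1ULL << order, (unsigned long long)size);
	return 0;
}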
1e130199
AE
5670static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
5671{
5435d206 5672 size_t size;
1e130199
AE
5673 void *reply_buf;
5674 int ret;
5675 void *p;
5676
5435d206
DY
5677 /* Response will be an encoded string, which includes a length */
5678 size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX;
5679 reply_buf = kzalloc(size, GFP_KERNEL);
1e130199
AE
5680 if (!reply_buf)
5681 return -ENOMEM;
5682
ecd4a68a
ID
5683 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5684 &rbd_dev->header_oloc, "get_object_prefix",
5435d206 5685 NULL, 0, reply_buf, size);
36be9a76 5686 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
1e130199
AE
5687 if (ret < 0)
5688 goto out;
5689
5690 p = reply_buf;
5691 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
57385b51
AE
5692 p + ret, NULL, GFP_NOIO);
5693 ret = 0;
1e130199
AE
5694
5695 if (IS_ERR(rbd_dev->header.object_prefix)) {
5696 ret = PTR_ERR(rbd_dev->header.object_prefix);
5697 rbd_dev->header.object_prefix = NULL;
5698 } else {
5699 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
5700 }
1e130199
AE
5701out:
5702 kfree(reply_buf);
5703
5704 return ret;
5705}
5706
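ceph_extract_encoded_string() parses Ceph's length-prefixed string encoding: a __le32 byte count followed by that many bytes, with no NUL on the wire. A userspace sketch of the same extraction, assuming only that framing:

/*
 * Sketch of the encoding ceph_extract_encoded_string() parses:
 * a __le32 byte count followed by that many bytes (no wire NUL).
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *extract_encoded_string(const uint8_t **p, const uint8_t *end)
{
	uint32_t len;
	char *s;

	if (end - *p < 4)
		return NULL;
	len = (*p)[0] | (*p)[1] << 8 | (*p)[2] << 16 |
	      (uint32_t)(*p)[3] << 24;
	*p += 4;
	if ((uint32_t)(end - *p) < len)
		return NULL;

	s = malloc(len + 1);		/* return a NUL-terminated copy */
	if (!s)
		return NULL;
	memcpy(s, *p, len);
	s[len] = '\0';
	*p += len;
	return s;
}

int main(void)
{
	/* "rbd_data.abc123" (15 bytes) with its le32 length prefix */
	uint8_t buf[] = { 15, 0, 0, 0, 'r', 'b', 'd', '_', 'd', 'a',
			  't', 'a', '.', 'a', 'b', 'c', '1', '2', '3' };
	const uint8_t *p = buf;
	char *prefix = extract_encoded_string(&p, buf + sizeof(buf));

	printf("object_prefix = %s\n", prefix ? prefix : "(bad)");
	free(prefix);
	return 0;
}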
b1b5402a
AE
5707static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
5708 u64 *snap_features)
5709{
5710 __le64 snapid = cpu_to_le64(snap_id);
5711 struct {
5712 __le64 features;
5713 __le64 incompat;
4157976b 5714 } __attribute__ ((packed)) features_buf = { 0 };
d3767f0f 5715 u64 unsup;
b1b5402a
AE
5716 int ret;
5717
ecd4a68a
ID
5718 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5719 &rbd_dev->header_oloc, "get_features",
5720 &snapid, sizeof(snapid),
5721 &features_buf, sizeof(features_buf));
36be9a76 5722 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
b1b5402a
AE
5723 if (ret < 0)
5724 return ret;
57385b51
AE
5725 if (ret < sizeof (features_buf))
5726 return -ERANGE;
d889140c 5727
d3767f0f
ID
5728 unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
5729 if (unsup) {
5730 rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
5731 unsup);
b8f5c6ed 5732 return -ENXIO;
d3767f0f 5733 }
d889140c 5734
b1b5402a
AE
5735 *snap_features = le64_to_cpu(features_buf.features);
5736
5737 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
57385b51
AE
5738 (unsigned long long)snap_id,
5739 (unsigned long long)*snap_features,
5740 (unsigned long long)le64_to_cpu(features_buf.incompat));
b1b5402a
AE
5741
5742 return 0;
5743}
5744
5745static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
5746{
5747 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
5748 &rbd_dev->header.features);
5749}
5750
22e8bd51
ID
5751/*
5752 * These are generic image flags, but since they are used only for
5753 * object map, store them in rbd_dev->object_map_flags.
5754 *
5755 * For the same reason, this function is called only on object map
5756 * (re)load and not on header refresh.
5757 */
5758static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev)
5759{
5760 __le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
5761 __le64 flags;
5762 int ret;
5763
5764 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5765 &rbd_dev->header_oloc, "get_flags",
5766 &snapid, sizeof(snapid),
5767 &flags, sizeof(flags));
5768 if (ret < 0)
5769 return ret;
5770 if (ret < sizeof(flags))
5771 return -EBADMSG;
5772
5773 rbd_dev->object_map_flags = le64_to_cpu(flags);
5774 return 0;
5775}
5776
eb3b2d6b
ID
5777struct parent_image_info {
5778 u64 pool_id;
e92c0eaf 5779 const char *pool_ns;
eb3b2d6b
ID
5780 const char *image_id;
5781 u64 snap_id;
5782
e92c0eaf 5783 bool has_overlap;
eb3b2d6b
ID
5784 u64 overlap;
5785};
5786
e92c0eaf
ID
5787/*
5788 * The caller is responsible for @pii.
5789 */
5790static int decode_parent_image_spec(void **p, void *end,
5791 struct parent_image_info *pii)
5792{
5793 u8 struct_v;
5794 u32 struct_len;
5795 int ret;
5796
5797 ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
5798 &struct_v, &struct_len);
5799 if (ret)
5800 return ret;
5801
5802 ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
5803 pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5804 if (IS_ERR(pii->pool_ns)) {
5805 ret = PTR_ERR(pii->pool_ns);
5806 pii->pool_ns = NULL;
5807 return ret;
5808 }
5809 pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5810 if (IS_ERR(pii->image_id)) {
5811 ret = PTR_ERR(pii->image_id);
5812 pii->image_id = NULL;
5813 return ret;
5814 }
5815 ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
5816 return 0;
5817
5818e_inval:
5819 return -EINVAL;
5820}
5821
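decode_parent_image_spec() relies on ceph_start_decoding() to consume the versioned-struct header that Ceph's encoder emits: a u8 struct_v, a u8 struct_compat, and a __le32 payload length. A hedged userspace sketch of that header check (the cursor type and error values are illustrative):

/*
 * Hedged sketch of the header ceph_start_decoding() consumes
 * (u8 struct_v, u8 struct_compat, __le32 struct_len).
 */
#include <stdint.h>
#include <stdio.h>

struct cursor { const uint8_t *p, *end; };

static int start_decoding(struct cursor *c, uint8_t max_supported,
			  uint8_t *struct_v, uint32_t *struct_len)
{
	uint8_t compat;

	if (c->end - c->p < 6)
		return -1;		/* truncated header */
	*struct_v = c->p[0];
	compat = c->p[1];
	*struct_len = c->p[2] | c->p[3] << 8 | c->p[4] << 16 |
		      (uint32_t)c->p[5] << 24;
	c->p += 6;
	if (compat > max_supported)
		return -2;		/* encoder requires a newer decoder */
	if ((uint32_t)(c->end - c->p) < *struct_len)
		return -1;		/* payload shorter than advertised */
	return 0;
}

int main(void)
{
	/* struct_v 1, compat 1, 4-byte payload */
	uint8_t buf[] = { 1, 1, 4, 0, 0, 0, 0xde, 0xad, 0xbe, 0xef };
	struct cursor c = { buf, buf + sizeof(buf) };
	uint8_t v;
	uint32_t len;

	if (!start_decoding(&c, 1, &v, &len))
		printf("struct_v %u, payload %u bytes\n",
		       (unsigned)v, (unsigned)len);
	return 0;
}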
5822static int __get_parent_info(struct rbd_device *rbd_dev,
5823 struct page *req_page,
5824 struct page *reply_page,
5825 struct parent_image_info *pii)
5826{
5827 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5828 size_t reply_len = PAGE_SIZE;
5829 void *p, *end;
5830 int ret;
5831
5832 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5833 "rbd", "parent_get", CEPH_OSD_FLAG_READ,
68ada915 5834 req_page, sizeof(u64), &reply_page, &reply_len);
e92c0eaf
ID
5835 if (ret)
5836 return ret == -EOPNOTSUPP ? 1 : ret;
5837
5838 p = page_address(reply_page);
5839 end = p + reply_len;
5840 ret = decode_parent_image_spec(&p, end, pii);
5841 if (ret)
5842 return ret;
5843
5844 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5845 "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
68ada915 5846 req_page, sizeof(u64), &reply_page, &reply_len);
e92c0eaf
ID
5847 if (ret)
5848 return ret;
5849
5850 p = page_address(reply_page);
5851 end = p + reply_len;
5852 ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
5853 if (pii->has_overlap)
5854 ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5855
5856 return 0;
5857
5858e_inval:
5859 return -EINVAL;
5860}
5861
eb3b2d6b
ID
5862/*
5863 * The caller is responsible for @pii.
5864 */
5865static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
5866 struct page *req_page,
5867 struct page *reply_page,
5868 struct parent_image_info *pii)
5869{
5870 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5871 size_t reply_len = PAGE_SIZE;
5872 void *p, *end;
5873 int ret;
5874
5875 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5876 "rbd", "get_parent", CEPH_OSD_FLAG_READ,
68ada915 5877 req_page, sizeof(u64), &reply_page, &reply_len);
eb3b2d6b
ID
5878 if (ret)
5879 return ret;
5880
5881 p = page_address(reply_page);
5882 end = p + reply_len;
5883 ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
5884 pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
5885 if (IS_ERR(pii->image_id)) {
5886 ret = PTR_ERR(pii->image_id);
5887 pii->image_id = NULL;
5888 return ret;
5889 }
5890 ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
e92c0eaf 5891 pii->has_overlap = true;
eb3b2d6b
ID
5892 ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5893
5894 return 0;
5895
5896e_inval:
5897 return -EINVAL;
5898}
5899
5900static int get_parent_info(struct rbd_device *rbd_dev,
5901 struct parent_image_info *pii)
5902{
5903 struct page *req_page, *reply_page;
5904 void *p;
5905 int ret;
5906
5907 req_page = alloc_page(GFP_KERNEL);
5908 if (!req_page)
5909 return -ENOMEM;
5910
5911 reply_page = alloc_page(GFP_KERNEL);
5912 if (!reply_page) {
5913 __free_page(req_page);
5914 return -ENOMEM;
5915 }
5916
5917 p = page_address(req_page);
5918 ceph_encode_64(&p, rbd_dev->spec->snap_id);
e92c0eaf
ID
5919 ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
5920 if (ret > 0)
5921 ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
5922 pii);
eb3b2d6b
ID
5923
5924 __free_page(req_page);
5925 __free_page(reply_page);
5926 return ret;
5927}
5928
86b00e0d
AE
5929static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
5930{
5931 struct rbd_spec *parent_spec;
eb3b2d6b 5932 struct parent_image_info pii = { 0 };
86b00e0d
AE
5933 int ret;
5934
5935 parent_spec = rbd_spec_alloc();
5936 if (!parent_spec)
5937 return -ENOMEM;
5938
eb3b2d6b
ID
5939 ret = get_parent_info(rbd_dev, &pii);
5940 if (ret)
86b00e0d 5941 goto out_err;
86b00e0d 5942
e92c0eaf
ID
5943 dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
5944 __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
5945 pii.has_overlap, pii.overlap);
86b00e0d 5946
e92c0eaf 5947 if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
392a9dad
AE
5948 /*
5949 * Either the parent never existed, or we have
5950 * a record of it but the image got flattened so it no
5951 * longer has a parent. When the parent of a
5952 * layered image disappears we immediately set the
5953 * overlap to 0. The effect of this is that all new
5954 * requests will be treated as if the image had no
5955 * parent.
e92c0eaf
ID
5956 *
5957 * If !pii.has_overlap, the parent image spec is not
5958 * applicable. It's there to avoid duplication in each
5959 * snapshot record.
392a9dad
AE
5960 */
5961 if (rbd_dev->parent_overlap) {
5962 rbd_dev->parent_overlap = 0;
392a9dad
AE
5963 rbd_dev_parent_put(rbd_dev);
5964 pr_info("%s: clone image has been flattened\n",
5965 rbd_dev->disk->disk_name);
5966 }
5967
86b00e0d 5968 goto out; /* No parent? No problem. */
392a9dad 5969 }
86b00e0d 5970
0903e875
AE
5971 /* The ceph file layout needs to fit pool id in 32 bits */
5972
5973 ret = -EIO;
eb3b2d6b 5974 if (pii.pool_id > (u64)U32_MAX) {
9584d508 5975 rbd_warn(NULL, "parent pool id too large (%llu > %u)",
eb3b2d6b 5976 (unsigned long long)pii.pool_id, U32_MAX);
86b00e0d
AE
5977 goto out_err;
5978 }
86b00e0d 5979
3b5cf2a2
AE
5980 /*
5981 * The parent won't change (except when the clone is
5982 * flattened, which is handled above). So we only need to
5983 * record the parent spec if we have not already done so.
5984 */
5985 if (!rbd_dev->parent_spec) {
eb3b2d6b 5986 parent_spec->pool_id = pii.pool_id;
e92c0eaf
ID
5987 if (pii.pool_ns && *pii.pool_ns) {
5988 parent_spec->pool_ns = pii.pool_ns;
5989 pii.pool_ns = NULL;
5990 }
eb3b2d6b
ID
5991 parent_spec->image_id = pii.image_id;
5992 pii.image_id = NULL;
5993 parent_spec->snap_id = pii.snap_id;
b26c047b 5994
70cf49cf
AE
5995 rbd_dev->parent_spec = parent_spec;
5996 parent_spec = NULL; /* rbd_dev now owns this */
3b5cf2a2
AE
5997 }
5998
5999 /*
cf32bd9c
ID
6000 * We always update the parent overlap. If it's zero we issue
6001 * a warning, as we will proceed as if there was no parent.
3b5cf2a2 6002 */
eb3b2d6b 6003 if (!pii.overlap) {
3b5cf2a2 6004 if (parent_spec) {
cf32bd9c
ID
6005 /* refresh, careful to warn just once */
6006 if (rbd_dev->parent_overlap)
6007 rbd_warn(rbd_dev,
6008 "clone now standalone (overlap became 0)");
3b5cf2a2 6009 } else {
cf32bd9c
ID
6010 /* initial probe */
6011 rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
3b5cf2a2 6012 }
70cf49cf 6013 }
eb3b2d6b 6014 rbd_dev->parent_overlap = pii.overlap;
cf32bd9c 6015
86b00e0d
AE
6016out:
6017 ret = 0;
6018out_err:
e92c0eaf 6019 kfree(pii.pool_ns);
eb3b2d6b 6020 kfree(pii.image_id);
86b00e0d 6021 rbd_spec_put(parent_spec);
86b00e0d
AE
6022 return ret;
6023}
6024
cc070d59
AE
6025static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
6026{
6027 struct {
6028 __le64 stripe_unit;
6029 __le64 stripe_count;
6030 } __attribute__ ((packed)) striping_info_buf = { 0 };
6031 size_t size = sizeof (striping_info_buf);
6032 void *p;
cc070d59
AE
6033 int ret;
6034
ecd4a68a
ID
6035 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6036 &rbd_dev->header_oloc, "get_stripe_unit_count",
6037 NULL, 0, &striping_info_buf, size);
cc070d59
AE
6038 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6039 if (ret < 0)
6040 return ret;
6041 if (ret < size)
6042 return -ERANGE;
6043
cc070d59 6044 p = &striping_info_buf;
b1331852
ID
6045 rbd_dev->header.stripe_unit = ceph_decode_64(&p);
6046 rbd_dev->header.stripe_count = ceph_decode_64(&p);
cc070d59
AE
6047 return 0;
6048}
6049
7e97332e
ID
6050static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
6051{
6052 __le64 data_pool_id;
6053 int ret;
6054
6055 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6056 &rbd_dev->header_oloc, "get_data_pool",
6057 NULL, 0, &data_pool_id, sizeof(data_pool_id));
6058 if (ret < 0)
6059 return ret;
6060 if (ret < sizeof(data_pool_id))
6061 return -EBADMSG;
6062
6063 rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
6064 WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
6065 return 0;
6066}
6067
9e15b77d
AE
6068static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
6069{
ecd4a68a 6070 CEPH_DEFINE_OID_ONSTACK(oid);
9e15b77d
AE
6071 size_t image_id_size;
6072 char *image_id;
6073 void *p;
6074 void *end;
6075 size_t size;
6076 void *reply_buf = NULL;
6077 size_t len = 0;
6078 char *image_name = NULL;
6079 int ret;
6080
6081 rbd_assert(!rbd_dev->spec->image_name);
6082
69e7a02f
AE
6083 len = strlen(rbd_dev->spec->image_id);
6084 image_id_size = sizeof (__le32) + len;
9e15b77d
AE
6085 image_id = kmalloc(image_id_size, GFP_KERNEL);
6086 if (!image_id)
6087 return NULL;
6088
6089 p = image_id;
4157976b 6090 end = image_id + image_id_size;
57385b51 6091 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
9e15b77d
AE
6092
6093 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
6094 reply_buf = kmalloc(size, GFP_KERNEL);
6095 if (!reply_buf)
6096 goto out;
6097
ecd4a68a
ID
6098 ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
6099 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
6100 "dir_get_name", image_id, image_id_size,
6101 reply_buf, size);
9e15b77d
AE
6102 if (ret < 0)
6103 goto out;
6104 p = reply_buf;
f40eb349
AE
6105 end = reply_buf + ret;
6106
9e15b77d
AE
6107 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
6108 if (IS_ERR(image_name))
6109 image_name = NULL;
6110 else
6111 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
6112out:
6113 kfree(reply_buf);
6114 kfree(image_id);
6115
6116 return image_name;
6117}
6118
2ad3d716
AE
6119static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
6120{
6121 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
6122 const char *snap_name;
6123 u32 which = 0;
6124
6125 /* Skip over names until we find the one we are looking for */
6126
6127 snap_name = rbd_dev->header.snap_names;
6128 while (which < snapc->num_snaps) {
6129 if (!strcmp(name, snap_name))
6130 return snapc->snaps[which];
6131 snap_name += strlen(snap_name) + 1;
6132 which++;
6133 }
6134 return CEPH_NOSNAP;
6135}
6136
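Format 1 stores snapshot names back-to-back as NUL-terminated strings in header.snap_names, in the same order as snapc->snaps, which is why the lookup above hops by strlen() + 1. A self-contained sketch with made-up names and ids:

/*
 * Sketch of the v1 lookup: names sit back-to-back, NUL-terminated,
 * in the same order as snapc->snaps.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char names[] = "monday\0tuesday\0weekly\0";
	const unsigned long ids[] = { 10, 14, 27 };	/* made-up snap ids */
	const char *want = "tuesday";
	const char *p = names;
	unsigned int which;

	for (which = 0; which < 3; which++) {
		if (!strcmp(want, p)) {
			printf("snap \"%s\" -> id %lu\n", want, ids[which]);
			return 0;
		}
		p += strlen(p) + 1;	/* hop past the NUL to the next name */
	}
	printf("not found (CEPH_NOSNAP)\n");
	return 0;
}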
6137static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
6138{
6139 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
6140 u32 which;
6141 bool found = false;
6142 u64 snap_id;
6143
6144 for (which = 0; !found && which < snapc->num_snaps; which++) {
6145 const char *snap_name;
6146
6147 snap_id = snapc->snaps[which];
6148 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
efadc98a
JD
6149 if (IS_ERR(snap_name)) {
6150 /* ignore no-longer existing snapshots */
6151 if (PTR_ERR(snap_name) == -ENOENT)
6152 continue;
6153 else
6154 break;
6155 }
2ad3d716
AE
6156 found = !strcmp(name, snap_name);
6157 kfree(snap_name);
6158 }
6159 return found ? snap_id : CEPH_NOSNAP;
6160}
6161
6162/*
6163 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
6164 * no snapshot by that name is found, or if an error occurs.
6165 */
6166static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
6167{
6168 if (rbd_dev->image_format == 1)
6169 return rbd_v1_snap_id_by_name(rbd_dev, name);
6170
6171 return rbd_v2_snap_id_by_name(rbd_dev, name);
6172}
6173
9e15b77d 6174/*
04077599
ID
6175 * An image being mapped will have everything but the snap id.
6176 */
6177static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
6178{
6179 struct rbd_spec *spec = rbd_dev->spec;
6180
6181 rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
6182 rbd_assert(spec->image_id && spec->image_name);
6183 rbd_assert(spec->snap_name);
6184
6185 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
6186 u64 snap_id;
6187
6188 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
6189 if (snap_id == CEPH_NOSNAP)
6190 return -ENOENT;
6191
6192 spec->snap_id = snap_id;
6193 } else {
6194 spec->snap_id = CEPH_NOSNAP;
6195 }
6196
6197 return 0;
6198}
6199
6200/*
6201 * A parent image will have all ids but none of the names.
e1d4213f 6202 *
04077599
ID
6203 * All names in an rbd spec are dynamically allocated. It's OK if we
6204 * can't figure out the name for an image id.
9e15b77d 6205 */
04077599 6206static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
9e15b77d 6207{
2e9f7f1c
AE
6208 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
6209 struct rbd_spec *spec = rbd_dev->spec;
6210 const char *pool_name;
6211 const char *image_name;
6212 const char *snap_name;
9e15b77d
AE
6213 int ret;
6214
04077599
ID
6215 rbd_assert(spec->pool_id != CEPH_NOPOOL);
6216 rbd_assert(spec->image_id);
6217 rbd_assert(spec->snap_id != CEPH_NOSNAP);
9e15b77d 6218
2e9f7f1c 6219 /* Get the pool name; we have to make our own copy of this */
9e15b77d 6220
2e9f7f1c
AE
6221 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
6222 if (!pool_name) {
6223 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
935dc89f
AE
6224 return -EIO;
6225 }
2e9f7f1c
AE
6226 pool_name = kstrdup(pool_name, GFP_KERNEL);
6227 if (!pool_name)
9e15b77d
AE
6228 return -ENOMEM;
6229
6230 /* Fetch the image name; tolerate failure here */
6231
2e9f7f1c
AE
6232 image_name = rbd_dev_image_name(rbd_dev);
6233 if (!image_name)
06ecc6cb 6234 rbd_warn(rbd_dev, "unable to get image name");
9e15b77d 6235
04077599 6236 /* Fetch the snapshot name */
9e15b77d 6237
2e9f7f1c 6238 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
da6a6b63
JD
6239 if (IS_ERR(snap_name)) {
6240 ret = PTR_ERR(snap_name);
9e15b77d 6241 goto out_err;
2e9f7f1c
AE
6242 }
6243
6244 spec->pool_name = pool_name;
6245 spec->image_name = image_name;
6246 spec->snap_name = snap_name;
9e15b77d
AE
6247
6248 return 0;
04077599 6249
9e15b77d 6250out_err:
2e9f7f1c
AE
6251 kfree(image_name);
6252 kfree(pool_name);
9e15b77d
AE
6253 return ret;
6254}
6255
cc4a38bd 6256static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
35d489f9
AE
6257{
6258 size_t size;
6259 int ret;
6260 void *reply_buf;
6261 void *p;
6262 void *end;
6263 u64 seq;
6264 u32 snap_count;
6265 struct ceph_snap_context *snapc;
6266 u32 i;
6267
6268 /*
6269 * We'll need room for the seq value (maximum snapshot id),
6270 * the snapshot count, and an array of that many snapshot ids.
6271 * For now we have a fixed upper limit on the number we're
6272 * prepared to receive.
6273 */
6274 size = sizeof (__le64) + sizeof (__le32) +
6275 RBD_MAX_SNAP_COUNT * sizeof (__le64);
6276 reply_buf = kzalloc(size, GFP_KERNEL);
6277 if (!reply_buf)
6278 return -ENOMEM;
6279
ecd4a68a
ID
6280 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6281 &rbd_dev->header_oloc, "get_snapcontext",
6282 NULL, 0, reply_buf, size);
36be9a76 6283 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
35d489f9
AE
6284 if (ret < 0)
6285 goto out;
6286
35d489f9 6287 p = reply_buf;
57385b51
AE
6288 end = reply_buf + ret;
6289 ret = -ERANGE;
35d489f9
AE
6290 ceph_decode_64_safe(&p, end, seq, out);
6291 ceph_decode_32_safe(&p, end, snap_count, out);
6292
6293 /*
6294 * Make sure the reported number of snapshot ids wouldn't go
6295 * beyond the end of our buffer. But before checking that,
6296 * make sure the computed size of the snapshot context we
6297 * allocate is representable in a size_t.
6298 */
6299 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
6300 / sizeof (u64)) {
6301 ret = -EINVAL;
6302 goto out;
6303 }
6304 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
6305 goto out;
468521c1 6306 ret = 0;
35d489f9 6307
812164f8 6308 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
35d489f9
AE
6309 if (!snapc) {
6310 ret = -ENOMEM;
6311 goto out;
6312 }
35d489f9 6313 snapc->seq = seq;
35d489f9
AE
6314 for (i = 0; i < snap_count; i++)
6315 snapc->snaps[i] = ceph_decode_64(&p);
6316
49ece554 6317 ceph_put_snap_context(rbd_dev->header.snapc);
35d489f9
AE
6318 rbd_dev->header.snapc = snapc;
6319
6320 dout(" snap context seq = %llu, snap_count = %u\n",
57385b51 6321 (unsigned long long)seq, (unsigned int)snap_count);
35d489f9
AE
6322out:
6323 kfree(reply_buf);
6324
57385b51 6325 return ret;
35d489f9
AE
6326}
6327
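The snap_count guard above is the standard overflow-safe allocation bound: compare the count against (SIZE_MAX - header size) / element size before multiplying. A userspace sketch of the same check (the header struct is a stand-in for struct ceph_snap_context):

/*
 * Sketch of the overflow-safe bound; the header struct is a
 * stand-in for struct ceph_snap_context.
 */
#include <stdint.h>
#include <stdio.h>

struct snapc_hdr { uint64_t seq; uint32_t num_snaps; };

static int snapc_size_ok(uint32_t snap_count, size_t *bytes)
{
	size_t max = (SIZE_MAX - sizeof(struct snapc_hdr)) / sizeof(uint64_t);

	if (snap_count > max)
		return 0;		/* count * 8 would overflow size_t */
	*bytes = sizeof(struct snapc_hdr) +
		 (size_t)snap_count * sizeof(uint64_t);
	return 1;
}

int main(void)
{
	size_t bytes;

	if (snapc_size_ok(510, &bytes))	/* RBD_MAX_SNAP_COUNT */
		printf("snap context needs %zu bytes\n", bytes);
	return 0;
}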
54cac61f
AE
6328static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
6329 u64 snap_id)
b8b1e2db
AE
6330{
6331 size_t size;
6332 void *reply_buf;
54cac61f 6333 __le64 snapid;
b8b1e2db
AE
6334 int ret;
6335 void *p;
6336 void *end;
b8b1e2db
AE
6337 char *snap_name;
6338
6339 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
6340 reply_buf = kmalloc(size, GFP_KERNEL);
6341 if (!reply_buf)
6342 return ERR_PTR(-ENOMEM);
6343
54cac61f 6344 snapid = cpu_to_le64(snap_id);
ecd4a68a
ID
6345 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6346 &rbd_dev->header_oloc, "get_snapshot_name",
6347 &snapid, sizeof(snapid), reply_buf, size);
36be9a76 6348 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
f40eb349
AE
6349 if (ret < 0) {
6350 snap_name = ERR_PTR(ret);
b8b1e2db 6351 goto out;
f40eb349 6352 }
b8b1e2db
AE
6353
6354 p = reply_buf;
f40eb349 6355 end = reply_buf + ret;
e5c35534 6356 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
f40eb349 6357 if (IS_ERR(snap_name))
b8b1e2db 6358 goto out;
b8b1e2db 6359
f40eb349 6360 dout(" snap_id 0x%016llx snap_name = %s\n",
54cac61f 6361 (unsigned long long)snap_id, snap_name);
b8b1e2db
AE
6362out:
6363 kfree(reply_buf);
6364
f40eb349 6365 return snap_name;
b8b1e2db
AE
6366}
6367
2df3fac7 6368static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
117973fb 6369{
2df3fac7 6370 bool first_time = rbd_dev->header.object_prefix == NULL;
117973fb 6371 int ret;
117973fb 6372
1617e40c
JD
6373 ret = rbd_dev_v2_image_size(rbd_dev);
6374 if (ret)
cfbf6377 6375 return ret;
1617e40c 6376
2df3fac7
AE
6377 if (first_time) {
6378 ret = rbd_dev_v2_header_onetime(rbd_dev);
6379 if (ret)
cfbf6377 6380 return ret;
2df3fac7
AE
6381 }
6382
cc4a38bd 6383 ret = rbd_dev_v2_snap_context(rbd_dev);
d194cd1d
ID
6384 if (ret && first_time) {
6385 kfree(rbd_dev->header.object_prefix);
6386 rbd_dev->header.object_prefix = NULL;
6387 }
117973fb
AE
6388
6389 return ret;
6390}
6391
a720ae09
ID
6392static int rbd_dev_header_info(struct rbd_device *rbd_dev)
6393{
6394 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6395
6396 if (rbd_dev->image_format == 1)
6397 return rbd_dev_v1_header_info(rbd_dev);
6398
6399 return rbd_dev_v2_header_info(rbd_dev);
6400}
6401
e28fff26
AE
6402/*
6403 * Skips over white space at *buf, and updates *buf to point to the
6404 * first found non-space character (if any). Returns the length of
593a9e7b
AE
6405 * the token (string of non-white space characters) found. Note
6406 * that *buf must be terminated with '\0'.
e28fff26
AE
6407 */
6408static inline size_t next_token(const char **buf)
6409{
6410 /*
6411 * These are the characters that produce nonzero for
6412 * isspace() in the "C" and "POSIX" locales.
6413 */
6414 const char *spaces = " \f\n\r\t\v";
6415
6416 *buf += strspn(*buf, spaces); /* Find start of token */
6417
6418 return strcspn(*buf, spaces); /* Return token length */
6419}
6420
ea3352f4
AE
6421/*
6422 * Finds the next token in *buf, dynamically allocates a buffer big
6423 * enough to hold a copy of it, and copies the token into the new
6424 * buffer. The copy is guaranteed to be terminated with '\0'. Note
6425 * that a duplicate buffer is created even for a zero-length token.
6426 *
6427 * Returns a pointer to the newly-allocated duplicate, or a null
6428 * pointer if memory for the duplicate was not available. If
6429 * the lenp argument is a non-null pointer, the length of the token
6430 * (not including the '\0') is returned in *lenp.
6431 *
6432 * If successful, the *buf pointer will be updated to point beyond
6433 * the end of the found token.
6434 *
6435 * Note: uses GFP_KERNEL for allocation.
6436 */
6437static inline char *dup_token(const char **buf, size_t *lenp)
6438{
6439 char *dup;
6440 size_t len;
6441
6442 len = next_token(buf);
4caf35f9 6443 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
ea3352f4
AE
6444 if (!dup)
6445 return NULL;
ea3352f4
AE
6446 *(dup + len) = '\0';
6447 *buf += len;
6448
6449 if (lenp)
6450 *lenp = len;
6451
6452 return dup;
6453}
6454
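next_token() and dup_token() implement a tiny whitespace tokenizer on top of strspn() and strcspn(). A runnable userspace demo of the same loop over a made-up mapping request:

/*
 * Demo of the strspn()/strcspn() tokenizer above; the input
 * string is a made-up mapping request.
 */
#include <stdio.h>
#include <string.h>

static const char spaces[] = " \f\n\r\t\v";

static size_t next_token(const char **buf)
{
	*buf += strspn(*buf, spaces);	/* find start of token */
	return strcspn(*buf, spaces);	/* return token length */
}

int main(void)
{
	const char *buf = "  1.2.3.4:6789 name=admin rbd myimage";
	size_t len;

	while ((len = next_token(&buf)) != 0) {
		printf("token: %.*s\n", (int)len, buf);
		buf += len;
	}
	return 0;
}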
a725f65e 6455/*
859c31df
AE
6456 * Parse the options provided for an "rbd add" (i.e., rbd image
6457 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
6458 * and the data written is passed here via a NUL-terminated buffer.
6459 * Returns 0 if successful or an error code otherwise.
d22f76e7 6460 *
859c31df
AE
6461 * The information extracted from these options is recorded in
6462 * the other parameters which return dynamically-allocated
6463 * structures:
6464 * ceph_opts
6465 * The address of a pointer that will refer to a ceph options
6466 * structure. Caller must release the returned pointer using
6467 * ceph_destroy_options() when it is no longer needed.
6468 * rbd_opts
6469 * Address of an rbd options pointer. Fully initialized by
6470 * this function; caller must release with kfree().
6471 * spec
6472 * Address of an rbd image specification pointer. Fully
6473 * initialized by this function based on parsed options.
6474 * Caller must release with rbd_spec_put().
6475 *
6476 * The options passed take this form:
6477 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
6478 * where:
6479 * <mon_addrs>
6480 * A comma-separated list of one or more monitor addresses.
6481 * A monitor address is an ip address, optionally followed
6482 * by a port number (separated by a colon).
6483 * I.e.: ip1[:port1][,ip2[:port2]...]
6484 * <options>
6485 * A comma-separated list of ceph and/or rbd options.
6486 * <pool_name>
6487 * The name of the rados pool containing the rbd image.
6488 * <image_name>
6489 * The name of the image in that pool to map.
6490 * <snap_id>
6491 * An optional snapshot id. If provided, the mapping will
6492 * present data from the image at the time that snapshot was
6493 * created. The image head is used if no snapshot id is
6494 * provided. Snapshot mappings are always read-only.
a725f65e 6495 */
859c31df 6496static int rbd_add_parse_args(const char *buf,
dc79b113 6497 struct ceph_options **ceph_opts,
859c31df
AE
6498 struct rbd_options **opts,
6499 struct rbd_spec **rbd_spec)
e28fff26 6500{
d22f76e7 6501 size_t len;
859c31df 6502 char *options;
0ddebc0c 6503 const char *mon_addrs;
ecb4dc22 6504 char *snap_name;
0ddebc0c 6505 size_t mon_addrs_size;
c300156b 6506 struct parse_rbd_opts_ctx pctx = { 0 };
859c31df 6507 struct ceph_options *copts;
dc79b113 6508 int ret;
e28fff26
AE
6509
6510 /* The first four tokens are required */
6511
7ef3214a 6512 len = next_token(&buf);
4fb5d671
AE
6513 if (!len) {
6514 rbd_warn(NULL, "no monitor address(es) provided");
6515 return -EINVAL;
6516 }
0ddebc0c 6517 mon_addrs = buf;
f28e565a 6518 mon_addrs_size = len + 1;
7ef3214a 6519 buf += len;
a725f65e 6520
dc79b113 6521 ret = -EINVAL;
f28e565a
AE
6522 options = dup_token(&buf, NULL);
6523 if (!options)
dc79b113 6524 return -ENOMEM;
4fb5d671
AE
6525 if (!*options) {
6526 rbd_warn(NULL, "no options provided");
6527 goto out_err;
6528 }
e28fff26 6529
c300156b
ID
6530 pctx.spec = rbd_spec_alloc();
6531 if (!pctx.spec)
f28e565a 6532 goto out_mem;
859c31df 6533
c300156b
ID
6534 pctx.spec->pool_name = dup_token(&buf, NULL);
6535 if (!pctx.spec->pool_name)
859c31df 6536 goto out_mem;
c300156b 6537 if (!*pctx.spec->pool_name) {
4fb5d671
AE
6538 rbd_warn(NULL, "no pool name provided");
6539 goto out_err;
6540 }
e28fff26 6541
c300156b
ID
6542 pctx.spec->image_name = dup_token(&buf, NULL);
6543 if (!pctx.spec->image_name)
f28e565a 6544 goto out_mem;
c300156b 6545 if (!*pctx.spec->image_name) {
4fb5d671
AE
6546 rbd_warn(NULL, "no image name provided");
6547 goto out_err;
6548 }
d4b125e9 6549
f28e565a
AE
6550 /*
6551 * Snapshot name is optional; default is to use "-"
6552 * (indicating the head/no snapshot).
6553 */
3feeb894 6554 len = next_token(&buf);
820a5f3e 6555 if (!len) {
3feeb894
AE
6556 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
6557 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
f28e565a 6558 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
dc79b113 6559 ret = -ENAMETOOLONG;
f28e565a 6560 goto out_err;
849b4260 6561 }
ecb4dc22
AE
6562 snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
6563 if (!snap_name)
f28e565a 6564 goto out_mem;
ecb4dc22 6565 *(snap_name + len) = '\0';
c300156b 6566 pctx.spec->snap_name = snap_name;
e5c35534 6567
0ddebc0c 6568 /* Initialize all rbd options to the defaults */
e28fff26 6569
c300156b
ID
6570 pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
6571 if (!pctx.opts)
4e9afeba
AE
6572 goto out_mem;
6573
c300156b
ID
6574 pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
6575 pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
0c93e1b7 6576 pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
c300156b
ID
6577 pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
6578 pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
6579 pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
6580 pctx.opts->trim = RBD_TRIM_DEFAULT;
d22f76e7 6581
859c31df 6582 copts = ceph_parse_options(options, mon_addrs,
c300156b
ID
6583 mon_addrs + mon_addrs_size - 1,
6584 parse_rbd_opts_token, &pctx);
859c31df
AE
6585 if (IS_ERR(copts)) {
6586 ret = PTR_ERR(copts);
dc79b113
AE
6587 goto out_err;
6588 }
859c31df
AE
6589 kfree(options);
6590
6591 *ceph_opts = copts;
c300156b
ID
6592 *opts = pctx.opts;
6593 *rbd_spec = pctx.spec;
0ddebc0c 6594
dc79b113 6595 return 0;
f28e565a 6596out_mem:
dc79b113 6597 ret = -ENOMEM;
d22f76e7 6598out_err:
c300156b
ID
6599 kfree(pctx.opts);
6600 rbd_spec_put(pctx.spec);
f28e565a 6601 kfree(options);
d22f76e7 6602
dc79b113 6603 return ret;
a725f65e
AE
6604}
6605
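For illustration, this is roughly what writing the documented format to /sys/bus/rbd/add looks like from userspace (normally the rbd CLI does this; the monitor address, option, pool, and image name below are placeholders):

/*
 * Illustrative write of "<mon_addrs> <options> <pool_name>
 * <image_name>" to /sys/bus/rbd/add; all values are placeholders.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *req = "1.2.3.4:6789 name=admin rbd myimage";
	int fd = open("/sys/bus/rbd/add", O_WRONLY);

	if (fd < 0 || write(fd, req, strlen(req)) < 0)
		perror("rbd add");	/* needs root and the rbd module */
	if (fd >= 0)
		close(fd);
	return 0;
}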
e010dd0a
ID
6606static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
6607{
6608 down_write(&rbd_dev->lock_rwsem);
6609 if (__rbd_is_lock_owner(rbd_dev))
e1fddc8f 6610 __rbd_release_lock(rbd_dev);
e010dd0a
ID
6611 up_write(&rbd_dev->lock_rwsem);
6612}
6613
637cd060
ID
6614/*
6615 * If the wait is interrupted, an error is returned even if the lock
6616 * was successfully acquired. rbd_dev_image_unlock() will release it
6617 * if needed.
6618 */
e010dd0a
ID
6619static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
6620{
637cd060 6621 long ret;
2f18d466 6622
e010dd0a 6623 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
637cd060
ID
6624 if (!rbd_dev->opts->exclusive && !rbd_dev->opts->lock_on_read)
6625 return 0;
6626
e010dd0a
ID
6627 rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
6628 return -EINVAL;
6629 }
6630
3fe69921 6631 if (rbd_is_ro(rbd_dev))
637cd060
ID
6632 return 0;
6633
6634 rbd_assert(!rbd_is_lock_owner(rbd_dev));
6635 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
6636 ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait,
6637 ceph_timeout_jiffies(rbd_dev->opts->lock_timeout));
25e6be21 6638 if (ret > 0) {
637cd060 6639 ret = rbd_dev->acquire_err;
25e6be21
DY
6640 } else {
6641 cancel_delayed_work_sync(&rbd_dev->lock_dwork);
6642 if (!ret)
6643 ret = -ETIMEDOUT;
6644 }
637cd060 6645
2f18d466 6646 if (ret) {
637cd060
ID
6647 rbd_warn(rbd_dev, "failed to acquire exclusive lock: %ld", ret);
6648 return ret;
e010dd0a
ID
6649 }
6650
637cd060
ID
6651 /*
6652 * The lock may have been released by now, unless automatic lock
6653 * transitions are disabled.
6654 */
6655 rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev));
e010dd0a
ID
6656 return 0;
6657}
6658
589d30e0
AE
6659/*
6660 * An rbd format 2 image has a unique identifier, distinct from the
6661 * name given to it by the user. Internally, that identifier is
6662 * what's used to specify the names of objects related to the image.
6663 *
6664 * A special "rbd id" object is used to map an rbd image name to its
6665 * id. If that object doesn't exist, then there is no v2 rbd image
6666 * with the supplied name.
6667 *
6668 * This function will record the given rbd_dev's image_id field if
6669 * it can be determined, and in that case will return 0. If any
6670 * errors occur a negative errno will be returned and the rbd_dev's
6671 * image_id field will be unchanged (and should be NULL).
6672 */
6673static int rbd_dev_image_id(struct rbd_device *rbd_dev)
6674{
6675 int ret;
6676 size_t size;
ecd4a68a 6677 CEPH_DEFINE_OID_ONSTACK(oid);
589d30e0 6678 void *response;
c0fba368 6679 char *image_id;
2f82ee54 6680
2c0d0a10
AE
6681 /*
6682 * When probing a parent image, the image id is already
6683 * known (and the image name likely is not). There's no
c0fba368
AE
6684 * need to fetch the image id again in this case. We
6685 * do still need to set the image format though.
2c0d0a10 6686 */
c0fba368
AE
6687 if (rbd_dev->spec->image_id) {
6688 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
6689
2c0d0a10 6690 return 0;
c0fba368 6691 }
2c0d0a10 6692
589d30e0
AE
6693 /*
6694 * First, see if the format 2 image id file exists, and if
6695 * so, get the image's persistent id from it.
6696 */
ecd4a68a
ID
6697 ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
6698 rbd_dev->spec->image_name);
6699 if (ret)
6700 return ret;
6701
6702 dout("rbd id object name is %s\n", oid.name);
589d30e0
AE
6703
6704 /* Response will be an encoded string, which includes a length */
589d30e0
AE
6705 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
6706 response = kzalloc(size, GFP_NOIO);
6707 if (!response) {
6708 ret = -ENOMEM;
6709 goto out;
6710 }
6711
c0fba368
AE
6712 /* If it doesn't exist we'll assume it's a format 1 image */
6713
ecd4a68a
ID
6714 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
6715 "get_id", NULL, 0,
5435d206 6716 response, size);
36be9a76 6717 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
c0fba368
AE
6718 if (ret == -ENOENT) {
6719 image_id = kstrdup("", GFP_KERNEL);
6720 ret = image_id ? 0 : -ENOMEM;
6721 if (!ret)
6722 rbd_dev->image_format = 1;
7dd440c9 6723 } else if (ret >= 0) {
c0fba368
AE
6724 void *p = response;
6725
6726 image_id = ceph_extract_encoded_string(&p, p + ret,
979ed480 6727 NULL, GFP_NOIO);
461f758a 6728 ret = PTR_ERR_OR_ZERO(image_id);
c0fba368
AE
6729 if (!ret)
6730 rbd_dev->image_format = 2;
c0fba368
AE
6731 }
6732
6733 if (!ret) {
6734 rbd_dev->spec->image_id = image_id;
6735 dout("image_id is %s\n", image_id);
589d30e0
AE
6736 }
6737out:
6738 kfree(response);
ecd4a68a 6739 ceph_oid_destroy(&oid);
589d30e0
AE
6740 return ret;
6741}
6742
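The probe above hinges on a naming convention: a format 2 image has an id object named RBD_ID_PREFIX plus the image name, while format 1 images have no id object at all, so -ENOENT means format 1. Assuming RBD_ID_PREFIX is "rbd_id." as defined in rbd_types.h, a sketch of that logic with a toy stand-in for the OSD class method call:

/*
 * Sketch of the format probe, assuming RBD_ID_PREFIX is "rbd_id.";
 * get_id() is a toy stand-in for the OSD class method call.
 */
#include <stdio.h>
#include <string.h>

static int get_id(const char *oid, char *id, size_t len)
{
	/* only format 2 images have an id object */
	if (!strcmp(oid, "rbd_id.myimage")) {
		snprintf(id, len, "1f2c6651a30b");	/* made-up id */
		return 0;
	}
	return -2;			/* -ENOENT */
}

int main(void)
{
	char oid[128], id[64];

	snprintf(oid, sizeof(oid), "rbd_id.%s", "myimage");
	if (!get_id(oid, id, sizeof(id)))
		printf("format 2, image id %s\n", id);
	else
		printf("format 1 (no id object)\n");
	return 0;
}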
3abef3b3
AE
6743/*
6744 * Undo whatever state changes are made by v1 or v2 header info
6745 * call.
6746 */
6fd48b3b
AE
6747static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
6748{
6749 struct rbd_image_header *header;
6750
e69b8d41 6751 rbd_dev_parent_put(rbd_dev);
22e8bd51 6752 rbd_object_map_free(rbd_dev);
da5ef6be 6753 rbd_dev_mapping_clear(rbd_dev);
6fd48b3b
AE
6754
6755 /* Free dynamic fields from the header, then zero it out */
6756
6757 header = &rbd_dev->header;
812164f8 6758 ceph_put_snap_context(header->snapc);
6fd48b3b
AE
6759 kfree(header->snap_sizes);
6760 kfree(header->snap_names);
6761 kfree(header->object_prefix);
6762 memset(header, 0, sizeof (*header));
6763}
6764
2df3fac7 6765static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
a30b71b9
AE
6766{
6767 int ret;
a30b71b9 6768
1e130199 6769 ret = rbd_dev_v2_object_prefix(rbd_dev);
57385b51 6770 if (ret)
b1b5402a
AE
6771 goto out_err;
6772
2df3fac7
AE
6773 /*
6774 * Get and check the features for the image. Currently the
6775 * features are assumed to never change.
6776 */
b1b5402a 6777 ret = rbd_dev_v2_features(rbd_dev);
57385b51 6778 if (ret)
9d475de5 6779 goto out_err;
35d489f9 6780
cc070d59
AE
6781 /* If the image supports fancy striping, get its parameters */
6782
6783 if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
6784 ret = rbd_dev_v2_striping_info(rbd_dev);
6785 if (ret < 0)
6786 goto out_err;
6787 }
a30b71b9 6788
7e97332e
ID
6789 if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
6790 ret = rbd_dev_v2_data_pool(rbd_dev);
6791 if (ret)
6792 goto out_err;
6793 }
6794
263423f8 6795 rbd_init_layout(rbd_dev);
35152979 6796 return 0;
263423f8 6797
9d475de5 6798out_err:
642a2537 6799 rbd_dev->header.features = 0;
1e130199
AE
6800 kfree(rbd_dev->header.object_prefix);
6801 rbd_dev->header.object_prefix = NULL;
9d475de5 6802 return ret;
a30b71b9
AE
6803}
6804
6d69bb53
ID
6805/*
6806 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
6807 * rbd_dev_image_probe() recursion depth, which means it's also the
6808 * length of the already discovered part of the parent chain.
6809 */
6810static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
83a06263 6811{
2f82ee54 6812 struct rbd_device *parent = NULL;
124afba2
AE
6813 int ret;
6814
6815 if (!rbd_dev->parent_spec)
6816 return 0;
124afba2 6817
6d69bb53
ID
6818 if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
6819 pr_info("parent chain is too long (%d)\n", depth);
6820 ret = -EINVAL;
6821 goto out_err;
6822 }
6823
1643dfa4 6824 parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
1f2c6651
ID
6825 if (!parent) {
6826 ret = -ENOMEM;
124afba2 6827 goto out_err;
1f2c6651
ID
6828 }
6829
6830 /*
6831 * Images related by parent/child relationships always share
6832 * rbd_client and spec/parent_spec, so bump their refcounts.
6833 */
6834 __rbd_get_client(rbd_dev->rbd_client);
6835 rbd_spec_get(rbd_dev->parent_spec);
124afba2 6836
39258aa2
ID
6837 __set_bit(RBD_DEV_FLAG_READONLY, &parent->flags);
6838
6d69bb53 6839 ret = rbd_dev_image_probe(parent, depth);
124afba2
AE
6840 if (ret < 0)
6841 goto out_err;
1f2c6651 6842
124afba2 6843 rbd_dev->parent = parent;
a2acd00e 6844 atomic_set(&rbd_dev->parent_ref, 1);
124afba2 6845 return 0;
1f2c6651 6846
124afba2 6847out_err:
1f2c6651 6848 rbd_dev_unparent(rbd_dev);
1761b229 6849 rbd_dev_destroy(parent);
124afba2
AE
6850 return ret;
6851}
6852
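rbd_dev_probe_parent() bounds its own recursion: it probes one parent per level and refuses chains deeper than RBD_MAX_PARENT_CHAIN_LEN. A self-contained sketch of that shape:

/*
 * Sketch of the bounded recursion above: probe one parent per
 * level and refuse chains deeper than a fixed limit.
 */
#include <stdio.h>

#define MAX_CHAIN 16			/* RBD_MAX_PARENT_CHAIN_LEN */

struct image { const char *name; struct image *parent; };

static int probe(struct image *img, int depth)
{
	if (!img->parent)
		return 0;
	if (++depth > MAX_CHAIN) {
		printf("parent chain is too long (%d)\n", depth);
		return -1;		/* -EINVAL in the driver */
	}
	printf("probing parent %s at depth %d\n", img->parent->name, depth);
	return probe(img->parent, depth);
}

int main(void)
{
	struct image base = { "base", NULL };
	struct image mid = { "mid", &base };
	struct image top = { "top", &mid };

	return probe(&top, 0);
}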
5769ed0c
ID
6853static void rbd_dev_device_release(struct rbd_device *rbd_dev)
6854{
6855 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5769ed0c
ID
6856 rbd_free_disk(rbd_dev);
6857 if (!single_major)
6858 unregister_blkdev(rbd_dev->major, rbd_dev->name);
6859}
6860
811c6688
ID
6861/*
6862 * rbd_dev->header_rwsem must be locked for write and will be unlocked
6863 * upon return.
6864 */
200a6a8b 6865static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
124afba2 6866{
83a06263 6867 int ret;
d1cf5788 6868
9b60e70b 6869 /* Record our major and minor device numbers. */
83a06263 6870
9b60e70b
ID
6871 if (!single_major) {
6872 ret = register_blkdev(0, rbd_dev->name);
6873 if (ret < 0)
1643dfa4 6874 goto err_out_unlock;
9b60e70b
ID
6875
6876 rbd_dev->major = ret;
6877 rbd_dev->minor = 0;
6878 } else {
6879 rbd_dev->major = rbd_major;
6880 rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
6881 }
83a06263
AE
6882
6883 /* Set up the blkdev mapping. */
6884
6885 ret = rbd_init_disk(rbd_dev);
6886 if (ret)
6887 goto err_out_blkdev;
6888
f35a4dee 6889 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
39258aa2 6890 set_disk_ro(rbd_dev->disk, rbd_is_ro(rbd_dev));
f35a4dee 6891
5769ed0c 6892 ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
f35a4dee 6893 if (ret)
da5ef6be 6894 goto err_out_disk;
83a06263 6895
129b79d4 6896 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
811c6688 6897 up_write(&rbd_dev->header_rwsem);
5769ed0c 6898 return 0;
2f82ee54 6899
83a06263
AE
6900err_out_disk:
6901 rbd_free_disk(rbd_dev);
6902err_out_blkdev:
9b60e70b
ID
6903 if (!single_major)
6904 unregister_blkdev(rbd_dev->major, rbd_dev->name);
811c6688
ID
6905err_out_unlock:
6906 up_write(&rbd_dev->header_rwsem);
83a06263
AE
6907 return ret;
6908}
6909
332bb12d
AE
6910static int rbd_dev_header_name(struct rbd_device *rbd_dev)
6911{
6912 struct rbd_spec *spec = rbd_dev->spec;
c41d13a3 6913 int ret;
332bb12d
AE
6914
6915 /* Record the header object name for this rbd image. */
6916
6917 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
332bb12d 6918 if (rbd_dev->image_format == 1)
c41d13a3
ID
6919 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6920 spec->image_name, RBD_SUFFIX);
332bb12d 6921 else
c41d13a3
ID
6922 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6923 RBD_HEADER_PREFIX, spec->image_id);
332bb12d 6924
c41d13a3 6925 return ret;
332bb12d
AE
6926}
6927
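rbd_dev_header_name() encodes the other half of the on-disk naming scheme: a format 1 header object is the image name plus RBD_SUFFIX, and a format 2 header object is RBD_HEADER_PREFIX plus the image id. Assuming those macros are ".rbd" and "rbd_header." (their values in rbd_types.h), a sketch with a made-up id:

/*
 * Sketch of the two header-object names, assuming RBD_SUFFIX is
 * ".rbd" and RBD_HEADER_PREFIX is "rbd_header." per rbd_types.h.
 */
#include <stdio.h>

int main(void)
{
	const char *image_name = "myimage";	/* v1 images: named */
	const char *image_id = "1f2c6651a30b";	/* v2 images: id */
	char v1[128], v2[128];

	snprintf(v1, sizeof(v1), "%s.rbd", image_name);
	snprintf(v2, sizeof(v2), "rbd_header.%s", image_id);

	printf("format 1 header object: %s\n", v1);
	printf("format 2 header object: %s\n", v2);
	return 0;
}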
b9ef2b88
ID
6928static void rbd_print_dne(struct rbd_device *rbd_dev, bool is_snap)
6929{
6930 if (!is_snap) {
6931 pr_info("image %s/%s%s%s does not exist\n",
6932 rbd_dev->spec->pool_name,
6933 rbd_dev->spec->pool_ns ?: "",
6934 rbd_dev->spec->pool_ns ? "/" : "",
6935 rbd_dev->spec->image_name);
6936 } else {
6937 pr_info("snap %s/%s%s%s@%s does not exist\n",
6938 rbd_dev->spec->pool_name,
6939 rbd_dev->spec->pool_ns ?: "",
6940 rbd_dev->spec->pool_ns ? "/" : "",
6941 rbd_dev->spec->image_name,
6942 rbd_dev->spec->snap_name);
6943 }
6944}
6945
200a6a8b
AE
6946static void rbd_dev_image_release(struct rbd_device *rbd_dev)
6947{
6fd48b3b 6948 rbd_dev_unprobe(rbd_dev);
fd22aef8
ID
6949 if (rbd_dev->opts)
6950 rbd_unregister_watch(rbd_dev);
6fd48b3b
AE
6951 rbd_dev->image_format = 0;
6952 kfree(rbd_dev->spec->image_id);
6953 rbd_dev->spec->image_id = NULL;
200a6a8b
AE
6954}
6955
a30b71b9
AE
6956/*
6957 * Probe for the existence of the header object for the given rbd
1f3ef788
AE
6958 * device. If this image is the one being mapped (i.e., not a
6959 * parent), initiate a watch on its header object before using that
6960 * object to get detailed information about the rbd image.
a30b71b9 6961 */
6d69bb53 6962static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
a30b71b9 6963{
b9ef2b88 6964 bool need_watch = !rbd_is_ro(rbd_dev);
a30b71b9
AE
6965 int ret;
6966
6967 /*
3abef3b3
AE
6968 * Get the id from the image id object. Unless there's an
6969 * error, rbd_dev->spec->image_id will be filled in with
6970 * a dynamically-allocated string, and rbd_dev->image_format
6971 * will be set to either 1 or 2.
a30b71b9
AE
6972 */
6973 ret = rbd_dev_image_id(rbd_dev);
6974 if (ret)
c0fba368 6975 return ret;
c0fba368 6976
332bb12d
AE
6977 ret = rbd_dev_header_name(rbd_dev);
6978 if (ret)
6979 goto err_out_format;
6980
	if (need_watch) {
		ret = rbd_register_watch(rbd_dev);
		if (ret) {
			if (ret == -ENOENT)
				rbd_print_dne(rbd_dev, false);
			goto err_out_format;
		}
	}

	ret = rbd_dev_header_info(rbd_dev);
	if (ret) {
		if (ret == -ENOENT && !need_watch)
			rbd_print_dne(rbd_dev, false);
		goto err_out_watch;
	}

	/*
	 * If this image is the one being mapped, we have pool name and
	 * id, image name and id, and snap name - need to fill snap id.
	 * Otherwise this is a parent image, identified by pool, image
	 * and snap ids - need to fill in names for those ids.
	 */
	if (!depth)
		ret = rbd_spec_fill_snap_id(rbd_dev);
	else
		ret = rbd_spec_fill_names(rbd_dev);
	if (ret) {
		if (ret == -ENOENT)
			rbd_print_dne(rbd_dev, true);
		goto err_out_probe;
	}

	ret = rbd_dev_mapping_set(rbd_dev);
	if (ret)
		goto err_out_probe;

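	/*
	 * A mapped snapshot loads its object map here; for the head
	 * revision the object map is loaded when the exclusive lock is
	 * acquired instead.
	 */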
	if (rbd_is_snap(rbd_dev) &&
	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) {
		ret = rbd_object_map_load(rbd_dev);
		if (ret)
			goto err_out_probe;
	}

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto err_out_probe;
	}

	ret = rbd_dev_probe_parent(rbd_dev, depth);
	if (ret)
		goto err_out_probe;

	dout("discovered format %u image, header name is %s\n",
	     rbd_dev->image_format, rbd_dev->header_oid.name);
	return 0;

err_out_probe:
	rbd_dev_unprobe(rbd_dev);
err_out_watch:
	if (need_watch)
		rbd_unregister_watch(rbd_dev);
err_out_format:
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;
	return ret;
}

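/*
 * Illustrative mapping example (addresses, names and key are made up;
 * see Documentation/ABI/testing/sysfs-bus-rbd for the exact syntax):
 *
 *   $ echo "1.2.3.4:6789 name=admin,secret=<key> rbdpool foo" \
 *         > /sys/bus/rbd/add
 *
 * maps image "foo" from pool "rbdpool" through the monitor at 1.2.3.4.
 */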
static ssize_t do_rbd_add(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	int rc;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto out;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}

	/* pick the pool */
	rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
	if (rc < 0) {
		if (rc == -ENOENT)
			pr_info("pool %s does not exist\n", spec->pool_name);
		goto err_out_client;
	}
	spec->pool_id = (u64)rc;

	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
	if (!rbd_dev) {
		rc = -ENOMEM;
		goto err_out_client;
	}
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */
	rbd_opts = NULL;	/* rbd_dev now owns this */

	/* if we are mapping a snapshot it will be a read-only mapping */
	if (rbd_dev->opts->read_only ||
	    strcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME))
		__set_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);

	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
	if (!rbd_dev->config_info) {
		rc = -ENOMEM;
		goto err_out_rbd_dev;
	}

	down_write(&rbd_dev->header_rwsem);
	rc = rbd_dev_image_probe(rbd_dev, 0);
	if (rc < 0) {
		up_write(&rbd_dev->header_rwsem);
		goto err_out_rbd_dev;
	}

	if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
		rbd_warn(rbd_dev, "alloc_size adjusted to %u",
			 rbd_dev->layout.object_size);
		rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
	}

	rc = rbd_dev_device_setup(rbd_dev);
	if (rc)
		goto err_out_image_probe;

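	/*
	 * For exclusive-lock images the lock is acquired before the
	 * disk is announced, so that I/O submitted right after mapping
	 * does not stall waiting for the lock.
	 */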
	rc = rbd_add_acquire_lock(rbd_dev);
	if (rc)
		goto err_out_image_lock;

	/* Everything's ready.  Announce the disk to the world. */

	rc = device_add(&rbd_dev->dev);
	if (rc)
		goto err_out_image_lock;

	add_disk(rbd_dev->disk);
	/* see rbd_init_disk() */
	blk_put_queue(rbd_dev->disk->queue);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);

	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
		rbd_dev->header.features);
	rc = count;
out:
	module_put(THIS_MODULE);
	return rc;

err_out_image_lock:
	rbd_dev_image_unlock(rbd_dev);
	rbd_dev_device_release(rbd_dev);
err_out_image_probe:
	rbd_dev_image_release(rbd_dev);
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	rbd_spec_put(spec);
	kfree(rbd_opts);
	goto out;
}

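/*
 * With single_major=Y, mapping and unmapping must go through the
 * add_single_major/remove_single_major attributes; the plain "add"
 * and "remove" attributes then refuse with -EINVAL.
 */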
7e9586ba 7162static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
9b60e70b
ID
7163{
7164 if (single_major)
7165 return -EINVAL;
7166
7167 return do_rbd_add(bus, buf, count);
7168}
7169
7e9586ba
GKH
7170static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
7171 size_t count)
9b60e70b
ID
7172{
7173 return do_rbd_add(bus, buf, count);
7174}
7175
static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
{
	while (rbd_dev->parent) {
		struct rbd_device *first = rbd_dev;
		struct rbd_device *second = first->parent;
		struct rbd_device *third;

		/*
		 * Follow to the parent with no grandparent and
		 * remove it.
		 */
		while (second && (third = second->parent)) {
			first = second;
			second = third;
		}
		rbd_assert(second);
		rbd_dev_image_release(second);
		rbd_dev_destroy(second);
		first->parent = NULL;
		first->parent_overlap = 0;

		rbd_assert(first->parent_spec);
		rbd_spec_put(first->parent_spec);
		first->parent_spec = NULL;
	}
}

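/*
 * Illustrative unmap examples (device id made up):
 *
 *   $ echo 0 > /sys/bus/rbd/remove           # -EBUSY if still open
 *   $ echo "0 force" > /sys/bus/rbd/remove   # unmap even while open,
 *                                            # failing outstanding I/O
 */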
static ssize_t do_rbd_remove(struct bus_type *bus,
			     const char *buf,
			     size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct list_head *tmp;
	int dev_id;
	char opt_buf[6];
	bool force = false;
	int ret;

	dev_id = -1;
	opt_buf[0] = '\0';
	sscanf(buf, "%d %5s", &dev_id, opt_buf);
	if (dev_id < 0) {
		pr_err("dev_id out of range\n");
		return -EINVAL;
	}
	if (opt_buf[0] != '\0') {
		if (!strcmp(opt_buf, "force")) {
			force = true;
		} else {
			pr_err("bad remove option at '%s'\n", opt_buf);
			return -EINVAL;
		}
	}

	ret = -ENOENT;
	spin_lock(&rbd_dev_list_lock);
	list_for_each(tmp, &rbd_dev_list) {
		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id == dev_id) {
			ret = 0;
			break;
		}
	}
	if (!ret) {
		spin_lock_irq(&rbd_dev->lock);
		if (rbd_dev->open_count && !force)
			ret = -EBUSY;
		else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
					  &rbd_dev->flags))
			ret = -EINPROGRESS;
		spin_unlock_irq(&rbd_dev->lock);
	}
	spin_unlock(&rbd_dev_list_lock);
	if (ret)
		return ret;

	if (force) {
		/*
		 * Prevent new IO from being queued and wait for existing
		 * IO to complete/fail.
		 */
		blk_mq_freeze_queue(rbd_dev->disk->queue);
		blk_set_queue_dying(rbd_dev->disk->queue);
	}

	del_gendisk(rbd_dev->disk);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);
	spin_unlock(&rbd_dev_list_lock);
	device_del(&rbd_dev->dev);

	rbd_dev_image_unlock(rbd_dev);
	rbd_dev_device_release(rbd_dev);
	rbd_dev_image_release(rbd_dev);
	rbd_dev_destroy(rbd_dev);
	return count;
}

static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
{
	if (single_major)
		return -EINVAL;

	return do_rbd_remove(bus, buf, count);
}

static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
					 size_t count)
{
	return do_rbd_remove(bus, buf, count);
}

/*
 * create control files in sysfs
 * /sys/bus/rbd/...
 */
static int __init rbd_sysfs_init(void)
{
	int ret;

	ret = device_register(&rbd_root_dev);
	if (ret < 0)
		return ret;

	ret = bus_register(&rbd_bus_type);
	if (ret < 0)
		device_unregister(&rbd_root_dev);

	return ret;
}

static void __exit rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}

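/*
 * Caches for the request structures allocated on every I/O: one image
 * request per block-layer request, one object request per object it
 * touches.
 */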
static int __init rbd_slab_init(void)
{
	rbd_assert(!rbd_img_request_cache);
	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
	if (!rbd_img_request_cache)
		return -ENOMEM;

	rbd_assert(!rbd_obj_request_cache);
	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
	if (!rbd_obj_request_cache)
		goto out_err;

	return 0;

out_err:
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
	return -ENOMEM;
}

static void rbd_slab_exit(void)
{
	rbd_assert(rbd_obj_request_cache);
	kmem_cache_destroy(rbd_obj_request_cache);
	rbd_obj_request_cache = NULL;

	rbd_assert(rbd_img_request_cache);
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
}

static int __init rbd_init(void)
{
	int rc;

	if (!libceph_compatible(NULL)) {
		rbd_warn(NULL, "libceph incompatibility (quitting)");
		return -EINVAL;
	}

	rc = rbd_slab_init();
	if (rc)
		return rc;

	/*
	 * The number of active work items is limited by the number of
	 * rbd devices * queue depth, so leave @max_active at default.
	 */
	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
	if (!rbd_wq) {
		rc = -ENOMEM;
		goto err_out_slab;
	}

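	/*
	 * With single_major=Y one major is registered here and shared
	 * by all devices, each getting a slice of its minor space;
	 * otherwise every device registers its own major in
	 * rbd_dev_device_setup().
	 */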
	if (single_major) {
		rbd_major = register_blkdev(0, RBD_DRV_NAME);
		if (rbd_major < 0) {
			rc = rbd_major;
			goto err_out_wq;
		}
	}

	rc = rbd_sysfs_init();
	if (rc)
		goto err_out_blkdev;

	if (single_major)
		pr_info("loaded (major %d)\n", rbd_major);
	else
		pr_info("loaded\n");

	return 0;

err_out_blkdev:
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
err_out_wq:
	destroy_workqueue(rbd_wq);
err_out_slab:
	rbd_slab_exit();
	return rc;
}

static void __exit rbd_exit(void)
{
	ida_destroy(&rbd_dev_id_ida);
	rbd_sysfs_cleanup();
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
	destroy_workqueue(rbd_wq);
	rbd_slab_exit();
}

module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
MODULE_LICENSE("GPL");