/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/striper.h>
#include <linux/ceph/decode.h>
#include <linux/fs_parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Increment the given counter and return its updated value.
 * If the counter is already 0 it will not be incremented.
 * If the counter is already at its maximum value returns
 * -EINVAL without updating it.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
	unsigned int counter;

	counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
	if (counter <= (unsigned int)INT_MAX)
		return (int)counter;

	atomic_dec(v);

	return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
	int counter;

	counter = atomic_dec_return(v);
	if (counter >= 0)
		return counter;

	atomic_inc(v);

	return -EINVAL;
}
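
/*
 * Example of the saturating behaviour: with v at 0,
 * atomic_fetch_add_unless(v, 1, 0) returns 0 and leaves v untouched,
 * so the caller gets 0 back and knows nothing was pinned; with v at
 * INT_MAX, the increment would exceed INT_MAX when viewed as unsigned,
 * so it is undone and -EINVAL is returned.  These helpers guard the
 * parent_ref counting further down.
 */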

#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_OBJECT_MAP		(1ULL<<3)
#define RBD_FEATURE_FAST_DIFF		(1ULL<<4)
#define RBD_FEATURE_DEEP_FLATTEN	(1ULL<<5)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)

#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_OBJECT_MAP |	\
				 RBD_FEATURE_FAST_DIFF |	\
				 RBD_FEATURE_DEEP_FLATTEN |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)
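
/*
 * Bit 6 is deliberately absent from the list above: it is the
 * journaling feature, which appears to be implemented only by
 * userspace librbd.  An image with an unknown or unsupported feature
 * bit set cannot be mapped by this driver.
 */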

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These six fields never change for a given rbd image */
	char *object_prefix;
	__u8 obj_order;
	u64 stripe_unit;
	u64 stripe_count;
	s64 data_pool_id;
	u64 features;		/* Might be changeable someday? */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* format 1 only */
	u64 *snap_sizes;	/* format 1 only */
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	const char	*pool_name;
	const char	*pool_ns;	/* NULL if default, never "" */

	const char	*image_id;
	const char	*image_name;

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;
};
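
/*
 * For instance, mapping image "foo" in pool "rbd" at its head revision
 * might yield (with made-up ids) pool_id 2, pool_name "rbd", image_id
 * "10056b8b4567", image_name "foo", snap_id CEPH_NOSNAP and snap_name
 * "-" (RBD_SNAP_HEAD_NAME).
 */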

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct pending_result {
	int			result;		/* first nonzero result */
	int			num_pending;
};

struct rbd_img_request;

enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
};

enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
	OBJ_OP_ZEROOUT,
};

#define RBD_OBJ_FLAG_DELETION			(1U << 0)
#define RBD_OBJ_FLAG_COPYUP_ENABLED		(1U << 1)
#define RBD_OBJ_FLAG_COPYUP_ZEROS		(1U << 2)
#define RBD_OBJ_FLAG_MAY_EXIST			(1U << 3)
#define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT	(1U << 4)

enum rbd_obj_read_state {
	RBD_OBJ_READ_START = 1,
	RBD_OBJ_READ_OBJECT,
	RBD_OBJ_READ_PARENT,
};

/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *            . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . .
 *            .                 |                                    .
 *            .                 v                                    .
 *            .    RBD_OBJ_WRITE_READ_FROM_PARENT. . .               .
 *            .                 |                    .               .
 *            .                 v                    v (deep-copyup  .
 *  (image    .   RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC   .  not needed)  .
 * flattened) v                 |                    .               .
 *            .                 v                    .               .
 *            . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . .      (copyup  .
 *                              |                        not needed) v
 *                              v                                    .
 *                            done . . . . . . . . . . . . . . . . . .
 *                              ^
 *                              |
 *                     RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * assert_exists guard is needed or not (in some cases it's not needed
 * even if there is a parent).
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_START = 1,
	RBD_OBJ_WRITE_PRE_OBJECT_MAP,
	RBD_OBJ_WRITE_OBJECT,
	__RBD_OBJ_WRITE_COPYUP,
	RBD_OBJ_WRITE_COPYUP,
	RBD_OBJ_WRITE_POST_OBJECT_MAP,
};

enum rbd_obj_copyup_state {
	RBD_OBJ_COPYUP_START = 1,
	RBD_OBJ_COPYUP_READ_PARENT,
	__RBD_OBJ_COPYUP_OBJECT_MAPS,
	RBD_OBJ_COPYUP_OBJECT_MAPS,
	__RBD_OBJ_COPYUP_WRITE_OBJECT,
	RBD_OBJ_COPYUP_WRITE_OBJECT,
};

struct rbd_obj_request {
	struct ceph_object_extent ex;
	unsigned int		flags;	/* RBD_OBJ_FLAG_* */
	union {
		enum rbd_obj_read_state	 read_state;	/* for reads */
		enum rbd_obj_write_state write_state;	/* for writes */
	};

	struct rbd_img_request	*img_request;
	struct ceph_file_extent	*img_extents;
	u32			num_img_extents;

	union {
		struct ceph_bio_iter	bio_pos;
		struct {
			struct ceph_bvec_iter	bvec_pos;
			u32			bvec_count;
			u32			bvec_idx;
		};
	};

	enum rbd_obj_copyup_state copyup_state;
	struct bio_vec		*copyup_bvecs;
	u32			copyup_bvec_count;

	struct list_head	osd_reqs;	/* w/ r_private_item */

	struct mutex		state_mutex;
	struct pending_result	pending;
	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

enum rbd_img_state {
	RBD_IMG_START = 1,
	RBD_IMG_EXCLUSIVE_LOCK,
	__RBD_IMG_OBJECT_REQUESTS,
	RBD_IMG_OBJECT_REQUESTS,
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	enum obj_operation_type	op_type;
	enum obj_request_type	data_type;
	unsigned long		flags;
	enum rbd_img_state	state;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	struct rbd_obj_request	*obj_request;	/* obj req initiator */

	struct list_head	lock_item;
	struct list_head	object_extents;	/* obj_req.ex structs */

	struct mutex		state_mutex;
	struct pending_result	pending;
	struct work_struct	work;
	int			work_result;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)

enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};

enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};

/* WatchNotify::ClientId */
struct rbd_client_id {
	u64 gid;
	u64 handle;
};

struct rbd_mapping {
	u64			size;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
	char			*config_info;	/* add{,_single_major} string */

	struct ceph_object_id	header_oid;
	struct ceph_object_locator header_oloc;

	struct ceph_file_layout	layout;		/* used for all rbd requests */

	struct mutex		watch_mutex;
	enum rbd_watch_state	watch_state;
	struct ceph_osd_linger_request *watch_handle;
	u64			watch_cookie;
	struct delayed_work	watch_dwork;

	struct rw_semaphore	lock_rwsem;
	enum rbd_lock_state	lock_state;
	char			lock_cookie[32];
	struct rbd_client_id	owner_cid;
	struct work_struct	acquired_lock_work;
	struct work_struct	released_lock_work;
	struct delayed_work	lock_dwork;
	struct work_struct	unlock_work;
	spinlock_t		lock_lists_lock;
	struct list_head	acquiring_list;
	struct list_head	running_list;
	struct completion	acquire_wait;
	int			acquire_err;
	struct completion	releasing_wait;

	spinlock_t		object_map_lock;
	u8			*object_map;
	u64			object_map_size;	/* in objects */
	u64			object_map_flags;

	struct workqueue_struct	*task_wq;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* Block layer tags. */
	struct blk_mq_tag_set	tag_set;

	/* protects updating the header */
	struct rw_semaphore	header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* rbd_dev_device_setup() ran */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
	RBD_DEV_FLAG_READONLY,	/* -o ro or snapshot */
};

static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);		/* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);	/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

static struct ceph_snap_context rbd_empty_snapc = {
	.nref = REFCOUNT_INIT(1),
};

/*
 * single-major requires >= 0.75 version of userspace rbd utility.
 */
static bool single_major = true;
module_param(single_major, bool, 0444);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");

static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
static ssize_t remove_store(struct bus_type *bus, const char *buf,
			    size_t count);
static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
				      size_t count);
static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
					 size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);

static int rbd_dev_id_to_minor(int dev_id)
{
	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}
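
/*
 * With RBD_SINGLE_MAJOR_PART_SHIFT at 4, each device owns a block of
 * 16 minors in single-major mode: dev_id 0 starts at minor 0, dev_id 2
 * at minor 32, with minors 33-47 left for that device's partitions.
 */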

static bool rbd_is_ro(struct rbd_device *rbd_dev)
{
	return test_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);
}

static bool rbd_is_snap(struct rbd_device *rbd_dev)
{
	return rbd_dev->spec->snap_id != CEPH_NOSNAP;
}

static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	lockdep_assert_held(&rbd_dev->lock_rwsem);

	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	bool is_lock_owner;

	down_read(&rbd_dev->lock_rwsem);
	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	return is_lock_owner;
}

static ssize_t supported_features_show(struct bus_type *bus, char *buf)
{
	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}

static BUS_ATTR_WO(add);
static BUS_ATTR_WO(remove);
static BUS_ATTR_WO(add_single_major);
static BUS_ATTR_WO(remove_single_major);
static BUS_ATTR_RO(supported_features);

static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	&bus_attr_supported_features.attr,
	NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
				  struct attribute *attr, int index)
{
	if (!single_major &&
	    (attr == &bus_attr_add_single_major.attr ||
	     attr == &bus_attr_remove_single_major.attr))
		return 0;

	return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size);
static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev);

static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);

/*
 * Return true if nothing else is pending.
 */
static bool pending_result_dec(struct pending_result *pending, int *result)
{
	rbd_assert(pending->num_pending > 0);

	if (*result && !pending->result)
		pending->result = *result;
	if (--pending->num_pending)
		return false;

	*result = pending->result;
	return true;
}
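
/*
 * For example: with num_pending at 3 and per-request results arriving
 * as 0, -EIO, 0, the first two calls return false, -EIO is latched
 * (the first nonzero result wins) and the third call returns true with
 * *result set to -EIO.
 */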

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	(void) get_device(&rbd_dev->dev);

	return 0;
}

static void rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	put_device(&rbd_dev->dev);
}

static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
{
	int ro;

	if (get_user(ro, (int __user *)arg))
		return -EFAULT;

	/*
	 * Both images mapped read-only and snapshots can't be marked
	 * read-write.
	 */
	if (!ro) {
		if (rbd_is_ro(rbd_dev))
			return -EROFS;

		rbd_assert(!rbd_is_snap(rbd_dev));
	}

	/* Let blkdev_roset() handle it */
	return -ENOTTY;
}

static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	int ret;

	switch (cmd) {
	case BLKROSET:
		ret = rbd_ioctl_set_ro(rbd_dev, arg);
		break;
	default:
		ret = -ENOTTY;
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
	.ioctl			= rbd_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl		= rbd_compat_ioctl,
#endif
};

/*
 * Initialize an rbd client instance.  Success or not, this function
 * consumes ceph_opts.  Caller holds client_mutex.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	rbdc->client = ceph_create_client(ceph_opts, rbdc);
	if (IS_ERR(rbdc->client))
		goto out_rbdc;
	ceph_opts = NULL;	/* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_client;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;
out_client:
	ceph_destroy_client(rbdc->client);
out_rbdc:
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * (Per device) rbd map options
 */
enum {
	Opt_queue_depth,
	Opt_alloc_size,
	Opt_lock_timeout,
	/* int args above */
	Opt_pool_ns,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_notrim,
};

static const struct fs_parameter_spec rbd_parameters[] = {
	fsparam_u32	("alloc_size",		Opt_alloc_size),
	fsparam_flag	("exclusive",		Opt_exclusive),
	fsparam_flag	("lock_on_read",	Opt_lock_on_read),
	fsparam_u32	("lock_timeout",	Opt_lock_timeout),
	fsparam_flag	("notrim",		Opt_notrim),
	fsparam_string	("_pool_ns",		Opt_pool_ns),
	fsparam_u32	("queue_depth",		Opt_queue_depth),
	fsparam_flag	("read_only",		Opt_read_only),
	fsparam_flag	("read_write",		Opt_read_write),
	fsparam_flag	("ro",			Opt_read_only),
	fsparam_flag	("rw",			Opt_read_write),
	{}
};
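
/*
 * These come from the options string supplied when mapping an image;
 * e.g. "rbd map foo -o queue_depth=128,ro" in userspace should arrive
 * here as "queue_depth=128,ro" (illustrative values).
 */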

struct rbd_options {
	int	queue_depth;
	int	alloc_size;
	unsigned long	lock_timeout;
	bool	read_only;
	bool	lock_on_read;
	bool	exclusive;
	bool	trim;
};

#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_ALLOC_SIZE_DEFAULT	(64 * 1024)
#define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false
#define RBD_EXCLUSIVE_DEFAULT	false
#define RBD_TRIM_DEFAULT	true

struct rbd_parse_opts_ctx {
	struct rbd_spec		*spec;
	struct ceph_options	*copts;
	struct rbd_options	*opts;
};

static char *obj_op_name(enum obj_operation_type op_type)
{
	switch (op_type) {
	case OBJ_OP_READ:
		return "read";
	case OBJ_OP_WRITE:
		return "write";
	case OBJ_OP_DISCARD:
		return "discard";
	case OBJ_OP_ZEROOUT:
		return "zeroout";
	default:
		return "???";
	}
}

/*
 * Destroy ceph client
 *
 * Takes rbd_client_list_lock to remove the client from the client
 * list; the caller must not already hold it.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret;

	mutex_lock(&client_mutex);
	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		ceph_destroy_options(ceph_opts);

		/*
		 * Using an existing client.  Make sure ->pg_pools is up to
		 * date before we look up the pool id in do_rbd_add().
		 */
		ret = ceph_wait_for_latest_osdmap(rbdc->client,
					rbdc->client->options->mount_timeout);
		if (ret) {
			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
			rbd_put_client(rbdc);
			rbdc = ERR_PTR(ret);
		}
	} else {
		rbdc = rbd_client_create(ceph_opts);
	}
	mutex_unlock(&client_mutex);

	return rbdc;
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}

/*
 * returns the size of an object in the image
 */
static u32 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1U << header->obj_order;
}
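
/*
 * The usual default created by userspace tools is order 22, i.e.
 * 4 MiB objects; rbd_dev_ondisk_valid() above bounds the order to the
 * range 9 (SECTOR_SHIFT) through 31.
 */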

static void rbd_init_layout(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.stripe_unit == 0 ||
	    rbd_dev->header.stripe_count == 0) {
		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
		rbd_dev->header.stripe_count = 1;
	}

	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}

/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				 struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	int ret = -ENOMEM;
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		object_prefix = kstrndup(ondisk->object_prefix,
					 sizeof(ondisk->object_prefix),
					 GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */
		snap_sizes = kmalloc_array(snap_count,
					   sizeof(*header->snap_sizes),
					   GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;

		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, so this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		rbd_init_layout(rbd_dev);
	} else {
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}

static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	const char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return kstrdup(snap_name, GFP_KERNEL);
}

/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * Note that result is for snapshots in *descending* order.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
	u64 snap_id1 = *(u64 *)s1;
	u64 snap_id2 = *(u64 *)s2;

	if (snap_id1 < snap_id2)
		return 1;
	return snap_id1 == snap_id2 ? 0 : -1;
}
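
/*
 * E.g. comparing snapshot ids 5 and 7 returns 1 ("5 sorts after 7"),
 * which is what bsearch() needs to search an array kept in descending
 * order, highest snapshot id first.
 */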

/*
 * Search a snapshot context to see if the given snapshot id is
 * present.
 *
 * Returns the position of the snapshot id in the array if it's found,
 * or BAD_SNAP_INDEX otherwise.
 *
 * Note: The snapshot array is kept sorted (by the osd) in
 * reverse order, highest snapshot id first.
 */
static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u64 *found;

	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
				sizeof (snap_id), snapid_compare_reverse);

	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
}

static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	u32 which;
	const char *snap_name;

	which = rbd_dev_snap_index(rbd_dev, snap_id);
	if (which == BAD_SNAP_INDEX)
		return ERR_PTR(-ENOENT);

	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
}

static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (rbd_dev->image_format == 1)
		return rbd_dev_v1_snap_name(rbd_dev, snap_id);

	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
}

static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_size)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_size = rbd_dev->header.image_size;
	} else if (rbd_dev->image_format == 1) {
		u32 which;

		which = rbd_dev_snap_index(rbd_dev, snap_id);
		if (which == BAD_SNAP_INDEX)
			return -ENOENT;

		*snap_size = rbd_dev->header.snap_sizes[which];
	} else {
		u64 size = 0;
		int ret;

		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
		if (ret)
			return ret;

		*snap_size = size;
	}
	return 0;
}

static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
{
	u64 snap_id = rbd_dev->spec->snap_id;
	u64 size = 0;
	int ret;

	ret = rbd_snap_size(rbd_dev, snap_id, &size);
	if (ret)
		return ret;

	rbd_dev->mapping.size = size;
	return 0;
}

static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
{
	rbd_dev->mapping.size = 0;
}

static void zero_bvec(struct bio_vec *bv)
{
	void *buf;
	unsigned long flags;

	buf = bvec_kmap_irq(bv, &flags);
	memset(buf, 0, bv->bv_len);
	flush_dcache_page(bv->bv_page);
	bvec_kunmap_irq(buf, &flags);
}

static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
{
	struct ceph_bio_iter it = *bio_pos;

	ceph_bio_iter_advance(&it, off);
	ceph_bio_iter_advance_step(&it, bytes, ({
		zero_bvec(&bv);
	}));
}

static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
{
	struct ceph_bvec_iter it = *bvec_pos;

	ceph_bvec_iter_advance(&it, off);
	ceph_bvec_iter_advance_step(&it, bytes, ({
		zero_bvec(&bv);
	}));
}

/*
 * Zero a range in @obj_req data buffer defined by a bio (list) or
 * (private) bio_vec array.
 *
 * @off is relative to the start of the data buffer.
 */
static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
			       u32 bytes)
{
	dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);

	switch (obj_req->img_request->data_type) {
	case OBJ_REQUEST_BIO:
		zero_bios(&obj_req->bio_pos, off, bytes);
		break;
	case OBJ_REQUEST_BVECS:
	case OBJ_REQUEST_OWN_BVECS:
		zero_bvecs(&obj_req->bvec_pos, off, bytes);
		break;
	default:
		BUG();
	}
}

static void rbd_obj_request_destroy(struct kref *kref);
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		kref_read(&obj_request->kref));
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}

static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->img_request == NULL);

	/* Image request now owns object's original reference */
	obj_request->img_request = img_request;
	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
}

static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
	list_del(&obj_request->ex.oe_item);
	rbd_assert(obj_request->img_request == img_request);
	rbd_obj_request_put(obj_request);
}

static void rbd_osd_submit(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;

	dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
	     __func__, osd_req, obj_req, obj_req->ex.oe_objno,
	     obj_req->ex.oe_off, obj_req->ex.oe_len);
	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
}

/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time
 * and currently never changes thereafter.
 */
static void img_request_layered_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_LAYERED, &img_request->flags);
}

static bool img_request_layered_test(struct rbd_img_request *img_request)
{
	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
}

static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

	return !obj_req->ex.oe_off &&
	       obj_req->ex.oe_len == rbd_dev->layout.object_size;
}

static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

	return obj_req->ex.oe_off + obj_req->ex.oe_len ==
					rbd_dev->layout.object_size;
}

/*
 * Must be called after rbd_obj_calc_img_extents().
 */
static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req)
{
	if (!obj_req->num_img_extents ||
	    (rbd_obj_is_entire(obj_req) &&
	     !obj_req->img_request->snapc->num_snaps))
		return false;

	return true;
}

static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
{
	return ceph_file_extents_bytes(obj_req->img_extents,
				       obj_req->num_img_extents);
}

static bool rbd_img_is_write(struct rbd_img_request *img_req)
{
	switch (img_req->op_type) {
	case OBJ_OP_READ:
		return false;
	case OBJ_OP_WRITE:
	case OBJ_OP_DISCARD:
	case OBJ_OP_ZEROOUT:
		return true;
	default:
		BUG();
	}
}

static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;
	int result;

	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
	     osd_req->r_result, obj_req);

	/*
	 * Writes aren't allowed to return a data payload.  In some
	 * guarded write cases (e.g. stat + zero on an empty object)
	 * a stat response makes it through, but we don't care.
	 */
	if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
		result = 0;
	else
		result = osd_req->r_result;

	rbd_obj_handle_request(obj_req, result);
}

static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;

	osd_req->r_flags = CEPH_OSD_FLAG_READ;
	osd_req->r_snapid = obj_request->img_request->snap_id;
}

static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;

	osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
	ktime_get_real_ts64(&osd_req->r_mtime);
	osd_req->r_data_offset = obj_request->ex.oe_off;
}

static struct ceph_osd_request *
__rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
			  struct ceph_snap_context *snapc, int num_ops)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;
	const char *name_format = rbd_dev->image_format == 1 ?
				      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
	int ret;

	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
	if (!req)
		return ERR_PTR(-ENOMEM);

	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
	req->r_callback = rbd_osd_req_callback;
	req->r_priv = obj_req;

	/*
	 * Data objects may be stored in a separate pool, but always in
	 * the same namespace in that pool as the header in its pool.
	 */
	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
	req->r_base_oloc.pool = rbd_dev->layout.pool_id;

	ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
			       rbd_dev->header.object_prefix,
			       obj_req->ex.oe_objno);
	if (ret)
		return ERR_PTR(ret);

	return req;
}

static struct ceph_osd_request *
rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
{
	return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
					 num_ops);
}

static struct rbd_obj_request *rbd_obj_request_create(void)
{
	struct rbd_obj_request *obj_request;

	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
	if (!obj_request)
		return NULL;

	ceph_object_extent_init(&obj_request->ex);
	INIT_LIST_HEAD(&obj_request->osd_reqs);
	mutex_init(&obj_request->state_mutex);
	kref_init(&obj_request->kref);

	dout("%s %p\n", __func__, obj_request);
	return obj_request;
}

static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;
	struct ceph_osd_request *osd_req;
	u32 i;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	dout("%s: obj %p\n", __func__, obj_request);

	while (!list_empty(&obj_request->osd_reqs)) {
		osd_req = list_first_entry(&obj_request->osd_reqs,
				    struct ceph_osd_request, r_private_item);
		list_del_init(&osd_req->r_private_item);
		ceph_osdc_put_request(osd_req);
	}

	switch (obj_request->img_request->data_type) {
	case OBJ_REQUEST_NODATA:
	case OBJ_REQUEST_BIO:
	case OBJ_REQUEST_BVECS:
		break;		/* Nothing to do */
	case OBJ_REQUEST_OWN_BVECS:
		kfree(obj_request->bvec_pos.bvecs);
		break;
	default:
		BUG();
	}

	kfree(obj_request->img_extents);
	if (obj_request->copyup_bvecs) {
		for (i = 0; i < obj_request->copyup_bvec_count; i++) {
			if (obj_request->copyup_bvecs[i].bv_page)
				__free_page(obj_request->copyup_bvecs[i].bv_page);
		}
		kfree(obj_request->copyup_bvecs);
	}

	kmem_cache_free(rbd_obj_request_cache, obj_request);
}

/* It's OK to call this for a device with no parent */

static void rbd_spec_put(struct rbd_spec *spec);
static void rbd_dev_unparent(struct rbd_device *rbd_dev)
{
	rbd_dev_remove_parent(rbd_dev);
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;
}

/*
 * Parent image reference counting is used to determine when an
 * image's parent fields can be safely torn down--after there are no
 * more in-flight requests to the parent image.  When the last
 * reference is dropped, cleaning them up is safe.
 */
static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
{
	int counter;

	if (!rbd_dev->parent_spec)
		return;

	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
	if (counter > 0)
		return;

	/* Last reference; clean up parent data structures */

	if (!counter)
		rbd_dev_unparent(rbd_dev);
	else
		rbd_warn(rbd_dev, "parent reference underflow");
}

/*
 * If an image has a non-zero parent overlap, get a reference to its
 * parent.
 *
 * Returns true if the rbd device has a parent with a non-zero
 * overlap and a reference for it was successfully taken, or
 * false otherwise.
 */
static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
{
	int counter = 0;

	if (!rbd_dev->parent_spec)
		return false;

	if (rbd_dev->parent_overlap)
		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);

	if (counter < 0)
		rbd_warn(rbd_dev, "parent reference overflow");

	return counter > 0;
}
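
/*
 * These pair with the saturating helpers near the top of the file: a
 * get that would overflow yields -EINVAL (warned about as "parent
 * reference overflow" and treated as no reference taken), and a put
 * that would underflow past zero is warned about instead of corrupting
 * the count.
 */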

static void rbd_img_request_init(struct rbd_img_request *img_request,
				 struct rbd_device *rbd_dev,
				 enum obj_operation_type op_type)
{
	memset(img_request, 0, sizeof(*img_request));

	img_request->rbd_dev = rbd_dev;
	img_request->op_type = op_type;

	INIT_LIST_HEAD(&img_request->lock_item);
	INIT_LIST_HEAD(&img_request->object_extents);
	mutex_init(&img_request->state_mutex);
}

static void rbd_img_capture_header(struct rbd_img_request *img_req)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;

	lockdep_assert_held(&rbd_dev->header_rwsem);

	if (rbd_img_is_write(img_req))
		img_req->snapc = ceph_get_snap_context(rbd_dev->header.snapc);
	else
		img_req->snap_id = rbd_dev->spec->snap_id;

	if (rbd_dev_parent_get(rbd_dev))
		img_request_layered_set(img_req);
}

static void rbd_img_request_destroy(struct rbd_img_request *img_request)
{
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	dout("%s: img %p\n", __func__, img_request);

	WARN_ON(!list_empty(&img_request->lock_item));
	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);

	if (img_request_layered_test(img_request))
		rbd_dev_parent_put(img_request->rbd_dev);

	if (rbd_img_is_write(img_request))
		ceph_put_snap_context(img_request->snapc);

	if (test_bit(IMG_REQ_CHILD, &img_request->flags))
		kmem_cache_free(rbd_img_request_cache, img_request);
}

#define BITS_PER_OBJ	2
#define OBJS_PER_BYTE	(BITS_PER_BYTE / BITS_PER_OBJ)
#define OBJ_MASK	((1 << BITS_PER_OBJ) - 1)

static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno,
				   u64 *index, u8 *shift)
{
	u32 off;

	rbd_assert(objno < rbd_dev->object_map_size);
	*index = div_u64_rem(objno, OBJS_PER_BYTE, &off);
	*shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ;
}
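
/*
 * Worked example: object map states are 2 bits each, packed 4 per
 * byte, most significant bits first.  For objno 5: index = 5 / 4 = 1,
 * off = 1, shift = (4 - 1 - 1) * 2 = 4, so object 5's state lives in
 * bits 5:4 of byte 1 of the map.
 */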
e93f3152 1673
static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
{
	u64 index;
	u8 shift;

	lockdep_assert_held(&rbd_dev->object_map_lock);
	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
	return (rbd_dev->object_map[index] >> shift) & OBJ_MASK;
}

static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val)
{
	u64 index;
	u8 shift;
	u8 *p;

	lockdep_assert_held(&rbd_dev->object_map_lock);
	rbd_assert(!(val & ~OBJ_MASK));

	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
	p = &rbd_dev->object_map[index];
	*p = (*p & ~(OBJ_MASK << shift)) | (val << shift);
}

static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
{
	u8 state;

	spin_lock(&rbd_dev->object_map_lock);
	state = __rbd_object_map_get(rbd_dev, objno);
	spin_unlock(&rbd_dev->object_map_lock);
	return state;
}

static bool use_object_map(struct rbd_device *rbd_dev)
{
	/*
	 * An image mapped read-only can't use the object map -- it isn't
	 * loaded because the header lock isn't acquired.  Someone else can
	 * write to the image and update the object map behind our back.
	 *
	 * A snapshot can't be written to, so using the object map is always
	 * safe.
	 */
	if (!rbd_is_snap(rbd_dev) && rbd_is_ro(rbd_dev))
		return false;

	return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) &&
		!(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID));
}

static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno)
{
	u8 state;

	/* fall back to default logic if object map is disabled or invalid */
	if (!use_object_map(rbd_dev))
		return true;

	state = rbd_object_map_get(rbd_dev, objno);
	return state != OBJECT_NONEXISTENT;
}

static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id,
				struct ceph_object_id *oid)
{
	if (snap_id == CEPH_NOSNAP)
		ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX,
				rbd_dev->spec->image_id);
	else
		ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX,
				rbd_dev->spec->image_id, snap_id);
}

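/*
 * Grab the exclusive cls lock on the HEAD object map.  If another
 * entity holds it, break the lock and relock; breaking is attempted
 * only once (broke_lock), so a second -EBUSY is returned to the
 * caller.  -ENOENT from the info/break calls means the holder went
 * away in the meantime, in which case locking is simply retried.
 */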
static int rbd_object_map_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	CEPH_DEFINE_OID_ONSTACK(oid);
	u8 lock_type;
	char *lock_tag;
	struct ceph_locker *lockers;
	u32 num_lockers;
	bool broke_lock = false;
	int ret;

	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);

again:
	ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
			    CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0);
	if (ret != -EBUSY || broke_lock) {
		if (ret == -EEXIST)
			ret = 0; /* already locked by myself */
		if (ret)
			rbd_warn(rbd_dev, "failed to lock object map: %d", ret);
		return ret;
	}

	ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc,
				 RBD_LOCK_NAME, &lock_type, &lock_tag,
				 &lockers, &num_lockers);
	if (ret) {
		if (ret == -ENOENT)
			goto again;

		rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret);
		return ret;
	}

	kfree(lock_tag);
	if (num_lockers == 0)
		goto again;

	rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu",
		 ENTITY_NAME(lockers[0].id.name));

	ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc,
				  RBD_LOCK_NAME, lockers[0].id.cookie,
				  &lockers[0].id.name);
	ceph_free_lockers(lockers, num_lockers);
	if (ret) {
		if (ret == -ENOENT)
			goto again;

		rbd_warn(rbd_dev, "failed to break object map lock: %d", ret);
		return ret;
	}

	broke_lock = true;
	goto again;
}

static void rbd_object_map_unlock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	CEPH_DEFINE_OID_ONSTACK(oid);
	int ret;

	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);

	ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
			      "");
	if (ret && ret != -ENOENT)
		rbd_warn(rbd_dev, "failed to unlock object map: %d", ret);
}

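/*
 * The object_map_load reply begins with a BitVector header, decoded
 * below as: le32 header_len, a start_decoding envelope (v1), and
 * le64 object_map_size (the number of objects); *p is then advanced
 * past header_len to where the map data starts.
 */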
static int decode_object_map_header(void **p, void *end, u64 *object_map_size)
{
	u8 struct_v;
	u32 struct_len;
	u32 header_len;
	void *header_end;
	int ret;

	ceph_decode_32_safe(p, end, header_len, e_inval);
	header_end = *p + header_len;

	ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v,
				  &struct_len);
	if (ret)
		return ret;

	ceph_decode_64_safe(p, end, *object_map_size, e_inval);

	*p = header_end;
	return 0;

e_inval:
	return -EINVAL;
}

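/*
 * Fetch the object map with an object_map_load cls call.  num_pages
 * is one more than the map itself requires, presumably to make room
 * for the BitVector header that precedes the map in the reply.
 */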
static int __rbd_object_map_load(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	CEPH_DEFINE_OID_ONSTACK(oid);
	struct page **pages;
	void *p, *end;
	size_t reply_len;
	u64 num_objects;
	u64 object_map_bytes;
	u64 object_map_size;
	int num_pages;
	int ret;

	rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size);

	num_objects = ceph_get_num_objects(&rbd_dev->layout,
					   rbd_dev->mapping.size);
	object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ,
					    BITS_PER_BYTE);
	num_pages = calc_pages_for(0, object_map_bytes) + 1;
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	reply_len = num_pages * PAGE_SIZE;
	rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid);
	ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc,
			     "rbd", "object_map_load", CEPH_OSD_FLAG_READ,
			     NULL, 0, pages, &reply_len);
	if (ret)
		goto out;

	p = page_address(pages[0]);
	end = p + min(reply_len, (size_t)PAGE_SIZE);
	ret = decode_object_map_header(&p, end, &object_map_size);
	if (ret)
		goto out;

	if (object_map_size != num_objects) {
		rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu",
			 object_map_size, num_objects);
		ret = -EINVAL;
		goto out;
	}

	if (offset_in_page(p) + object_map_bytes > reply_len) {
		ret = -EINVAL;
		goto out;
	}

	rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL);
	if (!rbd_dev->object_map) {
		ret = -ENOMEM;
		goto out;
	}

	rbd_dev->object_map_size = object_map_size;
	ceph_copy_from_page_vector(pages, rbd_dev->object_map,
				   offset_in_page(p), object_map_bytes);

out:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}

static void rbd_object_map_free(struct rbd_device *rbd_dev)
{
	kvfree(rbd_dev->object_map);
	rbd_dev->object_map = NULL;
	rbd_dev->object_map_size = 0;
}

static int rbd_object_map_load(struct rbd_device *rbd_dev)
{
	int ret;

	ret = __rbd_object_map_load(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_v2_get_flags(rbd_dev);
	if (ret) {
		rbd_object_map_free(rbd_dev);
		return ret;
	}

	if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)
		rbd_warn(rbd_dev, "object map is invalid");

	return 0;
}

static int rbd_object_map_open(struct rbd_device *rbd_dev)
{
	int ret;

	ret = rbd_object_map_lock(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_object_map_load(rbd_dev);
	if (ret) {
		rbd_object_map_unlock(rbd_dev);
		return ret;
	}

	return 0;
}

static void rbd_object_map_close(struct rbd_device *rbd_dev)
{
	rbd_object_map_free(rbd_dev);
	rbd_object_map_unlock(rbd_dev);
}

/*
 * This function needs snap_id (or more precisely just something to
 * distinguish between HEAD and snapshot object maps), new_state and
 * current_state that were passed to rbd_object_map_update().
 *
 * To avoid allocating and stashing a context we piggyback on the OSD
 * request.  A HEAD update has two ops (assert_locked + the update
 * itself) while a snapshot update has one, so snap_id can be inferred
 * from r_num_ops.  For new_state and current_state we decode our own
 * object_map_update op, encoded in rbd_cls_object_map_update().
 */
static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req,
					struct ceph_osd_request *osd_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	struct ceph_osd_data *osd_data;
	u64 objno;
	u8 state, new_state, uninitialized_var(current_state);
	bool has_current_state;
	void *p;

	if (osd_req->r_result)
		return osd_req->r_result;

	/*
	 * Nothing to do for a snapshot object map.
	 */
	if (osd_req->r_num_ops == 1)
		return 0;

	/*
	 * Update in-memory HEAD object map.
	 */
	rbd_assert(osd_req->r_num_ops == 2);
	osd_data = osd_req_op_data(osd_req, 1, cls, request_data);
	rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES);

	p = page_address(osd_data->pages[0]);
	objno = ceph_decode_64(&p);
	rbd_assert(objno == obj_req->ex.oe_objno);
	rbd_assert(ceph_decode_64(&p) == objno + 1);
	new_state = ceph_decode_8(&p);
	has_current_state = ceph_decode_8(&p);
	if (has_current_state)
		current_state = ceph_decode_8(&p);

	spin_lock(&rbd_dev->object_map_lock);
	state = __rbd_object_map_get(rbd_dev, objno);
	if (!has_current_state || current_state == state ||
	    (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN))
		__rbd_object_map_set(rbd_dev, objno, new_state);
	spin_unlock(&rbd_dev->object_map_lock);

	return 0;
}

static void rbd_object_map_callback(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;
	int result;

	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
	     osd_req->r_result, obj_req);

	result = rbd_object_map_update_finish(obj_req, osd_req);
	rbd_obj_handle_request(obj_req, result);
}

static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state)
{
	u8 state = rbd_object_map_get(rbd_dev, objno);

	if (state == new_state ||
	    (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
	    (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING))
		return false;

	return true;
}

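/*
 * Encode the object_map_update request payload: an [objno, objno + 1)
 * object range (le64 pair), u8 new_state, and an optional u8
 * current_state prefixed by a presence byte.  The same buffer is
 * decoded back in rbd_object_map_update_finish().
 */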
static int rbd_cls_object_map_update(struct ceph_osd_request *req,
				     int which, u64 objno, u8 new_state,
				     const u8 *current_state)
{
	struct page **pages;
	void *p, *start;
	int ret;

	ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update");
	if (ret)
		return ret;

	pages = ceph_alloc_page_vector(1, GFP_NOIO);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	p = start = page_address(pages[0]);
	ceph_encode_64(&p, objno);
	ceph_encode_64(&p, objno + 1);
	ceph_encode_8(&p, new_state);
	if (current_state) {
		ceph_encode_8(&p, 1);
		ceph_encode_8(&p, *current_state);
	} else {
		ceph_encode_8(&p, 0);
	}

	osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0,
					  false, true);
	return 0;
}

/*
 * Return:
 *   0 - object map update sent
 *   1 - object map update isn't needed
 *  <0 - error
 */
static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
				 u8 new_state, const u8 *current_state)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;
	int num_ops = 1;
	int which = 0;
	int ret;

	if (snap_id == CEPH_NOSNAP) {
		if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state))
			return 1;

		num_ops++; /* assert_locked */
	}

	req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO);
	if (!req)
		return -ENOMEM;

	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
	req->r_callback = rbd_object_map_callback;
	req->r_priv = obj_req;

	rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid);
	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
	req->r_flags = CEPH_OSD_FLAG_WRITE;
	ktime_get_real_ts64(&req->r_mtime);

	if (snap_id == CEPH_NOSNAP) {
		/*
		 * Protect against possible race conditions during lock
		 * ownership transitions.
		 */
		ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME,
					     CEPH_CLS_LOCK_EXCLUSIVE, "", "");
		if (ret)
			return ret;
	}

	ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno,
					new_state, current_state);
	if (ret)
		return ret;

	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
	if (ret)
		return ret;

	ceph_osdc_start_request(osdc, req, false);
	return 0;
}

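/*
 * For example, with a parent overlap of 100 bytes, extents 80~30 and
 * 120~10 are pruned to a single extent 80~20: the extent past the
 * overlap is dropped and the straddling one is trimmed to end at the
 * overlap.
 */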
static void prune_extents(struct ceph_file_extent *img_extents,
			  u32 *num_img_extents, u64 overlap)
{
	u32 cnt = *num_img_extents;

	/* drop extents completely beyond the overlap */
	while (cnt && img_extents[cnt - 1].fe_off >= overlap)
		cnt--;

	if (cnt) {
		struct ceph_file_extent *ex = &img_extents[cnt - 1];

		/* trim final overlapping extent */
		if (ex->fe_off + ex->fe_len > overlap)
			ex->fe_len = overlap - ex->fe_off;
	}

	*num_img_extents = cnt;
}

/*
 * Determine the byte range(s) covered by either just the object extent
 * or the entire object in the parent image.
 */
static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
				    bool entire)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	int ret;

	if (!rbd_dev->parent_overlap)
		return 0;

	ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
				  entire ? 0 : obj_req->ex.oe_off,
				  entire ? rbd_dev->layout.object_size :
					   obj_req->ex.oe_len,
				  &obj_req->img_extents,
				  &obj_req->num_img_extents);
	if (ret)
		return ret;

	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
		      rbd_dev->parent_overlap);
	return 0;
}

static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;

	switch (obj_req->img_request->data_type) {
	case OBJ_REQUEST_BIO:
		osd_req_op_extent_osd_data_bio(osd_req, which,
					       &obj_req->bio_pos,
					       obj_req->ex.oe_len);
		break;
	case OBJ_REQUEST_BVECS:
	case OBJ_REQUEST_OWN_BVECS:
		rbd_assert(obj_req->bvec_pos.iter.bi_size ==
							obj_req->ex.oe_len);
		rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
		osd_req_op_extent_osd_data_bvec_pos(osd_req, which,
						    &obj_req->bvec_pos);
		break;
	default:
		BUG();
	}
}

static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which)
{
	struct page **pages;

	/*
	 * The response data for a STAT call consists of:
	 *     le64 length;
	 *     struct {
	 *         le32 tv_sec;
	 *         le32 tv_nsec;
	 *     } mtime;
	 */
	pages = ceph_alloc_page_vector(1, GFP_NOIO);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0);
	osd_req_op_raw_data_in_pages(osd_req, which, pages,
				     8 + sizeof(struct ceph_timespec),
				     0, false, true);
	return 0;
}

static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which,
				u32 bytes)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;
	int ret;

	ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup");
	if (ret)
		return ret;

	osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs,
					  obj_req->copyup_bvec_count, bytes);
	return 0;
}

static int rbd_obj_init_read(struct rbd_obj_request *obj_req)
{
	obj_req->read_state = RBD_OBJ_READ_START;
	return 0;
}

static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
				      int which)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	u16 opcode;

	if (!use_object_map(rbd_dev) ||
	    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) {
		osd_req_op_alloc_hint_init(osd_req, which++,
					   rbd_dev->layout.object_size,
					   rbd_dev->layout.object_size);
	}

	if (rbd_obj_is_entire(obj_req))
		opcode = CEPH_OSD_OP_WRITEFULL;
	else
		opcode = CEPH_OSD_OP_WRITE;

	osd_req_op_extent_init(osd_req, which, opcode,
			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
	rbd_osd_setup_data(osd_req, which);
}

static int rbd_obj_init_write(struct rbd_obj_request *obj_req)
{
	int ret;

	/* reverse map the entire object onto the parent */
	ret = rbd_obj_calc_img_extents(obj_req, true);
	if (ret)
		return ret;

	if (rbd_obj_copyup_enabled(obj_req))
		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;

	obj_req->write_state = RBD_OBJ_WRITE_START;
	return 0;
}

static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
{
	return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
					  CEPH_OSD_OP_ZERO;
}

static void __rbd_osd_setup_discard_ops(struct ceph_osd_request *osd_req,
					int which)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;

	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
		rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
		osd_req_op_init(osd_req, which, CEPH_OSD_OP_DELETE, 0);
	} else {
		osd_req_op_extent_init(osd_req, which,
				       truncate_or_zero_opcode(obj_req),
				       obj_req->ex.oe_off, obj_req->ex.oe_len,
				       0, 0);
	}
}

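/*
 * For example, assuming a 64k alloc_size: a discard of 4k~128k is
 * reduced below to 64k~64k (only the fully covered allocation unit
 * is discarded), while a discard that fits entirely inside a single
 * 64k unit is dropped altogether (rbd_obj_init_discard() returns 1).
 */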
static int rbd_obj_init_discard(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	u64 off, next_off;
	int ret;

	/*
	 * Align the range to alloc_size boundary and punt on discards
	 * that are too small to free up any space.
	 *
	 * alloc_size == object_size && is_tail() is a special case for
	 * filestore with filestore_punch_hole = false, needed to allow
	 * truncate (in addition to delete).
	 */
	if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
	    !rbd_obj_is_tail(obj_req)) {
		off = round_up(obj_req->ex.oe_off, rbd_dev->opts->alloc_size);
		next_off = round_down(obj_req->ex.oe_off + obj_req->ex.oe_len,
				      rbd_dev->opts->alloc_size);
		if (off >= next_off)
			return 1;

		dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
		     obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
		     off, next_off - off);
		obj_req->ex.oe_off = off;
		obj_req->ex.oe_len = next_off - off;
	}

	/* reverse map the entire object onto the parent */
	ret = rbd_obj_calc_img_extents(obj_req, true);
	if (ret)
		return ret;

	obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
		obj_req->flags |= RBD_OBJ_FLAG_DELETION;

	obj_req->write_state = RBD_OBJ_WRITE_START;
	return 0;
}

static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req,
					int which)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;
	u16 opcode;

	if (rbd_obj_is_entire(obj_req)) {
		if (obj_req->num_img_extents) {
			if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
				osd_req_op_init(osd_req, which++,
						CEPH_OSD_OP_CREATE, 0);
			opcode = CEPH_OSD_OP_TRUNCATE;
		} else {
			rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
			osd_req_op_init(osd_req, which++,
					CEPH_OSD_OP_DELETE, 0);
			opcode = 0;
		}
	} else {
		opcode = truncate_or_zero_opcode(obj_req);
	}

	if (opcode)
		osd_req_op_extent_init(osd_req, which, opcode,
				       obj_req->ex.oe_off, obj_req->ex.oe_len,
				       0, 0);
}

static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
{
	int ret;

	/* reverse map the entire object onto the parent */
	ret = rbd_obj_calc_img_extents(obj_req, true);
	if (ret)
		return ret;

	if (rbd_obj_copyup_enabled(obj_req))
		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
	if (!obj_req->num_img_extents) {
		obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
		if (rbd_obj_is_entire(obj_req))
			obj_req->flags |= RBD_OBJ_FLAG_DELETION;
	}

	obj_req->write_state = RBD_OBJ_WRITE_START;
	return 0;
}

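/*
 * The counts returned here must mirror the ops actually added by
 * __rbd_osd_setup_write_ops() and friends, since the OSD request is
 * sized from this before the ops are set up.
 */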
static int count_write_ops(struct rbd_obj_request *obj_req)
{
	struct rbd_img_request *img_req = obj_req->img_request;

	switch (img_req->op_type) {
	case OBJ_OP_WRITE:
		if (!use_object_map(img_req->rbd_dev) ||
		    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST))
			return 2; /* setallochint + write/writefull */

		return 1; /* write/writefull */
	case OBJ_OP_DISCARD:
		return 1; /* delete/truncate/zero */
	case OBJ_OP_ZEROOUT:
		if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
		    !(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
			return 2; /* create + truncate */

		return 1; /* delete/truncate/zero */
	default:
		BUG();
	}
}

static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
				    int which)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;

	switch (obj_req->img_request->op_type) {
	case OBJ_OP_WRITE:
		__rbd_osd_setup_write_ops(osd_req, which);
		break;
	case OBJ_OP_DISCARD:
		__rbd_osd_setup_discard_ops(osd_req, which);
		break;
	case OBJ_OP_ZEROOUT:
		__rbd_osd_setup_zeroout_ops(osd_req, which);
		break;
	default:
		BUG();
	}
}

/*
 * Prune the list of object requests (adjust offset and/or length, drop
 * redundant requests).  Prepare object request state machines and image
 * request state machine for execution.
 */
static int __rbd_img_fill_request(struct rbd_img_request *img_req)
{
	struct rbd_obj_request *obj_req, *next_obj_req;
	int ret;

	for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
		switch (img_req->op_type) {
		case OBJ_OP_READ:
			ret = rbd_obj_init_read(obj_req);
			break;
		case OBJ_OP_WRITE:
			ret = rbd_obj_init_write(obj_req);
			break;
		case OBJ_OP_DISCARD:
			ret = rbd_obj_init_discard(obj_req);
			break;
		case OBJ_OP_ZEROOUT:
			ret = rbd_obj_init_zeroout(obj_req);
			break;
		default:
			BUG();
		}
		if (ret < 0)
			return ret;
		if (ret > 0) {
			rbd_img_obj_request_del(img_req, obj_req);
			continue;
		}
	}

	img_req->state = RBD_IMG_START;
	return 0;
}

union rbd_img_fill_iter {
	struct ceph_bio_iter bio_iter;
	struct ceph_bvec_iter bvec_iter;
};

struct rbd_img_fill_ctx {
	enum obj_request_type pos_type;
	union rbd_img_fill_iter *pos;
	union rbd_img_fill_iter iter;
	ceph_object_extent_fn_t set_pos_fn;
	ceph_object_extent_fn_t count_fn;
	ceph_object_extent_fn_t copy_fn;
};

static struct ceph_object_extent *alloc_object_extent(void *arg)
{
	struct rbd_img_request *img_req = arg;
	struct rbd_obj_request *obj_req;

	obj_req = rbd_obj_request_create();
	if (!obj_req)
		return NULL;

	rbd_img_obj_request_add(img_req, obj_req);
	return &obj_req->ex;
}

/*
 * While su != os && sc == 1 is technically not fancy (it's the same
 * layout as su == os && sc == 1), we can't use the nocopy path for it
 * because ->set_pos_fn() should be called only once per object.
 * ceph_file_to_extents() invokes action_fn once per stripe unit, so
 * treat su != os && sc == 1 as fancy.
 */
static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
{
	return l->stripe_unit != l->object_size;
}

static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
				       struct ceph_file_extent *img_extents,
				       u32 num_img_extents,
				       struct rbd_img_fill_ctx *fctx)
{
	u32 i;
	int ret;

	img_req->data_type = fctx->pos_type;

	/*
	 * Create object requests and set each object request's starting
	 * position in the provided bio (list) or bio_vec array.
	 */
	fctx->iter = *fctx->pos;
	for (i = 0; i < num_img_extents; i++) {
		ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
					   img_extents[i].fe_off,
					   img_extents[i].fe_len,
					   &img_req->object_extents,
					   alloc_object_extent, img_req,
					   fctx->set_pos_fn, &fctx->iter);
		if (ret)
			return ret;
	}

	return __rbd_img_fill_request(img_req);
}

/*
 * Map a list of image extents to a list of object extents, create the
 * corresponding object requests (normally each to a different object,
 * but not always) and add them to @img_req.  For each object request,
 * set up its data descriptor to point to the corresponding chunk(s) of
 * @fctx->pos data buffer.
 *
 * Because ceph_file_to_extents() will merge adjacent object extents
 * together, each object request's data descriptor may point to multiple
 * different chunks of @fctx->pos data buffer.
 *
 * @fctx->pos data buffer is assumed to be large enough.
 */
static int rbd_img_fill_request(struct rbd_img_request *img_req,
				struct ceph_file_extent *img_extents,
				u32 num_img_extents,
				struct rbd_img_fill_ctx *fctx)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	struct rbd_obj_request *obj_req;
	u32 i;
	int ret;

	if (fctx->pos_type == OBJ_REQUEST_NODATA ||
	    !rbd_layout_is_fancy(&rbd_dev->layout))
		return rbd_img_fill_request_nocopy(img_req, img_extents,
						   num_img_extents, fctx);

	img_req->data_type = OBJ_REQUEST_OWN_BVECS;

	/*
	 * Create object requests and determine ->bvec_count for each object
	 * request.  Note that ->bvec_count sum over all object requests may
	 * be greater than the number of bio_vecs in the provided bio (list)
	 * or bio_vec array because when mapped, those bio_vecs can straddle
	 * stripe unit boundaries.
	 */
	fctx->iter = *fctx->pos;
	for (i = 0; i < num_img_extents; i++) {
		ret = ceph_file_to_extents(&rbd_dev->layout,
					   img_extents[i].fe_off,
					   img_extents[i].fe_len,
					   &img_req->object_extents,
					   alloc_object_extent, img_req,
					   fctx->count_fn, &fctx->iter);
		if (ret)
			return ret;
	}

	for_each_obj_request(img_req, obj_req) {
		obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
					      sizeof(*obj_req->bvec_pos.bvecs),
					      GFP_NOIO);
		if (!obj_req->bvec_pos.bvecs)
			return -ENOMEM;
	}

	/*
	 * Fill in each object request's private bio_vec array, splitting and
	 * rearranging the provided bio_vecs in stripe unit chunks as needed.
	 */
	fctx->iter = *fctx->pos;
	for (i = 0; i < num_img_extents; i++) {
		ret = ceph_iterate_extents(&rbd_dev->layout,
					   img_extents[i].fe_off,
					   img_extents[i].fe_len,
					   &img_req->object_extents,
					   fctx->copy_fn, &fctx->iter);
		if (ret)
			return ret;
	}

	return __rbd_img_fill_request(img_req);
}

static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
			       u64 off, u64 len)
{
	struct ceph_file_extent ex = { off, len };
	union rbd_img_fill_iter dummy = {};
	struct rbd_img_fill_ctx fctx = {
		.pos_type = OBJ_REQUEST_NODATA,
		.pos = &dummy,
	};

	return rbd_img_fill_request(img_req, &ex, 1, &fctx);
}

static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bio_iter *it = arg;

	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
	obj_req->bio_pos = *it;
	ceph_bio_iter_advance(it, bytes);
}

static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bio_iter *it = arg;

	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
	ceph_bio_iter_advance_step(it, bytes, ({
		obj_req->bvec_count++;
	}));
}

static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bio_iter *it = arg;

	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
	ceph_bio_iter_advance_step(it, bytes, ({
		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
	}));
}

static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
				   struct ceph_file_extent *img_extents,
				   u32 num_img_extents,
				   struct ceph_bio_iter *bio_pos)
{
	struct rbd_img_fill_ctx fctx = {
		.pos_type = OBJ_REQUEST_BIO,
		.pos = (union rbd_img_fill_iter *)bio_pos,
		.set_pos_fn = set_bio_pos,
		.count_fn = count_bio_bvecs,
		.copy_fn = copy_bio_bvecs,
	};

	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
				    &fctx);
}

static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
				 u64 off, u64 len, struct bio *bio)
{
	struct ceph_file_extent ex = { off, len };
	struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };

	return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
}

static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bvec_iter *it = arg;

	obj_req->bvec_pos = *it;
	ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
	ceph_bvec_iter_advance(it, bytes);
}

static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bvec_iter *it = arg;

	ceph_bvec_iter_advance_step(it, bytes, ({
		obj_req->bvec_count++;
	}));
}

static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bvec_iter *it = arg;

	ceph_bvec_iter_advance_step(it, bytes, ({
		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
	}));
}

static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
				     struct ceph_file_extent *img_extents,
				     u32 num_img_extents,
				     struct ceph_bvec_iter *bvec_pos)
{
	struct rbd_img_fill_ctx fctx = {
		.pos_type = OBJ_REQUEST_BVECS,
		.pos = (union rbd_img_fill_iter *)bvec_pos,
		.set_pos_fn = set_bvec_pos,
		.count_fn = count_bvecs,
		.copy_fn = copy_bvecs,
	};

	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
				    &fctx);
}

static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
				   struct ceph_file_extent *img_extents,
				   u32 num_img_extents,
				   struct bio_vec *bvecs)
{
	struct ceph_bvec_iter it = {
		.bvecs = bvecs,
		.iter = { .bi_size = ceph_file_extents_bytes(img_extents,
							     num_img_extents) },
	};

	return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
					 &it);
}

static void rbd_img_handle_request_work(struct work_struct *work)
{
	struct rbd_img_request *img_req =
	    container_of(work, struct rbd_img_request, work);

	rbd_img_handle_request(img_req, img_req->work_result);
}

static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
{
	INIT_WORK(&img_req->work, rbd_img_handle_request_work);
	img_req->work_result = result;
	queue_work(rbd_wq, &img_req->work);
}

static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) {
		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
		return true;
	}

	dout("%s %p objno %llu assuming dne\n", __func__, obj_req,
	     obj_req->ex.oe_objno);
	return false;
}

static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
{
	struct ceph_osd_request *osd_req;
	int ret;

	osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1);
	if (IS_ERR(osd_req))
		return PTR_ERR(osd_req);

	osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ,
			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
	rbd_osd_setup_data(osd_req, 0);
	rbd_osd_format_read(osd_req);

	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
	if (ret)
		return ret;

	rbd_osd_submit(osd_req);
	return 0;
}

static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
{
	struct rbd_img_request *img_req = obj_req->img_request;
	struct rbd_device *parent = img_req->rbd_dev->parent;
	struct rbd_img_request *child_img_req;
	int ret;

	child_img_req = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
	if (!child_img_req)
		return -ENOMEM;

	rbd_img_request_init(child_img_req, parent, OBJ_OP_READ);
	__set_bit(IMG_REQ_CHILD, &child_img_req->flags);
	child_img_req->obj_request = obj_req;

	down_read(&parent->header_rwsem);
	rbd_img_capture_header(child_img_req);
	up_read(&parent->header_rwsem);

	dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
	     obj_req);

	if (!rbd_img_is_write(img_req)) {
		switch (img_req->data_type) {
		case OBJ_REQUEST_BIO:
			ret = __rbd_img_fill_from_bio(child_img_req,
						      obj_req->img_extents,
						      obj_req->num_img_extents,
						      &obj_req->bio_pos);
			break;
		case OBJ_REQUEST_BVECS:
		case OBJ_REQUEST_OWN_BVECS:
			ret = __rbd_img_fill_from_bvecs(child_img_req,
						      obj_req->img_extents,
						      obj_req->num_img_extents,
						      &obj_req->bvec_pos);
			break;
		default:
			BUG();
		}
	} else {
		ret = rbd_img_fill_from_bvecs(child_img_req,
					      obj_req->img_extents,
					      obj_req->num_img_extents,
					      obj_req->copyup_bvecs);
	}
	if (ret) {
		rbd_img_request_destroy(child_img_req);
		return ret;
	}

	/* avoid parent chain recursion */
	rbd_img_schedule(child_img_req, 0);
	return 0;
}

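/*
 * Advance the read state machine: START -> OBJECT [-> PARENT].
 * Returns true when the object request is completed (*result holds
 * the outcome) and false when it must wait for an in-flight OSD
 * request or child image request.
 */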
static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	int ret;

again:
	switch (obj_req->read_state) {
	case RBD_OBJ_READ_START:
		rbd_assert(!*result);

		if (!rbd_obj_may_exist(obj_req)) {
			*result = -ENOENT;
			obj_req->read_state = RBD_OBJ_READ_OBJECT;
			goto again;
		}

		ret = rbd_obj_read_object(obj_req);
		if (ret) {
			*result = ret;
			return true;
		}
		obj_req->read_state = RBD_OBJ_READ_OBJECT;
		return false;
	case RBD_OBJ_READ_OBJECT:
		if (*result == -ENOENT && rbd_dev->parent_overlap) {
			/* reverse map this object extent onto the parent */
			ret = rbd_obj_calc_img_extents(obj_req, false);
			if (ret) {
				*result = ret;
				return true;
			}
			if (obj_req->num_img_extents) {
				ret = rbd_obj_read_from_parent(obj_req);
				if (ret) {
					*result = ret;
					return true;
				}
				obj_req->read_state = RBD_OBJ_READ_PARENT;
				return false;
			}
		}

		/*
		 * -ENOENT means a hole in the image -- zero-fill the entire
		 * length of the request.  A short read also implies zero-fill
		 * to the end of the request.
		 */
		if (*result == -ENOENT) {
			rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len);
			*result = 0;
		} else if (*result >= 0) {
			if (*result < obj_req->ex.oe_len)
				rbd_obj_zero_range(obj_req, *result,
					    obj_req->ex.oe_len - *result);
			else
				rbd_assert(*result == obj_req->ex.oe_len);
			*result = 0;
		}
		return true;
	case RBD_OBJ_READ_PARENT:
		/*
		 * The parent image is read only up to the overlap -- zero-fill
		 * from the overlap to the end of the request.
		 */
		if (!*result) {
			u32 obj_overlap = rbd_obj_img_extents_bytes(obj_req);

			if (obj_overlap < obj_req->ex.oe_len)
				rbd_obj_zero_range(obj_req, obj_overlap,
					    obj_req->ex.oe_len - obj_overlap);
		}
		return true;
	default:
		BUG();
	}
}

static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno))
		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;

	if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) &&
	    (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) {
		dout("%s %p noop for nonexistent\n", __func__, obj_req);
		return true;
	}

	return false;
}

/*
 * Return:
 *   0 - object map update sent
 *   1 - object map update isn't needed
 *  <0 - error
 */
static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	u8 new_state;

	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
		return 1;

	if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
		new_state = OBJECT_PENDING;
	else
		new_state = OBJECT_EXISTS;

	return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL);
}

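/*
 * When copyup is enabled, a stat op is prepended so that the compound
 * request fails with -ENOENT if the target object doesn't exist;
 * rbd_obj_advance_write() then takes the copyup path instead.
 */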
static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
{
	struct ceph_osd_request *osd_req;
	int num_ops = count_write_ops(obj_req);
	int which = 0;
	int ret;

	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
		num_ops++; /* stat */

	osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
	if (IS_ERR(osd_req))
		return PTR_ERR(osd_req);

	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
		ret = rbd_osd_setup_stat(osd_req, which++);
		if (ret)
			return ret;
	}

	rbd_osd_setup_write_ops(osd_req, which);
	rbd_osd_format_write(osd_req);

	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
	if (ret)
		return ret;

	rbd_osd_submit(osd_req);
	return 0;
}

c5b5ef6c 3018
3da691bf
ID
3019/*
3020 * copyup_bvecs pages are never highmem pages
3021 */
3022static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
3023{
3024 struct ceph_bvec_iter it = {
3025 .bvecs = bvecs,
3026 .iter = { .bi_size = bytes },
3027 };
c5b5ef6c 3028
3da691bf
ID
3029 ceph_bvec_iter_advance_step(&it, bytes, ({
3030 if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
3031 bv.bv_len))
3032 return false;
3033 }));
3034 return true;
c5b5ef6c
AE
3035}
3036
3a482501
ID
3037#define MODS_ONLY U32_MAX
3038
793333a3
ID
3039static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req,
3040 u32 bytes)
b454e36d 3041{
bcbab1db 3042 struct ceph_osd_request *osd_req;
fe943d50 3043 int ret;
70d045f6 3044
3da691bf 3045 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
89a59c1c 3046 rbd_assert(bytes > 0 && bytes != MODS_ONLY);
70d045f6 3047
bcbab1db
ID
3048 osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1);
3049 if (IS_ERR(osd_req))
3050 return PTR_ERR(osd_req);
b454e36d 3051
b5ae8cbc 3052 ret = rbd_osd_setup_copyup(osd_req, 0, bytes);
fe943d50
CX
3053 if (ret)
3054 return ret;
3055
bcbab1db 3056 rbd_osd_format_write(osd_req);
3da691bf 3057
bcbab1db 3058 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
89a59c1c
ID
3059 if (ret)
3060 return ret;
3061
a086a1b8 3062 rbd_osd_submit(osd_req);
89a59c1c
ID
3063 return 0;
3064}
3065
static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req,
					u32 bytes)
{
	struct ceph_osd_request *osd_req;
	int num_ops = count_write_ops(obj_req);
	int which = 0;
	int ret;

	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);

	if (bytes != MODS_ONLY)
		num_ops++; /* copyup */

	osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
	if (IS_ERR(osd_req))
		return PTR_ERR(osd_req);

	if (bytes != MODS_ONLY) {
		ret = rbd_osd_setup_copyup(osd_req, which++, bytes);
		if (ret)
			return ret;
	}

	rbd_osd_setup_write_ops(osd_req, which);
	rbd_osd_format_write(osd_req);

	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
	if (ret)
		return ret;

	rbd_osd_submit(osd_req);
	return 0;
}

static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
{
	u32 i;

	rbd_assert(!obj_req->copyup_bvecs);
	obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
	obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
					sizeof(*obj_req->copyup_bvecs),
					GFP_NOIO);
	if (!obj_req->copyup_bvecs)
		return -ENOMEM;

	for (i = 0; i < obj_req->copyup_bvec_count; i++) {
		unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);

		obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
		if (!obj_req->copyup_bvecs[i].bv_page)
			return -ENOMEM;

		obj_req->copyup_bvecs[i].bv_offset = 0;
		obj_req->copyup_bvecs[i].bv_len = len;
		obj_overlap -= len;
	}

	rbd_assert(!obj_overlap);
	return 0;
}

/*
 * The target object doesn't exist.  Read the data for the entire
 * target object up to the overlap point (if any) from the parent,
 * so we can use it for a copyup.
 */
static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	int ret;

	rbd_assert(obj_req->num_img_extents);
	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
		      rbd_dev->parent_overlap);
	if (!obj_req->num_img_extents) {
		/*
		 * The overlap has become 0 (most likely because the
		 * image has been flattened).  Re-submit the original write
		 * request -- pass MODS_ONLY since the copyup isn't needed
		 * anymore.
		 */
		return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY);
	}

	ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
	if (ret)
		return ret;

	return rbd_obj_read_from_parent(obj_req);
}

static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	struct ceph_snap_context *snapc = obj_req->img_request->snapc;
	u8 new_state;
	u32 i;
	int ret;

	rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);

	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
		return;

	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
		return;

	for (i = 0; i < snapc->num_snaps; i++) {
		if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) &&
		    i + 1 < snapc->num_snaps)
			new_state = OBJECT_EXISTS_CLEAN;
		else
			new_state = OBJECT_EXISTS;

		ret = rbd_object_map_update(obj_req, snapc->snaps[i],
					    new_state, NULL);
		if (ret < 0) {
			obj_req->pending.result = ret;
			return;
		}

		rbd_assert(!ret);
		obj_req->pending.num_pending++;
	}
}

static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
{
	u32 bytes = rbd_obj_img_extents_bytes(obj_req);
	int ret;

	rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);

	/*
	 * Only send non-zero copyup data to save some I/O and network
	 * bandwidth -- zero copyup data is equivalent to the object not
	 * existing.
	 */
	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
		bytes = 0;

	if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
		/*
		 * Send a copyup request with an empty snapshot context to
		 * deep-copyup the object through all existing snapshots.
		 * A second request with the current snapshot context will be
		 * sent for the actual modification.
		 */
		ret = rbd_obj_copyup_empty_snapc(obj_req, bytes);
		if (ret) {
			obj_req->pending.result = ret;
			return;
		}

		obj_req->pending.num_pending++;
		bytes = MODS_ONLY;
	}

	ret = rbd_obj_copyup_current_snapc(obj_req, bytes);
	if (ret) {
		obj_req->pending.result = ret;
		return;
	}

	obj_req->pending.num_pending++;
}

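/*
 * Copyup state machine:
 *
 *   START -> READ_PARENT -> OBJECT_MAPS -> WRITE_OBJECT
 *
 * The __ variants of the OBJECT_MAPS and WRITE_OBJECT states wait for
 * the possibly multiple in-flight OSD requests to complete before
 * falling through to the corresponding bare state.
 */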
static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	int ret;

again:
	switch (obj_req->copyup_state) {
	case RBD_OBJ_COPYUP_START:
		rbd_assert(!*result);

		ret = rbd_obj_copyup_read_parent(obj_req);
		if (ret) {
			*result = ret;
			return true;
		}
		if (obj_req->num_img_extents)
			obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT;
		else
			obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
		return false;
	case RBD_OBJ_COPYUP_READ_PARENT:
		if (*result)
			return true;

		if (is_zero_bvecs(obj_req->copyup_bvecs,
				  rbd_obj_img_extents_bytes(obj_req))) {
			dout("%s %p detected zeros\n", __func__, obj_req);
			obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS;
		}

		rbd_obj_copyup_object_maps(obj_req);
		if (!obj_req->pending.num_pending) {
			*result = obj_req->pending.result;
			obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS;
			goto again;
		}
		obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS;
		return false;
	case __RBD_OBJ_COPYUP_OBJECT_MAPS:
		if (!pending_result_dec(&obj_req->pending, result))
			return false;
		/* fall through */
	case RBD_OBJ_COPYUP_OBJECT_MAPS:
		if (*result) {
			rbd_warn(rbd_dev, "snap object map update failed: %d",
				 *result);
			return true;
		}

		rbd_obj_copyup_write_object(obj_req);
		if (!obj_req->pending.num_pending) {
			*result = obj_req->pending.result;
			obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
			goto again;
		}
		obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT;
		return false;
	case __RBD_OBJ_COPYUP_WRITE_OBJECT:
		if (!pending_result_dec(&obj_req->pending, result))
			return false;
		/* fall through */
	case RBD_OBJ_COPYUP_WRITE_OBJECT:
		return true;
	default:
		BUG();
	}
}

/*
 * Return:
 *   0 - object map update sent
 *   1 - object map update isn't needed
 *  <0 - error
 */
static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	u8 current_state = OBJECT_PENDING;

	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
		return 1;

	if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION))
		return 1;

	return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT,
				     &current_state);
}

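/*
 * Write state machine: START -> PRE_OBJECT_MAP -> OBJECT [-> COPYUP]
 * -> POST_OBJECT_MAP.  The object map states are skipped (via the
 * ret > 0 convention above) when no update needs to be sent.
 */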
static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	int ret;

again:
	switch (obj_req->write_state) {
	case RBD_OBJ_WRITE_START:
		rbd_assert(!*result);

		if (rbd_obj_write_is_noop(obj_req))
			return true;

		ret = rbd_obj_write_pre_object_map(obj_req);
		if (ret < 0) {
			*result = ret;
			return true;
		}
		obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP;
		if (ret > 0)
			goto again;
		return false;
	case RBD_OBJ_WRITE_PRE_OBJECT_MAP:
		if (*result) {
			rbd_warn(rbd_dev, "pre object map update failed: %d",
				 *result);
			return true;
		}
		ret = rbd_obj_write_object(obj_req);
		if (ret) {
			*result = ret;
			return true;
		}
		obj_req->write_state = RBD_OBJ_WRITE_OBJECT;
		return false;
	case RBD_OBJ_WRITE_OBJECT:
		if (*result == -ENOENT) {
			if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
				*result = 0;
				obj_req->copyup_state = RBD_OBJ_COPYUP_START;
				obj_req->write_state = __RBD_OBJ_WRITE_COPYUP;
				goto again;
			}
			/*
			 * On a non-existent object:
			 *   delete - -ENOENT, truncate/zero - 0
			 */
			if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
				*result = 0;
		}
		if (*result)
			return true;

		obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
		goto again;
	case __RBD_OBJ_WRITE_COPYUP:
		if (!rbd_obj_advance_copyup(obj_req, result))
			return false;
		/* fall through */
	case RBD_OBJ_WRITE_COPYUP:
		if (*result) {
			rbd_warn(rbd_dev, "copyup failed: %d", *result);
			return true;
		}
		ret = rbd_obj_write_post_object_map(obj_req);
		if (ret < 0) {
			*result = ret;
			return true;
		}
		obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP;
		if (ret > 0)
			goto again;
		return false;
	case RBD_OBJ_WRITE_POST_OBJECT_MAP:
		if (*result)
			rbd_warn(rbd_dev, "post object map update failed: %d",
				 *result);
		return true;
	default:
		BUG();
	}
}

/*
 * Return true if @obj_req is completed.
 */
static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req,
				     int *result)
{
	struct rbd_img_request *img_req = obj_req->img_request;
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	bool done;

	mutex_lock(&obj_req->state_mutex);
	if (!rbd_img_is_write(img_req))
		done = rbd_obj_advance_read(obj_req, result);
	else
		done = rbd_obj_advance_write(obj_req, result);
	mutex_unlock(&obj_req->state_mutex);

	if (done && *result) {
		rbd_assert(*result < 0);
		rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d",
			 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
			 obj_req->ex.oe_off, obj_req->ex.oe_len, *result);
	}
	return done;
}

/*
 * This is open-coded in rbd_img_handle_request() to avoid parent chain
 * recursion.
 */
static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result)
{
	if (__rbd_obj_handle_request(obj_req, &result))
		rbd_img_handle_request(obj_req->img_request, result);
}

static bool need_exclusive_lock(struct rbd_img_request *img_req)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;

	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK))
		return false;

	if (rbd_is_ro(rbd_dev))
		return false;

	rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
	if (rbd_dev->opts->lock_on_read ||
	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
		return true;

	return rbd_img_is_write(img_req);
}

static bool rbd_lock_add_request(struct rbd_img_request *img_req)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	bool locked;

	lockdep_assert_held(&rbd_dev->lock_rwsem);
	locked = rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED;
	spin_lock(&rbd_dev->lock_lists_lock);
	rbd_assert(list_empty(&img_req->lock_item));
	if (!locked)
		list_add_tail(&img_req->lock_item, &rbd_dev->acquiring_list);
	else
		list_add_tail(&img_req->lock_item, &rbd_dev->running_list);
	spin_unlock(&rbd_dev->lock_lists_lock);
	return locked;
}

static void rbd_lock_del_request(struct rbd_img_request *img_req)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	bool need_wakeup;

	lockdep_assert_held(&rbd_dev->lock_rwsem);
	spin_lock(&rbd_dev->lock_lists_lock);
	rbd_assert(!list_empty(&img_req->lock_item));
	list_del_init(&img_req->lock_item);
	need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING &&
		       list_empty(&rbd_dev->running_list));
	spin_unlock(&rbd_dev->lock_lists_lock);
	if (need_wakeup)
		complete(&rbd_dev->releasing_wait);
}

637cd060
ID
3493static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
3494{
3495 struct rbd_device *rbd_dev = img_req->rbd_dev;
3496
3497 if (!need_exclusive_lock(img_req))
3498 return 1;
3499
3500 if (rbd_lock_add_request(img_req))
3501 return 1;
3502
3503 if (rbd_dev->opts->exclusive) {
3504 WARN_ON(1); /* lock got released? */
3505 return -EROFS;
3506 }
3507
3508 /*
3509 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3510 * and cancel_delayed_work() in wake_lock_waiters().
3511 */
3512 dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3513 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3514 return 0;
3515}
3516
0192ce2e 3517static void rbd_img_object_requests(struct rbd_img_request *img_req)
7114edac 3518{
0192ce2e 3519 struct rbd_obj_request *obj_req;
7114edac 3520
0192ce2e
ID
3521 rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);
3522
3523 for_each_obj_request(img_req, obj_req) {
3524 int result = 0;
a9e8ba2c 3525
0192ce2e
ID
3526 if (__rbd_obj_handle_request(obj_req, &result)) {
3527 if (result) {
3528 img_req->pending.result = result;
3529 return;
3530 }
3531 } else {
3532 img_req->pending.num_pending++;
3533 }
3534 }
8b3e1a56
AE
3535}
3536
0192ce2e 3537static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
8b3e1a56 3538{
637cd060 3539 struct rbd_device *rbd_dev = img_req->rbd_dev;
3da691bf 3540 int ret;
8b3e1a56 3541
0192ce2e
ID
3542again:
3543 switch (img_req->state) {
3544 case RBD_IMG_START:
3545 rbd_assert(!*result);
8b3e1a56 3546
637cd060
ID
3547 ret = rbd_img_exclusive_lock(img_req);
3548 if (ret < 0) {
3549 *result = ret;
3da691bf
ID
3550 return true;
3551 }
637cd060
ID
3552 img_req->state = RBD_IMG_EXCLUSIVE_LOCK;
3553 if (ret > 0)
3554 goto again;
3da691bf 3555 return false;
637cd060
ID
3556 case RBD_IMG_EXCLUSIVE_LOCK:
3557 if (*result)
89a59c1c
ID
3558 return true;
3559
637cd060
ID
3560 rbd_assert(!need_exclusive_lock(img_req) ||
3561 __rbd_is_lock_owner(rbd_dev));
3562
0192ce2e
ID
3563 rbd_img_object_requests(img_req);
3564 if (!img_req->pending.num_pending) {
3565 *result = img_req->pending.result;
3566 img_req->state = RBD_IMG_OBJECT_REQUESTS;
3567 goto again;
3da691bf 3568 }
0192ce2e 3569 img_req->state = __RBD_IMG_OBJECT_REQUESTS;
3da691bf 3570 return false;
0192ce2e
ID
3571 case __RBD_IMG_OBJECT_REQUESTS:
3572 if (!pending_result_dec(&img_req->pending, result))
3573 return false;
3574 /* fall through */
3575 case RBD_IMG_OBJECT_REQUESTS:
3576 return true;
3da691bf 3577 default:
c6244b3b 3578 BUG();
3da691bf
ID
3579 }
3580}
02c74fba 3581
3da691bf 3582/*
0192ce2e 3583 * Return true if @img_req is completed.
3da691bf 3584 */
0192ce2e
ID
3585static bool __rbd_img_handle_request(struct rbd_img_request *img_req,
3586 int *result)
7114edac 3587{
0192ce2e
ID
3588 struct rbd_device *rbd_dev = img_req->rbd_dev;
3589 bool done;
7114edac 3590
e1fddc8f
ID
3591 if (need_exclusive_lock(img_req)) {
3592 down_read(&rbd_dev->lock_rwsem);
3593 mutex_lock(&img_req->state_mutex);
3594 done = rbd_img_advance(img_req, result);
3595 if (done)
3596 rbd_lock_del_request(img_req);
3597 mutex_unlock(&img_req->state_mutex);
3598 up_read(&rbd_dev->lock_rwsem);
3599 } else {
3600 mutex_lock(&img_req->state_mutex);
3601 done = rbd_img_advance(img_req, result);
3602 mutex_unlock(&img_req->state_mutex);
02c74fba 3603 }
a9e8ba2c 3604
0192ce2e
ID
3605 if (done && *result) {
3606 rbd_assert(*result < 0);
3607 rbd_warn(rbd_dev, "%s%s result %d",
3608 test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "",
3609 obj_op_name(img_req->op_type), *result);
7114edac 3610 }
0192ce2e 3611 return done;
7114edac 3612}
a9e8ba2c 3613
0192ce2e 3614static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
3da691bf 3615{
7114edac 3616again:
0192ce2e 3617 if (!__rbd_img_handle_request(img_req, &result))
7114edac 3618 return;
8b3e1a56 3619
7114edac 3620 if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
0192ce2e
ID
3621 struct rbd_obj_request *obj_req = img_req->obj_request;
3622
679a97d2 3623 rbd_img_request_destroy(img_req);
0192ce2e
ID
3624 if (__rbd_obj_handle_request(obj_req, &result)) {
3625 img_req = obj_req->img_request;
3626 goto again;
3627 }
3628 } else {
59e542c8 3629 struct request *rq = blk_mq_rq_from_pdu(img_req);
0192ce2e 3630
679a97d2 3631 rbd_img_request_destroy(img_req);
0192ce2e 3632 blk_mq_end_request(rq, errno_to_blk_status(result));
7114edac 3633 }
8b3e1a56 3634}
bf0d5f50 3635
ed95b21a 3636static const struct rbd_client_id rbd_empty_cid;
b8d70035 3637
ed95b21a
ID
3638static bool rbd_cid_equal(const struct rbd_client_id *lhs,
3639 const struct rbd_client_id *rhs)
3640{
3641 return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
3642}
3643
3644static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
3645{
3646 struct rbd_client_id cid;
3647
3648 mutex_lock(&rbd_dev->watch_mutex);
3649 cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
3650 cid.handle = rbd_dev->watch_cookie;
3651 mutex_unlock(&rbd_dev->watch_mutex);
3652 return cid;
3653}
3654
3655/*
3656 * lock_rwsem must be held for write
3657 */
3658static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
3659 const struct rbd_client_id *cid)
3660{
3661 dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
3662 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
3663 cid->gid, cid->handle);
3664 rbd_dev->owner_cid = *cid; /* struct */
3665}
3666
3667static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
3668{
3669 mutex_lock(&rbd_dev->watch_mutex);
3670 sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
3671 mutex_unlock(&rbd_dev->watch_mutex);
3672}
3673
edd8ca80
FM
3674static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
3675{
3676 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3677
a2b1da09 3678 rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
edd8ca80
FM
3679 strcpy(rbd_dev->lock_cookie, cookie);
3680 rbd_set_owner_cid(rbd_dev, &cid);
3681 queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
3682}
3683
ed95b21a
ID
3684/*
3685 * lock_rwsem must be held for write
3686 */
3687static int rbd_lock(struct rbd_device *rbd_dev)
b8d70035 3688{
922dab61 3689 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
ed95b21a 3690 char cookie[32];
e627db08 3691 int ret;
b8d70035 3692
cbbfb0ff
ID
3693 WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
3694 rbd_dev->lock_cookie[0] != '\0');
52bb1f9b 3695
ed95b21a
ID
3696 format_lock_cookie(rbd_dev, cookie);
3697 ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3698 RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
3699 RBD_LOCK_TAG, "", 0);
e627db08 3700 if (ret)
ed95b21a 3701 return ret;
b8d70035 3702
edd8ca80 3703 __rbd_lock(rbd_dev, cookie);
ed95b21a 3704 return 0;
b8d70035
AE
3705}
3706
ed95b21a
ID
3707/*
3708 * lock_rwsem must be held for write
3709 */
bbead745 3710static void rbd_unlock(struct rbd_device *rbd_dev)
bb040aa0 3711{
922dab61 3712 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
bb040aa0
ID
3713 int ret;
3714
cbbfb0ff
ID
3715 WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
3716 rbd_dev->lock_cookie[0] == '\0');
bb040aa0 3717
ed95b21a 3718 ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
cbbfb0ff 3719 RBD_LOCK_NAME, rbd_dev->lock_cookie);
bbead745 3720 if (ret && ret != -ENOENT)
637cd060 3721 rbd_warn(rbd_dev, "failed to unlock header: %d", ret);
bb040aa0 3722
bbead745
ID
3723 /* treat errors as the image is unlocked */
3724 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
cbbfb0ff 3725 rbd_dev->lock_cookie[0] = '\0';
ed95b21a
ID
3726 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3727 queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
bb040aa0
ID
3728}
3729
ed95b21a
ID
3730static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
3731 enum rbd_notify_op notify_op,
3732 struct page ***preply_pages,
3733 size_t *preply_len)
9969ebc5
AE
3734{
3735 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
ed95b21a 3736 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
08a79102
KS
3737 char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
3738 int buf_size = sizeof(buf);
ed95b21a 3739 void *p = buf;
9969ebc5 3740
ed95b21a 3741 dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
9969ebc5 3742
ed95b21a
ID
3743 /* encode *LockPayload NotifyMessage (op + ClientId) */
3744 ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
3745 ceph_encode_32(&p, notify_op);
3746 ceph_encode_64(&p, cid.gid);
3747 ceph_encode_64(&p, cid.handle);
8eb87565 3748
ed95b21a
ID
3749 return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
3750 &rbd_dev->header_oloc, buf, buf_size,
3751 RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
b30a01f2
ID
3752}
3753
ed95b21a
ID
3754static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
3755 enum rbd_notify_op notify_op)
b30a01f2 3756{
8ae0299a 3757 __rbd_notify_op_lock(rbd_dev, notify_op, NULL, NULL);
ed95b21a 3758}
b30a01f2 3759
ed95b21a
ID
3760static void rbd_notify_acquired_lock(struct work_struct *work)
3761{
3762 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3763 acquired_lock_work);
76756a51 3764
ed95b21a 3765 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
c525f036
ID
3766}
3767
ed95b21a 3768static void rbd_notify_released_lock(struct work_struct *work)
c525f036 3769{
ed95b21a
ID
3770 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3771 released_lock_work);
811c6688 3772
ed95b21a 3773 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
fca27065
ID
3774}
3775
ed95b21a 3776static int rbd_request_lock(struct rbd_device *rbd_dev)
36be9a76 3777{
ed95b21a
ID
3778 struct page **reply_pages;
3779 size_t reply_len;
3780 bool lock_owner_responded = false;
36be9a76
AE
3781 int ret;
3782
ed95b21a 3783 dout("%s rbd_dev %p\n", __func__, rbd_dev);
36be9a76 3784
ed95b21a
ID
3785 ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
3786 &reply_pages, &reply_len);
3787 if (ret && ret != -ETIMEDOUT) {
3788 rbd_warn(rbd_dev, "failed to request lock: %d", ret);
36be9a76 3789 goto out;
ed95b21a 3790 }
36be9a76 3791
ed95b21a
ID
3792 if (reply_len > 0 && reply_len <= PAGE_SIZE) {
3793 void *p = page_address(reply_pages[0]);
3794 void *const end = p + reply_len;
3795 u32 n;
36be9a76 3796
ed95b21a
ID
3797 ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
3798 while (n--) {
3799 u8 struct_v;
3800 u32 len;
36be9a76 3801
ed95b21a
ID
3802 ceph_decode_need(&p, end, 8 + 8, e_inval);
3803 p += 8 + 8; /* skip gid and cookie */
04017e29 3804
ed95b21a
ID
3805 ceph_decode_32_safe(&p, end, len, e_inval);
3806 if (!len)
3807 continue;
3808
3809 if (lock_owner_responded) {
3810 rbd_warn(rbd_dev,
3811 "duplicate lock owners detected");
3812 ret = -EIO;
3813 goto out;
3814 }
3815
3816 lock_owner_responded = true;
3817 ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
3818 &struct_v, &len);
3819 if (ret) {
3820 rbd_warn(rbd_dev,
3821 "failed to decode ResponseMessage: %d",
3822 ret);
3823 goto e_inval;
3824 }
3825
3826 ret = ceph_decode_32(&p);
3827 }
3828 }
3829
3830 if (!lock_owner_responded) {
3831 rbd_warn(rbd_dev, "no lock owners detected");
3832 ret = -ETIMEDOUT;
3833 }
3834
3835out:
3836 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3837 return ret;
3838
3839e_inval:
3840 ret = -EINVAL;
3841 goto out;
3842}
3843
637cd060
ID
3844/*
3845 * Either image request state machine(s) or rbd_add_acquire_lock()
3846 * (i.e. "rbd map").
3847 */
3848static void wake_lock_waiters(struct rbd_device *rbd_dev, int result)
ed95b21a 3849{
637cd060
ID
3850 struct rbd_img_request *img_req;
3851
3852 dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
d9b9c893 3853 lockdep_assert_held_write(&rbd_dev->lock_rwsem);
ed95b21a
ID
3854
3855 cancel_delayed_work(&rbd_dev->lock_dwork);
637cd060
ID
3856 if (!completion_done(&rbd_dev->acquire_wait)) {
3857 rbd_assert(list_empty(&rbd_dev->acquiring_list) &&
3858 list_empty(&rbd_dev->running_list));
3859 rbd_dev->acquire_err = result;
3860 complete_all(&rbd_dev->acquire_wait);
3861 return;
3862 }
3863
3864 list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) {
3865 mutex_lock(&img_req->state_mutex);
3866 rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK);
3867 rbd_img_schedule(img_req, result);
3868 mutex_unlock(&img_req->state_mutex);
3869 }
3870
3871 list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list);
ed95b21a
ID
3872}
3873
3874static int get_lock_owner_info(struct rbd_device *rbd_dev,
3875 struct ceph_locker **lockers, u32 *num_lockers)
3876{
3877 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3878 u8 lock_type;
3879 char *lock_tag;
3880 int ret;
3881
3882 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3883
3884 ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
3885 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3886 &lock_type, &lock_tag, lockers, num_lockers);
3887 if (ret)
3888 return ret;
3889
3890 if (*num_lockers == 0) {
3891 dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
3892 goto out;
3893 }
3894
3895 if (strcmp(lock_tag, RBD_LOCK_TAG)) {
3896 rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
3897 lock_tag);
3898 ret = -EBUSY;
3899 goto out;
3900 }
3901
3902 if (lock_type == CEPH_CLS_LOCK_SHARED) {
3903 rbd_warn(rbd_dev, "shared lock type detected");
3904 ret = -EBUSY;
3905 goto out;
3906 }
3907
3908 if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
3909 strlen(RBD_LOCK_COOKIE_PREFIX))) {
3910 rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
3911 (*lockers)[0].id.cookie);
3912 ret = -EBUSY;
3913 goto out;
3914 }
3915
3916out:
3917 kfree(lock_tag);
3918 return ret;
3919}
3920
3921static int find_watcher(struct rbd_device *rbd_dev,
3922 const struct ceph_locker *locker)
3923{
3924 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3925 struct ceph_watch_item *watchers;
3926 u32 num_watchers;
3927 u64 cookie;
3928 int i;
3929 int ret;
3930
3931 ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
3932 &rbd_dev->header_oloc, &watchers,
3933 &num_watchers);
3934 if (ret)
3935 return ret;
3936
3937 sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
3938 for (i = 0; i < num_watchers; i++) {
3939 if (!memcmp(&watchers[i].addr, &locker->info.addr,
3940 sizeof(locker->info.addr)) &&
3941 watchers[i].cookie == cookie) {
3942 struct rbd_client_id cid = {
3943 .gid = le64_to_cpu(watchers[i].name.num),
3944 .handle = cookie,
3945 };
3946
3947 dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
3948 rbd_dev, cid.gid, cid.handle);
3949 rbd_set_owner_cid(rbd_dev, &cid);
3950 ret = 1;
3951 goto out;
3952 }
3953 }
3954
3955 dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
3956 ret = 0;
3957out:
3958 kfree(watchers);
3959 return ret;
3960}
3961
3962/*
3963 * lock_rwsem must be held for write
3964 */
3965static int rbd_try_lock(struct rbd_device *rbd_dev)
3966{
3967 struct ceph_client *client = rbd_dev->rbd_client->client;
3968 struct ceph_locker *lockers;
3969 u32 num_lockers;
3970 int ret;
3971
3972 for (;;) {
3973 ret = rbd_lock(rbd_dev);
3974 if (ret != -EBUSY)
3975 return ret;
3976
3977 /* determine if the current lock holder is still alive */
3978 ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
3979 if (ret)
3980 return ret;
3981
3982 if (num_lockers == 0)
3983 goto again;
3984
3985 ret = find_watcher(rbd_dev, lockers);
637cd060
ID
3986 if (ret)
3987 goto out; /* request lock or error */
ed95b21a 3988
22e8bd51 3989 rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
ed95b21a
ID
3990 ENTITY_NAME(lockers[0].id.name));
3991
3992 ret = ceph_monc_blacklist_add(&client->monc,
3993 &lockers[0].info.addr);
3994 if (ret) {
3995 rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
3996 ENTITY_NAME(lockers[0].id.name), ret);
3997 goto out;
3998 }
3999
4000 ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
4001 &rbd_dev->header_oloc, RBD_LOCK_NAME,
4002 lockers[0].id.cookie,
4003 &lockers[0].id.name);
4004 if (ret && ret != -ENOENT)
4005 goto out;
4006
4007again:
4008 ceph_free_lockers(lockers, num_lockers);
4009 }
4010
4011out:
4012 ceph_free_lockers(lockers, num_lockers);
4013 return ret;
4014}
4015
22e8bd51
ID
4016static int rbd_post_acquire_action(struct rbd_device *rbd_dev)
4017{
4018 int ret;
4019
4020 if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) {
4021 ret = rbd_object_map_open(rbd_dev);
4022 if (ret)
4023 return ret;
4024 }
4025
4026 return 0;
4027}
4028
ed95b21a 4029/*
637cd060
ID
4030 * Return:
4031 * 0 - lock acquired
4032 * 1 - caller should call rbd_request_lock()
4033 * <0 - error
ed95b21a 4034 */
637cd060 4035static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
ed95b21a 4036{
637cd060 4037 int ret;
ed95b21a
ID
4038
4039 down_read(&rbd_dev->lock_rwsem);
4040 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
4041 rbd_dev->lock_state);
4042 if (__rbd_is_lock_owner(rbd_dev)) {
ed95b21a 4043 up_read(&rbd_dev->lock_rwsem);
637cd060 4044 return 0;
ed95b21a
ID
4045 }
4046
4047 up_read(&rbd_dev->lock_rwsem);
4048 down_write(&rbd_dev->lock_rwsem);
4049 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
4050 rbd_dev->lock_state);
637cd060
ID
4051 if (__rbd_is_lock_owner(rbd_dev)) {
4052 up_write(&rbd_dev->lock_rwsem);
4053 return 0;
ed95b21a
ID
4054 }
4055
637cd060
ID
4056 ret = rbd_try_lock(rbd_dev);
4057 if (ret < 0) {
4058 rbd_warn(rbd_dev, "failed to lock header: %d", ret);
4059 if (ret == -EBLACKLISTED)
4060 goto out;
4061
4062 ret = 1; /* request lock anyway */
4063 }
4064 if (ret > 0) {
4065 up_write(&rbd_dev->lock_rwsem);
4066 return ret;
4067 }
4068
4069 rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED);
4070 rbd_assert(list_empty(&rbd_dev->running_list));
4071
22e8bd51
ID
4072 ret = rbd_post_acquire_action(rbd_dev);
4073 if (ret) {
4074 rbd_warn(rbd_dev, "post-acquire action failed: %d", ret);
4075 /*
4076 * Can't stay in RBD_LOCK_STATE_LOCKED because
4077 * rbd_lock_add_request() would let the request through,
4078 * assuming that e.g. object map is locked and loaded.
4079 */
4080 rbd_unlock(rbd_dev);
ed95b21a
ID
4081 }
4082
637cd060
ID
4083out:
4084 wake_lock_waiters(rbd_dev, ret);
ed95b21a 4085 up_write(&rbd_dev->lock_rwsem);
637cd060 4086 return ret;
ed95b21a
ID
4087}
4088
4089static void rbd_acquire_lock(struct work_struct *work)
4090{
4091 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
4092 struct rbd_device, lock_dwork);
637cd060 4093 int ret;
ed95b21a
ID
4094
4095 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4096again:
637cd060
ID
4097 ret = rbd_try_acquire_lock(rbd_dev);
4098 if (ret <= 0) {
4099 dout("%s rbd_dev %p ret %d - done\n", __func__, rbd_dev, ret);
ed95b21a
ID
4100 return;
4101 }
4102
4103 ret = rbd_request_lock(rbd_dev);
4104 if (ret == -ETIMEDOUT) {
4105 goto again; /* treat this as a dead client */
e010dd0a
ID
4106 } else if (ret == -EROFS) {
4107 rbd_warn(rbd_dev, "peer will not release lock");
637cd060
ID
4108 down_write(&rbd_dev->lock_rwsem);
4109 wake_lock_waiters(rbd_dev, ret);
4110 up_write(&rbd_dev->lock_rwsem);
ed95b21a
ID
4111 } else if (ret < 0) {
4112 rbd_warn(rbd_dev, "error requesting lock: %d", ret);
4113 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4114 RBD_RETRY_DELAY);
4115 } else {
4116 /*
4117 * lock owner acked, but resend if we don't see them
4118 * release the lock
4119 */
6b0a8774 4120 dout("%s rbd_dev %p requeuing lock_dwork\n", __func__,
ed95b21a
ID
4121 rbd_dev);
4122 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4123 msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
4124 }
4125}
4126
a2b1da09 4127static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
ed95b21a 4128{
e1fddc8f
ID
4129 bool need_wait;
4130
a2b1da09 4131 dout("%s rbd_dev %p\n", __func__, rbd_dev);
d9b9c893 4132 lockdep_assert_held_write(&rbd_dev->lock_rwsem);
a2b1da09 4133
ed95b21a
ID
4134 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
4135 return false;
4136
52bb1f9b 4137 /*
ed95b21a 4138 * Ensure that all in-flight IO is flushed.
52bb1f9b 4139 */
e1fddc8f
ID
4140 rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
4141 rbd_assert(!completion_done(&rbd_dev->releasing_wait));
4142 need_wait = !list_empty(&rbd_dev->running_list);
4143 downgrade_write(&rbd_dev->lock_rwsem);
4144 if (need_wait)
4145 wait_for_completion(&rbd_dev->releasing_wait);
ed95b21a
ID
4146 up_read(&rbd_dev->lock_rwsem);
4147
4148 down_write(&rbd_dev->lock_rwsem);
ed95b21a
ID
4149 if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
4150 return false;
4151
e1fddc8f 4152 rbd_assert(list_empty(&rbd_dev->running_list));
a2b1da09
ID
4153 return true;
4154}
4155
22e8bd51
ID
4156static void rbd_pre_release_action(struct rbd_device *rbd_dev)
4157{
4158 if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)
4159 rbd_object_map_close(rbd_dev);
4160}
4161
e1fddc8f
ID
4162static void __rbd_release_lock(struct rbd_device *rbd_dev)
4163{
4164 rbd_assert(list_empty(&rbd_dev->running_list));
4165
22e8bd51 4166 rbd_pre_release_action(rbd_dev);
bbead745 4167 rbd_unlock(rbd_dev);
e1fddc8f
ID
4168}
4169
a2b1da09
ID
4170/*
4171 * lock_rwsem must be held for write
4172 */
4173static void rbd_release_lock(struct rbd_device *rbd_dev)
4174{
4175 if (!rbd_quiesce_lock(rbd_dev))
4176 return;
4177
e1fddc8f 4178 __rbd_release_lock(rbd_dev);
a2b1da09 4179
bbead745
ID
4180 /*
4181 * Give others a chance to grab the lock - we would re-acquire
637cd060
ID
4182 * almost immediately if we got new IO while draining the running
4183 * list otherwise. We need to ack our own notifications, so this
4184 * lock_dwork will be requeued from rbd_handle_released_lock() by
4185 * way of maybe_kick_acquire().
bbead745
ID
4186 */
4187 cancel_delayed_work(&rbd_dev->lock_dwork);
ed95b21a
ID
4188}
4189
4190static void rbd_release_lock_work(struct work_struct *work)
4191{
4192 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
4193 unlock_work);
4194
4195 down_write(&rbd_dev->lock_rwsem);
4196 rbd_release_lock(rbd_dev);
4197 up_write(&rbd_dev->lock_rwsem);
4198}
4199
637cd060
ID
4200static void maybe_kick_acquire(struct rbd_device *rbd_dev)
4201{
4202 bool have_requests;
4203
4204 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4205 if (__rbd_is_lock_owner(rbd_dev))
4206 return;
4207
4208 spin_lock(&rbd_dev->lock_lists_lock);
4209 have_requests = !list_empty(&rbd_dev->acquiring_list);
4210 spin_unlock(&rbd_dev->lock_lists_lock);
4211 if (have_requests || delayed_work_pending(&rbd_dev->lock_dwork)) {
4212 dout("%s rbd_dev %p kicking lock_dwork\n", __func__, rbd_dev);
4213 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
4214 }
4215}
4216
ed95b21a
ID
4217static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
4218 void **p)
4219{
4220 struct rbd_client_id cid = { 0 };
4221
4222 if (struct_v >= 2) {
4223 cid.gid = ceph_decode_64(p);
4224 cid.handle = ceph_decode_64(p);
4225 }
4226
4227 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4228 cid.handle);
4229 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
4230 down_write(&rbd_dev->lock_rwsem);
4231 if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
4232 /*
4233 * we already know that the remote client is
4234 * the owner
4235 */
4236 up_write(&rbd_dev->lock_rwsem);
4237 return;
4238 }
4239
4240 rbd_set_owner_cid(rbd_dev, &cid);
4241 downgrade_write(&rbd_dev->lock_rwsem);
4242 } else {
4243 down_read(&rbd_dev->lock_rwsem);
4244 }
4245
637cd060 4246 maybe_kick_acquire(rbd_dev);
ed95b21a
ID
4247 up_read(&rbd_dev->lock_rwsem);
4248}
4249
4250static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
4251 void **p)
4252{
4253 struct rbd_client_id cid = { 0 };
4254
4255 if (struct_v >= 2) {
4256 cid.gid = ceph_decode_64(p);
4257 cid.handle = ceph_decode_64(p);
4258 }
4259
4260 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4261 cid.handle);
4262 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
4263 down_write(&rbd_dev->lock_rwsem);
4264 if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
4265 dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
4266 __func__, rbd_dev, cid.gid, cid.handle,
4267 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
4268 up_write(&rbd_dev->lock_rwsem);
4269 return;
4270 }
4271
4272 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
4273 downgrade_write(&rbd_dev->lock_rwsem);
4274 } else {
4275 down_read(&rbd_dev->lock_rwsem);
4276 }
4277
637cd060 4278 maybe_kick_acquire(rbd_dev);
ed95b21a
ID
4279 up_read(&rbd_dev->lock_rwsem);
4280}
4281
3b77faa0
ID
4282/*
4283 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
4284 * ResponseMessage is needed.
4285 */
4286static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
4287 void **p)
ed95b21a
ID
4288{
4289 struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
4290 struct rbd_client_id cid = { 0 };
3b77faa0 4291 int result = 1;
ed95b21a
ID
4292
4293 if (struct_v >= 2) {
4294 cid.gid = ceph_decode_64(p);
4295 cid.handle = ceph_decode_64(p);
4296 }
4297
4298 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4299 cid.handle);
4300 if (rbd_cid_equal(&cid, &my_cid))
3b77faa0 4301 return result;
ed95b21a
ID
4302
4303 down_read(&rbd_dev->lock_rwsem);
3b77faa0
ID
4304 if (__rbd_is_lock_owner(rbd_dev)) {
4305 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
4306 rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
4307 goto out_unlock;
4308
4309 /*
4310 * encode ResponseMessage(0) so the peer can detect
4311 * a missing owner
4312 */
4313 result = 0;
4314
4315 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
e010dd0a
ID
4316 if (!rbd_dev->opts->exclusive) {
4317 dout("%s rbd_dev %p queueing unlock_work\n",
4318 __func__, rbd_dev);
4319 queue_work(rbd_dev->task_wq,
4320 &rbd_dev->unlock_work);
4321 } else {
4322 /* refuse to release the lock */
4323 result = -EROFS;
4324 }
ed95b21a
ID
4325 }
4326 }
3b77faa0
ID
4327
4328out_unlock:
ed95b21a 4329 up_read(&rbd_dev->lock_rwsem);
3b77faa0 4330 return result;
ed95b21a
ID
4331}
4332
4333static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
4334 u64 notify_id, u64 cookie, s32 *result)
4335{
4336 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
08a79102
KS
4337 char buf[4 + CEPH_ENCODING_START_BLK_LEN];
4338 int buf_size = sizeof(buf);
ed95b21a
ID
4339 int ret;
4340
4341 if (result) {
4342 void *p = buf;
4343
4344 /* encode ResponseMessage */
4345 ceph_start_encoding(&p, 1, 1,
4346 buf_size - CEPH_ENCODING_START_BLK_LEN);
4347 ceph_encode_32(&p, *result);
4348 } else {
4349 buf_size = 0;
4350 }
b8d70035 4351
922dab61
ID
4352 ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
4353 &rbd_dev->header_oloc, notify_id, cookie,
ed95b21a 4354 buf, buf_size);
52bb1f9b 4355 if (ret)
ed95b21a
ID
4356 rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
4357}
4358
4359static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
4360 u64 cookie)
4361{
4362 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4363 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
4364}
4365
4366static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
4367 u64 notify_id, u64 cookie, s32 result)
4368{
4369 dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
4370 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
4371}
4372
4373static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
4374 u64 notifier_id, void *data, size_t data_len)
4375{
4376 struct rbd_device *rbd_dev = arg;
4377 void *p = data;
4378 void *const end = p + data_len;
d4c2269b 4379 u8 struct_v = 0;
ed95b21a
ID
4380 u32 len;
4381 u32 notify_op;
4382 int ret;
4383
4384 dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
4385 __func__, rbd_dev, cookie, notify_id, data_len);
4386 if (data_len) {
4387 ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
4388 &struct_v, &len);
4389 if (ret) {
4390 rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
4391 ret);
4392 return;
4393 }
4394
4395 notify_op = ceph_decode_32(&p);
4396 } else {
4397 /* legacy notification for header updates */
4398 notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
4399 len = 0;
4400 }
4401
4402 dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
4403 switch (notify_op) {
4404 case RBD_NOTIFY_OP_ACQUIRED_LOCK:
4405 rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
4406 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4407 break;
4408 case RBD_NOTIFY_OP_RELEASED_LOCK:
4409 rbd_handle_released_lock(rbd_dev, struct_v, &p);
4410 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4411 break;
4412 case RBD_NOTIFY_OP_REQUEST_LOCK:
3b77faa0
ID
4413 ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
4414 if (ret <= 0)
ed95b21a 4415 rbd_acknowledge_notify_result(rbd_dev, notify_id,
3b77faa0 4416 cookie, ret);
ed95b21a
ID
4417 else
4418 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4419 break;
4420 case RBD_NOTIFY_OP_HEADER_UPDATE:
4421 ret = rbd_dev_refresh(rbd_dev);
4422 if (ret)
4423 rbd_warn(rbd_dev, "refresh failed: %d", ret);
4424
4425 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4426 break;
4427 default:
4428 if (rbd_is_lock_owner(rbd_dev))
4429 rbd_acknowledge_notify_result(rbd_dev, notify_id,
4430 cookie, -EOPNOTSUPP);
4431 else
4432 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4433 break;
4434 }
b8d70035
AE
4435}
4436
99d16943
ID
4437static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
4438
922dab61 4439static void rbd_watch_errcb(void *arg, u64 cookie, int err)
bb040aa0 4440{
922dab61 4441 struct rbd_device *rbd_dev = arg;
bb040aa0 4442
922dab61 4443 rbd_warn(rbd_dev, "encountered watch error: %d", err);
bb040aa0 4444
ed95b21a
ID
4445 down_write(&rbd_dev->lock_rwsem);
4446 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
4447 up_write(&rbd_dev->lock_rwsem);
4448
99d16943
ID
4449 mutex_lock(&rbd_dev->watch_mutex);
4450 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
4451 __rbd_unregister_watch(rbd_dev);
4452 rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
bb040aa0 4453
99d16943 4454 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
bb040aa0 4455 }
99d16943 4456 mutex_unlock(&rbd_dev->watch_mutex);
bb040aa0
ID
4457}
4458
9969ebc5 4459/*
99d16943 4460 * watch_mutex must be locked
9969ebc5 4461 */
99d16943 4462static int __rbd_register_watch(struct rbd_device *rbd_dev)
9969ebc5
AE
4463{
4464 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
922dab61 4465 struct ceph_osd_linger_request *handle;
9969ebc5 4466
922dab61 4467 rbd_assert(!rbd_dev->watch_handle);
99d16943 4468 dout("%s rbd_dev %p\n", __func__, rbd_dev);
9969ebc5 4469
922dab61
ID
4470 handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
4471 &rbd_dev->header_oloc, rbd_watch_cb,
4472 rbd_watch_errcb, rbd_dev);
4473 if (IS_ERR(handle))
4474 return PTR_ERR(handle);
8eb87565 4475
922dab61 4476 rbd_dev->watch_handle = handle;
b30a01f2 4477 return 0;
b30a01f2
ID
4478}
4479
99d16943
ID
4480/*
4481 * watch_mutex must be locked
4482 */
4483static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
b30a01f2 4484{
922dab61
ID
4485 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4486 int ret;
b30a01f2 4487
99d16943
ID
4488 rbd_assert(rbd_dev->watch_handle);
4489 dout("%s rbd_dev %p\n", __func__, rbd_dev);
b30a01f2 4490
922dab61
ID
4491 ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
4492 if (ret)
4493 rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
76756a51 4494
922dab61 4495 rbd_dev->watch_handle = NULL;
c525f036
ID
4496}
4497
99d16943
ID
4498static int rbd_register_watch(struct rbd_device *rbd_dev)
4499{
4500 int ret;
4501
4502 mutex_lock(&rbd_dev->watch_mutex);
4503 rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
4504 ret = __rbd_register_watch(rbd_dev);
4505 if (ret)
4506 goto out;
4507
4508 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
4509 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
4510
4511out:
4512 mutex_unlock(&rbd_dev->watch_mutex);
4513 return ret;
4514}
4515
4516static void cancel_tasks_sync(struct rbd_device *rbd_dev)
c525f036 4517{
99d16943
ID
4518 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4519
ed95b21a
ID
4520 cancel_work_sync(&rbd_dev->acquired_lock_work);
4521 cancel_work_sync(&rbd_dev->released_lock_work);
4522 cancel_delayed_work_sync(&rbd_dev->lock_dwork);
4523 cancel_work_sync(&rbd_dev->unlock_work);
99d16943
ID
4524}
4525
0e4e1de5
ID
4526/*
4527 * header_rwsem must not be held to avoid a deadlock with
4528 * rbd_dev_refresh() when flushing notifies.
4529 */
99d16943
ID
4530static void rbd_unregister_watch(struct rbd_device *rbd_dev)
4531{
4532 cancel_tasks_sync(rbd_dev);
4533
4534 mutex_lock(&rbd_dev->watch_mutex);
4535 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
4536 __rbd_unregister_watch(rbd_dev);
4537 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
4538 mutex_unlock(&rbd_dev->watch_mutex);
811c6688 4539
23edca86 4540 cancel_delayed_work_sync(&rbd_dev->watch_dwork);
811c6688 4541 ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
fca27065
ID
4542}
4543
14bb211d
ID
4544/*
4545 * lock_rwsem must be held for write
4546 */
4547static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
4548{
4549 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4550 char cookie[32];
4551 int ret;
4552
a2b1da09
ID
4553 if (!rbd_quiesce_lock(rbd_dev))
4554 return;
14bb211d
ID
4555
4556 format_lock_cookie(rbd_dev, cookie);
4557 ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
4558 &rbd_dev->header_oloc, RBD_LOCK_NAME,
4559 CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
4560 RBD_LOCK_TAG, cookie);
4561 if (ret) {
4562 if (ret != -EOPNOTSUPP)
4563 rbd_warn(rbd_dev, "failed to update lock cookie: %d",
4564 ret);
4565
4566 /*
4567 * Lock cookie cannot be updated on older OSDs, so do
4568 * a manual release and queue an acquire.
4569 */
e1fddc8f 4570 __rbd_release_lock(rbd_dev);
a2b1da09 4571 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
14bb211d 4572 } else {
edd8ca80 4573 __rbd_lock(rbd_dev, cookie);
637cd060 4574 wake_lock_waiters(rbd_dev, 0);
14bb211d
ID
4575 }
4576}
4577
99d16943
ID
4578static void rbd_reregister_watch(struct work_struct *work)
4579{
4580 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
4581 struct rbd_device, watch_dwork);
4582 int ret;
4583
4584 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4585
4586 mutex_lock(&rbd_dev->watch_mutex);
87c0fded
ID
4587 if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
4588 mutex_unlock(&rbd_dev->watch_mutex);
14bb211d 4589 return;
87c0fded 4590 }
99d16943
ID
4591
4592 ret = __rbd_register_watch(rbd_dev);
4593 if (ret) {
4594 rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
637cd060 4595 if (ret != -EBLACKLISTED && ret != -ENOENT) {
99d16943
ID
4596 queue_delayed_work(rbd_dev->task_wq,
4597 &rbd_dev->watch_dwork,
4598 RBD_RETRY_DELAY);
637cd060
ID
4599 mutex_unlock(&rbd_dev->watch_mutex);
4600 return;
87c0fded 4601 }
637cd060 4602
87c0fded 4603 mutex_unlock(&rbd_dev->watch_mutex);
637cd060
ID
4604 down_write(&rbd_dev->lock_rwsem);
4605 wake_lock_waiters(rbd_dev, ret);
4606 up_write(&rbd_dev->lock_rwsem);
14bb211d 4607 return;
99d16943
ID
4608 }
4609
4610 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
4611 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
4612 mutex_unlock(&rbd_dev->watch_mutex);
4613
14bb211d
ID
4614 down_write(&rbd_dev->lock_rwsem);
4615 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
4616 rbd_reacquire_lock(rbd_dev);
4617 up_write(&rbd_dev->lock_rwsem);
4618
99d16943
ID
4619 ret = rbd_dev_refresh(rbd_dev);
4620 if (ret)
f6870cc9 4621 rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
99d16943
ID
4622}
4623
36be9a76 4624/*
f40eb349
AE
4625 * Synchronous osd object method call. Returns the number of bytes
4626 * returned in the outbound buffer, or a negative error code.
36be9a76
AE
4627 */
4628static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
ecd4a68a
ID
4629 struct ceph_object_id *oid,
4630 struct ceph_object_locator *oloc,
36be9a76 4631 const char *method_name,
4157976b 4632 const void *outbound,
36be9a76 4633 size_t outbound_size,
4157976b 4634 void *inbound,
e2a58ee5 4635 size_t inbound_size)
36be9a76 4636{
ecd4a68a
ID
4637 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4638 struct page *req_page = NULL;
4639 struct page *reply_page;
36be9a76
AE
4640 int ret;
4641
4642 /*
6010a451
AE
4643 * Method calls are ultimately read operations. The result
4644 * should placed into the inbound buffer provided. They
4645 * also supply outbound data--parameters for the object
4646 * method. Currently if this is present it will be a
4647 * snapshot id.
36be9a76 4648 */
ecd4a68a
ID
4649 if (outbound) {
4650 if (outbound_size > PAGE_SIZE)
4651 return -E2BIG;
36be9a76 4652
ecd4a68a
ID
4653 req_page = alloc_page(GFP_KERNEL);
4654 if (!req_page)
4655 return -ENOMEM;
04017e29 4656
ecd4a68a 4657 memcpy(page_address(req_page), outbound, outbound_size);
04017e29 4658 }
36be9a76 4659
ecd4a68a
ID
4660 reply_page = alloc_page(GFP_KERNEL);
4661 if (!reply_page) {
4662 if (req_page)
4663 __free_page(req_page);
4664 return -ENOMEM;
4665 }
57385b51 4666
ecd4a68a
ID
4667 ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
4668 CEPH_OSD_FLAG_READ, req_page, outbound_size,
68ada915 4669 &reply_page, &inbound_size);
ecd4a68a
ID
4670 if (!ret) {
4671 memcpy(inbound, page_address(reply_page), inbound_size);
4672 ret = inbound_size;
4673 }
36be9a76 4674
ecd4a68a
ID
4675 if (req_page)
4676 __free_page(req_page);
4677 __free_page(reply_page);
36be9a76
AE
4678 return ret;
4679}
4680
7ad18afa 4681static void rbd_queue_workfn(struct work_struct *work)
bf0d5f50 4682{
59e542c8
ID
4683 struct rbd_img_request *img_request =
4684 container_of(work, struct rbd_img_request, work);
4685 struct rbd_device *rbd_dev = img_request->rbd_dev;
4686 enum obj_operation_type op_type = img_request->op_type;
4687 struct request *rq = blk_mq_rq_from_pdu(img_request);
bc1ecc65
ID
4688 u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
4689 u64 length = blk_rq_bytes(rq);
4e752f0a 4690 u64 mapping_size;
bf0d5f50
AE
4691 int result;
4692
bc1ecc65 4693 /* Ignore/skip any zero-length requests */
bc1ecc65
ID
4694 if (!length) {
4695 dout("%s: zero-length request\n", __func__);
4696 result = 0;
59e542c8 4697 goto err_img_request;
bc1ecc65 4698 }
4dda41d3 4699
7ad18afa
CH
4700 blk_mq_start_request(rq);
4701
4e752f0a
JD
4702 down_read(&rbd_dev->header_rwsem);
4703 mapping_size = rbd_dev->mapping.size;
a52cc685 4704 rbd_img_capture_header(img_request);
4e752f0a
JD
4705 up_read(&rbd_dev->header_rwsem);
4706
4707 if (offset + length > mapping_size) {
bc1ecc65 4708 rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
4e752f0a 4709 length, mapping_size);
bc1ecc65 4710 result = -EIO;
a52cc685 4711 goto err_img_request;
bc1ecc65 4712 }
bf0d5f50 4713
21ed05a8
ID
4714 dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
4715 img_request, obj_op_name(op_type), offset, length);
4716
6484cbe9 4717 if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
5a237819 4718 result = rbd_img_fill_nodata(img_request, offset, length);
90e98c52 4719 else
5a237819
ID
4720 result = rbd_img_fill_from_bio(img_request, offset, length,
4721 rq->bio);
0192ce2e 4722 if (result)
bc1ecc65 4723 goto err_img_request;
bf0d5f50 4724
e1fddc8f 4725 rbd_img_handle_request(img_request, 0);
bc1ecc65 4726 return;
bf0d5f50 4727
bc1ecc65 4728err_img_request:
679a97d2 4729 rbd_img_request_destroy(img_request);
bc1ecc65
ID
4730 if (result)
4731 rbd_warn(rbd_dev, "%s %llx at %llx result %d",
6d2940c8 4732 obj_op_name(op_type), length, offset, result);
2a842aca 4733 blk_mq_end_request(rq, errno_to_blk_status(result));
bc1ecc65 4734}
bf0d5f50 4735
fc17b653 4736static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
7ad18afa 4737 const struct blk_mq_queue_data *bd)
bc1ecc65 4738{
59e542c8
ID
4739 struct rbd_device *rbd_dev = hctx->queue->queuedata;
4740 struct rbd_img_request *img_req = blk_mq_rq_to_pdu(bd->rq);
4741 enum obj_operation_type op_type;
bf0d5f50 4742
59e542c8
ID
4743 switch (req_op(bd->rq)) {
4744 case REQ_OP_DISCARD:
4745 op_type = OBJ_OP_DISCARD;
4746 break;
4747 case REQ_OP_WRITE_ZEROES:
4748 op_type = OBJ_OP_ZEROOUT;
4749 break;
4750 case REQ_OP_WRITE:
4751 op_type = OBJ_OP_WRITE;
4752 break;
4753 case REQ_OP_READ:
4754 op_type = OBJ_OP_READ;
4755 break;
4756 default:
4757 rbd_warn(rbd_dev, "unknown req_op %d", req_op(bd->rq));
4758 return BLK_STS_IOERR;
4759 }
4760
4761 rbd_img_request_init(img_req, rbd_dev, op_type);
4762
4763 if (rbd_img_is_write(img_req)) {
4764 if (rbd_is_ro(rbd_dev)) {
4765 rbd_warn(rbd_dev, "%s on read-only mapping",
4766 obj_op_name(img_req->op_type));
4767 return BLK_STS_IOERR;
4768 }
4769 rbd_assert(!rbd_is_snap(rbd_dev));
4770 }
4771
4772 INIT_WORK(&img_req->work, rbd_queue_workfn);
4773 queue_work(rbd_wq, &img_req->work);
fc17b653 4774 return BLK_STS_OK;
bf0d5f50
AE
4775}
4776
602adf40
YS
4777static void rbd_free_disk(struct rbd_device *rbd_dev)
4778{
5769ed0c
ID
4779 blk_cleanup_queue(rbd_dev->disk->queue);
4780 blk_mq_free_tag_set(&rbd_dev->tag_set);
4781 put_disk(rbd_dev->disk);
a0cab924 4782 rbd_dev->disk = NULL;
602adf40
YS
4783}
4784
788e2df3 4785static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
fe5478e0
ID
4786 struct ceph_object_id *oid,
4787 struct ceph_object_locator *oloc,
4788 void *buf, int buf_len)
788e2df3
AE
4789
4790{
fe5478e0
ID
4791 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4792 struct ceph_osd_request *req;
4793 struct page **pages;
4794 int num_pages = calc_pages_for(0, buf_len);
788e2df3
AE
4795 int ret;
4796
fe5478e0
ID
4797 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
4798 if (!req)
4799 return -ENOMEM;
788e2df3 4800
fe5478e0
ID
4801 ceph_oid_copy(&req->r_base_oid, oid);
4802 ceph_oloc_copy(&req->r_base_oloc, oloc);
4803 req->r_flags = CEPH_OSD_FLAG_READ;
430c28c3 4804
fe5478e0
ID
4805 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
4806 if (IS_ERR(pages)) {
4807 ret = PTR_ERR(pages);
4808 goto out_req;
4809 }
1ceae7ef 4810
fe5478e0
ID
4811 osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
4812 osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
4813 true);
4814
26f887e0
ID
4815 ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
4816 if (ret)
4817 goto out_req;
4818
fe5478e0
ID
4819 ceph_osdc_start_request(osdc, req, false);
4820 ret = ceph_osdc_wait_request(osdc, req);
4821 if (ret >= 0)
4822 ceph_copy_from_page_vector(pages, buf, 0, ret);
788e2df3 4823
fe5478e0
ID
4824out_req:
4825 ceph_osdc_put_request(req);
788e2df3
AE
4826 return ret;
4827}
4828
602adf40 4829/*
662518b1
AE
4830 * Read the complete header for the given rbd device. On successful
4831 * return, the rbd_dev->header field will contain up-to-date
4832 * information about the image.
602adf40 4833 */
99a41ebc 4834static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
602adf40 4835{
4156d998 4836 struct rbd_image_header_ondisk *ondisk = NULL;
50f7c4c9 4837 u32 snap_count = 0;
4156d998
AE
4838 u64 names_size = 0;
4839 u32 want_count;
4840 int ret;
602adf40 4841
00f1f36f 4842 /*
4156d998
AE
4843 * The complete header will include an array of its 64-bit
4844 * snapshot ids, followed by the names of those snapshots as
4845 * a contiguous block of NUL-terminated strings. Note that
4846 * the number of snapshots could change by the time we read
4847 * it in, in which case we re-read it.
00f1f36f 4848 */
4156d998
AE
4849 do {
4850 size_t size;
4851
4852 kfree(ondisk);
4853
4854 size = sizeof (*ondisk);
4855 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
4856 size += names_size;
4857 ondisk = kmalloc(size, GFP_KERNEL);
4858 if (!ondisk)
662518b1 4859 return -ENOMEM;
4156d998 4860
fe5478e0
ID
4861 ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
4862 &rbd_dev->header_oloc, ondisk, size);
4156d998 4863 if (ret < 0)
662518b1 4864 goto out;
c0cd10db 4865 if ((size_t)ret < size) {
4156d998 4866 ret = -ENXIO;
06ecc6cb
AE
4867 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
4868 size, ret);
662518b1 4869 goto out;
4156d998
AE
4870 }
4871 if (!rbd_dev_ondisk_valid(ondisk)) {
4872 ret = -ENXIO;
06ecc6cb 4873 rbd_warn(rbd_dev, "invalid header");
662518b1 4874 goto out;
81e759fb 4875 }
602adf40 4876
4156d998
AE
4877 names_size = le64_to_cpu(ondisk->snap_names_len);
4878 want_count = snap_count;
4879 snap_count = le32_to_cpu(ondisk->snap_count);
4880 } while (snap_count != want_count);
00f1f36f 4881
662518b1
AE
4882 ret = rbd_header_from_disk(rbd_dev, ondisk);
4883out:
4156d998
AE
4884 kfree(ondisk);
4885
4886 return ret;
602adf40
YS
4887}
4888
9875201e
JD
4889static void rbd_dev_update_size(struct rbd_device *rbd_dev)
4890{
4891 sector_t size;
9875201e
JD
4892
4893 /*
811c6688
ID
4894 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
4895 * try to update its size. If REMOVING is set, updating size
4896 * is just useless work since the device can't be opened.
9875201e 4897 */
811c6688
ID
4898 if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
4899 !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
9875201e
JD
4900 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
4901 dout("setting size to %llu sectors", (unsigned long long)size);
4902 set_capacity(rbd_dev->disk, size);
4903 revalidate_disk(rbd_dev->disk);
4904 }
4905}
4906
cc4a38bd 4907static int rbd_dev_refresh(struct rbd_device *rbd_dev)
1fe5e993 4908{
e627db08 4909 u64 mapping_size;
1fe5e993
AE
4910 int ret;
4911
cfbf6377 4912 down_write(&rbd_dev->header_rwsem);
3b5cf2a2 4913 mapping_size = rbd_dev->mapping.size;
a720ae09
ID
4914
4915 ret = rbd_dev_header_info(rbd_dev);
52bb1f9b 4916 if (ret)
73e39e4d 4917 goto out;
15228ede 4918
e8f59b59
ID
4919 /*
4920 * If there is a parent, see if it has disappeared due to the
4921 * mapped image getting flattened.
4922 */
4923 if (rbd_dev->parent) {
4924 ret = rbd_dev_v2_parent_info(rbd_dev);
4925 if (ret)
73e39e4d 4926 goto out;
e8f59b59
ID
4927 }
4928
686238b7
ID
4929 rbd_assert(!rbd_is_snap(rbd_dev));
4930 rbd_dev->mapping.size = rbd_dev->header.image_size;
15228ede 4931
73e39e4d 4932out:
cfbf6377 4933 up_write(&rbd_dev->header_rwsem);
73e39e4d 4934 if (!ret && mapping_size != rbd_dev->mapping.size)
9875201e 4935 rbd_dev_update_size(rbd_dev);
1fe5e993 4936
73e39e4d 4937 return ret;
1fe5e993
AE
4938}
4939
f363b089 4940static const struct blk_mq_ops rbd_mq_ops = {
7ad18afa 4941 .queue_rq = rbd_queue_rq,
7ad18afa
CH
4942};
4943
602adf40
YS
4944static int rbd_init_disk(struct rbd_device *rbd_dev)
4945{
4946 struct gendisk *disk;
4947 struct request_queue *q;
420efbdf
ID
4948 unsigned int objset_bytes =
4949 rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
7ad18afa 4950 int err;
602adf40 4951
602adf40 4952 /* create gendisk info */
7e513d43
ID
4953 disk = alloc_disk(single_major ?
4954 (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
4955 RBD_MINORS_PER_MAJOR);
602adf40 4956 if (!disk)
1fcdb8aa 4957 return -ENOMEM;
602adf40 4958
f0f8cef5 4959 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
de71a297 4960 rbd_dev->dev_id);
602adf40 4961 disk->major = rbd_dev->major;
dd82fff1 4962 disk->first_minor = rbd_dev->minor;
7e513d43
ID
4963 if (single_major)
4964 disk->flags |= GENHD_FL_EXT_DEVT;
602adf40
YS
4965 disk->fops = &rbd_bd_ops;
4966 disk->private_data = rbd_dev;
4967
7ad18afa
CH
4968 memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
4969 rbd_dev->tag_set.ops = &rbd_mq_ops;
b5584180 4970 rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
7ad18afa 4971 rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
56d18f62 4972 rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
f9b6b98d 4973 rbd_dev->tag_set.nr_hw_queues = num_present_cpus();
59e542c8 4974 rbd_dev->tag_set.cmd_size = sizeof(struct rbd_img_request);
7ad18afa
CH
4975
4976 err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
4977 if (err)
602adf40 4978 goto out_disk;
029bcbd8 4979
7ad18afa
CH
4980 q = blk_mq_init_queue(&rbd_dev->tag_set);
4981 if (IS_ERR(q)) {
4982 err = PTR_ERR(q);
4983 goto out_tag_set;
4984 }
4985
8b904b5b 4986 blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
d8a2c89c 4987 /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
593a9e7b 4988
420efbdf 4989 blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
0d9fde4f 4990 q->limits.max_sectors = queue_max_hw_sectors(q);
21acdf45 4991 blk_queue_max_segments(q, USHRT_MAX);
24f1df60 4992 blk_queue_max_segment_size(q, UINT_MAX);
16d80c54
ID
4993 blk_queue_io_min(q, rbd_dev->opts->alloc_size);
4994 blk_queue_io_opt(q, rbd_dev->opts->alloc_size);
029bcbd8 4995
d9360540
ID
4996 if (rbd_dev->opts->trim) {
4997 blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
16d80c54 4998 q->limits.discard_granularity = rbd_dev->opts->alloc_size;
d9360540
ID
4999 blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
5000 blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
5001 }
90e98c52 5002
bae818ee 5003 if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
dc3b17cc 5004 q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
bae818ee 5005
5769ed0c
ID
5006 /*
5007 * disk_release() expects a queue ref from add_disk() and will
5008 * put it. Hold an extra ref until add_disk() is called.
5009 */
5010 WARN_ON(!blk_get_queue(q));
602adf40 5011 disk->queue = q;
602adf40
YS
5012 q->queuedata = rbd_dev;
5013
5014 rbd_dev->disk = disk;
602adf40 5015
602adf40 5016 return 0;
7ad18afa
CH
5017out_tag_set:
5018 blk_mq_free_tag_set(&rbd_dev->tag_set);
602adf40
YS
5019out_disk:
5020 put_disk(disk);
7ad18afa 5021 return err;
602adf40
YS
5022}
5023
dfc5606d
YS
5024/*
5025 sysfs
5026*/
5027
593a9e7b
AE
5028static struct rbd_device *dev_to_rbd_dev(struct device *dev)
5029{
5030 return container_of(dev, struct rbd_device, dev);
5031}
5032
dfc5606d
YS
5033static ssize_t rbd_size_show(struct device *dev,
5034 struct device_attribute *attr, char *buf)
5035{
593a9e7b 5036 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
a51aa0c0 5037
fc71d833
AE
5038 return sprintf(buf, "%llu\n",
5039 (unsigned long long)rbd_dev->mapping.size);
dfc5606d
YS
5040}
5041
34b13184
AE
5042static ssize_t rbd_features_show(struct device *dev,
5043 struct device_attribute *attr, char *buf)
5044{
5045 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5046
fa58bcad 5047 return sprintf(buf, "0x%016llx\n", rbd_dev->header.features);
34b13184
AE
5048}
5049
dfc5606d
YS
5050static ssize_t rbd_major_show(struct device *dev,
5051 struct device_attribute *attr, char *buf)
5052{
593a9e7b 5053 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 5054
fc71d833
AE
5055 if (rbd_dev->major)
5056 return sprintf(buf, "%d\n", rbd_dev->major);
5057
5058 return sprintf(buf, "(none)\n");
dd82fff1
ID
5059}
5060
5061static ssize_t rbd_minor_show(struct device *dev,
5062 struct device_attribute *attr, char *buf)
5063{
5064 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
fc71d833 5065
dd82fff1 5066 return sprintf(buf, "%d\n", rbd_dev->minor);
dfc5606d
YS
5067}
5068
005a07bf
ID
5069static ssize_t rbd_client_addr_show(struct device *dev,
5070 struct device_attribute *attr, char *buf)
5071{
5072 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5073 struct ceph_entity_addr *client_addr =
5074 ceph_client_addr(rbd_dev->rbd_client->client);
5075
5076 return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
5077 le32_to_cpu(client_addr->nonce));
5078}
5079
dfc5606d
YS
5080static ssize_t rbd_client_id_show(struct device *dev,
5081 struct device_attribute *attr, char *buf)
602adf40 5082{
593a9e7b 5083 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 5084
1dbb4399 5085 return sprintf(buf, "client%lld\n",
033268a5 5086 ceph_client_gid(rbd_dev->rbd_client->client));
602adf40
YS
5087}
5088
267fb90b
MC
5089static ssize_t rbd_cluster_fsid_show(struct device *dev,
5090 struct device_attribute *attr, char *buf)
5091{
5092 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5093
5094 return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
5095}
5096
0d6d1e9c
MC
5097static ssize_t rbd_config_info_show(struct device *dev,
5098 struct device_attribute *attr, char *buf)
5099{
5100 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5101
5102 return sprintf(buf, "%s\n", rbd_dev->config_info);
602adf40
YS
5103}
5104
dfc5606d
YS
5105static ssize_t rbd_pool_show(struct device *dev,
5106 struct device_attribute *attr, char *buf)
602adf40 5107{
593a9e7b 5108 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 5109
0d7dbfce 5110 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
dfc5606d
YS
5111}
5112
9bb2f334
AE
5113static ssize_t rbd_pool_id_show(struct device *dev,
5114 struct device_attribute *attr, char *buf)
5115{
5116 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5117
0d7dbfce 5118 return sprintf(buf, "%llu\n",
fc71d833 5119 (unsigned long long) rbd_dev->spec->pool_id);
9bb2f334
AE
5120}
5121
b26c047b
ID
5122static ssize_t rbd_pool_ns_show(struct device *dev,
5123 struct device_attribute *attr, char *buf)
5124{
5125 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5126
5127 return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
5128}
5129
dfc5606d
YS
5130static ssize_t rbd_name_show(struct device *dev,
5131 struct device_attribute *attr, char *buf)
5132{
593a9e7b 5133 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 5134
a92ffdf8
AE
5135 if (rbd_dev->spec->image_name)
5136 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
5137
5138 return sprintf(buf, "(unknown)\n");
dfc5606d
YS
5139}
5140
589d30e0
AE
5141static ssize_t rbd_image_id_show(struct device *dev,
5142 struct device_attribute *attr, char *buf)
5143{
5144 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5145
0d7dbfce 5146 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
589d30e0
AE
5147}
5148
34b13184
AE
5149/*
5150 * Shows the name of the currently-mapped snapshot (or
5151 * RBD_SNAP_HEAD_NAME for the base image).
5152 */
dfc5606d
YS
5153static ssize_t rbd_snap_show(struct device *dev,
5154 struct device_attribute *attr,
5155 char *buf)
5156{
593a9e7b 5157 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 5158
0d7dbfce 5159 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
dfc5606d
YS
5160}
5161
92a58671
MC
5162static ssize_t rbd_snap_id_show(struct device *dev,
5163 struct device_attribute *attr, char *buf)
5164{
5165 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5166
5167 return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
5168}
5169
86b00e0d 5170/*
ff96128f
ID
5171 * For a v2 image, shows the chain of parent images, separated by empty
5172 * lines. For v1 images or if there is no parent, shows "(no parent
5173 * image)".
86b00e0d
AE
5174 */
5175static ssize_t rbd_parent_show(struct device *dev,
ff96128f
ID
5176 struct device_attribute *attr,
5177 char *buf)
86b00e0d
AE
5178{
5179 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
ff96128f 5180 ssize_t count = 0;
86b00e0d 5181
ff96128f 5182 if (!rbd_dev->parent)
86b00e0d
AE
5183 return sprintf(buf, "(no parent image)\n");
5184
ff96128f
ID
5185 for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
5186 struct rbd_spec *spec = rbd_dev->parent_spec;
5187
5188 count += sprintf(&buf[count], "%s"
5189 "pool_id %llu\npool_name %s\n"
e92c0eaf 5190 "pool_ns %s\n"
ff96128f
ID
5191 "image_id %s\nimage_name %s\n"
5192 "snap_id %llu\nsnap_name %s\n"
5193 "overlap %llu\n",
5194 !count ? "" : "\n", /* first? */
5195 spec->pool_id, spec->pool_name,
e92c0eaf 5196 spec->pool_ns ?: "",
ff96128f
ID
5197 spec->image_id, spec->image_name ?: "(unknown)",
5198 spec->snap_id, spec->snap_name,
5199 rbd_dev->parent_overlap);
5200 }
5201
5202 return count;
86b00e0d
AE
5203}
5204
dfc5606d
YS
5205static ssize_t rbd_image_refresh(struct device *dev,
5206 struct device_attribute *attr,
5207 const char *buf,
5208 size_t size)
5209{
593a9e7b 5210 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 5211 int ret;
602adf40 5212
cc4a38bd 5213 ret = rbd_dev_refresh(rbd_dev);
e627db08 5214 if (ret)
52bb1f9b 5215 return ret;
b813623a 5216
52bb1f9b 5217 return size;
dfc5606d 5218}
602adf40 5219
5657a819
JP
5220static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
5221static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
5222static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
5223static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
5224static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
5225static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
5226static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
5227static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
5228static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
5229static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
b26c047b 5230static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
5231static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
5232static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
5233static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
5234static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
5235static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
5236static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
5237
5238static struct attribute *rbd_attrs[] = {
5239 &dev_attr_size.attr,
34b13184 5240 &dev_attr_features.attr,
dfc5606d 5241 &dev_attr_major.attr,
dd82fff1 5242 &dev_attr_minor.attr,
005a07bf 5243 &dev_attr_client_addr.attr,
dfc5606d 5244 &dev_attr_client_id.attr,
267fb90b 5245 &dev_attr_cluster_fsid.attr,
0d6d1e9c 5246 &dev_attr_config_info.attr,
dfc5606d 5247 &dev_attr_pool.attr,
9bb2f334 5248 &dev_attr_pool_id.attr,
b26c047b 5249 &dev_attr_pool_ns.attr,
dfc5606d 5250 &dev_attr_name.attr,
589d30e0 5251 &dev_attr_image_id.attr,
dfc5606d 5252 &dev_attr_current_snap.attr,
92a58671 5253 &dev_attr_snap_id.attr,
86b00e0d 5254 &dev_attr_parent.attr,
dfc5606d 5255 &dev_attr_refresh.attr,
5256 NULL
5257};
5258
5259static struct attribute_group rbd_attr_group = {
5260 .attrs = rbd_attrs,
5261};
5262
5263static const struct attribute_group *rbd_attr_groups[] = {
5264 &rbd_attr_group,
5265 NULL
5266};
5267
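/*
 * Editor's note: because rbd_attr_groups is hooked up through the
 * device type below, all of these attributes are created and torn
 * down automatically with the device; no device_create_file() calls
 * are needed. A hypothetical new read-only attribute (the name "foo"
 * is made up for illustration) would follow the same pattern:
 *
 *	static ssize_t rbd_foo_show(struct device *dev,
 *				    struct device_attribute *attr, char *buf)
 *	{
 *		struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 *
 *		return sprintf(buf, "%d\n", rbd_dev->dev_id);
 *	}
 *	static DEVICE_ATTR(foo, 0444, rbd_foo_show, NULL);
 *
 * plus an &dev_attr_foo.attr entry in rbd_attrs[].
 */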
6cac4695 5268static void rbd_dev_release(struct device *dev);
dfc5606d 5269
b9942bc9 5270static const struct device_type rbd_device_type = {
5271 .name = "rbd",
5272 .groups = rbd_attr_groups,
6cac4695 5273 .release = rbd_dev_release,
5274};
5275
5276static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
5277{
5278 kref_get(&spec->kref);
5279
5280 return spec;
5281}
5282
5283static void rbd_spec_free(struct kref *kref);
5284static void rbd_spec_put(struct rbd_spec *spec)
5285{
5286 if (spec)
5287 kref_put(&spec->kref, rbd_spec_free);
5288}
5289
5290static struct rbd_spec *rbd_spec_alloc(void)
5291{
5292 struct rbd_spec *spec;
5293
5294 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
5295 if (!spec)
5296 return NULL;
5297
5298 spec->pool_id = CEPH_NOPOOL;
5299 spec->snap_id = CEPH_NOSNAP;
5300 kref_init(&spec->kref);
5301
5302 return spec;
5303}
5304
5305static void rbd_spec_free(struct kref *kref)
5306{
5307 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
5308
5309 kfree(spec->pool_name);
b26c047b 5310 kfree(spec->pool_ns);
5311 kfree(spec->image_id);
5312 kfree(spec->image_name);
5313 kfree(spec->snap_name);
5314 kfree(spec);
5315}
5316
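/*
 * Editor's sketch of the resulting lifetime rules (hypothetical
 * usage, not driver code): rbd_spec_alloc() hands back a spec with a
 * reference count of 1, rbd_spec_get() takes an extra reference, and
 * the last rbd_spec_put() ends up in rbd_spec_free() via the kref:
 *
 *	struct rbd_spec *spec = rbd_spec_alloc();	// refcount 1
 *	struct rbd_spec *extra = rbd_spec_get(spec);	// refcount 2
 *
 *	rbd_spec_put(extra);	// refcount 1
 *	rbd_spec_put(spec);	// refcount 0 -> rbd_spec_free()
 */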
1643dfa4 5317static void rbd_dev_free(struct rbd_device *rbd_dev)
dd5ac32d 5318{
99d16943 5319 WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
ed95b21a 5320 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
dd5ac32d 5321
c41d13a3 5322 ceph_oid_destroy(&rbd_dev->header_oid);
6b6dddbe 5323 ceph_oloc_destroy(&rbd_dev->header_oloc);
0d6d1e9c 5324 kfree(rbd_dev->config_info);
c41d13a3 5325
5326 rbd_put_client(rbd_dev->rbd_client);
5327 rbd_spec_put(rbd_dev->spec);
5328 kfree(rbd_dev->opts);
5329 kfree(rbd_dev);
5330}
5331
5332static void rbd_dev_release(struct device *dev)
5333{
5334 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5335 bool need_put = !!rbd_dev->opts;
5336
5337 if (need_put) {
5338 destroy_workqueue(rbd_dev->task_wq);
5339 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
5340 }
5341
5342 rbd_dev_free(rbd_dev);
5343
5344 /*
 5345 * This is racy, but way better than putting module_put() outside of
5346 * the release callback. The race window is pretty small, so
5347 * doing something similar to dm (dm-builtin.c) is overkill.
5348 */
5349 if (need_put)
5350 module_put(THIS_MODULE);
5351}
5352
5353static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
5354 struct rbd_spec *spec)
5355{
5356 struct rbd_device *rbd_dev;
5357
1643dfa4 5358 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
5359 if (!rbd_dev)
5360 return NULL;
5361
5362 spin_lock_init(&rbd_dev->lock);
5363 INIT_LIST_HEAD(&rbd_dev->node);
5364 init_rwsem(&rbd_dev->header_rwsem);
5365
7e97332e 5366 rbd_dev->header.data_pool_id = CEPH_NOPOOL;
c41d13a3 5367 ceph_oid_init(&rbd_dev->header_oid);
431a02cd 5368 rbd_dev->header_oloc.pool = spec->pool_id;
5369 if (spec->pool_ns) {
5370 WARN_ON(!*spec->pool_ns);
5371 rbd_dev->header_oloc.pool_ns =
5372 ceph_find_or_create_string(spec->pool_ns,
5373 strlen(spec->pool_ns));
5374 }
c41d13a3 5375
5376 mutex_init(&rbd_dev->watch_mutex);
5377 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
5378 INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
5379
5380 init_rwsem(&rbd_dev->lock_rwsem);
5381 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
5382 INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
5383 INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
5384 INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
5385 INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
e1fddc8f 5386 spin_lock_init(&rbd_dev->lock_lists_lock);
637cd060 5387 INIT_LIST_HEAD(&rbd_dev->acquiring_list);
e1fddc8f 5388 INIT_LIST_HEAD(&rbd_dev->running_list);
637cd060 5389 init_completion(&rbd_dev->acquire_wait);
e1fddc8f 5390 init_completion(&rbd_dev->releasing_wait);
ed95b21a 5391
22e8bd51 5392 spin_lock_init(&rbd_dev->object_map_lock);
ed95b21a 5393
5394 rbd_dev->dev.bus = &rbd_bus_type;
5395 rbd_dev->dev.type = &rbd_device_type;
5396 rbd_dev->dev.parent = &rbd_root_dev;
5397 device_initialize(&rbd_dev->dev);
5398
c53d5893 5399 rbd_dev->rbd_client = rbdc;
d147543d 5400 rbd_dev->spec = spec;
0903e875 5401
5402 return rbd_dev;
5403}
5404
5405/*
5406 * Create a mapping rbd_dev.
5407 */
5408static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
5409 struct rbd_spec *spec,
5410 struct rbd_options *opts)
5411{
5412 struct rbd_device *rbd_dev;
5413
5414 rbd_dev = __rbd_dev_create(rbdc, spec);
5415 if (!rbd_dev)
5416 return NULL;
5417
5418 rbd_dev->opts = opts;
5419
5420 /* get an id and fill in device name */
5421 rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
5422 minor_to_rbd_dev_id(1 << MINORBITS),
5423 GFP_KERNEL);
5424 if (rbd_dev->dev_id < 0)
5425 goto fail_rbd_dev;
5426
5427 sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
5428 rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
5429 rbd_dev->name);
5430 if (!rbd_dev->task_wq)
5431 goto fail_dev_id;
dd5ac32d 5432
5433 /* we have a ref from do_rbd_add() */
5434 __module_get(THIS_MODULE);
dd5ac32d 5435
1643dfa4 5436 dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
c53d5893 5437 return rbd_dev;
5438
5439fail_dev_id:
5440 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
5441fail_rbd_dev:
5442 rbd_dev_free(rbd_dev);
5443 return NULL;
5444}
5445
5446static void rbd_dev_destroy(struct rbd_device *rbd_dev)
5447{
5448 if (rbd_dev)
5449 put_device(&rbd_dev->dev);
5450}
5451
5452/*
5453 * Get the size and object order for an image snapshot, or if
5454 * snap_id is CEPH_NOSNAP, gets this information for the base
5455 * image.
5456 */
5457static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
5458 u8 *order, u64 *snap_size)
5459{
5460 __le64 snapid = cpu_to_le64(snap_id);
5461 int ret;
5462 struct {
5463 u8 order;
5464 __le64 size;
5465 } __attribute__ ((packed)) size_buf = { 0 };
5466
5467 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5468 &rbd_dev->header_oloc, "get_size",
5469 &snapid, sizeof(snapid),
5470 &size_buf, sizeof(size_buf));
36be9a76 5471 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5472 if (ret < 0)
5473 return ret;
5474 if (ret < sizeof (size_buf))
5475 return -ERANGE;
9d475de5 5476
c3545579 5477 if (order) {
c86f86e9 5478 *order = size_buf.order;
5479 dout(" order %u", (unsigned int)*order);
5480 }
5481 *snap_size = le64_to_cpu(size_buf.size);
5482
5483 dout(" snap_id 0x%016llx snap_size = %llu\n",
5484 (unsigned long long)snap_id,
57385b51 5485 (unsigned long long)*snap_size);
5486
5487 return 0;
5488}
5489
5490static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
5491{
5492 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
5493 &rbd_dev->header.obj_order,
5494 &rbd_dev->header.image_size);
5495}
5496
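/*
 * Editor's sketch of the reply handled above, assuming (as the code
 * does) that get_size returns a packed little-endian
 * { u8 order; __le64 size; } pair:
 *
 *	struct {
 *		u8 order;
 *		__le64 size;
 *	} __attribute__ ((packed)) reply = {
 *		.order = 22,				// 1 << 22 = 4 MiB objects
 *		.size = cpu_to_le64(1ULL << 30),	// 1 GiB image
 *	};
 *	u64 image_bytes = le64_to_cpu(reply.size);
 *	u64 object_bytes = 1ULL << reply.order;
 *
 * order is a power-of-two shift, so it describes the size of the
 * backing RADOS objects rather than the image itself.
 */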
5497static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
5498{
5435d206 5499 size_t size;
5500 void *reply_buf;
5501 int ret;
5502 void *p;
5503
5504 /* Response will be an encoded string, which includes a length */
5505 size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX;
5506 reply_buf = kzalloc(size, GFP_KERNEL);
5507 if (!reply_buf)
5508 return -ENOMEM;
5509
5510 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5511 &rbd_dev->header_oloc, "get_object_prefix",
5435d206 5512 NULL, 0, reply_buf, size);
36be9a76 5513 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5514 if (ret < 0)
5515 goto out;
5516
5517 p = reply_buf;
5518 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
5519 p + ret, NULL, GFP_NOIO);
5520 ret = 0;
5521
5522 if (IS_ERR(rbd_dev->header.object_prefix)) {
5523 ret = PTR_ERR(rbd_dev->header.object_prefix);
5524 rbd_dev->header.object_prefix = NULL;
5525 } else {
5526 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
5527 }
5528out:
5529 kfree(reply_buf);
5530
5531 return ret;
5532}
5533
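/*
 * Editor's note: ceph_extract_encoded_string() consumes the usual
 * Ceph string encoding, a __le32 length followed by that many bytes
 * with no terminating NUL. A hand-rolled decoder for the same layout
 * (a sketch of what the helper does, including the ERR_PTR() return
 * convention checked above) would look roughly like:
 *
 *	static char *decode_string(void **p, void *end, gfp_t gfp)
 *	{
 *		u32 len = ceph_decode_32(p);	// no bounds check itself
 *		char *s;
 *
 *		if (len > end - *p)
 *			return ERR_PTR(-ERANGE);
 *		s = kmalloc(len + 1, gfp);
 *		if (!s)
 *			return ERR_PTR(-ENOMEM);
 *		memcpy(s, *p, len);
 *		s[len] = '\0';
 *		*p += len;
 *		return s;
 *	}
 */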
b1b5402a 5534static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
196e2d6d 5535 bool read_only, u64 *snap_features)
b1b5402a 5536{
5537 struct {
5538 __le64 snap_id;
5539 u8 read_only;
5540 } features_in;
5541 struct {
5542 __le64 features;
5543 __le64 incompat;
4157976b 5544 } __attribute__ ((packed)) features_buf = { 0 };
d3767f0f 5545 u64 unsup;
5546 int ret;
5547
5548 features_in.snap_id = cpu_to_le64(snap_id);
5549 features_in.read_only = read_only;
5550
5551 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5552 &rbd_dev->header_oloc, "get_features",
196e2d6d 5553 &features_in, sizeof(features_in),
ecd4a68a 5554 &features_buf, sizeof(features_buf));
36be9a76 5555 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5556 if (ret < 0)
5557 return ret;
5558 if (ret < sizeof (features_buf))
5559 return -ERANGE;
d889140c 5560
5561 unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
5562 if (unsup) {
5563 rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
5564 unsup);
b8f5c6ed 5565 return -ENXIO;
d3767f0f 5566 }
d889140c 5567
5568 *snap_features = le64_to_cpu(features_buf.features);
5569
5570 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
5571 (unsigned long long)snap_id,
5572 (unsigned long long)*snap_features,
5573 (unsigned long long)le64_to_cpu(features_buf.incompat));
5574
5575 return 0;
5576}
5577
5578static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
5579{
5580 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
5581 rbd_is_ro(rbd_dev),
5582 &rbd_dev->header.features);
5583}
5584
5585/*
5586 * These are generic image flags, but since they are used only for
 5587 * the object map, store them in rbd_dev->object_map_flags.
5588 *
5589 * For the same reason, this function is called only on object map
5590 * (re)load and not on header refresh.
5591 */
5592static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev)
5593{
5594 __le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
5595 __le64 flags;
5596 int ret;
5597
5598 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5599 &rbd_dev->header_oloc, "get_flags",
5600 &snapid, sizeof(snapid),
5601 &flags, sizeof(flags));
5602 if (ret < 0)
5603 return ret;
5604 if (ret < sizeof(flags))
5605 return -EBADMSG;
5606
5607 rbd_dev->object_map_flags = le64_to_cpu(flags);
5608 return 0;
5609}
5610
5611struct parent_image_info {
5612 u64 pool_id;
e92c0eaf 5613 const char *pool_ns;
5614 const char *image_id;
5615 u64 snap_id;
5616
e92c0eaf 5617 bool has_overlap;
5618 u64 overlap;
5619};
5620
5621/*
5622 * The caller is responsible for @pii.
5623 */
5624static int decode_parent_image_spec(void **p, void *end,
5625 struct parent_image_info *pii)
5626{
5627 u8 struct_v;
5628 u32 struct_len;
5629 int ret;
5630
5631 ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
5632 &struct_v, &struct_len);
5633 if (ret)
5634 return ret;
5635
5636 ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
5637 pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5638 if (IS_ERR(pii->pool_ns)) {
5639 ret = PTR_ERR(pii->pool_ns);
5640 pii->pool_ns = NULL;
5641 return ret;
5642 }
5643 pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5644 if (IS_ERR(pii->image_id)) {
5645 ret = PTR_ERR(pii->image_id);
5646 pii->image_id = NULL;
5647 return ret;
5648 }
5649 ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
5650 return 0;
5651
5652e_inval:
5653 return -EINVAL;
5654}
5655
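/*
 * Editor's sketch of a caller (mirroring __get_parent_info() below):
 * point p at the encoded reply, bound it with end, and the helper
 * strips the ParentImageSpec version header before decoding the
 * fields. The caller owns the returned strings.
 *
 *	struct parent_image_info pii = { 0 };
 *	void *p = page_address(reply_page);
 *	void *end = p + reply_len;
 *	int ret = decode_parent_image_spec(&p, end, &pii);
 *
 *	if (!ret)
 *		dout("parent pool %llu image %s\n",
 *		     pii.pool_id, pii.image_id);
 *	kfree(pii.pool_ns);
 *	kfree(pii.image_id);
 */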
5656static int __get_parent_info(struct rbd_device *rbd_dev,
5657 struct page *req_page,
5658 struct page *reply_page,
5659 struct parent_image_info *pii)
5660{
5661 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5662 size_t reply_len = PAGE_SIZE;
5663 void *p, *end;
5664 int ret;
5665
5666 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5667 "rbd", "parent_get", CEPH_OSD_FLAG_READ,
68ada915 5668 req_page, sizeof(u64), &reply_page, &reply_len);
5669 if (ret)
5670 return ret == -EOPNOTSUPP ? 1 : ret;
5671
5672 p = page_address(reply_page);
5673 end = p + reply_len;
5674 ret = decode_parent_image_spec(&p, end, pii);
5675 if (ret)
5676 return ret;
5677
5678 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5679 "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
68ada915 5680 req_page, sizeof(u64), &reply_page, &reply_len);
5681 if (ret)
5682 return ret;
5683
5684 p = page_address(reply_page);
5685 end = p + reply_len;
5686 ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
5687 if (pii->has_overlap)
5688 ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5689
5690 return 0;
5691
5692e_inval:
5693 return -EINVAL;
5694}
5695
5696/*
5697 * The caller is responsible for @pii.
5698 */
5699static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
5700 struct page *req_page,
5701 struct page *reply_page,
5702 struct parent_image_info *pii)
5703{
5704 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5705 size_t reply_len = PAGE_SIZE;
5706 void *p, *end;
5707 int ret;
5708
5709 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5710 "rbd", "get_parent", CEPH_OSD_FLAG_READ,
68ada915 5711 req_page, sizeof(u64), &reply_page, &reply_len);
5712 if (ret)
5713 return ret;
5714
5715 p = page_address(reply_page);
5716 end = p + reply_len;
5717 ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
5718 pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
5719 if (IS_ERR(pii->image_id)) {
5720 ret = PTR_ERR(pii->image_id);
5721 pii->image_id = NULL;
5722 return ret;
5723 }
5724 ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
e92c0eaf 5725 pii->has_overlap = true;
5726 ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5727
5728 return 0;
5729
5730e_inval:
5731 return -EINVAL;
5732}
5733
5734static int get_parent_info(struct rbd_device *rbd_dev,
5735 struct parent_image_info *pii)
5736{
5737 struct page *req_page, *reply_page;
5738 void *p;
5739 int ret;
5740
5741 req_page = alloc_page(GFP_KERNEL);
5742 if (!req_page)
5743 return -ENOMEM;
5744
5745 reply_page = alloc_page(GFP_KERNEL);
5746 if (!reply_page) {
5747 __free_page(req_page);
5748 return -ENOMEM;
5749 }
5750
5751 p = page_address(req_page);
5752 ceph_encode_64(&p, rbd_dev->spec->snap_id);
5753 ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
5754 if (ret > 0)
5755 ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
5756 pii);
5757
5758 __free_page(req_page);
5759 __free_page(reply_page);
5760 return ret;
5761}
5762
5763static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
5764{
5765 struct rbd_spec *parent_spec;
eb3b2d6b 5766 struct parent_image_info pii = { 0 };
5767 int ret;
5768
5769 parent_spec = rbd_spec_alloc();
5770 if (!parent_spec)
5771 return -ENOMEM;
5772
5773 ret = get_parent_info(rbd_dev, &pii);
5774 if (ret)
86b00e0d 5775 goto out_err;
86b00e0d 5776
5777 dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
5778 __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
5779 pii.has_overlap, pii.overlap);
86b00e0d 5780
e92c0eaf 5781 if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
5782 /*
 5783 * Either the parent never existed, or we have a
 5784 * record of it but the image got flattened so it no
5785 * longer has a parent. When the parent of a
5786 * layered image disappears we immediately set the
5787 * overlap to 0. The effect of this is that all new
5788 * requests will be treated as if the image had no
5789 * parent.
5790 *
5791 * If !pii.has_overlap, the parent image spec is not
5792 * applicable. It's there to avoid duplication in each
5793 * snapshot record.
5794 */
5795 if (rbd_dev->parent_overlap) {
5796 rbd_dev->parent_overlap = 0;
5797 rbd_dev_parent_put(rbd_dev);
5798 pr_info("%s: clone image has been flattened\n",
5799 rbd_dev->disk->disk_name);
5800 }
5801
86b00e0d 5802 goto out; /* No parent? No problem. */
392a9dad 5803 }
86b00e0d 5804
5805 /* The ceph file layout needs to fit pool id in 32 bits */
5806
5807 ret = -EIO;
eb3b2d6b 5808 if (pii.pool_id > (u64)U32_MAX) {
9584d508 5809 rbd_warn(NULL, "parent pool id too large (%llu > %u)",
eb3b2d6b 5810 (unsigned long long)pii.pool_id, U32_MAX);
5811 goto out_err;
5812 }
86b00e0d 5813
5814 /*
 5815 * The parent won't change (except when the clone is
 5816 * flattened, which was already handled above). So we only
 5817 * need to record the parent spec if we have not already done so.
5818 */
5819 if (!rbd_dev->parent_spec) {
eb3b2d6b 5820 parent_spec->pool_id = pii.pool_id;
5821 if (pii.pool_ns && *pii.pool_ns) {
5822 parent_spec->pool_ns = pii.pool_ns;
5823 pii.pool_ns = NULL;
5824 }
5825 parent_spec->image_id = pii.image_id;
5826 pii.image_id = NULL;
5827 parent_spec->snap_id = pii.snap_id;
b26c047b 5828
5829 rbd_dev->parent_spec = parent_spec;
5830 parent_spec = NULL; /* rbd_dev now owns this */
5831 }
5832
5833 /*
5834 * We always update the parent overlap. If it's zero we issue
5835 * a warning, as we will proceed as if there was no parent.
3b5cf2a2 5836 */
eb3b2d6b 5837 if (!pii.overlap) {
3b5cf2a2 5838 if (parent_spec) {
5839 /* refresh, careful to warn just once */
5840 if (rbd_dev->parent_overlap)
5841 rbd_warn(rbd_dev,
5842 "clone now standalone (overlap became 0)");
3b5cf2a2 5843 } else {
5844 /* initial probe */
5845 rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
3b5cf2a2 5846 }
70cf49cf 5847 }
eb3b2d6b 5848 rbd_dev->parent_overlap = pii.overlap;
cf32bd9c 5849
5850out:
5851 ret = 0;
5852out_err:
e92c0eaf 5853 kfree(pii.pool_ns);
eb3b2d6b 5854 kfree(pii.image_id);
86b00e0d 5855 rbd_spec_put(parent_spec);
5856 return ret;
5857}
5858
5859static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
5860{
5861 struct {
5862 __le64 stripe_unit;
5863 __le64 stripe_count;
5864 } __attribute__ ((packed)) striping_info_buf = { 0 };
5865 size_t size = sizeof (striping_info_buf);
5866 void *p;
5867 int ret;
5868
5869 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5870 &rbd_dev->header_oloc, "get_stripe_unit_count",
5871 NULL, 0, &striping_info_buf, size);
5872 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5873 if (ret < 0)
5874 return ret;
5875 if (ret < size)
5876 return -ERANGE;
5877
cc070d59 5878 p = &striping_info_buf;
5879 rbd_dev->header.stripe_unit = ceph_decode_64(&p);
5880 rbd_dev->header.stripe_count = ceph_decode_64(&p);
5881 return 0;
5882}
5883
5884static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
5885{
5886 __le64 data_pool_id;
5887 int ret;
5888
5889 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5890 &rbd_dev->header_oloc, "get_data_pool",
5891 NULL, 0, &data_pool_id, sizeof(data_pool_id));
5892 if (ret < 0)
5893 return ret;
5894 if (ret < sizeof(data_pool_id))
5895 return -EBADMSG;
5896
5897 rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
5898 WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
5899 return 0;
5900}
5901
5902static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
5903{
ecd4a68a 5904 CEPH_DEFINE_OID_ONSTACK(oid);
5905 size_t image_id_size;
5906 char *image_id;
5907 void *p;
5908 void *end;
5909 size_t size;
5910 void *reply_buf = NULL;
5911 size_t len = 0;
5912 char *image_name = NULL;
5913 int ret;
5914
5915 rbd_assert(!rbd_dev->spec->image_name);
5916
5917 len = strlen(rbd_dev->spec->image_id);
5918 image_id_size = sizeof (__le32) + len;
5919 image_id = kmalloc(image_id_size, GFP_KERNEL);
5920 if (!image_id)
5921 return NULL;
5922
5923 p = image_id;
4157976b 5924 end = image_id + image_id_size;
57385b51 5925 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
5926
5927 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
5928 reply_buf = kmalloc(size, GFP_KERNEL);
5929 if (!reply_buf)
5930 goto out;
5931
5932 ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
5933 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5934 "dir_get_name", image_id, image_id_size,
5935 reply_buf, size);
5936 if (ret < 0)
5937 goto out;
5938 p = reply_buf;
5939 end = reply_buf + ret;
5940
5941 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
5942 if (IS_ERR(image_name))
5943 image_name = NULL;
5944 else
5945 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
5946out:
5947 kfree(reply_buf);
5948 kfree(image_id);
5949
5950 return image_name;
5951}
5952
5953static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5954{
5955 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5956 const char *snap_name;
5957 u32 which = 0;
5958
5959 /* Skip over names until we find the one we are looking for */
5960
5961 snap_name = rbd_dev->header.snap_names;
5962 while (which < snapc->num_snaps) {
5963 if (!strcmp(name, snap_name))
5964 return snapc->snaps[which];
5965 snap_name += strlen(snap_name) + 1;
5966 which++;
5967 }
5968 return CEPH_NOSNAP;
5969}
5970
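/*
 * Editor's note: a v1 header stores all snapshot names in a single
 * buffer of consecutive NUL-terminated strings, parallel to
 * snapc->snaps[]. For snapshots "one" and "two" the buffer looks
 * like this, and the loop above steps through it name by name:
 *
 *	const char names[] = "one\0two";	// "one\0two\0" in memory
 *	const char *snap_name = names;
 *
 *	snap_name += strlen(snap_name) + 1;	// now points at "two"
 */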
5971static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5972{
5973 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5974 u32 which;
5975 bool found = false;
5976 u64 snap_id;
5977
5978 for (which = 0; !found && which < snapc->num_snaps; which++) {
5979 const char *snap_name;
5980
5981 snap_id = snapc->snaps[which];
5982 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
5983 if (IS_ERR(snap_name)) {
5984 /* ignore no-longer existing snapshots */
5985 if (PTR_ERR(snap_name) == -ENOENT)
5986 continue;
5987 else
5988 break;
5989 }
5990 found = !strcmp(name, snap_name);
5991 kfree(snap_name);
5992 }
5993 return found ? snap_id : CEPH_NOSNAP;
5994}
5995
5996/*
5997 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
5998 * no snapshot by that name is found, or if an error occurs.
5999 */
6000static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
6001{
6002 if (rbd_dev->image_format == 1)
6003 return rbd_v1_snap_id_by_name(rbd_dev, name);
6004
6005 return rbd_v2_snap_id_by_name(rbd_dev, name);
6006}
6007
9e15b77d 6008/*
6009 * An image being mapped will have everything but the snap id.
6010 */
6011static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
6012{
6013 struct rbd_spec *spec = rbd_dev->spec;
6014
6015 rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
6016 rbd_assert(spec->image_id && spec->image_name);
6017 rbd_assert(spec->snap_name);
6018
6019 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
6020 u64 snap_id;
6021
6022 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
6023 if (snap_id == CEPH_NOSNAP)
6024 return -ENOENT;
6025
6026 spec->snap_id = snap_id;
6027 } else {
6028 spec->snap_id = CEPH_NOSNAP;
6029 }
6030
6031 return 0;
6032}
6033
6034/*
6035 * A parent image will have all ids but none of the names.
e1d4213f 6036 *
6037 * All names in an rbd spec are dynamically allocated. It's OK if we
6038 * can't figure out the name for an image id.
9e15b77d 6039 */
04077599 6040static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
9e15b77d 6041{
6042 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
6043 struct rbd_spec *spec = rbd_dev->spec;
6044 const char *pool_name;
6045 const char *image_name;
6046 const char *snap_name;
6047 int ret;
6048
6049 rbd_assert(spec->pool_id != CEPH_NOPOOL);
6050 rbd_assert(spec->image_id);
6051 rbd_assert(spec->snap_id != CEPH_NOSNAP);
9e15b77d 6052
2e9f7f1c 6053 /* Get the pool name; we have to make our own copy of this */
9e15b77d 6054
2e9f7f1c
AE
6055 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
6056 if (!pool_name) {
6057 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
6058 return -EIO;
6059 }
6060 pool_name = kstrdup(pool_name, GFP_KERNEL);
6061 if (!pool_name)
6062 return -ENOMEM;
6063
6064 /* Fetch the image name; tolerate failure here */
6065
6066 image_name = rbd_dev_image_name(rbd_dev);
6067 if (!image_name)
06ecc6cb 6068 rbd_warn(rbd_dev, "unable to get image name");
9e15b77d 6069
04077599 6070 /* Fetch the snapshot name */
9e15b77d 6071
2e9f7f1c 6072 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
6073 if (IS_ERR(snap_name)) {
6074 ret = PTR_ERR(snap_name);
9e15b77d 6075 goto out_err;
6076 }
6077
6078 spec->pool_name = pool_name;
6079 spec->image_name = image_name;
6080 spec->snap_name = snap_name;
6081
6082 return 0;
04077599 6083
9e15b77d 6084out_err:
6085 kfree(image_name);
6086 kfree(pool_name);
6087 return ret;
6088}
6089
cc4a38bd 6090static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
6091{
6092 size_t size;
6093 int ret;
6094 void *reply_buf;
6095 void *p;
6096 void *end;
6097 u64 seq;
6098 u32 snap_count;
6099 struct ceph_snap_context *snapc;
6100 u32 i;
6101
6102 /*
6103 * We'll need room for the seq value (maximum snapshot id),
6104 * snapshot count, and array of that many snapshot ids.
6105 * For now we have a fixed upper limit on the number we're
 6106 * prepared to receive: 8 + 4 + RBD_MAX_SNAP_COUNT * 8 = 4092 bytes.
6107 */
6108 size = sizeof (__le64) + sizeof (__le32) +
6109 RBD_MAX_SNAP_COUNT * sizeof (__le64);
6110 reply_buf = kzalloc(size, GFP_KERNEL);
6111 if (!reply_buf)
6112 return -ENOMEM;
6113
6114 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6115 &rbd_dev->header_oloc, "get_snapcontext",
6116 NULL, 0, reply_buf, size);
36be9a76 6117 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6118 if (ret < 0)
6119 goto out;
6120
35d489f9 6121 p = reply_buf;
6122 end = reply_buf + ret;
6123 ret = -ERANGE;
35d489f9
AE
6124 ceph_decode_64_safe(&p, end, seq, out);
6125 ceph_decode_32_safe(&p, end, snap_count, out);
6126
6127 /*
6128 * Make sure the reported number of snapshot ids wouldn't go
6129 * beyond the end of our buffer. But before checking that,
6130 * make sure the computed size of the snapshot context we
6131 * allocate is representable in a size_t.
6132 */
6133 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
6134 / sizeof (u64)) {
6135 ret = -EINVAL;
6136 goto out;
6137 }
6138 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
6139 goto out;
468521c1 6140 ret = 0;
35d489f9 6141
812164f8 6142 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
6143 if (!snapc) {
6144 ret = -ENOMEM;
6145 goto out;
6146 }
35d489f9 6147 snapc->seq = seq;
6148 for (i = 0; i < snap_count; i++)
6149 snapc->snaps[i] = ceph_decode_64(&p);
6150
49ece554 6151 ceph_put_snap_context(rbd_dev->header.snapc);
6152 rbd_dev->header.snapc = snapc;
6153
6154 dout(" snap context seq = %llu, snap_count = %u\n",
57385b51 6155 (unsigned long long)seq, (unsigned int)snap_count);
6156out:
6157 kfree(reply_buf);
6158
57385b51 6159 return ret;
6160}
6161
6162static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
6163 u64 snap_id)
6164{
6165 size_t size;
6166 void *reply_buf;
54cac61f 6167 __le64 snapid;
6168 int ret;
6169 void *p;
6170 void *end;
6171 char *snap_name;
6172
6173 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
6174 reply_buf = kmalloc(size, GFP_KERNEL);
6175 if (!reply_buf)
6176 return ERR_PTR(-ENOMEM);
6177
54cac61f 6178 snapid = cpu_to_le64(snap_id);
6179 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6180 &rbd_dev->header_oloc, "get_snapshot_name",
6181 &snapid, sizeof(snapid), reply_buf, size);
36be9a76 6182 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6183 if (ret < 0) {
6184 snap_name = ERR_PTR(ret);
b8b1e2db 6185 goto out;
f40eb349 6186 }
6187
6188 p = reply_buf;
f40eb349 6189 end = reply_buf + ret;
e5c35534 6190 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
f40eb349 6191 if (IS_ERR(snap_name))
b8b1e2db 6192 goto out;
b8b1e2db 6193
f40eb349 6194 dout(" snap_id 0x%016llx snap_name = %s\n",
54cac61f 6195 (unsigned long long)snap_id, snap_name);
6196out:
6197 kfree(reply_buf);
6198
f40eb349 6199 return snap_name;
6200}
6201
2df3fac7 6202static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
117973fb 6203{
2df3fac7 6204 bool first_time = rbd_dev->header.object_prefix == NULL;
117973fb 6205 int ret;
117973fb 6206
6207 ret = rbd_dev_v2_image_size(rbd_dev);
6208 if (ret)
cfbf6377 6209 return ret;
1617e40c 6210
6211 if (first_time) {
6212 ret = rbd_dev_v2_header_onetime(rbd_dev);
6213 if (ret)
cfbf6377 6214 return ret;
6215 }
6216
cc4a38bd 6217 ret = rbd_dev_v2_snap_context(rbd_dev);
6218 if (ret && first_time) {
6219 kfree(rbd_dev->header.object_prefix);
6220 rbd_dev->header.object_prefix = NULL;
6221 }
6222
6223 return ret;
6224}
6225
6226static int rbd_dev_header_info(struct rbd_device *rbd_dev)
6227{
6228 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6229
6230 if (rbd_dev->image_format == 1)
6231 return rbd_dev_v1_header_info(rbd_dev);
6232
6233 return rbd_dev_v2_header_info(rbd_dev);
6234}
6235
6236/*
6237 * Skips over white space at *buf, and updates *buf to point to the
6238 * first found non-space character (if any). Returns the length of
6239 * the token (string of non-white space characters) found. Note
6240 * that *buf must be terminated with '\0'.
6241 */
6242static inline size_t next_token(const char **buf)
6243{
6244 /*
6245 * These are the characters that produce nonzero for
6246 * isspace() in the "C" and "POSIX" locales.
6247 */
6248 const char *spaces = " \f\n\r\t\v";
6249
6250 *buf += strspn(*buf, spaces); /* Find start of token */
6251
6252 return strcspn(*buf, spaces); /* Return token length */
6253}
6254
6255/*
6256 * Finds the next token in *buf, dynamically allocates a buffer big
6257 * enough to hold a copy of it, and copies the token into the new
6258 * buffer. The copy is guaranteed to be terminated with '\0'. Note
6259 * that a duplicate buffer is created even for a zero-length token.
6260 *
6261 * Returns a pointer to the newly-allocated duplicate, or a null
6262 * pointer if memory for the duplicate was not available. If
6263 * the lenp argument is a non-null pointer, the length of the token
6264 * (not including the '\0') is returned in *lenp.
6265 *
6266 * If successful, the *buf pointer will be updated to point beyond
6267 * the end of the found token.
6268 *
6269 * Note: uses GFP_KERNEL for allocation.
6270 */
6271static inline char *dup_token(const char **buf, size_t *lenp)
6272{
6273 char *dup;
6274 size_t len;
6275
6276 len = next_token(buf);
4caf35f9 6277 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
6278 if (!dup)
6279 return NULL;
6280 *(dup + len) = '\0';
6281 *buf += len;
6282
6283 if (lenp)
6284 *lenp = len;
6285
6286 return dup;
6287}
6288
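/*
 * Editor's sketch of the tokenizer in use (hypothetical caller,
 * error handling omitted): given "rbd foo snap1", successive
 * dup_token() calls peel off one whitespace-delimited word at a
 * time and advance the cursor.
 *
 *	const char *buf = "rbd foo snap1";
 *	size_t len;
 *	char *pool = dup_token(&buf, &len);	// "rbd", len == 3
 *	char *image = dup_token(&buf, NULL);	// "foo"
 *	char *snap = dup_token(&buf, NULL);	// "snap1"
 *
 *	kfree(pool);
 *	kfree(image);
 *	kfree(snap);
 */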
6289static int rbd_parse_param(struct fs_parameter *param,
6290 struct rbd_parse_opts_ctx *pctx)
6291{
6292 struct rbd_options *opt = pctx->opts;
6293 struct fs_parse_result result;
3fbb8d55 6294 struct p_log log = {.prefix = "rbd"};
6295 int token, ret;
6296
6297 ret = ceph_parse_param(param, pctx->copts, NULL);
6298 if (ret != -ENOPARAM)
6299 return ret;
6300
d7167b14 6301 token = __fs_parse(&log, rbd_parameters, param, &result);
6302 dout("%s fs_parse '%s' token %d\n", __func__, param->key, token);
6303 if (token < 0) {
6304 if (token == -ENOPARAM)
6305 return inval_plog(&log, "Unknown parameter '%s'",
6306 param->key);
6307 return token;
6308 }
6309
6310 switch (token) {
6311 case Opt_queue_depth:
6312 if (result.uint_32 < 1)
6313 goto out_of_range;
6314 opt->queue_depth = result.uint_32;
6315 break;
6316 case Opt_alloc_size:
6317 if (result.uint_32 < SECTOR_SIZE)
6318 goto out_of_range;
6319 if (!is_power_of_2(result.uint_32))
6320 return inval_plog(&log, "alloc_size must be a power of 2");
6321 opt->alloc_size = result.uint_32;
6322 break;
6323 case Opt_lock_timeout:
6324 /* 0 is "wait forever" (i.e. infinite timeout) */
6325 if (result.uint_32 > INT_MAX / 1000)
6326 goto out_of_range;
6327 opt->lock_timeout = msecs_to_jiffies(result.uint_32 * 1000);
6328 break;
6329 case Opt_pool_ns:
6330 kfree(pctx->spec->pool_ns);
6331 pctx->spec->pool_ns = param->string;
6332 param->string = NULL;
6333 break;
6334 case Opt_read_only:
6335 opt->read_only = true;
6336 break;
6337 case Opt_read_write:
6338 opt->read_only = false;
6339 break;
6340 case Opt_lock_on_read:
6341 opt->lock_on_read = true;
6342 break;
6343 case Opt_exclusive:
6344 opt->exclusive = true;
6345 break;
6346 case Opt_notrim:
6347 opt->trim = false;
6348 break;
6349 default:
6350 BUG();
6351 }
6352
6353 return 0;
6354
6355out_of_range:
2c3f3dc3 6356 return inval_plog(&log, "%s out of range", param->key);
6357}
6358
6359/*
6360 * This duplicates most of generic_parse_monolithic(), untying it from
6361 * fs_context and skipping standard superblock and security options.
6362 */
6363static int rbd_parse_options(char *options, struct rbd_parse_opts_ctx *pctx)
6364{
6365 char *key;
6366 int ret = 0;
6367
6368 dout("%s '%s'\n", __func__, options);
6369 while ((key = strsep(&options, ",")) != NULL) {
6370 if (*key) {
6371 struct fs_parameter param = {
6372 .key = key,
0f89589a 6373 .type = fs_value_is_flag,
6374 };
6375 char *value = strchr(key, '=');
6376 size_t v_len = 0;
6377
6378 if (value) {
6379 if (value == key)
6380 continue;
6381 *value++ = 0;
6382 v_len = strlen(value);
6383 param.string = kmemdup_nul(value, v_len,
6384 GFP_KERNEL);
6385 if (!param.string)
6386 return -ENOMEM;
0f89589a 6387 param.type = fs_value_is_string;
6388 }
6389 param.size = v_len;
6390
6391 ret = rbd_parse_param(&param, pctx);
6392 kfree(param.string);
6393 if (ret)
6394 break;
6395 }
6396 }
6397
6398 return ret;
6399}
6400
a725f65e 6401/*
6402 * Parse the options provided for an "rbd add" (i.e., rbd image
6403 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
6404 * and the data written is passed here via a NUL-terminated buffer.
6405 * Returns 0 if successful or an error code otherwise.
d22f76e7 6406 *
6407 * The information extracted from these options is recorded in
6408 * the other parameters which return dynamically-allocated
6409 * structures:
6410 * ceph_opts
6411 * The address of a pointer that will refer to a ceph options
6412 * structure. Caller must release the returned pointer using
6413 * ceph_destroy_options() when it is no longer needed.
6414 * rbd_opts
6415 * Address of an rbd options pointer. Fully initialized by
6416 * this function; caller must release with kfree().
6417 * spec
6418 * Address of an rbd image specification pointer. Fully
6419 * initialized by this function based on parsed options.
6420 * Caller must release with rbd_spec_put().
6421 *
6422 * The options passed take this form:
6423 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
6424 * where:
6425 * <mon_addrs>
6426 * A comma-separated list of one or more monitor addresses.
6427 * A monitor address is an ip address, optionally followed
6428 * by a port number (separated by a colon).
6429 * I.e.: ip1[:port1][,ip2[:port2]...]
6430 * <options>
6431 * A comma-separated list of ceph and/or rbd options.
6432 * <pool_name>
6433 * The name of the rados pool containing the rbd image.
6434 * <image_name>
6435 * The name of the image in that pool to map.
6436 * <snap_id>
6437 * An optional snapshot id. If provided, the mapping will
6438 * present data from the image at the time that snapshot was
6439 * created. The image head is used if no snapshot id is
6440 * provided. Snapshot mappings are always read-only.
a725f65e 6441 */
859c31df 6442static int rbd_add_parse_args(const char *buf,
dc79b113 6443 struct ceph_options **ceph_opts,
6444 struct rbd_options **opts,
6445 struct rbd_spec **rbd_spec)
e28fff26 6446{
d22f76e7 6447 size_t len;
859c31df 6448 char *options;
0ddebc0c 6449 const char *mon_addrs;
ecb4dc22 6450 char *snap_name;
0ddebc0c 6451 size_t mon_addrs_size;
82995cc6 6452 struct rbd_parse_opts_ctx pctx = { 0 };
dc79b113 6453 int ret;
6454
6455 /* The first four tokens are required */
6456
7ef3214a 6457 len = next_token(&buf);
6458 if (!len) {
6459 rbd_warn(NULL, "no monitor address(es) provided");
6460 return -EINVAL;
6461 }
0ddebc0c 6462 mon_addrs = buf;
82995cc6 6463 mon_addrs_size = len;
7ef3214a 6464 buf += len;
a725f65e 6465
dc79b113 6466 ret = -EINVAL;
f28e565a
AE
6467 options = dup_token(&buf, NULL);
6468 if (!options)
dc79b113 6469 return -ENOMEM;
6470 if (!*options) {
6471 rbd_warn(NULL, "no options provided");
6472 goto out_err;
6473 }
e28fff26 6474
6475 pctx.spec = rbd_spec_alloc();
6476 if (!pctx.spec)
f28e565a 6477 goto out_mem;
859c31df 6478
6479 pctx.spec->pool_name = dup_token(&buf, NULL);
6480 if (!pctx.spec->pool_name)
859c31df 6481 goto out_mem;
c300156b 6482 if (!*pctx.spec->pool_name) {
6483 rbd_warn(NULL, "no pool name provided");
6484 goto out_err;
6485 }
e28fff26 6486
6487 pctx.spec->image_name = dup_token(&buf, NULL);
6488 if (!pctx.spec->image_name)
f28e565a 6489 goto out_mem;
c300156b 6490 if (!*pctx.spec->image_name) {
6491 rbd_warn(NULL, "no image name provided");
6492 goto out_err;
6493 }
d4b125e9 6494
6495 /*
6496 * Snapshot name is optional; default is to use "-"
6497 * (indicating the head/no snapshot).
6498 */
3feeb894 6499 len = next_token(&buf);
820a5f3e 6500 if (!len) {
6501 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
6502 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
f28e565a 6503 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
dc79b113 6504 ret = -ENAMETOOLONG;
f28e565a 6505 goto out_err;
849b4260 6506 }
6507 snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
6508 if (!snap_name)
f28e565a 6509 goto out_mem;
ecb4dc22 6510 *(snap_name + len) = '\0';
c300156b 6511 pctx.spec->snap_name = snap_name;
e5c35534 6512
6513 pctx.copts = ceph_alloc_options();
6514 if (!pctx.copts)
6515 goto out_mem;
6516
0ddebc0c 6517 /* Initialize all rbd options to the defaults */
e28fff26 6518
6519 pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
6520 if (!pctx.opts)
6521 goto out_mem;
6522
6523 pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
6524 pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
0c93e1b7 6525 pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
c300156b
ID
6526 pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
6527 pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
6528 pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
6529 pctx.opts->trim = RBD_TRIM_DEFAULT;
d22f76e7 6530
6531 ret = ceph_parse_mon_ips(mon_addrs, mon_addrs_size, pctx.copts, NULL);
6532 if (ret)
dc79b113 6533 goto out_err;
859c31df 6534
6535 ret = rbd_parse_options(options, &pctx);
6536 if (ret)
6537 goto out_err;
6538
6539 *ceph_opts = pctx.copts;
6540 *opts = pctx.opts;
6541 *rbd_spec = pctx.spec;
82995cc6 6542 kfree(options);
dc79b113 6543 return 0;
82995cc6 6544
f28e565a 6545out_mem:
dc79b113 6546 ret = -ENOMEM;
d22f76e7 6547out_err:
c300156b 6548 kfree(pctx.opts);
82995cc6 6549 ceph_destroy_options(pctx.copts);
c300156b 6550 rbd_spec_put(pctx.spec);
f28e565a 6551 kfree(options);
dc79b113 6552 return ret;
6553}
6554
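/*
 * Editor's example (a sketch, not driver code): a map request in the
 * format documented above is simply written to /sys/bus/rbd/add.
 * The monitor address, credentials, pool and image names below are
 * made up; the trailing "-" maps the image head rather than a
 * snapshot.
 *
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		const char *spec = "1.2.3.4:6789 name=admin rbd foo -";
 *		int fd = open("/sys/bus/rbd/add", O_WRONLY);
 *
 *		if (fd < 0)
 *			return 1;
 *		if (write(fd, spec, strlen(spec)) < 0)
 *			return 1;
 *		close(fd);
 *		return 0;
 *	}
 */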
6555static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
6556{
6557 down_write(&rbd_dev->lock_rwsem);
6558 if (__rbd_is_lock_owner(rbd_dev))
e1fddc8f 6559 __rbd_release_lock(rbd_dev);
6560 up_write(&rbd_dev->lock_rwsem);
6561}
6562
6563/*
6564 * If the wait is interrupted, an error is returned even if the lock
6565 * was successfully acquired. rbd_dev_image_unlock() will release it
6566 * if needed.
6567 */
6568static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
6569{
637cd060 6570 long ret;
2f18d466 6571
e010dd0a 6572 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
6573 if (!rbd_dev->opts->exclusive && !rbd_dev->opts->lock_on_read)
6574 return 0;
6575
6576 rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
6577 return -EINVAL;
6578 }
6579
3fe69921 6580 if (rbd_is_ro(rbd_dev))
6581 return 0;
6582
6583 rbd_assert(!rbd_is_lock_owner(rbd_dev));
6584 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
6585 ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait,
6586 ceph_timeout_jiffies(rbd_dev->opts->lock_timeout));
25e6be21 6587 if (ret > 0) {
637cd060 6588 ret = rbd_dev->acquire_err;
6589 } else {
6590 cancel_delayed_work_sync(&rbd_dev->lock_dwork);
6591 if (!ret)
6592 ret = -ETIMEDOUT;
6593 }
637cd060 6594
2f18d466 6595 if (ret) {
6596 rbd_warn(rbd_dev, "failed to acquire exclusive lock: %ld", ret);
6597 return ret;
6598 }
6599
6600 /*
6601 * The lock may have been released by now, unless automatic lock
6602 * transitions are disabled.
6603 */
6604 rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev));
6605 return 0;
6606}
6607
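/*
 * Editor's note on the wait above:
 * wait_for_completion_killable_timeout() returns the remaining
 * jiffies (> 0) if the completion fired, 0 if the timeout expired
 * first, and a negative errno if the task was killed while waiting.
 * The code collapses that into a single error value, a pattern that
 * generalizes roughly as:
 *
 *	long ret = wait_for_completion_killable_timeout(&done, timeout);
 *
 *	if (ret > 0)
 *		ret = 0;		// completed; check the real status
 *	else if (!ret)
 *		ret = -ETIMEDOUT;	// timer expired first
 *	// else ret is already a negative errno (fatal signal)
 */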
6608/*
6609 * An rbd format 2 image has a unique identifier, distinct from the
6610 * name given to it by the user. Internally, that identifier is
6611 * what's used to specify the names of objects related to the image.
6612 *
6613 * A special "rbd id" object is used to map an rbd image name to its
6614 * id. If that object doesn't exist, then there is no v2 rbd image
6615 * with the supplied name.
6616 *
6617 * This function will record the given rbd_dev's image_id field if
6618 * it can be determined, and in that case will return 0. If any
6619 * errors occur a negative errno will be returned and the rbd_dev's
6620 * image_id field will be unchanged (and should be NULL).
6621 */
6622static int rbd_dev_image_id(struct rbd_device *rbd_dev)
6623{
6624 int ret;
6625 size_t size;
ecd4a68a 6626 CEPH_DEFINE_OID_ONSTACK(oid);
589d30e0 6627 void *response;
c0fba368 6628 char *image_id;
2f82ee54 6629
6630 /*
6631 * When probing a parent image, the image id is already
6632 * known (and the image name likely is not). There's no
6633 * need to fetch the image id again in this case. We
6634 * do still need to set the image format though.
2c0d0a10 6635 */
6636 if (rbd_dev->spec->image_id) {
6637 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
6638
2c0d0a10 6639 return 0;
c0fba368 6640 }
2c0d0a10 6641
6642 /*
6643 * First, see if the format 2 image id file exists, and if
6644 * so, get the image's persistent id from it.
6645 */
6646 ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
6647 rbd_dev->spec->image_name);
6648 if (ret)
6649 return ret;
6650
6651 dout("rbd id object name is %s\n", oid.name);
6652
6653 /* Response will be an encoded string, which includes a length */
6654 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
6655 response = kzalloc(size, GFP_NOIO);
6656 if (!response) {
6657 ret = -ENOMEM;
6658 goto out;
6659 }
6660
6661 /* If it doesn't exist we'll assume it's a format 1 image */
6662
6663 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
6664 "get_id", NULL, 0,
5435d206 6665 response, size);
36be9a76 6666 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6667 if (ret == -ENOENT) {
6668 image_id = kstrdup("", GFP_KERNEL);
6669 ret = image_id ? 0 : -ENOMEM;
6670 if (!ret)
6671 rbd_dev->image_format = 1;
7dd440c9 6672 } else if (ret >= 0) {
6673 void *p = response;
6674
6675 image_id = ceph_extract_encoded_string(&p, p + ret,
979ed480 6676 NULL, GFP_NOIO);
461f758a 6677 ret = PTR_ERR_OR_ZERO(image_id);
6678 if (!ret)
6679 rbd_dev->image_format = 2;
6680 }
6681
6682 if (!ret) {
6683 rbd_dev->spec->image_id = image_id;
6684 dout("image_id is %s\n", image_id);
6685 }
6686out:
6687 kfree(response);
ecd4a68a 6688 ceph_oid_destroy(&oid);
6689 return ret;
6690}
6691
6692/*
6693 * Undo whatever state changes are made by v1 or v2 header info
6694 * call.
6695 */
6696static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
6697{
6698 struct rbd_image_header *header;
6699
e69b8d41 6700 rbd_dev_parent_put(rbd_dev);
22e8bd51 6701 rbd_object_map_free(rbd_dev);
da5ef6be 6702 rbd_dev_mapping_clear(rbd_dev);
6703
6704 /* Free dynamic fields from the header, then zero it out */
6705
6706 header = &rbd_dev->header;
812164f8 6707 ceph_put_snap_context(header->snapc);
6708 kfree(header->snap_sizes);
6709 kfree(header->snap_names);
6710 kfree(header->object_prefix);
6711 memset(header, 0, sizeof (*header));
6712}
6713
2df3fac7 6714static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
6715{
6716 int ret;
a30b71b9 6717
1e130199 6718 ret = rbd_dev_v2_object_prefix(rbd_dev);
57385b51 6719 if (ret)
6720 goto out_err;
6721
6722 /*
 5723 * Get and check the features for the image. Currently the
6724 * features are assumed to never change.
6725 */
b1b5402a 6726 ret = rbd_dev_v2_features(rbd_dev);
57385b51 6727 if (ret)
9d475de5 6728 goto out_err;
35d489f9 6729
6730 /* If the image supports fancy striping, get its parameters */
6731
6732 if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
6733 ret = rbd_dev_v2_striping_info(rbd_dev);
6734 if (ret < 0)
6735 goto out_err;
6736 }
a30b71b9 6737
6738 if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
6739 ret = rbd_dev_v2_data_pool(rbd_dev);
6740 if (ret)
6741 goto out_err;
6742 }
6743
263423f8 6744 rbd_init_layout(rbd_dev);
35152979 6745 return 0;
263423f8 6746
9d475de5 6747out_err:
642a2537 6748 rbd_dev->header.features = 0;
6749 kfree(rbd_dev->header.object_prefix);
6750 rbd_dev->header.object_prefix = NULL;
9d475de5 6751 return ret;
6752}
6753
6754/*
6755 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
6756 * rbd_dev_image_probe() recursion depth, which means it's also the
6757 * length of the already discovered part of the parent chain.
6758 */
6759static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
83a06263 6760{
2f82ee54 6761 struct rbd_device *parent = NULL;
6762 int ret;
6763
6764 if (!rbd_dev->parent_spec)
6765 return 0;
124afba2 6766
6767 if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
6768 pr_info("parent chain is too long (%d)\n", depth);
6769 ret = -EINVAL;
6770 goto out_err;
6771 }
6772
1643dfa4 6773 parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
6774 if (!parent) {
6775 ret = -ENOMEM;
124afba2 6776 goto out_err;
6777 }
6778
6779 /*
6780 * Images related by parent/child relationships always share
6781 * rbd_client and spec/parent_spec, so bump their refcounts.
6782 */
6783 __rbd_get_client(rbd_dev->rbd_client);
6784 rbd_spec_get(rbd_dev->parent_spec);
124afba2 6785
6786 __set_bit(RBD_DEV_FLAG_READONLY, &parent->flags);
6787
6d69bb53 6788 ret = rbd_dev_image_probe(parent, depth);
6789 if (ret < 0)
6790 goto out_err;
1f2c6651 6791
124afba2 6792 rbd_dev->parent = parent;
a2acd00e 6793 atomic_set(&rbd_dev->parent_ref, 1);
124afba2 6794 return 0;
1f2c6651 6795
124afba2 6796out_err:
1f2c6651 6797 rbd_dev_unparent(rbd_dev);
1761b229 6798 rbd_dev_destroy(parent);
6799 return ret;
6800}
6801
6802static void rbd_dev_device_release(struct rbd_device *rbd_dev)
6803{
6804 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
6805 rbd_free_disk(rbd_dev);
6806 if (!single_major)
6807 unregister_blkdev(rbd_dev->major, rbd_dev->name);
6808}
6809
6810/*
6811 * rbd_dev->header_rwsem must be locked for write and will be unlocked
6812 * upon return.
6813 */
200a6a8b 6814static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
124afba2 6815{
83a06263 6816 int ret;
d1cf5788 6817
9b60e70b 6818 /* Record our major and minor device numbers. */
83a06263 6819
6820 if (!single_major) {
6821 ret = register_blkdev(0, rbd_dev->name);
6822 if (ret < 0)
1643dfa4 6823 goto err_out_unlock;
6824
6825 rbd_dev->major = ret;
6826 rbd_dev->minor = 0;
6827 } else {
6828 rbd_dev->major = rbd_major;
6829 rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
6830 }
6831
6832 /* Set up the blkdev mapping. */
6833
6834 ret = rbd_init_disk(rbd_dev);
6835 if (ret)
6836 goto err_out_blkdev;
6837
f35a4dee 6838 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
39258aa2 6839 set_disk_ro(rbd_dev->disk, rbd_is_ro(rbd_dev));
f35a4dee 6840
5769ed0c 6841 ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
f35a4dee 6842 if (ret)
da5ef6be 6843 goto err_out_disk;
83a06263 6844
129b79d4 6845 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
811c6688 6846 up_write(&rbd_dev->header_rwsem);
5769ed0c 6847 return 0;
2f82ee54 6848
6849err_out_disk:
6850 rbd_free_disk(rbd_dev);
6851err_out_blkdev:
6852 if (!single_major)
6853 unregister_blkdev(rbd_dev->major, rbd_dev->name);
6854err_out_unlock:
6855 up_write(&rbd_dev->header_rwsem);
6856 return ret;
6857}
6858
6859static int rbd_dev_header_name(struct rbd_device *rbd_dev)
6860{
6861 struct rbd_spec *spec = rbd_dev->spec;
c41d13a3 6862 int ret;
6863
6864 /* Record the header object name for this rbd image. */
6865
6866 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
332bb12d 6867 if (rbd_dev->image_format == 1)
6868 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6869 spec->image_name, RBD_SUFFIX);
332bb12d 6870 else
6871 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6872 RBD_HEADER_PREFIX, spec->image_id);
332bb12d 6873
c41d13a3 6874 return ret;
6875}
6876
6877static void rbd_print_dne(struct rbd_device *rbd_dev, bool is_snap)
6878{
6879 if (!is_snap) {
6880 pr_info("image %s/%s%s%s does not exist\n",
6881 rbd_dev->spec->pool_name,
6882 rbd_dev->spec->pool_ns ?: "",
6883 rbd_dev->spec->pool_ns ? "/" : "",
6884 rbd_dev->spec->image_name);
6885 } else {
6886 pr_info("snap %s/%s%s%s@%s does not exist\n",
6887 rbd_dev->spec->pool_name,
6888 rbd_dev->spec->pool_ns ?: "",
6889 rbd_dev->spec->pool_ns ? "/" : "",
6890 rbd_dev->spec->image_name,
6891 rbd_dev->spec->snap_name);
6892 }
6893}
6894
6895static void rbd_dev_image_release(struct rbd_device *rbd_dev)
6896{
b8776051 6897 if (!rbd_is_ro(rbd_dev))
fd22aef8 6898 rbd_unregister_watch(rbd_dev);
6899
6900 rbd_dev_unprobe(rbd_dev);
6901 rbd_dev->image_format = 0;
6902 kfree(rbd_dev->spec->image_id);
6903 rbd_dev->spec->image_id = NULL;
6904}
6905
6906/*
6907 * Probe for the existence of the header object for the given rbd
6908 * device. If this image is the one being mapped (i.e., not a
6909 * parent), initiate a watch on its header object before using that
6910 * object to get detailed information about the rbd image.
6911 *
6912 * On success, returns with header_rwsem held for write if called
6913 * with @depth == 0.
a30b71b9 6914 */
6d69bb53 6915static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
a30b71b9 6916{
b9ef2b88 6917 bool need_watch = !rbd_is_ro(rbd_dev);
6918 int ret;
6919
6920 /*
6921 * Get the id from the image id object. Unless there's an
6922 * error, rbd_dev->spec->image_id will be filled in with
6923 * a dynamically-allocated string, and rbd_dev->image_format
6924 * will be set to either 1 or 2.
6925 */
6926 ret = rbd_dev_image_id(rbd_dev);
6927 if (ret)
c0fba368 6928 return ret;
c0fba368 6929
6930 ret = rbd_dev_header_name(rbd_dev);
6931 if (ret)
6932 goto err_out_format;
6933
b9ef2b88 6934 if (need_watch) {
99d16943 6935 ret = rbd_register_watch(rbd_dev);
6936 if (ret) {
6937 if (ret == -ENOENT)
b9ef2b88 6938 rbd_print_dne(rbd_dev, false);
c41d13a3 6939 goto err_out_format;
1fe48023 6940 }
1f3ef788 6941 }
b644de2b 6942
6943 if (!depth)
6944 down_write(&rbd_dev->header_rwsem);
6945
a720ae09 6946 ret = rbd_dev_header_info(rbd_dev);
6947 if (ret) {
6948 if (ret == -ENOENT && !need_watch)
6949 rbd_print_dne(rbd_dev, false);
952c48b0 6950 goto err_out_probe;
b9ef2b88 6951 }
83a06263 6952
6953 /*
6954 * If this image is the one being mapped, we have pool name and
6955 * id, image name and id, and snap name - need to fill snap id.
6956 * Otherwise this is a parent image, identified by pool, image
6957 * and snap ids - need to fill in names for those ids.
6958 */
6d69bb53 6959 if (!depth)
04077599
ID
6960 ret = rbd_spec_fill_snap_id(rbd_dev);
6961 else
6962 ret = rbd_spec_fill_names(rbd_dev);
6963 if (ret) {
6964 if (ret == -ENOENT)
b9ef2b88 6965 rbd_print_dne(rbd_dev, true);
33dca39f 6966 goto err_out_probe;
1fe48023 6967 }
9bb81c9b 6968
6969 ret = rbd_dev_mapping_set(rbd_dev);
6970 if (ret)
6971 goto err_out_probe;
6972
f3c0e459 6973 if (rbd_is_snap(rbd_dev) &&
6974 (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) {
6975 ret = rbd_object_map_load(rbd_dev);
6976 if (ret)
6977 goto err_out_probe;
6978 }
6979
6980 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
6981 ret = rbd_dev_v2_parent_info(rbd_dev);
6982 if (ret)
6983 goto err_out_probe;
6984 }
6985
6d69bb53 6986 ret = rbd_dev_probe_parent(rbd_dev, depth);
6987 if (ret)
6988 goto err_out_probe;
6989
6990 dout("discovered format %u image, header name is %s\n",
c41d13a3 6991 rbd_dev->image_format, rbd_dev->header_oid.name);
30d60ba2 6992 return 0;
e8f59b59 6993
6fd48b3b 6994err_out_probe:
0e4e1de5
ID
6995 if (!depth)
6996 up_write(&rbd_dev->header_rwsem);
b9ef2b88 6997 if (need_watch)
99d16943 6998 rbd_unregister_watch(rbd_dev);
952c48b0 6999 rbd_dev_unprobe(rbd_dev);
332bb12d
AE
7000err_out_format:
7001 rbd_dev->image_format = 0;
5655c4d9
AE
7002 kfree(rbd_dev->spec->image_id);
7003 rbd_dev->spec->image_id = NULL;
a30b71b9
AE
7004 return ret;
7005}
7006
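/*
 * Handle a write to /sys/bus/rbd/add (or add_single_major).  The
 * buffer carries monitor address(es), options, pool, image and an
 * optional snapshot name ("-" maps the head).  An illustrative
 * invocation (address and key are placeholders; see
 * Documentation/ABI/testing/sysfs-bus-rbd for the authoritative
 * format):
 *
 *   $ echo "192.168.0.1:6789 name=admin,secret=<key> rbd foo -" \
 *       > /sys/bus/rbd/add
 *
 * On success the count of bytes consumed is returned, per the usual
 * sysfs store convention.
 */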
static ssize_t do_rbd_add(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	int rc;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto out;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}

	/* pick the pool */
	rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
	if (rc < 0) {
		if (rc == -ENOENT)
			pr_info("pool %s does not exist\n", spec->pool_name);
		goto err_out_client;
	}
	spec->pool_id = (u64)rc;

	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
	if (!rbd_dev) {
		rc = -ENOMEM;
		goto err_out_client;
	}
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */
	rbd_opts = NULL;	/* rbd_dev now owns this */

	/* if we are mapping a snapshot it will be a read-only mapping */
	if (rbd_dev->opts->read_only ||
	    strcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME))
		__set_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);

	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
	if (!rbd_dev->config_info) {
		rc = -ENOMEM;
		goto err_out_rbd_dev;
	}

	rc = rbd_dev_image_probe(rbd_dev, 0);
	if (rc < 0)
		goto err_out_rbd_dev;

	if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
		rbd_warn(rbd_dev, "alloc_size adjusted to %u",
			 rbd_dev->layout.object_size);
		rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
	}

	rc = rbd_dev_device_setup(rbd_dev);
	if (rc)
		goto err_out_image_probe;

	rc = rbd_add_acquire_lock(rbd_dev);
	if (rc)
		goto err_out_image_lock;

	/* Everything's ready.  Announce the disk to the world. */

	rc = device_add(&rbd_dev->dev);
	if (rc)
		goto err_out_image_lock;

	device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL);
	/* see rbd_init_disk() */
	blk_put_queue(rbd_dev->disk->queue);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);

	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
		rbd_dev->header.features);
	rc = count;
out:
	module_put(THIS_MODULE);
	return rc;

err_out_image_lock:
	rbd_dev_image_unlock(rbd_dev);
	rbd_dev_device_release(rbd_dev);
err_out_image_probe:
	rbd_dev_image_release(rbd_dev);
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	rbd_spec_put(spec);
	kfree(rbd_opts);
	goto out;
}

static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
{
	if (single_major)
		return -EINVAL;

	return do_rbd_add(bus, buf, count);
}

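/*
 * When the single_major module parameter is set, all rbd devices share
 * one block major and the plain /sys/bus/rbd/add file is disabled (it
 * returns -EINVAL above); mappings must go through
 * /sys/bus/rbd/add_single_major instead.
 */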
static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
				      size_t count)
{
	return do_rbd_add(bus, buf, count);
}

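/*
 * Tear down a layered image's ancestor chain from the deepest parent
 * up: each pass walks to the parent that has no grandparent of its
 * own, releases it, and detaches it from its child, until no parents
 * remain.
 */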
static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
{
	while (rbd_dev->parent) {
		struct rbd_device *first = rbd_dev;
		struct rbd_device *second = first->parent;
		struct rbd_device *third;

		/*
		 * Follow to the parent with no grandparent and
		 * remove it.
		 */
		while (second && (third = second->parent)) {
			first = second;
			second = third;
		}
		rbd_assert(second);
		rbd_dev_image_release(second);
		rbd_dev_destroy(second);
		first->parent = NULL;
		first->parent_overlap = 0;

		rbd_assert(first->parent_spec);
		rbd_spec_put(first->parent_spec);
		first->parent_spec = NULL;
	}
}

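/*
 * Handle a write to /sys/bus/rbd/remove (or remove_single_major).  The
 * buffer carries the device id and an optional "force" flag, e.g.
 * (illustrative):
 *
 *   $ echo 2 > /sys/bus/rbd/remove          # -EBUSY if still open
 *   $ echo "2 force" > /sys/bus/rbd/remove  # fail outstanding I/O
 */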
static ssize_t do_rbd_remove(struct bus_type *bus,
			     const char *buf,
			     size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct list_head *tmp;
	int dev_id;
	char opt_buf[6];
	bool force = false;
	int ret;

	dev_id = -1;
	opt_buf[0] = '\0';
	sscanf(buf, "%d %5s", &dev_id, opt_buf);
	if (dev_id < 0) {
		pr_err("dev_id out of range\n");
		return -EINVAL;
	}
	if (opt_buf[0] != '\0') {
		if (!strcmp(opt_buf, "force")) {
			force = true;
		} else {
			pr_err("bad remove option at '%s'\n", opt_buf);
			return -EINVAL;
		}
	}

	ret = -ENOENT;
	spin_lock(&rbd_dev_list_lock);
	list_for_each(tmp, &rbd_dev_list) {
		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id == dev_id) {
			ret = 0;
			break;
		}
	}
	if (!ret) {
		spin_lock_irq(&rbd_dev->lock);
		if (rbd_dev->open_count && !force)
			ret = -EBUSY;
		else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
					  &rbd_dev->flags))
			ret = -EINPROGRESS;
		spin_unlock_irq(&rbd_dev->lock);
	}
	spin_unlock(&rbd_dev_list_lock);
	if (ret)
		return ret;

	if (force) {
		/*
		 * Prevent new IO from being queued and wait for existing
		 * IO to complete/fail.
		 */
		blk_mq_freeze_queue(rbd_dev->disk->queue);
		blk_set_queue_dying(rbd_dev->disk->queue);
	}

	del_gendisk(rbd_dev->disk);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);
	spin_unlock(&rbd_dev_list_lock);
	device_del(&rbd_dev->dev);

	rbd_dev_image_unlock(rbd_dev);
	rbd_dev_device_release(rbd_dev);
	rbd_dev_image_release(rbd_dev);
	rbd_dev_destroy(rbd_dev);
	return count;
}

static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
{
	if (single_major)
		return -EINVAL;

	return do_rbd_remove(bus, buf, count);
}

static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
					 size_t count)
{
	return do_rbd_remove(bus, buf, count);
}

/*
 * create control files in sysfs
 * /sys/bus/rbd/...
 */
static int __init rbd_sysfs_init(void)
{
	int ret;

	ret = device_register(&rbd_root_dev);
	if (ret < 0)
		return ret;

	ret = bus_register(&rbd_bus_type);
	if (ret < 0)
		device_unregister(&rbd_root_dev);

	return ret;
}

static void __exit rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}

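/*
 * Slab caches for image and object requests.  KMEM_CACHE() derives the
 * cache name and object size from the struct itself.
 */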
static int __init rbd_slab_init(void)
{
	rbd_assert(!rbd_img_request_cache);
	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
	if (!rbd_img_request_cache)
		return -ENOMEM;

	rbd_assert(!rbd_obj_request_cache);
	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
	if (!rbd_obj_request_cache)
		goto out_err;

	return 0;

out_err:
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
	return -ENOMEM;
}

static void rbd_slab_exit(void)
{
	rbd_assert(rbd_obj_request_cache);
	kmem_cache_destroy(rbd_obj_request_cache);
	rbd_obj_request_cache = NULL;

	rbd_assert(rbd_img_request_cache);
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
}

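/*
 * Module init: check libceph compatibility, set up the slab caches and
 * the rbd workqueue, optionally grab the shared block major (in
 * single_major mode), then register the sysfs interface.  rbd_exit()
 * below unwinds in the reverse order.
 */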
static int __init rbd_init(void)
{
	int rc;

	if (!libceph_compatible(NULL)) {
		rbd_warn(NULL, "libceph incompatibility (quitting)");
		return -EINVAL;
	}

	rc = rbd_slab_init();
	if (rc)
		return rc;

	/*
	 * The number of active work items is limited by the number of
	 * rbd devices * queue depth, so leave @max_active at default.
	 */
	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
	if (!rbd_wq) {
		rc = -ENOMEM;
		goto err_out_slab;
	}

	if (single_major) {
		rbd_major = register_blkdev(0, RBD_DRV_NAME);
		if (rbd_major < 0) {
			rc = rbd_major;
			goto err_out_wq;
		}
	}

	rc = rbd_sysfs_init();
	if (rc)
		goto err_out_blkdev;

	if (single_major)
		pr_info("loaded (major %d)\n", rbd_major);
	else
		pr_info("loaded\n");

	return 0;

err_out_blkdev:
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
err_out_wq:
	destroy_workqueue(rbd_wq);
err_out_slab:
	rbd_slab_exit();
	return rc;
}

static void __exit rbd_exit(void)
{
	ida_destroy(&rbd_dev_id_ida);
	rbd_sysfs_cleanup();
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
	destroy_workqueue(rbd_wq);
	rbd_slab_exit();
}

module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
MODULE_LICENSE("GPL");