block: remove GENHD_FL_EXT_DEVT
[linux-block.git] / drivers/block/rbd.c
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/striper.h>
#include <linux/ceph/decode.h>
#include <linux/fs_parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */
/*
 * Increment the given counter and return its updated value.
 * If the counter is already 0 it will not be incremented.
 * If the counter is already at its maximum value, -EINVAL is
 * returned without updating it.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
	unsigned int counter;

	counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
	if (counter <= (unsigned int)INT_MAX)
		return (int)counter;

	atomic_dec(v);

	return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
	int counter;

	counter = atomic_dec_return(v);
	if (counter >= 0)
		return counter;

	atomic_inc(v);

	return -EINVAL;
}
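
/*
 * Illustrative sketch (not part of the driver): the two helpers above
 * form a saturating get/put pair, used below for rbd_dev->parent_ref.
 * A counter value of 0 means "torn down"; an increment from 0 returns 0
 * instead of 1, so a stale get cannot resurrect a dying object:
 *
 *	atomic_t ref = ATOMIC_INIT(1);
 *
 *	if (atomic_inc_return_safe(&ref) > 0) {
 *		// use the referenced object
 *		if (!atomic_dec_return_safe(&ref))
 *			; // last reference just dropped, clean up
 *	}
 */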

#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_OBJECT_MAP		(1ULL<<3)
#define RBD_FEATURE_FAST_DIFF		(1ULL<<4)
#define RBD_FEATURE_DEEP_FLATTEN	(1ULL<<5)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)

#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_OBJECT_MAP |	\
				 RBD_FEATURE_FAST_DIFF |	\
				 RBD_FEATURE_DEEP_FLATTEN |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These six fields never change for a given rbd image */
	char *object_prefix;
	__u8 obj_order;
	u64 stripe_unit;
	u64 stripe_count;
	s64 data_pool_id;
	u64 features;		/* Might be changeable someday? */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* format 1 only */
	u64 *snap_sizes;	/* format 1 only */
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	const char	*pool_name;
	const char	*pool_ns;	/* NULL if default, never "" */

	const char	*image_id;
	const char	*image_name;

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;
};
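
/*
 * Example (hypothetical values): mapping snapshot "mon" of image "foo"
 * in pool "rbd" might yield a spec with pool_id 2, pool_name "rbd",
 * image_id "10056b8b4567", image_name "foo", snap_id 4 and snap_name
 * "mon".  For the image head, snap_id is CEPH_NOSNAP and no snapshot
 * name is set.
 */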

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct pending_result {
	int			result;		/* first nonzero result */
	int			num_pending;
};

struct rbd_img_request;

enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
};

enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
	OBJ_OP_ZEROOUT,
};

#define RBD_OBJ_FLAG_DELETION			(1U << 0)
#define RBD_OBJ_FLAG_COPYUP_ENABLED		(1U << 1)
#define RBD_OBJ_FLAG_COPYUP_ZEROS		(1U << 2)
#define RBD_OBJ_FLAG_MAY_EXIST			(1U << 3)
#define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT	(1U << 4)

enum rbd_obj_read_state {
	RBD_OBJ_READ_START = 1,
	RBD_OBJ_READ_OBJECT,
	RBD_OBJ_READ_PARENT,
};

/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *            . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . .
 *            .                 |                                    .
 *            .                 v                                    .
 *            .    RBD_OBJ_WRITE_READ_FROM_PARENT. . .               .
 *            .                 |                    .               .
 *            .                 v                    v (deep-copyup  .
 *    (image  .   RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC   .  not needed)  .
 *  flattened) v                |                    .               .
 *            .                 v                    .               .
 *            . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . .      (copyup  .
 *                              |                        not needed) v
 *                              v                                    .
 *                            done . . . . . . . . . . . . . . . . . .
 *                              ^
 *                              |
 *                     RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * assert_exists guard is needed or not (in some cases it's not needed
 * even if there is a parent).
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_START = 1,
	RBD_OBJ_WRITE_PRE_OBJECT_MAP,
	RBD_OBJ_WRITE_OBJECT,
	__RBD_OBJ_WRITE_COPYUP,
	RBD_OBJ_WRITE_COPYUP,
	RBD_OBJ_WRITE_POST_OBJECT_MAP,
};

enum rbd_obj_copyup_state {
	RBD_OBJ_COPYUP_START = 1,
	RBD_OBJ_COPYUP_READ_PARENT,
	__RBD_OBJ_COPYUP_OBJECT_MAPS,
	RBD_OBJ_COPYUP_OBJECT_MAPS,
	__RBD_OBJ_COPYUP_WRITE_OBJECT,
	RBD_OBJ_COPYUP_WRITE_OBJECT,
};
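
/*
 * Illustrative walk-through (an assumed typical path, not an
 * exhaustive description): a guarded write to an object within the
 * parent overlap moves through RBD_OBJ_WRITE_START ->
 * _PRE_OBJECT_MAP (mark the object as possibly existing) ->
 * _OBJECT (the guarded write itself).  If the guard fails with
 * -ENOENT, the copyup machine runs: RBD_OBJ_COPYUP_START ->
 * _READ_PARENT -> _OBJECT_MAPS -> _WRITE_OBJECT, after which the
 * write finishes in RBD_OBJ_WRITE_POST_OBJECT_MAP.
 */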

struct rbd_obj_request {
	struct ceph_object_extent ex;
	unsigned int		flags;	/* RBD_OBJ_FLAG_* */
	union {
		enum rbd_obj_read_state	 read_state;	/* for reads */
		enum rbd_obj_write_state write_state;	/* for writes */
	};

	struct rbd_img_request	*img_request;
	struct ceph_file_extent	*img_extents;
	u32			num_img_extents;

	union {
		struct ceph_bio_iter	bio_pos;
		struct {
			struct ceph_bvec_iter	bvec_pos;
			u32			bvec_count;
			u32			bvec_idx;
		};
	};

	enum rbd_obj_copyup_state copyup_state;
	struct bio_vec		*copyup_bvecs;
	u32			copyup_bvec_count;

	struct list_head	osd_reqs;	/* w/ r_private_item */

	struct mutex		state_mutex;
	struct pending_result	pending;
	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

enum rbd_img_state {
	RBD_IMG_START = 1,
	RBD_IMG_EXCLUSIVE_LOCK,
	__RBD_IMG_OBJECT_REQUESTS,
	RBD_IMG_OBJECT_REQUESTS,
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	enum obj_operation_type	op_type;
	enum obj_request_type	data_type;
	unsigned long		flags;
	enum rbd_img_state	state;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	struct rbd_obj_request	*obj_request;	/* obj req initiator */

	struct list_head	lock_item;
	struct list_head	object_extents;	/* obj_req.ex structs */

	struct mutex		state_mutex;
	struct pending_result	pending;
	struct work_struct	work;
	int			work_result;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)
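
/*
 * Usage sketch (illustrative): iterate an image request's object
 * requests.  The _safe variant must be used when entries are deleted
 * while walking the list, as in rbd_img_request_destroy() below:
 *
 *	struct rbd_obj_request *obj_req, *next_obj_req;
 *
 *	for_each_obj_request_safe(img_req, obj_req, next_obj_req)
 *		rbd_img_obj_request_del(img_req, obj_req);
 */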

enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};

enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};

/* WatchNotify::ClientId */
struct rbd_client_id {
	u64 gid;
	u64 handle;
};

struct rbd_mapping {
	u64			size;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
	char			*config_info;	/* add{,_single_major} string */

	struct ceph_object_id	header_oid;
	struct ceph_object_locator header_oloc;

	struct ceph_file_layout	layout;		/* used for all rbd requests */

	struct mutex		watch_mutex;
	enum rbd_watch_state	watch_state;
	struct ceph_osd_linger_request *watch_handle;
	u64			watch_cookie;
	struct delayed_work	watch_dwork;

	struct rw_semaphore	lock_rwsem;
	enum rbd_lock_state	lock_state;
	char			lock_cookie[32];
	struct rbd_client_id	owner_cid;
	struct work_struct	acquired_lock_work;
	struct work_struct	released_lock_work;
	struct delayed_work	lock_dwork;
	struct work_struct	unlock_work;
	spinlock_t		lock_lists_lock;
	struct list_head	acquiring_list;
	struct list_head	running_list;
	struct completion	acquire_wait;
	int			acquire_err;
	struct completion	releasing_wait;

	spinlock_t		object_map_lock;
	u8			*object_map;
	u64			object_map_size;	/* in objects */
	u64			object_map_flags;

	struct workqueue_struct	*task_wq;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* Block layer tags. */
	struct blk_mq_tag_set	tag_set;

	/* protects updating the header */
	struct rw_semaphore	header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* rbd_dev_device_setup() ran */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
	RBD_DEV_FLAG_READONLY,	/* -o ro or snapshot */
};

static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);		/* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);	/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

static struct ceph_snap_context rbd_empty_snapc = {
	.nref = REFCOUNT_INIT(1),
};

/*
 * single-major requires >= 0.75 version of userspace rbd utility.
 */
static bool single_major = true;
module_param(single_major, bool, 0444);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");

static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
static ssize_t remove_store(struct bus_type *bus, const char *buf,
			    size_t count);
static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
				      size_t count);
static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
					 size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);

static int rbd_dev_id_to_minor(int dev_id)
{
	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}
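
/*
 * Worked example: with RBD_SINGLE_MAJOR_PART_SHIFT = 4, each device
 * owns 16 consecutive minors.  Device id 3 starts at minor
 * 3 << 4 = 48 (the whole disk), its partitions use minors 49..63,
 * and minor_to_rbd_dev_id() maps any of 48..63 back to 3.
 */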

static bool rbd_is_ro(struct rbd_device *rbd_dev)
{
	return test_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);
}

static bool rbd_is_snap(struct rbd_device *rbd_dev)
{
	return rbd_dev->spec->snap_id != CEPH_NOSNAP;
}

static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	lockdep_assert_held(&rbd_dev->lock_rwsem);

	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	bool is_lock_owner;

	down_read(&rbd_dev->lock_rwsem);
	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	return is_lock_owner;
}

static ssize_t supported_features_show(struct bus_type *bus, char *buf)
{
	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}

static BUS_ATTR_WO(add);
static BUS_ATTR_WO(remove);
static BUS_ATTR_WO(add_single_major);
static BUS_ATTR_WO(remove_single_major);
static BUS_ATTR_RO(supported_features);

static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	&bus_attr_supported_features.attr,
	NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
				  struct attribute *attr, int index)
{
	if (!single_major &&
	    (attr == &bus_attr_add_single_major.attr ||
	     attr == &bus_attr_remove_single_major.attr))
		return 0;

	return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size);
static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev);

static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);

/*
 * Return true if nothing else is pending.
 */
static bool pending_result_dec(struct pending_result *pending, int *result)
{
	rbd_assert(pending->num_pending > 0);

	if (*result && !pending->result)
		pending->result = *result;
	if (--pending->num_pending)
		return false;

	*result = pending->result;
	return true;
}
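
/*
 * Usage sketch (illustrative): a state machine sets
 * pending->num_pending to the number of sub-requests it issues, and
 * every completion calls pending_result_dec().  Only the last caller
 * sees true and proceeds with the first nonzero result:
 *
 *	if (!pending_result_dec(&obj_req->pending, &result))
 *		return;		// more sub-requests outstanding
 *	// all done, act on the combined result
 */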

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	(void) get_device(&rbd_dev->dev);

	return 0;
}

static void rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	put_device(&rbd_dev->dev);
}

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};

/*
 * Initialize an rbd client instance.  Success or not, this function
 * consumes ceph_opts.  Caller holds client_mutex.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	rbdc->client = ceph_create_client(ceph_opts, rbdc);
	if (IS_ERR(rbdc->client))
		goto out_rbdc;
	ceph_opts = NULL;	/* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_client;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;
out_client:
	ceph_destroy_client(rbdc->client);
out_rbdc:
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * (Per device) rbd map options
 */
enum {
	Opt_queue_depth,
	Opt_alloc_size,
	Opt_lock_timeout,
	/* int args above */
	Opt_pool_ns,
	Opt_compression_hint,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_notrim,
};

enum {
	Opt_compression_hint_none,
	Opt_compression_hint_compressible,
	Opt_compression_hint_incompressible,
};

static const struct constant_table rbd_param_compression_hint[] = {
	{"none",		Opt_compression_hint_none},
	{"compressible",	Opt_compression_hint_compressible},
	{"incompressible",	Opt_compression_hint_incompressible},
	{}
};

static const struct fs_parameter_spec rbd_parameters[] = {
	fsparam_u32	("alloc_size",		Opt_alloc_size),
	fsparam_enum	("compression_hint",	Opt_compression_hint,
			 rbd_param_compression_hint),
	fsparam_flag	("exclusive",		Opt_exclusive),
	fsparam_flag	("lock_on_read",	Opt_lock_on_read),
	fsparam_u32	("lock_timeout",	Opt_lock_timeout),
	fsparam_flag	("notrim",		Opt_notrim),
	fsparam_string	("_pool_ns",		Opt_pool_ns),
	fsparam_u32	("queue_depth",		Opt_queue_depth),
	fsparam_flag	("read_only",		Opt_read_only),
	fsparam_flag	("read_write",		Opt_read_write),
	fsparam_flag	("ro",			Opt_read_only),
	fsparam_flag	("rw",			Opt_read_write),
	{}
};
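
/*
 * Example (illustrative): these options are parsed out of the option
 * string handed to the driver when an image is mapped, e.g. via the
 * userspace tool: "rbd map foo -o queue_depth=128,alloc_size=65536".
 */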

struct rbd_options {
	int	queue_depth;
	int	alloc_size;
	unsigned long	lock_timeout;
	bool	read_only;
	bool	lock_on_read;
	bool	exclusive;
	bool	trim;

	u32 alloc_hint_flags;  /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */
};

#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_DEFAULT_RQ
#define RBD_ALLOC_SIZE_DEFAULT	(64 * 1024)
#define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false
#define RBD_EXCLUSIVE_DEFAULT	false
#define RBD_TRIM_DEFAULT	true

struct rbd_parse_opts_ctx {
	struct rbd_spec		*spec;
	struct ceph_options	*copts;
	struct rbd_options	*opts;
};

static char* obj_op_name(enum obj_operation_type op_type)
{
	switch (op_type) {
	case OBJ_OP_READ:
		return "read";
	case OBJ_OP_WRITE:
		return "write";
	case OBJ_OP_DISCARD:
		return "discard";
	case OBJ_OP_ZEROOUT:
		return "zeroout";
	default:
		return "???";
	}
}

/*
 * Destroy ceph client
 *
 * Takes rbd_client_list_lock to drop the client from the client
 * list, so the caller must not hold that lock.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret;

	mutex_lock(&client_mutex);
	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		ceph_destroy_options(ceph_opts);

		/*
		 * Using an existing client.  Make sure ->pg_pools is up to
		 * date before we look up the pool id in do_rbd_add().
		 */
		ret = ceph_wait_for_latest_osdmap(rbdc->client,
					rbdc->client->options->mount_timeout);
		if (ret) {
			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
			rbd_put_client(rbdc);
			rbdc = ERR_PTR(ret);
		}
	} else {
		rbdc = rbd_client_create(ceph_opts);
	}
	mutex_unlock(&client_mutex);

	return rbdc;
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;
	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}

/*
 * returns the size of an object in the image
 */
static u32 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1U << header->obj_order;
}

static void rbd_init_layout(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.stripe_unit == 0 ||
	    rbd_dev->header.stripe_count == 0) {
		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
		rbd_dev->header.stripe_count = 1;
	}

	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}
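
/*
 * Worked example (typical values): an image created with the common
 * default object order of 22 has rbd_obj_bytes() = 1 << 22 = 4 MiB.
 * Without the striping feature the layout degenerates to
 * stripe_unit = 4 MiB and stripe_count = 1, i.e. plain
 * one-object-after-another striping.
 */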

/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				 struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	int ret = -ENOMEM;
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		object_prefix = kstrndup(ondisk->object_prefix,
					 sizeof(ondisk->object_prefix),
					 GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */
		snap_sizes = kmalloc_array(snap_count,
					   sizeof(*header->snap_sizes),
					   GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;
		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, so this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		rbd_init_layout(rbd_dev);
	} else {
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}
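
/*
 * Layout sketch of the format 1 snapshot data consumed above (inferred
 * from this function and _rbd_dev_v1_snap_name() below, not an
 * authoritative format description):
 *
 *	struct rbd_image_snap_ondisk snaps[snap_count];  // id + size each
 *	// ...followed by snap_names_len bytes of NUL-terminated
 *	// snapshot names, one per snapshot, in the same order as snaps[]
 */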

static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	const char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return kstrdup(snap_name, GFP_KERNEL);
}

/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * Note that result is for snapshots in *descending* order.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
	u64 snap_id1 = *(u64 *)s1;
	u64 snap_id2 = *(u64 *)s2;

	if (snap_id1 < snap_id2)
		return 1;
	return snap_id1 == snap_id2 ? 0 : -1;
}
/*
 * Search a snapshot context to see if the given snapshot id is
 * present.
 *
 * Returns the position of the snapshot id in the array if it's found,
 * or BAD_SNAP_INDEX otherwise.
 *
 * Note: The snapshot array is kept sorted (by the osd) in
 * reverse order, highest snapshot id first.
 */
static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u64 *found;

	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
				sizeof (snap_id), snapid_compare_reverse);

	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
}
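
/*
 * Worked example (hypothetical ids): with snapc->snaps = { 12, 7, 3 }
 * (highest id first), looking up snap_id 7 returns index 1 and
 * looking up snap_id 5 returns BAD_SNAP_INDEX.  The reversed
 * comparison function is what lets bsearch() operate on this
 * descending array.
 */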

static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	u32 which;
	const char *snap_name;

	which = rbd_dev_snap_index(rbd_dev, snap_id);
	if (which == BAD_SNAP_INDEX)
		return ERR_PTR(-ENOENT);

	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
}

static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (rbd_dev->image_format == 1)
		return rbd_dev_v1_snap_name(rbd_dev, snap_id);

	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
}

static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_size)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_size = rbd_dev->header.image_size;
	} else if (rbd_dev->image_format == 1) {
		u32 which;

		which = rbd_dev_snap_index(rbd_dev, snap_id);
		if (which == BAD_SNAP_INDEX)
			return -ENOENT;

		*snap_size = rbd_dev->header.snap_sizes[which];
	} else {
		u64 size = 0;
		int ret;

		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
		if (ret)
			return ret;

		*snap_size = size;
	}
	return 0;
}

static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
{
	u64 snap_id = rbd_dev->spec->snap_id;
	u64 size = 0;
	int ret;

	ret = rbd_snap_size(rbd_dev, snap_id, &size);
	if (ret)
		return ret;

	rbd_dev->mapping.size = size;
	return 0;
}

static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
{
	rbd_dev->mapping.size = 0;
}

static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
{
	struct ceph_bio_iter it = *bio_pos;

	ceph_bio_iter_advance(&it, off);
	ceph_bio_iter_advance_step(&it, bytes, ({
		memzero_bvec(&bv);
	}));
}

static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
{
	struct ceph_bvec_iter it = *bvec_pos;

	ceph_bvec_iter_advance(&it, off);
	ceph_bvec_iter_advance_step(&it, bytes, ({
		memzero_bvec(&bv);
	}));
}

/*
 * Zero a range in @obj_req data buffer defined by a bio (list) or
 * (private) bio_vec array.
 *
 * @off is relative to the start of the data buffer.
 */
static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
			       u32 bytes)
{
	dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);

	switch (obj_req->img_request->data_type) {
	case OBJ_REQUEST_BIO:
		zero_bios(&obj_req->bio_pos, off, bytes);
		break;
	case OBJ_REQUEST_BVECS:
	case OBJ_REQUEST_OWN_BVECS:
		zero_bvecs(&obj_req->bvec_pos, off, bytes);
		break;
	default:
		BUG();
	}
}

static void rbd_obj_request_destroy(struct kref *kref);
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		kref_read(&obj_request->kref));
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}

static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->img_request == NULL);

	/* Image request now owns object's original reference */
	obj_request->img_request = img_request;
	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
}

static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
	list_del(&obj_request->ex.oe_item);
	rbd_assert(obj_request->img_request == img_request);
	rbd_obj_request_put(obj_request);
}

static void rbd_osd_submit(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;

	dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
	     __func__, osd_req, obj_req, obj_req->ex.oe_objno,
	     obj_req->ex.oe_off, obj_req->ex.oe_len);
	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
}
/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time
 * and currently never changes thereafter.
 */
static void img_request_layered_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_LAYERED, &img_request->flags);
}

static bool img_request_layered_test(struct rbd_img_request *img_request)
{
	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
}

static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

	return !obj_req->ex.oe_off &&
	       obj_req->ex.oe_len == rbd_dev->layout.object_size;
}

static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

	return obj_req->ex.oe_off + obj_req->ex.oe_len ==
					rbd_dev->layout.object_size;
}

/*
 * Must be called after rbd_obj_calc_img_extents().
 */
static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req)
{
	if (!obj_req->num_img_extents ||
	    (rbd_obj_is_entire(obj_req) &&
	     !obj_req->img_request->snapc->num_snaps))
		return false;

	return true;
}

static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
{
	return ceph_file_extents_bytes(obj_req->img_extents,
				       obj_req->num_img_extents);
}

static bool rbd_img_is_write(struct rbd_img_request *img_req)
{
	switch (img_req->op_type) {
	case OBJ_OP_READ:
		return false;
	case OBJ_OP_WRITE:
	case OBJ_OP_DISCARD:
	case OBJ_OP_ZEROOUT:
		return true;
	default:
		BUG();
	}
}

static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;
	int result;

	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
	     osd_req->r_result, obj_req);

	/*
	 * Writes aren't allowed to return a data payload.  In some
	 * guarded write cases (e.g. stat + zero on an empty object)
	 * a stat response makes it through, but we don't care.
	 */
	if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
		result = 0;
	else
		result = osd_req->r_result;

	rbd_obj_handle_request(obj_req, result);
}

static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;
	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
	struct ceph_options *opt = rbd_dev->rbd_client->client->options;

	osd_req->r_flags = CEPH_OSD_FLAG_READ | opt->read_from_replica;
	osd_req->r_snapid = obj_request->img_request->snap_id;
}

static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;

	osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
	ktime_get_real_ts64(&osd_req->r_mtime);
	osd_req->r_data_offset = obj_request->ex.oe_off;
}

static struct ceph_osd_request *
__rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
			  struct ceph_snap_context *snapc, int num_ops)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;
	const char *name_format = rbd_dev->image_format == 1 ?
				      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
	int ret;

	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
	if (!req)
		return ERR_PTR(-ENOMEM);

	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
	req->r_callback = rbd_osd_req_callback;
	req->r_priv = obj_req;

	/*
	 * Data objects may be stored in a separate pool, but always in
	 * the same namespace in that pool as the header in its pool.
	 */
	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
	req->r_base_oloc.pool = rbd_dev->layout.pool_id;

	ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
			       rbd_dev->header.object_prefix,
			       obj_req->ex.oe_objno);
	if (ret)
		return ERR_PTR(ret);

	return req;
}

static struct ceph_osd_request *
rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
{
	return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
					 num_ops);
}

static struct rbd_obj_request *rbd_obj_request_create(void)
{
	struct rbd_obj_request *obj_request;

	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
	if (!obj_request)
		return NULL;

	ceph_object_extent_init(&obj_request->ex);
	INIT_LIST_HEAD(&obj_request->osd_reqs);
	mutex_init(&obj_request->state_mutex);
	kref_init(&obj_request->kref);

	dout("%s %p\n", __func__, obj_request);
	return obj_request;
}

static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;
	struct ceph_osd_request *osd_req;
	u32 i;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	dout("%s: obj %p\n", __func__, obj_request);

	while (!list_empty(&obj_request->osd_reqs)) {
		osd_req = list_first_entry(&obj_request->osd_reqs,
				    struct ceph_osd_request, r_private_item);
		list_del_init(&osd_req->r_private_item);
		ceph_osdc_put_request(osd_req);
	}

	switch (obj_request->img_request->data_type) {
	case OBJ_REQUEST_NODATA:
	case OBJ_REQUEST_BIO:
	case OBJ_REQUEST_BVECS:
		break;		/* Nothing to do */
	case OBJ_REQUEST_OWN_BVECS:
		kfree(obj_request->bvec_pos.bvecs);
		break;
	default:
		BUG();
	}

	kfree(obj_request->img_extents);
	if (obj_request->copyup_bvecs) {
		for (i = 0; i < obj_request->copyup_bvec_count; i++) {
			if (obj_request->copyup_bvecs[i].bv_page)
				__free_page(obj_request->copyup_bvecs[i].bv_page);
		}
		kfree(obj_request->copyup_bvecs);
	}

	kmem_cache_free(rbd_obj_request_cache, obj_request);
}

/* It's OK to call this for a device with no parent */

static void rbd_spec_put(struct rbd_spec *spec);
static void rbd_dev_unparent(struct rbd_device *rbd_dev)
{
	rbd_dev_remove_parent(rbd_dev);
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;
}

/*
 * Parent image reference counting is used to determine when an
 * image's parent fields can be safely torn down--after there are no
 * more in-flight requests to the parent image.  When the last
 * reference is dropped, cleaning them up is safe.
 */
static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
{
	int counter;

	if (!rbd_dev->parent_spec)
		return;

	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
	if (counter > 0)
		return;

	/* Last reference; clean up parent data structures */

	if (!counter)
		rbd_dev_unparent(rbd_dev);
	else
		rbd_warn(rbd_dev, "parent reference underflow");
}

/*
 * If an image has a non-zero parent overlap, get a reference to its
 * parent.
 *
 * Returns true if the rbd device has a parent with a non-zero
 * overlap and a reference for it was successfully taken, or
 * false otherwise.
 */
static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
{
	int counter = 0;

	if (!rbd_dev->parent_spec)
		return false;

	if (rbd_dev->parent_overlap)
		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);

	if (counter < 0)
		rbd_warn(rbd_dev, "parent reference overflow");

	return counter > 0;
}

static void rbd_img_request_init(struct rbd_img_request *img_request,
				 struct rbd_device *rbd_dev,
				 enum obj_operation_type op_type)
{
	memset(img_request, 0, sizeof(*img_request));

	img_request->rbd_dev = rbd_dev;
	img_request->op_type = op_type;

	INIT_LIST_HEAD(&img_request->lock_item);
	INIT_LIST_HEAD(&img_request->object_extents);
	mutex_init(&img_request->state_mutex);
}

static void rbd_img_capture_header(struct rbd_img_request *img_req)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;

	lockdep_assert_held(&rbd_dev->header_rwsem);

	if (rbd_img_is_write(img_req))
		img_req->snapc = ceph_get_snap_context(rbd_dev->header.snapc);
	else
		img_req->snap_id = rbd_dev->spec->snap_id;

	if (rbd_dev_parent_get(rbd_dev))
		img_request_layered_set(img_req);
}

static void rbd_img_request_destroy(struct rbd_img_request *img_request)
{
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	dout("%s: img %p\n", __func__, img_request);

	WARN_ON(!list_empty(&img_request->lock_item));
	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);

	if (img_request_layered_test(img_request))
		rbd_dev_parent_put(img_request->rbd_dev);

	if (rbd_img_is_write(img_request))
		ceph_put_snap_context(img_request->snapc);

	if (test_bit(IMG_REQ_CHILD, &img_request->flags))
		kmem_cache_free(rbd_img_request_cache, img_request);
}

#define BITS_PER_OBJ	2
#define OBJS_PER_BYTE	(BITS_PER_BYTE / BITS_PER_OBJ)
#define OBJ_MASK	((1 << BITS_PER_OBJ) - 1)

static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno,
				   u64 *index, u8 *shift)
{
	u32 off;

	rbd_assert(objno < rbd_dev->object_map_size);
	*index = div_u64_rem(objno, OBJS_PER_BYTE, &off);
	*shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ;
}
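
/*
 * Worked example: each object gets BITS_PER_OBJ = 2 state bits, so one
 * byte holds OBJS_PER_BYTE = 4 objects, most significant bits first.
 * For objno = 5: index = 5 / 4 = 1 and off = 1, so
 * shift = (4 - 1 - 1) * 2 = 4 and object 5's state lives in bits 5:4
 * of object_map[1].
 */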
e93f3152 1631
22e8bd51
ID
1632static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
1633{
1634 u64 index;
1635 u8 shift;
e93f3152 1636
22e8bd51
ID
1637 lockdep_assert_held(&rbd_dev->object_map_lock);
1638 __rbd_object_map_index(rbd_dev, objno, &index, &shift);
1639 return (rbd_dev->object_map[index] >> shift) & OBJ_MASK;
e93f3152
AE
1640}
1641
22e8bd51 1642static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val)
e93f3152 1643{
22e8bd51
ID
1644 u64 index;
1645 u8 shift;
1646 u8 *p;
e93f3152 1647
22e8bd51
ID
1648 lockdep_assert_held(&rbd_dev->object_map_lock);
1649 rbd_assert(!(val & ~OBJ_MASK));
e93f3152 1650
22e8bd51
ID
1651 __rbd_object_map_index(rbd_dev, objno, &index, &shift);
1652 p = &rbd_dev->object_map[index];
1653 *p = (*p & ~(OBJ_MASK << shift)) | (val << shift);
e93f3152
AE
1654}
1655
22e8bd51 1656static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
1217857f 1657{
22e8bd51
ID
1658 u8 state;
1659
1660 spin_lock(&rbd_dev->object_map_lock);
1661 state = __rbd_object_map_get(rbd_dev, objno);
1662 spin_unlock(&rbd_dev->object_map_lock);
1663 return state;
3da691bf 1664}
1217857f 1665
22e8bd51 1666static bool use_object_map(struct rbd_device *rbd_dev)
3da691bf 1667{
3fe69921
ID
1668 /*
1669 * An image mapped read-only can't use the object map -- it isn't
1670 * loaded because the header lock isn't acquired. Someone else can
1671 * write to the image and update the object map behind our back.
1672 *
1673 * A snapshot can't be written to, so using the object map is always
1674 * safe.
1675 */
1676 if (!rbd_is_snap(rbd_dev) && rbd_is_ro(rbd_dev))
1677 return false;
1678
22e8bd51
ID
1679 return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) &&
1680 !(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID));
3da691bf
ID
1681}
1682
22e8bd51 1683static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno)
3da691bf 1684{
22e8bd51 1685 u8 state;
8b3e1a56 1686
22e8bd51
ID
1687 /* fall back to default logic if object map is disabled or invalid */
1688 if (!use_object_map(rbd_dev))
1689 return true;
3da691bf 1690
22e8bd51
ID
1691 state = rbd_object_map_get(rbd_dev, objno);
1692 return state != OBJECT_NONEXISTENT;
1693}
1694
1695static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id,
1696 struct ceph_object_id *oid)
13488d53 1697{
1698 if (snap_id == CEPH_NOSNAP)
1699 ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX,
1700 rbd_dev->spec->image_id);
1701 else
1702 ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX,
1703 rbd_dev->spec->image_id, snap_id);
1704}
1705
22e8bd51 1706static int rbd_object_map_lock(struct rbd_device *rbd_dev)
2169238d 1707{
1708 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1709 CEPH_DEFINE_OID_ONSTACK(oid);
1710 u8 lock_type;
1711 char *lock_tag;
1712 struct ceph_locker *lockers;
1713 u32 num_lockers;
1714 bool broke_lock = false;
1715 int ret;
2169238d 1716
22e8bd51 1717 rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
2169238d 1718
1719again:
1720 ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
1721 CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0);
1722 if (ret != -EBUSY || broke_lock) {
1723 if (ret == -EEXIST)
1724 ret = 0; /* already locked by myself */
1725 if (ret)
1726 rbd_warn(rbd_dev, "failed to lock object map: %d", ret);
1727 return ret;
1728 }
2169238d 1729
1730 ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc,
1731 RBD_LOCK_NAME, &lock_type, &lock_tag,
1732 &lockers, &num_lockers);
1733 if (ret) {
1734 if (ret == -ENOENT)
1735 goto again;
3da691bf 1736
22e8bd51 1737 rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret);
86bd7998 1738 return ret;
22e8bd51 1739 }
86bd7998 1740
1741 kfree(lock_tag);
1742 if (num_lockers == 0)
1743 goto again;
2169238d 1744
1745 rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu",
1746 ENTITY_NAME(lockers[0].id.name));
2169238d 1747
1748 ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc,
1749 RBD_LOCK_NAME, lockers[0].id.cookie,
1750 &lockers[0].id.name);
1751 ceph_free_lockers(lockers, num_lockers);
1752 if (ret) {
1753 if (ret == -ENOENT)
1754 goto again;
13488d53 1755
1756 rbd_warn(rbd_dev, "failed to break object map lock: %d", ret);
1757 return ret;
1758 }
1759
1760 broke_lock = true;
1761 goto again;
1762}
1763
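/*
 * Editor's note on the retry loop above: -EBUSY means another client
 * holds the exclusive lock, so it is broken at most once (broke_lock)
 * and the acquisition retried; -ENOENT from lock-info or break-lock
 * means the previous holder disappeared on its own, which is likewise
 * retried from "again".
 */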
22e8bd51 1764static void rbd_object_map_unlock(struct rbd_device *rbd_dev)
6484cbe9 1765{
1766 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1767 CEPH_DEFINE_OID_ONSTACK(oid);
1768 int ret;
1769
1770 rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
1771
1772 ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
1773 "");
1774 if (ret && ret != -ENOENT)
1775 rbd_warn(rbd_dev, "failed to unlock object map: %d", ret);
1776}
1777
22e8bd51 1778static int decode_object_map_header(void **p, void *end, u64 *object_map_size)
6484cbe9 1779{
1780 u8 struct_v;
1781 u32 struct_len;
1782 u32 header_len;
1783 void *header_end;
1784 int ret;
1785
1786 ceph_decode_32_safe(p, end, header_len, e_inval);
1787 header_end = *p + header_len;
0c93e1b7 1788
1789 ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v,
1790 &struct_len);
1791 if (ret)
1792 return ret;
1793
22e8bd51 1794 ceph_decode_64_safe(p, end, *object_map_size, e_inval);
6484cbe9 1795
22e8bd51 1796 *p = header_end;
6484cbe9 1797 return 0;
1798
1799e_inval:
1800 return -EINVAL;
1801}
1802
22e8bd51 1803static int __rbd_object_map_load(struct rbd_device *rbd_dev)
13488d53 1804{
1805 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1806 CEPH_DEFINE_OID_ONSTACK(oid);
1807 struct page **pages;
1808 void *p, *end;
1809 size_t reply_len;
1810 u64 num_objects;
1811 u64 object_map_bytes;
1812 u64 object_map_size;
1813 int num_pages;
1814 int ret;
13488d53 1815
22e8bd51 1816 rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size);
13488d53 1817
1818 num_objects = ceph_get_num_objects(&rbd_dev->layout,
1819 rbd_dev->mapping.size);
1820 object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ,
1821 BITS_PER_BYTE);
1822 num_pages = calc_pages_for(0, object_map_bytes) + 1;
1823 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1824 if (IS_ERR(pages))
1825 return PTR_ERR(pages);
13488d53 1826
1827 reply_len = num_pages * PAGE_SIZE;
1828 rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid);
1829 ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc,
1830 "rbd", "object_map_load", CEPH_OSD_FLAG_READ,
1831 NULL, 0, pages, &reply_len);
1832 if (ret)
1833 goto out;
3b434a2a 1834
1835 p = page_address(pages[0]);
1836 end = p + min(reply_len, (size_t)PAGE_SIZE);
1837 ret = decode_object_map_header(&p, end, &object_map_size);
1838 if (ret)
1839 goto out;
1840
1841 if (object_map_size != num_objects) {
1842 rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu",
1843 object_map_size, num_objects);
1844 ret = -EINVAL;
1845 goto out;
1846 }
1847
1848 if (offset_in_page(p) + object_map_bytes > reply_len) {
1849 ret = -EINVAL;
1850 goto out;
1851 }
1852
1853 rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL);
1854 if (!rbd_dev->object_map) {
1855 ret = -ENOMEM;
1856 goto out;
1857 }
1858
1859 rbd_dev->object_map_size = object_map_size;
1860 ceph_copy_from_page_vector(pages, rbd_dev->object_map,
1861 offset_in_page(p), object_map_bytes);
1862
1863out:
1864 ceph_release_page_vector(pages, num_pages);
1865 return ret;
1866}
3da691bf 1867
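/*
 * Sizing example (editor's note, values assumed for illustration):
 * a 1 GiB mapping with 4M objects gives num_objects = 256, so at
 * BITS_PER_OBJ = 2 the map needs DIV_ROUND_UP(256 * 2, 8) = 64 bytes;
 * with 4K pages the reply buffer is then sized at
 * calc_pages_for(0, 64) + 1 = 2 pages to leave room for the header.
 */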
1868static void rbd_object_map_free(struct rbd_device *rbd_dev)
1869{
1870 kvfree(rbd_dev->object_map);
1871 rbd_dev->object_map = NULL;
1872 rbd_dev->object_map_size = 0;
1873}
1874
22e8bd51 1875static int rbd_object_map_load(struct rbd_device *rbd_dev)
bf0d5f50 1876{
3da691bf 1877 int ret;
37206ee5 1878
22e8bd51 1879 ret = __rbd_object_map_load(rbd_dev);
1880 if (ret)
1881 return ret;
f1a4739f 1882
1883 ret = rbd_dev_v2_get_flags(rbd_dev);
1884 if (ret) {
1885 rbd_object_map_free(rbd_dev);
1886 return ret;
1887 }
1888
1889 if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)
1890 rbd_warn(rbd_dev, "object map is invalid");
1891
1892 return 0;
1893}
1894
1895static int rbd_object_map_open(struct rbd_device *rbd_dev)
1896{
1897 int ret;
1898
1899 ret = rbd_object_map_lock(rbd_dev);
1900 if (ret)
1901 return ret;
1902
1903 ret = rbd_object_map_load(rbd_dev);
1904 if (ret) {
1905 rbd_object_map_unlock(rbd_dev);
1906 return ret;
1907 }
1908
1909 return 0;
1910}
1911
1912static void rbd_object_map_close(struct rbd_device *rbd_dev)
1913{
1914 rbd_object_map_free(rbd_dev);
1915 rbd_object_map_unlock(rbd_dev);
1916}
1917
1918/*
1919 * This function needs snap_id (or more precisely just something to
1920 * distinguish between HEAD and snapshot object maps), new_state and
1921 * current_state that were passed to rbd_object_map_update().
1922 *
1923 * To avoid allocating and stashing a context we piggyback on the OSD
1924 * request. A HEAD update has two ops (assert_locked). For new_state
1925 * and current_state we decode our own object_map_update op, encoded in
1926 * rbd_cls_object_map_update().
1927 */
1928static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req,
1929 struct ceph_osd_request *osd_req)
1930{
1931 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1932 struct ceph_osd_data *osd_data;
1933 u64 objno;
3f649ab7 1934 u8 state, new_state, current_state;
1935 bool has_current_state;
1936 void *p;
1937
1938 if (osd_req->r_result)
1939 return osd_req->r_result;
1940
1941 /*
1942 * Nothing to do for a snapshot object map.
1943 */
1944 if (osd_req->r_num_ops == 1)
1945 return 0;
1946
1947 /*
1948 * Update in-memory HEAD object map.
1949 */
1950 rbd_assert(osd_req->r_num_ops == 2);
1951 osd_data = osd_req_op_data(osd_req, 1, cls, request_data);
1952 rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES);
1953
1954 p = page_address(osd_data->pages[0]);
1955 objno = ceph_decode_64(&p);
1956 rbd_assert(objno == obj_req->ex.oe_objno);
1957 rbd_assert(ceph_decode_64(&p) == objno + 1);
1958 new_state = ceph_decode_8(&p);
1959 has_current_state = ceph_decode_8(&p);
1960 if (has_current_state)
1961 current_state = ceph_decode_8(&p);
1962
1963 spin_lock(&rbd_dev->object_map_lock);
1964 state = __rbd_object_map_get(rbd_dev, objno);
1965 if (!has_current_state || current_state == state ||
1966 (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN))
1967 __rbd_object_map_set(rbd_dev, objno, new_state);
1968 spin_unlock(&rbd_dev->object_map_lock);
1969
1970 return 0;
1971}
1972
1973static void rbd_object_map_callback(struct ceph_osd_request *osd_req)
1974{
1975 struct rbd_obj_request *obj_req = osd_req->r_priv;
1976 int result;
1977
1978 dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
1979 osd_req->r_result, obj_req);
1980
1981 result = rbd_object_map_update_finish(obj_req, osd_req);
1982 rbd_obj_handle_request(obj_req, result);
1983}
1984
1985static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state)
1986{
1987 u8 state = rbd_object_map_get(rbd_dev, objno);
bf0d5f50 1988
1989 if (state == new_state ||
1990 (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
1991 (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING))
1992 return false;
1993
1994 return true;
1995}
1996
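/*
 * Editor's summary of update_needed(): an update is skipped when it
 * cannot change anything --
 *
 *   new_state            skipped if the current state is
 *   OBJECT_PENDING       OBJECT_PENDING or OBJECT_NONEXISTENT
 *   OBJECT_NONEXISTENT   anything other than OBJECT_PENDING
 *   any other value      that same value
 */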
1997static int rbd_cls_object_map_update(struct ceph_osd_request *req,
1998 int which, u64 objno, u8 new_state,
1999 const u8 *current_state)
2000{
2001 struct page **pages;
2002 void *p, *start;
2003 int ret;
2004
2005 ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update");
2006 if (ret)
2007 return ret;
2008
2009 pages = ceph_alloc_page_vector(1, GFP_NOIO);
2010 if (IS_ERR(pages))
2011 return PTR_ERR(pages);
2012
2013 p = start = page_address(pages[0]);
2014 ceph_encode_64(&p, objno);
2015 ceph_encode_64(&p, objno + 1);
2016 ceph_encode_8(&p, new_state);
2017 if (current_state) {
2018 ceph_encode_8(&p, 1);
2019 ceph_encode_8(&p, *current_state);
2020 } else {
2021 ceph_encode_8(&p, 0);
2022 }
2023
2024 osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0,
2025 false, true);
2026 return 0;
2027}
2028
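/*
 * Editor's sketch of the buffer encoded by rbd_cls_object_map_update()
 * for objno = 5, new_state = OBJECT_EXISTS, current_state =
 * OBJECT_PENDING (example values):
 *
 *   le64 5    start of the updated range
 *   le64 6    end of the updated range (exclusive)
 *   u8   new_state
 *   u8   1    has_current_state
 *   u8   current_state (omitted when the preceding byte is 0)
 *
 * rbd_object_map_update_finish() decodes exactly this layout when
 * updating the in-memory HEAD map.
 */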
2029/*
2030 * Return:
2031 * 0 - object map update sent
2032 * 1 - object map update isn't needed
2033 * <0 - error
2034 */
2035static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
2036 u8 new_state, const u8 *current_state)
2037{
2038 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2039 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2040 struct ceph_osd_request *req;
2041 int num_ops = 1;
2042 int which = 0;
2043 int ret;
2044
2045 if (snap_id == CEPH_NOSNAP) {
2046 if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state))
2047 return 1;
2048
2049 num_ops++; /* assert_locked */
2050 }
2051
2052 req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO);
2053 if (!req)
2054 return -ENOMEM;
2055
2056 list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
2057 req->r_callback = rbd_object_map_callback;
2058 req->r_priv = obj_req;
2059
2060 rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid);
2061 ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
2062 req->r_flags = CEPH_OSD_FLAG_WRITE;
2063 ktime_get_real_ts64(&req->r_mtime);
2064
2065 if (snap_id == CEPH_NOSNAP) {
2066 /*
2067 * Protect against possible race conditions during lock
2068 * ownership transitions.
2069 */
2070 ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME,
2071 CEPH_CLS_LOCK_EXCLUSIVE, "", "");
2072 if (ret)
2073 return ret;
2074 }
2075
2076 ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno,
2077 new_state, current_state);
2078 if (ret)
2079 return ret;
2080
2081 ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
2082 if (ret)
2083 return ret;
13488d53 2084
2085 ceph_osdc_start_request(osdc, req, false);
2086 return 0;
2087}
2088
2089static void prune_extents(struct ceph_file_extent *img_extents,
2090 u32 *num_img_extents, u64 overlap)
e93f3152 2091{
86bd7998 2092 u32 cnt = *num_img_extents;
e93f3152 2093
2094 /* drop extents completely beyond the overlap */
2095 while (cnt && img_extents[cnt - 1].fe_off >= overlap)
2096 cnt--;
e93f3152 2097
2098 if (cnt) {
2099 struct ceph_file_extent *ex = &img_extents[cnt - 1];
e93f3152 2100
2101 /* trim final overlapping extent */
2102 if (ex->fe_off + ex->fe_len > overlap)
2103 ex->fe_len = overlap - ex->fe_off;
2104 }
e93f3152 2105
86bd7998 2106 *num_img_extents = cnt;
2107}
2108
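/*
 * Worked example (editor's note): with overlap = 8M, an extent
 * { .fe_off = 6M, .fe_len = 4M } runs past the overlap and is trimmed
 * to 2M, while an extent starting at or beyond 8M is dropped entirely.
 */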
2109/*
2110 * Determine the byte range(s) covered by either just the object extent
2111 * or the entire object in the parent image.
2112 */
2113static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
2114 bool entire)
e93f3152 2115{
2116 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2117 int ret;
e93f3152 2118
2119 if (!rbd_dev->parent_overlap)
2120 return 0;
e93f3152 2121
2122 ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
2123 entire ? 0 : obj_req->ex.oe_off,
2124 entire ? rbd_dev->layout.object_size :
2125 obj_req->ex.oe_len,
2126 &obj_req->img_extents,
2127 &obj_req->num_img_extents);
2128 if (ret)
2129 return ret;
e93f3152 2130
2131 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
2132 rbd_dev->parent_overlap);
2133 return 0;
2134}
2135
bcbab1db 2136static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which)
1217857f 2137{
2138 struct rbd_obj_request *obj_req = osd_req->r_priv;
2139
ecc633ca 2140 switch (obj_req->img_request->data_type) {
3da691bf 2141 case OBJ_REQUEST_BIO:
bcbab1db 2142 osd_req_op_extent_osd_data_bio(osd_req, which,
3da691bf 2143 &obj_req->bio_pos,
43df3d35 2144 obj_req->ex.oe_len);
2145 break;
2146 case OBJ_REQUEST_BVECS:
afb97888 2147 case OBJ_REQUEST_OWN_BVECS:
3da691bf 2148 rbd_assert(obj_req->bvec_pos.iter.bi_size ==
43df3d35 2149 obj_req->ex.oe_len);
afb97888 2150 rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
bcbab1db 2151 osd_req_op_extent_osd_data_bvec_pos(osd_req, which,
2152 &obj_req->bvec_pos);
2153 break;
2154 default:
16809372 2155 BUG();
1217857f 2156 }
3da691bf 2157}
1217857f 2158
bcbab1db 2159static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which)
2160{
2161 struct page **pages;
8b3e1a56 2162
2163 /*
2164 * The response data for a STAT call consists of:
2165 * le64 length;
2166 * struct {
2167 * le32 tv_sec;
2168 * le32 tv_nsec;
2169 * } mtime;
2170 */
2171 pages = ceph_alloc_page_vector(1, GFP_NOIO);
2172 if (IS_ERR(pages))
2173 return PTR_ERR(pages);
2174
2175 osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0);
2176 osd_req_op_raw_data_in_pages(osd_req, which, pages,
2177 8 + sizeof(struct ceph_timespec),
2178 0, false, true);
2179 return 0;
2180}
2181
2182static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which,
2183 u32 bytes)
2184{
2185 struct rbd_obj_request *obj_req = osd_req->r_priv;
2186 int ret;
2187
2188 ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup");
2189 if (ret)
2190 return ret;
2191
2192 osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs,
2193 obj_req->copyup_bvec_count, bytes);
2194 return 0;
2195}
2196
2197static int rbd_obj_init_read(struct rbd_obj_request *obj_req)
2198{
2199 obj_req->read_state = RBD_OBJ_READ_START;
2200 return 0;
2201}
2202
2203static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2204 int which)
2169238d 2205{
bcbab1db 2206 struct rbd_obj_request *obj_req = osd_req->r_priv;
2207 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2208 u16 opcode;
2169238d 2209
2210 if (!use_object_map(rbd_dev) ||
2211 !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) {
2212 osd_req_op_alloc_hint_init(osd_req, which++,
2213 rbd_dev->layout.object_size,
d3798acc 2214 rbd_dev->layout.object_size,
dc1dad8e 2215 rbd_dev->opts->alloc_hint_flags);
8b5bec5c 2216 }
2169238d 2217
2218 if (rbd_obj_is_entire(obj_req))
2219 opcode = CEPH_OSD_OP_WRITEFULL;
2220 else
2221 opcode = CEPH_OSD_OP_WRITE;
2169238d 2222
bcbab1db 2223 osd_req_op_extent_init(osd_req, which, opcode,
43df3d35 2224 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
bcbab1db 2225 rbd_osd_setup_data(osd_req, which);
3da691bf 2226}
2169238d 2227
ea9b743c 2228static int rbd_obj_init_write(struct rbd_obj_request *obj_req)
3da691bf 2229{
2230 int ret;
2231
2232 /* reverse map the entire object onto the parent */
2233 ret = rbd_obj_calc_img_extents(obj_req, true);
2234 if (ret)
2235 return ret;
2236
2237 if (rbd_obj_copyup_enabled(obj_req))
2238 obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
2239
85b5e6d1 2240 obj_req->write_state = RBD_OBJ_WRITE_START;
3da691bf 2241 return 0;
2242}
2243
2244static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
2245{
2246 return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
2247 CEPH_OSD_OP_ZERO;
2248}
2249
2250static void __rbd_osd_setup_discard_ops(struct ceph_osd_request *osd_req,
2251 int which)
2252{
2253 struct rbd_obj_request *obj_req = osd_req->r_priv;
2254
2255 if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
2256 rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
2257 osd_req_op_init(osd_req, which, CEPH_OSD_OP_DELETE, 0);
13488d53 2258 } else {
2259 osd_req_op_extent_init(osd_req, which,
2260 truncate_or_zero_opcode(obj_req),
2261 obj_req->ex.oe_off, obj_req->ex.oe_len,
2262 0, 0);
2263 }
2264}
2265
ea9b743c 2266static int rbd_obj_init_discard(struct rbd_obj_request *obj_req)
6484cbe9 2267{
0c93e1b7 2268 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
27bbd911 2269 u64 off, next_off;
2270 int ret;
2271
2272 /*
2273 * Align the range to alloc_size boundary and punt on discards
2274 * that are too small to free up any space.
2275 *
2276 * alloc_size == object_size && is_tail() is a special case for
2277 * filestore with filestore_punch_hole = false, needed to allow
2278 * truncate (in addition to delete).
2279 */
2280 if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
2281 !rbd_obj_is_tail(obj_req)) {
2282 off = round_up(obj_req->ex.oe_off, rbd_dev->opts->alloc_size);
2283 next_off = round_down(obj_req->ex.oe_off + obj_req->ex.oe_len,
2284 rbd_dev->opts->alloc_size);
2285 if (off >= next_off)
2286 return 1;
2287
2288 dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
2289 obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
2290 off, next_off - off);
2291 obj_req->ex.oe_off = off;
2292 obj_req->ex.oe_len = next_off - off;
2293 }
2294
2295 /* reverse map the entire object onto the parent */
2296 ret = rbd_obj_calc_img_extents(obj_req, true);
2297 if (ret)
2298 return ret;
2299
22e8bd51 2300 obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
2301 if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
2302 obj_req->flags |= RBD_OBJ_FLAG_DELETION;
2303
85b5e6d1 2304 obj_req->write_state = RBD_OBJ_WRITE_START;
2305 return 0;
2306}
2307
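/*
 * Alignment example for rbd_obj_init_discard() (editor's note): with
 * alloc_size = 64K, a discard of 100K~200K within an object becomes
 *
 *   off      = round_up(100K, 64K)          = 128K
 *   next_off = round_down(100K + 200K, 64K) = 256K
 *
 * i.e. 128K~128K; a discard smaller than alloc_size ends up with
 * off >= next_off and is dropped (return 1).
 */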
2308static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req,
2309 int which)
3da691bf 2310{
bcbab1db 2311 struct rbd_obj_request *obj_req = osd_req->r_priv;
2312 u16 opcode;
2313
3da691bf 2314 if (rbd_obj_is_entire(obj_req)) {
86bd7998 2315 if (obj_req->num_img_extents) {
0ad5d953 2316 if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
bcbab1db 2317 osd_req_op_init(osd_req, which++,
9b17eb2c 2318 CEPH_OSD_OP_CREATE, 0);
2319 opcode = CEPH_OSD_OP_TRUNCATE;
2320 } else {
0ad5d953 2321 rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
bcbab1db 2322 osd_req_op_init(osd_req, which++,
2323 CEPH_OSD_OP_DELETE, 0);
2324 opcode = 0;
3b434a2a 2325 }
3b434a2a 2326 } else {
6484cbe9 2327 opcode = truncate_or_zero_opcode(obj_req);
2328 }
2329
3da691bf 2330 if (opcode)
bcbab1db 2331 osd_req_op_extent_init(osd_req, which, opcode,
43df3d35 2332 obj_req->ex.oe_off, obj_req->ex.oe_len,
3da691bf 2333 0, 0);
2334}
2335
ea9b743c 2336static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
bf0d5f50 2337{
3da691bf 2338 int ret;
37206ee5 2339
2340 /* reverse map the entire object onto the parent */
2341 ret = rbd_obj_calc_img_extents(obj_req, true);
2342 if (ret)
2343 return ret;
f1a4739f 2344
2345 if (rbd_obj_copyup_enabled(obj_req))
2346 obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
2347 if (!obj_req->num_img_extents) {
22e8bd51 2348 obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
2349 if (rbd_obj_is_entire(obj_req))
2350 obj_req->flags |= RBD_OBJ_FLAG_DELETION;
3da691bf 2351 }
3b434a2a 2352
a086a1b8 2353 obj_req->write_state = RBD_OBJ_WRITE_START;
2354 return 0;
2355}
9d4df01f 2356
2357static int count_write_ops(struct rbd_obj_request *obj_req)
2358{
2359 struct rbd_img_request *img_req = obj_req->img_request;
2360
2361 switch (img_req->op_type) {
a086a1b8 2362 case OBJ_OP_WRITE:
2363 if (!use_object_map(img_req->rbd_dev) ||
2364 !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST))
2365 return 2; /* setallochint + write/writefull */
2366
2367 return 1; /* write/writefull */
2368 case OBJ_OP_DISCARD:
2369 return 1; /* delete/truncate/zero */
2370 case OBJ_OP_ZEROOUT:
2371 if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
2372 !(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
2373 return 2; /* create + truncate */
bf0d5f50 2374
2375 return 1; /* delete/truncate/zero */
2376 default:
2377 BUG();
3da691bf 2378 }
a086a1b8 2379}
3b434a2a 2380
2381static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2382 int which)
2383{
2384 struct rbd_obj_request *obj_req = osd_req->r_priv;
2385
2386 switch (obj_req->img_request->op_type) {
2387 case OBJ_OP_WRITE:
2388 __rbd_osd_setup_write_ops(osd_req, which);
2389 break;
2390 case OBJ_OP_DISCARD:
2391 __rbd_osd_setup_discard_ops(osd_req, which);
2392 break;
2393 case OBJ_OP_ZEROOUT:
2394 __rbd_osd_setup_zeroout_ops(osd_req, which);
2395 break;
2396 default:
2397 BUG();
2398 }
3da691bf 2399}
9d4df01f 2400
3da691bf 2401/*
2402 * Prune the list of object requests (adjust offset and/or length, drop
2403 * redundant requests). Prepare object request state machines and image
2404 * request state machine for execution.
2405 */
2406static int __rbd_img_fill_request(struct rbd_img_request *img_req)
2407{
0c93e1b7 2408 struct rbd_obj_request *obj_req, *next_obj_req;
3da691bf 2409 int ret;
430c28c3 2410
0c93e1b7 2411 for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
9bb0248d 2412 switch (img_req->op_type) {
3da691bf 2413 case OBJ_OP_READ:
ea9b743c 2414 ret = rbd_obj_init_read(obj_req);
2415 break;
2416 case OBJ_OP_WRITE:
ea9b743c 2417 ret = rbd_obj_init_write(obj_req);
2418 break;
2419 case OBJ_OP_DISCARD:
ea9b743c 2420 ret = rbd_obj_init_discard(obj_req);
3da691bf 2421 break;
6484cbe9 2422 case OBJ_OP_ZEROOUT:
ea9b743c 2423 ret = rbd_obj_init_zeroout(obj_req);
6484cbe9 2424 break;
3da691bf 2425 default:
16809372 2426 BUG();
3da691bf 2427 }
0c93e1b7 2428 if (ret < 0)
3da691bf 2429 return ret;
0c93e1b7 2430 if (ret > 0) {
2431 rbd_img_obj_request_del(img_req, obj_req);
2432 continue;
2433 }
2434 }
2435
0192ce2e 2436 img_req->state = RBD_IMG_START;
bf0d5f50 2437 return 0;
3da691bf 2438}
bf0d5f50 2439
2440union rbd_img_fill_iter {
2441 struct ceph_bio_iter bio_iter;
2442 struct ceph_bvec_iter bvec_iter;
2443};
bf0d5f50 2444
2445struct rbd_img_fill_ctx {
2446 enum obj_request_type pos_type;
2447 union rbd_img_fill_iter *pos;
2448 union rbd_img_fill_iter iter;
2449 ceph_object_extent_fn_t set_pos_fn;
2450 ceph_object_extent_fn_t count_fn;
2451 ceph_object_extent_fn_t copy_fn;
5a237819 2452};
bf0d5f50 2453
5a237819 2454static struct ceph_object_extent *alloc_object_extent(void *arg)
0eefd470 2455{
2456 struct rbd_img_request *img_req = arg;
2457 struct rbd_obj_request *obj_req;
0eefd470 2458
2459 obj_req = rbd_obj_request_create();
2460 if (!obj_req)
2461 return NULL;
2761713d 2462
2463 rbd_img_obj_request_add(img_req, obj_req);
2464 return &obj_req->ex;
2465}
0eefd470 2466
2467/*
2468 * While su != os && sc == 1 is technically not fancy (it's the same
2469 * layout as su == os && sc == 1), we can't use the nocopy path for it
2470 * because ->set_pos_fn() should be called only once per object.
2471 * ceph_file_to_extents() invokes action_fn once per stripe unit, so
2472 * treat su != os && sc == 1 as fancy.
2473 */
2474static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
2475{
2476 return l->stripe_unit != l->object_size;
2477}
0eefd470 2478
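/*
 * Example (editor's note): the default layout su == os == 4M, sc == 1
 * is not fancy -- each object extent maps to a single contiguous chunk
 * of the data buffer and the nocopy path applies.  A layout such as
 * su = 64K, os = 4M, sc = 8 (or even sc = 1) is fancy: one object
 * accumulates many 64K chunks, so the bio_vecs are counted and copied
 * in the two extra passes below.
 */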
2479static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
2480 struct ceph_file_extent *img_extents,
2481 u32 num_img_extents,
2482 struct rbd_img_fill_ctx *fctx)
2483{
2484 u32 i;
2485 int ret;
2486
2487 img_req->data_type = fctx->pos_type;
2488
2489 /*
2490 * Create object requests and set each object request's starting
2491 * position in the provided bio (list) or bio_vec array.
0eefd470 2492 */
2493 fctx->iter = *fctx->pos;
2494 for (i = 0; i < num_img_extents; i++) {
2495 ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
2496 img_extents[i].fe_off,
2497 img_extents[i].fe_len,
2498 &img_req->object_extents,
2499 alloc_object_extent, img_req,
2500 fctx->set_pos_fn, &fctx->iter);
2501 if (ret)
2502 return ret;
2503 }
0eefd470 2504
afb97888 2505 return __rbd_img_fill_request(img_req);
2506}
2507
2508/*
2509 * Map a list of image extents to a list of object extents, create the
2510 * corresponding object requests (normally each to a different object,
2511 * but not always) and add them to @img_req. For each object request,
afb97888 2512 * set up its data descriptor to point to the corresponding chunk(s) of
2513 * @fctx->pos data buffer.
2514 *
2515 * Because ceph_file_to_extents() will merge adjacent object extents
2516 * together, each object request's data descriptor may point to multiple
2517 * different chunks of @fctx->pos data buffer.
2518 *
2519 * @fctx->pos data buffer is assumed to be large enough.
2520 */
2521static int rbd_img_fill_request(struct rbd_img_request *img_req,
2522 struct ceph_file_extent *img_extents,
2523 u32 num_img_extents,
2524 struct rbd_img_fill_ctx *fctx)
3d7efd18 2525{
2526 struct rbd_device *rbd_dev = img_req->rbd_dev;
2527 struct rbd_obj_request *obj_req;
2528 u32 i;
2529 int ret;
2530
2531 if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2532 !rbd_layout_is_fancy(&rbd_dev->layout))
2533 return rbd_img_fill_request_nocopy(img_req, img_extents,
2534 num_img_extents, fctx);
3d7efd18 2535
afb97888 2536 img_req->data_type = OBJ_REQUEST_OWN_BVECS;
0eefd470 2537
bbea1c1a 2538 /*
2539 * Create object requests and determine ->bvec_count for each object
2540 * request. Note that ->bvec_count sum over all object requests may
2541 * be greater than the number of bio_vecs in the provided bio (list)
2542 * or bio_vec array because when mapped, those bio_vecs can straddle
2543 * stripe unit boundaries.
bbea1c1a 2544 */
2545 fctx->iter = *fctx->pos;
2546 for (i = 0; i < num_img_extents; i++) {
afb97888 2547 ret = ceph_file_to_extents(&rbd_dev->layout,
2548 img_extents[i].fe_off,
2549 img_extents[i].fe_len,
2550 &img_req->object_extents,
2551 alloc_object_extent, img_req,
2552 fctx->count_fn, &fctx->iter);
2553 if (ret)
2554 return ret;
bbea1c1a 2555 }
0eefd470 2556
2557 for_each_obj_request(img_req, obj_req) {
2558 obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2559 sizeof(*obj_req->bvec_pos.bvecs),
2560 GFP_NOIO);
2561 if (!obj_req->bvec_pos.bvecs)
2562 return -ENOMEM;
2563 }
0eefd470 2564
8785b1d4 2565 /*
2566 * Fill in each object request's private bio_vec array, splitting and
2567 * rearranging the provided bio_vecs in stripe unit chunks as needed.
8785b1d4 2568 */
2569 fctx->iter = *fctx->pos;
2570 for (i = 0; i < num_img_extents; i++) {
2571 ret = ceph_iterate_extents(&rbd_dev->layout,
2572 img_extents[i].fe_off,
2573 img_extents[i].fe_len,
2574 &img_req->object_extents,
2575 fctx->copy_fn, &fctx->iter);
2576 if (ret)
2577 return ret;
2578 }
3d7efd18 2579
2580 return __rbd_img_fill_request(img_req);
2581}
2582
2583static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
2584 u64 off, u64 len)
2585{
2586 struct ceph_file_extent ex = { off, len };
a55e601b 2587 union rbd_img_fill_iter dummy = {};
2588 struct rbd_img_fill_ctx fctx = {
2589 .pos_type = OBJ_REQUEST_NODATA,
2590 .pos = &dummy,
2591 };
2592
2593 return rbd_img_fill_request(img_req, &ex, 1, &fctx);
2594}
2595
2596static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2597{
2598 struct rbd_obj_request *obj_req =
2599 container_of(ex, struct rbd_obj_request, ex);
2600 struct ceph_bio_iter *it = arg;
3d7efd18 2601
2602 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2603 obj_req->bio_pos = *it;
2604 ceph_bio_iter_advance(it, bytes);
2605}
3d7efd18 2606
2607static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2608{
2609 struct rbd_obj_request *obj_req =
2610 container_of(ex, struct rbd_obj_request, ex);
2611 struct ceph_bio_iter *it = arg;
0eefd470 2612
2613 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2614 ceph_bio_iter_advance_step(it, bytes, ({
2615 obj_req->bvec_count++;
2616 }));
0eefd470 2617
afb97888 2618}
0eefd470 2619
2620static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2621{
2622 struct rbd_obj_request *obj_req =
2623 container_of(ex, struct rbd_obj_request, ex);
2624 struct ceph_bio_iter *it = arg;
0eefd470 2625
2626 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2627 ceph_bio_iter_advance_step(it, bytes, ({
2628 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2629 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2630 }));
2631}
2632
2633static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2634 struct ceph_file_extent *img_extents,
2635 u32 num_img_extents,
2636 struct ceph_bio_iter *bio_pos)
2637{
2638 struct rbd_img_fill_ctx fctx = {
2639 .pos_type = OBJ_REQUEST_BIO,
2640 .pos = (union rbd_img_fill_iter *)bio_pos,
2641 .set_pos_fn = set_bio_pos,
2642 .count_fn = count_bio_bvecs,
2643 .copy_fn = copy_bio_bvecs,
5a237819 2644 };
3d7efd18 2645
2646 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2647 &fctx);
2648}
3d7efd18 2649
2650static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2651 u64 off, u64 len, struct bio *bio)
2652{
2653 struct ceph_file_extent ex = { off, len };
2654 struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
3d7efd18 2655
2656 return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
2657}
a9e8ba2c 2658
2659static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2660{
2661 struct rbd_obj_request *obj_req =
2662 container_of(ex, struct rbd_obj_request, ex);
2663 struct ceph_bvec_iter *it = arg;
3d7efd18 2664
2665 obj_req->bvec_pos = *it;
2666 ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
2667 ceph_bvec_iter_advance(it, bytes);
2668}
3d7efd18 2669
2670static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2671{
2672 struct rbd_obj_request *obj_req =
2673 container_of(ex, struct rbd_obj_request, ex);
2674 struct ceph_bvec_iter *it = arg;
058aa991 2675
2676 ceph_bvec_iter_advance_step(it, bytes, ({
2677 obj_req->bvec_count++;
2678 }));
2679}
058aa991 2680
2681static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2682{
2683 struct rbd_obj_request *obj_req =
2684 container_of(ex, struct rbd_obj_request, ex);
2685 struct ceph_bvec_iter *it = arg;
3d7efd18 2686
2687 ceph_bvec_iter_advance_step(it, bytes, ({
2688 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2689 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2690 }));
2691}
2692
2693static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2694 struct ceph_file_extent *img_extents,
2695 u32 num_img_extents,
2696 struct ceph_bvec_iter *bvec_pos)
c5b5ef6c 2697{
2698 struct rbd_img_fill_ctx fctx = {
2699 .pos_type = OBJ_REQUEST_BVECS,
2700 .pos = (union rbd_img_fill_iter *)bvec_pos,
2701 .set_pos_fn = set_bvec_pos,
2702 .count_fn = count_bvecs,
2703 .copy_fn = copy_bvecs,
5a237819 2704 };
c5b5ef6c 2705
2706 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2707 &fctx);
2708}
c5b5ef6c 2709
2710static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2711 struct ceph_file_extent *img_extents,
2712 u32 num_img_extents,
2713 struct bio_vec *bvecs)
2714{
2715 struct ceph_bvec_iter it = {
2716 .bvecs = bvecs,
2717 .iter = { .bi_size = ceph_file_extents_bytes(img_extents,
2718 num_img_extents) },
2719 };
c5b5ef6c 2720
2721 return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
2722 &it);
2723}
c5b5ef6c 2724
0192ce2e 2725static void rbd_img_handle_request_work(struct work_struct *work)
bf0d5f50 2726{
2727 struct rbd_img_request *img_req =
2728 container_of(work, struct rbd_img_request, work);
c5b5ef6c 2729
2730 rbd_img_handle_request(img_req, img_req->work_result);
2731}
c2e82414 2732
2733static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
2734{
2735 INIT_WORK(&img_req->work, rbd_img_handle_request_work);
2736 img_req->work_result = result;
2737 queue_work(rbd_wq, &img_req->work);
c5b5ef6c 2738}
c2e82414 2739
2740static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req)
2741{
2742 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2743
2744 if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) {
2745 obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
2746 return true;
2747 }
2748
2749 dout("%s %p objno %llu assuming dne\n", __func__, obj_req,
2750 obj_req->ex.oe_objno);
2751 return false;
2752}
2753
2754static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
2755{
2756 struct ceph_osd_request *osd_req;
2757 int ret;
2758
2759 osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1);
2760 if (IS_ERR(osd_req))
2761 return PTR_ERR(osd_req);
2762
2763 osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ,
2764 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
2765 rbd_osd_setup_data(osd_req, 0);
2766 rbd_osd_format_read(osd_req);
2767
2768 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
2769 if (ret)
2770 return ret;
2771
2772 rbd_osd_submit(osd_req);
85b5e6d1 2773 return 0;
2774}
2775
86bd7998 2776static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
c5b5ef6c 2777{
3da691bf 2778 struct rbd_img_request *img_req = obj_req->img_request;
a52cc685 2779 struct rbd_device *parent = img_req->rbd_dev->parent;
3da691bf 2780 struct rbd_img_request *child_img_req;
2781 int ret;
2782
59e542c8 2783 child_img_req = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
3da691bf 2784 if (!child_img_req)
2785 return -ENOMEM;
2786
59e542c8 2787 rbd_img_request_init(child_img_req, parent, OBJ_OP_READ);
2788 __set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2789 child_img_req->obj_request = obj_req;
a90bb0c1 2790
2791 down_read(&parent->header_rwsem);
2792 rbd_img_capture_header(child_img_req);
2793 up_read(&parent->header_rwsem);
2794
2795 dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
2796 obj_req);
2797
3da691bf 2798 if (!rbd_img_is_write(img_req)) {
ecc633ca 2799 switch (img_req->data_type) {
3da691bf 2800 case OBJ_REQUEST_BIO:
2801 ret = __rbd_img_fill_from_bio(child_img_req,
2802 obj_req->img_extents,
2803 obj_req->num_img_extents,
2804 &obj_req->bio_pos);
2805 break;
2806 case OBJ_REQUEST_BVECS:
afb97888 2807 case OBJ_REQUEST_OWN_BVECS:
2808 ret = __rbd_img_fill_from_bvecs(child_img_req,
2809 obj_req->img_extents,
2810 obj_req->num_img_extents,
2811 &obj_req->bvec_pos);
2812 break;
2813 default:
d342a15b 2814 BUG();
2815 }
2816 } else {
2817 ret = rbd_img_fill_from_bvecs(child_img_req,
2818 obj_req->img_extents,
2819 obj_req->num_img_extents,
2820 obj_req->copyup_bvecs);
2821 }
2822 if (ret) {
679a97d2 2823 rbd_img_request_destroy(child_img_req);
2824 return ret;
2825 }
2826
2827 /* avoid parent chain recursion */
2828 rbd_img_schedule(child_img_req, 0);
2829 return 0;
2830}
2831
85b5e6d1 2832static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
2833{
2834 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2835 int ret;
2836
22e8bd51 2837again:
a9b67e69 2838 switch (obj_req->read_state) {
2839 case RBD_OBJ_READ_START:
2840 rbd_assert(!*result);
2841
2842 if (!rbd_obj_may_exist(obj_req)) {
2843 *result = -ENOENT;
2844 obj_req->read_state = RBD_OBJ_READ_OBJECT;
2845 goto again;
2846 }
2847
85b5e6d1 2848 ret = rbd_obj_read_object(obj_req);
3da691bf 2849 if (ret) {
85b5e6d1 2850 *result = ret;
2851 return true;
2852 }
2853 obj_req->read_state = RBD_OBJ_READ_OBJECT;
2854 return false;
2855 case RBD_OBJ_READ_OBJECT:
2856 if (*result == -ENOENT && rbd_dev->parent_overlap) {
2857 /* reverse map this object extent onto the parent */
2858 ret = rbd_obj_calc_img_extents(obj_req, false);
86bd7998 2859 if (ret) {
54ab3b24 2860 *result = ret;
2861 return true;
2862 }
2863 if (obj_req->num_img_extents) {
2864 ret = rbd_obj_read_from_parent(obj_req);
2865 if (ret) {
2866 *result = ret;
2867 return true;
2868 }
2869 obj_req->read_state = RBD_OBJ_READ_PARENT;
2870 return false;
2871 }
86bd7998 2872 }
710214e3 2873
2874 /*
2875 * -ENOENT means a hole in the image -- zero-fill the entire
2876 * length of the request. A short read also implies zero-fill
2877 * to the end of the request.
2878 */
2879 if (*result == -ENOENT) {
2880 rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len);
2881 *result = 0;
2882 } else if (*result >= 0) {
2883 if (*result < obj_req->ex.oe_len)
2884 rbd_obj_zero_range(obj_req, *result,
2885 obj_req->ex.oe_len - *result);
2886 else
2887 rbd_assert(*result == obj_req->ex.oe_len);
2888 *result = 0;
2889 }
2890 return true;
2891 case RBD_OBJ_READ_PARENT:
2892 /*
2893 * The parent image is read only up to the overlap -- zero-fill
2894 * from the overlap to the end of the request.
2895 */
2896 if (!*result) {
2897 u32 obj_overlap = rbd_obj_img_extents_bytes(obj_req);
2898
2899 if (obj_overlap < obj_req->ex.oe_len)
2900 rbd_obj_zero_range(obj_req, obj_overlap,
2901 obj_req->ex.oe_len - obj_overlap);
2902 }
2903 return true;
2904 default:
2905 BUG();
710214e3 2906 }
3da691bf 2907}
c5b5ef6c 2908
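/*
 * Read state machine (editor's sketch):
 *
 *   START -> OBJECT ----------------> done
 *               \-> PARENT (-ENOENT,
 *                   parent overlap) -> done
 *
 * with zero-fill applied for holes (-ENOENT), short reads and the
 * part of the request beyond the parent overlap.
 */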
2909static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req)
2910{
2911 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2912
2913 if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno))
2914 obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
2915
2916 if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) &&
2917 (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) {
2918 dout("%s %p noop for nonexistent\n", __func__, obj_req);
2919 return true;
2920 }
2921
2922 return false;
2923}
2924
2925/*
2926 * Return:
2927 * 0 - object map update sent
2928 * 1 - object map update isn't needed
2929 * <0 - error
2930 */
2931static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req)
2932{
2933 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2934 u8 new_state;
2935
2936 if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
2937 return 1;
2938
2939 if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
2940 new_state = OBJECT_PENDING;
2941 else
2942 new_state = OBJECT_EXISTS;
2943
2944 return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL);
2945}
2946
2947static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
2948{
2949 struct ceph_osd_request *osd_req;
2950 int num_ops = count_write_ops(obj_req);
2951 int which = 0;
2952 int ret;
710214e3 2953
2954 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
2955 num_ops++; /* stat */
2956
2957 osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
2958 if (IS_ERR(osd_req))
2959 return PTR_ERR(osd_req);
2960
2961 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
2962 ret = rbd_osd_setup_stat(osd_req, which++);
2963 if (ret)
2964 return ret;
710214e3 2965 }
c5b5ef6c 2966
2967 rbd_osd_setup_write_ops(osd_req, which);
2968 rbd_osd_format_write(osd_req);
2969
2970 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
2971 if (ret)
2972 return ret;
2973
2974 rbd_osd_submit(osd_req);
85b5e6d1 2975 return 0;
3da691bf 2976}
c5b5ef6c 2977
2978/*
2979 * copyup_bvecs pages are never highmem pages
2980 */
2981static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
2982{
2983 struct ceph_bvec_iter it = {
2984 .bvecs = bvecs,
2985 .iter = { .bi_size = bytes },
2986 };
c5b5ef6c 2987
3da691bf 2988 ceph_bvec_iter_advance_step(&it, bytes, ({
cf58b537 2989 if (memchr_inv(bvec_virt(&bv), 0, bv.bv_len))
2990 return false;
2991 }));
2992 return true;
2993}
2994
2995#define MODS_ONLY U32_MAX
2996
2997static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req,
2998 u32 bytes)
b454e36d 2999{
bcbab1db 3000 struct ceph_osd_request *osd_req;
fe943d50 3001 int ret;
70d045f6 3002
3da691bf 3003 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
89a59c1c 3004 rbd_assert(bytes > 0 && bytes != MODS_ONLY);
70d045f6 3005
3006 osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1);
3007 if (IS_ERR(osd_req))
3008 return PTR_ERR(osd_req);
b454e36d 3009
b5ae8cbc 3010 ret = rbd_osd_setup_copyup(osd_req, 0, bytes);
3011 if (ret)
3012 return ret;
3013
bcbab1db 3014 rbd_osd_format_write(osd_req);
3da691bf 3015
bcbab1db 3016 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
3017 if (ret)
3018 return ret;
3019
a086a1b8 3020 rbd_osd_submit(osd_req);
3021 return 0;
3022}
3023
3024static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req,
3025 u32 bytes)
b454e36d 3026{
bcbab1db 3027 struct ceph_osd_request *osd_req;
3028 int num_ops = count_write_ops(obj_req);
3029 int which = 0;
fe943d50 3030 int ret;
70d045f6 3031
3da691bf 3032 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
70d045f6 3033
3034 if (bytes != MODS_ONLY)
3035 num_ops++; /* copyup */
13488d53 3036
a086a1b8 3037 osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
3038 if (IS_ERR(osd_req))
3039 return PTR_ERR(osd_req);
b454e36d 3040
3a482501 3041 if (bytes != MODS_ONLY) {
b5ae8cbc 3042 ret = rbd_osd_setup_copyup(osd_req, which++, bytes);
3043 if (ret)
3044 return ret;
3da691bf 3045 }
3da691bf 3046
3047 rbd_osd_setup_write_ops(osd_req, which);
3048 rbd_osd_format_write(osd_req);
70d045f6 3049
bcbab1db 3050 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
3051 if (ret)
3052 return ret;
3053
a086a1b8 3054 rbd_osd_submit(osd_req);
3da691bf 3055 return 0;
3056}
3057
7e07efb1 3058static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
70d045f6 3059{
7e07efb1 3060 u32 i;
b454e36d 3061
3062 rbd_assert(!obj_req->copyup_bvecs);
3063 obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
3064 obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
3065 sizeof(*obj_req->copyup_bvecs),
3066 GFP_NOIO);
3067 if (!obj_req->copyup_bvecs)
3068 return -ENOMEM;
b454e36d 3069
3070 for (i = 0; i < obj_req->copyup_bvec_count; i++) {
3071 unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
3072
3073 obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
3074 if (!obj_req->copyup_bvecs[i].bv_page)
3075 return -ENOMEM;
3d7efd18 3076
3077 obj_req->copyup_bvecs[i].bv_offset = 0;
3078 obj_req->copyup_bvecs[i].bv_len = len;
3079 obj_overlap -= len;
3080 }
b454e36d 3081
3082 rbd_assert(!obj_overlap);
3083 return 0;
3084}
3085
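/*
 * Example (editor's note, 4K pages assumed): for obj_overlap = 9000
 * bytes, calc_pages_for(0, 9000) = 3 and the bvecs get lengths 4096,
 * 4096 and 808; the final rbd_assert() above checks that they add up.
 */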
3086/*
3087 * The target object doesn't exist. Read the data for the entire
3088 * target object up to the overlap point (if any) from the parent,
3089 * so we can use it for a copyup.
3090 */
793333a3 3091static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req)
bf0d5f50 3092{
3da691bf 3093 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3da691bf 3094 int ret;
bf0d5f50 3095
3096 rbd_assert(obj_req->num_img_extents);
3097 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
3098 rbd_dev->parent_overlap);
3099 if (!obj_req->num_img_extents) {
3100 /*
3101 * The overlap has become 0 (most likely because the
3102 * image has been flattened). Re-submit the original write
3103 * request -- pass MODS_ONLY since the copyup isn't needed
3104 * anymore.
3da691bf 3105 */
793333a3 3106 return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY);
3107 }
3108
86bd7998 3109 ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
3110 if (ret)
3111 return ret;
3112
86bd7998 3113 return rbd_obj_read_from_parent(obj_req);
bf0d5f50 3114}
8b3e1a56 3115
3116static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req)
3117{
3118 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3119 struct ceph_snap_context *snapc = obj_req->img_request->snapc;
3120 u8 new_state;
3121 u32 i;
3122 int ret;
3123
3124 rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
3125
3126 if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3127 return;
3128
3129 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
3130 return;
3131
3132 for (i = 0; i < snapc->num_snaps; i++) {
3133 if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) &&
3134 i + 1 < snapc->num_snaps)
3135 new_state = OBJECT_EXISTS_CLEAN;
3136 else
3137 new_state = OBJECT_EXISTS;
3138
3139 ret = rbd_object_map_update(obj_req, snapc->snaps[i],
3140 new_state, NULL);
3141 if (ret < 0) {
3142 obj_req->pending.result = ret;
3143 return;
3144 }
3145
3146 rbd_assert(!ret);
3147 obj_req->pending.num_pending++;
3148 }
3149}
3150
3151static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
3152{
3153 u32 bytes = rbd_obj_img_extents_bytes(obj_req);
3154 int ret;
3155
3156 rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
3157
3158 /*
3159 * Only send non-zero copyup data to save some I/O and network
3160 * bandwidth -- zero copyup data is equivalent to the object not
3161 * existing.
3162 */
3163 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
3164 bytes = 0;
3165
3166 if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
3167 /*
3168 * Send a copyup request with an empty snapshot context to
3169 * deep-copyup the object through all existing snapshots.
3170 * A second request with the current snapshot context will be
3171 * sent for the actual modification.
3172 */
3173 ret = rbd_obj_copyup_empty_snapc(obj_req, bytes);
3174 if (ret) {
3175 obj_req->pending.result = ret;
3176 return;
3177 }
3178
3179 obj_req->pending.num_pending++;
3180 bytes = MODS_ONLY;
3181 }
3182
3183 ret = rbd_obj_copyup_current_snapc(obj_req, bytes);
3184 if (ret) {
3185 obj_req->pending.result = ret;
3186 return;
3187 }
3188
3189 obj_req->pending.num_pending++;
3190}
3191
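/*
 * Editor's note: for an image with snapshots and non-zero copyup data,
 * rbd_obj_copyup_write_object() issues two OSD requests -- a copyup
 * with an empty snapc (deep-copyup into all existing snapshots)
 * followed by the actual write ops with the current snapc and
 * MODS_ONLY; otherwise a single request carries both the (possibly
 * zero-length) copyup and the write ops.
 */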
3192static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result)
3193{
22e8bd51 3194 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3195 int ret;
3196
3197again:
3198 switch (obj_req->copyup_state) {
3199 case RBD_OBJ_COPYUP_START:
3200 rbd_assert(!*result);
3201
3202 ret = rbd_obj_copyup_read_parent(obj_req);
3203 if (ret) {
3204 *result = ret;
3205 return true;
3206 }
3207 if (obj_req->num_img_extents)
3208 obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT;
3209 else
3210 obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3211 return false;
3212 case RBD_OBJ_COPYUP_READ_PARENT:
3213 if (*result)
3214 return true;
3215
3216 if (is_zero_bvecs(obj_req->copyup_bvecs,
3217 rbd_obj_img_extents_bytes(obj_req))) {
3218 dout("%s %p detected zeros\n", __func__, obj_req);
3219 obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS;
3220 }
3221
3222 rbd_obj_copyup_object_maps(obj_req);
3223 if (!obj_req->pending.num_pending) {
3224 *result = obj_req->pending.result;
3225 obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS;
3226 goto again;
3227 }
3228 obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS;
3229 return false;
3230 case __RBD_OBJ_COPYUP_OBJECT_MAPS:
3231 if (!pending_result_dec(&obj_req->pending, result))
3232 return false;
df561f66 3233 fallthrough;
3234 case RBD_OBJ_COPYUP_OBJECT_MAPS:
3235 if (*result) {
3236 rbd_warn(rbd_dev, "snap object map update failed: %d",
3237 *result);
3238 return true;
3239 }
3240
3241 rbd_obj_copyup_write_object(obj_req);
3242 if (!obj_req->pending.num_pending) {
3243 *result = obj_req->pending.result;
3244 obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3245 goto again;
3246 }
3247 obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT;
3248 return false;
3249 case __RBD_OBJ_COPYUP_WRITE_OBJECT:
3250 if (!pending_result_dec(&obj_req->pending, result))
3251 return false;
df561f66 3252 fallthrough;
3253 case RBD_OBJ_COPYUP_WRITE_OBJECT:
3254 return true;
3255 default:
3256 BUG();
3257 }
3258}
3259
3260/*
3261 * Return:
3262 * 0 - object map update sent
3263 * 1 - object map update isn't needed
3264 * <0 - error
3265 */
3266static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req)
3267{
3268 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3269 u8 current_state = OBJECT_PENDING;
3270
3271 if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3272 return 1;
3273
3274 if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION))
3275 return 1;
3276
3277 return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT,
3278 &current_state);
3279}
3280
85b5e6d1 3281static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
8b3e1a56 3282{
793333a3 3283 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3da691bf 3284 int ret;
8b3e1a56 3285
793333a3 3286again:
3da691bf 3287 switch (obj_req->write_state) {
3288 case RBD_OBJ_WRITE_START:
3289 rbd_assert(!*result);
3290
3291 if (rbd_obj_write_is_noop(obj_req))
3292 return true;
3293
3294 ret = rbd_obj_write_pre_object_map(obj_req);
3295 if (ret < 0) {
3296 *result = ret;
3297 return true;
3298 }
3299 obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP;
3300 if (ret > 0)
3301 goto again;
3302 return false;
3303 case RBD_OBJ_WRITE_PRE_OBJECT_MAP:
3304 if (*result) {
3305 rbd_warn(rbd_dev, "pre object map update failed: %d",
3306 *result);
3307 return true;
3308 }
3309 ret = rbd_obj_write_object(obj_req);
3310 if (ret) {
3311 *result = ret;
3312 return true;
3313 }
3314 obj_req->write_state = RBD_OBJ_WRITE_OBJECT;
3315 return false;
0ad5d953 3316 case RBD_OBJ_WRITE_OBJECT:
54ab3b24 3317 if (*result == -ENOENT) {
0ad5d953 3318 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
3319 *result = 0;
3320 obj_req->copyup_state = RBD_OBJ_COPYUP_START;
3321 obj_req->write_state = __RBD_OBJ_WRITE_COPYUP;
3322 goto again;
0ad5d953 3323 }
3da691bf 3324 /*
3325 * On a non-existent object:
3326 * delete - -ENOENT, truncate/zero - 0
3da691bf 3327 */
3328 if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
3329 *result = 0;
3da691bf 3330 }
a9b67e69 3331 if (*result)
3a482501 3332 return true;
8b3e1a56 3333
3334 obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
3335 goto again;
3336 case __RBD_OBJ_WRITE_COPYUP:
3337 if (!rbd_obj_advance_copyup(obj_req, result))
3338 return false;
df561f66 3339 fallthrough;
793333a3 3340 case RBD_OBJ_WRITE_COPYUP:
22e8bd51 3341 if (*result) {
793333a3 3342 rbd_warn(rbd_dev, "copyup failed: %d", *result);
3343 return true;
3344 }
3345 ret = rbd_obj_write_post_object_map(obj_req);
3346 if (ret < 0) {
3347 *result = ret;
3348 return true;
3349 }
3350 obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP;
3351 if (ret > 0)
3352 goto again;
3353 return false;
3354 case RBD_OBJ_WRITE_POST_OBJECT_MAP:
3355 if (*result)
3356 rbd_warn(rbd_dev, "post object map update failed: %d",
3357 *result);
793333a3 3358 return true;
3da691bf 3359 default:
c6244b3b 3360 BUG();
3361 }
3362}
02c74fba 3363
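/*
 * Write state machine (editor's sketch):
 *
 *   START -> PRE_OBJECT_MAP -> OBJECT -> [COPYUP] -> POST_OBJECT_MAP
 *
 * where the copyup detour is taken on -ENOENT with copyup enabled and
 * the object map steps collapse to no-ops when the feature is absent.
 */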
3da691bf 3364/*
0ad5d953 3365 * Return true if @obj_req is completed.
3da691bf 3366 */
3367static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req,
3368 int *result)
3da691bf 3369{
0ad5d953 3370 struct rbd_img_request *img_req = obj_req->img_request;
0192ce2e 3371 struct rbd_device *rbd_dev = img_req->rbd_dev;
3372 bool done;
3373
85b5e6d1 3374 mutex_lock(&obj_req->state_mutex);
0ad5d953 3375 if (!rbd_img_is_write(img_req))
85b5e6d1 3376 done = rbd_obj_advance_read(obj_req, result);
0ad5d953 3377 else
3378 done = rbd_obj_advance_write(obj_req, result);
3379 mutex_unlock(&obj_req->state_mutex);
0ad5d953 3380
3381 if (done && *result) {
3382 rbd_assert(*result < 0);
3383 rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d",
3384 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
3385 obj_req->ex.oe_off, obj_req->ex.oe_len, *result);
3386 }
0ad5d953 3387 return done;
3da691bf 3388}
02c74fba 3389
3390/*
3391 * This is open-coded in rbd_img_handle_request() to avoid parent chain
3392 * recursion.
3393 */
3394static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result)
3395{
3396 if (__rbd_obj_handle_request(obj_req, &result))
3397 rbd_img_handle_request(obj_req->img_request, result);
3398}
3399
3400static bool need_exclusive_lock(struct rbd_img_request *img_req)
3401{
3402 struct rbd_device *rbd_dev = img_req->rbd_dev;
3403
3404 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK))
3405 return false;
3406
3fe69921 3407 if (rbd_is_ro(rbd_dev))
3408 return false;
3409
3410 rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
3411 if (rbd_dev->opts->lock_on_read ||
3412 (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3413 return true;
3414
3415 return rbd_img_is_write(img_req);
3416}
3417
637cd060 3418static bool rbd_lock_add_request(struct rbd_img_request *img_req)
3419{
3420 struct rbd_device *rbd_dev = img_req->rbd_dev;
637cd060 3421 bool locked;
3422
3423 lockdep_assert_held(&rbd_dev->lock_rwsem);
637cd060 3424 locked = rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED;
3425 spin_lock(&rbd_dev->lock_lists_lock);
3426 rbd_assert(list_empty(&img_req->lock_item));
3427 if (!locked)
3428 list_add_tail(&img_req->lock_item, &rbd_dev->acquiring_list);
3429 else
3430 list_add_tail(&img_req->lock_item, &rbd_dev->running_list);
e1fddc8f 3431 spin_unlock(&rbd_dev->lock_lists_lock);
637cd060 3432 return locked;
3433}
3434
3435static void rbd_lock_del_request(struct rbd_img_request *img_req)
3436{
3437 struct rbd_device *rbd_dev = img_req->rbd_dev;
3438 bool need_wakeup;
3439
3440 lockdep_assert_held(&rbd_dev->lock_rwsem);
3441 spin_lock(&rbd_dev->lock_lists_lock);
3442 rbd_assert(!list_empty(&img_req->lock_item));
3443 list_del_init(&img_req->lock_item);
3444 need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING &&
3445 list_empty(&rbd_dev->running_list));
3446 spin_unlock(&rbd_dev->lock_lists_lock);
3447 if (need_wakeup)
3448 complete(&rbd_dev->releasing_wait);
3449}
3450
3451static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
3452{
3453 struct rbd_device *rbd_dev = img_req->rbd_dev;
3454
3455 if (!need_exclusive_lock(img_req))
3456 return 1;
3457
3458 if (rbd_lock_add_request(img_req))
3459 return 1;
3460
3461 if (rbd_dev->opts->exclusive) {
3462 WARN_ON(1); /* lock got released? */
3463 return -EROFS;
3464 }
3465
3466 /*
3467 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3468 * and cancel_delayed_work() in wake_lock_waiters().
3469 */
3470 dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3471 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3472 return 0;
3473}
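
/*
 * Return convention (mirrors how rbd_img_advance() consumes it):
 *   1 - lock not needed or already held, proceed immediately
 *   0 - queued on acquiring_list, rbd_img_schedule() will resume us
 *  <0 - error
 */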

static void rbd_img_object_requests(struct rbd_img_request *img_req)
{
	struct rbd_obj_request *obj_req;

	rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);

	for_each_obj_request(img_req, obj_req) {
		int result = 0;

		if (__rbd_obj_handle_request(obj_req, &result)) {
			if (result) {
				img_req->pending.result = result;
				return;
			}
		} else {
			img_req->pending.num_pending++;
		}
	}
}

static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	int ret;

again:
	switch (img_req->state) {
	case RBD_IMG_START:
		rbd_assert(!*result);

		ret = rbd_img_exclusive_lock(img_req);
		if (ret < 0) {
			*result = ret;
			return true;
		}
		img_req->state = RBD_IMG_EXCLUSIVE_LOCK;
		if (ret > 0)
			goto again;
		return false;
	case RBD_IMG_EXCLUSIVE_LOCK:
		if (*result)
			return true;

		rbd_assert(!need_exclusive_lock(img_req) ||
			   __rbd_is_lock_owner(rbd_dev));

		rbd_img_object_requests(img_req);
		if (!img_req->pending.num_pending) {
			*result = img_req->pending.result;
			img_req->state = RBD_IMG_OBJECT_REQUESTS;
			goto again;
		}
		img_req->state = __RBD_IMG_OBJECT_REQUESTS;
		return false;
	case __RBD_IMG_OBJECT_REQUESTS:
		if (!pending_result_dec(&img_req->pending, result))
			return false;
		fallthrough;
	case RBD_IMG_OBJECT_REQUESTS:
		return true;
	default:
		BUG();
	}
}
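
/*
 * Image request state machine, roughly:
 *
 *   RBD_IMG_START -> RBD_IMG_EXCLUSIVE_LOCK
 *                 -> __RBD_IMG_OBJECT_REQUESTS (object requests pending)
 *                 -> RBD_IMG_OBJECT_REQUESTS (all completed)
 *
 * rbd_img_advance() returns true only once the request is completed.
 */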

/*
 * Return true if @img_req is completed.
 */
static bool __rbd_img_handle_request(struct rbd_img_request *img_req,
				     int *result)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	bool done;

	if (need_exclusive_lock(img_req)) {
		down_read(&rbd_dev->lock_rwsem);
		mutex_lock(&img_req->state_mutex);
		done = rbd_img_advance(img_req, result);
		if (done)
			rbd_lock_del_request(img_req);
		mutex_unlock(&img_req->state_mutex);
		up_read(&rbd_dev->lock_rwsem);
	} else {
		mutex_lock(&img_req->state_mutex);
		done = rbd_img_advance(img_req, result);
		mutex_unlock(&img_req->state_mutex);
	}

	if (done && *result) {
		rbd_assert(*result < 0);
		rbd_warn(rbd_dev, "%s%s result %d",
			 test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "",
			 obj_op_name(img_req->op_type), *result);
	}
	return done;
}

static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
{
again:
	if (!__rbd_img_handle_request(img_req, &result))
		return;

	if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
		struct rbd_obj_request *obj_req = img_req->obj_request;

		rbd_img_request_destroy(img_req);
		if (__rbd_obj_handle_request(obj_req, &result)) {
			img_req = obj_req->img_request;
			goto again;
		}
	} else {
		struct request *rq = blk_mq_rq_from_pdu(img_req);

		rbd_img_request_destroy(img_req);
		blk_mq_end_request(rq, errno_to_blk_status(result));
	}
}
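
/*
 * Completions are driven iteratively here: finishing a child image
 * request may complete its parent object request, which may in turn
 * complete another image request, and so on up the parent chain.
 * The "goto again" loop keeps the stack depth constant no matter how
 * long that chain is.
 */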

static const struct rbd_client_id rbd_empty_cid;

static bool rbd_cid_equal(const struct rbd_client_id *lhs,
			  const struct rbd_client_id *rhs)
{
	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
}

static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
{
	struct rbd_client_id cid;

	mutex_lock(&rbd_dev->watch_mutex);
	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
	cid.handle = rbd_dev->watch_cookie;
	mutex_unlock(&rbd_dev->watch_mutex);
	return cid;
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
			      const struct rbd_client_id *cid)
{
	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
	     cid->gid, cid->handle);
	rbd_dev->owner_cid = *cid; /* struct */
}

static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
{
	mutex_lock(&rbd_dev->watch_mutex);
	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
	mutex_unlock(&rbd_dev->watch_mutex);
}
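
/*
 * The cookie ties the lock to our watch: a watch cookie of e.g. 123
 * yields "<RBD_LOCK_COOKIE_PREFIX> 123".  find_watcher() parses it
 * back with sscanf() using the same format.
 */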

static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
{
	struct rbd_client_id cid = rbd_get_cid(rbd_dev);

	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
	strcpy(rbd_dev->lock_cookie, cookie);
	rbd_set_owner_cid(rbd_dev, &cid);
	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
}

/*
 * lock_rwsem must be held for write
 */
static int rbd_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char cookie[32];
	int ret;

	WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
		rbd_dev->lock_cookie[0] != '\0');

	format_lock_cookie(rbd_dev, cookie);
	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
			    RBD_LOCK_TAG, "", 0);
	if (ret)
		return ret;

	__rbd_lock(rbd_dev, cookie);
	return 0;
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_unlock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
		rbd_dev->lock_cookie[0] == '\0');

	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			      RBD_LOCK_NAME, rbd_dev->lock_cookie);
	if (ret && ret != -ENOENT)
		rbd_warn(rbd_dev, "failed to unlock header: %d", ret);

	/* treat errors as the image is unlocked */
	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	rbd_dev->lock_cookie[0] = '\0';
	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
}

static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
				enum rbd_notify_op notify_op,
				struct page ***preply_pages,
				size_t *preply_len)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
	char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
	int buf_size = sizeof(buf);
	void *p = buf;

	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);

	/* encode *LockPayload NotifyMessage (op + ClientId) */
	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
	ceph_encode_32(&p, notify_op);
	ceph_encode_64(&p, cid.gid);
	ceph_encode_64(&p, cid.handle);

	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
				&rbd_dev->header_oloc, buf, buf_size,
				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
}
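
/*
 * Payload layout (on top of the start-encoding header): a 32-bit
 * notify_op followed by the 64-bit gid and 64-bit handle of our
 * client id - hence the 4 + 8 + 8 bytes in the buffer above.
 */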

static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
			       enum rbd_notify_op notify_op)
{
	__rbd_notify_op_lock(rbd_dev, notify_op, NULL, NULL);
}

static void rbd_notify_acquired_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  acquired_lock_work);

	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
}

static void rbd_notify_released_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  released_lock_work);

	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
}

static int rbd_request_lock(struct rbd_device *rbd_dev)
{
	struct page **reply_pages;
	size_t reply_len;
	bool lock_owner_responded = false;
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
				   &reply_pages, &reply_len);
	if (ret && ret != -ETIMEDOUT) {
		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
		goto out;
	}

	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
		void *p = page_address(reply_pages[0]);
		void *const end = p + reply_len;
		u32 n;

		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
		while (n--) {
			u8 struct_v;
			u32 len;

			ceph_decode_need(&p, end, 8 + 8, e_inval);
			p += 8 + 8; /* skip gid and cookie */

			ceph_decode_32_safe(&p, end, len, e_inval);
			if (!len)
				continue;

			if (lock_owner_responded) {
				rbd_warn(rbd_dev,
					 "duplicate lock owners detected");
				ret = -EIO;
				goto out;
			}

			lock_owner_responded = true;
			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
						  &struct_v, &len);
			if (ret) {
				rbd_warn(rbd_dev,
					 "failed to decode ResponseMessage: %d",
					 ret);
				goto e_inval;
			}

			ret = ceph_decode_32(&p);
		}
	}

	if (!lock_owner_responded) {
		rbd_warn(rbd_dev, "no lock owners detected");
		ret = -ETIMEDOUT;
	}

out:
	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
	return ret;

e_inval:
	ret = -EINVAL;
	goto out;
}
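
/*
 * Interpreting the notify reply: each ack may carry a ResponseMessage
 * whose s32 result was encoded by the owner's rbd_handle_request_lock()
 * - 0 if it will release the lock, -EROFS if it refuses (exclusive
 * mapping).  No non-empty ack at all suggests a dead owner, which the
 * caller treats the same as -ETIMEDOUT.
 */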

/*
 * Either image request state machine(s) or rbd_add_acquire_lock()
 * (i.e. "rbd map").
 */
static void wake_lock_waiters(struct rbd_device *rbd_dev, int result)
{
	struct rbd_img_request *img_req;

	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
	lockdep_assert_held_write(&rbd_dev->lock_rwsem);

	cancel_delayed_work(&rbd_dev->lock_dwork);
	if (!completion_done(&rbd_dev->acquire_wait)) {
		rbd_assert(list_empty(&rbd_dev->acquiring_list) &&
			   list_empty(&rbd_dev->running_list));
		rbd_dev->acquire_err = result;
		complete_all(&rbd_dev->acquire_wait);
		return;
	}

	list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) {
		mutex_lock(&img_req->state_mutex);
		rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK);
		rbd_img_schedule(img_req, result);
		mutex_unlock(&img_req->state_mutex);
	}

	list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list);
}

static int get_lock_owner_info(struct rbd_device *rbd_dev,
			       struct ceph_locker **lockers, u32 *num_lockers)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	u8 lock_type;
	char *lock_tag;
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
				 &lock_type, &lock_tag, lockers, num_lockers);
	if (ret)
		return ret;

	if (*num_lockers == 0) {
		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
		goto out;
	}

	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
			 lock_tag);
		ret = -EBUSY;
		goto out;
	}

	if (lock_type == CEPH_CLS_LOCK_SHARED) {
		rbd_warn(rbd_dev, "shared lock type detected");
		ret = -EBUSY;
		goto out;
	}

	if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
		    strlen(RBD_LOCK_COOKIE_PREFIX))) {
		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
			 (*lockers)[0].id.cookie);
		ret = -EBUSY;
		goto out;
	}

out:
	kfree(lock_tag);
	return ret;
}

static int find_watcher(struct rbd_device *rbd_dev,
			const struct ceph_locker *locker)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_watch_item *watchers;
	u32 num_watchers;
	u64 cookie;
	int i;
	int ret;

	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
				      &rbd_dev->header_oloc, &watchers,
				      &num_watchers);
	if (ret)
		return ret;

	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
	for (i = 0; i < num_watchers; i++) {
		/*
		 * Ignore addr->type while comparing.  This mimics
		 * entity_addr_t::get_legacy_str() + strcmp().
		 */
		if (ceph_addr_equal_no_type(&watchers[i].addr,
					    &locker->info.addr) &&
		    watchers[i].cookie == cookie) {
			struct rbd_client_id cid = {
				.gid = le64_to_cpu(watchers[i].name.num),
				.handle = cookie,
			};

			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
			     rbd_dev, cid.gid, cid.handle);
			rbd_set_owner_cid(rbd_dev, &cid);
			ret = 1;
			goto out;
		}
	}

	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
	ret = 0;
out:
	kfree(watchers);
	return ret;
}
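
/*
 * Return convention: 1 - the locker still has a matching watch (it is
 * presumed alive, so ask it to release), 0 - no matching watch (the
 * locker likely crashed and its lock can be broken), <0 - error.
 */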

/*
 * lock_rwsem must be held for write
 */
static int rbd_try_lock(struct rbd_device *rbd_dev)
{
	struct ceph_client *client = rbd_dev->rbd_client->client;
	struct ceph_locker *lockers;
	u32 num_lockers;
	int ret;

	for (;;) {
		ret = rbd_lock(rbd_dev);
		if (ret != -EBUSY)
			return ret;

		/* determine if the current lock holder is still alive */
		ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
		if (ret)
			return ret;

		if (num_lockers == 0)
			goto again;

		ret = find_watcher(rbd_dev, lockers);
		if (ret)
			goto out; /* request lock or error */

		rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
			 ENTITY_NAME(lockers[0].id.name));

		ret = ceph_monc_blocklist_add(&client->monc,
					      &lockers[0].info.addr);
		if (ret) {
			rbd_warn(rbd_dev, "blocklist of %s%llu failed: %d",
				 ENTITY_NAME(lockers[0].id.name), ret);
			goto out;
		}

		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
					  lockers[0].id.cookie,
					  &lockers[0].id.name);
		if (ret && ret != -ENOENT)
			goto out;

again:
		ceph_free_lockers(lockers, num_lockers);
	}

out:
	ceph_free_lockers(lockers, num_lockers);
	return ret;
}
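
/*
 * Recovery sketch: a stale lock whose owner has no watch is taken over
 * by blocklisting the owner's address first and only then breaking its
 * lock - that ordering ensures the old owner cannot issue further
 * writes after the lock changes hands.
 */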

static int rbd_post_acquire_action(struct rbd_device *rbd_dev)
{
	int ret;

	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) {
		ret = rbd_object_map_open(rbd_dev);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Return:
 *   0 - lock acquired
 *   1 - caller should call rbd_request_lock()
 *  <0 - error
 */
static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
{
	int ret;

	down_read(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (__rbd_is_lock_owner(rbd_dev)) {
		up_read(&rbd_dev->lock_rwsem);
		return 0;
	}

	up_read(&rbd_dev->lock_rwsem);
	down_write(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (__rbd_is_lock_owner(rbd_dev)) {
		up_write(&rbd_dev->lock_rwsem);
		return 0;
	}

	ret = rbd_try_lock(rbd_dev);
	if (ret < 0) {
		rbd_warn(rbd_dev, "failed to lock header: %d", ret);
		if (ret == -EBLOCKLISTED)
			goto out;

		ret = 1; /* request lock anyway */
	}
	if (ret > 0) {
		up_write(&rbd_dev->lock_rwsem);
		return ret;
	}

	rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED);
	rbd_assert(list_empty(&rbd_dev->running_list));

	ret = rbd_post_acquire_action(rbd_dev);
	if (ret) {
		rbd_warn(rbd_dev, "post-acquire action failed: %d", ret);
		/*
		 * Can't stay in RBD_LOCK_STATE_LOCKED because
		 * rbd_lock_add_request() would let the request through,
		 * assuming that e.g. object map is locked and loaded.
		 */
		rbd_unlock(rbd_dev);
	}

out:
	wake_lock_waiters(rbd_dev, ret);
	up_write(&rbd_dev->lock_rwsem);
	return ret;
}

static void rbd_acquire_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
						  struct rbd_device, lock_dwork);
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);
again:
	ret = rbd_try_acquire_lock(rbd_dev);
	if (ret <= 0) {
		dout("%s rbd_dev %p ret %d - done\n", __func__, rbd_dev, ret);
		return;
	}

	ret = rbd_request_lock(rbd_dev);
	if (ret == -ETIMEDOUT) {
		goto again; /* treat this as a dead client */
	} else if (ret == -EROFS) {
		rbd_warn(rbd_dev, "peer will not release lock");
		down_write(&rbd_dev->lock_rwsem);
		wake_lock_waiters(rbd_dev, ret);
		up_write(&rbd_dev->lock_rwsem);
	} else if (ret < 0) {
		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
				 RBD_RETRY_DELAY);
	} else {
		/*
		 * lock owner acked, but resend if we don't see them
		 * release the lock
		 */
		dout("%s rbd_dev %p requeuing lock_dwork\n", __func__,
		     rbd_dev);
		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
	}
}

static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
{
	dout("%s rbd_dev %p\n", __func__, rbd_dev);
	lockdep_assert_held_write(&rbd_dev->lock_rwsem);

	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
		return false;

	/*
	 * Ensure that all in-flight IO is flushed.
	 */
	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
	rbd_assert(!completion_done(&rbd_dev->releasing_wait));
	if (list_empty(&rbd_dev->running_list))
		return true;

	up_write(&rbd_dev->lock_rwsem);
	wait_for_completion(&rbd_dev->releasing_wait);

	down_write(&rbd_dev->lock_rwsem);
	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
		return false;

	rbd_assert(list_empty(&rbd_dev->running_list));
	return true;
}

static void rbd_pre_release_action(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)
		rbd_object_map_close(rbd_dev);
}

static void __rbd_release_lock(struct rbd_device *rbd_dev)
{
	rbd_assert(list_empty(&rbd_dev->running_list));

	rbd_pre_release_action(rbd_dev);
	rbd_unlock(rbd_dev);
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_release_lock(struct rbd_device *rbd_dev)
{
	if (!rbd_quiesce_lock(rbd_dev))
		return;

	__rbd_release_lock(rbd_dev);

	/*
	 * Give others a chance to grab the lock - we would re-acquire
	 * almost immediately if we got new IO while draining the running
	 * list otherwise.  We need to ack our own notifications, so this
	 * lock_dwork will be requeued from rbd_handle_released_lock() by
	 * way of maybe_kick_acquire().
	 */
	cancel_delayed_work(&rbd_dev->lock_dwork);
}

static void rbd_release_lock_work(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  unlock_work);

	down_write(&rbd_dev->lock_rwsem);
	rbd_release_lock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);
}

static void maybe_kick_acquire(struct rbd_device *rbd_dev)
{
	bool have_requests;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);
	if (__rbd_is_lock_owner(rbd_dev))
		return;

	spin_lock(&rbd_dev->lock_lists_lock);
	have_requests = !list_empty(&rbd_dev->acquiring_list);
	spin_unlock(&rbd_dev->lock_lists_lock);
	if (have_requests || delayed_work_pending(&rbd_dev->lock_dwork)) {
		dout("%s rbd_dev %p kicking lock_dwork\n", __func__, rbd_dev);
		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
	}
}

static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
		down_write(&rbd_dev->lock_rwsem);
		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
			dout("%s rbd_dev %p cid %llu-%llu == owner_cid\n",
			     __func__, rbd_dev, cid.gid, cid.handle);
		} else {
			rbd_set_owner_cid(rbd_dev, &cid);
		}
		downgrade_write(&rbd_dev->lock_rwsem);
	} else {
		down_read(&rbd_dev->lock_rwsem);
	}

	maybe_kick_acquire(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
}

static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
		down_write(&rbd_dev->lock_rwsem);
		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
			dout("%s rbd_dev %p cid %llu-%llu != owner_cid %llu-%llu\n",
			     __func__, rbd_dev, cid.gid, cid.handle,
			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
		} else {
			rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
		}
		downgrade_write(&rbd_dev->lock_rwsem);
	} else {
		down_read(&rbd_dev->lock_rwsem);
	}

	maybe_kick_acquire(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
}

/*
 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
 * ResponseMessage is needed.
 */
static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
				   void **p)
{
	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
	struct rbd_client_id cid = { 0 };
	int result = 1;

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (rbd_cid_equal(&cid, &my_cid))
		return result;

	down_read(&rbd_dev->lock_rwsem);
	if (__rbd_is_lock_owner(rbd_dev)) {
		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
		    rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
			goto out_unlock;

		/*
		 * encode ResponseMessage(0) so the peer can detect
		 * a missing owner
		 */
		result = 0;

		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
			if (!rbd_dev->opts->exclusive) {
				dout("%s rbd_dev %p queueing unlock_work\n",
				     __func__, rbd_dev);
				queue_work(rbd_dev->task_wq,
					   &rbd_dev->unlock_work);
			} else {
				/* refuse to release the lock */
				result = -EROFS;
			}
		}
	}

out_unlock:
	up_read(&rbd_dev->lock_rwsem);
	return result;
}

static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
				     u64 notify_id, u64 cookie, s32 *result)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char buf[4 + CEPH_ENCODING_START_BLK_LEN];
	int buf_size = sizeof(buf);
	int ret;

	if (result) {
		void *p = buf;

		/* encode ResponseMessage */
		ceph_start_encoding(&p, 1, 1,
				    buf_size - CEPH_ENCODING_START_BLK_LEN);
		ceph_encode_32(&p, *result);
	} else {
		buf_size = 0;
	}

	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
				   &rbd_dev->header_oloc, notify_id, cookie,
				   buf, buf_size);
	if (ret)
		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
}

static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
				   u64 cookie)
{
	dout("%s rbd_dev %p\n", __func__, rbd_dev);
	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
}

static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
					  u64 notify_id, u64 cookie, s32 result)
{
	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
}

static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
			 u64 notifier_id, void *data, size_t data_len)
{
	struct rbd_device *rbd_dev = arg;
	void *p = data;
	void *const end = p + data_len;
	u8 struct_v = 0;
	u32 len;
	u32 notify_op;
	int ret;

	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
	     __func__, rbd_dev, cookie, notify_id, data_len);
	if (data_len) {
		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
					  &struct_v, &len);
		if (ret) {
			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
				 ret);
			return;
		}

		notify_op = ceph_decode_32(&p);
	} else {
		/* legacy notification for header updates */
		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
		len = 0;
	}

	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
	switch (notify_op) {
	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_RELEASED_LOCK:
		rbd_handle_released_lock(rbd_dev, struct_v, &p);
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_REQUEST_LOCK:
		ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
		if (ret <= 0)
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, ret);
		else
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_HEADER_UPDATE:
		ret = rbd_dev_refresh(rbd_dev);
		if (ret)
			rbd_warn(rbd_dev, "refresh failed: %d", ret);

		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	default:
		if (rbd_is_lock_owner(rbd_dev))
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, -EOPNOTSUPP);
		else
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	}
}
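
/*
 * Every notify is acked above, even unknown ops - otherwise the
 * notifier would be left waiting until RBD_NOTIFY_TIMEOUT expires.
 * Lock transitions (acquired/released/request) and header updates all
 * funnel through this single watch callback on the header object.
 */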

static void __rbd_unregister_watch(struct rbd_device *rbd_dev);

static void rbd_watch_errcb(void *arg, u64 cookie, int err)
{
	struct rbd_device *rbd_dev = arg;

	rbd_warn(rbd_dev, "encountered watch error: %d", err);

	down_write(&rbd_dev->lock_rwsem);
	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
	up_write(&rbd_dev->lock_rwsem);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
		__rbd_unregister_watch(rbd_dev);
		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;

		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
	}
	mutex_unlock(&rbd_dev->watch_mutex);
}

/*
 * watch_mutex must be locked
 */
static int __rbd_register_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_linger_request *handle;

	rbd_assert(!rbd_dev->watch_handle);
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, rbd_watch_cb,
				 rbd_watch_errcb, rbd_dev);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	rbd_dev->watch_handle = handle;
	return 0;
}

/*
 * watch_mutex must be locked
 */
static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	rbd_assert(rbd_dev->watch_handle);
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
	if (ret)
		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);

	rbd_dev->watch_handle = NULL;
}

static int rbd_register_watch(struct rbd_device *rbd_dev)
{
	int ret;

	mutex_lock(&rbd_dev->watch_mutex);
	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
	ret = __rbd_register_watch(rbd_dev);
	if (ret)
		goto out;

	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;

out:
	mutex_unlock(&rbd_dev->watch_mutex);
	return ret;
}

static void cancel_tasks_sync(struct rbd_device *rbd_dev)
{
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	cancel_work_sync(&rbd_dev->acquired_lock_work);
	cancel_work_sync(&rbd_dev->released_lock_work);
	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
	cancel_work_sync(&rbd_dev->unlock_work);
}

/*
 * header_rwsem must not be held to avoid a deadlock with
 * rbd_dev_refresh() when flushing notifies.
 */
static void rbd_unregister_watch(struct rbd_device *rbd_dev)
{
	cancel_tasks_sync(rbd_dev);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
		__rbd_unregister_watch(rbd_dev);
	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
	mutex_unlock(&rbd_dev->watch_mutex);

	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char cookie[32];
	int ret;

	if (!rbd_quiesce_lock(rbd_dev))
		return;

	format_lock_cookie(rbd_dev, cookie);
	ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, RBD_LOCK_NAME,
				  CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
				  RBD_LOCK_TAG, cookie);
	if (ret) {
		if (ret != -EOPNOTSUPP)
			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
				 ret);

		/*
		 * Lock cookie cannot be updated on older OSDs, so do
		 * a manual release and queue an acquire.
		 */
		__rbd_release_lock(rbd_dev);
		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
	} else {
		__rbd_lock(rbd_dev, cookie);
		wake_lock_waiters(rbd_dev, 0);
	}
}

static void rbd_reregister_watch(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
						  struct rbd_device, watch_dwork);
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
		mutex_unlock(&rbd_dev->watch_mutex);
		return;
	}

	ret = __rbd_register_watch(rbd_dev);
	if (ret) {
		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
		if (ret != -EBLOCKLISTED && ret != -ENOENT) {
			queue_delayed_work(rbd_dev->task_wq,
					   &rbd_dev->watch_dwork,
					   RBD_RETRY_DELAY);
			mutex_unlock(&rbd_dev->watch_mutex);
			return;
		}

		mutex_unlock(&rbd_dev->watch_mutex);
		down_write(&rbd_dev->lock_rwsem);
		wake_lock_waiters(rbd_dev, ret);
		up_write(&rbd_dev->lock_rwsem);
		return;
	}

	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
	mutex_unlock(&rbd_dev->watch_mutex);

	down_write(&rbd_dev->lock_rwsem);
	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
		rbd_reacquire_lock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);

	ret = rbd_dev_refresh(rbd_dev);
	if (ret)
		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
}

/*
 * Synchronous osd object method call.  Returns the number of bytes
 * returned in the inbound buffer, or a negative error code.
 */
static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
			       struct ceph_object_id *oid,
			       struct ceph_object_locator *oloc,
			       const char *method_name,
			       const void *outbound,
			       size_t outbound_size,
			       void *inbound,
			       size_t inbound_size)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct page *req_page = NULL;
	struct page *reply_page;
	int ret;

	/*
	 * Method calls are ultimately read operations.  The result
	 * should be placed into the inbound buffer provided.  Callers
	 * may also supply outbound data--parameters for the object
	 * method.  Currently if this is present it will be a
	 * snapshot id.
	 */
	if (outbound) {
		if (outbound_size > PAGE_SIZE)
			return -E2BIG;

		req_page = alloc_page(GFP_KERNEL);
		if (!req_page)
			return -ENOMEM;

		memcpy(page_address(req_page), outbound, outbound_size);
	}

	reply_page = alloc_page(GFP_KERNEL);
	if (!reply_page) {
		if (req_page)
			__free_page(req_page);
		return -ENOMEM;
	}

	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
			     &reply_page, &inbound_size);
	if (!ret) {
		memcpy(inbound, page_address(reply_page), inbound_size);
		ret = inbound_size;
	}

	if (req_page)
		__free_page(req_page);
	__free_page(reply_page);
	return ret;
}
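
/*
 * Typical use (illustrative), much like the "get_size" call made
 * elsewhere in this file:
 *
 *	__le64 snapid = cpu_to_le64(CEPH_NOSNAP);
 *	struct {
 *		u8 order;
 *		__le64 size;
 *	} __attribute__ ((packed)) size_buf = { 0 };
 *
 *	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
 *				  &rbd_dev->header_oloc, "get_size",
 *				  &snapid, sizeof(snapid),
 *				  &size_buf, sizeof(size_buf));
 *
 * where a non-negative return is the number of bytes placed in the
 * inbound buffer.
 */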

static void rbd_queue_workfn(struct work_struct *work)
{
	struct rbd_img_request *img_request =
	    container_of(work, struct rbd_img_request, work);
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	enum obj_operation_type op_type = img_request->op_type;
	struct request *rq = blk_mq_rq_from_pdu(img_request);
	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
	u64 length = blk_rq_bytes(rq);
	u64 mapping_size;
	int result;

	/* Ignore/skip any zero-length requests */
	if (!length) {
		dout("%s: zero-length request\n", __func__);
		result = 0;
		goto err_img_request;
	}

	blk_mq_start_request(rq);

	down_read(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;
	rbd_img_capture_header(img_request);
	up_read(&rbd_dev->header_rwsem);

	if (offset + length > mapping_size) {
		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
			 length, mapping_size);
		result = -EIO;
		goto err_img_request;
	}

	dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
	     img_request, obj_op_name(op_type), offset, length);

	if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
		result = rbd_img_fill_nodata(img_request, offset, length);
	else
		result = rbd_img_fill_from_bio(img_request, offset, length,
					       rq->bio);
	if (result)
		goto err_img_request;

	rbd_img_handle_request(img_request, 0);
	return;

err_img_request:
	rbd_img_request_destroy(img_request);
	if (result)
		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
			 obj_op_name(op_type), length, offset, result);
	blk_mq_end_request(rq, errno_to_blk_status(result));
}

static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct rbd_device *rbd_dev = hctx->queue->queuedata;
	struct rbd_img_request *img_req = blk_mq_rq_to_pdu(bd->rq);
	enum obj_operation_type op_type;

	switch (req_op(bd->rq)) {
	case REQ_OP_DISCARD:
		op_type = OBJ_OP_DISCARD;
		break;
	case REQ_OP_WRITE_ZEROES:
		op_type = OBJ_OP_ZEROOUT;
		break;
	case REQ_OP_WRITE:
		op_type = OBJ_OP_WRITE;
		break;
	case REQ_OP_READ:
		op_type = OBJ_OP_READ;
		break;
	default:
		rbd_warn(rbd_dev, "unknown req_op %d", req_op(bd->rq));
		return BLK_STS_IOERR;
	}

	rbd_img_request_init(img_req, rbd_dev, op_type);

	if (rbd_img_is_write(img_req)) {
		if (rbd_is_ro(rbd_dev)) {
			rbd_warn(rbd_dev, "%s on read-only mapping",
				 obj_op_name(img_req->op_type));
			return BLK_STS_IOERR;
		}
		rbd_assert(!rbd_is_snap(rbd_dev));
	}

	INIT_WORK(&img_req->work, rbd_queue_workfn);
	queue_work(rbd_wq, &img_req->work);
	return BLK_STS_OK;
}
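
/*
 * Design note: the rbd_img_request is embedded in the blk-mq request
 * PDU (see tag_set.cmd_size in rbd_init_disk()), so no extra
 * allocation happens here - this function only classifies the op and
 * hands the heavy lifting to rbd_queue_workfn() on rbd_wq.
 */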

static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	blk_cleanup_disk(rbd_dev->disk);
	blk_mq_free_tag_set(&rbd_dev->tag_set);
	rbd_dev->disk = NULL;
}

static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
			     struct ceph_object_id *oid,
			     struct ceph_object_locator *oloc,
			     void *buf, int buf_len)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;
	struct page **pages;
	int num_pages = calc_pages_for(0, buf_len);
	int ret;

	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
	if (!req)
		return -ENOMEM;

	ceph_oid_copy(&req->r_base_oid, oid);
	ceph_oloc_copy(&req->r_base_oloc, oloc);
	req->r_flags = CEPH_OSD_FLAG_READ;

	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		goto out_req;
	}

	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
					 true);

	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
	if (ret)
		goto out_req;

	ceph_osdc_start_request(osdc, req, false);
	ret = ceph_osdc_wait_request(osdc, req);
	if (ret >= 0)
		ceph_copy_from_page_vector(pages, buf, 0, ret);

out_req:
	ceph_osdc_put_request(req);
	return ret;
}

/*
 * Read the complete header for the given rbd device.  On successful
 * return, the rbd_dev->header field will contain up-to-date
 * information about the image.
 */
static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		kfree(ondisk);

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return -ENOMEM;

		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
					&rbd_dev->header_oloc, ondisk, size);
		if (ret < 0)
			goto out;
		if ((size_t)ret < size) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
				 size, ret);
			goto out;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "invalid header");
			goto out;
		}

		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	ret = rbd_header_from_disk(rbd_dev, ondisk);
out:
	kfree(ondisk);

	return ret;
}
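
/*
 * The do/while above handles a snapshot being created or deleted
 * between reads: the buffer is sized from the previous snap_count and
 * names_size, and the read is simply retried until the on-disk
 * snapshot count matches what the buffer was sized for.
 */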

static void rbd_dev_update_size(struct rbd_device *rbd_dev)
{
	sector_t size;

	/*
	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
	 * try to update its size.  If REMOVING is set, updating size
	 * is just useless work since the device can't be opened.
	 */
	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
		dout("setting size to %llu sectors", (unsigned long long)size);
		set_capacity_and_notify(rbd_dev->disk, size);
	}
}

static int rbd_dev_refresh(struct rbd_device *rbd_dev)
{
	u64 mapping_size;
	int ret;

	down_write(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;

	ret = rbd_dev_header_info(rbd_dev);
	if (ret)
		goto out;

	/*
	 * If there is a parent, see if it has disappeared due to the
	 * mapped image getting flattened.
	 */
	if (rbd_dev->parent) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto out;
	}

	rbd_assert(!rbd_is_snap(rbd_dev));
	rbd_dev->mapping.size = rbd_dev->header.image_size;

out:
	up_write(&rbd_dev->header_rwsem);
	if (!ret && mapping_size != rbd_dev->mapping.size)
		rbd_dev_update_size(rbd_dev);

	return ret;
}

static const struct blk_mq_ops rbd_mq_ops = {
	.queue_rq	= rbd_queue_rq,
};

static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	unsigned int objset_bytes =
	    rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
	int err;

	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
	rbd_dev->tag_set.ops = &rbd_mq_ops;
	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
	rbd_dev->tag_set.nr_hw_queues = num_present_cpus();
	rbd_dev->tag_set.cmd_size = sizeof(struct rbd_img_request);

	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
	if (err)
		return err;

	disk = blk_mq_alloc_disk(&rbd_dev->tag_set, rbd_dev);
	if (IS_ERR(disk)) {
		err = PTR_ERR(disk);
		goto out_tag_set;
	}
	q = disk->queue;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = rbd_dev->minor;
	if (single_major)
		disk->minors = (1 << RBD_SINGLE_MAJOR_PART_SHIFT);
	else
		disk->minors = RBD_MINORS_PER_MAJOR;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */

	blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
	q->limits.max_sectors = queue_max_hw_sectors(q);
	blk_queue_max_segments(q, USHRT_MAX);
	blk_queue_max_segment_size(q, UINT_MAX);
	blk_queue_io_min(q, rbd_dev->opts->alloc_size);
	blk_queue_io_opt(q, rbd_dev->opts->alloc_size);

	if (rbd_dev->opts->trim) {
		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
		q->limits.discard_granularity = rbd_dev->opts->alloc_size;
		blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
		blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
	}

	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);

	rbd_dev->disk = disk;

	return 0;
out_tag_set:
	blk_mq_free_tag_set(&rbd_dev->tag_set);
	return err;
}
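
/*
 * The queue limits above derive from the object set size
 * (object_size * stripe_count), capping any single request at one
 * object set's worth of data; io_min/io_opt advertise the configured
 * alloc_size hint to upper layers.
 */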

/*
 sysfs
*/

static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}

static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n",
		       (unsigned long long)rbd_dev->mapping.size);
}

static ssize_t rbd_features_show(struct device *dev,
				 struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "0x%016llx\n", rbd_dev->header.features);
}

static ssize_t rbd_major_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->major)
		return sprintf(buf, "%d\n", rbd_dev->major);

	return sprintf(buf, "(none)\n");
}

static ssize_t rbd_minor_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->minor);
}

static ssize_t rbd_client_addr_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	struct ceph_entity_addr *client_addr =
	    ceph_client_addr(rbd_dev->rbd_client->client);

	return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
		       le32_to_cpu(client_addr->nonce));
}

static ssize_t rbd_client_id_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "client%lld\n",
		       ceph_client_gid(rbd_dev->rbd_client->client));
}

static ssize_t rbd_cluster_fsid_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
}

static ssize_t rbd_config_info_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	return sprintf(buf, "%s\n", rbd_dev->config_info);
}

static ssize_t rbd_pool_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
}

static ssize_t rbd_pool_id_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n",
		       (unsigned long long) rbd_dev->spec->pool_id);
}

static ssize_t rbd_pool_ns_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
}

static ssize_t rbd_name_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->spec->image_name)
		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);

	return sprintf(buf, "(unknown)\n");
}

static ssize_t rbd_image_id_show(struct device *dev,
				 struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
}

/*
 * Shows the name of the currently-mapped snapshot (or
 * RBD_SNAP_HEAD_NAME for the base image).
 */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
}

static ssize_t rbd_snap_id_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
}

/*
 * For a v2 image, shows the chain of parent images, separated by empty
 * lines.  For v1 images or if there is no parent, shows "(no parent
 * image)".
 */
static ssize_t rbd_parent_show(struct device *dev,
			       struct device_attribute *attr,
			       char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	ssize_t count = 0;

	if (!rbd_dev->parent)
		return sprintf(buf, "(no parent image)\n");

	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
		struct rbd_spec *spec = rbd_dev->parent_spec;

		count += sprintf(&buf[count], "%s"
			    "pool_id %llu\npool_name %s\n"
			    "pool_ns %s\n"
			    "image_id %s\nimage_name %s\n"
			    "snap_id %llu\nsnap_name %s\n"
			    "overlap %llu\n",
			    !count ? "" : "\n", /* first? */
			    spec->pool_id, spec->pool_name,
			    spec->pool_ns ?: "",
			    spec->image_id, spec->image_name ?: "(unknown)",
			    spec->snap_id, spec->snap_name,
			    rbd_dev->parent_overlap);
	}

	return count;
}

static ssize_t rbd_image_refresh(struct device *dev,
				 struct device_attribute *attr,
				 const char *buf,
				 size_t size)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	ret = rbd_dev_refresh(rbd_dev);
	if (ret)
		return ret;

	return size;
}

static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
dfc5606d
YS
5181
5182static struct attribute *rbd_attrs[] = {
5183 &dev_attr_size.attr,
34b13184 5184 &dev_attr_features.attr,
dfc5606d 5185 &dev_attr_major.attr,
dd82fff1 5186 &dev_attr_minor.attr,
005a07bf 5187 &dev_attr_client_addr.attr,
dfc5606d 5188 &dev_attr_client_id.attr,
267fb90b 5189 &dev_attr_cluster_fsid.attr,
0d6d1e9c 5190 &dev_attr_config_info.attr,
dfc5606d 5191 &dev_attr_pool.attr,
9bb2f334 5192 &dev_attr_pool_id.attr,
b26c047b 5193 &dev_attr_pool_ns.attr,
dfc5606d 5194 &dev_attr_name.attr,
589d30e0 5195 &dev_attr_image_id.attr,
dfc5606d 5196 &dev_attr_current_snap.attr,
92a58671 5197 &dev_attr_snap_id.attr,
86b00e0d 5198 &dev_attr_parent.attr,
dfc5606d 5199 &dev_attr_refresh.attr,
dfc5606d
YS
5200 NULL
5201};
5202
5203static struct attribute_group rbd_attr_group = {
5204 .attrs = rbd_attrs,
5205};
5206
5207static const struct attribute_group *rbd_attr_groups[] = {
5208 &rbd_attr_group,
5209 NULL
5210};
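
/*
 * Illustrative note (assumption, not from the original source): with the
 * groups wired up into rbd_device_type below, a mapped device exposes
 * these attributes under /sys/bus/rbd/devices/<dev-id>/, e.g.
 *
 *   $ cat /sys/bus/rbd/devices/0/pool
 *   $ echo 1 > /sys/bus/rbd/devices/0/refresh	# triggers rbd_image_refresh()
 *
 * where <dev-id> matches the id assigned in rbd_dev_create().
 */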

static void rbd_dev_release(struct device *dev);

static const struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_dev_release,
};

static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
{
	kref_get(&spec->kref);

	return spec;
}

static void rbd_spec_free(struct kref *kref);
static void rbd_spec_put(struct rbd_spec *spec)
{
	if (spec)
		kref_put(&spec->kref, rbd_spec_free);
}

static struct rbd_spec *rbd_spec_alloc(void)
{
	struct rbd_spec *spec;

	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
	if (!spec)
		return NULL;

	spec->pool_id = CEPH_NOPOOL;
	spec->snap_id = CEPH_NOSNAP;
	kref_init(&spec->kref);

	return spec;
}

static void rbd_spec_free(struct kref *kref)
{
	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);

	kfree(spec->pool_name);
	kfree(spec->pool_ns);
	kfree(spec->image_id);
	kfree(spec->image_name);
	kfree(spec->snap_name);
	kfree(spec);
}
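
/*
 * Illustrative sketch (hypothetical helper, not part of the driver):
 * typical rbd_spec lifecycle under the kref rules above.
 */
static void __maybe_unused rbd_spec_lifecycle_example(void)
{
	struct rbd_spec *spec = rbd_spec_alloc();	/* refcount 1 */
	struct rbd_spec *extra;

	if (!spec)
		return;

	extra = rbd_spec_get(spec);	/* refcount 2, same object */
	rbd_spec_put(extra);		/* refcount back to 1 */
	rbd_spec_put(spec);		/* drops to 0, rbd_spec_free() runs */
}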

static void rbd_dev_free(struct rbd_device *rbd_dev)
{
	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);

	ceph_oid_destroy(&rbd_dev->header_oid);
	ceph_oloc_destroy(&rbd_dev->header_oloc);
	kfree(rbd_dev->config_info);

	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev->opts);
	kfree(rbd_dev);
}

static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	bool need_put = !!rbd_dev->opts;

	if (need_put) {
		destroy_workqueue(rbd_dev->task_wq);
		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
	}

	rbd_dev_free(rbd_dev);

	/*
	 * This is racy, but way better than putting module outside of
	 * the release callback.  The race window is pretty small, so
	 * doing something similar to dm (dm-builtin.c) is overkill.
	 */
	if (need_put)
		module_put(THIS_MODULE);
}

static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
					   struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	init_rwsem(&rbd_dev->header_rwsem);

	rbd_dev->header.data_pool_id = CEPH_NOPOOL;
	ceph_oid_init(&rbd_dev->header_oid);
	rbd_dev->header_oloc.pool = spec->pool_id;
	if (spec->pool_ns) {
		WARN_ON(!*spec->pool_ns);
		rbd_dev->header_oloc.pool_ns =
		    ceph_find_or_create_string(spec->pool_ns,
					       strlen(spec->pool_ns));
	}

	mutex_init(&rbd_dev->watch_mutex);
	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);

	init_rwsem(&rbd_dev->lock_rwsem);
	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
	spin_lock_init(&rbd_dev->lock_lists_lock);
	INIT_LIST_HEAD(&rbd_dev->acquiring_list);
	INIT_LIST_HEAD(&rbd_dev->running_list);
	init_completion(&rbd_dev->acquire_wait);
	init_completion(&rbd_dev->releasing_wait);

	spin_lock_init(&rbd_dev->object_map_lock);

	rbd_dev->dev.bus = &rbd_bus_type;
	rbd_dev->dev.type = &rbd_device_type;
	rbd_dev->dev.parent = &rbd_root_dev;
	device_initialize(&rbd_dev->dev);

	rbd_dev->rbd_client = rbdc;
	rbd_dev->spec = spec;

	return rbd_dev;
}

/*
 * Create a mapping rbd_dev.
 */
static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
					 struct rbd_spec *spec,
					 struct rbd_options *opts)
{
	struct rbd_device *rbd_dev;

	rbd_dev = __rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		return NULL;

	rbd_dev->opts = opts;

	/* get an id and fill in device name */
	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
					 minor_to_rbd_dev_id(1 << MINORBITS),
					 GFP_KERNEL);
	if (rbd_dev->dev_id < 0)
		goto fail_rbd_dev;

	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
						   rbd_dev->name);
	if (!rbd_dev->task_wq)
		goto fail_dev_id;

	/* we have a ref from do_rbd_add() */
	__module_get(THIS_MODULE);

	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
	return rbd_dev;

fail_dev_id:
	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
fail_rbd_dev:
	rbd_dev_free(rbd_dev);
	return NULL;
}

static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	if (rbd_dev)
		put_device(&rbd_dev->dev);
}

/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				 u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_size",
				  &snapid, sizeof(snapid),
				  &size_buf, sizeof(size_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < sizeof (size_buf))
		return -ERANGE;

	if (order) {
		*order = size_buf.order;
		dout(" order %u", (unsigned int)*order);
	}
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx snap_size = %llu\n",
		(unsigned long long)snap_id,
		(unsigned long long)*snap_size);

	return 0;
}
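
/*
 * Note (explanatory, not from the original source): rbd_obj_method_sync()
 * returns the number of reply bytes on success, which is why the helpers
 * in this file check for a negative errno first and then for a reply
 * shorter than the expected struct (-ERANGE above).
 */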

static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}

static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	size_t size;
	void *reply_buf;
	int ret;
	void *p;

	/* Response will be an encoded string, which includes a length */
	size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX;
	reply_buf = kzalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_object_prefix",
				  NULL, 0, reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + ret, NULL, GFP_NOIO);
	ret = 0;

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}
out:
	kfree(reply_buf);

	return ret;
}
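
/*
 * For reference (assumed ceph wire format, matching
 * ceph_extract_encoded_string() above): an encoded string is a
 * little-endian 32-bit length followed by that many bytes, so an object
 * prefix of "rbd_data.abc" would arrive as
 *
 *   0c 00 00 00  72 62 64 5f 64 61 74 61 2e 61 62 63
 *   (len = 12)   "rbd_data.abc"
 */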

static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				     bool read_only, u64 *snap_features)
{
	struct {
		__le64 snap_id;
		u8 read_only;
	} features_in;
	struct {
		__le64 features;
		__le64 incompat;
	} __attribute__ ((packed)) features_buf = { 0 };
	u64 unsup;
	int ret;

	features_in.snap_id = cpu_to_le64(snap_id);
	features_in.read_only = read_only;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_features",
				  &features_in, sizeof(features_in),
				  &features_buf, sizeof(features_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < sizeof (features_buf))
		return -ERANGE;

	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
	if (unsup) {
		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
			 unsup);
		return -ENXIO;
	}

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long)snap_id,
		(unsigned long long)*snap_features,
		(unsigned long long)le64_to_cpu(features_buf.incompat));

	return 0;
}

static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
					 rbd_is_ro(rbd_dev),
					 &rbd_dev->header.features);
}

/*
 * These are generic image flags, but since they are used only for
 * object map, store them in rbd_dev->object_map_flags.
 *
 * For the same reason, this function is called only on object map
 * (re)load and not on header refresh.
 */
static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev)
{
	__le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
	__le64 flags;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_flags",
				  &snapid, sizeof(snapid),
				  &flags, sizeof(flags));
	if (ret < 0)
		return ret;
	if (ret < sizeof(flags))
		return -EBADMSG;

	rbd_dev->object_map_flags = le64_to_cpu(flags);
	return 0;
}

struct parent_image_info {
	u64		pool_id;
	const char	*pool_ns;
	const char	*image_id;
	u64		snap_id;

	bool		has_overlap;
	u64		overlap;
};

/*
 * The caller is responsible for @pii.
 */
static int decode_parent_image_spec(void **p, void *end,
				    struct parent_image_info *pii)
{
	u8 struct_v;
	u32 struct_len;
	int ret;

	ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
				  &struct_v, &struct_len);
	if (ret)
		return ret;

	ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
	pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
	if (IS_ERR(pii->pool_ns)) {
		ret = PTR_ERR(pii->pool_ns);
		pii->pool_ns = NULL;
		return ret;
	}
	pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
	if (IS_ERR(pii->image_id)) {
		ret = PTR_ERR(pii->image_id);
		pii->image_id = NULL;
		return ret;
	}
	ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
	return 0;

e_inval:
	return -EINVAL;
}

static int __get_parent_info(struct rbd_device *rbd_dev,
			     struct page *req_page,
			     struct page *reply_page,
			     struct parent_image_info *pii)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	size_t reply_len = PAGE_SIZE;
	void *p, *end;
	int ret;

	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			     "rbd", "parent_get", CEPH_OSD_FLAG_READ,
			     req_page, sizeof(u64), &reply_page, &reply_len);
	if (ret)
		return ret == -EOPNOTSUPP ? 1 : ret;

	p = page_address(reply_page);
	end = p + reply_len;
	ret = decode_parent_image_spec(&p, end, pii);
	if (ret)
		return ret;

	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			     "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
			     req_page, sizeof(u64), &reply_page, &reply_len);
	if (ret)
		return ret;

	p = page_address(reply_page);
	end = p + reply_len;
	ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
	if (pii->has_overlap)
		ceph_decode_64_safe(&p, end, pii->overlap, e_inval);

	return 0;

e_inval:
	return -EINVAL;
}

/*
 * The caller is responsible for @pii.
 */
static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
				    struct page *req_page,
				    struct page *reply_page,
				    struct parent_image_info *pii)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	size_t reply_len = PAGE_SIZE;
	void *p, *end;
	int ret;

	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			     "rbd", "get_parent", CEPH_OSD_FLAG_READ,
			     req_page, sizeof(u64), &reply_page, &reply_len);
	if (ret)
		return ret;

	p = page_address(reply_page);
	end = p + reply_len;
	ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
	pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(pii->image_id)) {
		ret = PTR_ERR(pii->image_id);
		pii->image_id = NULL;
		return ret;
	}
	ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
	pii->has_overlap = true;
	ceph_decode_64_safe(&p, end, pii->overlap, e_inval);

	return 0;

e_inval:
	return -EINVAL;
}

static int get_parent_info(struct rbd_device *rbd_dev,
			   struct parent_image_info *pii)
{
	struct page *req_page, *reply_page;
	void *p;
	int ret;

	req_page = alloc_page(GFP_KERNEL);
	if (!req_page)
		return -ENOMEM;

	reply_page = alloc_page(GFP_KERNEL);
	if (!reply_page) {
		__free_page(req_page);
		return -ENOMEM;
	}

	p = page_address(req_page);
	ceph_encode_64(&p, rbd_dev->spec->snap_id);
	ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
	if (ret > 0)
		ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
					       pii);

	__free_page(req_page);
	__free_page(reply_page);
	return ret;
}
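
/*
 * Illustrative sketch (hypothetical helper, not part of the driver): the
 * caller of get_parent_info() owns the strings in @pii and must free
 * them, mirroring what rbd_dev_v2_parent_info() below does.
 */
static int __maybe_unused get_parent_info_example(struct rbd_device *rbd_dev)
{
	struct parent_image_info pii = { 0 };
	int ret;

	ret = get_parent_info(rbd_dev, &pii);
	if (!ret)
		dout("parent pool_id %llu overlap %llu\n", pii.pool_id,
		     pii.overlap);

	kfree(pii.pool_ns);	/* may be NULL; kfree(NULL) is a no-op */
	kfree(pii.image_id);
	return ret;
}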

static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	struct parent_image_info pii = { 0 };
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	ret = get_parent_info(rbd_dev, &pii);
	if (ret)
		goto out_err;

	dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
	     __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
	     pii.has_overlap, pii.overlap);

	if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
		/*
		 * Either the parent never existed, or we have a
		 * record of it but the image got flattened so it no
		 * longer has a parent.  When the parent of a
		 * layered image disappears we immediately set the
		 * overlap to 0.  The effect of this is that all new
		 * requests will be treated as if the image had no
		 * parent.
		 *
		 * If !pii.has_overlap, the parent image spec is not
		 * applicable.  It's there to avoid duplication in each
		 * snapshot record.
		 */
		if (rbd_dev->parent_overlap) {
			rbd_dev->parent_overlap = 0;
			rbd_dev_parent_put(rbd_dev);
			pr_info("%s: clone image has been flattened\n",
				rbd_dev->disk->disk_name);
		}

		goto out;	/* No parent?  No problem. */
	}

	/* The ceph file layout needs to fit pool id in 32 bits */

	ret = -EIO;
	if (pii.pool_id > (u64)U32_MAX) {
		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
			 (unsigned long long)pii.pool_id, U32_MAX);
		goto out_err;
	}

	/*
	 * The parent won't change (except when the clone is
	 * flattened, which is handled above).  So we only need to
	 * record the parent spec if we haven't already done so.
	 */
	if (!rbd_dev->parent_spec) {
		parent_spec->pool_id = pii.pool_id;
		if (pii.pool_ns && *pii.pool_ns) {
			parent_spec->pool_ns = pii.pool_ns;
			pii.pool_ns = NULL;
		}
		parent_spec->image_id = pii.image_id;
		pii.image_id = NULL;
		parent_spec->snap_id = pii.snap_id;

		rbd_dev->parent_spec = parent_spec;
		parent_spec = NULL;	/* rbd_dev now owns this */
	}

	/*
	 * We always update the parent overlap.  If it's zero we issue
	 * a warning, as we will proceed as if there was no parent.
	 */
	if (!pii.overlap) {
		if (parent_spec) {
			/* refresh, careful to warn just once */
			if (rbd_dev->parent_overlap)
				rbd_warn(rbd_dev,
				    "clone now standalone (overlap became 0)");
		} else {
			/* initial probe */
			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
		}
	}
	rbd_dev->parent_overlap = pii.overlap;

out:
	ret = 0;
out_err:
	kfree(pii.pool_ns);
	kfree(pii.image_id);
	rbd_spec_put(parent_spec);
	return ret;
}

static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
{
	struct {
		__le64 stripe_unit;
		__le64 stripe_count;
	} __attribute__ ((packed)) striping_info_buf = { 0 };
	size_t size = sizeof (striping_info_buf);
	void *p;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				&rbd_dev->header_oloc, "get_stripe_unit_count",
				NULL, 0, &striping_info_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < size)
		return -ERANGE;

	p = &striping_info_buf;
	rbd_dev->header.stripe_unit = ceph_decode_64(&p);
	rbd_dev->header.stripe_count = ceph_decode_64(&p);
	return 0;
}

static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
{
	__le64 data_pool_id;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_data_pool",
				  NULL, 0, &data_pool_id, sizeof(data_pool_id));
	if (ret < 0)
		return ret;
	if (ret < sizeof(data_pool_id))
		return -EBADMSG;

	rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
	WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
	return 0;
}

static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	CEPH_DEFINE_OID_ONSTACK(oid);
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	len = strlen(rbd_dev->spec->image_id);
	image_id_size = sizeof (__le32) + len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
				  "dir_get_name", image_id, image_id_size,
				  reply_buf, size);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = reply_buf + ret;

	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}

static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const char *snap_name;
	u32 which = 0;

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which < snapc->num_snaps) {
		if (!strcmp(name, snap_name))
			return snapc->snaps[which];
		snap_name += strlen(snap_name) + 1;
		which++;
	}
	return CEPH_NOSNAP;
}

static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u32 which;
	bool found = false;
	u64 snap_id;

	for (which = 0; !found && which < snapc->num_snaps; which++) {
		const char *snap_name;

		snap_id = snapc->snaps[which];
		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
		if (IS_ERR(snap_name)) {
			/* ignore no-longer existing snapshots */
			if (PTR_ERR(snap_name) == -ENOENT)
				continue;
			else
				break;
		}
		found = !strcmp(name, snap_name);
		kfree(snap_name);
	}
	return found ? snap_id : CEPH_NOSNAP;
}

/*
 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
 * no snapshot by that name is found, or if an error occurs.
 */
static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
{
	if (rbd_dev->image_format == 1)
		return rbd_v1_snap_id_by_name(rbd_dev, name);

	return rbd_v2_snap_id_by_name(rbd_dev, name);
}

/*
 * An image being mapped will have everything but the snap id.
 */
static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
{
	struct rbd_spec *spec = rbd_dev->spec;

	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
	rbd_assert(spec->image_id && spec->image_name);
	rbd_assert(spec->snap_name);

	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
		u64 snap_id;

		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
		if (snap_id == CEPH_NOSNAP)
			return -ENOENT;

		spec->snap_id = snap_id;
	} else {
		spec->snap_id = CEPH_NOSNAP;
	}

	return 0;
}

/*
 * A parent image will have all ids but none of the names.
 *
 * All names in an rbd spec are dynamically allocated.  It's OK if we
 * can't figure out the name for an image id.
 */
static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_spec *spec = rbd_dev->spec;
	const char *pool_name;
	const char *image_name;
	const char *snap_name;
	int ret;

	rbd_assert(spec->pool_id != CEPH_NOPOOL);
	rbd_assert(spec->image_id);
	rbd_assert(spec->snap_id != CEPH_NOSNAP);

	/* Get the pool name; we have to make our own copy of this */

	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
	if (!pool_name) {
		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
		return -EIO;
	}
	pool_name = kstrdup(pool_name, GFP_KERNEL);
	if (!pool_name)
		return -ENOMEM;

	/* Fetch the image name; tolerate failure here */

	image_name = rbd_dev_image_name(rbd_dev);
	if (!image_name)
		rbd_warn(rbd_dev, "unable to get image name");

	/* Fetch the snapshot name */

	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out_err;
	}

	spec->pool_name = pool_name;
	spec->image_name = image_name;
	spec->snap_name = snap_name;

	return 0;

out_err:
	kfree(image_name);
	kfree(pool_name);
	return ret;
}

static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	void *reply_buf;
	void *p;
	void *end;
	u64 seq;
	u32 snap_count;
	struct ceph_snap_context *snapc;
	u32 i;

	/*
	 * We'll need room for the seq value (maximum snapshot id),
	 * snapshot count, and array of that many snapshot ids.
	 * For now we have a fixed upper limit on the number we're
	 * prepared to receive.
	 */
	size = sizeof (__le64) + sizeof (__le32) +
			RBD_MAX_SNAP_COUNT * sizeof (__le64);
	reply_buf = kzalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_snapcontext",
				  NULL, 0, reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	end = reply_buf + ret;
	ret = -ERANGE;
	ceph_decode_64_safe(&p, end, seq, out);
	ceph_decode_32_safe(&p, end, snap_count, out);

	/*
	 * Make sure the reported number of snapshot ids wouldn't go
	 * beyond the end of our buffer.  But before checking that,
	 * make sure the computed size of the snapshot context we
	 * allocate is representable in a size_t.
	 */
	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
				 / sizeof (u64)) {
		ret = -EINVAL;
		goto out;
	}
	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
		goto out;
	ret = 0;

	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc) {
		ret = -ENOMEM;
		goto out;
	}
	snapc->seq = seq;
	for (i = 0; i < snap_count; i++)
		snapc->snaps[i] = ceph_decode_64(&p);

	ceph_put_snap_context(rbd_dev->header.snapc);
	rbd_dev->header.snapc = snapc;

	dout(" snap context seq = %llu, snap_count = %u\n",
		(unsigned long long)seq, (unsigned int)snap_count);
out:
	kfree(reply_buf);

	return ret;
}
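
/*
 * For reference (assumed reply layout, matching the decoding above): the
 * get_snapcontext reply is
 *
 *   le64 seq; le32 snap_count; le64 snaps[snap_count];
 *
 * with snapshot ids ordered newest first (descending), and snap_count
 * hard-capped by the RBD_MAX_SNAP_COUNT-based size estimate.
 */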

static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	size_t size;
	void *reply_buf;
	__le64 snapid;
	int ret;
	void *p;
	void *end;
	char *snap_name;

	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snapid = cpu_to_le64(snap_id);
	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_snapshot_name",
				  &snapid, sizeof(snapid), reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0) {
		snap_name = ERR_PTR(ret);
		goto out;
	}

	p = reply_buf;
	end = reply_buf + ret;
	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(snap_name))
		goto out;

	dout(" snap_id 0x%016llx snap_name = %s\n",
		(unsigned long long)snap_id, snap_name);
out:
	kfree(reply_buf);

	return snap_name;
}

static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
{
	bool first_time = rbd_dev->header.object_prefix == NULL;
	int ret;

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret)
		return ret;

	if (first_time) {
		ret = rbd_dev_v2_header_onetime(rbd_dev);
		if (ret)
			return ret;
	}

	ret = rbd_dev_v2_snap_context(rbd_dev);
	if (ret && first_time) {
		kfree(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	}

	return ret;
}

static int rbd_dev_header_info(struct rbd_device *rbd_dev)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));

	if (rbd_dev->image_format == 1)
		return rbd_dev_v1_header_info(rbd_dev);

	return rbd_dev_v2_header_info(rbd_dev);
}

/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	const char *spaces = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* Find start of token */

	return strcspn(*buf, spaces);	/* Return token length */
}
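
/*
 * Example (illustrative, hypothetical input): for buf pointing at
 * "  1.2.3.4:6789 rbd foo", next_token() advances buf past the leading
 * spaces to "1.2.3.4:6789 rbd foo" and returns 12, the length of the
 * first token.
 */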

/*
 * Finds the next token in *buf, dynamically allocates a buffer big
 * enough to hold a copy of it, and copies the token into the new
 * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
 * that a duplicate buffer is created even for a zero-length token.
 *
 * Returns a pointer to the newly-allocated duplicate, or a null
 * pointer if memory for the duplicate was not available.  If
 * the lenp argument is a non-null pointer, the length of the token
 * (not including the '\0') is returned in *lenp.
 *
 * If successful, the *buf pointer will be updated to point beyond
 * the end of the found token.
 *
 * Note: uses GFP_KERNEL for allocation.
 */
static inline char *dup_token(const char **buf, size_t *lenp)
{
	char *dup;
	size_t len;

	len = next_token(buf);
	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
	if (!dup)
		return NULL;
	*(dup + len) = '\0';
	*buf += len;

	if (lenp)
		*lenp = len;

	return dup;
}

static int rbd_parse_param(struct fs_parameter *param,
			   struct rbd_parse_opts_ctx *pctx)
{
	struct rbd_options *opt = pctx->opts;
	struct fs_parse_result result;
	struct p_log log = {.prefix = "rbd"};
	int token, ret;

	ret = ceph_parse_param(param, pctx->copts, NULL);
	if (ret != -ENOPARAM)
		return ret;

	token = __fs_parse(&log, rbd_parameters, param, &result);
	dout("%s fs_parse '%s' token %d\n", __func__, param->key, token);
	if (token < 0) {
		if (token == -ENOPARAM)
			return inval_plog(&log, "Unknown parameter '%s'",
					  param->key);
		return token;
	}

	switch (token) {
	case Opt_queue_depth:
		if (result.uint_32 < 1)
			goto out_of_range;
		opt->queue_depth = result.uint_32;
		break;
	case Opt_alloc_size:
		if (result.uint_32 < SECTOR_SIZE)
			goto out_of_range;
		if (!is_power_of_2(result.uint_32))
			return inval_plog(&log, "alloc_size must be a power of 2");
		opt->alloc_size = result.uint_32;
		break;
	case Opt_lock_timeout:
		/* 0 is "wait forever" (i.e. infinite timeout) */
		if (result.uint_32 > INT_MAX / 1000)
			goto out_of_range;
		opt->lock_timeout = msecs_to_jiffies(result.uint_32 * 1000);
		break;
	case Opt_pool_ns:
		kfree(pctx->spec->pool_ns);
		pctx->spec->pool_ns = param->string;
		param->string = NULL;
		break;
	case Opt_compression_hint:
		switch (result.uint_32) {
		case Opt_compression_hint_none:
			opt->alloc_hint_flags &=
			    ~(CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE |
			      CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE);
			break;
		case Opt_compression_hint_compressible:
			opt->alloc_hint_flags |=
			    CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE;
			opt->alloc_hint_flags &=
			    ~CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE;
			break;
		case Opt_compression_hint_incompressible:
			opt->alloc_hint_flags |=
			    CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE;
			opt->alloc_hint_flags &=
			    ~CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE;
			break;
		default:
			BUG();
		}
		break;
	case Opt_read_only:
		opt->read_only = true;
		break;
	case Opt_read_write:
		opt->read_only = false;
		break;
	case Opt_lock_on_read:
		opt->lock_on_read = true;
		break;
	case Opt_exclusive:
		opt->exclusive = true;
		break;
	case Opt_notrim:
		opt->trim = false;
		break;
	default:
		BUG();
	}

	return 0;

out_of_range:
	return inval_plog(&log, "%s out of range", param->key);
}

/*
 * This duplicates most of generic_parse_monolithic(), untying it from
 * fs_context and skipping standard superblock and security options.
 */
static int rbd_parse_options(char *options, struct rbd_parse_opts_ctx *pctx)
{
	char *key;
	int ret = 0;

	dout("%s '%s'\n", __func__, options);
	while ((key = strsep(&options, ",")) != NULL) {
		if (*key) {
			struct fs_parameter param = {
				.key	= key,
				.type	= fs_value_is_flag,
			};
			char *value = strchr(key, '=');
			size_t v_len = 0;

			if (value) {
				if (value == key)
					continue;
				*value++ = 0;
				v_len = strlen(value);
				param.string = kmemdup_nul(value, v_len,
							   GFP_KERNEL);
				if (!param.string)
					return -ENOMEM;
				param.type = fs_value_is_string;
			}
			param.size = v_len;

			ret = rbd_parse_param(&param, pctx);
			kfree(param.string);
			if (ret)
				break;
		}
	}

	return ret;
}
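
/*
 * Example (illustrative, hypothetical input): options =
 * "read_only,queue_depth=16" is split on ',' into "read_only" (a flag
 * parameter) and "queue_depth" with the string value "16"; each piece is
 * handed to rbd_parse_param() above.
 */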

/*
 * Parse the options provided for an "rbd add" (i.e., rbd image
 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
 * and the data written is passed here via a NUL-terminated buffer.
 * Returns 0 if successful or an error code otherwise.
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *      The address of a pointer that will refer to a ceph options
 *      structure.  Caller must release the returned pointer using
 *      ceph_destroy_options() when it is no longer needed.
 *  rbd_opts
 *	Address of an rbd options pointer.  Fully initialized by
 *	this function; caller must release with kfree().
 *  spec
 *	Address of an rbd image specification pointer.  Fully
 *	initialized by this function based on parsed options.
 *	Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_id>
 *      An optional snapshot id.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot id is
 *      provided.  Snapshot mappings are always read-only.
 */
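
/*
 * Example (hypothetical values): writing
 *
 *   1.2.3.4:6789 name=admin,read_only rbd foo snap1
 *
 * to /sys/bus/rbd/add would map snapshot "snap1" of image "foo" in pool
 * "rbd", using monitor 1.2.3.4:6789, authenticating as client "admin",
 * with the mapping forced read-only.
 */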
static int rbd_add_parse_args(const char *buf,
			      struct ceph_options **ceph_opts,
			      struct rbd_options **opts,
			      struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	char *snap_name;
	size_t mon_addrs_size;
	struct rbd_parse_opts_ctx pctx = { 0 };
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	mon_addrs = buf;
	mon_addrs_size = len;
	buf += len;

	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	pctx.spec = rbd_spec_alloc();
	if (!pctx.spec)
		goto out_mem;

	pctx.spec->pool_name = dup_token(&buf, NULL);
	if (!pctx.spec->pool_name)
		goto out_mem;
	if (!*pctx.spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	pctx.spec->image_name = dup_token(&buf, NULL);
	if (!pctx.spec->image_name)
		goto out_mem;
	if (!*pctx.spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_mem;
	*(snap_name + len) = '\0';
	pctx.spec->snap_name = snap_name;

	pctx.copts = ceph_alloc_options();
	if (!pctx.copts)
		goto out_mem;

	/* Initialize all rbd options to the defaults */

	pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
	if (!pctx.opts)
		goto out_mem;

	pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
	pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
	pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
	pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
	pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
	pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
	pctx.opts->trim = RBD_TRIM_DEFAULT;

	ret = ceph_parse_mon_ips(mon_addrs, mon_addrs_size, pctx.copts, NULL);
	if (ret)
		goto out_err;

	ret = rbd_parse_options(options, &pctx);
	if (ret)
		goto out_err;

	*ceph_opts = pctx.copts;
	*opts = pctx.opts;
	*rbd_spec = pctx.spec;
	kfree(options);
	return 0;

out_mem:
	ret = -ENOMEM;
out_err:
	kfree(pctx.opts);
	ceph_destroy_options(pctx.copts);
	rbd_spec_put(pctx.spec);
	kfree(options);
	return ret;
}

static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
{
	down_write(&rbd_dev->lock_rwsem);
	if (__rbd_is_lock_owner(rbd_dev))
		__rbd_release_lock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);
}

/*
 * If the wait is interrupted, an error is returned even if the lock
 * was successfully acquired.  rbd_dev_image_unlock() will release it
 * if needed.
 */
static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
{
	long ret;

	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
		if (!rbd_dev->opts->exclusive && !rbd_dev->opts->lock_on_read)
			return 0;

		rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
		return -EINVAL;
	}

	if (rbd_is_ro(rbd_dev))
		return 0;

	rbd_assert(!rbd_is_lock_owner(rbd_dev));
	queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
	ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait,
			    ceph_timeout_jiffies(rbd_dev->opts->lock_timeout));
	if (ret > 0) {
		ret = rbd_dev->acquire_err;
	} else {
		cancel_delayed_work_sync(&rbd_dev->lock_dwork);
		if (!ret)
			ret = -ETIMEDOUT;
	}

	if (ret) {
		rbd_warn(rbd_dev, "failed to acquire exclusive lock: %ld", ret);
		return ret;
	}

	/*
	 * The lock may have been released by now, unless automatic lock
	 * transitions are disabled.
	 */
	rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev));
	return 0;
}
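
/*
 * Note (explanatory, not from the original source):
 * wait_for_completion_killable_timeout() returns the remaining jiffies
 * (> 0) on completion, 0 on timeout and -ERESTARTSYS if a fatal signal
 * arrived, so only the ret > 0 branch above consults
 * rbd_dev->acquire_err.
 */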

/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	CEPH_DEFINE_OID_ONSTACK(oid);
	void *response;
	char *image_id;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.  We
	 * do still need to set the image format though.
	 */
	if (rbd_dev->spec->image_id) {
		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;

		return 0;
	}

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
			       rbd_dev->spec->image_name);
	if (ret)
		return ret;

	dout("rbd id object name is %s\n", oid.name);

	/* Response will be an encoded string, which includes a length */
	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* If it doesn't exist we'll assume it's a format 1 image */

	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
				  "get_id", NULL, 0,
				  response, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret == -ENOENT) {
		image_id = kstrdup("", GFP_KERNEL);
		ret = image_id ? 0 : -ENOMEM;
		if (!ret)
			rbd_dev->image_format = 1;
	} else if (ret >= 0) {
		void *p = response;

		image_id = ceph_extract_encoded_string(&p, p + ret,
						NULL, GFP_NOIO);
		ret = PTR_ERR_OR_ZERO(image_id);
		if (!ret)
			rbd_dev->image_format = 2;
	}

	if (!ret) {
		rbd_dev->spec->image_id = image_id;
		dout("image_id is %s\n", image_id);
	}
out:
	kfree(response);
	ceph_oid_destroy(&oid);
	return ret;
}

/*
 * Undo whatever state changes are made by v1 or v2 header info
 * call.
 */
static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
{
	struct rbd_image_header *header;

	rbd_dev_parent_put(rbd_dev);
	rbd_object_map_free(rbd_dev);
	rbd_dev_mapping_clear(rbd_dev);

	/* Free dynamic fields from the header, then zero it out */

	header = &rbd_dev->header;
	ceph_put_snap_context(header->snapc);
	kfree(header->snap_sizes);
	kfree(header->snap_names);
	kfree(header->object_prefix);
	memset(header, 0, sizeof (*header));
}

static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
{
	int ret;

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret)
		goto out_err;

	/*
	 * Get and check features for the image.  Currently the
	 * features are assumed to never change.
	 */
b1b5402a 6693 ret = rbd_dev_v2_features(rbd_dev);
57385b51 6694 if (ret)
9d475de5 6695 goto out_err;
35d489f9 6696
cc070d59
AE
6697 /* If the image supports fancy striping, get its parameters */
6698
6699 if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
6700 ret = rbd_dev_v2_striping_info(rbd_dev);
6701 if (ret < 0)
6702 goto out_err;
6703 }
a30b71b9 6704
7e97332e
ID
6705 if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
6706 ret = rbd_dev_v2_data_pool(rbd_dev);
6707 if (ret)
6708 goto out_err;
6709 }
6710
263423f8 6711 rbd_init_layout(rbd_dev);
35152979 6712 return 0;
263423f8 6713
9d475de5 6714out_err:
642a2537 6715 rbd_dev->header.features = 0;
1e130199
AE
6716 kfree(rbd_dev->header.object_prefix);
6717 rbd_dev->header.object_prefix = NULL;
9d475de5 6718 return ret;
a30b71b9
AE
6719}
6720
6d69bb53
ID
6721/*
6722 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
6723 * rbd_dev_image_probe() recursion depth, which means it's also the
6724 * length of the already discovered part of the parent chain.
6725 */
6726static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
83a06263 6727{
2f82ee54 6728 struct rbd_device *parent = NULL;
124afba2
AE
6729 int ret;
6730
6731 if (!rbd_dev->parent_spec)
6732 return 0;
124afba2 6733
6d69bb53
ID
6734 if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
6735 pr_info("parent chain is too long (%d)\n", depth);
6736 ret = -EINVAL;
6737 goto out_err;
6738 }
6739
1643dfa4 6740 parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
1f2c6651
ID
6741 if (!parent) {
6742 ret = -ENOMEM;
124afba2 6743 goto out_err;
1f2c6651
ID
6744 }
6745
6746 /*
6747 * Images related by parent/child relationships always share
6748 * rbd_client and spec/parent_spec, so bump their refcounts.
6749 */
6750 __rbd_get_client(rbd_dev->rbd_client);
6751 rbd_spec_get(rbd_dev->parent_spec);
124afba2 6752
39258aa2
ID
6753 __set_bit(RBD_DEV_FLAG_READONLY, &parent->flags);
6754
6d69bb53 6755 ret = rbd_dev_image_probe(parent, depth);
124afba2
AE
6756 if (ret < 0)
6757 goto out_err;
1f2c6651 6758
124afba2 6759 rbd_dev->parent = parent;
a2acd00e 6760 atomic_set(&rbd_dev->parent_ref, 1);
124afba2 6761 return 0;
1f2c6651 6762
124afba2 6763out_err:
1f2c6651 6764 rbd_dev_unparent(rbd_dev);
1761b229 6765 rbd_dev_destroy(parent);
124afba2
AE
6766 return ret;
6767}
6768
5769ed0c
ID
6769static void rbd_dev_device_release(struct rbd_device *rbd_dev)
6770{
6771 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5769ed0c
ID
6772 rbd_free_disk(rbd_dev);
6773 if (!single_major)
6774 unregister_blkdev(rbd_dev->major, rbd_dev->name);
6775}
6776
811c6688
ID
6777/*
6778 * rbd_dev->header_rwsem must be locked for write and will be unlocked
6779 * upon return.
6780 */
200a6a8b 6781static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
124afba2 6782{
83a06263 6783 int ret;
d1cf5788 6784
9b60e70b 6785 /* Record our major and minor device numbers. */
83a06263 6786
9b60e70b
ID
6787 if (!single_major) {
6788 ret = register_blkdev(0, rbd_dev->name);
6789 if (ret < 0)
1643dfa4 6790 goto err_out_unlock;
9b60e70b
ID
6791
6792 rbd_dev->major = ret;
6793 rbd_dev->minor = 0;
6794 } else {
6795 rbd_dev->major = rbd_major;
6796 rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
6797 }
83a06263
AE
6798
6799 /* Set up the blkdev mapping. */
6800
6801 ret = rbd_init_disk(rbd_dev);
6802 if (ret)
6803 goto err_out_blkdev;
6804
f35a4dee 6805 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
39258aa2 6806 set_disk_ro(rbd_dev->disk, rbd_is_ro(rbd_dev));
f35a4dee 6807
5769ed0c 6808 ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
f35a4dee 6809 if (ret)
da5ef6be 6810 goto err_out_disk;
83a06263 6811
129b79d4 6812 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
811c6688 6813 up_write(&rbd_dev->header_rwsem);
5769ed0c 6814 return 0;
2f82ee54 6815
83a06263
AE
6816err_out_disk:
6817 rbd_free_disk(rbd_dev);
6818err_out_blkdev:
9b60e70b
ID
6819 if (!single_major)
6820 unregister_blkdev(rbd_dev->major, rbd_dev->name);
811c6688
ID
6821err_out_unlock:
6822 up_write(&rbd_dev->header_rwsem);
83a06263
AE
6823 return ret;
6824}
6825
332bb12d
AE
6826static int rbd_dev_header_name(struct rbd_device *rbd_dev)
6827{
6828 struct rbd_spec *spec = rbd_dev->spec;
c41d13a3 6829 int ret;
332bb12d
AE
6830
6831 /* Record the header object name for this rbd image. */
6832
6833 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
332bb12d 6834 if (rbd_dev->image_format == 1)
c41d13a3
ID
6835 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6836 spec->image_name, RBD_SUFFIX);
332bb12d 6837 else
c41d13a3
ID
6838 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6839 RBD_HEADER_PREFIX, spec->image_id);
332bb12d 6840
c41d13a3 6841 return ret;
332bb12d
AE
6842}
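
/*
 * For reference (header object naming, as constructed above, with a
 * hypothetical image): a v1 image named "foo" has header object
 * "foo.rbd", while a v2 image with id "abc123" has header object
 * "rbd_header.abc123".
 */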
6843
b9ef2b88
ID
6844static void rbd_print_dne(struct rbd_device *rbd_dev, bool is_snap)
6845{
6846 if (!is_snap) {
6847 pr_info("image %s/%s%s%s does not exist\n",
6848 rbd_dev->spec->pool_name,
6849 rbd_dev->spec->pool_ns ?: "",
6850 rbd_dev->spec->pool_ns ? "/" : "",
6851 rbd_dev->spec->image_name);
6852 } else {
6853 pr_info("snap %s/%s%s%s@%s does not exist\n",
6854 rbd_dev->spec->pool_name,
6855 rbd_dev->spec->pool_ns ?: "",
6856 rbd_dev->spec->pool_ns ? "/" : "",
6857 rbd_dev->spec->image_name,
6858 rbd_dev->spec->snap_name);
6859 }
6860}
6861
200a6a8b
AE
6862static void rbd_dev_image_release(struct rbd_device *rbd_dev)
6863{
b8776051 6864 if (!rbd_is_ro(rbd_dev))
fd22aef8 6865 rbd_unregister_watch(rbd_dev);
952c48b0
ID
6866
6867 rbd_dev_unprobe(rbd_dev);
6fd48b3b
AE
6868 rbd_dev->image_format = 0;
6869 kfree(rbd_dev->spec->image_id);
6870 rbd_dev->spec->image_id = NULL;
200a6a8b
AE
6871}
6872
a30b71b9
AE
6873/*
6874 * Probe for the existence of the header object for the given rbd
1f3ef788
AE
6875 * device. If this image is the one being mapped (i.e., not a
6876 * parent), initiate a watch on its header object before using that
6877 * object to get detailed information about the rbd image.
0e4e1de5
ID
6878 *
6879 * On success, returns with header_rwsem held for write if called
6880 * with @depth == 0.
a30b71b9 6881 */
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
{
	bool need_watch = !rbd_is_ro(rbd_dev);
	int ret;

	/*
	 * Get the id from the image id object.  Unless there's an
	 * error, rbd_dev->spec->image_id will be filled in with
	 * a dynamically-allocated string, and rbd_dev->image_format
	 * will be set to either 1 or 2.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_header_name(rbd_dev);
	if (ret)
		goto err_out_format;

	if (need_watch) {
		ret = rbd_register_watch(rbd_dev);
		if (ret) {
			if (ret == -ENOENT)
				rbd_print_dne(rbd_dev, false);
			goto err_out_format;
		}
	}

	if (!depth)
		down_write(&rbd_dev->header_rwsem);

	ret = rbd_dev_header_info(rbd_dev);
	if (ret) {
		if (ret == -ENOENT && !need_watch)
			rbd_print_dne(rbd_dev, false);
		goto err_out_probe;
	}

	/*
	 * If this image is the one being mapped, we have pool name and
	 * id, image name and id, and snap name - need to fill snap id.
	 * Otherwise this is a parent image, identified by pool, image
	 * and snap ids - need to fill in names for those ids.
	 */
	if (!depth)
		ret = rbd_spec_fill_snap_id(rbd_dev);
	else
		ret = rbd_spec_fill_names(rbd_dev);
	if (ret) {
		if (ret == -ENOENT)
			rbd_print_dne(rbd_dev, true);
		goto err_out_probe;
	}

	ret = rbd_dev_mapping_set(rbd_dev);
	if (ret)
		goto err_out_probe;

	if (rbd_is_snap(rbd_dev) &&
	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) {
		ret = rbd_object_map_load(rbd_dev);
		if (ret)
			goto err_out_probe;
	}

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto err_out_probe;
	}

	ret = rbd_dev_probe_parent(rbd_dev, depth);
	if (ret)
		goto err_out_probe;

	dout("discovered format %u image, header name is %s\n",
		rbd_dev->image_format, rbd_dev->header_oid.name);
	return 0;

err_out_probe:
	if (!depth)
		up_write(&rbd_dev->header_rwsem);
	if (need_watch)
		rbd_unregister_watch(rbd_dev);
	rbd_dev_unprobe(rbd_dev);
err_out_format:
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;
	return ret;
}

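/*
 * Handle a write to /sys/bus/rbd/add (or add_single_major): parse the
 * one-line mapping description, connect to the cluster, probe the
 * image and, on success, announce the new /dev/rbd<N> disk.
 *
 * Illustrative usage only -- the monitor address, credentials, pool
 * and image names below are made up; see
 * Documentation/ABI/testing/sysfs-bus-rbd for the exact format:
 *
 *   $ echo "192.168.0.1:6789 name=admin,secret=<key> rbd foo" \
 *         > /sys/bus/rbd/add
 */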
static ssize_t do_rbd_add(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	int rc;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto out;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}

	/* pick the pool */
	rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
	if (rc < 0) {
		if (rc == -ENOENT)
			pr_info("pool %s does not exist\n", spec->pool_name);
		goto err_out_client;
	}
	spec->pool_id = (u64)rc;

	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
	if (!rbd_dev) {
		rc = -ENOMEM;
		goto err_out_client;
	}
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */
	rbd_opts = NULL;	/* rbd_dev now owns this */

	/* if we are mapping a snapshot it will be a read-only mapping */
	if (rbd_dev->opts->read_only ||
	    strcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME))
		__set_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);

	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
	if (!rbd_dev->config_info) {
		rc = -ENOMEM;
		goto err_out_rbd_dev;
	}

	rc = rbd_dev_image_probe(rbd_dev, 0);
	if (rc < 0)
		goto err_out_rbd_dev;

	if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
		rbd_warn(rbd_dev, "alloc_size adjusted to %u",
			 rbd_dev->layout.object_size);
		rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
	}

	rc = rbd_dev_device_setup(rbd_dev);
	if (rc)
		goto err_out_image_probe;

	rc = rbd_add_acquire_lock(rbd_dev);
	if (rc)
		goto err_out_image_lock;

	/* Everything's ready.  Announce the disk to the world. */

	rc = device_add(&rbd_dev->dev);
	if (rc)
		goto err_out_image_lock;

	rc = device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL);
	if (rc)
		goto err_out_cleanup_disk;

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);

	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
		rbd_dev->header.features);
	rc = count;
out:
	module_put(THIS_MODULE);
	return rc;

err_out_cleanup_disk:
	rbd_free_disk(rbd_dev);
err_out_image_lock:
	rbd_dev_image_unlock(rbd_dev);
	rbd_dev_device_release(rbd_dev);
err_out_image_probe:
	rbd_dev_image_release(rbd_dev);
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	rbd_spec_put(spec);
	kfree(rbd_opts);
	goto out;
}

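/*
 * With the single_major module parameter set, plain "add" is disabled
 * and mappings must go through add_single_major, so that all rbd
 * devices share a single block-device major.
 */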
static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
{
	if (single_major)
		return -EINVAL;

	return do_rbd_add(bus, buf, count);
}

static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
				      size_t count)
{
	return do_rbd_add(bus, buf, count);
}

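/*
 * Tear down a layered image's parent chain.  Each pass walks to the
 * deepest ancestor (the parent with no grandparent) and releases it,
 * so a parent is only torn down after all of its descendants.
 */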
static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
{
	while (rbd_dev->parent) {
		struct rbd_device *first = rbd_dev;
		struct rbd_device *second = first->parent;
		struct rbd_device *third;

		/*
		 * Follow to the parent with no grandparent and
		 * remove it.
		 */
		while (second && (third = second->parent)) {
			first = second;
			second = third;
		}
		rbd_assert(second);
		rbd_dev_image_release(second);
		rbd_dev_destroy(second);
		first->parent = NULL;
		first->parent_overlap = 0;

		rbd_assert(first->parent_spec);
		rbd_spec_put(first->parent_spec);
		first->parent_spec = NULL;
	}
}

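/*
 * Handle a write to /sys/bus/rbd/remove (or remove_single_major).
 * The buffer carries a device id plus an optional "force" flag, e.g.
 * (device id made up):
 *
 *   $ echo "2 force" > /sys/bus/rbd/remove
 *
 * Without "force" an open device returns -EBUSY; with it, the queue
 * is marked dying and the device is torn down even while open.
 */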
static ssize_t do_rbd_remove(struct bus_type *bus,
			     const char *buf,
			     size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct list_head *tmp;
	int dev_id;
	char opt_buf[6];
	bool force = false;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	dev_id = -1;
	opt_buf[0] = '\0';
	sscanf(buf, "%d %5s", &dev_id, opt_buf);
	if (dev_id < 0) {
		pr_err("dev_id out of range\n");
		return -EINVAL;
	}
	if (opt_buf[0] != '\0') {
		if (!strcmp(opt_buf, "force")) {
			force = true;
		} else {
			pr_err("bad remove option at '%s'\n", opt_buf);
			return -EINVAL;
		}
	}

	ret = -ENOENT;
	spin_lock(&rbd_dev_list_lock);
	list_for_each(tmp, &rbd_dev_list) {
		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id == dev_id) {
			ret = 0;
			break;
		}
	}
	if (!ret) {
		spin_lock_irq(&rbd_dev->lock);
		if (rbd_dev->open_count && !force)
			ret = -EBUSY;
		else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
					  &rbd_dev->flags))
			ret = -EINPROGRESS;
		spin_unlock_irq(&rbd_dev->lock);
	}
	spin_unlock(&rbd_dev_list_lock);
	if (ret)
		return ret;

	if (force) {
		/*
		 * Prevent new IO from being queued and wait for existing
		 * IO to complete/fail.
		 */
		blk_mq_freeze_queue(rbd_dev->disk->queue);
		blk_set_queue_dying(rbd_dev->disk->queue);
	}

	del_gendisk(rbd_dev->disk);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);
	spin_unlock(&rbd_dev_list_lock);
	device_del(&rbd_dev->dev);

	rbd_dev_image_unlock(rbd_dev);
	rbd_dev_device_release(rbd_dev);
	rbd_dev_image_release(rbd_dev);
	rbd_dev_destroy(rbd_dev);
	return count;
}

static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
{
	if (single_major)
		return -EINVAL;

	return do_rbd_remove(bus, buf, count);
}

static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
					 size_t count)
{
	return do_rbd_remove(bus, buf, count);
}

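/*
 * The add/remove attributes (and their single-major variants) defined
 * above are the driver's entire userspace control surface; the bus
 * and root device registered below are what make them show up under
 * /sys/bus/rbd.
 */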
/*
 * create control files in sysfs
 * /sys/bus/rbd/...
 */
static int __init rbd_sysfs_init(void)
{
	int ret;

	ret = device_register(&rbd_root_dev);
	if (ret < 0)
		return ret;

	ret = bus_register(&rbd_bus_type);
	if (ret < 0)
		device_unregister(&rbd_root_dev);

	return ret;
}

static void __exit rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}

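/*
 * Per-I/O state lives in two dedicated slab caches: one for image
 * requests and one for the object requests they fan out into.
 */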
static int __init rbd_slab_init(void)
{
	rbd_assert(!rbd_img_request_cache);
	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
	if (!rbd_img_request_cache)
		return -ENOMEM;

	rbd_assert(!rbd_obj_request_cache);
	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
	if (!rbd_obj_request_cache)
		goto out_err;

	return 0;

out_err:
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
	return -ENOMEM;
}

static void rbd_slab_exit(void)
{
	rbd_assert(rbd_obj_request_cache);
	kmem_cache_destroy(rbd_obj_request_cache);
	rbd_obj_request_cache = NULL;

	rbd_assert(rbd_img_request_cache);
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
}

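/*
 * Module init: slab caches first, then the I/O workqueue, then (for
 * single_major) the shared block major, and finally the sysfs entry
 * points that expose the driver to userspace.  rbd_exit() unwinds in
 * the reverse order.
 */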
static int __init rbd_init(void)
{
	int rc;

	if (!libceph_compatible(NULL)) {
		rbd_warn(NULL, "libceph incompatibility (quitting)");
		return -EINVAL;
	}

	rc = rbd_slab_init();
	if (rc)
		return rc;

	/*
	 * The number of active work items is limited by the number of
	 * rbd devices * queue depth, so leave @max_active at default.
	 */
	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
	if (!rbd_wq) {
		rc = -ENOMEM;
		goto err_out_slab;
	}

	if (single_major) {
		rbd_major = register_blkdev(0, RBD_DRV_NAME);
		if (rbd_major < 0) {
			rc = rbd_major;
			goto err_out_wq;
		}
	}

	rc = rbd_sysfs_init();
	if (rc)
		goto err_out_blkdev;

	if (single_major)
		pr_info("loaded (major %d)\n", rbd_major);
	else
		pr_info("loaded\n");

	return 0;

err_out_blkdev:
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
err_out_wq:
	destroy_workqueue(rbd_wq);
err_out_slab:
	rbd_slab_exit();
	return rc;
}

static void __exit rbd_exit(void)
{
	ida_destroy(&rbd_dev_id_ida);
	rbd_sysfs_cleanup();
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
	destroy_workqueue(rbd_wq);
	rbd_slab_exit();
}

module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
MODULE_LICENSE("GPL");