rbd: don't hold lock_rwsem while running_list is being drained
drivers/block/rbd.c (linux-block.git)

/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/striper.h>
#include <linux/ceph/decode.h>
#include <linux/fs_parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Increment the given counter and return its updated value.
 * If the counter is already 0 it will not be incremented.
 * If the counter is already at its maximum value returns
 * -EINVAL without updating it.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
	unsigned int counter;

	counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
	if (counter <= (unsigned int)INT_MAX)
		return (int)counter;

	atomic_dec(v);

	return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
	int counter;

	counter = atomic_dec_return(v);
	if (counter >= 0)
		return counter;

	atomic_inc(v);

	return -EINVAL;
}

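/*
 * Usage sketch (illustrative, not part of the original file): together
 * these two helpers implement a saturating reference count.  They back
 * rbd_dev->parent_ref later in this file, along the lines of:
 *
 *	if (atomic_inc_return_safe(&rbd_dev->parent_ref) < 0)
 *		rbd_warn(rbd_dev, "parent reference overflow");
 *
 * i.e. a negative return means "no reference was taken".
 */
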
#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

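/*
 * Worked example (illustrative): with NAME_MAX == 255 and the "snap_"
 * prefix occupying sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1 == 5 bytes,
 * a snapshot name may be at most 250 bytes so that "snap_<name>"
 * still fits in NAME_MAX.
 */
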
#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_OBJECT_MAP		(1ULL<<3)
#define RBD_FEATURE_FAST_DIFF		(1ULL<<4)
#define RBD_FEATURE_DEEP_FLATTEN	(1ULL<<5)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)

#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_OBJECT_MAP |	\
				 RBD_FEATURE_FAST_DIFF |	\
				 RBD_FEATURE_DEEP_FLATTEN |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These six fields never change for a given rbd image */
	char *object_prefix;
	__u8 obj_order;
	u64 stripe_unit;
	u64 stripe_count;
	s64 data_pool_id;
	u64 features;		/* Might be changeable someday? */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* format 1 only */
	u64 *snap_sizes;	/* format 1 only */
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	const char	*pool_name;
	const char	*pool_ns;	/* NULL if default, never "" */

	const char	*image_id;
	const char	*image_name;

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;
};

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct pending_result {
	int			result;		/* first nonzero result */
	int			num_pending;
};

struct rbd_img_request;

enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
};

enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
	OBJ_OP_ZEROOUT,
};

#define RBD_OBJ_FLAG_DELETION			(1U << 0)
#define RBD_OBJ_FLAG_COPYUP_ENABLED		(1U << 1)
#define RBD_OBJ_FLAG_COPYUP_ZEROS		(1U << 2)
#define RBD_OBJ_FLAG_MAY_EXIST			(1U << 3)
#define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT	(1U << 4)

enum rbd_obj_read_state {
	RBD_OBJ_READ_START = 1,
	RBD_OBJ_READ_OBJECT,
	RBD_OBJ_READ_PARENT,
};

/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *            . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . .
 *            .                 |                                    .
 *            .                 v                                    .
 *            .    RBD_OBJ_WRITE_READ_FROM_PARENT. . .               .
 *            .                 |                    .               .
 *            .                 v                    v (deep-copyup  .
 *    (image  .   RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC   .  not needed)  .
 * flattened) v                 |                    .               .
 *            .                 v                    .               .
 *            . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . .      (copyup  .
 *                              |                        not needed) v
 *                              v                                    .
 *                            done . . . . . . . . . . . . . . . . . .
 *                              ^
 *                              |
 *                     RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * assert_exists guard is needed or not (in some cases it's not needed
 * even if there is a parent).
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_START = 1,
	RBD_OBJ_WRITE_PRE_OBJECT_MAP,
	RBD_OBJ_WRITE_OBJECT,
	__RBD_OBJ_WRITE_COPYUP,
	RBD_OBJ_WRITE_COPYUP,
	RBD_OBJ_WRITE_POST_OBJECT_MAP,
};

enum rbd_obj_copyup_state {
	RBD_OBJ_COPYUP_START = 1,
	RBD_OBJ_COPYUP_READ_PARENT,
	__RBD_OBJ_COPYUP_OBJECT_MAPS,
	RBD_OBJ_COPYUP_OBJECT_MAPS,
	__RBD_OBJ_COPYUP_WRITE_OBJECT,
	RBD_OBJ_COPYUP_WRITE_OBJECT,
};

struct rbd_obj_request {
	struct ceph_object_extent ex;
	unsigned int		flags;	/* RBD_OBJ_FLAG_* */
	union {
		enum rbd_obj_read_state	 read_state;	/* for reads */
		enum rbd_obj_write_state write_state;	/* for writes */
	};

	struct rbd_img_request	*img_request;
	struct ceph_file_extent	*img_extents;
	u32			num_img_extents;

	union {
		struct ceph_bio_iter	bio_pos;
		struct {
			struct ceph_bvec_iter	bvec_pos;
			u32			bvec_count;
			u32			bvec_idx;
		};
	};

	enum rbd_obj_copyup_state copyup_state;
	struct bio_vec		*copyup_bvecs;
	u32			copyup_bvec_count;

	struct list_head	osd_reqs;	/* w/ r_private_item */

	struct mutex		state_mutex;
	struct pending_result	pending;
	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

enum rbd_img_state {
	RBD_IMG_START = 1,
	RBD_IMG_EXCLUSIVE_LOCK,
	__RBD_IMG_OBJECT_REQUESTS,
	RBD_IMG_OBJECT_REQUESTS,
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	enum obj_operation_type	op_type;
	enum obj_request_type	data_type;
	unsigned long		flags;
	enum rbd_img_state	state;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	struct rbd_obj_request	*obj_request;	/* obj req initiator */

	struct list_head	lock_item;
	struct list_head	object_extents;	/* obj_req.ex structs */

	struct mutex		state_mutex;
	struct pending_result	pending;
	struct work_struct	work;
	int			work_result;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)

enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};

enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};

/* WatchNotify::ClientId */
struct rbd_client_id {
	u64 gid;
	u64 handle;
};

struct rbd_mapping {
	u64                     size;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
	char			*config_info;	/* add{,_single_major} string */

	struct ceph_object_id	header_oid;
	struct ceph_object_locator header_oloc;

	struct ceph_file_layout	layout;		/* used for all rbd requests */

	struct mutex		watch_mutex;
	enum rbd_watch_state	watch_state;
	struct ceph_osd_linger_request *watch_handle;
	u64			watch_cookie;
	struct delayed_work	watch_dwork;

	struct rw_semaphore	lock_rwsem;
	enum rbd_lock_state	lock_state;
	char			lock_cookie[32];
	struct rbd_client_id	owner_cid;
	struct work_struct	acquired_lock_work;
	struct work_struct	released_lock_work;
	struct delayed_work	lock_dwork;
	struct work_struct	unlock_work;
	spinlock_t		lock_lists_lock;
	struct list_head	acquiring_list;
	struct list_head	running_list;
	struct completion	acquire_wait;
	int			acquire_err;
	struct completion	releasing_wait;

	spinlock_t		object_map_lock;
	u8			*object_map;
	u64			object_map_size;	/* in objects */
	u64			object_map_flags;

	struct workqueue_struct	*task_wq;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* Block layer tags. */
	struct blk_mq_tag_set	tag_set;

	/* protects updating the header */
	struct rw_semaphore	header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* rbd_dev_device_setup() ran */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
	RBD_DEV_FLAG_READONLY,	/* -o ro or snapshot */
};

static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);	/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

static struct ceph_snap_context rbd_empty_snapc = {
	.nref = REFCOUNT_INIT(1),
};

/*
 * single-major requires >= 0.75 version of userspace rbd utility.
 */
static bool single_major = true;
module_param(single_major, bool, 0444);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");

static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
static ssize_t remove_store(struct bus_type *bus, const char *buf,
			    size_t count);
static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
				      size_t count);
static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
					 size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);

static int rbd_dev_id_to_minor(int dev_id)
{
	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}

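/*
 * Worked example (illustrative): with RBD_SINGLE_MAJOR_PART_SHIFT == 4,
 * dev_id 3 maps to minor 3 << 4 == 48, and minors 48..63 (the base
 * device plus up to 15 partitions) all map back to dev_id 48 >> 4 == 3.
 */
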
static bool rbd_is_ro(struct rbd_device *rbd_dev)
{
	return test_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);
}

static bool rbd_is_snap(struct rbd_device *rbd_dev)
{
	return rbd_dev->spec->snap_id != CEPH_NOSNAP;
}

static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	lockdep_assert_held(&rbd_dev->lock_rwsem);

	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	bool is_lock_owner;

	down_read(&rbd_dev->lock_rwsem);
	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	return is_lock_owner;
}

static ssize_t supported_features_show(struct bus_type *bus, char *buf)
{
	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}

static BUS_ATTR_WO(add);
static BUS_ATTR_WO(remove);
static BUS_ATTR_WO(add_single_major);
static BUS_ATTR_WO(remove_single_major);
static BUS_ATTR_RO(supported_features);

static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	&bus_attr_supported_features.attr,
	NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
				  struct attribute *attr, int index)
{
	if (!single_major &&
	    (attr == &bus_attr_add_single_major.attr ||
	     attr == &bus_attr_remove_single_major.attr))
		return 0;

	return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				 u8 *order, u64 *snap_size);
static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev);

static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);

/*
 * Return true if nothing else is pending.
 */
static bool pending_result_dec(struct pending_result *pending, int *result)
{
	rbd_assert(pending->num_pending > 0);

	if (*result && !pending->result)
		pending->result = *result;
	if (--pending->num_pending)
		return false;

	*result = pending->result;
	return true;
}
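
/*
 * Usage sketch (illustrative): a caller that fans out N sub-requests
 * sets pending->num_pending = N and then, as each one completes, does
 * something along the lines of:
 *
 *	if (pending_result_dec(&img_req->pending, &result))
 *		rbd_img_handle_request(img_req, result);
 *
 * so only the last completion advances the state machine, carrying
 * the first nonzero result.
 */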

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	(void) get_device(&rbd_dev->dev);

	return 0;
}

static void rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	put_device(&rbd_dev->dev);
}

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};

/*
 * Initialize an rbd client instance.  Success or not, this function
 * consumes ceph_opts.  Caller holds client_mutex.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	rbdc->client = ceph_create_client(ceph_opts, rbdc);
	if (IS_ERR(rbdc->client))
		goto out_rbdc;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_client;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;
out_client:
	ceph_destroy_client(rbdc->client);
out_rbdc:
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * (Per device) rbd map options
 */
enum {
	Opt_queue_depth,
	Opt_alloc_size,
	Opt_lock_timeout,
	/* int args above */
	Opt_pool_ns,
	Opt_compression_hint,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_notrim,
};

enum {
	Opt_compression_hint_none,
	Opt_compression_hint_compressible,
	Opt_compression_hint_incompressible,
};

static const struct constant_table rbd_param_compression_hint[] = {
	{"none",		Opt_compression_hint_none},
	{"compressible",	Opt_compression_hint_compressible},
	{"incompressible",	Opt_compression_hint_incompressible},
	{}
};

static const struct fs_parameter_spec rbd_parameters[] = {
	fsparam_u32	("alloc_size",			Opt_alloc_size),
	fsparam_enum	("compression_hint",		Opt_compression_hint,
			 rbd_param_compression_hint),
	fsparam_flag	("exclusive",			Opt_exclusive),
	fsparam_flag	("lock_on_read",		Opt_lock_on_read),
	fsparam_u32	("lock_timeout",		Opt_lock_timeout),
	fsparam_flag	("notrim",			Opt_notrim),
	fsparam_string	("_pool_ns",			Opt_pool_ns),
	fsparam_u32	("queue_depth",			Opt_queue_depth),
	fsparam_flag	("read_only",			Opt_read_only),
	fsparam_flag	("read_write",			Opt_read_write),
	fsparam_flag	("ro",				Opt_read_only),
	fsparam_flag	("rw",				Opt_read_write),
	{}
};
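
/*
 * Usage sketch (illustrative): these per-mapping options arrive in the
 * options field of the string written to /sys/bus/rbd/add, whose
 * overall layout is described in sysfs-bus-rbd.  A hypothetical
 * invocation (monitor address, pool and image names assumed) might be:
 *
 *	echo "1.2.3.4:6789 name=admin,queue_depth=128,ro rbd foo" \
 *		> /sys/bus/rbd/add
 */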

struct rbd_options {
	int	queue_depth;
	int	alloc_size;
	unsigned long	lock_timeout;
	bool	read_only;
	bool	lock_on_read;
	bool	exclusive;
	bool	trim;

	u32 alloc_hint_flags;  /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */
};

#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_ALLOC_SIZE_DEFAULT	(64 * 1024)
#define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false
#define RBD_EXCLUSIVE_DEFAULT	false
#define RBD_TRIM_DEFAULT	true

struct rbd_parse_opts_ctx {
	struct rbd_spec		*spec;
	struct ceph_options	*copts;
	struct rbd_options	*opts;
};

static char* obj_op_name(enum obj_operation_type op_type)
{
	switch (op_type) {
	case OBJ_OP_READ:
		return "read";
	case OBJ_OP_WRITE:
		return "write";
	case OBJ_OP_DISCARD:
		return "discard";
	case OBJ_OP_ZEROOUT:
		return "zeroout";
	default:
		return "???";
	}
}

/*
 * Destroy ceph client
 *
 * rbd_client_list_lock is acquired here; the caller must not hold it.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node.  If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret;

	mutex_lock(&client_mutex);
	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		ceph_destroy_options(ceph_opts);

		/*
		 * Using an existing client.  Make sure ->pg_pools is up to
		 * date before we look up the pool id in do_rbd_add().
		 */
		ret = ceph_wait_for_latest_osdmap(rbdc->client,
					rbdc->client->options->mount_timeout);
		if (ret) {
			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
			rbd_put_client(rbdc);
			rbdc = ERR_PTR(ret);
		}
	} else {
		rbdc = rbd_client_create(ceph_opts);
	}
	mutex_unlock(&client_mutex);

	return rbdc;
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}

/*
 * returns the size of an object in the image
 */
static u32 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1U << header->obj_order;
}

static void rbd_init_layout(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.stripe_unit == 0 ||
	    rbd_dev->header.stripe_count == 0) {
		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
		rbd_dev->header.stripe_count = 1;
	}

	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
			rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}

/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				 struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	int ret = -ENOMEM;
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		object_prefix = kstrndup(ondisk->object_prefix,
					 sizeof(ondisk->object_prefix),
					 GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */
		snap_sizes = kmalloc_array(snap_count,
					   sizeof(*header->snap_sizes),
					   GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;

		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		rbd_init_layout(rbd_dev);
	} else {
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}

static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	const char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return kstrdup(snap_name, GFP_KERNEL);
}

/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * Note that result is for snapshots in *descending* order.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
	u64 snap_id1 = *(u64 *)s1;
	u64 snap_id2 = *(u64 *)s2;

	if (snap_id1 < snap_id2)
		return 1;
	return snap_id1 == snap_id2 ? 0 : -1;
}
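
/*
 * Worked example (illustrative): snapshot ids are kept in descending
 * order, e.g. snapc->snaps = { 40, 25, 10 }.  Searching for 25,
 * comparing against 40 yields 1 (look later in the array) and against
 * 10 yields -1 (look earlier), which is exactly what the bsearch() in
 * rbd_dev_snap_index() below relies on.
 */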

/*
 * Search a snapshot context to see if the given snapshot id is
 * present.
 *
 * Returns the position of the snapshot id in the array if it's found,
 * or BAD_SNAP_INDEX otherwise.
 *
 * Note: The snapshot array is kept sorted (by the osd) in
 * reverse order, highest snapshot id first.
 */
static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u64 *found;

	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
				sizeof (snap_id), snapid_compare_reverse);

	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
}

static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	u32 which;
	const char *snap_name;

	which = rbd_dev_snap_index(rbd_dev, snap_id);
	if (which == BAD_SNAP_INDEX)
		return ERR_PTR(-ENOENT);

	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
}

static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (rbd_dev->image_format == 1)
		return rbd_dev_v1_snap_name(rbd_dev, snap_id);

	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
}

static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_size)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_size = rbd_dev->header.image_size;
	} else if (rbd_dev->image_format == 1) {
		u32 which;

		which = rbd_dev_snap_index(rbd_dev, snap_id);
		if (which == BAD_SNAP_INDEX)
			return -ENOENT;

		*snap_size = rbd_dev->header.snap_sizes[which];
	} else {
		u64 size = 0;
		int ret;

		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
		if (ret)
			return ret;

		*snap_size = size;
	}
	return 0;
}

static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
{
	u64 snap_id = rbd_dev->spec->snap_id;
	u64 size = 0;
	int ret;

	ret = rbd_snap_size(rbd_dev, snap_id, &size);
	if (ret)
		return ret;

	rbd_dev->mapping.size = size;
	return 0;
}

static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
{
	rbd_dev->mapping.size = 0;
}

static void zero_bvec(struct bio_vec *bv)
{
	void *buf;
	unsigned long flags;

	buf = bvec_kmap_irq(bv, &flags);
	memset(buf, 0, bv->bv_len);
	flush_dcache_page(bv->bv_page);
	bvec_kunmap_irq(buf, &flags);
}

static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
{
	struct ceph_bio_iter it = *bio_pos;

	ceph_bio_iter_advance(&it, off);
	ceph_bio_iter_advance_step(&it, bytes, ({
		zero_bvec(&bv);
	}));
}

static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
{
	struct ceph_bvec_iter it = *bvec_pos;

	ceph_bvec_iter_advance(&it, off);
	ceph_bvec_iter_advance_step(&it, bytes, ({
		zero_bvec(&bv);
	}));
}

/*
 * Zero a range in @obj_req data buffer defined by a bio (list) or
 * (private) bio_vec array.
 *
 * @off is relative to the start of the data buffer.
 */
static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
			       u32 bytes)
{
	dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);

	switch (obj_req->img_request->data_type) {
	case OBJ_REQUEST_BIO:
		zero_bios(&obj_req->bio_pos, off, bytes);
		break;
	case OBJ_REQUEST_BVECS:
	case OBJ_REQUEST_OWN_BVECS:
		zero_bvecs(&obj_req->bvec_pos, off, bytes);
		break;
	default:
		BUG();
	}
}

static void rbd_obj_request_destroy(struct kref *kref);
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		kref_read(&obj_request->kref));
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}

static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->img_request == NULL);

	/* Image request now owns object's original reference */
	obj_request->img_request = img_request;
	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
}

static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
	list_del(&obj_request->ex.oe_item);
	rbd_assert(obj_request->img_request == img_request);
	rbd_obj_request_put(obj_request);
}

static void rbd_osd_submit(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;

	dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
	     __func__, osd_req, obj_req, obj_req->ex.oe_objno,
	     obj_req->ex.oe_off, obj_req->ex.oe_len);
	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
}

/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time
 * and currently never changes thereafter.
 */
static void img_request_layered_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_LAYERED, &img_request->flags);
}

static bool img_request_layered_test(struct rbd_img_request *img_request)
{
	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
}

static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

	return !obj_req->ex.oe_off &&
	       obj_req->ex.oe_len == rbd_dev->layout.object_size;
}

static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

	return obj_req->ex.oe_off + obj_req->ex.oe_len ==
					rbd_dev->layout.object_size;
}

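/*
 * Worked example (illustrative, assuming the default 4 MiB object
 * size): the extent 0~4194304 is "entire", while 1048576~3145728 is a
 * "tail" (it ends exactly on the object boundary) but not "entire",
 * since it doesn't start at offset 0.
 */
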
/*
 * Must be called after rbd_obj_calc_img_extents().
 */
static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req)
{
	if (!obj_req->num_img_extents ||
	    (rbd_obj_is_entire(obj_req) &&
	     !obj_req->img_request->snapc->num_snaps))
		return false;

	return true;
}

static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
{
	return ceph_file_extents_bytes(obj_req->img_extents,
				       obj_req->num_img_extents);
}

static bool rbd_img_is_write(struct rbd_img_request *img_req)
{
	switch (img_req->op_type) {
	case OBJ_OP_READ:
		return false;
	case OBJ_OP_WRITE:
	case OBJ_OP_DISCARD:
	case OBJ_OP_ZEROOUT:
		return true;
	default:
		BUG();
	}
}

static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;
	int result;

	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
	     osd_req->r_result, obj_req);

	/*
	 * Writes aren't allowed to return a data payload.  In some
	 * guarded write cases (e.g. stat + zero on an empty object)
	 * a stat response makes it through, but we don't care.
	 */
	if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
		result = 0;
	else
		result = osd_req->r_result;

	rbd_obj_handle_request(obj_req, result);
}

static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;
	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
	struct ceph_options *opt = rbd_dev->rbd_client->client->options;

	osd_req->r_flags = CEPH_OSD_FLAG_READ | opt->read_from_replica;
	osd_req->r_snapid = obj_request->img_request->snap_id;
}

static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;

	osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
	ktime_get_real_ts64(&osd_req->r_mtime);
	osd_req->r_data_offset = obj_request->ex.oe_off;
}

static struct ceph_osd_request *
__rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
			  struct ceph_snap_context *snapc, int num_ops)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;
	const char *name_format = rbd_dev->image_format == 1 ?
				      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
	int ret;

	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
	if (!req)
		return ERR_PTR(-ENOMEM);

	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
	req->r_callback = rbd_osd_req_callback;
	req->r_priv = obj_req;

	/*
	 * Data objects may be stored in a separate pool, but always in
	 * the same namespace as the header object in its pool.
	 */
	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
	req->r_base_oloc.pool = rbd_dev->layout.pool_id;

	ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
			       rbd_dev->header.object_prefix,
			       obj_req->ex.oe_objno);
	if (ret)
		return ERR_PTR(ret);

	return req;
}

static struct ceph_osd_request *
rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
{
	return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
					 num_ops);
}

static struct rbd_obj_request *rbd_obj_request_create(void)
{
	struct rbd_obj_request *obj_request;

	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
	if (!obj_request)
		return NULL;

	ceph_object_extent_init(&obj_request->ex);
	INIT_LIST_HEAD(&obj_request->osd_reqs);
	mutex_init(&obj_request->state_mutex);
	kref_init(&obj_request->kref);

	dout("%s %p\n", __func__, obj_request);
	return obj_request;
}

static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;
	struct ceph_osd_request *osd_req;
	u32 i;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	dout("%s: obj %p\n", __func__, obj_request);

	while (!list_empty(&obj_request->osd_reqs)) {
		osd_req = list_first_entry(&obj_request->osd_reqs,
				    struct ceph_osd_request, r_private_item);
		list_del_init(&osd_req->r_private_item);
		ceph_osdc_put_request(osd_req);
	}

	switch (obj_request->img_request->data_type) {
	case OBJ_REQUEST_NODATA:
	case OBJ_REQUEST_BIO:
	case OBJ_REQUEST_BVECS:
		break;		/* Nothing to do */
	case OBJ_REQUEST_OWN_BVECS:
		kfree(obj_request->bvec_pos.bvecs);
		break;
	default:
		BUG();
	}

	kfree(obj_request->img_extents);
	if (obj_request->copyup_bvecs) {
		for (i = 0; i < obj_request->copyup_bvec_count; i++) {
			if (obj_request->copyup_bvecs[i].bv_page)
				__free_page(obj_request->copyup_bvecs[i].bv_page);
		}
		kfree(obj_request->copyup_bvecs);
	}

	kmem_cache_free(rbd_obj_request_cache, obj_request);
}

/* It's OK to call this for a device with no parent */

static void rbd_spec_put(struct rbd_spec *spec);
static void rbd_dev_unparent(struct rbd_device *rbd_dev)
{
	rbd_dev_remove_parent(rbd_dev);
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;
}

/*
 * Parent image reference counting is used to determine when an
 * image's parent fields can be safely torn down--after there are no
 * more in-flight requests to the parent image.  When the last
 * reference is dropped, cleaning them up is safe.
 */
static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
{
	int counter;

	if (!rbd_dev->parent_spec)
		return;

	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
	if (counter > 0)
		return;

	/* Last reference; clean up parent data structures */

	if (!counter)
		rbd_dev_unparent(rbd_dev);
	else
		rbd_warn(rbd_dev, "parent reference underflow");
}

/*
 * If an image has a non-zero parent overlap, get a reference to its
 * parent.
 *
 * Returns true if the rbd device has a parent with a non-zero
 * overlap and a reference for it was successfully taken, or
 * false otherwise.
 */
static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
{
	int counter = 0;

	if (!rbd_dev->parent_spec)
		return false;

	if (rbd_dev->parent_overlap)
		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);

	if (counter < 0)
		rbd_warn(rbd_dev, "parent reference overflow");

	return counter > 0;
}

static void rbd_img_request_init(struct rbd_img_request *img_request,
				 struct rbd_device *rbd_dev,
				 enum obj_operation_type op_type)
{
	memset(img_request, 0, sizeof(*img_request));

	img_request->rbd_dev = rbd_dev;
	img_request->op_type = op_type;

	INIT_LIST_HEAD(&img_request->lock_item);
	INIT_LIST_HEAD(&img_request->object_extents);
	mutex_init(&img_request->state_mutex);
}

static void rbd_img_capture_header(struct rbd_img_request *img_req)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;

	lockdep_assert_held(&rbd_dev->header_rwsem);

	if (rbd_img_is_write(img_req))
		img_req->snapc = ceph_get_snap_context(rbd_dev->header.snapc);
	else
		img_req->snap_id = rbd_dev->spec->snap_id;

	if (rbd_dev_parent_get(rbd_dev))
		img_request_layered_set(img_req);
}

static void rbd_img_request_destroy(struct rbd_img_request *img_request)
{
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	dout("%s: img %p\n", __func__, img_request);

	WARN_ON(!list_empty(&img_request->lock_item));
	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);

	if (img_request_layered_test(img_request))
		rbd_dev_parent_put(img_request->rbd_dev);

	if (rbd_img_is_write(img_request))
		ceph_put_snap_context(img_request->snapc);

	if (test_bit(IMG_REQ_CHILD, &img_request->flags))
		kmem_cache_free(rbd_img_request_cache, img_request);
}

#define BITS_PER_OBJ	2
#define OBJS_PER_BYTE	(BITS_PER_BYTE / BITS_PER_OBJ)
#define OBJ_MASK	((1 << BITS_PER_OBJ) - 1)

static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno,
				   u64 *index, u8 *shift)
{
	u32 off;

	rbd_assert(objno < rbd_dev->object_map_size);
	*index = div_u64_rem(objno, OBJS_PER_BYTE, &off);
	*shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ;
}
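
/*
 * Worked example (illustrative): the object map packs four 2-bit
 * states per byte, most significant bits first.  For objno 5:
 * index = 5 / 4 = 1 and off = 5 % 4 = 1, so shift = (4 - 1 - 1) * 2
 * == 4, i.e. object 5 lives in bits 5..4 of object_map[1].
 */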

static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
{
	u64 index;
	u8 shift;

	lockdep_assert_held(&rbd_dev->object_map_lock);
	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
	return (rbd_dev->object_map[index] >> shift) & OBJ_MASK;
}

static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val)
{
	u64 index;
	u8 shift;
	u8 *p;

	lockdep_assert_held(&rbd_dev->object_map_lock);
	rbd_assert(!(val & ~OBJ_MASK));

	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
	p = &rbd_dev->object_map[index];
	*p = (*p & ~(OBJ_MASK << shift)) | (val << shift);
}

static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
{
	u8 state;

	spin_lock(&rbd_dev->object_map_lock);
	state = __rbd_object_map_get(rbd_dev, objno);
	spin_unlock(&rbd_dev->object_map_lock);
	return state;
}

static bool use_object_map(struct rbd_device *rbd_dev)
{
	/*
	 * An image mapped read-only can't use the object map -- it isn't
	 * loaded because the header lock isn't acquired.  Someone else can
	 * write to the image and update the object map behind our back.
	 *
	 * A snapshot can't be written to, so using the object map is always
	 * safe.
	 */
	if (!rbd_is_snap(rbd_dev) && rbd_is_ro(rbd_dev))
		return false;

	return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) &&
		!(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID));
}

static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno)
{
	u8 state;

	/* fall back to default logic if object map is disabled or invalid */
	if (!use_object_map(rbd_dev))
		return true;

	state = rbd_object_map_get(rbd_dev, objno);
	return state != OBJECT_NONEXISTENT;
}

static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id,
				struct ceph_object_id *oid)
{
	if (snap_id == CEPH_NOSNAP)
		ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX,
				rbd_dev->spec->image_id);
	else
		ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX,
				rbd_dev->spec->image_id, snap_id);
}
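
/*
 * Illustration, assuming the usual "rbd_object_map." value of
 * RBD_OBJECT_MAP_PREFIX: an image with id "abc123" keeps its HEAD
 * object map in "rbd_object_map.abc123" and the map for snapshot id
 * 0x1c in "rbd_object_map.abc123.000000000000001c".
 */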

static int rbd_object_map_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	CEPH_DEFINE_OID_ONSTACK(oid);
	u8 lock_type;
	char *lock_tag;
	struct ceph_locker *lockers;
	u32 num_lockers;
	bool broke_lock = false;
	int ret;

	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);

again:
	ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
			    CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0);
	if (ret != -EBUSY || broke_lock) {
		if (ret == -EEXIST)
			ret = 0; /* already locked by myself */
		if (ret)
			rbd_warn(rbd_dev, "failed to lock object map: %d", ret);
		return ret;
	}

	ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc,
				 RBD_LOCK_NAME, &lock_type, &lock_tag,
				 &lockers, &num_lockers);
	if (ret) {
		if (ret == -ENOENT)
			goto again;

		rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret);
		return ret;
	}

	kfree(lock_tag);
	if (num_lockers == 0)
		goto again;

	rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu",
		 ENTITY_NAME(lockers[0].id.name));

	ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc,
				  RBD_LOCK_NAME, lockers[0].id.cookie,
				  &lockers[0].id.name);
	ceph_free_lockers(lockers, num_lockers);
	if (ret) {
		if (ret == -ENOENT)
			goto again;

		rbd_warn(rbd_dev, "failed to break object map lock: %d", ret);
		return ret;
	}

	broke_lock = true;
	goto again;
}

static void rbd_object_map_unlock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	CEPH_DEFINE_OID_ONSTACK(oid);
	int ret;

	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);

	ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
			      "");
	if (ret && ret != -ENOENT)
		rbd_warn(rbd_dev, "failed to unlock object map: %d", ret);
}

static int decode_object_map_header(void **p, void *end, u64 *object_map_size)
{
	u8 struct_v;
	u32 struct_len;
	u32 header_len;
	void *header_end;
	int ret;

	ceph_decode_32_safe(p, end, header_len, e_inval);
	header_end = *p + header_len;

	ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v,
				  &struct_len);
	if (ret)
		return ret;

	ceph_decode_64_safe(p, end, *object_map_size, e_inval);

	*p = header_end;
	return 0;

e_inval:
	return -EINVAL;
}
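
/*
 * Sketch of the reply layout implied by the decode calls above (an
 * inference from this function, not an authoritative description of
 * the cls_rbd wire format):
 *
 *	le32 header_len;			- bytes until the bit data
 *	u8 struct_v, u8 compat, le32 struct_len - BitVector header
 *	le64 object_map_size;			- number of objects
 *	...					- skipped via header_end
 *
 * Jumping to header_end keeps us tolerant of trailing fields added by
 * newer encodings.
 */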

static int __rbd_object_map_load(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	CEPH_DEFINE_OID_ONSTACK(oid);
	struct page **pages;
	void *p, *end;
	size_t reply_len;
	u64 num_objects;
	u64 object_map_bytes;
	u64 object_map_size;
	int num_pages;
	int ret;

	rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size);

	num_objects = ceph_get_num_objects(&rbd_dev->layout,
					   rbd_dev->mapping.size);
	object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ,
					    BITS_PER_BYTE);
	num_pages = calc_pages_for(0, object_map_bytes) + 1;
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	reply_len = num_pages * PAGE_SIZE;
	rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid);
	ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc,
			     "rbd", "object_map_load", CEPH_OSD_FLAG_READ,
			     NULL, 0, pages, &reply_len);
	if (ret)
		goto out;

	p = page_address(pages[0]);
	end = p + min(reply_len, (size_t)PAGE_SIZE);
	ret = decode_object_map_header(&p, end, &object_map_size);
	if (ret)
		goto out;

	if (object_map_size != num_objects) {
		rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu",
			 object_map_size, num_objects);
		ret = -EINVAL;
		goto out;
	}

	if (offset_in_page(p) + object_map_bytes > reply_len) {
		ret = -EINVAL;
		goto out;
	}

	rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL);
	if (!rbd_dev->object_map) {
		ret = -ENOMEM;
		goto out;
	}

	rbd_dev->object_map_size = object_map_size;
	ceph_copy_from_page_vector(pages, rbd_dev->object_map,
				   offset_in_page(p), object_map_bytes);

out:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
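
/*
 * Sizing example (pure arithmetic on the code above): a 1 TiB image
 * with the default 4 MiB objects has 262144 objects, so
 * object_map_bytes is DIV_ROUND_UP(262144 * 2, 8) = 65536 -- 16 pages
 * of map data, plus the one extra page to absorb the header pushing
 * the bit data across a page boundary.
 */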

static void rbd_object_map_free(struct rbd_device *rbd_dev)
{
	kvfree(rbd_dev->object_map);
	rbd_dev->object_map = NULL;
	rbd_dev->object_map_size = 0;
}

static int rbd_object_map_load(struct rbd_device *rbd_dev)
{
	int ret;

	ret = __rbd_object_map_load(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_v2_get_flags(rbd_dev);
	if (ret) {
		rbd_object_map_free(rbd_dev);
		return ret;
	}

	if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)
		rbd_warn(rbd_dev, "object map is invalid");

	return 0;
}

static int rbd_object_map_open(struct rbd_device *rbd_dev)
{
	int ret;

	ret = rbd_object_map_lock(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_object_map_load(rbd_dev);
	if (ret) {
		rbd_object_map_unlock(rbd_dev);
		return ret;
	}

	return 0;
}

static void rbd_object_map_close(struct rbd_device *rbd_dev)
{
	rbd_object_map_free(rbd_dev);
	rbd_object_map_unlock(rbd_dev);
}

/*
 * This function needs snap_id (or more precisely just something to
 * distinguish between HEAD and snapshot object maps), new_state and
 * current_state that were passed to rbd_object_map_update().
 *
 * To avoid allocating and stashing a context we piggyback on the OSD
 * request.  A HEAD update carries two ops (assert_locked followed by
 * object_map_update); a snapshot update carries just one.  For
 * new_state and current_state we decode our own object_map_update op,
 * encoded in rbd_cls_object_map_update().
 */
static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req,
					struct ceph_osd_request *osd_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	struct ceph_osd_data *osd_data;
	u64 objno;
	u8 state, new_state, current_state;
	bool has_current_state;
	void *p;

	if (osd_req->r_result)
		return osd_req->r_result;

	/*
	 * Nothing to do for a snapshot object map.
	 */
	if (osd_req->r_num_ops == 1)
		return 0;

	/*
	 * Update in-memory HEAD object map.
	 */
	rbd_assert(osd_req->r_num_ops == 2);
	osd_data = osd_req_op_data(osd_req, 1, cls, request_data);
	rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES);

	p = page_address(osd_data->pages[0]);
	objno = ceph_decode_64(&p);
	rbd_assert(objno == obj_req->ex.oe_objno);
	rbd_assert(ceph_decode_64(&p) == objno + 1);
	new_state = ceph_decode_8(&p);
	has_current_state = ceph_decode_8(&p);
	if (has_current_state)
		current_state = ceph_decode_8(&p);

	spin_lock(&rbd_dev->object_map_lock);
	state = __rbd_object_map_get(rbd_dev, objno);
	if (!has_current_state || current_state == state ||
	    (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN))
		__rbd_object_map_set(rbd_dev, objno, new_state);
	spin_unlock(&rbd_dev->object_map_lock);

	return 0;
}

static void rbd_object_map_callback(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;
	int result;

	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
	     osd_req->r_result, obj_req);

	result = rbd_object_map_update_finish(obj_req, osd_req);
	rbd_obj_handle_request(obj_req, result);
}

static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state)
{
	u8 state = rbd_object_map_get(rbd_dev, objno);

	if (state == new_state ||
	    (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
	    (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING))
		return false;

	return true;
}
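
/*
 * Reading of the checks above: an update is a no-op if the object is
 * already in new_state, if a deletion is being staged (OBJECT_PENDING)
 * for an object that never existed, or if OBJECT_NONEXISTENT is
 * requested for an object whose deletion was never staged as
 * OBJECT_PENDING in the first place.
 */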

static int rbd_cls_object_map_update(struct ceph_osd_request *req,
				     int which, u64 objno, u8 new_state,
				     const u8 *current_state)
{
	struct page **pages;
	void *p, *start;
	int ret;

	ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update");
	if (ret)
		return ret;

	pages = ceph_alloc_page_vector(1, GFP_NOIO);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	p = start = page_address(pages[0]);
	ceph_encode_64(&p, objno);
	ceph_encode_64(&p, objno + 1);
	ceph_encode_8(&p, new_state);
	if (current_state) {
		ceph_encode_8(&p, 1);
		ceph_encode_8(&p, *current_state);
	} else {
		ceph_encode_8(&p, 0);
	}

	osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0,
					  false, true);
	return 0;
}
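
/*
 * The payload encoded above is simply:
 *
 *	le64 start_objno;	- inclusive
 *	le64 end_objno;		- exclusive, always start + 1 here
 *	u8 new_state;
 *	u8 has_current_state;
 *	u8 current_state;	- present only if has_current_state != 0
 *
 * i.e. a single-object [objno, objno + 1) range update, optionally
 * conditional on the object's current state.
 */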

/*
 * Return:
 *   0 - object map update sent
 *   1 - object map update isn't needed
 *  <0 - error
 */
static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
				 u8 new_state, const u8 *current_state)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;
	int num_ops = 1;
	int which = 0;
	int ret;

	if (snap_id == CEPH_NOSNAP) {
		if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state))
			return 1;

		num_ops++; /* assert_locked */
	}

	req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO);
	if (!req)
		return -ENOMEM;

	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
	req->r_callback = rbd_object_map_callback;
	req->r_priv = obj_req;

	rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid);
	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
	req->r_flags = CEPH_OSD_FLAG_WRITE;
	ktime_get_real_ts64(&req->r_mtime);

	if (snap_id == CEPH_NOSNAP) {
		/*
		 * Protect against possible race conditions during lock
		 * ownership transitions.
		 */
		ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME,
					     CEPH_CLS_LOCK_EXCLUSIVE, "", "");
		if (ret)
			return ret;
	}

	ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno,
					new_state, current_state);
	if (ret)
		return ret;

	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
	if (ret)
		return ret;

	ceph_osdc_start_request(osdc, req, false);
	return 0;
}

static void prune_extents(struct ceph_file_extent *img_extents,
			  u32 *num_img_extents, u64 overlap)
{
	u32 cnt = *num_img_extents;

	/* drop extents completely beyond the overlap */
	while (cnt && img_extents[cnt - 1].fe_off >= overlap)
		cnt--;

	if (cnt) {
		struct ceph_file_extent *ex = &img_extents[cnt - 1];

		/* trim final overlapping extent */
		if (ex->fe_off + ex->fe_len > overlap)
			ex->fe_len = overlap - ex->fe_off;
	}

	*num_img_extents = cnt;
}
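
/*
 * Example (extents are sorted by offset): with overlap 100, extents
 * {90,20} and {120,10} become just {90,10} -- the extent starting at
 * 120 lies entirely beyond the overlap and is dropped, and the one at
 * 90 is trimmed so that nothing past byte 100 is mapped to the parent.
 */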

/*
 * Determine the byte range(s) covered by either just the object extent
 * or the entire object in the parent image.
 */
static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
				    bool entire)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	int ret;

	if (!rbd_dev->parent_overlap)
		return 0;

	ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
				  entire ? 0 : obj_req->ex.oe_off,
				  entire ? rbd_dev->layout.object_size :
					   obj_req->ex.oe_len,
				  &obj_req->img_extents,
				  &obj_req->num_img_extents);
	if (ret)
		return ret;

	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
		      rbd_dev->parent_overlap);
	return 0;
}

static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;

	switch (obj_req->img_request->data_type) {
	case OBJ_REQUEST_BIO:
		osd_req_op_extent_osd_data_bio(osd_req, which,
					       &obj_req->bio_pos,
					       obj_req->ex.oe_len);
		break;
	case OBJ_REQUEST_BVECS:
	case OBJ_REQUEST_OWN_BVECS:
		rbd_assert(obj_req->bvec_pos.iter.bi_size ==
			   obj_req->ex.oe_len);
		rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
		osd_req_op_extent_osd_data_bvec_pos(osd_req, which,
						    &obj_req->bvec_pos);
		break;
	default:
		BUG();
	}
}

static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which)
{
	struct page **pages;

	/*
	 * The response data for a STAT call consists of:
	 *     le64 length;
	 *     struct {
	 *         le32 tv_sec;
	 *         le32 tv_nsec;
	 *     } mtime;
	 */
	pages = ceph_alloc_page_vector(1, GFP_NOIO);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0);
	osd_req_op_raw_data_in_pages(osd_req, which, pages,
				     8 + sizeof(struct ceph_timespec),
				     0, false, true);
	return 0;
}

static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which,
				u32 bytes)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;
	int ret;

	ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup");
	if (ret)
		return ret;

	osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs,
					  obj_req->copyup_bvec_count, bytes);
	return 0;
}

static int rbd_obj_init_read(struct rbd_obj_request *obj_req)
{
	obj_req->read_state = RBD_OBJ_READ_START;
	return 0;
}

static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
				      int which)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	u16 opcode;

	if (!use_object_map(rbd_dev) ||
	    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) {
		osd_req_op_alloc_hint_init(osd_req, which++,
					   rbd_dev->layout.object_size,
					   rbd_dev->layout.object_size,
					   rbd_dev->opts->alloc_hint_flags);
	}

	if (rbd_obj_is_entire(obj_req))
		opcode = CEPH_OSD_OP_WRITEFULL;
	else
		opcode = CEPH_OSD_OP_WRITE;

	osd_req_op_extent_init(osd_req, which, opcode,
			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
	rbd_osd_setup_data(osd_req, which);
}

static int rbd_obj_init_write(struct rbd_obj_request *obj_req)
{
	int ret;

	/* reverse map the entire object onto the parent */
	ret = rbd_obj_calc_img_extents(obj_req, true);
	if (ret)
		return ret;

	if (rbd_obj_copyup_enabled(obj_req))
		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;

	obj_req->write_state = RBD_OBJ_WRITE_START;
	return 0;
}

static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
{
	return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
					  CEPH_OSD_OP_ZERO;
}

static void __rbd_osd_setup_discard_ops(struct ceph_osd_request *osd_req,
					int which)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;

	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
		rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
		osd_req_op_init(osd_req, which, CEPH_OSD_OP_DELETE, 0);
	} else {
		osd_req_op_extent_init(osd_req, which,
				       truncate_or_zero_opcode(obj_req),
				       obj_req->ex.oe_off, obj_req->ex.oe_len,
				       0, 0);
	}
}

static int rbd_obj_init_discard(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	u64 off, next_off;
	int ret;

	/*
	 * Align the range to alloc_size boundary and punt on discards
	 * that are too small to free up any space.
	 *
	 * alloc_size == object_size && is_tail() is a special case for
	 * filestore with filestore_punch_hole = false, needed to allow
	 * truncate (in addition to delete).
	 */
	if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
	    !rbd_obj_is_tail(obj_req)) {
		off = round_up(obj_req->ex.oe_off, rbd_dev->opts->alloc_size);
		next_off = round_down(obj_req->ex.oe_off + obj_req->ex.oe_len,
				      rbd_dev->opts->alloc_size);
		if (off >= next_off)
			return 1;

		dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
		     obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
		     off, next_off - off);
		obj_req->ex.oe_off = off;
		obj_req->ex.oe_len = next_off - off;
	}

	/* reverse map the entire object onto the parent */
	ret = rbd_obj_calc_img_extents(obj_req, true);
	if (ret)
		return ret;

	obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
		obj_req->flags |= RBD_OBJ_FLAG_DELETION;

	obj_req->write_state = RBD_OBJ_WRITE_START;
	return 0;
}
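
/*
 * Alignment example for the rounding above: with alloc_size 65536, a
 * discard of 4096~126976 (bytes 4096..131071) shrinks to 65536~65536,
 * keeping only the fully covered 64k block; a discard of 4096~32768
 * rounds to an empty range (off >= next_off) and returns 1, telling
 * __rbd_img_fill_request() to drop the object request altogether.
 */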

static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req,
					int which)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;
	u16 opcode;

	if (rbd_obj_is_entire(obj_req)) {
		if (obj_req->num_img_extents) {
			if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
				osd_req_op_init(osd_req, which++,
						CEPH_OSD_OP_CREATE, 0);
			opcode = CEPH_OSD_OP_TRUNCATE;
		} else {
			rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
			osd_req_op_init(osd_req, which++,
					CEPH_OSD_OP_DELETE, 0);
			opcode = 0;
		}
	} else {
		opcode = truncate_or_zero_opcode(obj_req);
	}

	if (opcode)
		osd_req_op_extent_init(osd_req, which, opcode,
				       obj_req->ex.oe_off, obj_req->ex.oe_len,
				       0, 0);
}

static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
{
	int ret;

	/* reverse map the entire object onto the parent */
	ret = rbd_obj_calc_img_extents(obj_req, true);
	if (ret)
		return ret;

	if (rbd_obj_copyup_enabled(obj_req))
		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
	if (!obj_req->num_img_extents) {
		obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
		if (rbd_obj_is_entire(obj_req))
			obj_req->flags |= RBD_OBJ_FLAG_DELETION;
	}

	obj_req->write_state = RBD_OBJ_WRITE_START;
	return 0;
}

static int count_write_ops(struct rbd_obj_request *obj_req)
{
	struct rbd_img_request *img_req = obj_req->img_request;

	switch (img_req->op_type) {
	case OBJ_OP_WRITE:
		if (!use_object_map(img_req->rbd_dev) ||
		    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST))
			return 2; /* setallochint + write/writefull */

		return 1; /* write/writefull */
	case OBJ_OP_DISCARD:
		return 1; /* delete/truncate/zero */
	case OBJ_OP_ZEROOUT:
		if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
		    !(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
			return 2; /* create + truncate */

		return 1; /* delete/truncate/zero */
	default:
		BUG();
	}
}

static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
				    int which)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;

	switch (obj_req->img_request->op_type) {
	case OBJ_OP_WRITE:
		__rbd_osd_setup_write_ops(osd_req, which);
		break;
	case OBJ_OP_DISCARD:
		__rbd_osd_setup_discard_ops(osd_req, which);
		break;
	case OBJ_OP_ZEROOUT:
		__rbd_osd_setup_zeroout_ops(osd_req, which);
		break;
	default:
		BUG();
	}
}

/*
 * Prune the list of object requests (adjust offset and/or length, drop
 * redundant requests).  Prepare object request state machines and image
 * request state machine for execution.
 */
static int __rbd_img_fill_request(struct rbd_img_request *img_req)
{
	struct rbd_obj_request *obj_req, *next_obj_req;
	int ret;

	for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
		switch (img_req->op_type) {
		case OBJ_OP_READ:
			ret = rbd_obj_init_read(obj_req);
			break;
		case OBJ_OP_WRITE:
			ret = rbd_obj_init_write(obj_req);
			break;
		case OBJ_OP_DISCARD:
			ret = rbd_obj_init_discard(obj_req);
			break;
		case OBJ_OP_ZEROOUT:
			ret = rbd_obj_init_zeroout(obj_req);
			break;
		default:
			BUG();
		}
		if (ret < 0)
			return ret;
		if (ret > 0) {
			rbd_img_obj_request_del(img_req, obj_req);
			continue;
		}
	}

	img_req->state = RBD_IMG_START;
	return 0;
}

union rbd_img_fill_iter {
	struct ceph_bio_iter bio_iter;
	struct ceph_bvec_iter bvec_iter;
};

struct rbd_img_fill_ctx {
	enum obj_request_type pos_type;
	union rbd_img_fill_iter *pos;
	union rbd_img_fill_iter iter;
	ceph_object_extent_fn_t set_pos_fn;
	ceph_object_extent_fn_t count_fn;
	ceph_object_extent_fn_t copy_fn;
};

static struct ceph_object_extent *alloc_object_extent(void *arg)
{
	struct rbd_img_request *img_req = arg;
	struct rbd_obj_request *obj_req;

	obj_req = rbd_obj_request_create();
	if (!obj_req)
		return NULL;

	rbd_img_obj_request_add(img_req, obj_req);
	return &obj_req->ex;
}

/*
 * While su != os && sc == 1 is technically not fancy (it's the same
 * layout as su == os && sc == 1), we can't use the nocopy path for it
 * because ->set_pos_fn() should be called only once per object.
 * ceph_file_to_extents() invokes action_fn once per stripe unit, so
 * treat su != os && sc == 1 as fancy.
 */
static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
{
	return l->stripe_unit != l->object_size;
}
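
/*
 * For example, the default layout (su == os == 4M, sc == 1) takes the
 * nocopy path, while su == 64k with os == 4M is treated as fancy
 * striping and goes through the count/copy passes below.
 */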

static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
				       struct ceph_file_extent *img_extents,
				       u32 num_img_extents,
				       struct rbd_img_fill_ctx *fctx)
{
	u32 i;
	int ret;

	img_req->data_type = fctx->pos_type;

	/*
	 * Create object requests and set each object request's starting
	 * position in the provided bio (list) or bio_vec array.
	 */
	fctx->iter = *fctx->pos;
	for (i = 0; i < num_img_extents; i++) {
		ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
					   img_extents[i].fe_off,
					   img_extents[i].fe_len,
					   &img_req->object_extents,
					   alloc_object_extent, img_req,
					   fctx->set_pos_fn, &fctx->iter);
		if (ret)
			return ret;
	}

	return __rbd_img_fill_request(img_req);
}

/*
 * Map a list of image extents to a list of object extents, create the
 * corresponding object requests (normally each to a different object,
 * but not always) and add them to @img_req.  For each object request,
 * set up its data descriptor to point to the corresponding chunk(s) of
 * @fctx->pos data buffer.
 *
 * Because ceph_file_to_extents() will merge adjacent object extents
 * together, each object request's data descriptor may point to multiple
 * different chunks of @fctx->pos data buffer.
 *
 * @fctx->pos data buffer is assumed to be large enough.
 */
static int rbd_img_fill_request(struct rbd_img_request *img_req,
				struct ceph_file_extent *img_extents,
				u32 num_img_extents,
				struct rbd_img_fill_ctx *fctx)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	struct rbd_obj_request *obj_req;
	u32 i;
	int ret;

	if (fctx->pos_type == OBJ_REQUEST_NODATA ||
	    !rbd_layout_is_fancy(&rbd_dev->layout))
		return rbd_img_fill_request_nocopy(img_req, img_extents,
						   num_img_extents, fctx);

	img_req->data_type = OBJ_REQUEST_OWN_BVECS;

	/*
	 * Create object requests and determine ->bvec_count for each object
	 * request.  Note that ->bvec_count sum over all object requests may
	 * be greater than the number of bio_vecs in the provided bio (list)
	 * or bio_vec array because when mapped, those bio_vecs can straddle
	 * stripe unit boundaries.
	 */
	fctx->iter = *fctx->pos;
	for (i = 0; i < num_img_extents; i++) {
		ret = ceph_file_to_extents(&rbd_dev->layout,
					   img_extents[i].fe_off,
					   img_extents[i].fe_len,
					   &img_req->object_extents,
					   alloc_object_extent, img_req,
					   fctx->count_fn, &fctx->iter);
		if (ret)
			return ret;
	}

	for_each_obj_request(img_req, obj_req) {
		obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
					      sizeof(*obj_req->bvec_pos.bvecs),
					      GFP_NOIO);
		if (!obj_req->bvec_pos.bvecs)
			return -ENOMEM;
	}

	/*
	 * Fill in each object request's private bio_vec array, splitting and
	 * rearranging the provided bio_vecs in stripe unit chunks as needed.
	 */
	fctx->iter = *fctx->pos;
	for (i = 0; i < num_img_extents; i++) {
		ret = ceph_iterate_extents(&rbd_dev->layout,
					   img_extents[i].fe_off,
					   img_extents[i].fe_len,
					   &img_req->object_extents,
					   fctx->copy_fn, &fctx->iter);
		if (ret)
			return ret;
	}

	return __rbd_img_fill_request(img_req);
}

static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
			       u64 off, u64 len)
{
	struct ceph_file_extent ex = { off, len };
	union rbd_img_fill_iter dummy = {};
	struct rbd_img_fill_ctx fctx = {
		.pos_type = OBJ_REQUEST_NODATA,
		.pos = &dummy,
	};

	return rbd_img_fill_request(img_req, &ex, 1, &fctx);
}

static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bio_iter *it = arg;

	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
	obj_req->bio_pos = *it;
	ceph_bio_iter_advance(it, bytes);
}

static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bio_iter *it = arg;

	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
	ceph_bio_iter_advance_step(it, bytes, ({
		obj_req->bvec_count++;
	}));
}

static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bio_iter *it = arg;

	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
	ceph_bio_iter_advance_step(it, bytes, ({
		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
	}));
}

static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
				   struct ceph_file_extent *img_extents,
				   u32 num_img_extents,
				   struct ceph_bio_iter *bio_pos)
{
	struct rbd_img_fill_ctx fctx = {
		.pos_type = OBJ_REQUEST_BIO,
		.pos = (union rbd_img_fill_iter *)bio_pos,
		.set_pos_fn = set_bio_pos,
		.count_fn = count_bio_bvecs,
		.copy_fn = copy_bio_bvecs,
	};

	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
				    &fctx);
}

static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
				 u64 off, u64 len, struct bio *bio)
{
	struct ceph_file_extent ex = { off, len };
	struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };

	return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
}

static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bvec_iter *it = arg;

	obj_req->bvec_pos = *it;
	ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
	ceph_bvec_iter_advance(it, bytes);
}

static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bvec_iter *it = arg;

	ceph_bvec_iter_advance_step(it, bytes, ({
		obj_req->bvec_count++;
	}));
}

static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bvec_iter *it = arg;

	ceph_bvec_iter_advance_step(it, bytes, ({
		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
	}));
}

static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
				     struct ceph_file_extent *img_extents,
				     u32 num_img_extents,
				     struct ceph_bvec_iter *bvec_pos)
{
	struct rbd_img_fill_ctx fctx = {
		.pos_type = OBJ_REQUEST_BVECS,
		.pos = (union rbd_img_fill_iter *)bvec_pos,
		.set_pos_fn = set_bvec_pos,
		.count_fn = count_bvecs,
		.copy_fn = copy_bvecs,
	};

	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
				    &fctx);
}

static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
				   struct ceph_file_extent *img_extents,
				   u32 num_img_extents,
				   struct bio_vec *bvecs)
{
	struct ceph_bvec_iter it = {
		.bvecs = bvecs,
		.iter = { .bi_size = ceph_file_extents_bytes(img_extents,
							     num_img_extents) },
	};

	return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
					 &it);
}

static void rbd_img_handle_request_work(struct work_struct *work)
{
	struct rbd_img_request *img_req =
	    container_of(work, struct rbd_img_request, work);

	rbd_img_handle_request(img_req, img_req->work_result);
}

static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
{
	INIT_WORK(&img_req->work, rbd_img_handle_request_work);
	img_req->work_result = result;
	queue_work(rbd_wq, &img_req->work);
}

static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) {
		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
		return true;
	}

	dout("%s %p objno %llu assuming dne\n", __func__, obj_req,
	     obj_req->ex.oe_objno);
	return false;
}

static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
{
	struct ceph_osd_request *osd_req;
	int ret;

	osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1);
	if (IS_ERR(osd_req))
		return PTR_ERR(osd_req);

	osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ,
			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
	rbd_osd_setup_data(osd_req, 0);
	rbd_osd_format_read(osd_req);

	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
	if (ret)
		return ret;

	rbd_osd_submit(osd_req);
	return 0;
}

static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
{
	struct rbd_img_request *img_req = obj_req->img_request;
	struct rbd_device *parent = img_req->rbd_dev->parent;
	struct rbd_img_request *child_img_req;
	int ret;

	child_img_req = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
	if (!child_img_req)
		return -ENOMEM;

	rbd_img_request_init(child_img_req, parent, OBJ_OP_READ);
	__set_bit(IMG_REQ_CHILD, &child_img_req->flags);
	child_img_req->obj_request = obj_req;

	down_read(&parent->header_rwsem);
	rbd_img_capture_header(child_img_req);
	up_read(&parent->header_rwsem);

	dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
	     obj_req);

	if (!rbd_img_is_write(img_req)) {
		switch (img_req->data_type) {
		case OBJ_REQUEST_BIO:
			ret = __rbd_img_fill_from_bio(child_img_req,
						      obj_req->img_extents,
						      obj_req->num_img_extents,
						      &obj_req->bio_pos);
			break;
		case OBJ_REQUEST_BVECS:
		case OBJ_REQUEST_OWN_BVECS:
			ret = __rbd_img_fill_from_bvecs(child_img_req,
						      obj_req->img_extents,
						      obj_req->num_img_extents,
						      &obj_req->bvec_pos);
			break;
		default:
			BUG();
		}
	} else {
		ret = rbd_img_fill_from_bvecs(child_img_req,
					      obj_req->img_extents,
					      obj_req->num_img_extents,
					      obj_req->copyup_bvecs);
	}
	if (ret) {
		rbd_img_request_destroy(child_img_req);
		return ret;
	}

	/* avoid parent chain recursion */
	rbd_img_schedule(child_img_req, 0);
	return 0;
}

static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	int ret;

again:
	switch (obj_req->read_state) {
	case RBD_OBJ_READ_START:
		rbd_assert(!*result);

		if (!rbd_obj_may_exist(obj_req)) {
			*result = -ENOENT;
			obj_req->read_state = RBD_OBJ_READ_OBJECT;
			goto again;
		}

		ret = rbd_obj_read_object(obj_req);
		if (ret) {
			*result = ret;
			return true;
		}
		obj_req->read_state = RBD_OBJ_READ_OBJECT;
		return false;
	case RBD_OBJ_READ_OBJECT:
		if (*result == -ENOENT && rbd_dev->parent_overlap) {
			/* reverse map this object extent onto the parent */
			ret = rbd_obj_calc_img_extents(obj_req, false);
			if (ret) {
				*result = ret;
				return true;
			}
			if (obj_req->num_img_extents) {
				ret = rbd_obj_read_from_parent(obj_req);
				if (ret) {
					*result = ret;
					return true;
				}
				obj_req->read_state = RBD_OBJ_READ_PARENT;
				return false;
			}
		}

		/*
		 * -ENOENT means a hole in the image -- zero-fill the entire
		 * length of the request.  A short read also implies zero-fill
		 * to the end of the request.
		 */
		if (*result == -ENOENT) {
			rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len);
			*result = 0;
		} else if (*result >= 0) {
			if (*result < obj_req->ex.oe_len)
				rbd_obj_zero_range(obj_req, *result,
					    obj_req->ex.oe_len - *result);
			else
				rbd_assert(*result == obj_req->ex.oe_len);
			*result = 0;
		}
		return true;
	case RBD_OBJ_READ_PARENT:
		/*
		 * The parent image is read only up to the overlap -- zero-fill
		 * from the overlap to the end of the request.
		 */
		if (!*result) {
			u32 obj_overlap = rbd_obj_img_extents_bytes(obj_req);

			if (obj_overlap < obj_req->ex.oe_len)
				rbd_obj_zero_range(obj_req, obj_overlap,
					    obj_req->ex.oe_len - obj_overlap);
		}
		return true;
	default:
		BUG();
	}
}
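
/*
 * Read state machine in brief: START issues the OSD read, or jumps
 * straight to OBJECT with -ENOENT when the object map says the object
 * doesn't exist; OBJECT redirects a miss to the parent image if there
 * is an overlap, otherwise zero-fills holes and short reads; PARENT
 * zero-fills whatever lies past the parent overlap.
 */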

static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno))
		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;

	if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) &&
	    (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) {
		dout("%s %p noop for nonexistent\n", __func__, obj_req);
		return true;
	}

	return false;
}

/*
 * Return:
 *   0 - object map update sent
 *   1 - object map update isn't needed
 *  <0 - error
 */
static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	u8 new_state;

	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
		return 1;

	if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
		new_state = OBJECT_PENDING;
	else
		new_state = OBJECT_EXISTS;

	return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL);
}

static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
{
	struct ceph_osd_request *osd_req;
	int num_ops = count_write_ops(obj_req);
	int which = 0;
	int ret;

	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
		num_ops++; /* stat */

	osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
	if (IS_ERR(osd_req))
		return PTR_ERR(osd_req);

	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
		ret = rbd_osd_setup_stat(osd_req, which++);
		if (ret)
			return ret;
	}

	rbd_osd_setup_write_ops(osd_req, which);
	rbd_osd_format_write(osd_req);

	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
	if (ret)
		return ret;

	rbd_osd_submit(osd_req);
	return 0;
}

/*
 * copyup_bvecs pages are never highmem pages
 */
static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
{
	struct ceph_bvec_iter it = {
		.bvecs = bvecs,
		.iter = { .bi_size = bytes },
	};

	ceph_bvec_iter_advance_step(&it, bytes, ({
		if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
			       bv.bv_len))
			return false;
	}));
	return true;
}

#define MODS_ONLY	U32_MAX

static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req,
				      u32 bytes)
{
	struct ceph_osd_request *osd_req;
	int ret;

	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
	rbd_assert(bytes > 0 && bytes != MODS_ONLY);

	osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1);
	if (IS_ERR(osd_req))
		return PTR_ERR(osd_req);

	ret = rbd_osd_setup_copyup(osd_req, 0, bytes);
	if (ret)
		return ret;

	rbd_osd_format_write(osd_req);

	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
	if (ret)
		return ret;

	rbd_osd_submit(osd_req);
	return 0;
}

static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req,
					u32 bytes)
{
	struct ceph_osd_request *osd_req;
	int num_ops = count_write_ops(obj_req);
	int which = 0;
	int ret;

	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);

	if (bytes != MODS_ONLY)
		num_ops++; /* copyup */

	osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
	if (IS_ERR(osd_req))
		return PTR_ERR(osd_req);

	if (bytes != MODS_ONLY) {
		ret = rbd_osd_setup_copyup(osd_req, which++, bytes);
		if (ret)
			return ret;
	}

	rbd_osd_setup_write_ops(osd_req, which);
	rbd_osd_format_write(osd_req);

	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
	if (ret)
		return ret;

	rbd_osd_submit(osd_req);
	return 0;
}

static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
{
	u32 i;

	rbd_assert(!obj_req->copyup_bvecs);
	obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
	obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
					sizeof(*obj_req->copyup_bvecs),
					GFP_NOIO);
	if (!obj_req->copyup_bvecs)
		return -ENOMEM;

	for (i = 0; i < obj_req->copyup_bvec_count; i++) {
		unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);

		obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
		if (!obj_req->copyup_bvecs[i].bv_page)
			return -ENOMEM;

		obj_req->copyup_bvecs[i].bv_offset = 0;
		obj_req->copyup_bvecs[i].bv_len = len;
		obj_overlap -= len;
	}

	rbd_assert(!obj_overlap);
	return 0;
}
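
/*
 * Example: an object overlap of 10000 bytes with 4k pages yields
 * calc_pages_for(0, 10000) = 3 bvecs of 4096, 4096 and 1808 bytes,
 * each backed by its own freshly allocated page -- full pages first,
 * the remainder last.
 */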

/*
 * The target object doesn't exist.  Read the data for the entire
 * target object up to the overlap point (if any) from the parent,
 * so we can use it for a copyup.
 */
static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	int ret;

	rbd_assert(obj_req->num_img_extents);
	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
		      rbd_dev->parent_overlap);
	if (!obj_req->num_img_extents) {
		/*
		 * The overlap has become 0 (most likely because the
		 * image has been flattened).  Re-submit the original write
		 * request -- pass MODS_ONLY since the copyup isn't needed
		 * anymore.
		 */
		return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY);
	}

	ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
	if (ret)
		return ret;

	return rbd_obj_read_from_parent(obj_req);
}

static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	struct ceph_snap_context *snapc = obj_req->img_request->snapc;
	u8 new_state;
	u32 i;
	int ret;

	rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);

	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
		return;

	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
		return;

	for (i = 0; i < snapc->num_snaps; i++) {
		if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) &&
		    i + 1 < snapc->num_snaps)
			new_state = OBJECT_EXISTS_CLEAN;
		else
			new_state = OBJECT_EXISTS;

		ret = rbd_object_map_update(obj_req, snapc->snaps[i],
					    new_state, NULL);
		if (ret < 0) {
			obj_req->pending.result = ret;
			return;
		}

		rbd_assert(!ret);
		obj_req->pending.num_pending++;
	}
}

static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
{
	u32 bytes = rbd_obj_img_extents_bytes(obj_req);
	int ret;

	rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);

	/*
	 * Only send non-zero copyup data to save some I/O and network
	 * bandwidth -- zero copyup data is equivalent to the object not
	 * existing.
	 */
	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
		bytes = 0;

	if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
		/*
		 * Send a copyup request with an empty snapshot context to
		 * deep-copyup the object through all existing snapshots.
		 * A second request with the current snapshot context will be
		 * sent for the actual modification.
		 */
		ret = rbd_obj_copyup_empty_snapc(obj_req, bytes);
		if (ret) {
			obj_req->pending.result = ret;
			return;
		}

		obj_req->pending.num_pending++;
		bytes = MODS_ONLY;
	}

	ret = rbd_obj_copyup_current_snapc(obj_req, bytes);
	if (ret) {
		obj_req->pending.result = ret;
		return;
	}

	obj_req->pending.num_pending++;
}

static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	int ret;

again:
	switch (obj_req->copyup_state) {
	case RBD_OBJ_COPYUP_START:
		rbd_assert(!*result);

		ret = rbd_obj_copyup_read_parent(obj_req);
		if (ret) {
			*result = ret;
			return true;
		}
		if (obj_req->num_img_extents)
			obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT;
		else
			obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
		return false;
	case RBD_OBJ_COPYUP_READ_PARENT:
		if (*result)
			return true;

		if (is_zero_bvecs(obj_req->copyup_bvecs,
				  rbd_obj_img_extents_bytes(obj_req))) {
			dout("%s %p detected zeros\n", __func__, obj_req);
			obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS;
		}

		rbd_obj_copyup_object_maps(obj_req);
		if (!obj_req->pending.num_pending) {
			*result = obj_req->pending.result;
			obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS;
			goto again;
		}
		obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS;
		return false;
	case __RBD_OBJ_COPYUP_OBJECT_MAPS:
		if (!pending_result_dec(&obj_req->pending, result))
			return false;
		fallthrough;
	case RBD_OBJ_COPYUP_OBJECT_MAPS:
		if (*result) {
			rbd_warn(rbd_dev, "snap object map update failed: %d",
				 *result);
			return true;
		}

		rbd_obj_copyup_write_object(obj_req);
		if (!obj_req->pending.num_pending) {
			*result = obj_req->pending.result;
			obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
			goto again;
		}
		obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT;
		return false;
	case __RBD_OBJ_COPYUP_WRITE_OBJECT:
		if (!pending_result_dec(&obj_req->pending, result))
			return false;
		fallthrough;
	case RBD_OBJ_COPYUP_WRITE_OBJECT:
		return true;
	default:
		BUG();
	}
}

/*
 * Return:
 *   0 - object map update sent
 *   1 - object map update isn't needed
 *  <0 - error
 */
static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	u8 current_state = OBJECT_PENDING;

	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
		return 1;

	if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION))
		return 1;

	return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT,
				     &current_state);
}
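
/*
 * Together with rbd_obj_write_pre_object_map(), a deletion therefore
 * goes OBJECT_EXISTS -> OBJECT_PENDING (pre), then the OSD delete op,
 * then OBJECT_PENDING -> OBJECT_NONEXISTENT (post).  The post update
 * is conditional on the state still being OBJECT_PENDING, so a state
 * that changed in the meantime isn't clobbered.
 */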
3292
85b5e6d1 3293static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
8b3e1a56 3294{
793333a3 3295 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3da691bf 3296 int ret;
8b3e1a56 3297
793333a3 3298again:
3da691bf 3299 switch (obj_req->write_state) {
85b5e6d1
ID
3300 case RBD_OBJ_WRITE_START:
3301 rbd_assert(!*result);
3302
22e8bd51
ID
3303 if (rbd_obj_write_is_noop(obj_req))
3304 return true;
3305
3306 ret = rbd_obj_write_pre_object_map(obj_req);
3307 if (ret < 0) {
3308 *result = ret;
3309 return true;
3310 }
3311 obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP;
3312 if (ret > 0)
3313 goto again;
3314 return false;
3315 case RBD_OBJ_WRITE_PRE_OBJECT_MAP:
3316 if (*result) {
3317 rbd_warn(rbd_dev, "pre object map update failed: %d",
3318 *result);
3319 return true;
3320 }
85b5e6d1
ID
3321 ret = rbd_obj_write_object(obj_req);
3322 if (ret) {
3323 *result = ret;
3324 return true;
3325 }
3326 obj_req->write_state = RBD_OBJ_WRITE_OBJECT;
3327 return false;
0ad5d953 3328 case RBD_OBJ_WRITE_OBJECT:
54ab3b24 3329 if (*result == -ENOENT) {
0ad5d953 3330 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
793333a3
ID
3331 *result = 0;
3332 obj_req->copyup_state = RBD_OBJ_COPYUP_START;
3333 obj_req->write_state = __RBD_OBJ_WRITE_COPYUP;
3334 goto again;
0ad5d953 3335 }
3da691bf 3336 /*
0ad5d953
ID
3337 * On a non-existent object:
3338 * delete - -ENOENT, truncate/zero - 0
3da691bf 3339 */
0ad5d953
ID
3340 if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
3341 *result = 0;
3da691bf 3342 }
a9b67e69 3343 if (*result)
3a482501 3344 return true;
8b3e1a56 3345
793333a3
ID
3346 obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
3347 goto again;
3348 case __RBD_OBJ_WRITE_COPYUP:
3349 if (!rbd_obj_advance_copyup(obj_req, result))
3350 return false;
df561f66 3351 fallthrough;
793333a3 3352 case RBD_OBJ_WRITE_COPYUP:
22e8bd51 3353 if (*result) {
793333a3 3354 rbd_warn(rbd_dev, "copyup failed: %d", *result);
22e8bd51
ID
3355 return true;
3356 }
3357 ret = rbd_obj_write_post_object_map(obj_req);
3358 if (ret < 0) {
3359 *result = ret;
3360 return true;
3361 }
3362 obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP;
3363 if (ret > 0)
3364 goto again;
3365 return false;
3366 case RBD_OBJ_WRITE_POST_OBJECT_MAP:
3367 if (*result)
3368 rbd_warn(rbd_dev, "post object map update failed: %d",
3369 *result);
793333a3 3370 return true;
3da691bf 3371 default:
c6244b3b 3372 BUG();
3da691bf
ID
3373 }
3374}
02c74fba 3375
3da691bf 3376/*
0ad5d953 3377 * Return true if @obj_req is completed.
3da691bf 3378 */
54ab3b24
ID
3379static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req,
3380 int *result)
3da691bf 3381{
0ad5d953 3382 struct rbd_img_request *img_req = obj_req->img_request;
0192ce2e 3383 struct rbd_device *rbd_dev = img_req->rbd_dev;
0ad5d953
ID
3384 bool done;
3385
85b5e6d1 3386 mutex_lock(&obj_req->state_mutex);
0ad5d953 3387 if (!rbd_img_is_write(img_req))
85b5e6d1 3388 done = rbd_obj_advance_read(obj_req, result);
0ad5d953 3389 else
85b5e6d1
ID
3390 done = rbd_obj_advance_write(obj_req, result);
3391 mutex_unlock(&obj_req->state_mutex);
0ad5d953 3392
0192ce2e
ID
3393 if (done && *result) {
3394 rbd_assert(*result < 0);
3395 rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d",
3396 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
3397 obj_req->ex.oe_off, obj_req->ex.oe_len, *result);
3398 }
0ad5d953 3399 return done;
3da691bf 3400}
02c74fba 3401
0192ce2e
ID
3402/*
3403 * This is open-coded in rbd_img_handle_request() to avoid parent chain
3404 * recursion.
3405 */
3406static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result)
3407{
3408 if (__rbd_obj_handle_request(obj_req, &result))
3409 rbd_img_handle_request(obj_req->img_request, result);
3410}
3411
e1fddc8f
ID
3412static bool need_exclusive_lock(struct rbd_img_request *img_req)
3413{
3414 struct rbd_device *rbd_dev = img_req->rbd_dev;
3415
3416 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK))
3417 return false;
3418
3fe69921 3419 if (rbd_is_ro(rbd_dev))
e1fddc8f
ID
3420 return false;
3421
3422 rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
22e8bd51
ID
3423 if (rbd_dev->opts->lock_on_read ||
3424 (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
e1fddc8f
ID
3425 return true;
3426
3427 return rbd_img_is_write(img_req);
3428}
3429
637cd060 3430static bool rbd_lock_add_request(struct rbd_img_request *img_req)
e1fddc8f
ID
3431{
3432 struct rbd_device *rbd_dev = img_req->rbd_dev;
637cd060 3433 bool locked;
e1fddc8f
ID
3434
3435 lockdep_assert_held(&rbd_dev->lock_rwsem);
637cd060 3436 locked = rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED;
e1fddc8f
ID
3437 spin_lock(&rbd_dev->lock_lists_lock);
3438 rbd_assert(list_empty(&img_req->lock_item));
637cd060
ID
3439 if (!locked)
3440 list_add_tail(&img_req->lock_item, &rbd_dev->acquiring_list);
3441 else
3442 list_add_tail(&img_req->lock_item, &rbd_dev->running_list);
e1fddc8f 3443 spin_unlock(&rbd_dev->lock_lists_lock);
637cd060 3444 return locked;
e1fddc8f
ID
3445}
3446
static void rbd_lock_del_request(struct rbd_img_request *img_req)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	bool need_wakeup;

	lockdep_assert_held(&rbd_dev->lock_rwsem);
	spin_lock(&rbd_dev->lock_lists_lock);
	rbd_assert(!list_empty(&img_req->lock_item));
	list_del_init(&img_req->lock_item);
	need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING &&
		       list_empty(&rbd_dev->running_list));
	spin_unlock(&rbd_dev->lock_lists_lock);
	if (need_wakeup)
		complete(&rbd_dev->releasing_wait);
}

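/*
 * Returns 1 if @img_req can proceed right away (no exclusive lock
 * needed, or the lock is already held), 0 if it was queued on
 * acquiring_list pending lock acquisition, or a negative error.
 */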
static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;

	if (!need_exclusive_lock(img_req))
		return 1;

	if (rbd_lock_add_request(img_req))
		return 1;

	if (rbd_dev->opts->exclusive) {
		WARN_ON(1); /* lock got released? */
		return -EROFS;
	}

	/*
	 * Note the use of mod_delayed_work() in rbd_acquire_lock()
	 * and cancel_delayed_work() in wake_lock_waiters().
	 */
	dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
	queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
	return 0;
}

static void rbd_img_object_requests(struct rbd_img_request *img_req)
{
	struct rbd_obj_request *obj_req;

	rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);

	for_each_obj_request(img_req, obj_req) {
		int result = 0;

		if (__rbd_obj_handle_request(obj_req, &result)) {
			if (result) {
				img_req->pending.result = result;
				return;
			}
		} else {
			img_req->pending.num_pending++;
		}
	}
}

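/*
 * Image request state machine, driven by __rbd_img_handle_request():
 *
 *   RBD_IMG_START -> RBD_IMG_EXCLUSIVE_LOCK (wait for the exclusive
 *   lock if needed) -> __RBD_IMG_OBJECT_REQUESTS (some object requests
 *   still pending) -> RBD_IMG_OBJECT_REQUESTS (all completed).
 */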
static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	int ret;

again:
	switch (img_req->state) {
	case RBD_IMG_START:
		rbd_assert(!*result);

		ret = rbd_img_exclusive_lock(img_req);
		if (ret < 0) {
			*result = ret;
			return true;
		}
		img_req->state = RBD_IMG_EXCLUSIVE_LOCK;
		if (ret > 0)
			goto again;
		return false;
	case RBD_IMG_EXCLUSIVE_LOCK:
		if (*result)
			return true;

		rbd_assert(!need_exclusive_lock(img_req) ||
			   __rbd_is_lock_owner(rbd_dev));

		rbd_img_object_requests(img_req);
		if (!img_req->pending.num_pending) {
			*result = img_req->pending.result;
			img_req->state = RBD_IMG_OBJECT_REQUESTS;
			goto again;
		}
		img_req->state = __RBD_IMG_OBJECT_REQUESTS;
		return false;
	case __RBD_IMG_OBJECT_REQUESTS:
		if (!pending_result_dec(&img_req->pending, result))
			return false;
		fallthrough;
	case RBD_IMG_OBJECT_REQUESTS:
		return true;
	default:
		BUG();
	}
}

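/*
 * When the exclusive lock is involved, the state machine advances with
 * lock_rwsem held for read so that the lock state cannot change
 * underneath the request; a completed request is also dropped from its
 * lock list here, pairing with rbd_lock_add_request().
 */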
/*
 * Return true if @img_req is completed.
 */
static bool __rbd_img_handle_request(struct rbd_img_request *img_req,
				     int *result)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	bool done;

	if (need_exclusive_lock(img_req)) {
		down_read(&rbd_dev->lock_rwsem);
		mutex_lock(&img_req->state_mutex);
		done = rbd_img_advance(img_req, result);
		if (done)
			rbd_lock_del_request(img_req);
		mutex_unlock(&img_req->state_mutex);
		up_read(&rbd_dev->lock_rwsem);
	} else {
		mutex_lock(&img_req->state_mutex);
		done = rbd_img_advance(img_req, result);
		mutex_unlock(&img_req->state_mutex);
	}

	if (done && *result) {
		rbd_assert(*result < 0);
		rbd_warn(rbd_dev, "%s%s result %d",
			 test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "",
			 obj_op_name(img_req->op_type), *result);
	}
	return done;
}

static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
{
again:
	if (!__rbd_img_handle_request(img_req, &result))
		return;

	if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
		struct rbd_obj_request *obj_req = img_req->obj_request;

		rbd_img_request_destroy(img_req);
		if (__rbd_obj_handle_request(obj_req, &result)) {
			img_req = obj_req->img_request;
			goto again;
		}
	} else {
		struct request *rq = blk_mq_rq_from_pdu(img_req);

		rbd_img_request_destroy(img_req);
		blk_mq_end_request(rq, errno_to_blk_status(result));
	}
}

static const struct rbd_client_id rbd_empty_cid;

static bool rbd_cid_equal(const struct rbd_client_id *lhs,
			  const struct rbd_client_id *rhs)
{
	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
}

static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
{
	struct rbd_client_id cid;

	mutex_lock(&rbd_dev->watch_mutex);
	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
	cid.handle = rbd_dev->watch_cookie;
	mutex_unlock(&rbd_dev->watch_mutex);
	return cid;
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
			      const struct rbd_client_id *cid)
{
	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
	     cid->gid, cid->handle);
	rbd_dev->owner_cid = *cid; /* struct */
}

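/*
 * The lock cookie is RBD_LOCK_COOKIE_PREFIX followed by the current
 * watch cookie, e.g. (illustratively) "auto 123456789", which lets
 * find_watcher() map a locker back to a live watch.
 */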
static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
{
	mutex_lock(&rbd_dev->watch_mutex);
	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
	mutex_unlock(&rbd_dev->watch_mutex);
}

static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
{
	struct rbd_client_id cid = rbd_get_cid(rbd_dev);

	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
	strcpy(rbd_dev->lock_cookie, cookie);
	rbd_set_owner_cid(rbd_dev, &cid);
	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
}

/*
 * lock_rwsem must be held for write
 */
static int rbd_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char cookie[32];
	int ret;

	WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
		rbd_dev->lock_cookie[0] != '\0');

	format_lock_cookie(rbd_dev, cookie);
	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
			    RBD_LOCK_TAG, "", 0);
	if (ret)
		return ret;

	__rbd_lock(rbd_dev, cookie);
	return 0;
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_unlock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
		rbd_dev->lock_cookie[0] == '\0');

	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			      RBD_LOCK_NAME, rbd_dev->lock_cookie);
	if (ret && ret != -ENOENT)
		rbd_warn(rbd_dev, "failed to unlock header: %d", ret);

	/* treat errors as the image is unlocked */
	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	rbd_dev->lock_cookie[0] = '\0';
	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
}

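/*
 * Send a *LockPayload NotifyMessage on the header object.  The buffer
 * is a ceph encoding block (version 2, compat 1) followed by:
 *
 *   le32 notify_op
 *   le64 cid.gid
 *   le64 cid.handle
 *
 * hence the 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN sizing below.
 */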
static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
				enum rbd_notify_op notify_op,
				struct page ***preply_pages,
				size_t *preply_len)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
	char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
	int buf_size = sizeof(buf);
	void *p = buf;

	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);

	/* encode *LockPayload NotifyMessage (op + ClientId) */
	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
	ceph_encode_32(&p, notify_op);
	ceph_encode_64(&p, cid.gid);
	ceph_encode_64(&p, cid.handle);

	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
				&rbd_dev->header_oloc, buf, buf_size,
				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
}

static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
			       enum rbd_notify_op notify_op)
{
	__rbd_notify_op_lock(rbd_dev, notify_op, NULL, NULL);
}

static void rbd_notify_acquired_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  acquired_lock_work);

	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
}

static void rbd_notify_released_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  released_lock_work);

	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
}

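/*
 * Ask the current lock owner to release the lock.  The notify reply
 * carries one ResponseMessage per acker; a result of 0 means the owner
 * agreed to release, -EROFS that it refuses (see
 * rbd_handle_request_lock()), and no response at all is treated by the
 * caller as a dead client.
 */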
static int rbd_request_lock(struct rbd_device *rbd_dev)
{
	struct page **reply_pages;
	size_t reply_len;
	bool lock_owner_responded = false;
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
				   &reply_pages, &reply_len);
	if (ret && ret != -ETIMEDOUT) {
		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
		goto out;
	}

	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
		void *p = page_address(reply_pages[0]);
		void *const end = p + reply_len;
		u32 n;

		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
		while (n--) {
			u8 struct_v;
			u32 len;

			ceph_decode_need(&p, end, 8 + 8, e_inval);
			p += 8 + 8; /* skip gid and cookie */

			ceph_decode_32_safe(&p, end, len, e_inval);
			if (!len)
				continue;

			if (lock_owner_responded) {
				rbd_warn(rbd_dev,
					 "duplicate lock owners detected");
				ret = -EIO;
				goto out;
			}

			lock_owner_responded = true;
			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
						  &struct_v, &len);
			if (ret) {
				rbd_warn(rbd_dev,
					 "failed to decode ResponseMessage: %d",
					 ret);
				goto e_inval;
			}

			ret = ceph_decode_32(&p);
		}
	}

	if (!lock_owner_responded) {
		rbd_warn(rbd_dev, "no lock owners detected");
		ret = -ETIMEDOUT;
	}

out:
	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
	return ret;

e_inval:
	ret = -EINVAL;
	goto out;
}

/*
 * Either image request state machine(s) or rbd_add_acquire_lock()
 * (i.e. "rbd map").
 */
static void wake_lock_waiters(struct rbd_device *rbd_dev, int result)
{
	struct rbd_img_request *img_req;

	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
	lockdep_assert_held_write(&rbd_dev->lock_rwsem);

	cancel_delayed_work(&rbd_dev->lock_dwork);
	if (!completion_done(&rbd_dev->acquire_wait)) {
		rbd_assert(list_empty(&rbd_dev->acquiring_list) &&
			   list_empty(&rbd_dev->running_list));
		rbd_dev->acquire_err = result;
		complete_all(&rbd_dev->acquire_wait);
		return;
	}

	list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) {
		mutex_lock(&img_req->state_mutex);
		rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK);
		rbd_img_schedule(img_req, result);
		mutex_unlock(&img_req->state_mutex);
	}

	list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list);
}

static int get_lock_owner_info(struct rbd_device *rbd_dev,
			       struct ceph_locker **lockers, u32 *num_lockers)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	u8 lock_type;
	char *lock_tag;
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
				 &lock_type, &lock_tag, lockers, num_lockers);
	if (ret)
		return ret;

	if (*num_lockers == 0) {
		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
		goto out;
	}

	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
			 lock_tag);
		ret = -EBUSY;
		goto out;
	}

	if (lock_type == CEPH_CLS_LOCK_SHARED) {
		rbd_warn(rbd_dev, "shared lock type detected");
		ret = -EBUSY;
		goto out;
	}

	if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
		    strlen(RBD_LOCK_COOKIE_PREFIX))) {
		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
			 (*lockers)[0].id.cookie);
		ret = -EBUSY;
		goto out;
	}

out:
	kfree(lock_tag);
	return ret;
}

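/*
 * Check whether the locker still has a watch established on the
 * header object.  Returns 1 (and records the owner cid) if it does,
 * 0 if it does not (in which case the lock can be broken), or a
 * negative error.
 */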
static int find_watcher(struct rbd_device *rbd_dev,
			const struct ceph_locker *locker)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_watch_item *watchers;
	u32 num_watchers;
	u64 cookie;
	int i;
	int ret;

	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
				      &rbd_dev->header_oloc, &watchers,
				      &num_watchers);
	if (ret)
		return ret;

	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
	for (i = 0; i < num_watchers; i++) {
		/*
		 * Ignore addr->type while comparing.  This mimics
		 * entity_addr_t::get_legacy_str() + strcmp().
		 */
		if (ceph_addr_equal_no_type(&watchers[i].addr,
					    &locker->info.addr) &&
		    watchers[i].cookie == cookie) {
			struct rbd_client_id cid = {
				.gid = le64_to_cpu(watchers[i].name.num),
				.handle = cookie,
			};

			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
			     rbd_dev, cid.gid, cid.handle);
			rbd_set_owner_cid(rbd_dev, &cid);
			ret = 1;
			goto out;
		}
	}

	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
	ret = 0;
out:
	kfree(watchers);
	return ret;
}

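/*
 * Attempt to take the lock, breaking it if the current holder appears
 * to be dead (no matching watcher): the holder is blocklisted first so
 * that it cannot issue any more I/O, then its lock is removed.
 */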
/*
 * lock_rwsem must be held for write
 */
static int rbd_try_lock(struct rbd_device *rbd_dev)
{
	struct ceph_client *client = rbd_dev->rbd_client->client;
	struct ceph_locker *lockers;
	u32 num_lockers;
	int ret;

	for (;;) {
		ret = rbd_lock(rbd_dev);
		if (ret != -EBUSY)
			return ret;

		/* determine if the current lock holder is still alive */
		ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
		if (ret)
			return ret;

		if (num_lockers == 0)
			goto again;

		ret = find_watcher(rbd_dev, lockers);
		if (ret)
			goto out; /* request lock or error */

		rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
			 ENTITY_NAME(lockers[0].id.name));

		ret = ceph_monc_blocklist_add(&client->monc,
					      &lockers[0].info.addr);
		if (ret) {
			rbd_warn(rbd_dev, "blocklist of %s%llu failed: %d",
				 ENTITY_NAME(lockers[0].id.name), ret);
			goto out;
		}

		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
					  lockers[0].id.cookie,
					  &lockers[0].id.name);
		if (ret && ret != -ENOENT)
			goto out;

again:
		ceph_free_lockers(lockers, num_lockers);
	}

out:
	ceph_free_lockers(lockers, num_lockers);
	return ret;
}

static int rbd_post_acquire_action(struct rbd_device *rbd_dev)
{
	int ret;

	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) {
		ret = rbd_object_map_open(rbd_dev);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Return:
 *   0 - lock acquired
 *   1 - caller should call rbd_request_lock()
 *  <0 - error
 */
static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
{
	int ret;

	down_read(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (__rbd_is_lock_owner(rbd_dev)) {
		up_read(&rbd_dev->lock_rwsem);
		return 0;
	}

	up_read(&rbd_dev->lock_rwsem);
	down_write(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (__rbd_is_lock_owner(rbd_dev)) {
		up_write(&rbd_dev->lock_rwsem);
		return 0;
	}

	ret = rbd_try_lock(rbd_dev);
	if (ret < 0) {
		rbd_warn(rbd_dev, "failed to lock header: %d", ret);
		if (ret == -EBLOCKLISTED)
			goto out;

		ret = 1; /* request lock anyway */
	}
	if (ret > 0) {
		up_write(&rbd_dev->lock_rwsem);
		return ret;
	}

	rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED);
	rbd_assert(list_empty(&rbd_dev->running_list));

	ret = rbd_post_acquire_action(rbd_dev);
	if (ret) {
		rbd_warn(rbd_dev, "post-acquire action failed: %d", ret);
		/*
		 * Can't stay in RBD_LOCK_STATE_LOCKED because
		 * rbd_lock_add_request() would let the request through,
		 * assuming that e.g. object map is locked and loaded.
		 */
		rbd_unlock(rbd_dev);
	}

out:
	wake_lock_waiters(rbd_dev, ret);
	up_write(&rbd_dev->lock_rwsem);
	return ret;
}

static void rbd_acquire_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
					    struct rbd_device, lock_dwork);
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);
again:
	ret = rbd_try_acquire_lock(rbd_dev);
	if (ret <= 0) {
		dout("%s rbd_dev %p ret %d - done\n", __func__, rbd_dev, ret);
		return;
	}

	ret = rbd_request_lock(rbd_dev);
	if (ret == -ETIMEDOUT) {
		goto again; /* treat this as a dead client */
	} else if (ret == -EROFS) {
		rbd_warn(rbd_dev, "peer will not release lock");
		down_write(&rbd_dev->lock_rwsem);
		wake_lock_waiters(rbd_dev, ret);
		up_write(&rbd_dev->lock_rwsem);
	} else if (ret < 0) {
		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
				 RBD_RETRY_DELAY);
	} else {
		/*
		 * lock owner acked, but resend if we don't see them
		 * release the lock
		 */
		dout("%s rbd_dev %p requeuing lock_dwork\n", __func__,
		     rbd_dev);
		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
	}
}

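/*
 * Transition to RBD_LOCK_STATE_RELEASING and wait for running_list to
 * drain.  lock_rwsem must be dropped while waiting: in-flight requests
 * take it for read in order to complete (see
 * __rbd_img_handle_request()), so sleeping with it held would block
 * them.  Returns false if the lock was not held to begin with, or if
 * it was lost while lock_rwsem was dropped.
 */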
static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
{
	dout("%s rbd_dev %p\n", __func__, rbd_dev);
	lockdep_assert_held_write(&rbd_dev->lock_rwsem);

	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
		return false;

	/*
	 * Ensure that all in-flight IO is flushed.
	 */
	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
	rbd_assert(!completion_done(&rbd_dev->releasing_wait));
	if (list_empty(&rbd_dev->running_list))
		return true;

	up_write(&rbd_dev->lock_rwsem);
	wait_for_completion(&rbd_dev->releasing_wait);

	down_write(&rbd_dev->lock_rwsem);
	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
		return false;

	rbd_assert(list_empty(&rbd_dev->running_list));
	return true;
}

static void rbd_pre_release_action(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)
		rbd_object_map_close(rbd_dev);
}

static void __rbd_release_lock(struct rbd_device *rbd_dev)
{
	rbd_assert(list_empty(&rbd_dev->running_list));

	rbd_pre_release_action(rbd_dev);
	rbd_unlock(rbd_dev);
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_release_lock(struct rbd_device *rbd_dev)
{
	if (!rbd_quiesce_lock(rbd_dev))
		return;

	__rbd_release_lock(rbd_dev);

	/*
	 * Give others a chance to grab the lock - we would re-acquire
	 * almost immediately if we got new IO while draining the running
	 * list otherwise.  We need to ack our own notifications, so this
	 * lock_dwork will be requeued from rbd_handle_released_lock() by
	 * way of maybe_kick_acquire().
	 */
	cancel_delayed_work(&rbd_dev->lock_dwork);
}

static void rbd_release_lock_work(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  unlock_work);

	down_write(&rbd_dev->lock_rwsem);
	rbd_release_lock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);
}

static void maybe_kick_acquire(struct rbd_device *rbd_dev)
{
	bool have_requests;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);
	if (__rbd_is_lock_owner(rbd_dev))
		return;

	spin_lock(&rbd_dev->lock_lists_lock);
	have_requests = !list_empty(&rbd_dev->acquiring_list);
	spin_unlock(&rbd_dev->lock_lists_lock);
	if (have_requests || delayed_work_pending(&rbd_dev->lock_dwork)) {
		dout("%s rbd_dev %p kicking lock_dwork\n", __func__, rbd_dev);
		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
	}
}

static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
		down_write(&rbd_dev->lock_rwsem);
		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
			dout("%s rbd_dev %p cid %llu-%llu == owner_cid\n",
			     __func__, rbd_dev, cid.gid, cid.handle);
		} else {
			rbd_set_owner_cid(rbd_dev, &cid);
		}
		downgrade_write(&rbd_dev->lock_rwsem);
	} else {
		down_read(&rbd_dev->lock_rwsem);
	}

	maybe_kick_acquire(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
}

static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
		down_write(&rbd_dev->lock_rwsem);
		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
			dout("%s rbd_dev %p cid %llu-%llu != owner_cid %llu-%llu\n",
			     __func__, rbd_dev, cid.gid, cid.handle,
			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
		} else {
			rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
		}
		downgrade_write(&rbd_dev->lock_rwsem);
	} else {
		down_read(&rbd_dev->lock_rwsem);
	}

	maybe_kick_acquire(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
}

/*
 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
 * ResponseMessage is needed.
 */
static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
				   void **p)
{
	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
	struct rbd_client_id cid = { 0 };
	int result = 1;

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (rbd_cid_equal(&cid, &my_cid))
		return result;

	down_read(&rbd_dev->lock_rwsem);
	if (__rbd_is_lock_owner(rbd_dev)) {
		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
		    rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
			goto out_unlock;

		/*
		 * encode ResponseMessage(0) so the peer can detect
		 * a missing owner
		 */
		result = 0;

		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
			if (!rbd_dev->opts->exclusive) {
				dout("%s rbd_dev %p queueing unlock_work\n",
				     __func__, rbd_dev);
				queue_work(rbd_dev->task_wq,
					   &rbd_dev->unlock_work);
			} else {
				/* refuse to release the lock */
				result = -EROFS;
			}
		}
	}

out_unlock:
	up_read(&rbd_dev->lock_rwsem);
	return result;
}

static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
				     u64 notify_id, u64 cookie, s32 *result)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char buf[4 + CEPH_ENCODING_START_BLK_LEN];
	int buf_size = sizeof(buf);
	int ret;

	if (result) {
		void *p = buf;

		/* encode ResponseMessage */
		ceph_start_encoding(&p, 1, 1,
				    buf_size - CEPH_ENCODING_START_BLK_LEN);
		ceph_encode_32(&p, *result);
	} else {
		buf_size = 0;
	}

	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
				   &rbd_dev->header_oloc, notify_id, cookie,
				   buf, buf_size);
	if (ret)
		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
}

static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
				   u64 cookie)
{
	dout("%s rbd_dev %p\n", __func__, rbd_dev);
	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
}

static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
					  u64 notify_id, u64 cookie, s32 result)
{
	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
}

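/*
 * Watch callback: decode the NotifyMessage (op plus optional ClientId)
 * and dispatch lock acquired/released/request notifications and header
 * updates.  Every notify must be acked, with a ResponseMessage payload
 * where the peer expects a result.
 */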
static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
			 u64 notifier_id, void *data, size_t data_len)
{
	struct rbd_device *rbd_dev = arg;
	void *p = data;
	void *const end = p + data_len;
	u8 struct_v = 0;
	u32 len;
	u32 notify_op;
	int ret;

	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
	     __func__, rbd_dev, cookie, notify_id, data_len);
	if (data_len) {
		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
					  &struct_v, &len);
		if (ret) {
			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
				 ret);
			return;
		}

		notify_op = ceph_decode_32(&p);
	} else {
		/* legacy notification for header updates */
		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
		len = 0;
	}

	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
	switch (notify_op) {
	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_RELEASED_LOCK:
		rbd_handle_released_lock(rbd_dev, struct_v, &p);
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_REQUEST_LOCK:
		ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
		if (ret <= 0)
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, ret);
		else
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_HEADER_UPDATE:
		ret = rbd_dev_refresh(rbd_dev);
		if (ret)
			rbd_warn(rbd_dev, "refresh failed: %d", ret);

		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	default:
		if (rbd_is_lock_owner(rbd_dev))
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, -EOPNOTSUPP);
		else
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	}
}

static void __rbd_unregister_watch(struct rbd_device *rbd_dev);

static void rbd_watch_errcb(void *arg, u64 cookie, int err)
{
	struct rbd_device *rbd_dev = arg;

	rbd_warn(rbd_dev, "encountered watch error: %d", err);

	down_write(&rbd_dev->lock_rwsem);
	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
	up_write(&rbd_dev->lock_rwsem);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
		__rbd_unregister_watch(rbd_dev);
		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;

		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
	}
	mutex_unlock(&rbd_dev->watch_mutex);
}

/*
 * watch_mutex must be locked
 */
static int __rbd_register_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_linger_request *handle;

	rbd_assert(!rbd_dev->watch_handle);
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, rbd_watch_cb,
				 rbd_watch_errcb, rbd_dev);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	rbd_dev->watch_handle = handle;
	return 0;
}

/*
 * watch_mutex must be locked
 */
static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	rbd_assert(rbd_dev->watch_handle);
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
	if (ret)
		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);

	rbd_dev->watch_handle = NULL;
}

static int rbd_register_watch(struct rbd_device *rbd_dev)
{
	int ret;

	mutex_lock(&rbd_dev->watch_mutex);
	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
	ret = __rbd_register_watch(rbd_dev);
	if (ret)
		goto out;

	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;

out:
	mutex_unlock(&rbd_dev->watch_mutex);
	return ret;
}

static void cancel_tasks_sync(struct rbd_device *rbd_dev)
{
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	cancel_work_sync(&rbd_dev->acquired_lock_work);
	cancel_work_sync(&rbd_dev->released_lock_work);
	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
	cancel_work_sync(&rbd_dev->unlock_work);
}

/*
 * header_rwsem must not be held to avoid a deadlock with
 * rbd_dev_refresh() when flushing notifies.
 */
static void rbd_unregister_watch(struct rbd_device *rbd_dev)
{
	cancel_tasks_sync(rbd_dev);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
		__rbd_unregister_watch(rbd_dev);
	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
	mutex_unlock(&rbd_dev->watch_mutex);

	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
}

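/*
 * After the watch is re-established, its cookie has changed, so the
 * lock must be re-asserted under the new cookie.  Newer OSDs allow
 * updating the cookie in place; on older ones the lock is released
 * and re-acquired from scratch.
 */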
/*
 * lock_rwsem must be held for write
 */
static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char cookie[32];
	int ret;

	if (!rbd_quiesce_lock(rbd_dev))
		return;

	format_lock_cookie(rbd_dev, cookie);
	ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, RBD_LOCK_NAME,
				  CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
				  RBD_LOCK_TAG, cookie);
	if (ret) {
		if (ret != -EOPNOTSUPP)
			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
				 ret);

		/*
		 * Lock cookie cannot be updated on older OSDs, so do
		 * a manual release and queue an acquire.
		 */
		__rbd_release_lock(rbd_dev);
		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
	} else {
		__rbd_lock(rbd_dev, cookie);
		wake_lock_waiters(rbd_dev, 0);
	}
}

static void rbd_reregister_watch(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
					    struct rbd_device, watch_dwork);
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
		mutex_unlock(&rbd_dev->watch_mutex);
		return;
	}

	ret = __rbd_register_watch(rbd_dev);
	if (ret) {
		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
		if (ret != -EBLOCKLISTED && ret != -ENOENT) {
			queue_delayed_work(rbd_dev->task_wq,
					   &rbd_dev->watch_dwork,
					   RBD_RETRY_DELAY);
			mutex_unlock(&rbd_dev->watch_mutex);
			return;
		}

		mutex_unlock(&rbd_dev->watch_mutex);
		down_write(&rbd_dev->lock_rwsem);
		wake_lock_waiters(rbd_dev, ret);
		up_write(&rbd_dev->lock_rwsem);
		return;
	}

	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
	mutex_unlock(&rbd_dev->watch_mutex);

	down_write(&rbd_dev->lock_rwsem);
	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
		rbd_reacquire_lock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);

	ret = rbd_dev_refresh(rbd_dev);
	if (ret)
		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
}

/*
 * Synchronous osd object method call.  Returns the number of bytes
 * returned in the inbound buffer, or a negative error code.
 */
static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
			       struct ceph_object_id *oid,
			       struct ceph_object_locator *oloc,
			       const char *method_name,
			       const void *outbound,
			       size_t outbound_size,
			       void *inbound,
			       size_t inbound_size)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct page *req_page = NULL;
	struct page *reply_page;
	int ret;

	/*
	 * Method calls are ultimately read operations.  The result
	 * should be placed into the inbound buffer provided.  They
	 * also supply outbound data--parameters for the object
	 * method.  Currently if this is present it will be a
	 * snapshot id.
	 */
	if (outbound) {
		if (outbound_size > PAGE_SIZE)
			return -E2BIG;

		req_page = alloc_page(GFP_KERNEL);
		if (!req_page)
			return -ENOMEM;

		memcpy(page_address(req_page), outbound, outbound_size);
	}

	reply_page = alloc_page(GFP_KERNEL);
	if (!reply_page) {
		if (req_page)
			__free_page(req_page);
		return -ENOMEM;
	}

	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
			     &reply_page, &inbound_size);
	if (!ret) {
		memcpy(inbound, page_address(reply_page), inbound_size);
		ret = inbound_size;
	}

	if (req_page)
		__free_page(req_page);
	__free_page(reply_page);
	return ret;
}

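/*
 * Per-request work function: every block layer request is bounced to
 * rbd_wq and processed here, where the image request is filled from
 * the bio chain (or marked no-data for discard/zeroout) and handed to
 * the state machine.
 */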
static void rbd_queue_workfn(struct work_struct *work)
{
	struct rbd_img_request *img_request =
	    container_of(work, struct rbd_img_request, work);
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	enum obj_operation_type op_type = img_request->op_type;
	struct request *rq = blk_mq_rq_from_pdu(img_request);
	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
	u64 length = blk_rq_bytes(rq);
	u64 mapping_size;
	int result;

	/* Ignore/skip any zero-length requests */
	if (!length) {
		dout("%s: zero-length request\n", __func__);
		result = 0;
		goto err_img_request;
	}

	blk_mq_start_request(rq);

	down_read(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;
	rbd_img_capture_header(img_request);
	up_read(&rbd_dev->header_rwsem);

	if (offset + length > mapping_size) {
		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
			 length, mapping_size);
		result = -EIO;
		goto err_img_request;
	}

	dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
	     img_request, obj_op_name(op_type), offset, length);

	if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
		result = rbd_img_fill_nodata(img_request, offset, length);
	else
		result = rbd_img_fill_from_bio(img_request, offset, length,
					       rq->bio);
	if (result)
		goto err_img_request;

	rbd_img_handle_request(img_request, 0);
	return;

err_img_request:
	rbd_img_request_destroy(img_request);
	if (result)
		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
			 obj_op_name(op_type), length, offset, result);
	blk_mq_end_request(rq, errno_to_blk_status(result));
}

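/*
 * The image request lives in the blk-mq PDU (see tag_set.cmd_size in
 * rbd_init_disk()), so no extra allocation is needed per request;
 * blk_mq_rq_to_pdu()/blk_mq_rq_from_pdu() convert between the two.
 */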
static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct rbd_device *rbd_dev = hctx->queue->queuedata;
	struct rbd_img_request *img_req = blk_mq_rq_to_pdu(bd->rq);
	enum obj_operation_type op_type;

	switch (req_op(bd->rq)) {
	case REQ_OP_DISCARD:
		op_type = OBJ_OP_DISCARD;
		break;
	case REQ_OP_WRITE_ZEROES:
		op_type = OBJ_OP_ZEROOUT;
		break;
	case REQ_OP_WRITE:
		op_type = OBJ_OP_WRITE;
		break;
	case REQ_OP_READ:
		op_type = OBJ_OP_READ;
		break;
	default:
		rbd_warn(rbd_dev, "unknown req_op %d", req_op(bd->rq));
		return BLK_STS_IOERR;
	}

	rbd_img_request_init(img_req, rbd_dev, op_type);

	if (rbd_img_is_write(img_req)) {
		if (rbd_is_ro(rbd_dev)) {
			rbd_warn(rbd_dev, "%s on read-only mapping",
				 obj_op_name(img_req->op_type));
			return BLK_STS_IOERR;
		}
		rbd_assert(!rbd_is_snap(rbd_dev));
	}

	INIT_WORK(&img_req->work, rbd_queue_workfn);
	queue_work(rbd_wq, &img_req->work);
	return BLK_STS_OK;
}

static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	blk_cleanup_disk(rbd_dev->disk);
	blk_mq_free_tag_set(&rbd_dev->tag_set);
	rbd_dev->disk = NULL;
}

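/*
 * Synchronously read up to @buf_len bytes from the start of an object
 * into @buf.  Used below to fetch the v1 image header.  Returns the
 * number of bytes read, or a negative error.
 */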
static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
			     struct ceph_object_id *oid,
			     struct ceph_object_locator *oloc,
			     void *buf, int buf_len)

{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;
	struct page **pages;
	int num_pages = calc_pages_for(0, buf_len);
	int ret;

	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
	if (!req)
		return -ENOMEM;

	ceph_oid_copy(&req->r_base_oid, oid);
	ceph_oloc_copy(&req->r_base_oloc, oloc);
	req->r_flags = CEPH_OSD_FLAG_READ;

	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		goto out_req;
	}

	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
					 true);

	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
	if (ret)
		goto out_req;

	ceph_osdc_start_request(osdc, req, false);
	ret = ceph_osdc_wait_request(osdc, req);
	if (ret >= 0)
		ceph_copy_from_page_vector(pages, buf, 0, ret);

out_req:
	ceph_osdc_put_request(req);
	return ret;
}

/*
 * Read the complete header for the given rbd device.  On successful
 * return, the rbd_dev->header field will contain up-to-date
 * information about the image.
 */
static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		kfree(ondisk);

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return -ENOMEM;

		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
					&rbd_dev->header_oloc, ondisk, size);
		if (ret < 0)
			goto out;
		if ((size_t)ret < size) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
				 size, ret);
			goto out;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "invalid header");
			goto out;
		}

		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	ret = rbd_header_from_disk(rbd_dev, ondisk);
out:
	kfree(ondisk);

	return ret;
}

4852}
4853
9875201e
JD
4854static void rbd_dev_update_size(struct rbd_device *rbd_dev)
4855{
4856 sector_t size;
9875201e
JD
4857
4858 /*
811c6688
ID
4859 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
4860 * try to update its size. If REMOVING is set, updating size
4861 * is just useless work since the device can't be opened.
9875201e 4862 */
811c6688
ID
4863 if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
4864 !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
9875201e
JD
4865 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
4866 dout("setting size to %llu sectors", (unsigned long long)size);
e864e49a 4867 set_capacity_and_notify(rbd_dev->disk, size);
9875201e
JD
4868 }
4869}
4870
cc4a38bd 4871static int rbd_dev_refresh(struct rbd_device *rbd_dev)
1fe5e993 4872{
e627db08 4873 u64 mapping_size;
1fe5e993
AE
4874 int ret;
4875
cfbf6377 4876 down_write(&rbd_dev->header_rwsem);
3b5cf2a2 4877 mapping_size = rbd_dev->mapping.size;
a720ae09
ID
4878
4879 ret = rbd_dev_header_info(rbd_dev);
52bb1f9b 4880 if (ret)
73e39e4d 4881 goto out;
15228ede 4882
e8f59b59
ID
4883 /*
4884 * If there is a parent, see if it has disappeared due to the
4885 * mapped image getting flattened.
4886 */
4887 if (rbd_dev->parent) {
4888 ret = rbd_dev_v2_parent_info(rbd_dev);
4889 if (ret)
73e39e4d 4890 goto out;
e8f59b59
ID
4891 }
4892
686238b7
ID
4893 rbd_assert(!rbd_is_snap(rbd_dev));
4894 rbd_dev->mapping.size = rbd_dev->header.image_size;
15228ede 4895
73e39e4d 4896out:
cfbf6377 4897 up_write(&rbd_dev->header_rwsem);
73e39e4d 4898 if (!ret && mapping_size != rbd_dev->mapping.size)
9875201e 4899 rbd_dev_update_size(rbd_dev);
1fe5e993 4900
73e39e4d 4901 return ret;
1fe5e993
AE
4902}
4903
f363b089 4904static const struct blk_mq_ops rbd_mq_ops = {
7ad18afa 4905 .queue_rq = rbd_queue_rq,
7ad18afa
CH
4906};
4907
602adf40
YS
4908static int rbd_init_disk(struct rbd_device *rbd_dev)
4909{
4910 struct gendisk *disk;
4911 struct request_queue *q;
420efbdf
ID
4912 unsigned int objset_bytes =
4913 rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
7ad18afa 4914 int err;
602adf40 4915
7ad18afa
CH
4916 memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
4917 rbd_dev->tag_set.ops = &rbd_mq_ops;
b5584180 4918 rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
7ad18afa 4919 rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
56d18f62 4920 rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
f9b6b98d 4921 rbd_dev->tag_set.nr_hw_queues = num_present_cpus();
59e542c8 4922 rbd_dev->tag_set.cmd_size = sizeof(struct rbd_img_request);
7ad18afa
CH
4923
4924 err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
4925 if (err)
195b1956 4926 return err;
029bcbd8 4927
195b1956
CH
4928 disk = blk_mq_alloc_disk(&rbd_dev->tag_set, rbd_dev);
4929 if (IS_ERR(disk)) {
4930 err = PTR_ERR(disk);
7ad18afa
CH
4931 goto out_tag_set;
4932 }
195b1956
CH
4933 q = disk->queue;
4934
4935 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
4936 rbd_dev->dev_id);
4937 disk->major = rbd_dev->major;
4938 disk->first_minor = rbd_dev->minor;
4939 if (single_major) {
4940 disk->minors = (1 << RBD_SINGLE_MAJOR_PART_SHIFT);
4941 disk->flags |= GENHD_FL_EXT_DEVT;
4942 } else {
4943 disk->minors = RBD_MINORS_PER_MAJOR;
4944 }
4945 disk->fops = &rbd_bd_ops;
7ad18afa 4946
8b904b5b 4947 blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
d8a2c89c 4948 /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
593a9e7b 4949
420efbdf 4950 blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
0d9fde4f 4951 q->limits.max_sectors = queue_max_hw_sectors(q);
21acdf45 4952 blk_queue_max_segments(q, USHRT_MAX);
24f1df60 4953 blk_queue_max_segment_size(q, UINT_MAX);
16d80c54
ID
4954 blk_queue_io_min(q, rbd_dev->opts->alloc_size);
4955 blk_queue_io_opt(q, rbd_dev->opts->alloc_size);
029bcbd8 4956
d9360540
ID
4957 if (rbd_dev->opts->trim) {
4958 blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
16d80c54 4959 q->limits.discard_granularity = rbd_dev->opts->alloc_size;
d9360540
ID
4960 blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
4961 blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
4962 }
90e98c52 4963
bae818ee 4964 if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
1cb039f3 4965 blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
bae818ee 4966
602adf40 4967 rbd_dev->disk = disk;
602adf40 4968
602adf40 4969 return 0;
7ad18afa
CH
4970out_tag_set:
4971 blk_mq_free_tag_set(&rbd_dev->tag_set);
7ad18afa 4972 return err;
602adf40
YS
4973}
4974
dfc5606d
YS
4975/*
4976 sysfs
4977*/
4978
593a9e7b
AE
4979static struct rbd_device *dev_to_rbd_dev(struct device *dev)
4980{
4981 return container_of(dev, struct rbd_device, dev);
4982}
4983
dfc5606d
YS
4984static ssize_t rbd_size_show(struct device *dev,
4985 struct device_attribute *attr, char *buf)
4986{
593a9e7b 4987 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
a51aa0c0 4988
fc71d833
AE
4989 return sprintf(buf, "%llu\n",
4990 (unsigned long long)rbd_dev->mapping.size);
dfc5606d
YS
4991}
4992
34b13184
AE
4993static ssize_t rbd_features_show(struct device *dev,
4994 struct device_attribute *attr, char *buf)
4995{
4996 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4997
fa58bcad 4998 return sprintf(buf, "0x%016llx\n", rbd_dev->header.features);
34b13184
AE
4999}
5000
dfc5606d
YS
5001static ssize_t rbd_major_show(struct device *dev,
5002 struct device_attribute *attr, char *buf)
5003{
593a9e7b 5004 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 5005
fc71d833
AE
5006 if (rbd_dev->major)
5007 return sprintf(buf, "%d\n", rbd_dev->major);
5008
5009 return sprintf(buf, "(none)\n");
dd82fff1
ID
5010}
5011
5012static ssize_t rbd_minor_show(struct device *dev,
5013 struct device_attribute *attr, char *buf)
5014{
5015 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
fc71d833 5016
dd82fff1 5017 return sprintf(buf, "%d\n", rbd_dev->minor);
dfc5606d
YS
5018}
5019
005a07bf
ID
5020static ssize_t rbd_client_addr_show(struct device *dev,
5021 struct device_attribute *attr, char *buf)
5022{
5023 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5024 struct ceph_entity_addr *client_addr =
5025 ceph_client_addr(rbd_dev->rbd_client->client);
5026
5027 return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
5028 le32_to_cpu(client_addr->nonce));
5029}
5030
dfc5606d
YS
5031static ssize_t rbd_client_id_show(struct device *dev,
5032 struct device_attribute *attr, char *buf)
602adf40 5033{
593a9e7b 5034 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 5035
1dbb4399 5036 return sprintf(buf, "client%lld\n",
033268a5 5037 ceph_client_gid(rbd_dev->rbd_client->client));
602adf40
YS
5038}
5039
267fb90b
MC
5040static ssize_t rbd_cluster_fsid_show(struct device *dev,
5041 struct device_attribute *attr, char *buf)
5042{
5043 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5044
5045 return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
5046}
5047
0d6d1e9c
MC
5048static ssize_t rbd_config_info_show(struct device *dev,
5049 struct device_attribute *attr, char *buf)
5050{
5051 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5052
f44d04e6
ID
5053 if (!capable(CAP_SYS_ADMIN))
5054 return -EPERM;
5055
0d6d1e9c 5056 return sprintf(buf, "%s\n", rbd_dev->config_info);
602adf40
YS
5057}
5058
dfc5606d
YS
5059static ssize_t rbd_pool_show(struct device *dev,
5060 struct device_attribute *attr, char *buf)
602adf40 5061{
593a9e7b 5062 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 5063
0d7dbfce 5064 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
dfc5606d
YS
5065}
5066
9bb2f334
AE
5067static ssize_t rbd_pool_id_show(struct device *dev,
5068 struct device_attribute *attr, char *buf)
5069{
5070 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5071
0d7dbfce 5072 return sprintf(buf, "%llu\n",
fc71d833 5073 (unsigned long long) rbd_dev->spec->pool_id);
9bb2f334
AE
5074}
5075
b26c047b
ID
5076static ssize_t rbd_pool_ns_show(struct device *dev,
5077 struct device_attribute *attr, char *buf)
5078{
5079 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5080
5081 return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
5082}
5083
dfc5606d
YS
5084static ssize_t rbd_name_show(struct device *dev,
5085 struct device_attribute *attr, char *buf)
5086{
593a9e7b 5087 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 5088
a92ffdf8
AE
5089 if (rbd_dev->spec->image_name)
5090 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
5091
5092 return sprintf(buf, "(unknown)\n");
dfc5606d
YS
5093}
5094
589d30e0
AE
5095static ssize_t rbd_image_id_show(struct device *dev,
5096 struct device_attribute *attr, char *buf)
5097{
5098 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5099
0d7dbfce 5100 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
589d30e0
AE
5101}
5102
34b13184
AE
5103/*
5104 * Shows the name of the currently-mapped snapshot (or
5105 * RBD_SNAP_HEAD_NAME for the base image).
5106 */
dfc5606d
YS
5107static ssize_t rbd_snap_show(struct device *dev,
5108 struct device_attribute *attr,
5109 char *buf)
5110{
593a9e7b 5111 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 5112
0d7dbfce 5113 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
dfc5606d
YS
5114}
5115
92a58671
MC
5116static ssize_t rbd_snap_id_show(struct device *dev,
5117 struct device_attribute *attr, char *buf)
5118{
5119 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5120
5121 return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
5122}
5123
86b00e0d 5124/*
ff96128f
ID
5125 * For a v2 image, shows the chain of parent images, separated by empty
5126 * lines. For v1 images or if there is no parent, shows "(no parent
5127 * image)".
86b00e0d
AE
5128 */
5129static ssize_t rbd_parent_show(struct device *dev,
ff96128f
ID
5130 struct device_attribute *attr,
5131 char *buf)
86b00e0d
AE
5132{
5133 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
ff96128f 5134 ssize_t count = 0;
86b00e0d 5135
ff96128f 5136 if (!rbd_dev->parent)
86b00e0d
AE
5137 return sprintf(buf, "(no parent image)\n");
5138
ff96128f
ID
5139 for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
5140 struct rbd_spec *spec = rbd_dev->parent_spec;
5141
5142 count += sprintf(&buf[count], "%s"
5143 "pool_id %llu\npool_name %s\n"
e92c0eaf 5144 "pool_ns %s\n"
ff96128f
ID
5145 "image_id %s\nimage_name %s\n"
5146 "snap_id %llu\nsnap_name %s\n"
5147 "overlap %llu\n",
5148 !count ? "" : "\n", /* first? */
5149 spec->pool_id, spec->pool_name,
e92c0eaf 5150 spec->pool_ns ?: "",
ff96128f
ID
5151 spec->image_id, spec->image_name ?: "(unknown)",
5152 spec->snap_id, spec->snap_name,
5153 rbd_dev->parent_overlap);
5154 }
5155
5156 return count;
86b00e0d
AE
5157}
5158
dfc5606d
YS
5159static ssize_t rbd_image_refresh(struct device *dev,
5160 struct device_attribute *attr,
5161 const char *buf,
5162 size_t size)
5163{
593a9e7b 5164 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 5165 int ret;
602adf40 5166
f44d04e6
ID
5167 if (!capable(CAP_SYS_ADMIN))
5168 return -EPERM;
5169
cc4a38bd 5170 ret = rbd_dev_refresh(rbd_dev);
e627db08 5171 if (ret)
52bb1f9b 5172 return ret;
b813623a 5173
52bb1f9b 5174 return size;
dfc5606d 5175}

static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_minor.attr,
	&dev_attr_client_addr.attr,
	&dev_attr_client_id.attr,
	&dev_attr_cluster_fsid.attr,
	&dev_attr_config_info.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_pool_ns.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

static void rbd_dev_release(struct device *dev);

static const struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_dev_release,
};

static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
{
	kref_get(&spec->kref);

	return spec;
}

static void rbd_spec_free(struct kref *kref);
static void rbd_spec_put(struct rbd_spec *spec)
{
	if (spec)
		kref_put(&spec->kref, rbd_spec_free);
}

static struct rbd_spec *rbd_spec_alloc(void)
{
	struct rbd_spec *spec;

	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
	if (!spec)
		return NULL;

	spec->pool_id = CEPH_NOPOOL;
	spec->snap_id = CEPH_NOSNAP;
	kref_init(&spec->kref);

	return spec;
}

static void rbd_spec_free(struct kref *kref)
{
	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);

	kfree(spec->pool_name);
	kfree(spec->pool_ns);
	kfree(spec->image_id);
	kfree(spec->image_name);
	kfree(spec->snap_name);
	kfree(spec);
}

static void rbd_dev_free(struct rbd_device *rbd_dev)
{
	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);

	ceph_oid_destroy(&rbd_dev->header_oid);
	ceph_oloc_destroy(&rbd_dev->header_oloc);
	kfree(rbd_dev->config_info);

	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev->opts);
	kfree(rbd_dev);
}

static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	bool need_put = !!rbd_dev->opts;

	if (need_put) {
		destroy_workqueue(rbd_dev->task_wq);
		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
	}

	rbd_dev_free(rbd_dev);

	/*
	 * This is racy, but way better than putting module outside of
	 * the release callback.  The race window is pretty small, so
	 * doing something similar to dm (dm-builtin.c) is overkill.
	 */
	if (need_put)
		module_put(THIS_MODULE);
}

static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
					   struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	init_rwsem(&rbd_dev->header_rwsem);

	rbd_dev->header.data_pool_id = CEPH_NOPOOL;
	ceph_oid_init(&rbd_dev->header_oid);
	rbd_dev->header_oloc.pool = spec->pool_id;
	if (spec->pool_ns) {
		WARN_ON(!*spec->pool_ns);
		rbd_dev->header_oloc.pool_ns =
		    ceph_find_or_create_string(spec->pool_ns,
					       strlen(spec->pool_ns));
	}

	mutex_init(&rbd_dev->watch_mutex);
	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);

	init_rwsem(&rbd_dev->lock_rwsem);
	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
	spin_lock_init(&rbd_dev->lock_lists_lock);
	INIT_LIST_HEAD(&rbd_dev->acquiring_list);
	INIT_LIST_HEAD(&rbd_dev->running_list);
	init_completion(&rbd_dev->acquire_wait);
	init_completion(&rbd_dev->releasing_wait);

	spin_lock_init(&rbd_dev->object_map_lock);

	rbd_dev->dev.bus = &rbd_bus_type;
	rbd_dev->dev.type = &rbd_device_type;
	rbd_dev->dev.parent = &rbd_root_dev;
	device_initialize(&rbd_dev->dev);

	rbd_dev->rbd_client = rbdc;
	rbd_dev->spec = spec;

	return rbd_dev;
}

/*
 * Create a mapping rbd_dev.
 */
static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
					 struct rbd_spec *spec,
					 struct rbd_options *opts)
{
	struct rbd_device *rbd_dev;

	rbd_dev = __rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		return NULL;

	rbd_dev->opts = opts;

	/* get an id and fill in device name */
	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
					 minor_to_rbd_dev_id(1 << MINORBITS),
					 GFP_KERNEL);
	if (rbd_dev->dev_id < 0)
		goto fail_rbd_dev;

	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
						   rbd_dev->name);
	if (!rbd_dev->task_wq)
		goto fail_dev_id;

	/* we have a ref from do_rbd_add() */
	__module_get(THIS_MODULE);

	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
	return rbd_dev;

fail_dev_id:
	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
fail_rbd_dev:
	rbd_dev_free(rbd_dev);
	return NULL;
}

static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	if (rbd_dev)
		put_device(&rbd_dev->dev);
}

/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				 u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_size",
				  &snapid, sizeof(snapid),
				  &size_buf, sizeof(size_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < sizeof (size_buf))
		return -ERANGE;

	if (order) {
		*order = size_buf.order;
		dout(" order %u", (unsigned int)*order);
	}
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx snap_size = %llu\n",
		(unsigned long long)snap_id,
		(unsigned long long)*snap_size);

	return 0;
}

static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
				     &rbd_dev->header.obj_order,
				     &rbd_dev->header.image_size);
}

static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	size_t size;
	void *reply_buf;
	int ret;
	void *p;

	/* Response will be an encoded string, which includes a length */
	size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX;
	reply_buf = kzalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_object_prefix",
				  NULL, 0, reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + ret, NULL, GFP_NOIO);
	ret = 0;

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}
out:
	kfree(reply_buf);

	return ret;
}

static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				     bool read_only, u64 *snap_features)
{
	struct {
		__le64 snap_id;
		u8 read_only;
	} features_in;
	struct {
		__le64 features;
		__le64 incompat;
	} __attribute__ ((packed)) features_buf = { 0 };
	u64 unsup;
	int ret;

	features_in.snap_id = cpu_to_le64(snap_id);
	features_in.read_only = read_only;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_features",
				  &features_in, sizeof(features_in),
				  &features_buf, sizeof(features_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < sizeof (features_buf))
		return -ERANGE;

	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
	if (unsup) {
		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
			 unsup);
		return -ENXIO;
	}

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long)snap_id,
		(unsigned long long)*snap_features,
		(unsigned long long)le64_to_cpu(features_buf.incompat));

	return 0;
}

static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
					 rbd_is_ro(rbd_dev),
					 &rbd_dev->header.features);
}

/*
 * These are generic image flags, but since they are used only for
 * object map, store them in rbd_dev->object_map_flags.
 *
 * For the same reason, this function is called only on object map
 * (re)load and not on header refresh.
 */
static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev)
{
	__le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
	__le64 flags;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_flags",
				  &snapid, sizeof(snapid),
				  &flags, sizeof(flags));
	if (ret < 0)
		return ret;
	if (ret < sizeof(flags))
		return -EBADMSG;

	rbd_dev->object_map_flags = le64_to_cpu(flags);
	return 0;
}

struct parent_image_info {
	u64		pool_id;
	const char	*pool_ns;
	const char	*image_id;
	u64		snap_id;

	bool		has_overlap;
	u64		overlap;
};

/*
 * The caller is responsible for @pii.
 */
static int decode_parent_image_spec(void **p, void *end,
				    struct parent_image_info *pii)
{
	u8 struct_v;
	u32 struct_len;
	int ret;

	ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
				  &struct_v, &struct_len);
	if (ret)
		return ret;

	ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
	pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
	if (IS_ERR(pii->pool_ns)) {
		ret = PTR_ERR(pii->pool_ns);
		pii->pool_ns = NULL;
		return ret;
	}
	pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
	if (IS_ERR(pii->image_id)) {
		ret = PTR_ERR(pii->image_id);
		pii->image_id = NULL;
		return ret;
	}
	ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
	return 0;

e_inval:
	return -EINVAL;
}

static int __get_parent_info(struct rbd_device *rbd_dev,
			     struct page *req_page,
			     struct page *reply_page,
			     struct parent_image_info *pii)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	size_t reply_len = PAGE_SIZE;
	void *p, *end;
	int ret;

	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			     "rbd", "parent_get", CEPH_OSD_FLAG_READ,
			     req_page, sizeof(u64), &reply_page, &reply_len);
	if (ret)
		return ret == -EOPNOTSUPP ? 1 : ret;

	p = page_address(reply_page);
	end = p + reply_len;
	ret = decode_parent_image_spec(&p, end, pii);
	if (ret)
		return ret;

	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			     "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
			     req_page, sizeof(u64), &reply_page, &reply_len);
	if (ret)
		return ret;

	p = page_address(reply_page);
	end = p + reply_len;
	ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
	if (pii->has_overlap)
		ceph_decode_64_safe(&p, end, pii->overlap, e_inval);

	return 0;

e_inval:
	return -EINVAL;
}

/*
 * The caller is responsible for @pii.
 */
static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
				    struct page *req_page,
				    struct page *reply_page,
				    struct parent_image_info *pii)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	size_t reply_len = PAGE_SIZE;
	void *p, *end;
	int ret;

	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			     "rbd", "get_parent", CEPH_OSD_FLAG_READ,
			     req_page, sizeof(u64), &reply_page, &reply_len);
	if (ret)
		return ret;

	p = page_address(reply_page);
	end = p + reply_len;
	ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
	pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(pii->image_id)) {
		ret = PTR_ERR(pii->image_id);
		pii->image_id = NULL;
		return ret;
	}
	ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
	pii->has_overlap = true;
	ceph_decode_64_safe(&p, end, pii->overlap, e_inval);

	return 0;

e_inval:
	return -EINVAL;
}

static int get_parent_info(struct rbd_device *rbd_dev,
			   struct parent_image_info *pii)
{
	struct page *req_page, *reply_page;
	void *p;
	int ret;

	req_page = alloc_page(GFP_KERNEL);
	if (!req_page)
		return -ENOMEM;

	reply_page = alloc_page(GFP_KERNEL);
	if (!reply_page) {
		__free_page(req_page);
		return -ENOMEM;
	}

	p = page_address(req_page);
	ceph_encode_64(&p, rbd_dev->spec->snap_id);
	ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
	if (ret > 0)
		ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
					       pii);

	__free_page(req_page);
	__free_page(reply_page);
	return ret;
}
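
/*
 * Note the probing order above: __get_parent_info() returns 1 (not an
 * error) when the OSD lacks the "parent_get"/"parent_overlap_get"
 * class methods, i.e. on -EOPNOTSUPP, and get_parent_info() then
 * falls back to the legacy "get_parent" method.
 */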

static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	struct parent_image_info pii = { 0 };
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	ret = get_parent_info(rbd_dev, &pii);
	if (ret)
		goto out_err;

	dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
	     __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
	     pii.has_overlap, pii.overlap);

	if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
		/*
		 * Either the parent never existed, or we have a
		 * record of it but the image got flattened so it no
		 * longer has a parent.  When the parent of a
		 * layered image disappears we immediately set the
		 * overlap to 0.  The effect of this is that all new
		 * requests will be treated as if the image had no
		 * parent.
		 *
		 * If !pii.has_overlap, the parent image spec is not
		 * applicable.  It's there to avoid duplication in each
		 * snapshot record.
		 */
		if (rbd_dev->parent_overlap) {
			rbd_dev->parent_overlap = 0;
			rbd_dev_parent_put(rbd_dev);
			pr_info("%s: clone image has been flattened\n",
				rbd_dev->disk->disk_name);
		}

		goto out;	/* No parent?  No problem. */
	}

	/* The ceph file layout needs to fit pool id in 32 bits */

	ret = -EIO;
	if (pii.pool_id > (u64)U32_MAX) {
		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
			 (unsigned long long)pii.pool_id, U32_MAX);
		goto out_err;
	}

	/*
	 * The parent won't change (except when the clone is
	 * flattened, which is handled above).  So we only need to
	 * record the parent spec if we have not already done so.
	 */
	if (!rbd_dev->parent_spec) {
		parent_spec->pool_id = pii.pool_id;
		if (pii.pool_ns && *pii.pool_ns) {
			parent_spec->pool_ns = pii.pool_ns;
			pii.pool_ns = NULL;
		}
		parent_spec->image_id = pii.image_id;
		pii.image_id = NULL;
		parent_spec->snap_id = pii.snap_id;

		rbd_dev->parent_spec = parent_spec;
		parent_spec = NULL;	/* rbd_dev now owns this */
	}

	/*
	 * We always update the parent overlap.  If it's zero we issue
	 * a warning, as we will proceed as if there was no parent.
	 */
	if (!pii.overlap) {
		if (parent_spec) {
			/* refresh, careful to warn just once */
			if (rbd_dev->parent_overlap)
				rbd_warn(rbd_dev,
				    "clone now standalone (overlap became 0)");
		} else {
			/* initial probe */
			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
		}
	}
	rbd_dev->parent_overlap = pii.overlap;

out:
	ret = 0;
out_err:
	kfree(pii.pool_ns);
	kfree(pii.image_id);
	rbd_spec_put(parent_spec);
	return ret;
}

static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
{
	struct {
		__le64 stripe_unit;
		__le64 stripe_count;
	} __attribute__ ((packed)) striping_info_buf = { 0 };
	size_t size = sizeof (striping_info_buf);
	void *p;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				&rbd_dev->header_oloc, "get_stripe_unit_count",
				NULL, 0, &striping_info_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < size)
		return -ERANGE;

	p = &striping_info_buf;
	rbd_dev->header.stripe_unit = ceph_decode_64(&p);
	rbd_dev->header.stripe_count = ceph_decode_64(&p);
	return 0;
}

static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
{
	__le64 data_pool_id;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_data_pool",
				  NULL, 0, &data_pool_id, sizeof(data_pool_id));
	if (ret < 0)
		return ret;
	if (ret < sizeof(data_pool_id))
		return -EBADMSG;

	rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
	WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
	return 0;
}

static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	CEPH_DEFINE_OID_ONSTACK(oid);
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	len = strlen(rbd_dev->spec->image_id);
	image_id_size = sizeof (__le32) + len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
				  "dir_get_name", image_id, image_id_size,
				  reply_buf, size);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = reply_buf + ret;

	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}

static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const char *snap_name;
	u32 which = 0;

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which < snapc->num_snaps) {
		if (!strcmp(name, snap_name))
			return snapc->snaps[which];
		snap_name += strlen(snap_name) + 1;
		which++;
	}
	return CEPH_NOSNAP;
}
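
/*
 * Format 1 stores all snapshot names as consecutive NUL-terminated
 * strings in rbd_dev->header.snap_names, indexed in step with
 * snapc->snaps[] - hence the strlen() walk above.  For example
 * (illustrative), two snapshots named "a1" and "b2" are laid out
 * as "a1\0b2\0".
 */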

static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u32 which;
	bool found = false;
	u64 snap_id;

	for (which = 0; !found && which < snapc->num_snaps; which++) {
		const char *snap_name;

		snap_id = snapc->snaps[which];
		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
		if (IS_ERR(snap_name)) {
			/* ignore no-longer existing snapshots */
			if (PTR_ERR(snap_name) == -ENOENT)
				continue;
			else
				break;
		}
		found = !strcmp(name, snap_name);
		kfree(snap_name);
	}
	return found ? snap_id : CEPH_NOSNAP;
}

/*
 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
 * no snapshot by that name is found, or if an error occurs.
 */
static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
{
	if (rbd_dev->image_format == 1)
		return rbd_v1_snap_id_by_name(rbd_dev, name);

	return rbd_v2_snap_id_by_name(rbd_dev, name);
}

/*
 * An image being mapped will have everything but the snap id.
 */
static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
{
	struct rbd_spec *spec = rbd_dev->spec;

	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
	rbd_assert(spec->image_id && spec->image_name);
	rbd_assert(spec->snap_name);

	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
		u64 snap_id;

		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
		if (snap_id == CEPH_NOSNAP)
			return -ENOENT;

		spec->snap_id = snap_id;
	} else {
		spec->snap_id = CEPH_NOSNAP;
	}

	return 0;
}

/*
 * A parent image will have all ids but none of the names.
 *
 * All names in an rbd spec are dynamically allocated.  It's OK if we
 * can't figure out the name for an image id.
 */
static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_spec *spec = rbd_dev->spec;
	const char *pool_name;
	const char *image_name;
	const char *snap_name;
	int ret;

	rbd_assert(spec->pool_id != CEPH_NOPOOL);
	rbd_assert(spec->image_id);
	rbd_assert(spec->snap_id != CEPH_NOSNAP);

	/* Get the pool name; we have to make our own copy of this */

	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
	if (!pool_name) {
		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
		return -EIO;
	}
	pool_name = kstrdup(pool_name, GFP_KERNEL);
	if (!pool_name)
		return -ENOMEM;

	/* Fetch the image name; tolerate failure here */

	image_name = rbd_dev_image_name(rbd_dev);
	if (!image_name)
		rbd_warn(rbd_dev, "unable to get image name");

	/* Fetch the snapshot name */

	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out_err;
	}

	spec->pool_name = pool_name;
	spec->image_name = image_name;
	spec->snap_name = snap_name;

	return 0;

out_err:
	kfree(image_name);
	kfree(pool_name);
	return ret;
}

static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	void *reply_buf;
	void *p;
	void *end;
	u64 seq;
	u32 snap_count;
	struct ceph_snap_context *snapc;
	u32 i;

	/*
	 * We'll need room for the seq value (maximum snapshot id),
	 * snapshot count, and array of that many snapshot ids.
	 * For now we have a fixed upper limit on the number we're
	 * prepared to receive.
	 */
	size = sizeof (__le64) + sizeof (__le32) +
			RBD_MAX_SNAP_COUNT * sizeof (__le64);
	reply_buf = kzalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_snapcontext",
				  NULL, 0, reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	end = reply_buf + ret;
	ret = -ERANGE;
	ceph_decode_64_safe(&p, end, seq, out);
	ceph_decode_32_safe(&p, end, snap_count, out);

	/*
	 * Make sure the reported number of snapshot ids wouldn't go
	 * beyond the end of our buffer.  But before checking that,
	 * make sure the computed size of the snapshot context we
	 * allocate is representable in a size_t.
	 */
	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
				 / sizeof (u64)) {
		ret = -EINVAL;
		goto out;
	}
	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
		goto out;
	ret = 0;

	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc) {
		ret = -ENOMEM;
		goto out;
	}
	snapc->seq = seq;
	for (i = 0; i < snap_count; i++)
		snapc->snaps[i] = ceph_decode_64(&p);

	ceph_put_snap_context(rbd_dev->header.snapc);
	rbd_dev->header.snapc = snapc;

	dout(" snap context seq = %llu, snap_count = %u\n",
		(unsigned long long)seq, (unsigned int)snap_count);
out:
	kfree(reply_buf);

	return ret;
}
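
/*
 * Wire format decoded above, all little-endian:
 *
 *	__le64 seq                 - snapshot context sequence
 *	__le32 snap_count
 *	__le64 snaps[snap_count]   - snapshot ids
 *
 * The reply buffer is sized for at most RBD_MAX_SNAP_COUNT ids.
 */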

static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	size_t size;
	void *reply_buf;
	__le64 snapid;
	int ret;
	void *p;
	void *end;
	char *snap_name;

	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snapid = cpu_to_le64(snap_id);
	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_snapshot_name",
				  &snapid, sizeof(snapid), reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0) {
		snap_name = ERR_PTR(ret);
		goto out;
	}

	p = reply_buf;
	end = reply_buf + ret;
	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(snap_name))
		goto out;

	dout(" snap_id 0x%016llx snap_name = %s\n",
		(unsigned long long)snap_id, snap_name);
out:
	kfree(reply_buf);

	return snap_name;
}

static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
{
	bool first_time = rbd_dev->header.object_prefix == NULL;
	int ret;

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret)
		return ret;

	if (first_time) {
		ret = rbd_dev_v2_header_onetime(rbd_dev);
		if (ret)
			return ret;
	}

	ret = rbd_dev_v2_snap_context(rbd_dev);
	if (ret && first_time) {
		kfree(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	}

	return ret;
}

static int rbd_dev_header_info(struct rbd_device *rbd_dev)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));

	if (rbd_dev->image_format == 1)
		return rbd_dev_v1_header_info(rbd_dev);

	return rbd_dev_v2_header_info(rbd_dev);
}

/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	const char *spaces = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* Find start of token */

	return strcspn(*buf, spaces);	/* Return token length */
}
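
/*
 * For example (illustrative): with *buf pointing at "  rbd foo", a
 * next_token() call advances *buf to "rbd foo" and returns 3, the
 * length of "rbd".
 */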

/*
 * Finds the next token in *buf, dynamically allocates a buffer big
 * enough to hold a copy of it, and copies the token into the new
 * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
 * that a duplicate buffer is created even for a zero-length token.
 *
 * Returns a pointer to the newly-allocated duplicate, or a null
 * pointer if memory for the duplicate was not available.  If
 * the lenp argument is a non-null pointer, the length of the token
 * (not including the '\0') is returned in *lenp.
 *
 * If successful, the *buf pointer will be updated to point beyond
 * the end of the found token.
 *
 * Note: uses GFP_KERNEL for allocation.
 */
static inline char *dup_token(const char **buf, size_t *lenp)
{
	char *dup;
	size_t len;

	len = next_token(buf);
	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
	if (!dup)
		return NULL;
	*(dup + len) = '\0';
	*buf += len;

	if (lenp)
		*lenp = len;

	return dup;
}

static int rbd_parse_param(struct fs_parameter *param,
			   struct rbd_parse_opts_ctx *pctx)
{
	struct rbd_options *opt = pctx->opts;
	struct fs_parse_result result;
	struct p_log log = {.prefix = "rbd"};
	int token, ret;

	ret = ceph_parse_param(param, pctx->copts, NULL);
	if (ret != -ENOPARAM)
		return ret;

	token = __fs_parse(&log, rbd_parameters, param, &result);
	dout("%s fs_parse '%s' token %d\n", __func__, param->key, token);
	if (token < 0) {
		if (token == -ENOPARAM)
			return inval_plog(&log, "Unknown parameter '%s'",
					  param->key);
		return token;
	}

	switch (token) {
	case Opt_queue_depth:
		if (result.uint_32 < 1)
			goto out_of_range;
		opt->queue_depth = result.uint_32;
		break;
	case Opt_alloc_size:
		if (result.uint_32 < SECTOR_SIZE)
			goto out_of_range;
		if (!is_power_of_2(result.uint_32))
			return inval_plog(&log, "alloc_size must be a power of 2");
		opt->alloc_size = result.uint_32;
		break;
	case Opt_lock_timeout:
		/* 0 is "wait forever" (i.e. infinite timeout) */
		if (result.uint_32 > INT_MAX / 1000)
			goto out_of_range;
		opt->lock_timeout = msecs_to_jiffies(result.uint_32 * 1000);
		break;
	case Opt_pool_ns:
		kfree(pctx->spec->pool_ns);
		pctx->spec->pool_ns = param->string;
		param->string = NULL;
		break;
	case Opt_compression_hint:
		switch (result.uint_32) {
		case Opt_compression_hint_none:
			opt->alloc_hint_flags &=
			    ~(CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE |
			      CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE);
			break;
		case Opt_compression_hint_compressible:
			opt->alloc_hint_flags |=
			    CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE;
			opt->alloc_hint_flags &=
			    ~CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE;
			break;
		case Opt_compression_hint_incompressible:
			opt->alloc_hint_flags |=
			    CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE;
			opt->alloc_hint_flags &=
			    ~CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE;
			break;
		default:
			BUG();
		}
		break;
	case Opt_read_only:
		opt->read_only = true;
		break;
	case Opt_read_write:
		opt->read_only = false;
		break;
	case Opt_lock_on_read:
		opt->lock_on_read = true;
		break;
	case Opt_exclusive:
		opt->exclusive = true;
		break;
	case Opt_notrim:
		opt->trim = false;
		break;
	default:
		BUG();
	}

	return 0;

out_of_range:
	return inval_plog(&log, "%s out of range", param->key);
}

/*
 * This duplicates most of generic_parse_monolithic(), untying it from
 * fs_context and skipping standard superblock and security options.
 */
static int rbd_parse_options(char *options, struct rbd_parse_opts_ctx *pctx)
{
	char *key;
	int ret = 0;

	dout("%s '%s'\n", __func__, options);
	while ((key = strsep(&options, ",")) != NULL) {
		if (*key) {
			struct fs_parameter param = {
				.key	= key,
				.type	= fs_value_is_flag,
			};
			char *value = strchr(key, '=');
			size_t v_len = 0;

			if (value) {
				if (value == key)
					continue;
				*value++ = 0;
				v_len = strlen(value);
				param.string = kmemdup_nul(value, v_len,
							   GFP_KERNEL);
				if (!param.string)
					return -ENOMEM;
				param.type = fs_value_is_string;
			}
			param.size = v_len;

			ret = rbd_parse_param(&param, pctx);
			kfree(param.string);
			if (ret)
				break;
		}
	}

	return ret;
}
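
/*
 * For example (illustrative), the option string
 * "alloc_size=65536,lock_on_read" results in two rbd_parse_param()
 * calls: a string parameter ("alloc_size" = "65536") and a flag
 * parameter ("lock_on_read").
 */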

/*
 * Parse the options provided for an "rbd add" (i.e., rbd image
 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
 * and the data written is passed here via a NUL-terminated buffer.
 * Returns 0 if successful or an error code otherwise.
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *	The address of a pointer that will refer to a ceph options
 *	structure.  Caller must release the returned pointer using
 *	ceph_destroy_options() when it is no longer needed.
 *  rbd_opts
 *	Address of an rbd options pointer.  Fully initialized by
 *	this function; caller must release with kfree().
 *  spec
 *	Address of an rbd image specification pointer.  Fully
 *	initialized by this function based on parsed options.
 *	Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
 * where:
 *  <mon_addrs>
 *	A comma-separated list of one or more monitor addresses.
 *	A monitor address is an ip address, optionally followed
 *	by a port number (separated by a colon).
 *	I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *	A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *	The name of the rados pool containing the rbd image.
 *  <image_name>
 *	The name of the image in that pool to map.
 *  <snap_name>
 *	An optional snapshot name.  If provided, the mapping will
 *	present data from the image at the time that snapshot was
 *	created.  The image head is used if no snapshot name is
 *	provided.  Snapshot mappings are always read-only.
 */
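/*
 * For example, a mapping might be created with (addresses, names and
 * key are illustrative):
 *
 *	echo "1.2.3.4:6789 name=admin,secret=AQB... rbd myimage -" \
 *		> /sys/bus/rbd/add
 *
 * which maps the head of image "myimage" in pool "rbd".
 */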
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	char *snap_name;
	size_t mon_addrs_size;
	struct rbd_parse_opts_ctx pctx = { 0 };
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	mon_addrs = buf;
	mon_addrs_size = len;
	buf += len;

	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	pctx.spec = rbd_spec_alloc();
	if (!pctx.spec)
		goto out_mem;

	pctx.spec->pool_name = dup_token(&buf, NULL);
	if (!pctx.spec->pool_name)
		goto out_mem;
	if (!*pctx.spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	pctx.spec->image_name = dup_token(&buf, NULL);
	if (!pctx.spec->image_name)
		goto out_mem;
	if (!*pctx.spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_mem;
	*(snap_name + len) = '\0';
	pctx.spec->snap_name = snap_name;

	pctx.copts = ceph_alloc_options();
	if (!pctx.copts)
		goto out_mem;

	/* Initialize all rbd options to the defaults */

	pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
	if (!pctx.opts)
		goto out_mem;

	pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
	pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
	pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
	pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
	pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
	pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
	pctx.opts->trim = RBD_TRIM_DEFAULT;

	ret = ceph_parse_mon_ips(mon_addrs, mon_addrs_size, pctx.copts, NULL);
	if (ret)
		goto out_err;

	ret = rbd_parse_options(options, &pctx);
	if (ret)
		goto out_err;

	*ceph_opts = pctx.copts;
	*opts = pctx.opts;
	*rbd_spec = pctx.spec;
	kfree(options);
	return 0;

out_mem:
	ret = -ENOMEM;
out_err:
	kfree(pctx.opts);
	ceph_destroy_options(pctx.copts);
	rbd_spec_put(pctx.spec);
	kfree(options);
	return ret;
}

static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
{
	down_write(&rbd_dev->lock_rwsem);
	if (__rbd_is_lock_owner(rbd_dev))
		__rbd_release_lock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);
}

/*
 * If the wait is interrupted, an error is returned even if the lock
 * was successfully acquired.  rbd_dev_image_unlock() will release it
 * if needed.
 */
static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
{
	long ret;

	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
		if (!rbd_dev->opts->exclusive && !rbd_dev->opts->lock_on_read)
			return 0;

		rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
		return -EINVAL;
	}

	if (rbd_is_ro(rbd_dev))
		return 0;

	rbd_assert(!rbd_is_lock_owner(rbd_dev));
	queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
	ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait,
			    ceph_timeout_jiffies(rbd_dev->opts->lock_timeout));
	if (ret > 0) {
		ret = rbd_dev->acquire_err;
	} else {
		cancel_delayed_work_sync(&rbd_dev->lock_dwork);
		if (!ret)
			ret = -ETIMEDOUT;
	}

	if (ret) {
		rbd_warn(rbd_dev, "failed to acquire exclusive lock: %ld", ret);
		return ret;
	}

	/*
	 * The lock may have been released by now, unless automatic lock
	 * transitions are disabled.
	 */
	rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev));
	return 0;
}
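
/*
 * Note: a lock_timeout of 0 means "wait forever" here because
 * ceph_timeout_jiffies() maps 0 to MAX_SCHEDULE_TIMEOUT (an
 * assumption about the libceph helper's behavior).
 */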

/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	CEPH_DEFINE_OID_ONSTACK(oid);
	void *response;
	char *image_id;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.  We
	 * do still need to set the image format though.
	 */
	if (rbd_dev->spec->image_id) {
		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;

		return 0;
	}

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
			       rbd_dev->spec->image_name);
	if (ret)
		return ret;

	dout("rbd id object name is %s\n", oid.name);

	/* Response will be an encoded string, which includes a length */
	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* If it doesn't exist we'll assume it's a format 1 image */

	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
				  "get_id", NULL, 0,
				  response, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret == -ENOENT) {
		image_id = kstrdup("", GFP_KERNEL);
		ret = image_id ? 0 : -ENOMEM;
		if (!ret)
			rbd_dev->image_format = 1;
	} else if (ret >= 0) {
		void *p = response;

		image_id = ceph_extract_encoded_string(&p, p + ret,
						NULL, GFP_NOIO);
		ret = PTR_ERR_OR_ZERO(image_id);
		if (!ret)
			rbd_dev->image_format = 2;
	}

	if (!ret) {
		rbd_dev->spec->image_id = image_id;
		dout("image_id is %s\n", image_id);
	}
out:
	kfree(response);
	ceph_oid_destroy(&oid);
	return ret;
}

/*
 * Undo whatever state changes are made by v1 or v2 header info
 * call.
 */
static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
{
	struct rbd_image_header *header;

	rbd_dev_parent_put(rbd_dev);
	rbd_object_map_free(rbd_dev);
	rbd_dev_mapping_clear(rbd_dev);

	/* Free dynamic fields from the header, then zero it out */

	header = &rbd_dev->header;
	ceph_put_snap_context(header->snapc);
	kfree(header->snap_sizes);
	kfree(header->snap_names);
	kfree(header->object_prefix);
	memset(header, 0, sizeof (*header));
}

static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
{
	int ret;

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret)
		goto out_err;

	/*
	 * Get and check the features for the image.  Currently the
	 * features are assumed to never change.
	 */
	ret = rbd_dev_v2_features(rbd_dev);
	if (ret)
		goto out_err;

	/* If the image supports fancy striping, get its parameters */

	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
		ret = rbd_dev_v2_striping_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
		ret = rbd_dev_v2_data_pool(rbd_dev);
		if (ret)
			goto out_err;
	}

	rbd_init_layout(rbd_dev);
	return 0;

out_err:
	rbd_dev->header.features = 0;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;
	return ret;
}

/*
 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
 * rbd_dev_image_probe() recursion depth, which means it's also the
 * length of the already discovered part of the parent chain.
 */
static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
{
	struct rbd_device *parent = NULL;
	int ret;

	if (!rbd_dev->parent_spec)
		return 0;

	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
		pr_info("parent chain is too long (%d)\n", depth);
		ret = -EINVAL;
		goto out_err;
	}

	parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
	if (!parent) {
		ret = -ENOMEM;
		goto out_err;
	}

	/*
	 * Images related by parent/child relationships always share
	 * rbd_client and spec/parent_spec, so bump their refcounts.
	 */
	__rbd_get_client(rbd_dev->rbd_client);
	rbd_spec_get(rbd_dev->parent_spec);

	__set_bit(RBD_DEV_FLAG_READONLY, &parent->flags);

	ret = rbd_dev_image_probe(parent, depth);
	if (ret < 0)
		goto out_err;

	rbd_dev->parent = parent;
	atomic_set(&rbd_dev->parent_ref, 1);
	return 0;

out_err:
	rbd_dev_unparent(rbd_dev);
	rbd_dev_destroy(parent);
	return ret;
}

static void rbd_dev_device_release(struct rbd_device *rbd_dev)
{
	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	rbd_free_disk(rbd_dev);
	if (!single_major)
		unregister_blkdev(rbd_dev->major, rbd_dev->name);
}

/*
 * rbd_dev->header_rwsem must be locked for write and will be unlocked
 * upon return.
 */
static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
{
	int ret;

	/* Record our major and minor device numbers. */

	if (!single_major) {
		ret = register_blkdev(0, rbd_dev->name);
		if (ret < 0)
			goto err_out_unlock;

		rbd_dev->major = ret;
		rbd_dev->minor = 0;
	} else {
		rbd_dev->major = rbd_major;
		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
	}

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
	set_disk_ro(rbd_dev->disk, rbd_is_ro(rbd_dev));

	ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
	if (ret)
		goto err_out_disk;

	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	up_write(&rbd_dev->header_rwsem);
	return 0;

err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	if (!single_major)
		unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_unlock:
	up_write(&rbd_dev->header_rwsem);
	return ret;
}

static int rbd_dev_header_name(struct rbd_device *rbd_dev)
{
	struct rbd_spec *spec = rbd_dev->spec;
	int ret;

	/* Record the header object name for this rbd image. */

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (rbd_dev->image_format == 1)
		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
				       spec->image_name, RBD_SUFFIX);
	else
		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
				       RBD_HEADER_PREFIX, spec->image_id);

	return ret;
}
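
/*
 * To illustrate (object name prefixes assumed from rbd_types.h):
 * mapping image "foo" yields header object "foo.rbd" for a format 1
 * image and "rbd_header.<image_id>" for a format 2 image, where
 * <image_id> was read from the "rbd_id.foo" object by
 * rbd_dev_image_id().
 */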
6856
b9ef2b88
ID
6857static void rbd_print_dne(struct rbd_device *rbd_dev, bool is_snap)
6858{
6859 if (!is_snap) {
6860 pr_info("image %s/%s%s%s does not exist\n",
6861 rbd_dev->spec->pool_name,
6862 rbd_dev->spec->pool_ns ?: "",
6863 rbd_dev->spec->pool_ns ? "/" : "",
6864 rbd_dev->spec->image_name);
6865 } else {
6866 pr_info("snap %s/%s%s%s@%s does not exist\n",
6867 rbd_dev->spec->pool_name,
6868 rbd_dev->spec->pool_ns ?: "",
6869 rbd_dev->spec->pool_ns ? "/" : "",
6870 rbd_dev->spec->image_name,
6871 rbd_dev->spec->snap_name);
6872 }
6873}
6874
200a6a8b
AE
6875static void rbd_dev_image_release(struct rbd_device *rbd_dev)
6876{
b8776051 6877 if (!rbd_is_ro(rbd_dev))
fd22aef8 6878 rbd_unregister_watch(rbd_dev);
952c48b0
ID
6879
6880 rbd_dev_unprobe(rbd_dev);
6fd48b3b
AE
6881 rbd_dev->image_format = 0;
6882 kfree(rbd_dev->spec->image_id);
6883 rbd_dev->spec->image_id = NULL;
200a6a8b
AE
6884}
6885
a30b71b9
AE
6886/*
6887 * Probe for the existence of the header object for the given rbd
1f3ef788
AE
6888 * device. If this image is the one being mapped (i.e., not a
6889 * parent), initiate a watch on its header object before using that
6890 * object to get detailed information about the rbd image.
0e4e1de5
ID
6891 *
6892 * On success, returns with header_rwsem held for write if called
6893 * with @depth == 0.
a30b71b9 6894 */
6d69bb53 6895static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
a30b71b9 6896{
b9ef2b88 6897 bool need_watch = !rbd_is_ro(rbd_dev);
a30b71b9
AE
6898 int ret;
6899
6900 /*
3abef3b3
AE
6901 * Get the id from the image id object. Unless there's an
6902 * error, rbd_dev->spec->image_id will be filled in with
6903 * a dynamically-allocated string, and rbd_dev->image_format
6904 * will be set to either 1 or 2.
a30b71b9
AE
6905 */
6906 ret = rbd_dev_image_id(rbd_dev);
6907 if (ret)
c0fba368 6908 return ret;
c0fba368 6909
332bb12d
AE
6910 ret = rbd_dev_header_name(rbd_dev);
6911 if (ret)
6912 goto err_out_format;
6913
b9ef2b88 6914 if (need_watch) {
99d16943 6915 ret = rbd_register_watch(rbd_dev);
1fe48023
ID
6916 if (ret) {
6917 if (ret == -ENOENT)
b9ef2b88 6918 rbd_print_dne(rbd_dev, false);
c41d13a3 6919 goto err_out_format;
1fe48023 6920 }
1f3ef788 6921 }
b644de2b 6922
0e4e1de5
ID
6923 if (!depth)
6924 down_write(&rbd_dev->header_rwsem);
6925
a720ae09 6926 ret = rbd_dev_header_info(rbd_dev);
b9ef2b88
ID
6927 if (ret) {
6928 if (ret == -ENOENT && !need_watch)
6929 rbd_print_dne(rbd_dev, false);
952c48b0 6930 goto err_out_probe;
b9ef2b88 6931 }
83a06263 6932
04077599
ID
6933 /*
6934 * If this image is the one being mapped, we have pool name and
6935 * id, image name and id, and snap name - need to fill snap id.
6936 * Otherwise this is a parent image, identified by pool, image
6937 * and snap ids - need to fill in names for those ids.
6938 */
6d69bb53 6939 if (!depth)
04077599
ID
6940 ret = rbd_spec_fill_snap_id(rbd_dev);
6941 else
6942 ret = rbd_spec_fill_names(rbd_dev);
1fe48023
ID
6943 if (ret) {
6944 if (ret == -ENOENT)
b9ef2b88 6945 rbd_print_dne(rbd_dev, true);
33dca39f 6946 goto err_out_probe;
1fe48023 6947 }
9bb81c9b 6948
da5ef6be
ID
6949 ret = rbd_dev_mapping_set(rbd_dev);
6950 if (ret)
6951 goto err_out_probe;
6952
f3c0e459 6953 if (rbd_is_snap(rbd_dev) &&
22e8bd51
ID
6954 (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) {
6955 ret = rbd_object_map_load(rbd_dev);
6956 if (ret)
6957 goto err_out_probe;
6958 }
6959
e8f59b59
ID
6960 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
6961 ret = rbd_dev_v2_parent_info(rbd_dev);
6962 if (ret)
6963 goto err_out_probe;
e8f59b59
ID
6964 }
6965
6d69bb53 6966 ret = rbd_dev_probe_parent(rbd_dev, depth);
30d60ba2
AE
6967 if (ret)
6968 goto err_out_probe;
6969
6970 dout("discovered format %u image, header name is %s\n",
c41d13a3 6971 rbd_dev->image_format, rbd_dev->header_oid.name);
30d60ba2 6972 return 0;
e8f59b59 6973
6fd48b3b 6974err_out_probe:
0e4e1de5
ID
6975 if (!depth)
6976 up_write(&rbd_dev->header_rwsem);
b9ef2b88 6977 if (need_watch)
99d16943 6978 rbd_unregister_watch(rbd_dev);
952c48b0 6979 rbd_dev_unprobe(rbd_dev);
332bb12d
AE
6980err_out_format:
6981 rbd_dev->image_format = 0;
5655c4d9
AE
6982 kfree(rbd_dev->spec->image_id);
6983 rbd_dev->spec->image_id = NULL;
a30b71b9
AE
6984 return ret;
6985}
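
/*
 * A minimal sketch of the caller contract implied above (illustrative,
 * not a verbatim caller): a successful @depth == 0 probe returns with
 * header_rwsem held for write, so the caller owns the matching
 * up_write(); on failure the error path above has already dropped it.
 *
 *	ret = rbd_dev_image_probe(rbd_dev, 0);
 *	if (ret)
 *		return ret;		-- rwsem not held on error
 *	...				-- finish device setup
 *	up_write(&rbd_dev->header_rwsem);
 */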
6986
9b60e70b
ID
6987static ssize_t do_rbd_add(struct bus_type *bus,
6988 const char *buf,
6989 size_t count)
602adf40 6990{
cb8627c7 6991 struct rbd_device *rbd_dev = NULL;
dc79b113 6992 struct ceph_options *ceph_opts = NULL;
4e9afeba 6993 struct rbd_options *rbd_opts = NULL;
859c31df 6994 struct rbd_spec *spec = NULL;
9d3997fd 6995 struct rbd_client *rbdc;
b51c83c2 6996 int rc;
602adf40 6997
f44d04e6
ID
6998 if (!capable(CAP_SYS_ADMIN))
6999 return -EPERM;
7000
602adf40
YS
7001 if (!try_module_get(THIS_MODULE))
7002 return -ENODEV;
7003
602adf40 7004 /* parse add command */
859c31df 7005 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
dc79b113 7006 if (rc < 0)
dd5ac32d 7007 goto out;
78cea76e 7008
9d3997fd
AE
7009 rbdc = rbd_get_client(ceph_opts);
7010 if (IS_ERR(rbdc)) {
7011 rc = PTR_ERR(rbdc);
0ddebc0c 7012 goto err_out_args;
9d3997fd 7013 }
602adf40 7014
602adf40 7015 /* pick the pool */
dd435855 7016 rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
1fe48023
ID
7017 if (rc < 0) {
7018 if (rc == -ENOENT)
7019 pr_info("pool %s does not exist\n", spec->pool_name);
602adf40 7020 goto err_out_client;
1fe48023 7021 }
c0cd10db 7022 spec->pool_id = (u64)rc;
859c31df 7023
d147543d 7024 rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
b51c83c2
ID
7025 if (!rbd_dev) {
7026 rc = -ENOMEM;
bd4ba655 7027 goto err_out_client;
b51c83c2 7028 }
c53d5893
AE
7029 rbdc = NULL; /* rbd_dev now owns this */
7030 spec = NULL; /* rbd_dev now owns this */
d147543d 7031 rbd_opts = NULL; /* rbd_dev now owns this */
602adf40 7032
39258aa2
ID
7033 /* if we are mapping a snapshot it will be a read-only mapping */
7034 if (rbd_dev->opts->read_only ||
7035 strcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME))
7036 __set_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);
7037
0d6d1e9c
MC
7038 rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
7039 if (!rbd_dev->config_info) {
7040 rc = -ENOMEM;
7041 goto err_out_rbd_dev;
7042 }
7043
6d69bb53 7044 rc = rbd_dev_image_probe(rbd_dev, 0);
0e4e1de5 7045 if (rc < 0)
c53d5893 7046 goto err_out_rbd_dev;
05fd6f6f 7047
0c93e1b7
ID
7048 if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
7049 rbd_warn(rbd_dev, "alloc_size adjusted to %u",
7050 rbd_dev->layout.object_size);
7051 rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
7052 }
7053
b536f69a 7054 rc = rbd_dev_device_setup(rbd_dev);
fd22aef8 7055 if (rc)
8b679ec5 7056 goto err_out_image_probe;
3abef3b3 7057
637cd060
ID
7058 rc = rbd_add_acquire_lock(rbd_dev);
7059 if (rc)
7060 goto err_out_image_lock;
3abef3b3 7061
5769ed0c
ID
7062 /* Everything's ready. Announce the disk to the world. */
7063
7064 rc = device_add(&rbd_dev->dev);
7065 if (rc)
e010dd0a 7066 goto err_out_image_lock;
5769ed0c 7067
3325322f 7068 device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL);
5769ed0c
ID
7069
7070 spin_lock(&rbd_dev_list_lock);
7071 list_add_tail(&rbd_dev->node, &rbd_dev_list);
7072 spin_unlock(&rbd_dev_list_lock);
7073
7074 pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
7075 (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
7076 rbd_dev->header.features);
dd5ac32d
ID
7077 rc = count;
7078out:
7079 module_put(THIS_MODULE);
7080 return rc;
b536f69a 7081
e010dd0a
ID
7082err_out_image_lock:
7083 rbd_dev_image_unlock(rbd_dev);
5769ed0c 7084 rbd_dev_device_release(rbd_dev);
8b679ec5
ID
7085err_out_image_probe:
7086 rbd_dev_image_release(rbd_dev);
c53d5893
AE
7087err_out_rbd_dev:
7088 rbd_dev_destroy(rbd_dev);
bd4ba655 7089err_out_client:
9d3997fd 7090 rbd_put_client(rbdc);
0ddebc0c 7091err_out_args:
859c31df 7092 rbd_spec_put(spec);
d147543d 7093 kfree(rbd_opts);
dd5ac32d 7094 goto out;
602adf40
YS
7095}
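
/*
 * Illustrative use of the add interface (monitor address, credentials
 * and names are made up); the expected write format is
 * "<mon_addrs> <options> <pool> <image> [<snap>]":
 *
 *	$ echo "192.168.0.1:6789 name=admin,secret=<key> rbd myimage" \
 *		> /sys/bus/rbd/add
 *
 * On success the full write count is returned and the new disk's
 * capacity and feature bits are logged as above.
 */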
7096
7e9586ba 7097static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
9b60e70b
ID
7098{
7099 if (single_major)
7100 return -EINVAL;
7101
7102 return do_rbd_add(bus, buf, count);
7103}
7104
7e9586ba
GKH
7105static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
7106 size_t count)
9b60e70b
ID
7107{
7108 return do_rbd_add(bus, buf, count);
7109}
7110
05a46afd
AE
7111static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
7112{
ad945fc1 7113 while (rbd_dev->parent) {
05a46afd
AE
7114 struct rbd_device *first = rbd_dev;
7115 struct rbd_device *second = first->parent;
7116 struct rbd_device *third;
7117
7118 /*
7119 * Follow to the parent with no grandparent and
7120 * remove it.
7121 */
7122 while (second && (third = second->parent)) {
7123 first = second;
7124 second = third;
7125 }
ad945fc1 7126 rbd_assert(second);
8ad42cd0 7127 rbd_dev_image_release(second);
8b679ec5 7128 rbd_dev_destroy(second);
ad945fc1
AE
7129 first->parent = NULL;
7130 first->parent_overlap = 0;
7131
7132 rbd_assert(first->parent_spec);
05a46afd
AE
7133 rbd_spec_put(first->parent_spec);
7134 first->parent_spec = NULL;
05a46afd
AE
7135 }
7136}
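
/*
 * Example of the teardown order above: for a chain mapped as
 * top -> mid -> base (each arrow a parent link), base is released
 * first, then mid -- every pass walks to the deepest parent that has
 * no parent of its own and removes just that one.
 */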
7137
9b60e70b
ID
7138static ssize_t do_rbd_remove(struct bus_type *bus,
7139 const char *buf,
7140 size_t count)
602adf40
YS
7141{
7142 struct rbd_device *rbd_dev = NULL;
751cc0e3
AE
7143 struct list_head *tmp;
7144 int dev_id;
0276dca6 7145 char opt_buf[6];
0276dca6 7146 bool force = false;
0d8189e1 7147 int ret;
602adf40 7148
f44d04e6
ID
7149 if (!capable(CAP_SYS_ADMIN))
7150 return -EPERM;
7151
0276dca6
MC
7152 dev_id = -1;
7153 opt_buf[0] = '\0';
7154 sscanf(buf, "%d %5s", &dev_id, opt_buf);
7155 if (dev_id < 0) {
7156 pr_err("dev_id out of range\n");
602adf40 7157 return -EINVAL;
0276dca6
MC
7158 }
7159 if (opt_buf[0] != '\0') {
7160 if (!strcmp(opt_buf, "force")) {
7161 force = true;
7162 } else {
7163 pr_err("bad remove option at '%s'\n", opt_buf);
7164 return -EINVAL;
7165 }
7166 }
602adf40 7167
751cc0e3
AE
7168 ret = -ENOENT;
7169 spin_lock(&rbd_dev_list_lock);
7170 list_for_each(tmp, &rbd_dev_list) {
7171 rbd_dev = list_entry(tmp, struct rbd_device, node);
7172 if (rbd_dev->dev_id == dev_id) {
7173 ret = 0;
7174 break;
7175 }
42382b70 7176 }
751cc0e3
AE
7177 if (!ret) {
7178 spin_lock_irq(&rbd_dev->lock);
0276dca6 7179 if (rbd_dev->open_count && !force)
751cc0e3 7180 ret = -EBUSY;
85f5a4d6
ID
7181 else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
7182 &rbd_dev->flags))
7183 ret = -EINPROGRESS;
751cc0e3
AE
7184 spin_unlock_irq(&rbd_dev->lock);
7185 }
7186 spin_unlock(&rbd_dev_list_lock);
85f5a4d6 7187 if (ret)
1ba0f1e7 7188 return ret;
751cc0e3 7189
0276dca6
MC
7190 if (force) {
7191 /*
7192 * Prevent new IO from being queued and wait for existing
7193 * IO to complete/fail.
7194 */
7195 blk_mq_freeze_queue(rbd_dev->disk->queue);
7196 blk_set_queue_dying(rbd_dev->disk->queue);
7197 }
7198
5769ed0c
ID
7199 del_gendisk(rbd_dev->disk);
7200 spin_lock(&rbd_dev_list_lock);
7201 list_del_init(&rbd_dev->node);
7202 spin_unlock(&rbd_dev_list_lock);
7203 device_del(&rbd_dev->dev);
fca27065 7204
e010dd0a 7205 rbd_dev_image_unlock(rbd_dev);
dd5ac32d 7206 rbd_dev_device_release(rbd_dev);
8ad42cd0 7207 rbd_dev_image_release(rbd_dev);
8b679ec5 7208 rbd_dev_destroy(rbd_dev);
1ba0f1e7 7209 return count;
602adf40
YS
7210}
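
/*
 * Illustrative use of the remove interface: unmap device id 0, forcing
 * the unmap (queue frozen, outstanding I/O failed) even while the
 * device is still open:
 *
 *	$ echo "0 force" > /sys/bus/rbd/remove
 *
 * Without "force" the write fails with -EBUSY while open_count is
 * non-zero.
 */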
7211
7e9586ba 7212static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
9b60e70b
ID
7213{
7214 if (single_major)
7215 return -EINVAL;
7216
7217 return do_rbd_remove(bus, buf, count);
7218}
7219
7e9586ba
GKH
7220static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
7221 size_t count)
9b60e70b
ID
7222{
7223 return do_rbd_remove(bus, buf, count);
7224}
7225
602adf40
YS
7226/*
7227 * create control files in sysfs
dfc5606d 7228 * /sys/bus/rbd/...
602adf40 7229 */
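/*
 * The control files created here include (illustrative list -- the
 * exact attribute set varies by kernel version and configuration):
 *
 *	/sys/bus/rbd/add
 *	/sys/bus/rbd/remove
 *	/sys/bus/rbd/add_single_major
 *	/sys/bus/rbd/remove_single_major
 */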
7d8dc534 7230static int __init rbd_sysfs_init(void)
602adf40 7231{
dfc5606d 7232 int ret;
602adf40 7233
fed4c143 7234 ret = device_register(&rbd_root_dev);
21079786 7235 if (ret < 0)
dfc5606d 7236 return ret;
602adf40 7237
fed4c143
AE
7238 ret = bus_register(&rbd_bus_type);
7239 if (ret < 0)
7240 device_unregister(&rbd_root_dev);
602adf40 7241
602adf40
YS
7242 return ret;
7243}
7244
7d8dc534 7245static void __exit rbd_sysfs_cleanup(void)
602adf40 7246{
dfc5606d 7247 bus_unregister(&rbd_bus_type);
fed4c143 7248 device_unregister(&rbd_root_dev);
602adf40
YS
7249}
7250
7d8dc534 7251static int __init rbd_slab_init(void)
1c2a9dfe
AE
7252{
7253 rbd_assert(!rbd_img_request_cache);
03d94406 7254 rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
868311b1
AE
7255 if (!rbd_img_request_cache)
7256 return -ENOMEM;
7257
7258 rbd_assert(!rbd_obj_request_cache);
03d94406 7259 rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
78c2a44a
AE
7260 if (!rbd_obj_request_cache)
7261 goto out_err;
7262
6c696d85 7263 return 0;
1c2a9dfe 7264
6c696d85 7265out_err:
868311b1
AE
7266 kmem_cache_destroy(rbd_img_request_cache);
7267 rbd_img_request_cache = NULL;
1c2a9dfe
AE
7268 return -ENOMEM;
7269}
7270
7271static void rbd_slab_exit(void)
7272{
868311b1
AE
7273 rbd_assert(rbd_obj_request_cache);
7274 kmem_cache_destroy(rbd_obj_request_cache);
7275 rbd_obj_request_cache = NULL;
7276
1c2a9dfe
AE
7277 rbd_assert(rbd_img_request_cache);
7278 kmem_cache_destroy(rbd_img_request_cache);
7279 rbd_img_request_cache = NULL;
7280}
7281
cc344fa1 7282static int __init rbd_init(void)
602adf40
YS
7283{
7284 int rc;
7285
1e32d34c
AE
7286 if (!libceph_compatible(NULL)) {
7287 rbd_warn(NULL, "libceph incompatibility (quitting)");
1e32d34c
AE
7288 return -EINVAL;
7289 }
e1b4d96d 7290
1c2a9dfe 7291 rc = rbd_slab_init();
602adf40
YS
7292 if (rc)
7293 return rc;
e1b4d96d 7294
f5ee37bd
ID
7295 /*
7296 * The number of active work items is limited by the number of
f77303bd 7297 * rbd devices * queue depth, so leave @max_active at default.
f5ee37bd
ID
7298 */
7299 rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
7300 if (!rbd_wq) {
7301 rc = -ENOMEM;
7302 goto err_out_slab;
7303 }
7304
9b60e70b
ID
7305 if (single_major) {
7306 rbd_major = register_blkdev(0, RBD_DRV_NAME);
7307 if (rbd_major < 0) {
7308 rc = rbd_major;
f5ee37bd 7309 goto err_out_wq;
9b60e70b
ID
7310 }
7311 }
7312
1c2a9dfe
AE
7313 rc = rbd_sysfs_init();
7314 if (rc)
9b60e70b
ID
7315 goto err_out_blkdev;
7316
7317 if (single_major)
7318 pr_info("loaded (major %d)\n", rbd_major);
7319 else
7320 pr_info("loaded\n");
1c2a9dfe 7321
e1b4d96d
ID
7322 return 0;
7323
9b60e70b
ID
7324err_out_blkdev:
7325 if (single_major)
7326 unregister_blkdev(rbd_major, RBD_DRV_NAME);
f5ee37bd
ID
7327err_out_wq:
7328 destroy_workqueue(rbd_wq);
e1b4d96d
ID
7329err_out_slab:
7330 rbd_slab_exit();
1c2a9dfe 7331 return rc;
602adf40
YS
7332}
7333
cc344fa1 7334static void __exit rbd_exit(void)
602adf40 7335{
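	/* undo rbd_init() in reverse order; the dev-id ida goes first */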
ffe312cf 7336 ida_destroy(&rbd_dev_id_ida);
602adf40 7337 rbd_sysfs_cleanup();
9b60e70b
ID
7338 if (single_major)
7339 unregister_blkdev(rbd_major, RBD_DRV_NAME);
f5ee37bd 7340 destroy_workqueue(rbd_wq);
1c2a9dfe 7341 rbd_slab_exit();
602adf40
YS
7342}
7343
7344module_init(rbd_init);
7345module_exit(rbd_exit);
7346
d552c619 7347MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
602adf40
YS
7348MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
7349MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
602adf40
YS
7350/* following authorship retained from original osdblk.c */
7351MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
7352
90da258b 7353MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
602adf40 7354MODULE_LICENSE("GPL");