rbd: call rbd_dev_mapping_set() from rbd_dev_image_probe()
[linux-block.git] drivers/block/rbd.c

/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/striper.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Increment the given counter and return its updated value.
 * If the counter is already 0 it will not be incremented.
 * If the counter is already at its maximum value returns
 * -EINVAL without updating it.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
	unsigned int counter;

	counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
	if (counter <= (unsigned int)INT_MAX)
		return (int)counter;

	atomic_dec(v);

	return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
	int counter;

	counter = atomic_dec_return(v);
	if (counter >= 0)
		return counter;

	atomic_inc(v);

	return -EINVAL;
}
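
/*
 * Illustrative sketch (not part of the driver): the two helpers above
 * pair up to implement a counter that saturates instead of wrapping.
 * The parent_ref handling later in this file uses exactly this
 * pattern:
 *
 *	if (atomic_inc_return_safe(&rbd_dev->parent_ref) < 0)
 *		rbd_warn(rbd_dev, "parent reference overflow");
 *	...
 *	if (atomic_dec_return_safe(&rbd_dev->parent_ref) < 0)
 *		rbd_warn(rbd_dev, "parent reference underflow");
 */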

#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_DEEP_FLATTEN	(1ULL<<5)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)

#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_DEEP_FLATTEN |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These six fields never change for a given rbd image */
	char *object_prefix;
	__u8 obj_order;
	u64 stripe_unit;
	u64 stripe_count;
	s64 data_pool_id;
	u64 features;		/* Might be changeable someday? */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* format 1 only */
	u64 *snap_sizes;	/* format 1 only */
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	const char	*pool_name;
	const char	*pool_ns;	/* NULL if default, never "" */

	const char	*image_id;
	const char	*image_name;

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;
};
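
/*
 * Example (hypothetical values): mapping the image "foo" in pool "rbd"
 * at its HEAD revision would yield a spec roughly like
 *
 *	pool_name = "rbd", image_name = "foo",
 *	snap_id = CEPH_NOSNAP, snap_name = "-" (RBD_SNAP_HEAD_NAME)
 *
 * with pool_id and image_id filled in by lookup during probe, per the
 * discovery rules described above.
 */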

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct pending_result {
	int			result;		/* first nonzero result */
	int			num_pending;
};

struct rbd_img_request;

enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
};

enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
	OBJ_OP_ZEROOUT,
};

#define RBD_OBJ_FLAG_DELETION			(1U << 0)
#define RBD_OBJ_FLAG_COPYUP_ENABLED		(1U << 1)
#define RBD_OBJ_FLAG_COPYUP_ZEROS		(1U << 2)

enum rbd_obj_read_state {
	RBD_OBJ_READ_START = 1,
	RBD_OBJ_READ_OBJECT,
	RBD_OBJ_READ_PARENT,
};

/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *            . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . .
 *            .                 |                                    .
 *            .                 v                                    .
 *            .    RBD_OBJ_WRITE_READ_FROM_PARENT. . .               .
 *            .                 |                    .               .
 *            .                 v                    v (deep-copyup  .
 *    (image  .   RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC   .  not needed)  .
 * flattened) v                 |                    .               .
 *            .                 v                    .               .
 *            . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . .      (copyup  .
 *                              |                        not needed) v
 *                              v                                    .
 *                            done . . . . . . . . . . . . . . . . . .
 *                              ^
 *                              |
 *                     RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * assert_exists guard is needed or not (in some cases it's not needed
 * even if there is a parent).
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_START = 1,
	RBD_OBJ_WRITE_OBJECT,
	__RBD_OBJ_WRITE_COPYUP,
	RBD_OBJ_WRITE_COPYUP,
};

enum rbd_obj_copyup_state {
	RBD_OBJ_COPYUP_START = 1,
	RBD_OBJ_COPYUP_READ_PARENT,
	__RBD_OBJ_COPYUP_WRITE_OBJECT,
	RBD_OBJ_COPYUP_WRITE_OBJECT,
};
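
/*
 * One plausible walk through these states for a layered write (an
 * illustrative reading of the machines above, not authoritative):
 * the object request moves RBD_OBJ_WRITE_START -> RBD_OBJ_WRITE_OBJECT;
 * if the guarded write fails because the child object does not exist
 * yet, it enters the copyup machine, which reads the parent data
 * (RBD_OBJ_COPYUP_READ_PARENT) and then writes it into the child
 * object (RBD_OBJ_COPYUP_WRITE_OBJECT) together with the original
 * write.
 */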

struct rbd_obj_request {
	struct ceph_object_extent ex;
	unsigned int		flags;	/* RBD_OBJ_FLAG_* */
	union {
		enum rbd_obj_read_state	 read_state;	/* for reads */
		enum rbd_obj_write_state write_state;	/* for writes */
	};

	struct rbd_img_request	*img_request;
	struct ceph_file_extent	*img_extents;
	u32			num_img_extents;

	union {
		struct ceph_bio_iter	bio_pos;
		struct {
			struct ceph_bvec_iter	bvec_pos;
			u32			bvec_count;
			u32			bvec_idx;
		};
	};

	enum rbd_obj_copyup_state copyup_state;
	struct bio_vec		*copyup_bvecs;
	u32			copyup_bvec_count;

	struct list_head	osd_reqs;	/* w/ r_private_item */

	struct mutex		state_mutex;
	struct pending_result	pending;
	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

enum rbd_img_state {
	RBD_IMG_START = 1,
	RBD_IMG_EXCLUSIVE_LOCK,
	__RBD_IMG_OBJECT_REQUESTS,
	RBD_IMG_OBJECT_REQUESTS,
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	enum obj_operation_type	op_type;
	enum obj_request_type	data_type;
	unsigned long		flags;
	enum rbd_img_state	state;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};

	struct list_head	lock_item;
	struct list_head	object_extents;	/* obj_req.ex structs */

	struct mutex		state_mutex;
	struct pending_result	pending;
	struct work_struct	work;
	int			work_result;
	struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)

enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};

enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};

/* WatchNotify::ClientId */
struct rbd_client_id {
	u64 gid;
	u64 handle;
};

struct rbd_mapping {
	u64	size;
	u64	features;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
	char			*config_info;	/* add{,_single_major} string */

	struct ceph_object_id	header_oid;
	struct ceph_object_locator header_oloc;

	struct ceph_file_layout	layout;		/* used for all rbd requests */

	struct mutex		watch_mutex;
	enum rbd_watch_state	watch_state;
	struct ceph_osd_linger_request *watch_handle;
	u64			watch_cookie;
	struct delayed_work	watch_dwork;

	struct rw_semaphore	lock_rwsem;
	enum rbd_lock_state	lock_state;
	char			lock_cookie[32];
	struct rbd_client_id	owner_cid;
	struct work_struct	acquired_lock_work;
	struct work_struct	released_lock_work;
	struct delayed_work	lock_dwork;
	struct work_struct	unlock_work;
	spinlock_t		lock_lists_lock;
	struct list_head	acquiring_list;
	struct list_head	running_list;
	struct completion	acquire_wait;
	int			acquire_err;
	struct completion	releasing_wait;

	struct workqueue_struct	*task_wq;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* Block layer tags. */
	struct blk_mq_tag_set	tag_set;

	/* protects updating the header */
	struct rw_semaphore	header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};

static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);		/* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);	/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

static struct ceph_snap_context rbd_empty_snapc = {
	.nref = REFCOUNT_INIT(1),
};

/*
 * single-major requires >= 0.75 version of userspace rbd utility.
 */
static bool single_major = true;
module_param(single_major, bool, 0444);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");

static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
static ssize_t remove_store(struct bus_type *bus, const char *buf,
			    size_t count);
static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
				      size_t count);
static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
					 size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);

static int rbd_dev_id_to_minor(int dev_id)
{
	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}
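
/*
 * Worked example: with RBD_SINGLE_MAJOR_PART_SHIFT == 4, dev_id 1 maps
 * to minor 16, leaving minors 17-31 for partitions of /dev/rbd1; the
 * reverse mapping recovers dev_id 1 from any of those minors.
 */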

static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	lockdep_assert_held(&rbd_dev->lock_rwsem);

	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	bool is_lock_owner;

	down_read(&rbd_dev->lock_rwsem);
	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	return is_lock_owner;
}

static ssize_t supported_features_show(struct bus_type *bus, char *buf)
{
	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}

static BUS_ATTR_WO(add);
static BUS_ATTR_WO(remove);
static BUS_ATTR_WO(add_single_major);
static BUS_ATTR_WO(remove_single_major);
static BUS_ATTR_RO(supported_features);

static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	&bus_attr_supported_features.attr,
	NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
				  struct attribute *attr, int index)
{
	if (!single_major &&
	    (attr == &bus_attr_add_single_major.attr ||
	     attr == &bus_attr_remove_single_major.attr))
		return 0;

	return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
		u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features);

static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);

/*
 * Return true if nothing else is pending.
 */
static bool pending_result_dec(struct pending_result *pending, int *result)
{
	rbd_assert(pending->num_pending > 0);

	if (*result && !pending->result)
		pending->result = *result;
	if (--pending->num_pending)
		return false;

	*result = pending->result;
	return true;
}
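
/*
 * Illustrative usage (simplified): a parent request bumps num_pending
 * once per child it issues, and completes only when the last child
 * finishes, with the first nonzero result "sticking":
 *
 *	pending->num_pending++;			// per child issued
 *	...
 *	if (pending_result_dec(&pending, &result))
 *		complete_parent(result);	// hypothetical helper
 */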

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	(void) get_device(&rbd_dev->dev);

	return 0;
}

static void rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	put_device(&rbd_dev->dev);
}

static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
{
	int ro;

	if (get_user(ro, (int __user *)arg))
		return -EFAULT;

	/* Snapshots can't be marked read-write */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
		return -EROFS;

	/* Let blkdev_roset() handle it */
	return -ENOTTY;
}

static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	int ret;

	switch (cmd) {
	case BLKROSET:
		ret = rbd_ioctl_set_ro(rbd_dev, arg);
		break;
	default:
		ret = -ENOTTY;
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
	.ioctl			= rbd_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl		= rbd_compat_ioctl,
#endif
};

/*
 * Initialize an rbd client instance.  Success or not, this function
 * consumes ceph_opts.  Caller holds client_mutex.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	rbdc->client = ceph_create_client(ceph_opts, rbdc);
	if (IS_ERR(rbdc->client))
		goto out_rbdc;
	ceph_opts = NULL;	/* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_client;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;
out_client:
	ceph_destroy_client(rbdc->client);
out_rbdc:
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * (Per device) rbd map options
 */
enum {
	Opt_queue_depth,
	Opt_alloc_size,
	Opt_lock_timeout,
	Opt_last_int,
	/* int args above */
	Opt_pool_ns,
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_notrim,
	Opt_err
};

static match_table_t rbd_opts_tokens = {
	{Opt_queue_depth, "queue_depth=%d"},
	{Opt_alloc_size, "alloc_size=%d"},
	{Opt_lock_timeout, "lock_timeout=%d"},
	/* int args above */
	{Opt_pool_ns, "_pool_ns=%s"},
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	{Opt_lock_on_read, "lock_on_read"},
	{Opt_exclusive, "exclusive"},
	{Opt_notrim, "notrim"},
	{Opt_err, NULL}
};

struct rbd_options {
	int		queue_depth;
	int		alloc_size;
	unsigned long	lock_timeout;
	bool		read_only;
	bool		lock_on_read;
	bool		exclusive;
	bool		trim;
};

#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_ALLOC_SIZE_DEFAULT	(64 * 1024)
#define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false
#define RBD_EXCLUSIVE_DEFAULT	false
#define RBD_TRIM_DEFAULT	true

struct parse_rbd_opts_ctx {
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
};

static int parse_rbd_opts_token(char *c, void *private)
{
	struct parse_rbd_opts_ctx *pctx = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad option arg (not int) at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token, argstr[0].from);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_queue_depth:
		if (intval < 1) {
			pr_err("queue_depth out of range\n");
			return -EINVAL;
		}
		pctx->opts->queue_depth = intval;
		break;
	case Opt_alloc_size:
		if (intval < SECTOR_SIZE) {
			pr_err("alloc_size out of range\n");
			return -EINVAL;
		}
		if (!is_power_of_2(intval)) {
			pr_err("alloc_size must be a power of 2\n");
			return -EINVAL;
		}
		pctx->opts->alloc_size = intval;
		break;
	case Opt_lock_timeout:
		/* 0 is "wait forever" (i.e. infinite timeout) */
		if (intval < 0 || intval > INT_MAX / 1000) {
			pr_err("lock_timeout out of range\n");
			return -EINVAL;
		}
		pctx->opts->lock_timeout = msecs_to_jiffies(intval * 1000);
		break;
	case Opt_pool_ns:
		kfree(pctx->spec->pool_ns);
		pctx->spec->pool_ns = match_strdup(argstr);
		if (!pctx->spec->pool_ns)
			return -ENOMEM;
		break;
	case Opt_read_only:
		pctx->opts->read_only = true;
		break;
	case Opt_read_write:
		pctx->opts->read_only = false;
		break;
	case Opt_lock_on_read:
		pctx->opts->lock_on_read = true;
		break;
	case Opt_exclusive:
		pctx->opts->exclusive = true;
		break;
	case Opt_notrim:
		pctx->opts->trim = false;
		break;
	default:
		/* libceph prints "bad option" msg */
		return -EINVAL;
	}

	return 0;
}
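
/*
 * Example map options string (illustrative): these tokens arrive one
 * at a time via parse_rbd_opts_token(), e.g. from
 *
 *	"queue_depth=128,alloc_size=65536,lock_on_read,read_only"
 *
 * Integer tokens are range-checked above; unrecognized tokens fall
 * through to -EINVAL (libceph prints the "bad option" message).
 */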

static char* obj_op_name(enum obj_operation_type op_type)
{
	switch (op_type) {
	case OBJ_OP_READ:
		return "read";
	case OBJ_OP_WRITE:
		return "write";
	case OBJ_OP_DISCARD:
		return "discard";
	case OBJ_OP_ZEROOUT:
		return "zeroout";
	default:
		return "???";
	}
}

/*
 * Destroy ceph client.  Called when the last reference to the client
 * is dropped; takes rbd_client_list_lock to unlink the client.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret;

	mutex_lock(&client_mutex);
	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		ceph_destroy_options(ceph_opts);

		/*
		 * Using an existing client.  Make sure ->pg_pools is up to
		 * date before we look up the pool id in do_rbd_add().
		 */
		ret = ceph_wait_for_latest_osdmap(rbdc->client,
					rbdc->client->options->mount_timeout);
		if (ret) {
			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
			rbd_put_client(rbdc);
			rbdc = ERR_PTR(ret);
		}
	} else {
		rbdc = rbd_client_create(ceph_opts);
	}
	mutex_unlock(&client_mutex);

	return rbdc;
}
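
/*
 * Illustrative consequence of the lookup above (not additional driver
 * logic): two "rbd map" invocations against the same cluster with
 * compatible options end up sharing one rbd_client.  rbd_client_find()
 * bumps the kref of the existing client, so ceph_opts is destroyed
 * here rather than consumed by a new ceph_create_client() call; the
 * "noshare" libceph option (CEPH_OPT_NOSHARE) defeats the sharing.
 */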

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}

/*
 * returns the size of an object in the image
 */
static u32 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1U << header->obj_order;
}

static void rbd_init_layout(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.stripe_unit == 0 ||
	    rbd_dev->header.stripe_count == 0) {
		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
		rbd_dev->header.stripe_count = 1;
	}

	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
			rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}
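
/*
 * Worked example (assuming the common default object order of 22):
 * rbd_obj_bytes() yields 1U << 22 = 4 MiB objects, and an image with
 * no explicit striping gets stripe_unit = 4 MiB, stripe_count = 1,
 * i.e. the trivial "one stripe per object" layout.
 */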

/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				 struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	int ret = -ENOMEM;
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		object_prefix = kstrndup(ondisk->object_prefix,
					 sizeof(ondisk->object_prefix),
					 GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */
		snap_sizes = kmalloc_array(snap_count,
					   sizeof(*header->snap_sizes),
					   GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;

		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, so this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		rbd_init_layout(rbd_dev);
	} else {
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}
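
/*
 * Illustrative result (hypothetical values): a format 1 header with
 * snap_count = 2 might produce snapc->snaps = { 12, 5 } (descending,
 * as the OSD keeps them) with snap_names holding the back-to-back
 * NUL-terminated strings "newest\0older\0" in the same order, which is
 * exactly the layout _rbd_dev_v1_snap_name() below walks with
 * strlen() + 1 steps.
 */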

static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	const char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return kstrdup(snap_name, GFP_KERNEL);
}

/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * Note that result is for snapshots in *descending* order.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
	u64 snap_id1 = *(u64 *)s1;
	u64 snap_id2 = *(u64 *)s2;

	if (snap_id1 < snap_id2)
		return 1;
	return snap_id1 == snap_id2 ? 0 : -1;
}

/*
 * Search a snapshot context to see if the given snapshot id is
 * present.
 *
 * Returns the position of the snapshot id in the array if it's found,
 * or BAD_SNAP_INDEX otherwise.
 *
 * Note: The snapshot array is kept sorted (by the osd) in
 * reverse order, highest snapshot id first.
 */
static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u64 *found;

	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
				sizeof (snap_id), snapid_compare_reverse);

	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
}
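
/*
 * Example (illustrative): for a snapc->snaps array of { 12, 8, 3 }
 * (descending, as the OSD keeps it), rbd_dev_snap_index() returns 1
 * for snap_id 8 and BAD_SNAP_INDEX for snap_id 5, since
 * snapid_compare_reverse() inverts the usual bsearch() ordering.
 */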

static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	u32 which;
	const char *snap_name;

	which = rbd_dev_snap_index(rbd_dev, snap_id);
	if (which == BAD_SNAP_INDEX)
		return ERR_PTR(-ENOENT);

	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
}

static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (rbd_dev->image_format == 1)
		return rbd_dev_v1_snap_name(rbd_dev, snap_id);

	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
}

static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_size)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_size = rbd_dev->header.image_size;
	} else if (rbd_dev->image_format == 1) {
		u32 which;

		which = rbd_dev_snap_index(rbd_dev, snap_id);
		if (which == BAD_SNAP_INDEX)
			return -ENOENT;

		*snap_size = rbd_dev->header.snap_sizes[which];
	} else {
		u64 size = 0;
		int ret;

		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
		if (ret)
			return ret;

		*snap_size = size;
	}
	return 0;
}

static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
			u64 *snap_features)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_features = rbd_dev->header.features;
	} else if (rbd_dev->image_format == 1) {
		*snap_features = 0;	/* No features for format 1 */
	} else {
		u64 features = 0;
		int ret;

		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
		if (ret)
			return ret;

		*snap_features = features;
	}
	return 0;
}

static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
{
	u64 snap_id = rbd_dev->spec->snap_id;
	u64 size = 0;
	u64 features = 0;
	int ret;

	ret = rbd_snap_size(rbd_dev, snap_id, &size);
	if (ret)
		return ret;
	ret = rbd_snap_features(rbd_dev, snap_id, &features);
	if (ret)
		return ret;

	rbd_dev->mapping.size = size;
	rbd_dev->mapping.features = features;

	return 0;
}
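
/*
 * Per the commit subject, rbd_dev_mapping_set() is called from
 * rbd_dev_image_probe() (declared above), once rbd_dev->spec->snap_id
 * has been resolved.  A sketch of the call, assuming probe context
 * (the error label is hypothetical):
 *
 *	ret = rbd_dev_mapping_set(rbd_dev);
 *	if (ret)
 *		goto err_out_probe;
 */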

static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
{
	rbd_dev->mapping.size = 0;
	rbd_dev->mapping.features = 0;
}

static void zero_bvec(struct bio_vec *bv)
{
	void *buf;
	unsigned long flags;

	buf = bvec_kmap_irq(bv, &flags);
	memset(buf, 0, bv->bv_len);
	flush_dcache_page(bv->bv_page);
	bvec_kunmap_irq(buf, &flags);
}

static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
{
	struct ceph_bio_iter it = *bio_pos;

	ceph_bio_iter_advance(&it, off);
	ceph_bio_iter_advance_step(&it, bytes, ({
		zero_bvec(&bv);
	}));
}

static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
{
	struct ceph_bvec_iter it = *bvec_pos;

	ceph_bvec_iter_advance(&it, off);
	ceph_bvec_iter_advance_step(&it, bytes, ({
		zero_bvec(&bv);
	}));
}

/*
 * Zero a range in @obj_req data buffer defined by a bio (list) or
 * (private) bio_vec array.
 *
 * @off is relative to the start of the data buffer.
 */
static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
			       u32 bytes)
{
	dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);

	switch (obj_req->img_request->data_type) {
	case OBJ_REQUEST_BIO:
		zero_bios(&obj_req->bio_pos, off, bytes);
		break;
	case OBJ_REQUEST_BVECS:
	case OBJ_REQUEST_OWN_BVECS:
		zero_bvecs(&obj_req->bvec_pos, off, bytes);
		break;
	default:
		BUG();
	}
}

static void rbd_obj_request_destroy(struct kref *kref);
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		kref_read(&obj_request->kref));
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}

static void rbd_img_request_destroy(struct kref *kref);
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	dout("%s: img %p (was %d)\n", __func__, img_request,
		kref_read(&img_request->kref));
	kref_put(&img_request->kref, rbd_img_request_destroy);
}

static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->img_request == NULL);

	/* Image request now owns object's original reference */
	obj_request->img_request = img_request;
	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
}

static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
	list_del(&obj_request->ex.oe_item);
	rbd_assert(obj_request->img_request == img_request);
	rbd_obj_request_put(obj_request);
}

static void rbd_osd_submit(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;

	dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
	     __func__, osd_req, obj_req, obj_req->ex.oe_objno,
	     obj_req->ex.oe_off, obj_req->ex.oe_len);
	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
}

/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time
 * and currently never change thereafter.
 */
static void img_request_layered_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_LAYERED, &img_request->flags);
	smp_mb();
}

static void img_request_layered_clear(struct rbd_img_request *img_request)
{
	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
	smp_mb();
}

static bool img_request_layered_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
}

static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

	return !obj_req->ex.oe_off &&
	       obj_req->ex.oe_len == rbd_dev->layout.object_size;
}

static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

	return obj_req->ex.oe_off + obj_req->ex.oe_len ==
					rbd_dev->layout.object_size;
}

/*
 * Must be called after rbd_obj_calc_img_extents().
 */
static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req)
{
	if (!obj_req->num_img_extents ||
	    (rbd_obj_is_entire(obj_req) &&
	     !obj_req->img_request->snapc->num_snaps))
		return false;

	return true;
}

static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
{
	return ceph_file_extents_bytes(obj_req->img_extents,
				       obj_req->num_img_extents);
}

static bool rbd_img_is_write(struct rbd_img_request *img_req)
{
	switch (img_req->op_type) {
	case OBJ_OP_READ:
		return false;
	case OBJ_OP_WRITE:
	case OBJ_OP_DISCARD:
	case OBJ_OP_ZEROOUT:
		return true;
	default:
		BUG();
	}
}

static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;
	int result;

	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
	     osd_req->r_result, obj_req);

	/*
	 * Writes aren't allowed to return a data payload.  In some
	 * guarded write cases (e.g. stat + zero on an empty object)
	 * a stat response makes it through, but we don't care.
	 */
	if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
		result = 0;
	else
		result = osd_req->r_result;

	rbd_obj_handle_request(obj_req, result);
}

static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;

	osd_req->r_flags = CEPH_OSD_FLAG_READ;
	osd_req->r_snapid = obj_request->img_request->snap_id;
}

static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;

	osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
	ktime_get_real_ts64(&osd_req->r_mtime);
	osd_req->r_data_offset = obj_request->ex.oe_off;
}

static struct ceph_osd_request *
__rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
			  struct ceph_snap_context *snapc, int num_ops)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;
	const char *name_format = rbd_dev->image_format == 1 ?
				      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
	int ret;

	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
	if (!req)
		return ERR_PTR(-ENOMEM);

	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
	req->r_callback = rbd_osd_req_callback;
	req->r_priv = obj_req;

	/*
	 * Data objects may be stored in a separate pool, but always in
	 * the same namespace in that pool as the header in its pool.
	 */
	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
	req->r_base_oloc.pool = rbd_dev->layout.pool_id;

	ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
			       rbd_dev->header.object_prefix,
			       obj_req->ex.oe_objno);
	if (ret)
		return ERR_PTR(ret);

	return req;
}

static struct ceph_osd_request *
rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
{
	return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
					 num_ops);
}

static struct rbd_obj_request *rbd_obj_request_create(void)
{
	struct rbd_obj_request *obj_request;

	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
	if (!obj_request)
		return NULL;

	ceph_object_extent_init(&obj_request->ex);
	INIT_LIST_HEAD(&obj_request->osd_reqs);
	mutex_init(&obj_request->state_mutex);
	kref_init(&obj_request->kref);

	dout("%s %p\n", __func__, obj_request);
	return obj_request;
}

static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;
	struct ceph_osd_request *osd_req;
	u32 i;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	dout("%s: obj %p\n", __func__, obj_request);

	while (!list_empty(&obj_request->osd_reqs)) {
		osd_req = list_first_entry(&obj_request->osd_reqs,
				    struct ceph_osd_request, r_private_item);
		list_del_init(&osd_req->r_private_item);
		ceph_osdc_put_request(osd_req);
	}

	switch (obj_request->img_request->data_type) {
	case OBJ_REQUEST_NODATA:
	case OBJ_REQUEST_BIO:
	case OBJ_REQUEST_BVECS:
		break;		/* Nothing to do */
	case OBJ_REQUEST_OWN_BVECS:
		kfree(obj_request->bvec_pos.bvecs);
		break;
	default:
		BUG();
	}

	kfree(obj_request->img_extents);
	if (obj_request->copyup_bvecs) {
		for (i = 0; i < obj_request->copyup_bvec_count; i++) {
			if (obj_request->copyup_bvecs[i].bv_page)
				__free_page(obj_request->copyup_bvecs[i].bv_page);
		}
		kfree(obj_request->copyup_bvecs);
	}

	kmem_cache_free(rbd_obj_request_cache, obj_request);
}

/* It's OK to call this for a device with no parent */

static void rbd_spec_put(struct rbd_spec *spec);
static void rbd_dev_unparent(struct rbd_device *rbd_dev)
{
	rbd_dev_remove_parent(rbd_dev);
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;
}

/*
 * Parent image reference counting is used to determine when an
 * image's parent fields can be safely torn down--after there are no
 * more in-flight requests to the parent image.  When the last
 * reference is dropped, cleaning them up is safe.
 */
static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
{
	int counter;

	if (!rbd_dev->parent_spec)
		return;

	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
	if (counter > 0)
		return;

	/* Last reference; clean up parent data structures */

	if (!counter)
		rbd_dev_unparent(rbd_dev);
	else
		rbd_warn(rbd_dev, "parent reference underflow");
}

/*
 * If an image has a non-zero parent overlap, get a reference to its
 * parent.
 *
 * Returns true if the rbd device has a parent with a non-zero
 * overlap and a reference for it was successfully taken, or
 * false otherwise.
 */
static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
{
	int counter = 0;

	if (!rbd_dev->parent_spec)
		return false;

	down_read(&rbd_dev->header_rwsem);
	if (rbd_dev->parent_overlap)
		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
	up_read(&rbd_dev->header_rwsem);

	if (counter < 0)
		rbd_warn(rbd_dev, "parent reference overflow");

	return counter > 0;
}
1708}

/*
 * Caller is responsible for filling in the list of object requests
 * that comprises the image request, and the Linux request pointer
 * (if there is one).
 */
static struct rbd_img_request *rbd_img_request_create(
					struct rbd_device *rbd_dev,
					enum obj_operation_type op_type,
					struct ceph_snap_context *snapc)
{
	struct rbd_img_request *img_request;

	img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
	if (!img_request)
		return NULL;

	img_request->rbd_dev = rbd_dev;
	img_request->op_type = op_type;
	if (!rbd_img_is_write(img_request))
		img_request->snap_id = rbd_dev->spec->snap_id;
	else
		img_request->snapc = snapc;

	if (rbd_dev_parent_get(rbd_dev))
		img_request_layered_set(img_request);

	INIT_LIST_HEAD(&img_request->lock_item);
	INIT_LIST_HEAD(&img_request->object_extents);
	mutex_init(&img_request->state_mutex);
	kref_init(&img_request->kref);

	dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
	     obj_op_name(op_type), img_request);
	return img_request;
}

static void rbd_img_request_destroy(struct kref *kref)
{
	struct rbd_img_request *img_request;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	img_request = container_of(kref, struct rbd_img_request, kref);

	dout("%s: img %p\n", __func__, img_request);

	WARN_ON(!list_empty(&img_request->lock_item));
	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);

	if (img_request_layered_test(img_request)) {
		img_request_layered_clear(img_request);
		rbd_dev_parent_put(img_request->rbd_dev);
	}

	if (rbd_img_is_write(img_request))
		ceph_put_snap_context(img_request->snapc);

	kmem_cache_free(rbd_img_request_cache, img_request);
}

static void prune_extents(struct ceph_file_extent *img_extents,
			  u32 *num_img_extents, u64 overlap)
{
	u32 cnt = *num_img_extents;

	/* drop extents completely beyond the overlap */
	while (cnt && img_extents[cnt - 1].fe_off >= overlap)
		cnt--;

	if (cnt) {
		struct ceph_file_extent *ex = &img_extents[cnt - 1];

		/* trim final overlapping extent */
		if (ex->fe_off + ex->fe_len > overlap)
			ex->fe_len = overlap - ex->fe_off;
	}

	*num_img_extents = cnt;
}
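
/*
 * Example (illustration only): with a parent overlap of 8M, the extent
 * list { 6M~1M, 7M~1M, 9M~1M } loses 9M~1M entirely because it starts
 * at or beyond the overlap, leaving cnt == 2; 7M~1M ends exactly at
 * the overlap and is kept whole, while a 7M~2M extent would have had
 * its fe_len trimmed back to 1M.
 */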

/*
 * Determine the byte range(s) covered by either just the object extent
 * or the entire object in the parent image.
 */
static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
				    bool entire)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	int ret;

	if (!rbd_dev->parent_overlap)
		return 0;

	ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
				  entire ? 0 : obj_req->ex.oe_off,
				  entire ? rbd_dev->layout.object_size :
							obj_req->ex.oe_len,
				  &obj_req->img_extents,
				  &obj_req->num_img_extents);
	if (ret)
		return ret;

	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
		      rbd_dev->parent_overlap);
	return 0;
}

static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;

	switch (obj_req->img_request->data_type) {
	case OBJ_REQUEST_BIO:
		osd_req_op_extent_osd_data_bio(osd_req, which,
					       &obj_req->bio_pos,
					       obj_req->ex.oe_len);
		break;
	case OBJ_REQUEST_BVECS:
	case OBJ_REQUEST_OWN_BVECS:
		rbd_assert(obj_req->bvec_pos.iter.bi_size ==
							obj_req->ex.oe_len);
		rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
		osd_req_op_extent_osd_data_bvec_pos(osd_req, which,
						    &obj_req->bvec_pos);
		break;
	default:
		BUG();
	}
}

static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which)
{
	struct page **pages;

	/*
	 * The response data for a STAT call consists of:
	 *     le64 length;
	 *     struct {
	 *         le32 tv_sec;
	 *         le32 tv_nsec;
	 *     } mtime;
	 */
	pages = ceph_alloc_page_vector(1, GFP_NOIO);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0);
	osd_req_op_raw_data_in_pages(osd_req, which, pages,
				     8 + sizeof(struct ceph_timespec),
				     0, false, true);
	return 0;
}

static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which,
				u32 bytes)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;
	int ret;

	ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup");
	if (ret)
		return ret;

	osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs,
					  obj_req->copyup_bvec_count, bytes);
	return 0;
}

static int rbd_obj_init_read(struct rbd_obj_request *obj_req)
{
	obj_req->read_state = RBD_OBJ_READ_START;
	return 0;
}

static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
				      int which)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	u16 opcode;

	osd_req_op_alloc_hint_init(osd_req, which++,
				   rbd_dev->layout.object_size,
				   rbd_dev->layout.object_size);

	if (rbd_obj_is_entire(obj_req))
		opcode = CEPH_OSD_OP_WRITEFULL;
	else
		opcode = CEPH_OSD_OP_WRITE;

	osd_req_op_extent_init(osd_req, which, opcode,
			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
	rbd_osd_setup_data(osd_req, which);
}

static int rbd_obj_init_write(struct rbd_obj_request *obj_req)
{
	int ret;

	/* reverse map the entire object onto the parent */
	ret = rbd_obj_calc_img_extents(obj_req, true);
	if (ret)
		return ret;

	if (rbd_obj_copyup_enabled(obj_req))
		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;

	obj_req->write_state = RBD_OBJ_WRITE_START;
	return 0;
}

static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
{
	return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
					  CEPH_OSD_OP_ZERO;
}

static void __rbd_osd_setup_discard_ops(struct ceph_osd_request *osd_req,
					int which)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;

	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
		rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
		osd_req_op_init(osd_req, which, CEPH_OSD_OP_DELETE, 0);
	} else {
		osd_req_op_extent_init(osd_req, which,
				       truncate_or_zero_opcode(obj_req),
				       obj_req->ex.oe_off, obj_req->ex.oe_len,
				       0, 0);
	}
}

static int rbd_obj_init_discard(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	u64 off, next_off;
	int ret;

	/*
	 * Align the range to alloc_size boundary and punt on discards
	 * that are too small to free up any space.
	 *
	 * alloc_size == object_size && is_tail() is a special case for
	 * filestore with filestore_punch_hole = false, needed to allow
	 * truncate (in addition to delete).
	 */
	if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
	    !rbd_obj_is_tail(obj_req)) {
		off = round_up(obj_req->ex.oe_off, rbd_dev->opts->alloc_size);
		next_off = round_down(obj_req->ex.oe_off + obj_req->ex.oe_len,
				      rbd_dev->opts->alloc_size);
		if (off >= next_off)
			return 1;

		dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
		     obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
		     off, next_off - off);
		obj_req->ex.oe_off = off;
		obj_req->ex.oe_len = next_off - off;
	}

	/* reverse map the entire object onto the parent */
	ret = rbd_obj_calc_img_extents(obj_req, true);
	if (ret)
		return ret;

	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
		obj_req->flags |= RBD_OBJ_FLAG_DELETION;

	obj_req->write_state = RBD_OBJ_WRITE_START;
	return 0;
}
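
/*
 * Example (illustration only): with alloc_size = 64K, a 100K~200K
 * discard is shrunk to the aligned range 128K~128K (off rounds up to
 * 128K, the end 300K rounds down to 256K).  A discard confined to one
 * alloc_size chunk, e.g. 130K~10K, ends up with off >= next_off and
 * is dropped from the image request by returning 1 to
 * __rbd_img_fill_request().
 */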

static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req,
					int which)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;
	u16 opcode;

	if (rbd_obj_is_entire(obj_req)) {
		if (obj_req->num_img_extents) {
			if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
				osd_req_op_init(osd_req, which++,
						CEPH_OSD_OP_CREATE, 0);
			opcode = CEPH_OSD_OP_TRUNCATE;
		} else {
			rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
			osd_req_op_init(osd_req, which++,
					CEPH_OSD_OP_DELETE, 0);
			opcode = 0;
		}
	} else {
		opcode = truncate_or_zero_opcode(obj_req);
	}

	if (opcode)
		osd_req_op_extent_init(osd_req, which, opcode,
				       obj_req->ex.oe_off, obj_req->ex.oe_len,
				       0, 0);
}

static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
{
	int ret;

	/* reverse map the entire object onto the parent */
	ret = rbd_obj_calc_img_extents(obj_req, true);
	if (ret)
		return ret;

	if (rbd_obj_copyup_enabled(obj_req))
		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
	if (!obj_req->num_img_extents) {
		if (rbd_obj_is_entire(obj_req))
			obj_req->flags |= RBD_OBJ_FLAG_DELETION;
	}

	obj_req->write_state = RBD_OBJ_WRITE_START;
	return 0;
}

static int count_write_ops(struct rbd_obj_request *obj_req)
{
	switch (obj_req->img_request->op_type) {
	case OBJ_OP_WRITE:
		return 2; /* setallochint + write/writefull */
	case OBJ_OP_DISCARD:
		return 1; /* delete/truncate/zero */
	case OBJ_OP_ZEROOUT:
		if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
		    !(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
			return 2; /* create + truncate */

		return 1; /* delete/truncate/zero */
	default:
		BUG();
	}
}

static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
				    int which)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;

	switch (obj_req->img_request->op_type) {
	case OBJ_OP_WRITE:
		__rbd_osd_setup_write_ops(osd_req, which);
		break;
	case OBJ_OP_DISCARD:
		__rbd_osd_setup_discard_ops(osd_req, which);
		break;
	case OBJ_OP_ZEROOUT:
		__rbd_osd_setup_zeroout_ops(osd_req, which);
		break;
	default:
		BUG();
	}
}

/*
 * Prune the list of object requests (adjust offset and/or length, drop
 * redundant requests).  Prepare object request state machines and image
 * request state machine for execution.
 */
static int __rbd_img_fill_request(struct rbd_img_request *img_req)
{
	struct rbd_obj_request *obj_req, *next_obj_req;
	int ret;

	for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
		switch (img_req->op_type) {
		case OBJ_OP_READ:
			ret = rbd_obj_init_read(obj_req);
			break;
		case OBJ_OP_WRITE:
			ret = rbd_obj_init_write(obj_req);
			break;
		case OBJ_OP_DISCARD:
			ret = rbd_obj_init_discard(obj_req);
			break;
		case OBJ_OP_ZEROOUT:
			ret = rbd_obj_init_zeroout(obj_req);
			break;
		default:
			BUG();
		}
		if (ret < 0)
			return ret;
		if (ret > 0) {
			rbd_img_obj_request_del(img_req, obj_req);
			continue;
		}
	}

	img_req->state = RBD_IMG_START;
	return 0;
}

union rbd_img_fill_iter {
	struct ceph_bio_iter	bio_iter;
	struct ceph_bvec_iter	bvec_iter;
};

struct rbd_img_fill_ctx {
	enum obj_request_type	pos_type;
	union rbd_img_fill_iter	*pos;
	union rbd_img_fill_iter	iter;
	ceph_object_extent_fn_t	set_pos_fn;
	ceph_object_extent_fn_t	count_fn;
	ceph_object_extent_fn_t	copy_fn;
};

static struct ceph_object_extent *alloc_object_extent(void *arg)
{
	struct rbd_img_request *img_req = arg;
	struct rbd_obj_request *obj_req;

	obj_req = rbd_obj_request_create();
	if (!obj_req)
		return NULL;

	rbd_img_obj_request_add(img_req, obj_req);
	return &obj_req->ex;
}

/*
 * While su != os && sc == 1 is technically not fancy (it's the same
 * layout as su == os && sc == 1), we can't use the nocopy path for it
 * because ->set_pos_fn() should be called only once per object.
 * ceph_file_to_extents() invokes action_fn once per stripe unit, so
 * treat su != os && sc == 1 as fancy.
 */
static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
{
	return l->stripe_unit != l->object_size;
}
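
/*
 * E.g. the default layout (su == os, sc == 1) is not fancy and takes
 * the nocopy path below, while a striped image with su = 64K and
 * os = 4M is fancy whatever the stripe count: a single bio_vec may
 * then straddle stripe unit boundaries and has to be split into
 * per-object copies.
 */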

static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
				       struct ceph_file_extent *img_extents,
				       u32 num_img_extents,
				       struct rbd_img_fill_ctx *fctx)
{
	u32 i;
	int ret;

	img_req->data_type = fctx->pos_type;

	/*
	 * Create object requests and set each object request's starting
	 * position in the provided bio (list) or bio_vec array.
	 */
	fctx->iter = *fctx->pos;
	for (i = 0; i < num_img_extents; i++) {
		ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
					   img_extents[i].fe_off,
					   img_extents[i].fe_len,
					   &img_req->object_extents,
					   alloc_object_extent, img_req,
					   fctx->set_pos_fn, &fctx->iter);
		if (ret)
			return ret;
	}

	return __rbd_img_fill_request(img_req);
}

/*
 * Map a list of image extents to a list of object extents, create the
 * corresponding object requests (normally each to a different object,
 * but not always) and add them to @img_req.  For each object request,
 * set up its data descriptor to point to the corresponding chunk(s) of
 * @fctx->pos data buffer.
 *
 * Because ceph_file_to_extents() will merge adjacent object extents
 * together, each object request's data descriptor may point to multiple
 * different chunks of @fctx->pos data buffer.
 *
 * @fctx->pos data buffer is assumed to be large enough.
 */
static int rbd_img_fill_request(struct rbd_img_request *img_req,
				struct ceph_file_extent *img_extents,
				u32 num_img_extents,
				struct rbd_img_fill_ctx *fctx)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	struct rbd_obj_request *obj_req;
	u32 i;
	int ret;

	if (fctx->pos_type == OBJ_REQUEST_NODATA ||
	    !rbd_layout_is_fancy(&rbd_dev->layout))
		return rbd_img_fill_request_nocopy(img_req, img_extents,
						   num_img_extents, fctx);

	img_req->data_type = OBJ_REQUEST_OWN_BVECS;

	/*
	 * Create object requests and determine ->bvec_count for each object
	 * request.  Note that ->bvec_count sum over all object requests may
	 * be greater than the number of bio_vecs in the provided bio (list)
	 * or bio_vec array because when mapped, those bio_vecs can straddle
	 * stripe unit boundaries.
	 */
	fctx->iter = *fctx->pos;
	for (i = 0; i < num_img_extents; i++) {
		ret = ceph_file_to_extents(&rbd_dev->layout,
					   img_extents[i].fe_off,
					   img_extents[i].fe_len,
					   &img_req->object_extents,
					   alloc_object_extent, img_req,
					   fctx->count_fn, &fctx->iter);
		if (ret)
			return ret;
	}

	for_each_obj_request(img_req, obj_req) {
		obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
					      sizeof(*obj_req->bvec_pos.bvecs),
					      GFP_NOIO);
		if (!obj_req->bvec_pos.bvecs)
			return -ENOMEM;
	}

	/*
	 * Fill in each object request's private bio_vec array, splitting and
	 * rearranging the provided bio_vecs in stripe unit chunks as needed.
	 */
	fctx->iter = *fctx->pos;
	for (i = 0; i < num_img_extents; i++) {
		ret = ceph_iterate_extents(&rbd_dev->layout,
					   img_extents[i].fe_off,
					   img_extents[i].fe_len,
					   &img_req->object_extents,
					   fctx->copy_fn, &fctx->iter);
		if (ret)
			return ret;
	}

	return __rbd_img_fill_request(img_req);
}
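
/*
 * In short, the fancy-layout path above makes three passes over the
 * image extents: ceph_file_to_extents() to create object requests and
 * count bio_vecs per object, an allocation pass for each request's
 * private bio_vec array, and ceph_iterate_extents() to copy the
 * bio_vecs into place in stripe unit chunks.
 */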

static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
			       u64 off, u64 len)
{
	struct ceph_file_extent ex = { off, len };
	union rbd_img_fill_iter dummy;
	struct rbd_img_fill_ctx fctx = {
		.pos_type = OBJ_REQUEST_NODATA,
		.pos = &dummy,
	};

	return rbd_img_fill_request(img_req, &ex, 1, &fctx);
}

static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bio_iter *it = arg;

	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
	obj_req->bio_pos = *it;
	ceph_bio_iter_advance(it, bytes);
}

static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bio_iter *it = arg;

	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
	ceph_bio_iter_advance_step(it, bytes, ({
		obj_req->bvec_count++;
	}));
}

static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bio_iter *it = arg;

	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
	ceph_bio_iter_advance_step(it, bytes, ({
		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
	}));
}

static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
				   struct ceph_file_extent *img_extents,
				   u32 num_img_extents,
				   struct ceph_bio_iter *bio_pos)
{
	struct rbd_img_fill_ctx fctx = {
		.pos_type = OBJ_REQUEST_BIO,
		.pos = (union rbd_img_fill_iter *)bio_pos,
		.set_pos_fn = set_bio_pos,
		.count_fn = count_bio_bvecs,
		.copy_fn = copy_bio_bvecs,
	};

	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
				    &fctx);
}

static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
				 u64 off, u64 len, struct bio *bio)
{
	struct ceph_file_extent ex = { off, len };
	struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };

	return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
}

static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bvec_iter *it = arg;

	obj_req->bvec_pos = *it;
	ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
	ceph_bvec_iter_advance(it, bytes);
}

static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bvec_iter *it = arg;

	ceph_bvec_iter_advance_step(it, bytes, ({
		obj_req->bvec_count++;
	}));
}

static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bvec_iter *it = arg;

	ceph_bvec_iter_advance_step(it, bytes, ({
		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
	}));
}

static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
				     struct ceph_file_extent *img_extents,
				     u32 num_img_extents,
				     struct ceph_bvec_iter *bvec_pos)
{
	struct rbd_img_fill_ctx fctx = {
		.pos_type = OBJ_REQUEST_BVECS,
		.pos = (union rbd_img_fill_iter *)bvec_pos,
		.set_pos_fn = set_bvec_pos,
		.count_fn = count_bvecs,
		.copy_fn = copy_bvecs,
	};

	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
				    &fctx);
}

static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
				   struct ceph_file_extent *img_extents,
				   u32 num_img_extents,
				   struct bio_vec *bvecs)
{
	struct ceph_bvec_iter it = {
		.bvecs = bvecs,
		.iter = { .bi_size = ceph_file_extents_bytes(img_extents,
							     num_img_extents) },
	};

	return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
					 &it);
}

static void rbd_img_handle_request_work(struct work_struct *work)
{
	struct rbd_img_request *img_req =
	    container_of(work, struct rbd_img_request, work);

	rbd_img_handle_request(img_req, img_req->work_result);
}

static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
{
	INIT_WORK(&img_req->work, rbd_img_handle_request_work);
	img_req->work_result = result;
	queue_work(rbd_wq, &img_req->work);
}
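
/*
 * Bouncing completion handling through rbd_wq here keeps deeply
 * layered images from recursing through rbd_img_handle_request() on
 * the submitter's stack -- see the "avoid parent chain recursion"
 * comment in rbd_obj_read_from_parent() below.
 */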

static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
{
	struct ceph_osd_request *osd_req;
	int ret;

	osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1);
	if (IS_ERR(osd_req))
		return PTR_ERR(osd_req);

	osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ,
			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
	rbd_osd_setup_data(osd_req, 0);
	rbd_osd_format_read(osd_req);

	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
	if (ret)
		return ret;

	rbd_osd_submit(osd_req);
	return 0;
}

static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
{
	struct rbd_img_request *img_req = obj_req->img_request;
	struct rbd_img_request *child_img_req;
	int ret;

	child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
					       OBJ_OP_READ, NULL);
	if (!child_img_req)
		return -ENOMEM;

	__set_bit(IMG_REQ_CHILD, &child_img_req->flags);
	child_img_req->obj_request = obj_req;

	if (!rbd_img_is_write(img_req)) {
		switch (img_req->data_type) {
		case OBJ_REQUEST_BIO:
			ret = __rbd_img_fill_from_bio(child_img_req,
						      obj_req->img_extents,
						      obj_req->num_img_extents,
						      &obj_req->bio_pos);
			break;
		case OBJ_REQUEST_BVECS:
		case OBJ_REQUEST_OWN_BVECS:
			ret = __rbd_img_fill_from_bvecs(child_img_req,
						      obj_req->img_extents,
						      obj_req->num_img_extents,
						      &obj_req->bvec_pos);
			break;
		default:
			BUG();
		}
	} else {
		ret = rbd_img_fill_from_bvecs(child_img_req,
					      obj_req->img_extents,
					      obj_req->num_img_extents,
					      obj_req->copyup_bvecs);
	}
	if (ret) {
		rbd_img_request_put(child_img_req);
		return ret;
	}

	/* avoid parent chain recursion */
	rbd_img_schedule(child_img_req, 0);
	return 0;
}

static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	int ret;

	switch (obj_req->read_state) {
	case RBD_OBJ_READ_START:
		rbd_assert(!*result);

		ret = rbd_obj_read_object(obj_req);
		if (ret) {
			*result = ret;
			return true;
		}
		obj_req->read_state = RBD_OBJ_READ_OBJECT;
		return false;
	case RBD_OBJ_READ_OBJECT:
		if (*result == -ENOENT && rbd_dev->parent_overlap) {
			/* reverse map this object extent onto the parent */
			ret = rbd_obj_calc_img_extents(obj_req, false);
			if (ret) {
				*result = ret;
				return true;
			}
			if (obj_req->num_img_extents) {
				ret = rbd_obj_read_from_parent(obj_req);
				if (ret) {
					*result = ret;
					return true;
				}
				obj_req->read_state = RBD_OBJ_READ_PARENT;
				return false;
			}
		}

		/*
		 * -ENOENT means a hole in the image -- zero-fill the entire
		 * length of the request.  A short read also implies zero-fill
		 * to the end of the request.
		 */
		if (*result == -ENOENT) {
			rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len);
			*result = 0;
		} else if (*result >= 0) {
			if (*result < obj_req->ex.oe_len)
				rbd_obj_zero_range(obj_req, *result,
					    obj_req->ex.oe_len - *result);
			else
				rbd_assert(*result == obj_req->ex.oe_len);
			*result = 0;
		}
		return true;
	case RBD_OBJ_READ_PARENT:
		return true;
	default:
		BUG();
	}
}
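
/*
 * Read state machine in brief: RBD_OBJ_READ_START submits the object
 * read; RBD_OBJ_READ_OBJECT either completes (zero-filling holes and
 * short reads), or, on -ENOENT within the parent overlap, kicks off a
 * child image request and parks in RBD_OBJ_READ_PARENT until that
 * request completes.
 */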

static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
{
	struct ceph_osd_request *osd_req;
	int num_ops = count_write_ops(obj_req);
	int which = 0;
	int ret;

	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
		num_ops++; /* stat */

	osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
	if (IS_ERR(osd_req))
		return PTR_ERR(osd_req);

	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
		ret = rbd_osd_setup_stat(osd_req, which++);
		if (ret)
			return ret;
	}

	rbd_osd_setup_write_ops(osd_req, which);
	rbd_osd_format_write(osd_req);

	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
	if (ret)
		return ret;

	rbd_osd_submit(osd_req);
	return 0;
}

/*
 * copyup_bvecs pages are never highmem pages
 */
static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
{
	struct ceph_bvec_iter it = {
		.bvecs = bvecs,
		.iter = { .bi_size = bytes },
	};

	ceph_bvec_iter_advance_step(&it, bytes, ({
		if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
			       bv.bv_len))
			return false;
	}));
	return true;
}

#define MODS_ONLY	U32_MAX

static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req,
				      u32 bytes)
{
	struct ceph_osd_request *osd_req;
	int ret;

	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
	rbd_assert(bytes > 0 && bytes != MODS_ONLY);

	osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1);
	if (IS_ERR(osd_req))
		return PTR_ERR(osd_req);

	ret = rbd_osd_setup_copyup(osd_req, 0, bytes);
	if (ret)
		return ret;

	rbd_osd_format_write(osd_req);

	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
	if (ret)
		return ret;

	rbd_osd_submit(osd_req);
	return 0;
}

static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req,
					u32 bytes)
{
	struct ceph_osd_request *osd_req;
	int num_ops = count_write_ops(obj_req);
	int which = 0;
	int ret;

	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);

	if (bytes != MODS_ONLY)
		num_ops++; /* copyup */

	osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
	if (IS_ERR(osd_req))
		return PTR_ERR(osd_req);

	if (bytes != MODS_ONLY) {
		ret = rbd_osd_setup_copyup(osd_req, which++, bytes);
		if (ret)
			return ret;
	}

	rbd_osd_setup_write_ops(osd_req, which);
	rbd_osd_format_write(osd_req);

	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
	if (ret)
		return ret;

	rbd_osd_submit(osd_req);
	return 0;
}

static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
{
	u32 i;

	rbd_assert(!obj_req->copyup_bvecs);
	obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
	obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
					sizeof(*obj_req->copyup_bvecs),
					GFP_NOIO);
	if (!obj_req->copyup_bvecs)
		return -ENOMEM;

	for (i = 0; i < obj_req->copyup_bvec_count; i++) {
		unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);

		obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
		if (!obj_req->copyup_bvecs[i].bv_page)
			return -ENOMEM;

		obj_req->copyup_bvecs[i].bv_offset = 0;
		obj_req->copyup_bvecs[i].bv_len = len;
		obj_overlap -= len;
	}

	rbd_assert(!obj_overlap);
	return 0;
}
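
/*
 * Example (illustration only): an object overlap of 10000 bytes with
 * 4K pages gives calc_pages_for(0, 10000) == 3 bio_vecs covering
 * 4096 + 4096 + 1808 bytes, which is why the leftover obj_overlap is
 * asserted to be 0 at the end.
 */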

/*
 * The target object doesn't exist.  Read the data for the entire
 * target object up to the overlap point (if any) from the parent,
 * so we can use it for a copyup.
 */
static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	int ret;

	rbd_assert(obj_req->num_img_extents);
	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
		      rbd_dev->parent_overlap);
	if (!obj_req->num_img_extents) {
		/*
		 * The overlap has become 0 (most likely because the
		 * image has been flattened).  Re-submit the original write
		 * request -- pass MODS_ONLY since the copyup isn't needed
		 * anymore.
		 */
		return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY);
	}

	ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
	if (ret)
		return ret;

	return rbd_obj_read_from_parent(obj_req);
}

static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
{
	u32 bytes = rbd_obj_img_extents_bytes(obj_req);
	int ret;

	rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);

	/*
	 * Only send non-zero copyup data to save some I/O and network
	 * bandwidth -- zero copyup data is equivalent to the object not
	 * existing.
	 */
	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
		bytes = 0;

	if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
		/*
		 * Send a copyup request with an empty snapshot context to
		 * deep-copyup the object through all existing snapshots.
		 * A second request with the current snapshot context will be
		 * sent for the actual modification.
		 */
		ret = rbd_obj_copyup_empty_snapc(obj_req, bytes);
		if (ret) {
			obj_req->pending.result = ret;
			return;
		}

		obj_req->pending.num_pending++;
		bytes = MODS_ONLY;
	}

	ret = rbd_obj_copyup_current_snapc(obj_req, bytes);
	if (ret) {
		obj_req->pending.result = ret;
		return;
	}

	obj_req->pending.num_pending++;
}

static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result)
{
	int ret;

again:
	switch (obj_req->copyup_state) {
	case RBD_OBJ_COPYUP_START:
		rbd_assert(!*result);

		ret = rbd_obj_copyup_read_parent(obj_req);
		if (ret) {
			*result = ret;
			return true;
		}
		if (obj_req->num_img_extents)
			obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT;
		else
			obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
		return false;
	case RBD_OBJ_COPYUP_READ_PARENT:
		if (*result)
			return true;

		if (is_zero_bvecs(obj_req->copyup_bvecs,
				  rbd_obj_img_extents_bytes(obj_req))) {
			dout("%s %p detected zeros\n", __func__, obj_req);
			obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS;
		}

		rbd_obj_copyup_write_object(obj_req);
		if (!obj_req->pending.num_pending) {
			*result = obj_req->pending.result;
			obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
			goto again;
		}
		obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT;
		return false;
	case __RBD_OBJ_COPYUP_WRITE_OBJECT:
		if (!pending_result_dec(&obj_req->pending, result))
			return false;
		/* fall through */
	case RBD_OBJ_COPYUP_WRITE_OBJECT:
		return true;
	default:
		BUG();
	}
}
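
/*
 * Copyup flow in brief: RBD_OBJ_COPYUP_START issues the parent read
 * (or, if the overlap has vanished, re-submits just the modification);
 * RBD_OBJ_COPYUP_READ_PARENT scans the returned data for zeros and
 * fires off one or two OSD requests via rbd_obj_copyup_write_object();
 * __RBD_OBJ_COPYUP_WRITE_OBJECT counts those completions down before
 * the terminal RBD_OBJ_COPYUP_WRITE_OBJECT state reports the result.
 */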

static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	int ret;

again:
	switch (obj_req->write_state) {
	case RBD_OBJ_WRITE_START:
		rbd_assert(!*result);

		ret = rbd_obj_write_object(obj_req);
		if (ret) {
			*result = ret;
			return true;
		}
		obj_req->write_state = RBD_OBJ_WRITE_OBJECT;
		return false;
	case RBD_OBJ_WRITE_OBJECT:
		if (*result == -ENOENT) {
			if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
				*result = 0;
				obj_req->copyup_state = RBD_OBJ_COPYUP_START;
				obj_req->write_state = __RBD_OBJ_WRITE_COPYUP;
				goto again;
			}
			/*
			 * On a non-existent object:
			 *   delete - -ENOENT, truncate/zero - 0
			 */
			if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
				*result = 0;
		}
		if (*result)
			return true;

		obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
		goto again;
	case __RBD_OBJ_WRITE_COPYUP:
		if (!rbd_obj_advance_copyup(obj_req, result))
			return false;
		/* fall through */
	case RBD_OBJ_WRITE_COPYUP:
		if (*result)
			rbd_warn(rbd_dev, "copyup failed: %d", *result);
		return true;
	default:
		BUG();
	}
}
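
/*
 * Write state machine in brief: RBD_OBJ_WRITE_START submits the write
 * (preceded by a stat op when copyup is enabled); RBD_OBJ_WRITE_OBJECT
 * either finishes, or on -ENOENT hands off to the copyup state machine
 * above through __RBD_OBJ_WRITE_COPYUP; the terminal
 * RBD_OBJ_WRITE_COPYUP state only logs a copyup failure.
 */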

/*
 * Return true if @obj_req is completed.
 */
static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req,
				     int *result)
{
	struct rbd_img_request *img_req = obj_req->img_request;
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	bool done;

	mutex_lock(&obj_req->state_mutex);
	if (!rbd_img_is_write(img_req))
		done = rbd_obj_advance_read(obj_req, result);
	else
		done = rbd_obj_advance_write(obj_req, result);
	mutex_unlock(&obj_req->state_mutex);

	if (done && *result) {
		rbd_assert(*result < 0);
		rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d",
			 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
			 obj_req->ex.oe_off, obj_req->ex.oe_len, *result);
	}
	return done;
}

/*
 * This is open-coded in rbd_img_handle_request() to avoid parent chain
 * recursion.
 */
static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result)
{
	if (__rbd_obj_handle_request(obj_req, &result))
		rbd_img_handle_request(obj_req->img_request, result);
}

static bool need_exclusive_lock(struct rbd_img_request *img_req)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;

	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK))
		return false;

	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
		return false;

	rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
	if (rbd_dev->opts->lock_on_read)
		return true;

	return rbd_img_is_write(img_req);
}

static bool rbd_lock_add_request(struct rbd_img_request *img_req)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	bool locked;

	lockdep_assert_held(&rbd_dev->lock_rwsem);
	locked = rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED;
	spin_lock(&rbd_dev->lock_lists_lock);
	rbd_assert(list_empty(&img_req->lock_item));
	if (!locked)
		list_add_tail(&img_req->lock_item, &rbd_dev->acquiring_list);
	else
		list_add_tail(&img_req->lock_item, &rbd_dev->running_list);
	spin_unlock(&rbd_dev->lock_lists_lock);
	return locked;
}

static void rbd_lock_del_request(struct rbd_img_request *img_req)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	bool need_wakeup;

	lockdep_assert_held(&rbd_dev->lock_rwsem);
	spin_lock(&rbd_dev->lock_lists_lock);
	rbd_assert(!list_empty(&img_req->lock_item));
	list_del_init(&img_req->lock_item);
	need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING &&
		       list_empty(&rbd_dev->running_list));
	spin_unlock(&rbd_dev->lock_lists_lock);
	if (need_wakeup)
		complete(&rbd_dev->releasing_wait);
}

static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;

	if (!need_exclusive_lock(img_req))
		return 1;

	if (rbd_lock_add_request(img_req))
		return 1;

	if (rbd_dev->opts->exclusive) {
		WARN_ON(1); /* lock got released? */
		return -EROFS;
	}

	/*
	 * Note the use of mod_delayed_work() in rbd_acquire_lock()
	 * and cancel_delayed_work() in wake_lock_waiters().
	 */
	dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
	queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
	return 0;
}
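
/*
 * Return convention above: 1 - the image request may proceed at once
 * (no lock needed, or we already own it and joined the running list);
 * 0 - it was parked on the acquiring list and will be rescheduled by
 * wake_lock_waiters(); <0 - abort the request, e.g. -EROFS when the
 * image was mapped with the exclusive option but the lock is gone.
 */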

static void rbd_img_object_requests(struct rbd_img_request *img_req)
{
	struct rbd_obj_request *obj_req;

	rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);

	for_each_obj_request(img_req, obj_req) {
		int result = 0;

		if (__rbd_obj_handle_request(obj_req, &result)) {
			if (result) {
				img_req->pending.result = result;
				return;
			}
		} else {
			img_req->pending.num_pending++;
		}
	}
}

static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	int ret;

again:
	switch (img_req->state) {
	case RBD_IMG_START:
		rbd_assert(!*result);

		ret = rbd_img_exclusive_lock(img_req);
		if (ret < 0) {
			*result = ret;
			return true;
		}
		img_req->state = RBD_IMG_EXCLUSIVE_LOCK;
		if (ret > 0)
			goto again;
		return false;
	case RBD_IMG_EXCLUSIVE_LOCK:
		if (*result)
			return true;

		rbd_assert(!need_exclusive_lock(img_req) ||
			   __rbd_is_lock_owner(rbd_dev));

		rbd_img_object_requests(img_req);
		if (!img_req->pending.num_pending) {
			*result = img_req->pending.result;
			img_req->state = RBD_IMG_OBJECT_REQUESTS;
			goto again;
		}
		img_req->state = __RBD_IMG_OBJECT_REQUESTS;
		return false;
	case __RBD_IMG_OBJECT_REQUESTS:
		if (!pending_result_dec(&img_req->pending, result))
			return false;
		/* fall through */
	case RBD_IMG_OBJECT_REQUESTS:
		return true;
	default:
		BUG();
	}
}

/*
 * Return true if @img_req is completed.
 */
static bool __rbd_img_handle_request(struct rbd_img_request *img_req,
				     int *result)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	bool done;

	if (need_exclusive_lock(img_req)) {
		down_read(&rbd_dev->lock_rwsem);
		mutex_lock(&img_req->state_mutex);
		done = rbd_img_advance(img_req, result);
		if (done)
			rbd_lock_del_request(img_req);
		mutex_unlock(&img_req->state_mutex);
		up_read(&rbd_dev->lock_rwsem);
	} else {
		mutex_lock(&img_req->state_mutex);
		done = rbd_img_advance(img_req, result);
		mutex_unlock(&img_req->state_mutex);
	}

	if (done && *result) {
		rbd_assert(*result < 0);
		rbd_warn(rbd_dev, "%s%s result %d",
		      test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "",
		      obj_op_name(img_req->op_type), *result);
	}
	return done;
}

static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
{
again:
	if (!__rbd_img_handle_request(img_req, &result))
		return;

	if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
		struct rbd_obj_request *obj_req = img_req->obj_request;

		rbd_img_request_put(img_req);
		if (__rbd_obj_handle_request(obj_req, &result)) {
			img_req = obj_req->img_request;
			goto again;
		}
	} else {
		struct request *rq = img_req->rq;

		rbd_img_request_put(img_req);
		blk_mq_end_request(rq, errno_to_blk_status(result));
	}
}

static const struct rbd_client_id rbd_empty_cid;

static bool rbd_cid_equal(const struct rbd_client_id *lhs,
			  const struct rbd_client_id *rhs)
{
	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
}

static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
{
	struct rbd_client_id cid;

	mutex_lock(&rbd_dev->watch_mutex);
	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
	cid.handle = rbd_dev->watch_cookie;
	mutex_unlock(&rbd_dev->watch_mutex);
	return cid;
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
			      const struct rbd_client_id *cid)
{
	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
	     cid->gid, cid->handle);
	rbd_dev->owner_cid = *cid; /* struct */
}

static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
{
	mutex_lock(&rbd_dev->watch_mutex);
	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
	mutex_unlock(&rbd_dev->watch_mutex);
}

static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
{
	struct rbd_client_id cid = rbd_get_cid(rbd_dev);

	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
	strcpy(rbd_dev->lock_cookie, cookie);
	rbd_set_owner_cid(rbd_dev, &cid);
	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
}
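
/*
 * Note: the cookie set here is the one built by format_lock_cookie()
 * above -- RBD_LOCK_COOKIE_PREFIX, a space and the current watch
 * cookie (a u64, up to 20 digits).  The fixed char[32] buffer in
 * rbd_lock() below is assumed to be large enough for that worst case.
 */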

/*
 * lock_rwsem must be held for write
 */
static int rbd_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char cookie[32];
	int ret;

	WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
		rbd_dev->lock_cookie[0] != '\0');

	format_lock_cookie(rbd_dev, cookie);
	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
			    RBD_LOCK_TAG, "", 0);
	if (ret)
		return ret;

	__rbd_lock(rbd_dev, cookie);
	return 0;
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_unlock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
		rbd_dev->lock_cookie[0] == '\0');

	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			      RBD_LOCK_NAME, rbd_dev->lock_cookie);
	if (ret && ret != -ENOENT)
		rbd_warn(rbd_dev, "failed to unlock header: %d", ret);

	/* treat errors as the image is unlocked */
	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	rbd_dev->lock_cookie[0] = '\0';
	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
}

static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
				enum rbd_notify_op notify_op,
				struct page ***preply_pages,
				size_t *preply_len)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
	char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
	int buf_size = sizeof(buf);
	void *p = buf;

	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);

	/* encode *LockPayload NotifyMessage (op + ClientId) */
	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
	ceph_encode_32(&p, notify_op);
	ceph_encode_64(&p, cid.gid);
	ceph_encode_64(&p, cid.handle);

	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
				&rbd_dev->header_oloc, buf, buf_size,
				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
}

static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
			       enum rbd_notify_op notify_op)
{
	struct page **reply_pages;
	size_t reply_len;

	__rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
}

static void rbd_notify_acquired_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  acquired_lock_work);

	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
}

static void rbd_notify_released_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  released_lock_work);

	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
}

static int rbd_request_lock(struct rbd_device *rbd_dev)
{
	struct page **reply_pages;
	size_t reply_len;
	bool lock_owner_responded = false;
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
				   &reply_pages, &reply_len);
	if (ret && ret != -ETIMEDOUT) {
		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
		goto out;
	}

	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
		void *p = page_address(reply_pages[0]);
		void *const end = p + reply_len;
		u32 n;

		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
		while (n--) {
			u8 struct_v;
			u32 len;

			ceph_decode_need(&p, end, 8 + 8, e_inval);
			p += 8 + 8; /* skip gid and cookie */

			ceph_decode_32_safe(&p, end, len, e_inval);
			if (!len)
				continue;

			if (lock_owner_responded) {
				rbd_warn(rbd_dev,
					 "duplicate lock owners detected");
				ret = -EIO;
				goto out;
			}

			lock_owner_responded = true;
			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
						  &struct_v, &len);
			if (ret) {
				rbd_warn(rbd_dev,
					 "failed to decode ResponseMessage: %d",
					 ret);
				goto e_inval;
			}

			ret = ceph_decode_32(&p);
		}
	}

	if (!lock_owner_responded) {
		rbd_warn(rbd_dev, "no lock owners detected");
		ret = -ETIMEDOUT;
	}

out:
	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
	return ret;

e_inval:
	ret = -EINVAL;
	goto out;
}

/*
 * Either image request state machine(s) or rbd_add_acquire_lock()
 * (i.e. "rbd map").
 */
static void wake_lock_waiters(struct rbd_device *rbd_dev, int result)
{
	struct rbd_img_request *img_req;

	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
	lockdep_assert_held_exclusive(&rbd_dev->lock_rwsem);

	cancel_delayed_work(&rbd_dev->lock_dwork);
	if (!completion_done(&rbd_dev->acquire_wait)) {
		rbd_assert(list_empty(&rbd_dev->acquiring_list) &&
			   list_empty(&rbd_dev->running_list));
		rbd_dev->acquire_err = result;
		complete_all(&rbd_dev->acquire_wait);
		return;
	}

	list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) {
		mutex_lock(&img_req->state_mutex);
		rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK);
		rbd_img_schedule(img_req, result);
		mutex_unlock(&img_req->state_mutex);
	}

	list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list);
}

static int get_lock_owner_info(struct rbd_device *rbd_dev,
			       struct ceph_locker **lockers, u32 *num_lockers)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	u8 lock_type;
	char *lock_tag;
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
				 &lock_type, &lock_tag, lockers, num_lockers);
	if (ret)
		return ret;

	if (*num_lockers == 0) {
		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
		goto out;
	}

	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
			 lock_tag);
		ret = -EBUSY;
		goto out;
	}

	if (lock_type == CEPH_CLS_LOCK_SHARED) {
		rbd_warn(rbd_dev, "shared lock type detected");
		ret = -EBUSY;
		goto out;
	}

	if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
		    strlen(RBD_LOCK_COOKIE_PREFIX))) {
		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
			 (*lockers)[0].id.cookie);
		ret = -EBUSY;
		goto out;
	}

out:
	kfree(lock_tag);
	return ret;
}

static int find_watcher(struct rbd_device *rbd_dev,
			const struct ceph_locker *locker)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_watch_item *watchers;
	u32 num_watchers;
	u64 cookie;
	int i;
	int ret;

	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
				      &rbd_dev->header_oloc, &watchers,
				      &num_watchers);
	if (ret)
		return ret;

	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
	for (i = 0; i < num_watchers; i++) {
		if (!memcmp(&watchers[i].addr, &locker->info.addr,
			    sizeof(locker->info.addr)) &&
		    watchers[i].cookie == cookie) {
			struct rbd_client_id cid = {
				.gid = le64_to_cpu(watchers[i].name.num),
				.handle = cookie,
			};

			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
			     rbd_dev, cid.gid, cid.handle);
			rbd_set_owner_cid(rbd_dev, &cid);
			ret = 1;
			goto out;
		}
	}

	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
	ret = 0;
out:
	kfree(watchers);
	return ret;
}

/*
 * lock_rwsem must be held for write
 */
static int rbd_try_lock(struct rbd_device *rbd_dev)
{
	struct ceph_client *client = rbd_dev->rbd_client->client;
	struct ceph_locker *lockers;
	u32 num_lockers;
	int ret;

	for (;;) {
		ret = rbd_lock(rbd_dev);
		if (ret != -EBUSY)
			return ret;

		/* determine if the current lock holder is still alive */
		ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
		if (ret)
			return ret;

		if (num_lockers == 0)
			goto again;

		ret = find_watcher(rbd_dev, lockers);
		if (ret)
			goto out; /* request lock or error */

		rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
			 ENTITY_NAME(lockers[0].id.name));

		ret = ceph_monc_blacklist_add(&client->monc,
					      &lockers[0].info.addr);
		if (ret) {
			rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
				 ENTITY_NAME(lockers[0].id.name), ret);
			goto out;
		}

		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
					  lockers[0].id.cookie,
					  &lockers[0].id.name);
		if (ret && ret != -ENOENT)
			goto out;

again:
		ceph_free_lockers(lockers, num_lockers);
	}

out:
	ceph_free_lockers(lockers, num_lockers);
	return ret;
}
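
/*
 * To summarize the loop above: a locker without a matching watch
 * (find_watcher() returned 0) is presumed dead -- it is blacklisted
 * via the monitors and its lock broken before we retry rbd_lock()
 * ourselves.  A live locker (find_watcher() returned 1) makes us fall
 * back to rbd_request_lock() notifications instead.
 */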
3460
3461/*
637cd060
ID
3462 * Return:
3463 * 0 - lock acquired
3464 * 1 - caller should call rbd_request_lock()
3465 * <0 - error
ed95b21a 3466 */
637cd060 3467static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
ed95b21a 3468{
637cd060 3469 int ret;
ed95b21a
ID
3470
3471 down_read(&rbd_dev->lock_rwsem);
3472 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3473 rbd_dev->lock_state);
3474 if (__rbd_is_lock_owner(rbd_dev)) {
ed95b21a 3475 up_read(&rbd_dev->lock_rwsem);
637cd060 3476 return 0;
ed95b21a
ID
3477 }
3478
3479 up_read(&rbd_dev->lock_rwsem);
3480 down_write(&rbd_dev->lock_rwsem);
3481 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3482 rbd_dev->lock_state);
637cd060
ID
3483 if (__rbd_is_lock_owner(rbd_dev)) {
3484 up_write(&rbd_dev->lock_rwsem);
3485 return 0;
ed95b21a
ID
3486 }
3487
637cd060
ID
3488 ret = rbd_try_lock(rbd_dev);
3489 if (ret < 0) {
3490 rbd_warn(rbd_dev, "failed to lock header: %d", ret);
3491 if (ret == -EBLACKLISTED)
3492 goto out;
3493
3494 ret = 1; /* request lock anyway */
3495 }
3496 if (ret > 0) {
3497 up_write(&rbd_dev->lock_rwsem);
3498 return ret;
3499 }
3500
3501 rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED);
3502 rbd_assert(list_empty(&rbd_dev->running_list));
3503
3504out:
3505 wake_lock_waiters(rbd_dev, ret);
ed95b21a 3506 up_write(&rbd_dev->lock_rwsem);
637cd060 3507 return ret;
3508}
3509
3510static void rbd_acquire_lock(struct work_struct *work)
3511{
3512 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3513 struct rbd_device, lock_dwork);
637cd060 3514 int ret;
3515
3516 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3517again:
3518 ret = rbd_try_acquire_lock(rbd_dev);
3519 if (ret <= 0) {
3520 dout("%s rbd_dev %p ret %d - done\n", __func__, rbd_dev, ret);
3521 return;
3522 }
3523
3524 ret = rbd_request_lock(rbd_dev);
3525 if (ret == -ETIMEDOUT) {
3526 goto again; /* treat this as a dead client */
3527 } else if (ret == -EROFS) {
3528 rbd_warn(rbd_dev, "peer will not release lock");
3529 down_write(&rbd_dev->lock_rwsem);
3530 wake_lock_waiters(rbd_dev, ret);
3531 up_write(&rbd_dev->lock_rwsem);
3532 } else if (ret < 0) {
3533 rbd_warn(rbd_dev, "error requesting lock: %d", ret);
3534 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3535 RBD_RETRY_DELAY);
3536 } else {
3537 /*
3538 * lock owner acked, but resend if we don't see them
3539 * release the lock
3540 */
3541 dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
3542 rbd_dev);
3543 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3544 msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
3545 }
3546}
3547
a2b1da09 3548static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
ed95b21a 3549{
3550 bool need_wait;
3551
3552 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3553 lockdep_assert_held_exclusive(&rbd_dev->lock_rwsem);
3554
3555 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
3556 return false;
3557
52bb1f9b 3558 /*
ed95b21a 3559 * Ensure that all in-flight IO is flushed.
52bb1f9b 3560 */
3561 rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
3562 rbd_assert(!completion_done(&rbd_dev->releasing_wait));
3563 need_wait = !list_empty(&rbd_dev->running_list);
3564 downgrade_write(&rbd_dev->lock_rwsem);
3565 if (need_wait)
3566 wait_for_completion(&rbd_dev->releasing_wait);
3567 up_read(&rbd_dev->lock_rwsem);
3568
3569 down_write(&rbd_dev->lock_rwsem);
3570 if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
3571 return false;
3572
e1fddc8f 3573 rbd_assert(list_empty(&rbd_dev->running_list));
3574 return true;
3575}
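/*
 * Annotation (not upstream text): downgrade_write() above trades the
 * exclusive rwsem for a shared one instead of dropping it, so
 * lock_state stays RBD_LOCK_STATE_RELEASING while we sleep on
 * releasing_wait - presumably completed by the request completion
 * path once running_list drains. The state is re-checked after
 * re-taking the rwsem for write, since another writer may have run
 * in between.
 */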
3576
3577static void __rbd_release_lock(struct rbd_device *rbd_dev)
3578{
3579 rbd_assert(list_empty(&rbd_dev->running_list));
3580
3581 rbd_unlock(rbd_dev);
3582}
3583
3584/*
3585 * lock_rwsem must be held for write
3586 */
3587static void rbd_release_lock(struct rbd_device *rbd_dev)
3588{
3589 if (!rbd_quiesce_lock(rbd_dev))
3590 return;
3591
e1fddc8f 3592 __rbd_release_lock(rbd_dev);
 3593
 3594 /*
 3595 * Give others a chance to grab the lock - otherwise we would
 3596 * re-acquire it almost immediately if new IO arrived while the
 3597 * running list was draining. We need to ack our own
 3598 * notifications, so this lock_dwork will be requeued from
 3599 * rbd_handle_released_lock() by way of maybe_kick_acquire().
 3600 */
3601 cancel_delayed_work(&rbd_dev->lock_dwork);
3602}
3603
3604static void rbd_release_lock_work(struct work_struct *work)
3605{
3606 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3607 unlock_work);
3608
3609 down_write(&rbd_dev->lock_rwsem);
3610 rbd_release_lock(rbd_dev);
3611 up_write(&rbd_dev->lock_rwsem);
3612}
3613
3614static void maybe_kick_acquire(struct rbd_device *rbd_dev)
3615{
3616 bool have_requests;
3617
3618 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3619 if (__rbd_is_lock_owner(rbd_dev))
3620 return;
3621
3622 spin_lock(&rbd_dev->lock_lists_lock);
3623 have_requests = !list_empty(&rbd_dev->acquiring_list);
3624 spin_unlock(&rbd_dev->lock_lists_lock);
3625 if (have_requests || delayed_work_pending(&rbd_dev->lock_dwork)) {
3626 dout("%s rbd_dev %p kicking lock_dwork\n", __func__, rbd_dev);
3627 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3628 }
3629}
3630
3631static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
3632 void **p)
3633{
3634 struct rbd_client_id cid = { 0 };
3635
3636 if (struct_v >= 2) {
3637 cid.gid = ceph_decode_64(p);
3638 cid.handle = ceph_decode_64(p);
3639 }
3640
3641 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3642 cid.handle);
3643 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3644 down_write(&rbd_dev->lock_rwsem);
3645 if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3646 /*
3647 * we already know that the remote client is
3648 * the owner
3649 */
3650 up_write(&rbd_dev->lock_rwsem);
3651 return;
3652 }
3653
3654 rbd_set_owner_cid(rbd_dev, &cid);
3655 downgrade_write(&rbd_dev->lock_rwsem);
3656 } else {
3657 down_read(&rbd_dev->lock_rwsem);
3658 }
3659
637cd060 3660 maybe_kick_acquire(rbd_dev);
3661 up_read(&rbd_dev->lock_rwsem);
3662}
3663
3664static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
3665 void **p)
3666{
3667 struct rbd_client_id cid = { 0 };
3668
3669 if (struct_v >= 2) {
3670 cid.gid = ceph_decode_64(p);
3671 cid.handle = ceph_decode_64(p);
3672 }
3673
3674 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3675 cid.handle);
3676 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3677 down_write(&rbd_dev->lock_rwsem);
3678 if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3679 dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
3680 __func__, rbd_dev, cid.gid, cid.handle,
3681 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
3682 up_write(&rbd_dev->lock_rwsem);
3683 return;
3684 }
3685
3686 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3687 downgrade_write(&rbd_dev->lock_rwsem);
3688 } else {
3689 down_read(&rbd_dev->lock_rwsem);
3690 }
3691
637cd060 3692 maybe_kick_acquire(rbd_dev);
3693 up_read(&rbd_dev->lock_rwsem);
3694}
3695
3696/*
3697 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
3698 * ResponseMessage is needed.
3699 */
3700static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
3701 void **p)
3702{
3703 struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
3704 struct rbd_client_id cid = { 0 };
3b77faa0 3705 int result = 1;
3706
3707 if (struct_v >= 2) {
3708 cid.gid = ceph_decode_64(p);
3709 cid.handle = ceph_decode_64(p);
3710 }
3711
3712 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3713 cid.handle);
3714 if (rbd_cid_equal(&cid, &my_cid))
3b77faa0 3715 return result;
3716
3717 down_read(&rbd_dev->lock_rwsem);
3718 if (__rbd_is_lock_owner(rbd_dev)) {
3719 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
3720 rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
3721 goto out_unlock;
3722
3723 /*
3724 * encode ResponseMessage(0) so the peer can detect
3725 * a missing owner
3726 */
3727 result = 0;
3728
3729 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
3730 if (!rbd_dev->opts->exclusive) {
3731 dout("%s rbd_dev %p queueing unlock_work\n",
3732 __func__, rbd_dev);
3733 queue_work(rbd_dev->task_wq,
3734 &rbd_dev->unlock_work);
3735 } else {
3736 /* refuse to release the lock */
3737 result = -EROFS;
3738 }
3739 }
3740 }
3741
3742out_unlock:
ed95b21a 3743 up_read(&rbd_dev->lock_rwsem);
3b77faa0 3744 return result;
3745}
3746
3747static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
3748 u64 notify_id, u64 cookie, s32 *result)
3749{
3750 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3751 char buf[4 + CEPH_ENCODING_START_BLK_LEN];
3752 int buf_size = sizeof(buf);
3753 int ret;
3754
3755 if (result) {
3756 void *p = buf;
3757
3758 /* encode ResponseMessage */
3759 ceph_start_encoding(&p, 1, 1,
3760 buf_size - CEPH_ENCODING_START_BLK_LEN);
3761 ceph_encode_32(&p, *result);
3762 } else {
3763 buf_size = 0;
3764 }
b8d70035 3765
3766 ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
3767 &rbd_dev->header_oloc, notify_id, cookie,
ed95b21a 3768 buf, buf_size);
52bb1f9b 3769 if (ret)
3770 rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
3771}
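/*
 * Wire layout of the optional ResponseMessage payload built above,
 * assuming the usual 6-byte ceph encoding header behind
 * CEPH_ENCODING_START_BLK_LEN:
 *
 *	u8   struct_v      = 1
 *	u8   struct_compat = 1
 *	le32 struct_len    = 4
 *	le32 result        (the s32 passed in)
 *
 * i.e. 10 bytes total, matching buf[4 + CEPH_ENCODING_START_BLK_LEN].
 */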
3772
3773static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
3774 u64 cookie)
3775{
3776 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3777 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
3778}
3779
3780static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
3781 u64 notify_id, u64 cookie, s32 result)
3782{
3783 dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3784 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
3785}
3786
3787static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
3788 u64 notifier_id, void *data, size_t data_len)
3789{
3790 struct rbd_device *rbd_dev = arg;
3791 void *p = data;
3792 void *const end = p + data_len;
d4c2269b 3793 u8 struct_v = 0;
3794 u32 len;
3795 u32 notify_op;
3796 int ret;
3797
3798 dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
3799 __func__, rbd_dev, cookie, notify_id, data_len);
3800 if (data_len) {
3801 ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
3802 &struct_v, &len);
3803 if (ret) {
3804 rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
3805 ret);
3806 return;
3807 }
3808
3809 notify_op = ceph_decode_32(&p);
3810 } else {
3811 /* legacy notification for header updates */
3812 notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
3813 len = 0;
3814 }
3815
3816 dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
3817 switch (notify_op) {
3818 case RBD_NOTIFY_OP_ACQUIRED_LOCK:
3819 rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
3820 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3821 break;
3822 case RBD_NOTIFY_OP_RELEASED_LOCK:
3823 rbd_handle_released_lock(rbd_dev, struct_v, &p);
3824 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3825 break;
3826 case RBD_NOTIFY_OP_REQUEST_LOCK:
3827 ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
3828 if (ret <= 0)
ed95b21a 3829 rbd_acknowledge_notify_result(rbd_dev, notify_id,
3b77faa0 3830 cookie, ret);
3831 else
3832 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3833 break;
3834 case RBD_NOTIFY_OP_HEADER_UPDATE:
3835 ret = rbd_dev_refresh(rbd_dev);
3836 if (ret)
3837 rbd_warn(rbd_dev, "refresh failed: %d", ret);
3838
3839 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3840 break;
3841 default:
3842 if (rbd_is_lock_owner(rbd_dev))
3843 rbd_acknowledge_notify_result(rbd_dev, notify_id,
3844 cookie, -EOPNOTSUPP);
3845 else
3846 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3847 break;
3848 }
3849}
3850
3851static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
3852
922dab61 3853static void rbd_watch_errcb(void *arg, u64 cookie, int err)
bb040aa0 3854{
922dab61 3855 struct rbd_device *rbd_dev = arg;
bb040aa0 3856
922dab61 3857 rbd_warn(rbd_dev, "encountered watch error: %d", err);
bb040aa0 3858
3859 down_write(&rbd_dev->lock_rwsem);
3860 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3861 up_write(&rbd_dev->lock_rwsem);
3862
3863 mutex_lock(&rbd_dev->watch_mutex);
3864 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
3865 __rbd_unregister_watch(rbd_dev);
3866 rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
bb040aa0 3867
99d16943 3868 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
bb040aa0 3869 }
99d16943 3870 mutex_unlock(&rbd_dev->watch_mutex);
3871}
3872
9969ebc5 3873/*
99d16943 3874 * watch_mutex must be locked
9969ebc5 3875 */
99d16943 3876static int __rbd_register_watch(struct rbd_device *rbd_dev)
3877{
3878 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
922dab61 3879 struct ceph_osd_linger_request *handle;
9969ebc5 3880
922dab61 3881 rbd_assert(!rbd_dev->watch_handle);
99d16943 3882 dout("%s rbd_dev %p\n", __func__, rbd_dev);
9969ebc5 3883
3884 handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
3885 &rbd_dev->header_oloc, rbd_watch_cb,
3886 rbd_watch_errcb, rbd_dev);
3887 if (IS_ERR(handle))
3888 return PTR_ERR(handle);
8eb87565 3889
922dab61 3890 rbd_dev->watch_handle = handle;
b30a01f2 3891 return 0;
3892}
3893
3894/*
3895 * watch_mutex must be locked
3896 */
3897static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
b30a01f2 3898{
3899 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3900 int ret;
b30a01f2 3901
3902 rbd_assert(rbd_dev->watch_handle);
3903 dout("%s rbd_dev %p\n", __func__, rbd_dev);
b30a01f2 3904
3905 ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
3906 if (ret)
3907 rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
76756a51 3908
922dab61 3909 rbd_dev->watch_handle = NULL;
3910}
3911
3912static int rbd_register_watch(struct rbd_device *rbd_dev)
3913{
3914 int ret;
3915
3916 mutex_lock(&rbd_dev->watch_mutex);
3917 rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
3918 ret = __rbd_register_watch(rbd_dev);
3919 if (ret)
3920 goto out;
3921
3922 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
3923 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
3924
3925out:
3926 mutex_unlock(&rbd_dev->watch_mutex);
3927 return ret;
3928}
3929
3930static void cancel_tasks_sync(struct rbd_device *rbd_dev)
c525f036 3931{
3932 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3933
3934 cancel_work_sync(&rbd_dev->acquired_lock_work);
3935 cancel_work_sync(&rbd_dev->released_lock_work);
3936 cancel_delayed_work_sync(&rbd_dev->lock_dwork);
3937 cancel_work_sync(&rbd_dev->unlock_work);
3938}
3939
3940static void rbd_unregister_watch(struct rbd_device *rbd_dev)
3941{
3942 cancel_tasks_sync(rbd_dev);
3943
3944 mutex_lock(&rbd_dev->watch_mutex);
3945 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
3946 __rbd_unregister_watch(rbd_dev);
3947 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
3948 mutex_unlock(&rbd_dev->watch_mutex);
811c6688 3949
23edca86 3950 cancel_delayed_work_sync(&rbd_dev->watch_dwork);
811c6688 3951 ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
3952}
3953
3954/*
3955 * lock_rwsem must be held for write
3956 */
3957static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
3958{
3959 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3960 char cookie[32];
3961 int ret;
3962
3963 if (!rbd_quiesce_lock(rbd_dev))
3964 return;
3965
3966 format_lock_cookie(rbd_dev, cookie);
3967 ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
3968 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3969 CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
3970 RBD_LOCK_TAG, cookie);
3971 if (ret) {
3972 if (ret != -EOPNOTSUPP)
3973 rbd_warn(rbd_dev, "failed to update lock cookie: %d",
3974 ret);
3975
3976 /*
3977 * Lock cookie cannot be updated on older OSDs, so do
3978 * a manual release and queue an acquire.
3979 */
e1fddc8f 3980 __rbd_release_lock(rbd_dev);
a2b1da09 3981 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
14bb211d 3982 } else {
edd8ca80 3983 __rbd_lock(rbd_dev, cookie);
637cd060 3984 wake_lock_waiters(rbd_dev, 0);
3985 }
3986}
3987
3988static void rbd_reregister_watch(struct work_struct *work)
3989{
3990 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3991 struct rbd_device, watch_dwork);
3992 int ret;
3993
3994 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3995
3996 mutex_lock(&rbd_dev->watch_mutex);
3997 if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
3998 mutex_unlock(&rbd_dev->watch_mutex);
14bb211d 3999 return;
87c0fded 4000 }
4001
4002 ret = __rbd_register_watch(rbd_dev);
4003 if (ret) {
4004 rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
637cd060 4005 if (ret != -EBLACKLISTED && ret != -ENOENT) {
4006 queue_delayed_work(rbd_dev->task_wq,
4007 &rbd_dev->watch_dwork,
4008 RBD_RETRY_DELAY);
4009 mutex_unlock(&rbd_dev->watch_mutex);
4010 return;
87c0fded 4011 }
637cd060 4012
87c0fded 4013 mutex_unlock(&rbd_dev->watch_mutex);
4014 down_write(&rbd_dev->lock_rwsem);
4015 wake_lock_waiters(rbd_dev, ret);
4016 up_write(&rbd_dev->lock_rwsem);
14bb211d 4017 return;
4018 }
4019
4020 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
4021 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
4022 mutex_unlock(&rbd_dev->watch_mutex);
4023
4024 down_write(&rbd_dev->lock_rwsem);
4025 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
4026 rbd_reacquire_lock(rbd_dev);
4027 up_write(&rbd_dev->lock_rwsem);
4028
4029 ret = rbd_dev_refresh(rbd_dev);
4030 if (ret)
f6870cc9 4031 rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
4032}
4033
36be9a76 4034/*
4035 * Synchronous osd object method call. Returns the number of bytes
4036 * returned in the outbound buffer, or a negative error code.
4037 */
4038static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
4039 struct ceph_object_id *oid,
4040 struct ceph_object_locator *oloc,
36be9a76 4041 const char *method_name,
4157976b 4042 const void *outbound,
36be9a76 4043 size_t outbound_size,
4157976b 4044 void *inbound,
e2a58ee5 4045 size_t inbound_size)
36be9a76 4046{
4047 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4048 struct page *req_page = NULL;
4049 struct page *reply_page;
4050 int ret;
4051
4052 /*
 4053 * Method calls are ultimately read operations. The result
 4054 * should be placed into the inbound buffer provided. The caller
 4055 * may also supply outbound data--parameters for the object
 4056 * method. Currently, if present, this will be a
 4057 * snapshot id.
36be9a76 4058 */
4059 if (outbound) {
4060 if (outbound_size > PAGE_SIZE)
4061 return -E2BIG;
36be9a76 4062
4063 req_page = alloc_page(GFP_KERNEL);
4064 if (!req_page)
4065 return -ENOMEM;
04017e29 4066
ecd4a68a 4067 memcpy(page_address(req_page), outbound, outbound_size);
04017e29 4068 }
36be9a76 4069
4070 reply_page = alloc_page(GFP_KERNEL);
4071 if (!reply_page) {
4072 if (req_page)
4073 __free_page(req_page);
4074 return -ENOMEM;
4075 }
57385b51 4076
4077 ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
4078 CEPH_OSD_FLAG_READ, req_page, outbound_size,
68ada915 4079 &reply_page, &inbound_size);
4080 if (!ret) {
4081 memcpy(inbound, page_address(reply_page), inbound_size);
4082 ret = inbound_size;
4083 }
36be9a76 4084
4085 if (req_page)
4086 __free_page(req_page);
4087 __free_page(reply_page);
4088 return ret;
4089}
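/*
 * Annotation (not upstream text): both directions are limited to a
 * single page here - outbound data larger than PAGE_SIZE fails with
 * -E2BIG and the reply lands in one page - which suffices for the
 * small metadata methods ("get_size", "get_object_prefix", ...) this
 * helper services below.
 */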
4090
7ad18afa 4091static void rbd_queue_workfn(struct work_struct *work)
bf0d5f50 4092{
4093 struct request *rq = blk_mq_rq_from_pdu(work);
4094 struct rbd_device *rbd_dev = rq->q->queuedata;
bc1ecc65 4095 struct rbd_img_request *img_request;
4e752f0a 4096 struct ceph_snap_context *snapc = NULL;
4097 u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
4098 u64 length = blk_rq_bytes(rq);
6d2940c8 4099 enum obj_operation_type op_type;
4e752f0a 4100 u64 mapping_size;
4101 int result;
4102
4103 switch (req_op(rq)) {
4104 case REQ_OP_DISCARD:
90e98c52 4105 op_type = OBJ_OP_DISCARD;
aebf526b 4106 break;
4107 case REQ_OP_WRITE_ZEROES:
4108 op_type = OBJ_OP_ZEROOUT;
4109 break;
aebf526b 4110 case REQ_OP_WRITE:
6d2940c8 4111 op_type = OBJ_OP_WRITE;
4112 break;
4113 case REQ_OP_READ:
6d2940c8 4114 op_type = OBJ_OP_READ;
4115 break;
4116 default:
4117 dout("%s: non-fs request type %d\n", __func__, req_op(rq));
4118 result = -EIO;
4119 goto err;
4120 }
6d2940c8 4121
bc1ecc65 4122 /* Ignore/skip any zero-length requests */
bf0d5f50 4123
4124 if (!length) {
4125 dout("%s: zero-length request\n", __func__);
4126 result = 0;
4127 goto err_rq;
4128 }
bf0d5f50 4129
4130 if (op_type != OBJ_OP_READ && rbd_dev->spec->snap_id != CEPH_NOSNAP) {
4131 rbd_warn(rbd_dev, "%s on read-only snapshot",
4132 obj_op_name(op_type));
4133 result = -EIO;
4134 goto err;
4135 }
4dda41d3 4136
4137 /*
4138 * Quit early if the mapped snapshot no longer exists. It's
4139 * still possible the snapshot will have disappeared by the
4140 * time our request arrives at the osd, but there's no sense in
4141 * sending it if we already know.
4142 */
4143 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
4144 dout("request for non-existent snapshot");
4145 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
4146 result = -ENXIO;
4147 goto err_rq;
4148 }
4dda41d3 4149
4150 if (offset && length > U64_MAX - offset + 1) {
4151 rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
4152 length);
4153 result = -EINVAL;
4154 goto err_rq; /* Shouldn't happen */
4155 }
4dda41d3 4156
4157 blk_mq_start_request(rq);
4158
4159 down_read(&rbd_dev->header_rwsem);
4160 mapping_size = rbd_dev->mapping.size;
6d2940c8 4161 if (op_type != OBJ_OP_READ) {
4162 snapc = rbd_dev->header.snapc;
4163 ceph_get_snap_context(snapc);
4164 }
4165 up_read(&rbd_dev->header_rwsem);
4166
4167 if (offset + length > mapping_size) {
bc1ecc65 4168 rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
4e752f0a 4169 length, mapping_size);
4170 result = -EIO;
4171 goto err_rq;
4172 }
bf0d5f50 4173
dfd9875f 4174 img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
4175 if (!img_request) {
4176 result = -ENOMEM;
637cd060 4177 goto err_rq;
4178 }
4179 img_request->rq = rq;
70b16db8 4180 snapc = NULL; /* img_request consumes a ref */
bf0d5f50 4181
6484cbe9 4182 if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
5a237819 4183 result = rbd_img_fill_nodata(img_request, offset, length);
90e98c52 4184 else
4185 result = rbd_img_fill_from_bio(img_request, offset, length,
4186 rq->bio);
0192ce2e 4187 if (result)
bc1ecc65 4188 goto err_img_request;
bf0d5f50 4189
e1fddc8f 4190 rbd_img_handle_request(img_request, 0);
bc1ecc65 4191 return;
bf0d5f50 4192
4193err_img_request:
4194 rbd_img_request_put(img_request);
4195err_rq:
4196 if (result)
4197 rbd_warn(rbd_dev, "%s %llx at %llx result %d",
6d2940c8 4198 obj_op_name(op_type), length, offset, result);
e96a650a 4199 ceph_put_snap_context(snapc);
7ad18afa 4200err:
2a842aca 4201 blk_mq_end_request(rq, errno_to_blk_status(result));
bc1ecc65 4202}
bf0d5f50 4203
fc17b653 4204static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
7ad18afa 4205 const struct blk_mq_queue_data *bd)
bc1ecc65 4206{
4207 struct request *rq = bd->rq;
4208 struct work_struct *work = blk_mq_rq_to_pdu(rq);
bf0d5f50 4209
7ad18afa 4210 queue_work(rbd_wq, work);
fc17b653 4211 return BLK_STS_OK;
4212}
4213
4214static void rbd_free_disk(struct rbd_device *rbd_dev)
4215{
4216 blk_cleanup_queue(rbd_dev->disk->queue);
4217 blk_mq_free_tag_set(&rbd_dev->tag_set);
4218 put_disk(rbd_dev->disk);
a0cab924 4219 rbd_dev->disk = NULL;
4220}
4221
788e2df3 4222static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
4223 struct ceph_object_id *oid,
4224 struct ceph_object_locator *oloc,
4225 void *buf, int buf_len)
4226
4227{
4228 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4229 struct ceph_osd_request *req;
4230 struct page **pages;
4231 int num_pages = calc_pages_for(0, buf_len);
4232 int ret;
4233
4234 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
4235 if (!req)
4236 return -ENOMEM;
788e2df3 4237
4238 ceph_oid_copy(&req->r_base_oid, oid);
4239 ceph_oloc_copy(&req->r_base_oloc, oloc);
4240 req->r_flags = CEPH_OSD_FLAG_READ;
430c28c3 4241
4242 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
4243 if (IS_ERR(pages)) {
4244 ret = PTR_ERR(pages);
4245 goto out_req;
4246 }
1ceae7ef 4247
4248 osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
4249 osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
4250 true);
4251
4252 ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
4253 if (ret)
4254 goto out_req;
4255
4256 ceph_osdc_start_request(osdc, req, false);
4257 ret = ceph_osdc_wait_request(osdc, req);
4258 if (ret >= 0)
4259 ceph_copy_from_page_vector(pages, buf, 0, ret);
788e2df3 4260
4261out_req:
4262 ceph_osdc_put_request(req);
4263 return ret;
4264}
4265
602adf40 4266/*
4267 * Read the complete header for the given rbd device. On successful
4268 * return, the rbd_dev->header field will contain up-to-date
4269 * information about the image.
602adf40 4270 */
99a41ebc 4271static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
602adf40 4272{
4156d998 4273 struct rbd_image_header_ondisk *ondisk = NULL;
50f7c4c9 4274 u32 snap_count = 0;
4275 u64 names_size = 0;
4276 u32 want_count;
4277 int ret;
602adf40 4278
00f1f36f 4279 /*
4280 * The complete header will include an array of its 64-bit
4281 * snapshot ids, followed by the names of those snapshots as
4282 * a contiguous block of NUL-terminated strings. Note that
4283 * the number of snapshots could change by the time we read
4284 * it in, in which case we re-read it.
00f1f36f 4285 */
4286 do {
4287 size_t size;
4288
4289 kfree(ondisk);
4290
4291 size = sizeof (*ondisk);
4292 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
4293 size += names_size;
4294 ondisk = kmalloc(size, GFP_KERNEL);
4295 if (!ondisk)
662518b1 4296 return -ENOMEM;
4156d998 4297
4298 ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
4299 &rbd_dev->header_oloc, ondisk, size);
4156d998 4300 if (ret < 0)
662518b1 4301 goto out;
c0cd10db 4302 if ((size_t)ret < size) {
4156d998 4303 ret = -ENXIO;
4304 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
4305 size, ret);
662518b1 4306 goto out;
4307 }
4308 if (!rbd_dev_ondisk_valid(ondisk)) {
4309 ret = -ENXIO;
06ecc6cb 4310 rbd_warn(rbd_dev, "invalid header");
662518b1 4311 goto out;
81e759fb 4312 }
602adf40 4313
4314 names_size = le64_to_cpu(ondisk->snap_names_len);
4315 want_count = snap_count;
4316 snap_count = le32_to_cpu(ondisk->snap_count);
4317 } while (snap_count != want_count);
00f1f36f 4318
4319 ret = rbd_header_from_disk(rbd_dev, ondisk);
4320out:
4321 kfree(ondisk);
4322
4323 return ret;
4324}
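/*
 * Annotation (not upstream text): the buffer is sized from the
 * previous iteration's snapshot count, so a snapshot being created or
 * deleted between reads simply triggers another pass of the do/while
 * loop until the count comes back unchanged.
 */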
4325
4326/*
4327 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
4328 * has disappeared from the (just updated) snapshot context.
4329 */
4330static void rbd_exists_validate(struct rbd_device *rbd_dev)
4331{
4332 u64 snap_id;
4333
4334 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
4335 return;
4336
4337 snap_id = rbd_dev->spec->snap_id;
4338 if (snap_id == CEPH_NOSNAP)
4339 return;
4340
4341 if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
4342 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
4343}
4344
4345static void rbd_dev_update_size(struct rbd_device *rbd_dev)
4346{
4347 sector_t size;
4348
4349 /*
4350 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
4351 * try to update its size. If REMOVING is set, updating size
4352 * is just useless work since the device can't be opened.
9875201e 4353 */
4354 if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
4355 !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
4356 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
4357 dout("setting size to %llu sectors", (unsigned long long)size);
4358 set_capacity(rbd_dev->disk, size);
4359 revalidate_disk(rbd_dev->disk);
4360 }
4361}
4362
cc4a38bd 4363static int rbd_dev_refresh(struct rbd_device *rbd_dev)
1fe5e993 4364{
e627db08 4365 u64 mapping_size;
4366 int ret;
4367
cfbf6377 4368 down_write(&rbd_dev->header_rwsem);
3b5cf2a2 4369 mapping_size = rbd_dev->mapping.size;
4370
4371 ret = rbd_dev_header_info(rbd_dev);
52bb1f9b 4372 if (ret)
73e39e4d 4373 goto out;
15228ede 4374
4375 /*
4376 * If there is a parent, see if it has disappeared due to the
4377 * mapped image getting flattened.
4378 */
4379 if (rbd_dev->parent) {
4380 ret = rbd_dev_v2_parent_info(rbd_dev);
4381 if (ret)
73e39e4d 4382 goto out;
4383 }
4384
5ff1108c 4385 if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
73e39e4d 4386 rbd_dev->mapping.size = rbd_dev->header.image_size;
4387 } else {
4388 /* validate mapped snapshot's EXISTS flag */
4389 rbd_exists_validate(rbd_dev);
4390 }
15228ede 4391
73e39e4d 4392out:
cfbf6377 4393 up_write(&rbd_dev->header_rwsem);
73e39e4d 4394 if (!ret && mapping_size != rbd_dev->mapping.size)
9875201e 4395 rbd_dev_update_size(rbd_dev);
1fe5e993 4396
73e39e4d 4397 return ret;
4398}
4399
4400static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
4401 unsigned int hctx_idx, unsigned int numa_node)
4402{
4403 struct work_struct *work = blk_mq_rq_to_pdu(rq);
4404
4405 INIT_WORK(work, rbd_queue_workfn);
4406 return 0;
4407}
4408
f363b089 4409static const struct blk_mq_ops rbd_mq_ops = {
7ad18afa 4410 .queue_rq = rbd_queue_rq,
4411 .init_request = rbd_init_request,
4412};
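/*
 * Annotation (not upstream text): the tag set set up below reserves
 * .cmd_size = sizeof(struct work_struct) after every request, so each
 * request carries its own work item. rbd_init_request() initializes
 * that work item once when the tag set is allocated, rbd_queue_rq()
 * only queues it, and rbd_queue_workfn() recovers the request in
 * process context via the inverse mapping:
 *
 *	struct request *rq = blk_mq_rq_from_pdu(work);
 */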
4413
4414static int rbd_init_disk(struct rbd_device *rbd_dev)
4415{
4416 struct gendisk *disk;
4417 struct request_queue *q;
4418 unsigned int objset_bytes =
4419 rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
7ad18afa 4420 int err;
602adf40 4421
602adf40 4422 /* create gendisk info */
4423 disk = alloc_disk(single_major ?
4424 (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
4425 RBD_MINORS_PER_MAJOR);
602adf40 4426 if (!disk)
1fcdb8aa 4427 return -ENOMEM;
602adf40 4428
f0f8cef5 4429 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
de71a297 4430 rbd_dev->dev_id);
602adf40 4431 disk->major = rbd_dev->major;
dd82fff1 4432 disk->first_minor = rbd_dev->minor;
4433 if (single_major)
4434 disk->flags |= GENHD_FL_EXT_DEVT;
4435 disk->fops = &rbd_bd_ops;
4436 disk->private_data = rbd_dev;
4437
4438 memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
4439 rbd_dev->tag_set.ops = &rbd_mq_ops;
b5584180 4440 rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
7ad18afa 4441 rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
56d18f62 4442 rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
4443 rbd_dev->tag_set.nr_hw_queues = 1;
4444 rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
4445
4446 err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
4447 if (err)
602adf40 4448 goto out_disk;
029bcbd8 4449
4450 q = blk_mq_init_queue(&rbd_dev->tag_set);
4451 if (IS_ERR(q)) {
4452 err = PTR_ERR(q);
4453 goto out_tag_set;
4454 }
4455
8b904b5b 4456 blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
d8a2c89c 4457 /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
593a9e7b 4458
420efbdf 4459 blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
0d9fde4f 4460 q->limits.max_sectors = queue_max_hw_sectors(q);
21acdf45 4461 blk_queue_max_segments(q, USHRT_MAX);
24f1df60 4462 blk_queue_max_segment_size(q, UINT_MAX);
4463 blk_queue_io_min(q, rbd_dev->opts->alloc_size);
4464 blk_queue_io_opt(q, rbd_dev->opts->alloc_size);
029bcbd8 4465
4466 if (rbd_dev->opts->trim) {
4467 blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
16d80c54 4468 q->limits.discard_granularity = rbd_dev->opts->alloc_size;
4469 blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
4470 blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
4471 }
90e98c52 4472
bae818ee 4473 if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
dc3b17cc 4474 q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
bae818ee 4475
4476 /*
4477 * disk_release() expects a queue ref from add_disk() and will
4478 * put it. Hold an extra ref until add_disk() is called.
4479 */
4480 WARN_ON(!blk_get_queue(q));
602adf40 4481 disk->queue = q;
4482 q->queuedata = rbd_dev;
4483
4484 rbd_dev->disk = disk;
602adf40 4485
602adf40 4486 return 0;
4487out_tag_set:
4488 blk_mq_free_tag_set(&rbd_dev->tag_set);
4489out_disk:
4490 put_disk(disk);
7ad18afa 4491 return err;
4492}
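/*
 * Worked example for the queue limits above (assuming the common
 * default of 4 MiB objects, i.e. obj_order 22, and stripe_count 1):
 * objset_bytes = 4 MiB, so max_hw_sectors and the discard /
 * write-zeroes limits all come out to 8192 512-byte sectors, while
 * io_min, io_opt and discard_granularity track the alloc_size
 * mapping option instead.
 */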
4493
4494/*
4495 sysfs
4496*/
4497
4498static struct rbd_device *dev_to_rbd_dev(struct device *dev)
4499{
4500 return container_of(dev, struct rbd_device, dev);
4501}
4502
4503static ssize_t rbd_size_show(struct device *dev,
4504 struct device_attribute *attr, char *buf)
4505{
593a9e7b 4506 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
a51aa0c0 4507
4508 return sprintf(buf, "%llu\n",
4509 (unsigned long long)rbd_dev->mapping.size);
4510}
4511
4512/*
4513 * Note this shows the features for whatever's mapped, which is not
4514 * necessarily the base image.
4515 */
4516static ssize_t rbd_features_show(struct device *dev,
4517 struct device_attribute *attr, char *buf)
4518{
4519 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4520
4521 return sprintf(buf, "0x%016llx\n",
fc71d833 4522 (unsigned long long)rbd_dev->mapping.features);
4523}
4524
4525static ssize_t rbd_major_show(struct device *dev,
4526 struct device_attribute *attr, char *buf)
4527{
593a9e7b 4528 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 4529
4530 if (rbd_dev->major)
4531 return sprintf(buf, "%d\n", rbd_dev->major);
4532
4533 return sprintf(buf, "(none)\n");
4534}
4535
4536static ssize_t rbd_minor_show(struct device *dev,
4537 struct device_attribute *attr, char *buf)
4538{
4539 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
fc71d833 4540
dd82fff1 4541 return sprintf(buf, "%d\n", rbd_dev->minor);
4542}
4543
4544static ssize_t rbd_client_addr_show(struct device *dev,
4545 struct device_attribute *attr, char *buf)
4546{
4547 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4548 struct ceph_entity_addr *client_addr =
4549 ceph_client_addr(rbd_dev->rbd_client->client);
4550
4551 return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
4552 le32_to_cpu(client_addr->nonce));
4553}
4554
4555static ssize_t rbd_client_id_show(struct device *dev,
4556 struct device_attribute *attr, char *buf)
602adf40 4557{
593a9e7b 4558 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 4559
1dbb4399 4560 return sprintf(buf, "client%lld\n",
033268a5 4561 ceph_client_gid(rbd_dev->rbd_client->client));
4562}
4563
4564static ssize_t rbd_cluster_fsid_show(struct device *dev,
4565 struct device_attribute *attr, char *buf)
4566{
4567 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4568
4569 return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
4570}
4571
4572static ssize_t rbd_config_info_show(struct device *dev,
4573 struct device_attribute *attr, char *buf)
4574{
4575 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4576
4577 return sprintf(buf, "%s\n", rbd_dev->config_info);
4578}
4579
4580static ssize_t rbd_pool_show(struct device *dev,
4581 struct device_attribute *attr, char *buf)
602adf40 4582{
593a9e7b 4583 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 4584
0d7dbfce 4585 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
4586}
4587
4588static ssize_t rbd_pool_id_show(struct device *dev,
4589 struct device_attribute *attr, char *buf)
4590{
4591 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4592
0d7dbfce 4593 return sprintf(buf, "%llu\n",
fc71d833 4594 (unsigned long long) rbd_dev->spec->pool_id);
4595}
4596
4597static ssize_t rbd_pool_ns_show(struct device *dev,
4598 struct device_attribute *attr, char *buf)
4599{
4600 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4601
4602 return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
4603}
4604
4605static ssize_t rbd_name_show(struct device *dev,
4606 struct device_attribute *attr, char *buf)
4607{
593a9e7b 4608 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 4609
4610 if (rbd_dev->spec->image_name)
4611 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
4612
4613 return sprintf(buf, "(unknown)\n");
4614}
4615
4616static ssize_t rbd_image_id_show(struct device *dev,
4617 struct device_attribute *attr, char *buf)
4618{
4619 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4620
0d7dbfce 4621 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
4622}
4623
4624/*
4625 * Shows the name of the currently-mapped snapshot (or
4626 * RBD_SNAP_HEAD_NAME for the base image).
4627 */
4628static ssize_t rbd_snap_show(struct device *dev,
4629 struct device_attribute *attr,
4630 char *buf)
4631{
593a9e7b 4632 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 4633
0d7dbfce 4634 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
4635}
4636
4637static ssize_t rbd_snap_id_show(struct device *dev,
4638 struct device_attribute *attr, char *buf)
4639{
4640 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4641
4642 return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
4643}
4644
86b00e0d 4645/*
4646 * For a v2 image, shows the chain of parent images, separated by empty
4647 * lines. For v1 images or if there is no parent, shows "(no parent
4648 * image)".
4649 */
4650static ssize_t rbd_parent_show(struct device *dev,
4651 struct device_attribute *attr,
4652 char *buf)
4653{
4654 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
ff96128f 4655 ssize_t count = 0;
86b00e0d 4656
ff96128f 4657 if (!rbd_dev->parent)
4658 return sprintf(buf, "(no parent image)\n");
4659
4660 for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
4661 struct rbd_spec *spec = rbd_dev->parent_spec;
4662
4663 count += sprintf(&buf[count], "%s"
4664 "pool_id %llu\npool_name %s\n"
e92c0eaf 4665 "pool_ns %s\n"
4666 "image_id %s\nimage_name %s\n"
4667 "snap_id %llu\nsnap_name %s\n"
4668 "overlap %llu\n",
4669 !count ? "" : "\n", /* first? */
4670 spec->pool_id, spec->pool_name,
e92c0eaf 4671 spec->pool_ns ?: "",
4672 spec->image_id, spec->image_name ?: "(unknown)",
4673 spec->snap_id, spec->snap_name,
4674 rbd_dev->parent_overlap);
4675 }
4676
4677 return count;
4678}
4679
4680static ssize_t rbd_image_refresh(struct device *dev,
4681 struct device_attribute *attr,
4682 const char *buf,
4683 size_t size)
4684{
593a9e7b 4685 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 4686 int ret;
602adf40 4687
cc4a38bd 4688 ret = rbd_dev_refresh(rbd_dev);
e627db08 4689 if (ret)
52bb1f9b 4690 return ret;
b813623a 4691
52bb1f9b 4692 return size;
dfc5606d 4693}
602adf40 4694
4695static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
4696static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
4697static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
4698static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
4699static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
4700static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
4701static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
4702static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
4703static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
4704static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
b26c047b 4705static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
4706static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
4707static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
4708static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
4709static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
4710static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
4711static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
4712
4713static struct attribute *rbd_attrs[] = {
4714 &dev_attr_size.attr,
34b13184 4715 &dev_attr_features.attr,
dfc5606d 4716 &dev_attr_major.attr,
dd82fff1 4717 &dev_attr_minor.attr,
005a07bf 4718 &dev_attr_client_addr.attr,
dfc5606d 4719 &dev_attr_client_id.attr,
267fb90b 4720 &dev_attr_cluster_fsid.attr,
0d6d1e9c 4721 &dev_attr_config_info.attr,
dfc5606d 4722 &dev_attr_pool.attr,
9bb2f334 4723 &dev_attr_pool_id.attr,
b26c047b 4724 &dev_attr_pool_ns.attr,
dfc5606d 4725 &dev_attr_name.attr,
589d30e0 4726 &dev_attr_image_id.attr,
dfc5606d 4727 &dev_attr_current_snap.attr,
92a58671 4728 &dev_attr_snap_id.attr,
86b00e0d 4729 &dev_attr_parent.attr,
dfc5606d 4730 &dev_attr_refresh.attr,
4731 NULL
4732};
4733
4734static struct attribute_group rbd_attr_group = {
4735 .attrs = rbd_attrs,
4736};
4737
4738static const struct attribute_group *rbd_attr_groups[] = {
4739 &rbd_attr_group,
4740 NULL
4741};
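/*
 * Illustrative userspace sketch (not part of the driver): the
 * attributes above surface under /sys/bus/rbd/devices/<id>/, per
 * Documentation/ABI/testing/sysfs-bus-rbd. Device id 0 below is
 * hypothetical.
 */
#if 0
#include <stdio.h>

int main(void)
{
	char buf[128];
	FILE *f = fopen("/sys/bus/rbd/devices/0/size", "r");

	if (f) {
		if (fgets(buf, sizeof(buf), f))
			printf("mapped size: %s", buf);	/* bytes */
		fclose(f);
	}
	return 0;
}
#endif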
4742
6cac4695 4743static void rbd_dev_release(struct device *dev);
dfc5606d 4744
b9942bc9 4745static const struct device_type rbd_device_type = {
4746 .name = "rbd",
4747 .groups = rbd_attr_groups,
6cac4695 4748 .release = rbd_dev_release,
4749};
4750
4751static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
4752{
4753 kref_get(&spec->kref);
4754
4755 return spec;
4756}
4757
4758static void rbd_spec_free(struct kref *kref);
4759static void rbd_spec_put(struct rbd_spec *spec)
4760{
4761 if (spec)
4762 kref_put(&spec->kref, rbd_spec_free);
4763}
4764
4765static struct rbd_spec *rbd_spec_alloc(void)
4766{
4767 struct rbd_spec *spec;
4768
4769 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
4770 if (!spec)
4771 return NULL;
4772
4773 spec->pool_id = CEPH_NOPOOL;
4774 spec->snap_id = CEPH_NOSNAP;
4775 kref_init(&spec->kref);
4776
4777 return spec;
4778}
4779
4780static void rbd_spec_free(struct kref *kref)
4781{
4782 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
4783
4784 kfree(spec->pool_name);
b26c047b 4785 kfree(spec->pool_ns);
4786 kfree(spec->image_id);
4787 kfree(spec->image_name);
4788 kfree(spec->snap_name);
4789 kfree(spec);
4790}
4791
1643dfa4 4792static void rbd_dev_free(struct rbd_device *rbd_dev)
dd5ac32d 4793{
99d16943 4794 WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
ed95b21a 4795 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
dd5ac32d 4796
c41d13a3 4797 ceph_oid_destroy(&rbd_dev->header_oid);
6b6dddbe 4798 ceph_oloc_destroy(&rbd_dev->header_oloc);
0d6d1e9c 4799 kfree(rbd_dev->config_info);
c41d13a3 4800
4801 rbd_put_client(rbd_dev->rbd_client);
4802 rbd_spec_put(rbd_dev->spec);
4803 kfree(rbd_dev->opts);
4804 kfree(rbd_dev);
4805}
4806
4807static void rbd_dev_release(struct device *dev)
4808{
4809 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4810 bool need_put = !!rbd_dev->opts;
4811
4812 if (need_put) {
4813 destroy_workqueue(rbd_dev->task_wq);
4814 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4815 }
4816
4817 rbd_dev_free(rbd_dev);
4818
4819 /*
 4820 * This is racy, but way better than putting the module ref outside of
4821 * the release callback. The race window is pretty small, so
4822 * doing something similar to dm (dm-builtin.c) is overkill.
4823 */
4824 if (need_put)
4825 module_put(THIS_MODULE);
4826}
4827
4828static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
4829 struct rbd_spec *spec)
4830{
4831 struct rbd_device *rbd_dev;
4832
1643dfa4 4833 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
4834 if (!rbd_dev)
4835 return NULL;
4836
4837 spin_lock_init(&rbd_dev->lock);
4838 INIT_LIST_HEAD(&rbd_dev->node);
4839 init_rwsem(&rbd_dev->header_rwsem);
4840
7e97332e 4841 rbd_dev->header.data_pool_id = CEPH_NOPOOL;
c41d13a3 4842 ceph_oid_init(&rbd_dev->header_oid);
431a02cd 4843 rbd_dev->header_oloc.pool = spec->pool_id;
4844 if (spec->pool_ns) {
4845 WARN_ON(!*spec->pool_ns);
4846 rbd_dev->header_oloc.pool_ns =
4847 ceph_find_or_create_string(spec->pool_ns,
4848 strlen(spec->pool_ns));
4849 }
c41d13a3 4850
4851 mutex_init(&rbd_dev->watch_mutex);
4852 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
4853 INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
4854
4855 init_rwsem(&rbd_dev->lock_rwsem);
4856 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
4857 INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
4858 INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
4859 INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
4860 INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
e1fddc8f 4861 spin_lock_init(&rbd_dev->lock_lists_lock);
637cd060 4862 INIT_LIST_HEAD(&rbd_dev->acquiring_list);
e1fddc8f 4863 INIT_LIST_HEAD(&rbd_dev->running_list);
637cd060 4864 init_completion(&rbd_dev->acquire_wait);
e1fddc8f 4865 init_completion(&rbd_dev->releasing_wait);
ed95b21a 4866
4867 rbd_dev->dev.bus = &rbd_bus_type;
4868 rbd_dev->dev.type = &rbd_device_type;
4869 rbd_dev->dev.parent = &rbd_root_dev;
4870 device_initialize(&rbd_dev->dev);
4871
c53d5893 4872 rbd_dev->rbd_client = rbdc;
d147543d 4873 rbd_dev->spec = spec;
0903e875 4874
4875 return rbd_dev;
4876}
4877
4878/*
4879 * Create a mapping rbd_dev.
4880 */
4881static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
4882 struct rbd_spec *spec,
4883 struct rbd_options *opts)
4884{
4885 struct rbd_device *rbd_dev;
4886
4887 rbd_dev = __rbd_dev_create(rbdc, spec);
4888 if (!rbd_dev)
4889 return NULL;
4890
4891 rbd_dev->opts = opts;
4892
4893 /* get an id and fill in device name */
4894 rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
4895 minor_to_rbd_dev_id(1 << MINORBITS),
4896 GFP_KERNEL);
4897 if (rbd_dev->dev_id < 0)
4898 goto fail_rbd_dev;
4899
4900 sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
4901 rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
4902 rbd_dev->name);
4903 if (!rbd_dev->task_wq)
4904 goto fail_dev_id;
dd5ac32d 4905
4906 /* we have a ref from do_rbd_add() */
4907 __module_get(THIS_MODULE);
dd5ac32d 4908
1643dfa4 4909 dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
c53d5893 4910 return rbd_dev;
4911
4912fail_dev_id:
4913 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4914fail_rbd_dev:
4915 rbd_dev_free(rbd_dev);
4916 return NULL;
4917}
4918
4919static void rbd_dev_destroy(struct rbd_device *rbd_dev)
4920{
4921 if (rbd_dev)
4922 put_device(&rbd_dev->dev);
4923}
4924
4925/*
4926 * Get the size and object order for an image snapshot, or if
4927 * snap_id is CEPH_NOSNAP, gets this information for the base
4928 * image.
4929 */
4930static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
4931 u8 *order, u64 *snap_size)
4932{
4933 __le64 snapid = cpu_to_le64(snap_id);
4934 int ret;
4935 struct {
4936 u8 order;
4937 __le64 size;
4938 } __attribute__ ((packed)) size_buf = { 0 };
4939
4940 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4941 &rbd_dev->header_oloc, "get_size",
4942 &snapid, sizeof(snapid),
4943 &size_buf, sizeof(size_buf));
36be9a76 4944 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4945 if (ret < 0)
4946 return ret;
4947 if (ret < sizeof (size_buf))
4948 return -ERANGE;
9d475de5 4949
c3545579 4950 if (order) {
c86f86e9 4951 *order = size_buf.order;
4952 dout(" order %u", (unsigned int)*order);
4953 }
4954 *snap_size = le64_to_cpu(size_buf.size);
4955
4956 dout(" snap_id 0x%016llx snap_size = %llu\n",
4957 (unsigned long long)snap_id,
57385b51 4958 (unsigned long long)*snap_size);
4959
4960 return 0;
4961}
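/*
 * Annotation (not upstream text): "get_size" replies with the object
 * order and the image size in bytes; the object size is 1 << order,
 * so e.g. the default order 22 gives 4 MiB objects.
 */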
4962
4963static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
4964{
4965 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
4966 &rbd_dev->header.obj_order,
4967 &rbd_dev->header.image_size);
4968}
4969
4970static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
4971{
4972 void *reply_buf;
4973 int ret;
4974 void *p;
4975
4976 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
4977 if (!reply_buf)
4978 return -ENOMEM;
4979
4980 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4981 &rbd_dev->header_oloc, "get_object_prefix",
4982 NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
36be9a76 4983 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4984 if (ret < 0)
4985 goto out;
4986
4987 p = reply_buf;
4988 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
4989 p + ret, NULL, GFP_NOIO);
4990 ret = 0;
4991
4992 if (IS_ERR(rbd_dev->header.object_prefix)) {
4993 ret = PTR_ERR(rbd_dev->header.object_prefix);
4994 rbd_dev->header.object_prefix = NULL;
4995 } else {
4996 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
4997 }
4998out:
4999 kfree(reply_buf);
5000
5001 return ret;
5002}
5003
5004static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
5005 u64 *snap_features)
5006{
5007 __le64 snapid = cpu_to_le64(snap_id);
5008 struct {
5009 __le64 features;
5010 __le64 incompat;
4157976b 5011 } __attribute__ ((packed)) features_buf = { 0 };
d3767f0f 5012 u64 unsup;
b1b5402a
AE
5013 int ret;
5014
5015 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5016 &rbd_dev->header_oloc, "get_features",
5017 &snapid, sizeof(snapid),
5018 &features_buf, sizeof(features_buf));
36be9a76 5019 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5020 if (ret < 0)
5021 return ret;
5022 if (ret < sizeof (features_buf))
5023 return -ERANGE;
d889140c 5024
5025 unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
5026 if (unsup) {
5027 rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
5028 unsup);
b8f5c6ed 5029 return -ENXIO;
d3767f0f 5030 }
d889140c 5031
5032 *snap_features = le64_to_cpu(features_buf.features);
5033
5034 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
5035 (unsigned long long)snap_id,
5036 (unsigned long long)*snap_features,
5037 (unsigned long long)le64_to_cpu(features_buf.incompat));
5038
5039 return 0;
5040}
5041
5042static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
5043{
5044 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
5045 &rbd_dev->header.features);
5046}
5047
5048struct parent_image_info {
5049 u64 pool_id;
e92c0eaf 5050 const char *pool_ns;
5051 const char *image_id;
5052 u64 snap_id;
5053
e92c0eaf 5054 bool has_overlap;
5055 u64 overlap;
5056};
5057
5058/*
5059 * The caller is responsible for @pii.
5060 */
5061static int decode_parent_image_spec(void **p, void *end,
5062 struct parent_image_info *pii)
5063{
5064 u8 struct_v;
5065 u32 struct_len;
5066 int ret;
5067
5068 ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
5069 &struct_v, &struct_len);
5070 if (ret)
5071 return ret;
5072
5073 ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
5074 pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5075 if (IS_ERR(pii->pool_ns)) {
5076 ret = PTR_ERR(pii->pool_ns);
5077 pii->pool_ns = NULL;
5078 return ret;
5079 }
5080 pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5081 if (IS_ERR(pii->image_id)) {
5082 ret = PTR_ERR(pii->image_id);
5083 pii->image_id = NULL;
5084 return ret;
5085 }
5086 ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
5087 return 0;
5088
5089e_inval:
5090 return -EINVAL;
5091}
5092
5093static int __get_parent_info(struct rbd_device *rbd_dev,
5094 struct page *req_page,
5095 struct page *reply_page,
5096 struct parent_image_info *pii)
5097{
5098 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5099 size_t reply_len = PAGE_SIZE;
5100 void *p, *end;
5101 int ret;
5102
5103 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5104 "rbd", "parent_get", CEPH_OSD_FLAG_READ,
68ada915 5105 req_page, sizeof(u64), &reply_page, &reply_len);
5106 if (ret)
5107 return ret == -EOPNOTSUPP ? 1 : ret;
5108
5109 p = page_address(reply_page);
5110 end = p + reply_len;
5111 ret = decode_parent_image_spec(&p, end, pii);
5112 if (ret)
5113 return ret;
5114
5115 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5116 "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
68ada915 5117 req_page, sizeof(u64), &reply_page, &reply_len);
5118 if (ret)
5119 return ret;
5120
5121 p = page_address(reply_page);
5122 end = p + reply_len;
5123 ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
5124 if (pii->has_overlap)
5125 ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5126
5127 return 0;
5128
5129e_inval:
5130 return -EINVAL;
5131}
5132
eb3b2d6b
ID
5133/*
5134 * The caller is responsible for @pii.
5135 */
5136static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
5137 struct page *req_page,
5138 struct page *reply_page,
5139 struct parent_image_info *pii)
5140{
5141 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5142 size_t reply_len = PAGE_SIZE;
5143 void *p, *end;
5144 int ret;
5145
5146 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5147 "rbd", "get_parent", CEPH_OSD_FLAG_READ,
68ada915 5148 req_page, sizeof(u64), &reply_page, &reply_len);
5149 if (ret)
5150 return ret;
5151
5152 p = page_address(reply_page);
5153 end = p + reply_len;
5154 ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
5155 pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
5156 if (IS_ERR(pii->image_id)) {
5157 ret = PTR_ERR(pii->image_id);
5158 pii->image_id = NULL;
5159 return ret;
5160 }
5161 ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
e92c0eaf 5162 pii->has_overlap = true;
5163 ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5164
5165 return 0;
5166
5167e_inval:
5168 return -EINVAL;
5169}
5170
5171static int get_parent_info(struct rbd_device *rbd_dev,
5172 struct parent_image_info *pii)
5173{
5174 struct page *req_page, *reply_page;
5175 void *p;
5176 int ret;
5177
5178 req_page = alloc_page(GFP_KERNEL);
5179 if (!req_page)
5180 return -ENOMEM;
5181
5182 reply_page = alloc_page(GFP_KERNEL);
5183 if (!reply_page) {
5184 __free_page(req_page);
5185 return -ENOMEM;
5186 }
5187
5188 p = page_address(req_page);
5189 ceph_encode_64(&p, rbd_dev->spec->snap_id);
5190 ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
5191 if (ret > 0)
5192 ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
5193 pii);
5194
5195 __free_page(req_page);
5196 __free_page(reply_page);
5197 return ret;
5198}
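/*
 * Annotation (not upstream text): __get_parent_info() maps
 * -EOPNOTSUPP to 1 to signal an older OSD without the "parent_get"
 * method, in which case we fall back to the legacy "get_parent"
 * method, which cannot convey a pool namespace and always implies
 * has_overlap.
 */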
5199
5200static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
5201{
5202 struct rbd_spec *parent_spec;
eb3b2d6b 5203 struct parent_image_info pii = { 0 };
86b00e0d
AE
5204 int ret;
5205
5206 parent_spec = rbd_spec_alloc();
5207 if (!parent_spec)
5208 return -ENOMEM;
5209
5210 ret = get_parent_info(rbd_dev, &pii);
5211 if (ret)
86b00e0d 5212 goto out_err;
86b00e0d 5213
5214 dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
5215 __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
5216 pii.has_overlap, pii.overlap);
86b00e0d 5217
e92c0eaf 5218 if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
5219 /*
5220 * Either the parent never existed, or we have
 5221 * a record of it but the image got flattened so it no
5222 * longer has a parent. When the parent of a
5223 * layered image disappears we immediately set the
5224 * overlap to 0. The effect of this is that all new
5225 * requests will be treated as if the image had no
5226 * parent.
5227 *
5228 * If !pii.has_overlap, the parent image spec is not
5229 * applicable. It's there to avoid duplication in each
5230 * snapshot record.
5231 */
5232 if (rbd_dev->parent_overlap) {
5233 rbd_dev->parent_overlap = 0;
5234 rbd_dev_parent_put(rbd_dev);
5235 pr_info("%s: clone image has been flattened\n",
5236 rbd_dev->disk->disk_name);
5237 }
5238
86b00e0d 5239 goto out; /* No parent? No problem. */
392a9dad 5240 }
86b00e0d 5241
5242	/* The ceph file layout needs the pool id to fit in 32 bits */
5243
5244 ret = -EIO;
eb3b2d6b 5245 if (pii.pool_id > (u64)U32_MAX) {
9584d508 5246 rbd_warn(NULL, "parent pool id too large (%llu > %u)",
eb3b2d6b 5247 (unsigned long long)pii.pool_id, U32_MAX);
5248 goto out_err;
5249 }
86b00e0d 5250
5251 /*
5252 * The parent won't change (except when the clone is
5253	 * flattened, which is already handled). So we only need to
5254	 * record the parent spec if we have not already done so.
5255 */
5256 if (!rbd_dev->parent_spec) {
eb3b2d6b 5257 parent_spec->pool_id = pii.pool_id;
5258 if (pii.pool_ns && *pii.pool_ns) {
5259 parent_spec->pool_ns = pii.pool_ns;
5260 pii.pool_ns = NULL;
5261 }
5262 parent_spec->image_id = pii.image_id;
5263 pii.image_id = NULL;
5264 parent_spec->snap_id = pii.snap_id;
b26c047b 5265
5266 rbd_dev->parent_spec = parent_spec;
5267 parent_spec = NULL; /* rbd_dev now owns this */
5268 }
5269
5270 /*
5271 * We always update the parent overlap. If it's zero we issue
5272	 * a warning, as we will proceed as if there were no parent.
3b5cf2a2 5273 */
eb3b2d6b 5274 if (!pii.overlap) {
3b5cf2a2 5275 if (parent_spec) {
5276 /* refresh, careful to warn just once */
5277 if (rbd_dev->parent_overlap)
5278 rbd_warn(rbd_dev,
5279 "clone now standalone (overlap became 0)");
3b5cf2a2 5280 } else {
5281 /* initial probe */
5282 rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
3b5cf2a2 5283 }
70cf49cf 5284 }
eb3b2d6b 5285 rbd_dev->parent_overlap = pii.overlap;
cf32bd9c 5286
5287out:
5288 ret = 0;
5289out_err:
e92c0eaf 5290 kfree(pii.pool_ns);
eb3b2d6b 5291 kfree(pii.image_id);
86b00e0d 5292 rbd_spec_put(parent_spec);
5293 return ret;
5294}
5295
5296static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
5297{
5298 struct {
5299 __le64 stripe_unit;
5300 __le64 stripe_count;
5301 } __attribute__ ((packed)) striping_info_buf = { 0 };
5302 size_t size = sizeof (striping_info_buf);
5303 void *p;
5304 int ret;
5305
5306 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5307 &rbd_dev->header_oloc, "get_stripe_unit_count",
5308 NULL, 0, &striping_info_buf, size);
5309 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5310 if (ret < 0)
5311 return ret;
5312 if (ret < size)
5313 return -ERANGE;
5314
cc070d59 5315 p = &striping_info_buf;
5316 rbd_dev->header.stripe_unit = ceph_decode_64(&p);
5317 rbd_dev->header.stripe_count = ceph_decode_64(&p);
5318 return 0;
5319}
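
/*
 * For reference (editor's sketch; the example numbers are made up):
 * stripe_unit and stripe_count are the RBD_FEATURE_STRIPINGV2 "fancy
 * striping" parameters.  With stripe_unit = 64K and stripe_count = 4,
 * consecutive 64K chunks are spread across 4 objects before moving on
 * to the next set of objects.
 */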
5320
5321static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
5322{
5323 __le64 data_pool_id;
5324 int ret;
5325
5326 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5327 &rbd_dev->header_oloc, "get_data_pool",
5328 NULL, 0, &data_pool_id, sizeof(data_pool_id));
5329 if (ret < 0)
5330 return ret;
5331 if (ret < sizeof(data_pool_id))
5332 return -EBADMSG;
5333
5334 rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
5335 WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
5336 return 0;
5337}
5338
5339static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
5340{
ecd4a68a 5341 CEPH_DEFINE_OID_ONSTACK(oid);
5342 size_t image_id_size;
5343 char *image_id;
5344 void *p;
5345 void *end;
5346 size_t size;
5347 void *reply_buf = NULL;
5348 size_t len = 0;
5349 char *image_name = NULL;
5350 int ret;
5351
5352 rbd_assert(!rbd_dev->spec->image_name);
5353
5354 len = strlen(rbd_dev->spec->image_id);
5355 image_id_size = sizeof (__le32) + len;
5356 image_id = kmalloc(image_id_size, GFP_KERNEL);
5357 if (!image_id)
5358 return NULL;
5359
5360 p = image_id;
4157976b 5361 end = image_id + image_id_size;
57385b51 5362 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
5363
5364 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
5365 reply_buf = kmalloc(size, GFP_KERNEL);
5366 if (!reply_buf)
5367 goto out;
5368
5369 ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
5370 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5371 "dir_get_name", image_id, image_id_size,
5372 reply_buf, size);
5373 if (ret < 0)
5374 goto out;
5375 p = reply_buf;
5376 end = reply_buf + ret;
5377
5378 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
5379 if (IS_ERR(image_name))
5380 image_name = NULL;
5381 else
5382 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
5383out:
5384 kfree(reply_buf);
5385 kfree(image_id);
5386
5387 return image_name;
5388}
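
/*
 * Editor's note: "dir_get_name" above consults the pool's rbd
 * directory object (RBD_DIRECTORY; assumed to be "rbd_directory" per
 * rbd_types.h) to map an image id back to its user-visible name.
 * Callers tolerate failure here, since the name is informational.
 */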
5389
5390static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5391{
5392 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5393 const char *snap_name;
5394 u32 which = 0;
5395
5396 /* Skip over names until we find the one we are looking for */
5397
5398 snap_name = rbd_dev->header.snap_names;
5399 while (which < snapc->num_snaps) {
5400 if (!strcmp(name, snap_name))
5401 return snapc->snaps[which];
5402 snap_name += strlen(snap_name) + 1;
5403 which++;
5404 }
5405 return CEPH_NOSNAP;
5406}
5407
5408static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5409{
5410 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5411 u32 which;
5412 bool found = false;
5413 u64 snap_id;
5414
5415 for (which = 0; !found && which < snapc->num_snaps; which++) {
5416 const char *snap_name;
5417
5418 snap_id = snapc->snaps[which];
5419 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
5420 if (IS_ERR(snap_name)) {
5421 /* ignore no-longer existing snapshots */
5422 if (PTR_ERR(snap_name) == -ENOENT)
5423 continue;
5424 else
5425 break;
5426 }
5427 found = !strcmp(name, snap_name);
5428 kfree(snap_name);
5429 }
5430 return found ? snap_id : CEPH_NOSNAP;
5431}
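
/*
 * Editor's note: the v1 lookup above walks snapshot names already
 * cached in the header, while the v2 lookup issues a
 * "get_snapshot_name" request per snapshot id, costing up to one OSD
 * round trip for each existing snapshot.
 */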
5432
5433/*
5434 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
5435 * no snapshot by that name is found, or if an error occurs.
5436 */
5437static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5438{
5439 if (rbd_dev->image_format == 1)
5440 return rbd_v1_snap_id_by_name(rbd_dev, name);
5441
5442 return rbd_v2_snap_id_by_name(rbd_dev, name);
5443}
5444
9e15b77d 5445/*
5446 * An image being mapped will have everything but the snap id.
5447 */
5448static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
5449{
5450 struct rbd_spec *spec = rbd_dev->spec;
5451
5452 rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
5453 rbd_assert(spec->image_id && spec->image_name);
5454 rbd_assert(spec->snap_name);
5455
5456 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
5457 u64 snap_id;
5458
5459 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
5460 if (snap_id == CEPH_NOSNAP)
5461 return -ENOENT;
5462
5463 spec->snap_id = snap_id;
5464 } else {
5465 spec->snap_id = CEPH_NOSNAP;
5466 }
5467
5468 return 0;
5469}
5470
5471/*
5472 * A parent image will have all ids but none of the names.
e1d4213f 5473 *
5474 * All names in an rbd spec are dynamically allocated. It's OK if we
5475 * can't figure out the name for an image id.
9e15b77d 5476 */
04077599 5477static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
9e15b77d 5478{
5479 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5480 struct rbd_spec *spec = rbd_dev->spec;
5481 const char *pool_name;
5482 const char *image_name;
5483 const char *snap_name;
5484 int ret;
5485
5486 rbd_assert(spec->pool_id != CEPH_NOPOOL);
5487 rbd_assert(spec->image_id);
5488 rbd_assert(spec->snap_id != CEPH_NOSNAP);
9e15b77d 5489
2e9f7f1c 5490 /* Get the pool name; we have to make our own copy of this */
9e15b77d 5491
5492 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
5493 if (!pool_name) {
5494 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
5495 return -EIO;
5496 }
5497 pool_name = kstrdup(pool_name, GFP_KERNEL);
5498 if (!pool_name)
5499 return -ENOMEM;
5500
5501 /* Fetch the image name; tolerate failure here */
5502
2e9f7f1c
AE
5503 image_name = rbd_dev_image_name(rbd_dev);
5504 if (!image_name)
06ecc6cb 5505 rbd_warn(rbd_dev, "unable to get image name");
9e15b77d 5506
04077599 5507 /* Fetch the snapshot name */
9e15b77d 5508
2e9f7f1c 5509 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
5510 if (IS_ERR(snap_name)) {
5511 ret = PTR_ERR(snap_name);
9e15b77d 5512 goto out_err;
5513 }
5514
5515 spec->pool_name = pool_name;
5516 spec->image_name = image_name;
5517 spec->snap_name = snap_name;
5518
5519 return 0;
04077599 5520
9e15b77d 5521out_err:
5522 kfree(image_name);
5523 kfree(pool_name);
5524 return ret;
5525}
5526
cc4a38bd 5527static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
5528{
5529 size_t size;
5530 int ret;
5531 void *reply_buf;
5532 void *p;
5533 void *end;
5534 u64 seq;
5535 u32 snap_count;
5536 struct ceph_snap_context *snapc;
5537 u32 i;
5538
5539 /*
5540 * We'll need room for the seq value (maximum snapshot id),
5541	 * the snapshot count, and an array of that many snapshot ids.
5542 * For now we have a fixed upper limit on the number we're
5543 * prepared to receive.
5544 */
5545 size = sizeof (__le64) + sizeof (__le32) +
5546 RBD_MAX_SNAP_COUNT * sizeof (__le64);
5547 reply_buf = kzalloc(size, GFP_KERNEL);
5548 if (!reply_buf)
5549 return -ENOMEM;
5550
5551 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5552 &rbd_dev->header_oloc, "get_snapcontext",
5553 NULL, 0, reply_buf, size);
36be9a76 5554 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5555 if (ret < 0)
5556 goto out;
5557
35d489f9 5558 p = reply_buf;
5559 end = reply_buf + ret;
5560 ret = -ERANGE;
5561 ceph_decode_64_safe(&p, end, seq, out);
5562 ceph_decode_32_safe(&p, end, snap_count, out);
5563
5564 /*
5565 * Make sure the reported number of snapshot ids wouldn't go
5566 * beyond the end of our buffer. But before checking that,
5567 * make sure the computed size of the snapshot context we
5568 * allocate is representable in a size_t.
5569 */
5570 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
5571 / sizeof (u64)) {
5572 ret = -EINVAL;
5573 goto out;
5574 }
5575 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
5576 goto out;
468521c1 5577 ret = 0;
35d489f9 5578
812164f8 5579 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
5580 if (!snapc) {
5581 ret = -ENOMEM;
5582 goto out;
5583 }
35d489f9 5584 snapc->seq = seq;
5585 for (i = 0; i < snap_count; i++)
5586 snapc->snaps[i] = ceph_decode_64(&p);
5587
49ece554 5588 ceph_put_snap_context(rbd_dev->header.snapc);
5589 rbd_dev->header.snapc = snapc;
5590
5591 dout(" snap context seq = %llu, snap_count = %u\n",
57385b51 5592 (unsigned long long)seq, (unsigned int)snap_count);
5593out:
5594 kfree(reply_buf);
5595
57385b51 5596 return ret;
5597}
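
/*
 * For reference, the reply decoded above is laid out as:
 *   __le64 seq;  __le32 snap_count;  __le64 snaps[snap_count];
 * With RBD_MAX_SNAP_COUNT == 510, the largest such reply fits in the
 * buffer sized at the top of the function (just under 4KB).
 */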
5598
5599static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
5600 u64 snap_id)
5601{
5602 size_t size;
5603 void *reply_buf;
54cac61f 5604 __le64 snapid;
5605 int ret;
5606 void *p;
5607 void *end;
5608 char *snap_name;
5609
5610 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
5611 reply_buf = kmalloc(size, GFP_KERNEL);
5612 if (!reply_buf)
5613 return ERR_PTR(-ENOMEM);
5614
54cac61f 5615 snapid = cpu_to_le64(snap_id);
5616 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5617 &rbd_dev->header_oloc, "get_snapshot_name",
5618 &snapid, sizeof(snapid), reply_buf, size);
36be9a76 5619 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5620 if (ret < 0) {
5621 snap_name = ERR_PTR(ret);
b8b1e2db 5622 goto out;
f40eb349 5623 }
5624
5625 p = reply_buf;
f40eb349 5626 end = reply_buf + ret;
e5c35534 5627 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
f40eb349 5628 if (IS_ERR(snap_name))
b8b1e2db 5629 goto out;
b8b1e2db 5630
f40eb349 5631 dout(" snap_id 0x%016llx snap_name = %s\n",
54cac61f 5632 (unsigned long long)snap_id, snap_name);
5633out:
5634 kfree(reply_buf);
5635
f40eb349 5636 return snap_name;
5637}
5638
2df3fac7 5639static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
117973fb 5640{
2df3fac7 5641 bool first_time = rbd_dev->header.object_prefix == NULL;
117973fb 5642 int ret;
117973fb 5643
5644 ret = rbd_dev_v2_image_size(rbd_dev);
5645 if (ret)
cfbf6377 5646 return ret;
1617e40c 5647
5648 if (first_time) {
5649 ret = rbd_dev_v2_header_onetime(rbd_dev);
5650 if (ret)
cfbf6377 5651 return ret;
5652 }
5653
cc4a38bd 5654 ret = rbd_dev_v2_snap_context(rbd_dev);
5655 if (ret && first_time) {
5656 kfree(rbd_dev->header.object_prefix);
5657 rbd_dev->header.object_prefix = NULL;
5658 }
5659
5660 return ret;
5661}
5662
5663static int rbd_dev_header_info(struct rbd_device *rbd_dev)
5664{
5665 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5666
5667 if (rbd_dev->image_format == 1)
5668 return rbd_dev_v1_header_info(rbd_dev);
5669
5670 return rbd_dev_v2_header_info(rbd_dev);
5671}
5672
5673/*
5674 * Skips over white space at *buf, and updates *buf to point to the
5675 * first found non-space character (if any). Returns the length of
5676 * the token (string of non-white space characters) found. Note
5677 * that *buf must be terminated with '\0'.
5678 */
5679static inline size_t next_token(const char **buf)
5680{
5681 /*
5682 * These are the characters that produce nonzero for
5683 * isspace() in the "C" and "POSIX" locales.
5684 */
5685 const char *spaces = " \f\n\r\t\v";
5686
5687 *buf += strspn(*buf, spaces); /* Find start of token */
5688
5689 return strcspn(*buf, spaces); /* Return token length */
5690}
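
/*
 * Example (editor's sketch): with *buf pointing at "  pool image",
 * next_token() advances *buf to "pool image" and returns 4, the
 * length of the leading token "pool".
 */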
5691
5692/*
5693 * Finds the next token in *buf, dynamically allocates a buffer big
5694 * enough to hold a copy of it, and copies the token into the new
5695 * buffer. The copy is guaranteed to be terminated with '\0'. Note
5696 * that a duplicate buffer is created even for a zero-length token.
5697 *
5698 * Returns a pointer to the newly-allocated duplicate, or a null
5699 * pointer if memory for the duplicate was not available. If
5700 * the lenp argument is a non-null pointer, the length of the token
5701 * (not including the '\0') is returned in *lenp.
5702 *
5703 * If successful, the *buf pointer will be updated to point beyond
5704 * the end of the found token.
5705 *
5706 * Note: uses GFP_KERNEL for allocation.
5707 */
5708static inline char *dup_token(const char **buf, size_t *lenp)
5709{
5710 char *dup;
5711 size_t len;
5712
5713 len = next_token(buf);
4caf35f9 5714 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
5715 if (!dup)
5716 return NULL;
5717 *(dup + len) = '\0';
5718 *buf += len;
5719
5720 if (lenp)
5721 *lenp = len;
5722
5723 return dup;
5724}
5725
a725f65e 5726/*
5727 * Parse the options provided for an "rbd add" (i.e., rbd image
5728 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
5729 * and the data written is passed here via a NUL-terminated buffer.
5730 * Returns 0 if successful or an error code otherwise.
d22f76e7 5731 *
5732 * The information extracted from these options is recorded in
5733 * the other parameters which return dynamically-allocated
5734 * structures:
5735 * ceph_opts
5736 * The address of a pointer that will refer to a ceph options
5737 * structure. Caller must release the returned pointer using
5738 * ceph_destroy_options() when it is no longer needed.
5739 * rbd_opts
5740 * Address of an rbd options pointer. Fully initialized by
5741 * this function; caller must release with kfree().
5742 * spec
5743 * Address of an rbd image specification pointer. Fully
5744 * initialized by this function based on parsed options.
5745 * Caller must release with rbd_spec_put().
5746 *
5747 * The options passed take this form:
5748 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
5749 * where:
5750 * <mon_addrs>
5751 * A comma-separated list of one or more monitor addresses.
5752 * A monitor address is an ip address, optionally followed
5753 * by a port number (separated by a colon).
5754 * I.e.: ip1[:port1][,ip2[:port2]...]
5755 * <options>
5756 * A comma-separated list of ceph and/or rbd options.
5757 * <pool_name>
5758 * The name of the rados pool containing the rbd image.
5759 * <image_name>
5760 * The name of the image in that pool to map.
5761 * <snap_id>
5762 * An optional snapshot id. If provided, the mapping will
5763 * present data from the image at the time that snapshot was
5764 * created. The image head is used if no snapshot id is
5765 * provided. Snapshot mappings are always read-only.
a725f65e 5766 */
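
/*
 * Illustrative example (editor's sketch; the address, key and names
 * below are made up):
 *
 *   echo "1.2.3.4:6789 name=admin,secret=AQD9... rbd foo snap1" \
 *       > /sys/bus/rbd/add
 *
 * parses as mon_addrs "1.2.3.4:6789", options "name=admin,secret=...",
 * pool_name "rbd", image_name "foo" and snap_name "snap1".
 */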
859c31df 5767static int rbd_add_parse_args(const char *buf,
dc79b113 5768 struct ceph_options **ceph_opts,
859c31df
AE
5769 struct rbd_options **opts,
5770 struct rbd_spec **rbd_spec)
e28fff26 5771{
d22f76e7 5772 size_t len;
859c31df 5773 char *options;
0ddebc0c 5774 const char *mon_addrs;
ecb4dc22 5775 char *snap_name;
0ddebc0c 5776 size_t mon_addrs_size;
c300156b 5777 struct parse_rbd_opts_ctx pctx = { 0 };
859c31df 5778 struct ceph_options *copts;
dc79b113 5779 int ret;
5780
5781 /* The first four tokens are required */
5782
7ef3214a 5783 len = next_token(&buf);
5784 if (!len) {
5785 rbd_warn(NULL, "no monitor address(es) provided");
5786 return -EINVAL;
5787 }
0ddebc0c 5788 mon_addrs = buf;
f28e565a 5789 mon_addrs_size = len + 1;
7ef3214a 5790 buf += len;
a725f65e 5791
dc79b113 5792 ret = -EINVAL;
5793 options = dup_token(&buf, NULL);
5794 if (!options)
dc79b113 5795 return -ENOMEM;
5796 if (!*options) {
5797 rbd_warn(NULL, "no options provided");
5798 goto out_err;
5799 }
e28fff26 5800
5801 pctx.spec = rbd_spec_alloc();
5802 if (!pctx.spec)
f28e565a 5803 goto out_mem;
859c31df 5804
5805 pctx.spec->pool_name = dup_token(&buf, NULL);
5806 if (!pctx.spec->pool_name)
859c31df 5807 goto out_mem;
c300156b 5808 if (!*pctx.spec->pool_name) {
5809 rbd_warn(NULL, "no pool name provided");
5810 goto out_err;
5811 }
e28fff26 5812
5813 pctx.spec->image_name = dup_token(&buf, NULL);
5814 if (!pctx.spec->image_name)
f28e565a 5815 goto out_mem;
c300156b 5816 if (!*pctx.spec->image_name) {
5817 rbd_warn(NULL, "no image name provided");
5818 goto out_err;
5819 }
d4b125e9 5820
5821 /*
5822 * Snapshot name is optional; default is to use "-"
5823 * (indicating the head/no snapshot).
5824 */
3feeb894 5825 len = next_token(&buf);
820a5f3e 5826 if (!len) {
5827 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
5828 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
f28e565a 5829 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
dc79b113 5830 ret = -ENAMETOOLONG;
f28e565a 5831 goto out_err;
849b4260 5832 }
5833 snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
5834 if (!snap_name)
f28e565a 5835 goto out_mem;
ecb4dc22 5836 *(snap_name + len) = '\0';
c300156b 5837 pctx.spec->snap_name = snap_name;
e5c35534 5838
0ddebc0c 5839 /* Initialize all rbd options to the defaults */
e28fff26 5840
5841 pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
5842 if (!pctx.opts)
5843 goto out_mem;
5844
5845 pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
5846 pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
0c93e1b7 5847 pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
5848 pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
5849 pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
5850 pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
5851 pctx.opts->trim = RBD_TRIM_DEFAULT;
d22f76e7 5852
859c31df 5853 copts = ceph_parse_options(options, mon_addrs,
5854 mon_addrs + mon_addrs_size - 1,
5855 parse_rbd_opts_token, &pctx);
5856 if (IS_ERR(copts)) {
5857 ret = PTR_ERR(copts);
5858 goto out_err;
5859 }
5860 kfree(options);
5861
5862 *ceph_opts = copts;
5863 *opts = pctx.opts;
5864 *rbd_spec = pctx.spec;
0ddebc0c 5865
dc79b113 5866 return 0;
f28e565a 5867out_mem:
dc79b113 5868 ret = -ENOMEM;
d22f76e7 5869out_err:
5870 kfree(pctx.opts);
5871 rbd_spec_put(pctx.spec);
f28e565a 5872 kfree(options);
d22f76e7 5873
dc79b113 5874 return ret;
5875}
5876
5877static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
5878{
5879 down_write(&rbd_dev->lock_rwsem);
5880 if (__rbd_is_lock_owner(rbd_dev))
e1fddc8f 5881 __rbd_release_lock(rbd_dev);
5882 up_write(&rbd_dev->lock_rwsem);
5883}
5884
5885/*
5886 * If the wait is interrupted, an error is returned even if the lock
5887 * was successfully acquired. rbd_dev_image_unlock() will release it
5888 * if needed.
5889 */
5890static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
5891{
637cd060 5892 long ret;
2f18d466 5893
e010dd0a 5894 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
5895 if (!rbd_dev->opts->exclusive && !rbd_dev->opts->lock_on_read)
5896 return 0;
5897
5898 rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
5899 return -EINVAL;
5900 }
5901
5902 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
5903 return 0;
5904
5905 rbd_assert(!rbd_is_lock_owner(rbd_dev));
5906 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
5907 ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait,
5908 ceph_timeout_jiffies(rbd_dev->opts->lock_timeout));
5909 if (ret > 0)
5910 ret = rbd_dev->acquire_err;
5911 else if (!ret)
5912 ret = -ETIMEDOUT;
5913
2f18d466 5914 if (ret) {
5915 rbd_warn(rbd_dev, "failed to acquire exclusive lock: %ld", ret);
5916 return ret;
5917 }
5918
5919 /*
5920 * The lock may have been released by now, unless automatic lock
5921 * transitions are disabled.
5922 */
5923 rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev));
5924 return 0;
5925}
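
/*
 * Editor's note on the wait above:
 * wait_for_completion_killable_timeout() returns a positive remainder
 * when the completion fired (the real outcome is then read from
 * acquire_err), 0 on timeout (mapped to -ETIMEDOUT), and a negative
 * errno if the task was killed while waiting.
 */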
5926
5927/*
5928 * An rbd format 2 image has a unique identifier, distinct from the
5929 * name given to it by the user. Internally, that identifier is
5930 * what's used to specify the names of objects related to the image.
5931 *
5932 * A special "rbd id" object is used to map an rbd image name to its
5933 * id. If that object doesn't exist, then there is no v2 rbd image
5934 * with the supplied name.
5935 *
5936	 * This function will fill in the given rbd_dev's image_id field if
5937	 * the id can be determined, and in that case will return 0. If any
5938 * errors occur a negative errno will be returned and the rbd_dev's
5939 * image_id field will be unchanged (and should be NULL).
5940 */
5941static int rbd_dev_image_id(struct rbd_device *rbd_dev)
5942{
5943 int ret;
5944 size_t size;
ecd4a68a 5945 CEPH_DEFINE_OID_ONSTACK(oid);
589d30e0 5946 void *response;
c0fba368 5947 char *image_id;
2f82ee54 5948
5949 /*
5950 * When probing a parent image, the image id is already
5951 * known (and the image name likely is not). There's no
5952 * need to fetch the image id again in this case. We
5953 * do still need to set the image format though.
2c0d0a10 5954 */
5955 if (rbd_dev->spec->image_id) {
5956 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
5957
2c0d0a10 5958 return 0;
c0fba368 5959 }
2c0d0a10 5960
5961 /*
5962 * First, see if the format 2 image id file exists, and if
5963 * so, get the image's persistent id from it.
5964 */
5965 ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
5966 rbd_dev->spec->image_name);
5967 if (ret)
5968 return ret;
5969
5970 dout("rbd id object name is %s\n", oid.name);
5971
5972 /* Response will be an encoded string, which includes a length */
5973
5974 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
5975 response = kzalloc(size, GFP_NOIO);
5976 if (!response) {
5977 ret = -ENOMEM;
5978 goto out;
5979 }
5980
5981 /* If it doesn't exist we'll assume it's a format 1 image */
5982
5983 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5984 "get_id", NULL, 0,
5985 response, RBD_IMAGE_ID_LEN_MAX);
36be9a76 5986 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5987 if (ret == -ENOENT) {
5988 image_id = kstrdup("", GFP_KERNEL);
5989 ret = image_id ? 0 : -ENOMEM;
5990 if (!ret)
5991 rbd_dev->image_format = 1;
7dd440c9 5992 } else if (ret >= 0) {
5993 void *p = response;
5994
5995 image_id = ceph_extract_encoded_string(&p, p + ret,
979ed480 5996 NULL, GFP_NOIO);
461f758a 5997 ret = PTR_ERR_OR_ZERO(image_id);
5998 if (!ret)
5999 rbd_dev->image_format = 2;
6000 }
6001
6002 if (!ret) {
6003 rbd_dev->spec->image_id = image_id;
6004 dout("image_id is %s\n", image_id);
6005 }
6006out:
6007 kfree(response);
ecd4a68a 6008 ceph_oid_destroy(&oid);
6009 return ret;
6010}
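
/*
 * Editor's note: an empty image_id string ("") serves as the format 1
 * marker -- v1 images have no separate id object -- while a non-empty
 * id denotes format 2.  The early return at the top of the function
 * relies on the same convention when probing parent images.
 */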
6011
6012/*
6013 * Undo whatever state changes are made by a v1 or v2 header info
6014 * call.
6015 */
6016static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
6017{
6018 struct rbd_image_header *header;
6019
e69b8d41 6020 rbd_dev_parent_put(rbd_dev);
da5ef6be 6021 rbd_dev_mapping_clear(rbd_dev);
6022
6023 /* Free dynamic fields from the header, then zero it out */
6024
6025 header = &rbd_dev->header;
812164f8 6026 ceph_put_snap_context(header->snapc);
6027 kfree(header->snap_sizes);
6028 kfree(header->snap_names);
6029 kfree(header->object_prefix);
6030 memset(header, 0, sizeof (*header));
6031}
6032
2df3fac7 6033static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
6034{
6035 int ret;
a30b71b9 6036
1e130199 6037 ret = rbd_dev_v2_object_prefix(rbd_dev);
57385b51 6038 if (ret)
6039 goto out_err;
6040
6041 /*
6042	 * Get and check the features for the image. Currently the
6043 * features are assumed to never change.
6044 */
b1b5402a 6045 ret = rbd_dev_v2_features(rbd_dev);
57385b51 6046 if (ret)
9d475de5 6047 goto out_err;
35d489f9 6048
6049 /* If the image supports fancy striping, get its parameters */
6050
6051 if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
6052 ret = rbd_dev_v2_striping_info(rbd_dev);
6053 if (ret < 0)
6054 goto out_err;
6055 }
a30b71b9 6056
6057 if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
6058 ret = rbd_dev_v2_data_pool(rbd_dev);
6059 if (ret)
6060 goto out_err;
6061 }
6062
263423f8 6063 rbd_init_layout(rbd_dev);
35152979 6064 return 0;
263423f8 6065
9d475de5 6066out_err:
642a2537 6067 rbd_dev->header.features = 0;
6068 kfree(rbd_dev->header.object_prefix);
6069 rbd_dev->header.object_prefix = NULL;
9d475de5 6070 return ret;
6071}
6072
6073/*
6074 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
6075 * rbd_dev_image_probe() recursion depth, which means it's also the
6076 * length of the already discovered part of the parent chain.
6077 */
6078static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
83a06263 6079{
2f82ee54 6080 struct rbd_device *parent = NULL;
6081 int ret;
6082
6083 if (!rbd_dev->parent_spec)
6084 return 0;
124afba2 6085
6086 if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
6087 pr_info("parent chain is too long (%d)\n", depth);
6088 ret = -EINVAL;
6089 goto out_err;
6090 }
6091
1643dfa4 6092 parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
6093 if (!parent) {
6094 ret = -ENOMEM;
124afba2 6095 goto out_err;
6096 }
6097
6098 /*
6099 * Images related by parent/child relationships always share
6100 * rbd_client and spec/parent_spec, so bump their refcounts.
6101 */
6102 __rbd_get_client(rbd_dev->rbd_client);
6103 rbd_spec_get(rbd_dev->parent_spec);
124afba2 6104
6d69bb53 6105 ret = rbd_dev_image_probe(parent, depth);
6106 if (ret < 0)
6107 goto out_err;
1f2c6651 6108
124afba2 6109 rbd_dev->parent = parent;
a2acd00e 6110 atomic_set(&rbd_dev->parent_ref, 1);
124afba2 6111 return 0;
1f2c6651 6112
124afba2 6113out_err:
1f2c6651 6114 rbd_dev_unparent(rbd_dev);
1761b229 6115 rbd_dev_destroy(parent);
6116 return ret;
6117}
6118
6119static void rbd_dev_device_release(struct rbd_device *rbd_dev)
6120{
6121 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
6122 rbd_free_disk(rbd_dev);
6123 if (!single_major)
6124 unregister_blkdev(rbd_dev->major, rbd_dev->name);
6125}
6126
6127/*
6128 * rbd_dev->header_rwsem must be locked for write and will be unlocked
6129 * upon return.
6130 */
200a6a8b 6131static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
124afba2 6132{
83a06263 6133 int ret;
d1cf5788 6134
9b60e70b 6135 /* Record our major and minor device numbers. */
83a06263 6136
6137 if (!single_major) {
6138 ret = register_blkdev(0, rbd_dev->name);
6139 if (ret < 0)
1643dfa4 6140 goto err_out_unlock;
6141
6142 rbd_dev->major = ret;
6143 rbd_dev->minor = 0;
6144 } else {
6145 rbd_dev->major = rbd_major;
6146 rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
6147 }
6148
6149 /* Set up the blkdev mapping. */
6150
6151 ret = rbd_init_disk(rbd_dev);
6152 if (ret)
6153 goto err_out_blkdev;
6154
f35a4dee 6155 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
9568c93e 6156 set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);
f35a4dee 6157
5769ed0c 6158 ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
f35a4dee 6159 if (ret)
da5ef6be 6160 goto err_out_disk;
83a06263 6161
129b79d4 6162 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
811c6688 6163 up_write(&rbd_dev->header_rwsem);
5769ed0c 6164 return 0;
2f82ee54 6165
6166err_out_disk:
6167 rbd_free_disk(rbd_dev);
6168err_out_blkdev:
6169 if (!single_major)
6170 unregister_blkdev(rbd_dev->major, rbd_dev->name);
6171err_out_unlock:
6172 up_write(&rbd_dev->header_rwsem);
6173 return ret;
6174}
6175
6176static int rbd_dev_header_name(struct rbd_device *rbd_dev)
6177{
6178 struct rbd_spec *spec = rbd_dev->spec;
c41d13a3 6179 int ret;
6180
6181 /* Record the header object name for this rbd image. */
6182
6183 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
332bb12d 6184 if (rbd_dev->image_format == 1)
6185 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6186 spec->image_name, RBD_SUFFIX);
332bb12d 6187 else
6188 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6189 RBD_HEADER_PREFIX, spec->image_id);
332bb12d 6190
c41d13a3 6191 return ret;
6192}
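
/*
 * Example (editor's sketch, assuming RBD_SUFFIX is ".rbd" and
 * RBD_HEADER_PREFIX is "rbd_header." as defined in rbd_types.h): a v1
 * image named "foo" gets header object "foo.rbd", while a v2 image
 * with id "abc123" gets "rbd_header.abc123".
 */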
6193
6194static void rbd_dev_image_release(struct rbd_device *rbd_dev)
6195{
6fd48b3b 6196 rbd_dev_unprobe(rbd_dev);
6197 if (rbd_dev->opts)
6198 rbd_unregister_watch(rbd_dev);
6199 rbd_dev->image_format = 0;
6200 kfree(rbd_dev->spec->image_id);
6201 rbd_dev->spec->image_id = NULL;
6202}
6203
6204/*
6205 * Probe for the existence of the header object for the given rbd
6206 * device. If this image is the one being mapped (i.e., not a
6207 * parent), initiate a watch on its header object before using that
6208 * object to get detailed information about the rbd image.
a30b71b9 6209 */
6d69bb53 6210static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
6211{
6212 int ret;
6213
6214 /*
6215 * Get the id from the image id object. Unless there's an
6216 * error, rbd_dev->spec->image_id will be filled in with
6217 * a dynamically-allocated string, and rbd_dev->image_format
6218 * will be set to either 1 or 2.
6219 */
6220 ret = rbd_dev_image_id(rbd_dev);
6221 if (ret)
c0fba368 6222 return ret;
c0fba368 6223
6224 ret = rbd_dev_header_name(rbd_dev);
6225 if (ret)
6226 goto err_out_format;
6227
6d69bb53 6228 if (!depth) {
99d16943 6229 ret = rbd_register_watch(rbd_dev);
6230 if (ret) {
6231 if (ret == -ENOENT)
b26c047b 6232 pr_info("image %s/%s%s%s does not exist\n",
1fe48023 6233 rbd_dev->spec->pool_name,
6234 rbd_dev->spec->pool_ns ?: "",
6235 rbd_dev->spec->pool_ns ? "/" : "",
1fe48023 6236 rbd_dev->spec->image_name);
c41d13a3 6237 goto err_out_format;
1fe48023 6238 }
1f3ef788 6239 }
b644de2b 6240
a720ae09 6241 ret = rbd_dev_header_info(rbd_dev);
5655c4d9 6242 if (ret)
b644de2b 6243 goto err_out_watch;
83a06263 6244
6245 /*
6246 * If this image is the one being mapped, we have pool name and
6247 * id, image name and id, and snap name - need to fill snap id.
6248 * Otherwise this is a parent image, identified by pool, image
6249 * and snap ids - need to fill in names for those ids.
6250 */
6d69bb53 6251 if (!depth)
6252 ret = rbd_spec_fill_snap_id(rbd_dev);
6253 else
6254 ret = rbd_spec_fill_names(rbd_dev);
6255 if (ret) {
6256 if (ret == -ENOENT)
b26c047b 6257 pr_info("snap %s/%s%s%s@%s does not exist\n",
1fe48023 6258 rbd_dev->spec->pool_name,
6259 rbd_dev->spec->pool_ns ?: "",
6260 rbd_dev->spec->pool_ns ? "/" : "",
6261 rbd_dev->spec->image_name,
6262 rbd_dev->spec->snap_name);
33dca39f 6263 goto err_out_probe;
1fe48023 6264 }
9bb81c9b 6265
6266 ret = rbd_dev_mapping_set(rbd_dev);
6267 if (ret)
6268 goto err_out_probe;
6269
6270 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
6271 ret = rbd_dev_v2_parent_info(rbd_dev);
6272 if (ret)
6273 goto err_out_probe;
6274 }
6275
6d69bb53 6276 ret = rbd_dev_probe_parent(rbd_dev, depth);
6277 if (ret)
6278 goto err_out_probe;
6279
6280 dout("discovered format %u image, header name is %s\n",
c41d13a3 6281 rbd_dev->image_format, rbd_dev->header_oid.name);
30d60ba2 6282 return 0;
e8f59b59 6283
6284err_out_probe:
6285 rbd_dev_unprobe(rbd_dev);
b644de2b 6286err_out_watch:
6d69bb53 6287 if (!depth)
99d16943 6288 rbd_unregister_watch(rbd_dev);
6289err_out_format:
6290 rbd_dev->image_format = 0;
6291 kfree(rbd_dev->spec->image_id);
6292 rbd_dev->spec->image_id = NULL;
6293 return ret;
6294}
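
/*
 * Editor's summary of the probe sequence above: image id -> header
 * object name -> watch (mapped image only) -> header info -> spec
 * names/snap id -> mapping size -> parent info -> recursive parent
 * probe, with the error labels unwinding in reverse order.
 */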
6295
6296static ssize_t do_rbd_add(struct bus_type *bus,
6297 const char *buf,
6298 size_t count)
602adf40 6299{
cb8627c7 6300 struct rbd_device *rbd_dev = NULL;
dc79b113 6301 struct ceph_options *ceph_opts = NULL;
4e9afeba 6302 struct rbd_options *rbd_opts = NULL;
859c31df 6303 struct rbd_spec *spec = NULL;
9d3997fd 6304 struct rbd_client *rbdc;
b51c83c2 6305 int rc;
6306
6307 if (!try_module_get(THIS_MODULE))
6308 return -ENODEV;
6309
602adf40 6310 /* parse add command */
859c31df 6311 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
dc79b113 6312 if (rc < 0)
dd5ac32d 6313 goto out;
78cea76e 6314
6315 rbdc = rbd_get_client(ceph_opts);
6316 if (IS_ERR(rbdc)) {
6317 rc = PTR_ERR(rbdc);
0ddebc0c 6318 goto err_out_args;
9d3997fd 6319 }
602adf40 6320
602adf40 6321 /* pick the pool */
dd435855 6322 rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
6323 if (rc < 0) {
6324 if (rc == -ENOENT)
6325 pr_info("pool %s does not exist\n", spec->pool_name);
602adf40 6326 goto err_out_client;
1fe48023 6327 }
c0cd10db 6328 spec->pool_id = (u64)rc;
859c31df 6329
d147543d 6330 rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
6331 if (!rbd_dev) {
6332 rc = -ENOMEM;
bd4ba655 6333 goto err_out_client;
b51c83c2 6334 }
6335 rbdc = NULL; /* rbd_dev now owns this */
6336 spec = NULL; /* rbd_dev now owns this */
d147543d 6337 rbd_opts = NULL; /* rbd_dev now owns this */
602adf40 6338
6339 rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
6340 if (!rbd_dev->config_info) {
6341 rc = -ENOMEM;
6342 goto err_out_rbd_dev;
6343 }
6344
811c6688 6345 down_write(&rbd_dev->header_rwsem);
6d69bb53 6346 rc = rbd_dev_image_probe(rbd_dev, 0);
6347 if (rc < 0) {
6348 up_write(&rbd_dev->header_rwsem);
c53d5893 6349 goto err_out_rbd_dev;
0d6d1e9c 6350 }
05fd6f6f 6351
7ce4eef7 6352 /* If we are mapping a snapshot it must be marked read-only */
7ce4eef7 6353 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
9568c93e 6354 rbd_dev->opts->read_only = true;
7ce4eef7 6355
6356 if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
6357 rbd_warn(rbd_dev, "alloc_size adjusted to %u",
6358 rbd_dev->layout.object_size);
6359 rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
6360 }
6361
b536f69a 6362 rc = rbd_dev_device_setup(rbd_dev);
fd22aef8 6363 if (rc)
8b679ec5 6364 goto err_out_image_probe;
3abef3b3 6365
6366 rc = rbd_add_acquire_lock(rbd_dev);
6367 if (rc)
6368 goto err_out_image_lock;
3abef3b3 6369
6370 /* Everything's ready. Announce the disk to the world. */
6371
6372 rc = device_add(&rbd_dev->dev);
6373 if (rc)
e010dd0a 6374 goto err_out_image_lock;
6375
6376 add_disk(rbd_dev->disk);
6377 /* see rbd_init_disk() */
6378 blk_put_queue(rbd_dev->disk->queue);
6379
6380 spin_lock(&rbd_dev_list_lock);
6381 list_add_tail(&rbd_dev->node, &rbd_dev_list);
6382 spin_unlock(&rbd_dev_list_lock);
6383
6384 pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
6385 (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
6386 rbd_dev->header.features);
6387 rc = count;
6388out:
6389 module_put(THIS_MODULE);
6390 return rc;
b536f69a 6391
6392err_out_image_lock:
6393 rbd_dev_image_unlock(rbd_dev);
5769ed0c 6394 rbd_dev_device_release(rbd_dev);
6395err_out_image_probe:
6396 rbd_dev_image_release(rbd_dev);
6397err_out_rbd_dev:
6398 rbd_dev_destroy(rbd_dev);
bd4ba655 6399err_out_client:
9d3997fd 6400 rbd_put_client(rbdc);
0ddebc0c 6401err_out_args:
859c31df 6402 rbd_spec_put(spec);
d147543d 6403 kfree(rbd_opts);
dd5ac32d 6404 goto out;
6405}
6406
7e9586ba 6407static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
6408{
6409 if (single_major)
6410 return -EINVAL;
6411
6412 return do_rbd_add(bus, buf, count);
6413}
6414
6415static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
6416 size_t count)
6417{
6418 return do_rbd_add(bus, buf, count);
6419}
6420
6421static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
6422{
ad945fc1 6423 while (rbd_dev->parent) {
6424 struct rbd_device *first = rbd_dev;
6425 struct rbd_device *second = first->parent;
6426 struct rbd_device *third;
6427
6428 /*
6429 * Follow to the parent with no grandparent and
6430 * remove it.
6431 */
6432 while (second && (third = second->parent)) {
6433 first = second;
6434 second = third;
6435 }
ad945fc1 6436 rbd_assert(second);
8ad42cd0 6437 rbd_dev_image_release(second);
8b679ec5 6438 rbd_dev_destroy(second);
6439 first->parent = NULL;
6440 first->parent_overlap = 0;
6441
6442 rbd_assert(first->parent_spec);
6443 rbd_spec_put(first->parent_spec);
6444 first->parent_spec = NULL;
6445 }
6446}
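
/*
 * Editor's note: the loop above dismantles the chain from the far end;
 * each pass finds the ancestor that has no parent of its own, releases
 * it, and detaches it from its child, until rbd_dev itself has no
 * parent left.
 */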
6447
6448static ssize_t do_rbd_remove(struct bus_type *bus,
6449 const char *buf,
6450 size_t count)
6451{
6452 struct rbd_device *rbd_dev = NULL;
6453 struct list_head *tmp;
6454 int dev_id;
0276dca6 6455 char opt_buf[6];
0276dca6 6456 bool force = false;
0d8189e1 6457 int ret;
602adf40 6458
6459 dev_id = -1;
6460 opt_buf[0] = '\0';
6461 sscanf(buf, "%d %5s", &dev_id, opt_buf);
6462 if (dev_id < 0) {
6463 pr_err("dev_id out of range\n");
602adf40 6464 return -EINVAL;
6465 }
6466 if (opt_buf[0] != '\0') {
6467 if (!strcmp(opt_buf, "force")) {
6468 force = true;
6469 } else {
6470 pr_err("bad remove option at '%s'\n", opt_buf);
6471 return -EINVAL;
6472 }
6473 }
602adf40 6474
6475 ret = -ENOENT;
6476 spin_lock(&rbd_dev_list_lock);
6477 list_for_each(tmp, &rbd_dev_list) {
6478 rbd_dev = list_entry(tmp, struct rbd_device, node);
6479 if (rbd_dev->dev_id == dev_id) {
6480 ret = 0;
6481 break;
6482 }
42382b70 6483 }
6484 if (!ret) {
6485 spin_lock_irq(&rbd_dev->lock);
0276dca6 6486 if (rbd_dev->open_count && !force)
751cc0e3 6487 ret = -EBUSY;
6488 else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
6489 &rbd_dev->flags))
6490 ret = -EINPROGRESS;
751cc0e3
AE
6491 spin_unlock_irq(&rbd_dev->lock);
6492 }
6493 spin_unlock(&rbd_dev_list_lock);
85f5a4d6 6494 if (ret)
1ba0f1e7 6495 return ret;
751cc0e3 6496
6497 if (force) {
6498 /*
6499 * Prevent new IO from being queued and wait for existing
6500 * IO to complete/fail.
6501 */
6502 blk_mq_freeze_queue(rbd_dev->disk->queue);
6503 blk_set_queue_dying(rbd_dev->disk->queue);
6504 }
6505
6506 del_gendisk(rbd_dev->disk);
6507 spin_lock(&rbd_dev_list_lock);
6508 list_del_init(&rbd_dev->node);
6509 spin_unlock(&rbd_dev_list_lock);
6510 device_del(&rbd_dev->dev);
fca27065 6511
e010dd0a 6512 rbd_dev_image_unlock(rbd_dev);
dd5ac32d 6513 rbd_dev_device_release(rbd_dev);
8ad42cd0 6514 rbd_dev_image_release(rbd_dev);
8b679ec5 6515 rbd_dev_destroy(rbd_dev);
1ba0f1e7 6516 return count;
6517}
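
/*
 * Illustrative usage (editor's sketch):
 *   echo 0 > /sys/bus/rbd/remove           removes the rbd0 mapping
 *   echo "0 force" > /sys/bus/rbd/remove   also fails outstanding I/O
 * "force" freezes the queue and marks it dying before removal, so
 * in-flight requests complete or fail instead of blocking the remove.
 */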
6518
7e9586ba 6519static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
6520{
6521 if (single_major)
6522 return -EINVAL;
6523
6524 return do_rbd_remove(bus, buf, count);
6525}
6526
6527static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
6528 size_t count)
6529{
6530 return do_rbd_remove(bus, buf, count);
6531}
6532
6533/*
6534 * create control files in sysfs
dfc5606d 6535 * /sys/bus/rbd/...
602adf40 6536 */
7d8dc534 6537static int __init rbd_sysfs_init(void)
602adf40 6538{
dfc5606d 6539 int ret;
602adf40 6540
fed4c143 6541 ret = device_register(&rbd_root_dev);
21079786 6542 if (ret < 0)
dfc5606d 6543 return ret;
602adf40 6544
6545 ret = bus_register(&rbd_bus_type);
6546 if (ret < 0)
6547 device_unregister(&rbd_root_dev);
602adf40 6548
6549 return ret;
6550}
6551
7d8dc534 6552static void __exit rbd_sysfs_cleanup(void)
602adf40 6553{
dfc5606d 6554 bus_unregister(&rbd_bus_type);
fed4c143 6555 device_unregister(&rbd_root_dev);
6556}
6557
7d8dc534 6558static int __init rbd_slab_init(void)
6559{
6560 rbd_assert(!rbd_img_request_cache);
03d94406 6561 rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
6562 if (!rbd_img_request_cache)
6563 return -ENOMEM;
6564
6565 rbd_assert(!rbd_obj_request_cache);
03d94406 6566 rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
6567 if (!rbd_obj_request_cache)
6568 goto out_err;
6569
6c696d85 6570 return 0;
1c2a9dfe 6571
6c696d85 6572out_err:
6573 kmem_cache_destroy(rbd_img_request_cache);
6574 rbd_img_request_cache = NULL;
6575 return -ENOMEM;
6576}
6577
6578static void rbd_slab_exit(void)
6579{
6580 rbd_assert(rbd_obj_request_cache);
6581 kmem_cache_destroy(rbd_obj_request_cache);
6582 rbd_obj_request_cache = NULL;
6583
6584 rbd_assert(rbd_img_request_cache);
6585 kmem_cache_destroy(rbd_img_request_cache);
6586 rbd_img_request_cache = NULL;
6587}
6588
cc344fa1 6589static int __init rbd_init(void)
6590{
6591 int rc;
6592
6593 if (!libceph_compatible(NULL)) {
6594 rbd_warn(NULL, "libceph incompatibility (quitting)");
6595 return -EINVAL;
6596 }
e1b4d96d 6597
1c2a9dfe 6598 rc = rbd_slab_init();
6599 if (rc)
6600 return rc;
e1b4d96d 6601
6602 /*
6603 * The number of active work items is limited by the number of
f77303bd 6604 * rbd devices * queue depth, so leave @max_active at default.
6605 */
6606 rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
6607 if (!rbd_wq) {
6608 rc = -ENOMEM;
6609 goto err_out_slab;
6610 }
6611
6612 if (single_major) {
6613 rbd_major = register_blkdev(0, RBD_DRV_NAME);
6614 if (rbd_major < 0) {
6615 rc = rbd_major;
f5ee37bd 6616 goto err_out_wq;
6617 }
6618 }
6619
6620 rc = rbd_sysfs_init();
6621 if (rc)
6622 goto err_out_blkdev;
6623
6624 if (single_major)
6625 pr_info("loaded (major %d)\n", rbd_major);
6626 else
6627 pr_info("loaded\n");
1c2a9dfe 6628
6629 return 0;
6630
6631err_out_blkdev:
6632 if (single_major)
6633 unregister_blkdev(rbd_major, RBD_DRV_NAME);
6634err_out_wq:
6635 destroy_workqueue(rbd_wq);
6636err_out_slab:
6637 rbd_slab_exit();
1c2a9dfe 6638 return rc;
6639}
6640
cc344fa1 6641static void __exit rbd_exit(void)
602adf40 6642{
ffe312cf 6643 ida_destroy(&rbd_dev_id_ida);
602adf40 6644 rbd_sysfs_cleanup();
6645 if (single_major)
6646 unregister_blkdev(rbd_major, RBD_DRV_NAME);
f5ee37bd 6647 destroy_workqueue(rbd_wq);
1c2a9dfe 6648 rbd_slab_exit();
6649}
6650
6651module_init(rbd_init);
6652module_exit(rbd_exit);
6653
d552c619 6654MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
6655MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
6656MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
6657/* following authorship retained from original osdblk.c */
6658MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
6659
90da258b 6660MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
602adf40 6661MODULE_LICENSE("GPL");