rbd: introduce obj_req->osd_reqs list
[linux-block.git] / drivers / block / rbd.c
CommitLineData
e2a58ee5 1
602adf40
YS
2/*
3 rbd.c -- Export ceph rados objects as a Linux block device
4
5
6 based on drivers/block/osdblk.c:
7
8 Copyright 2009 Red Hat, Inc.
9
10 This program is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; see the file COPYING. If not, write to
21 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22
23
24
dfc5606d 25 For usage instructions, please refer to:
602adf40 26
dfc5606d 27 Documentation/ABI/testing/sysfs-bus-rbd
602adf40
YS
28
29 */
30
31#include <linux/ceph/libceph.h>
32#include <linux/ceph/osd_client.h>
33#include <linux/ceph/mon_client.h>
ed95b21a 34#include <linux/ceph/cls_lock_client.h>
43df3d35 35#include <linux/ceph/striper.h>
602adf40 36#include <linux/ceph/decode.h>
59c2be1e 37#include <linux/parser.h>
30d1cff8 38#include <linux/bsearch.h>
602adf40
YS
39
40#include <linux/kernel.h>
41#include <linux/device.h>
42#include <linux/module.h>
7ad18afa 43#include <linux/blk-mq.h>
602adf40
YS
44#include <linux/fs.h>
45#include <linux/blkdev.h>
1c2a9dfe 46#include <linux/slab.h>
f8a22fc2 47#include <linux/idr.h>
bc1ecc65 48#include <linux/workqueue.h>
602adf40
YS
49
50#include "rbd_types.h"
51
aafb230e
AE
52#define RBD_DEBUG /* Activate rbd_assert() calls */
53
a2acd00e
AE
54/*
55 * Increment the given counter and return its updated value.
56 * If the counter is already 0 it will not be incremented.
57 * If the counter is already at its maximum value returns
58 * -EINVAL without updating it.
59 */
60static int atomic_inc_return_safe(atomic_t *v)
61{
62 unsigned int counter;
63
bfc18e38 64 counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
a2acd00e
AE
65 if (counter <= (unsigned int)INT_MAX)
66 return (int)counter;
67
68 atomic_dec(v);
69
70 return -EINVAL;
71}
72
73/* Decrement the counter. Return the resulting value, or -EINVAL */
74static int atomic_dec_return_safe(atomic_t *v)
75{
76 int counter;
77
78 counter = atomic_dec_return(v);
79 if (counter >= 0)
80 return counter;
81
82 atomic_inc(v);
83
84 return -EINVAL;
85}
86
f0f8cef5 87#define RBD_DRV_NAME "rbd"
602adf40 88
7e513d43
ID
89#define RBD_MINORS_PER_MAJOR 256
90#define RBD_SINGLE_MAJOR_PART_SHIFT 4
602adf40 91
6d69bb53
ID
92#define RBD_MAX_PARENT_CHAIN_LEN 16
93
d4b125e9
AE
94#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
95#define RBD_MAX_SNAP_NAME_LEN \
96 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
97
35d489f9 98#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
602adf40
YS
99
100#define RBD_SNAP_HEAD_NAME "-"
101
9682fc6d
AE
102#define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array */
103
9e15b77d
AE
104/* This allows a single page to hold an image name sent by OSD */
105#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
1e130199 106#define RBD_IMAGE_ID_LEN_MAX 64
9e15b77d 107
1e130199 108#define RBD_OBJ_PREFIX_LEN_MAX 64
589d30e0 109
ed95b21a 110#define RBD_NOTIFY_TIMEOUT 5 /* seconds */
99d16943
ID
111#define RBD_RETRY_DELAY msecs_to_jiffies(1000)
112
d889140c
AE
113/* Feature bits */
114
8767b293
ID
115#define RBD_FEATURE_LAYERING (1ULL<<0)
116#define RBD_FEATURE_STRIPINGV2 (1ULL<<1)
117#define RBD_FEATURE_EXCLUSIVE_LOCK (1ULL<<2)
b9f6d447 118#define RBD_FEATURE_DEEP_FLATTEN (1ULL<<5)
8767b293 119#define RBD_FEATURE_DATA_POOL (1ULL<<7)
e573427a 120#define RBD_FEATURE_OPERATIONS (1ULL<<8)
8767b293 121
ed95b21a
ID
122#define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \
123 RBD_FEATURE_STRIPINGV2 | \
7e97332e 124 RBD_FEATURE_EXCLUSIVE_LOCK | \
b9f6d447 125 RBD_FEATURE_DEEP_FLATTEN | \
e573427a
ID
126 RBD_FEATURE_DATA_POOL | \
127 RBD_FEATURE_OPERATIONS)
d889140c
AE
128
129/* Features supported by this (client software) implementation. */
130
770eba6e 131#define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL)
d889140c 132
81a89793
AE
133/*
134 * An RBD device name will be "rbd#", where the "rbd" comes from
135 * RBD_DRV_NAME above, and # is a unique integer identifier.
81a89793 136 */
602adf40
YS
137#define DEV_NAME_LEN 32
138
139/*
140 * block device image metadata (in-memory version)
141 */
142struct rbd_image_header {
f35a4dee 143 /* These six fields never change for a given rbd image */
849b4260 144 char *object_prefix;
602adf40 145 __u8 obj_order;
f35a4dee
AE
146 u64 stripe_unit;
147 u64 stripe_count;
7e97332e 148 s64 data_pool_id;
f35a4dee 149 u64 features; /* Might be changeable someday? */
602adf40 150
f84344f3
AE
151 /* The remaining fields need to be updated occasionally */
152 u64 image_size;
153 struct ceph_snap_context *snapc;
f35a4dee
AE
154 char *snap_names; /* format 1 only */
155 u64 *snap_sizes; /* format 1 only */
59c2be1e
YS
156};
157
0d7dbfce
AE
158/*
159 * An rbd image specification.
160 *
161 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
c66c6e0c
AE
162 * identify an image. Each rbd_dev structure includes a pointer to
163 * an rbd_spec structure that encapsulates this identity.
164 *
165 * Each of the id's in an rbd_spec has an associated name. For a
166 * user-mapped image, the names are supplied and the id's associated
167 * with them are looked up. For a layered image, a parent image is
168 * defined by the tuple, and the names are looked up.
169 *
170 * An rbd_dev structure contains a parent_spec pointer which is
171 * non-null if the image it represents is a child in a layered
172 * image. This pointer will refer to the rbd_spec structure used
173 * by the parent rbd_dev for its own identity (i.e., the structure
174 * is shared between the parent and child).
175 *
176 * Since these structures are populated once, during the discovery
177 * phase of image construction, they are effectively immutable so
178 * we make no effort to synchronize access to them.
179 *
180 * Note that code herein does not assume the image name is known (it
181 * could be a null pointer).
0d7dbfce
AE
182 */
183struct rbd_spec {
184 u64 pool_id;
ecb4dc22 185 const char *pool_name;
b26c047b 186 const char *pool_ns; /* NULL if default, never "" */
0d7dbfce 187
ecb4dc22
AE
188 const char *image_id;
189 const char *image_name;
0d7dbfce
AE
190
191 u64 snap_id;
ecb4dc22 192 const char *snap_name;
0d7dbfce
AE
193
194 struct kref kref;
195};
196
602adf40 197/*
f0f8cef5 198 * an instance of the client. multiple devices may share an rbd client.
602adf40
YS
199 */
200struct rbd_client {
201 struct ceph_client *client;
202 struct kref kref;
203 struct list_head node;
204};
205
0192ce2e
ID
206struct pending_result {
207 int result; /* first nonzero result */
208 int num_pending;
209};
210
bf0d5f50 211struct rbd_img_request;
bf0d5f50 212
9969ebc5 213enum obj_request_type {
a1fbb5e7 214 OBJ_REQUEST_NODATA = 1,
5359a17d 215 OBJ_REQUEST_BIO, /* pointer into provided bio (list) */
7e07efb1 216 OBJ_REQUEST_BVECS, /* pointer into provided bio_vec array */
afb97888 217 OBJ_REQUEST_OWN_BVECS, /* private bio_vec array, doesn't own pages */
9969ebc5 218};
bf0d5f50 219
6d2940c8 220enum obj_operation_type {
a1fbb5e7 221 OBJ_OP_READ = 1,
6d2940c8 222 OBJ_OP_WRITE,
90e98c52 223 OBJ_OP_DISCARD,
6484cbe9 224 OBJ_OP_ZEROOUT,
6d2940c8
GZ
225};
226
0ad5d953
ID
227#define RBD_OBJ_FLAG_DELETION (1U << 0)
228#define RBD_OBJ_FLAG_COPYUP_ENABLED (1U << 1)
229
a9b67e69 230enum rbd_obj_read_state {
85b5e6d1
ID
231 RBD_OBJ_READ_START = 1,
232 RBD_OBJ_READ_OBJECT,
a9b67e69
ID
233 RBD_OBJ_READ_PARENT,
234};
235
3da691bf
ID
236/*
237 * Writes go through the following state machine to deal with
238 * layering:
239 *
89a59c1c
ID
240 * . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . .
241 * . | .
242 * . v .
243 * . RBD_OBJ_WRITE_READ_FROM_PARENT. . . .
244 * . | . .
245 * . v v (deep-copyup .
246 * (image . RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC . not needed) .
247 * flattened) v | . .
248 * . v . .
249 * . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . . (copyup .
250 * | not needed) v
251 * v .
252 * done . . . . . . . . . . . . . . . . . .
253 * ^
254 * |
255 * RBD_OBJ_WRITE_FLAT
3da691bf
ID
256 *
257 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
89a59c1c
ID
258 * assert_exists guard is needed or not (in some cases it's not needed
259 * even if there is a parent).
3da691bf
ID
260 */
261enum rbd_obj_write_state {
85b5e6d1
ID
262 RBD_OBJ_WRITE_START = 1,
263 RBD_OBJ_WRITE_OBJECT,
3a482501 264 RBD_OBJ_WRITE_READ_FROM_PARENT,
89a59c1c 265 RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC,
3a482501 266 RBD_OBJ_WRITE_COPYUP_OPS,
926f9b3f
AE
267};
268
bf0d5f50 269struct rbd_obj_request {
43df3d35 270 struct ceph_object_extent ex;
0ad5d953 271 unsigned int flags; /* RBD_OBJ_FLAG_* */
c5b5ef6c 272 union {
a9b67e69 273 enum rbd_obj_read_state read_state; /* for reads */
3da691bf 274 enum rbd_obj_write_state write_state; /* for writes */
c5b5ef6c 275 };
bf0d5f50 276
51c3509e 277 struct rbd_img_request *img_request;
86bd7998
ID
278 struct ceph_file_extent *img_extents;
279 u32 num_img_extents;
bf0d5f50 280
788e2df3 281 union {
5359a17d 282 struct ceph_bio_iter bio_pos;
788e2df3 283 struct {
7e07efb1
ID
284 struct ceph_bvec_iter bvec_pos;
285 u32 bvec_count;
afb97888 286 u32 bvec_idx;
788e2df3
AE
287 };
288 };
7e07efb1
ID
289 struct bio_vec *copyup_bvecs;
290 u32 copyup_bvec_count;
bf0d5f50 291
bcbab1db 292 struct list_head osd_reqs; /* w/ r_private_item */
bf0d5f50 293
85b5e6d1 294 struct mutex state_mutex;
bf0d5f50
AE
295 struct kref kref;
296};
297
0c425248 298enum img_req_flags {
9849e986 299 IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */
d0b2e944 300 IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */
0c425248
AE
301};
302
0192ce2e
ID
303enum rbd_img_state {
304 RBD_IMG_START = 1,
305 __RBD_IMG_OBJECT_REQUESTS,
306 RBD_IMG_OBJECT_REQUESTS,
307};
308
bf0d5f50 309struct rbd_img_request {
bf0d5f50 310 struct rbd_device *rbd_dev;
9bb0248d 311 enum obj_operation_type op_type;
ecc633ca 312 enum obj_request_type data_type;
0c425248 313 unsigned long flags;
0192ce2e 314 enum rbd_img_state state;
bf0d5f50 315 union {
9849e986 316 u64 snap_id; /* for reads */
bf0d5f50 317 struct ceph_snap_context *snapc; /* for writes */
9849e986
AE
318 };
319 union {
320 struct request *rq; /* block request */
321 struct rbd_obj_request *obj_request; /* obj req initiator */
bf0d5f50 322 };
bf0d5f50 323
43df3d35 324 struct list_head object_extents; /* obj_req.ex structs */
bf0d5f50 325
0192ce2e
ID
326 struct mutex state_mutex;
327 struct pending_result pending;
328 struct work_struct work;
329 int work_result;
bf0d5f50
AE
330 struct kref kref;
331};
332
333#define for_each_obj_request(ireq, oreq) \
43df3d35 334 list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
bf0d5f50 335#define for_each_obj_request_safe(ireq, oreq, n) \
43df3d35 336 list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)
bf0d5f50 337
99d16943
ID
338enum rbd_watch_state {
339 RBD_WATCH_STATE_UNREGISTERED,
340 RBD_WATCH_STATE_REGISTERED,
341 RBD_WATCH_STATE_ERROR,
342};
343
ed95b21a
ID
344enum rbd_lock_state {
345 RBD_LOCK_STATE_UNLOCKED,
346 RBD_LOCK_STATE_LOCKED,
347 RBD_LOCK_STATE_RELEASING,
348};
349
350/* WatchNotify::ClientId */
351struct rbd_client_id {
352 u64 gid;
353 u64 handle;
354};
355
f84344f3 356struct rbd_mapping {
99c1f08f 357 u64 size;
34b13184 358 u64 features;
f84344f3
AE
359};
360
602adf40
YS
361/*
362 * a single device
363 */
364struct rbd_device {
de71a297 365 int dev_id; /* blkdev unique id */
602adf40
YS
366
367 int major; /* blkdev assigned major */
dd82fff1 368 int minor;
602adf40 369 struct gendisk *disk; /* blkdev's gendisk and rq */
602adf40 370
a30b71b9 371 u32 image_format; /* Either 1 or 2 */
602adf40
YS
372 struct rbd_client *rbd_client;
373
374 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
375
b82d167b 376 spinlock_t lock; /* queue, flags, open_count */
602adf40
YS
377
378 struct rbd_image_header header;
b82d167b 379 unsigned long flags; /* possibly lock protected */
0d7dbfce 380 struct rbd_spec *spec;
d147543d 381 struct rbd_options *opts;
0d6d1e9c 382 char *config_info; /* add{,_single_major} string */
602adf40 383
c41d13a3 384 struct ceph_object_id header_oid;
922dab61 385 struct ceph_object_locator header_oloc;
971f839a 386
1643dfa4 387 struct ceph_file_layout layout; /* used for all rbd requests */
0903e875 388
99d16943
ID
389 struct mutex watch_mutex;
390 enum rbd_watch_state watch_state;
922dab61 391 struct ceph_osd_linger_request *watch_handle;
99d16943
ID
392 u64 watch_cookie;
393 struct delayed_work watch_dwork;
59c2be1e 394
ed95b21a
ID
395 struct rw_semaphore lock_rwsem;
396 enum rbd_lock_state lock_state;
cbbfb0ff 397 char lock_cookie[32];
ed95b21a
ID
398 struct rbd_client_id owner_cid;
399 struct work_struct acquired_lock_work;
400 struct work_struct released_lock_work;
401 struct delayed_work lock_dwork;
402 struct work_struct unlock_work;
403 wait_queue_head_t lock_waitq;
404
1643dfa4 405 struct workqueue_struct *task_wq;
59c2be1e 406
86b00e0d
AE
407 struct rbd_spec *parent_spec;
408 u64 parent_overlap;
a2acd00e 409 atomic_t parent_ref;
2f82ee54 410 struct rbd_device *parent;
86b00e0d 411
7ad18afa
CH
412 /* Block layer tags. */
413 struct blk_mq_tag_set tag_set;
414
c666601a
JD
415 /* protects updating the header */
416 struct rw_semaphore header_rwsem;
f84344f3
AE
417
418 struct rbd_mapping mapping;
602adf40
YS
419
420 struct list_head node;
dfc5606d 421
dfc5606d
YS
422 /* sysfs related */
423 struct device dev;
b82d167b 424 unsigned long open_count; /* protected by lock */
dfc5606d
YS
425};
426
b82d167b 427/*
87c0fded
ID
428 * Flag bits for rbd_dev->flags:
429 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
430 * by rbd_dev->lock
431 * - BLACKLISTED is protected by rbd_dev->lock_rwsem
b82d167b 432 */
6d292906
AE
433enum rbd_dev_flags {
434 RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */
b82d167b 435 RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */
87c0fded 436 RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
6d292906
AE
437};
438
cfbf6377 439static DEFINE_MUTEX(client_mutex); /* Serialize client creation */
e124a82f 440
602adf40 441static LIST_HEAD(rbd_dev_list); /* devices */
e124a82f
AE
442static DEFINE_SPINLOCK(rbd_dev_list_lock);
443
432b8587
AE
444static LIST_HEAD(rbd_client_list); /* clients */
445static DEFINE_SPINLOCK(rbd_client_list_lock);
602adf40 446
78c2a44a
AE
447/* Slab caches for frequently-allocated structures */
448
1c2a9dfe 449static struct kmem_cache *rbd_img_request_cache;
868311b1 450static struct kmem_cache *rbd_obj_request_cache;
1c2a9dfe 451
9b60e70b 452static int rbd_major;
f8a22fc2
ID
453static DEFINE_IDA(rbd_dev_id_ida);
454
f5ee37bd
ID
455static struct workqueue_struct *rbd_wq;
456
89a59c1c
ID
457static struct ceph_snap_context rbd_empty_snapc = {
458 .nref = REFCOUNT_INIT(1),
459};
460
9b60e70b 461/*
3cfa3b16 462 * single-major requires >= 0.75 version of userspace rbd utility.
9b60e70b 463 */
3cfa3b16 464static bool single_major = true;
5657a819 465module_param(single_major, bool, 0444);
3cfa3b16 466MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");
9b60e70b 467
7e9586ba
GKH
468static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
469static ssize_t remove_store(struct bus_type *bus, const char *buf,
470 size_t count);
471static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
472 size_t count);
473static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
474 size_t count);
6d69bb53 475static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
f0f8cef5 476
9b60e70b
ID
/*
 * Map an rbd device id to its first minor number; each device owns a
 * range of 2^RBD_SINGLE_MAJOR_PART_SHIFT minors for its partitions.
 */
static int rbd_dev_id_to_minor(int dev_id)
{
	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}
481
/* Inverse of rbd_dev_id_to_minor(): recover the device id from a minor. */
static int minor_to_rbd_dev_id(int minor)
{
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}
486
ed95b21a
ID
487static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
488{
489 return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
490 rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
491}
492
/*
 * Report whether we currently own (or are releasing) the exclusive lock.
 * Takes lock_rwsem for reading to get a consistent view of lock_state.
 */
static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	bool is_lock_owner;

	down_read(&rbd_dev->lock_rwsem);
	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	return is_lock_owner;
}
502
/* sysfs: expose the feature bits this client implementation supports. */
static ssize_t supported_features_show(struct bus_type *bus, char *buf)
{
	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}
507
7e9586ba
GKH
508static BUS_ATTR_WO(add);
509static BUS_ATTR_WO(remove);
510static BUS_ATTR_WO(add_single_major);
511static BUS_ATTR_WO(remove_single_major);
512static BUS_ATTR_RO(supported_features);
b15a21dd
GKH
513
514static struct attribute *rbd_bus_attrs[] = {
515 &bus_attr_add.attr,
516 &bus_attr_remove.attr,
9b60e70b
ID
517 &bus_attr_add_single_major.attr,
518 &bus_attr_remove_single_major.attr,
8767b293 519 &bus_attr_supported_features.attr,
b15a21dd 520 NULL,
f0f8cef5 521};
92c76dc0
ID
522
523static umode_t rbd_bus_is_visible(struct kobject *kobj,
524 struct attribute *attr, int index)
525{
9b60e70b
ID
526 if (!single_major &&
527 (attr == &bus_attr_add_single_major.attr ||
528 attr == &bus_attr_remove_single_major.attr))
529 return 0;
530
92c76dc0
ID
531 return attr->mode;
532}
533
534static const struct attribute_group rbd_bus_group = {
535 .attrs = rbd_bus_attrs,
536 .is_visible = rbd_bus_is_visible,
537};
538__ATTRIBUTE_GROUPS(rbd_bus);
f0f8cef5
AE
539
540static struct bus_type rbd_bus_type = {
541 .name = "rbd",
b15a21dd 542 .bus_groups = rbd_bus_groups,
f0f8cef5
AE
543};
544
/* No-op release: rbd_root_dev below is static, nothing to free. */
static void rbd_root_dev_release(struct device *dev)
{
}
548
549static struct device rbd_root_dev = {
550 .init_name = "rbd",
551 .release = rbd_root_dev_release,
552};
553
06ecc6cb
AE
/*
 * printf-style warning helper.  Prefixes the message with the most
 * specific identity available for @rbd_dev: disk name, then image name,
 * then image id, falling back to the struct pointer; a NULL @rbd_dev
 * just gets the driver name.
 */
static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}
580
aafb230e
AE
581#ifdef RBD_DEBUG
582#define rbd_assert(expr) \
583 if (unlikely(!(expr))) { \
584 printk(KERN_ERR "\nAssertion failure in %s() " \
585 "at line %d:\n\n" \
586 "\trbd_assert(%s);\n\n", \
587 __func__, __LINE__, #expr); \
588 BUG(); \
589 }
590#else /* !RBD_DEBUG */
591# define rbd_assert(expr) ((void) 0)
592#endif /* !RBD_DEBUG */
dfc5606d 593
05a46afd 594static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
8b3e1a56 595
cc4a38bd 596static int rbd_dev_refresh(struct rbd_device *rbd_dev);
2df3fac7 597static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
a720ae09 598static int rbd_dev_header_info(struct rbd_device *rbd_dev);
e8f59b59 599static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
54cac61f
AE
600static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
601 u64 snap_id);
2ad3d716
AE
602static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
603 u8 *order, u64 *snap_size);
604static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
605 u64 *snap_features);
59c2be1e 606
54ab3b24 607static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
0192ce2e
ID
608static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);
609
610/*
611 * Return true if nothing else is pending.
612 */
613static bool pending_result_dec(struct pending_result *pending, int *result)
614{
615 rbd_assert(pending->num_pending > 0);
616
617 if (*result && !pending->result)
618 pending->result = *result;
619 if (--pending->num_pending)
620 return false;
621
622 *result = pending->result;
623 return true;
624}
54ab3b24 625
602adf40
YS
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	/*
	 * The REMOVING test and the open_count bump must happen under
	 * rbd_dev->lock so an in-flight unmap can't race a new open.
	 */
	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	/* Pin the device while it is open; dropped in rbd_release(). */
	(void) get_device(&rbd_dev->dev);

	return 0;
}
644
static void rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	/* A release without a matching open would be a driver bug. */
	rbd_assert(open_count_before > 0);

	/* Drop the reference taken in rbd_open(). */
	put_device(&rbd_dev->dev);
}
657
131fd9f6
GZ
/*
 * BLKROSET handler: only validates the request.  Mapped snapshots can
 * never be made writable; everything else is deferred to the block
 * layer by returning -ENOTTY.
 */
static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
{
	int ro;

	if (get_user(ro, (int __user *)arg))
		return -EFAULT;

	/* Snapshots can't be marked read-write */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
		return -EROFS;

	/* Let blkdev_roset() handle it */
	return -ENOTTY;
}
672
/* Block-device ioctl entry point; only BLKROSET gets special handling. */
static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	int ret;

	switch (cmd) {
	case BLKROSET:
		ret = rbd_ioctl_set_ro(rbd_dev, arg);
		break;
	default:
		ret = -ENOTTY;
	}

	return ret;
}
689
#ifdef CONFIG_COMPAT
/* 32-bit compat path: our only ioctl (BLKROSET) needs no translation. */
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */
697
602adf40
YS
698static const struct block_device_operations rbd_bd_ops = {
699 .owner = THIS_MODULE,
700 .open = rbd_open,
dfc5606d 701 .release = rbd_release,
131fd9f6
GZ
702 .ioctl = rbd_ioctl,
703#ifdef CONFIG_COMPAT
704 .compat_ioctl = rbd_compat_ioctl,
705#endif
602adf40
YS
706};
707
708/*
7262cfca 709 * Initialize an rbd client instance. Success or not, this function
cfbf6377 710 * consumes ceph_opts. Caller holds client_mutex.
602adf40 711 */
f8c38929 712static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
602adf40
YS
713{
714 struct rbd_client *rbdc;
715 int ret = -ENOMEM;
716
37206ee5 717 dout("%s:\n", __func__);
602adf40
YS
718 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
719 if (!rbdc)
720 goto out_opt;
721
722 kref_init(&rbdc->kref);
723 INIT_LIST_HEAD(&rbdc->node);
724
74da4a0f 725 rbdc->client = ceph_create_client(ceph_opts, rbdc);
602adf40 726 if (IS_ERR(rbdc->client))
08f75463 727 goto out_rbdc;
43ae4701 728 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
602adf40
YS
729
730 ret = ceph_open_session(rbdc->client);
731 if (ret < 0)
08f75463 732 goto out_client;
602adf40 733
432b8587 734 spin_lock(&rbd_client_list_lock);
602adf40 735 list_add_tail(&rbdc->node, &rbd_client_list);
432b8587 736 spin_unlock(&rbd_client_list_lock);
602adf40 737
37206ee5 738 dout("%s: rbdc %p\n", __func__, rbdc);
bc534d86 739
602adf40 740 return rbdc;
08f75463 741out_client:
602adf40 742 ceph_destroy_client(rbdc->client);
08f75463 743out_rbdc:
602adf40
YS
744 kfree(rbdc);
745out_opt:
43ae4701
AE
746 if (ceph_opts)
747 ceph_destroy_options(ceph_opts);
37206ee5
AE
748 dout("%s: error %d\n", __func__, ret);
749
28f259b7 750 return ERR_PTR(ret);
602adf40
YS
751}
752
2f82ee54
AE
/* Take an additional reference on @rbdc and return it for chaining. */
static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}
759
602adf40 760/*
1f7ba331
AE
761 * Find a ceph client with specific addr and configuration. If
762 * found, bump its reference count.
602adf40 763 */
1f7ba331 764static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
602adf40
YS
765{
766 struct rbd_client *client_node;
1f7ba331 767 bool found = false;
602adf40 768
43ae4701 769 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
602adf40
YS
770 return NULL;
771
1f7ba331
AE
772 spin_lock(&rbd_client_list_lock);
773 list_for_each_entry(client_node, &rbd_client_list, node) {
774 if (!ceph_compare_options(ceph_opts, client_node->client)) {
2f82ee54
AE
775 __rbd_get_client(client_node);
776
1f7ba331
AE
777 found = true;
778 break;
779 }
780 }
781 spin_unlock(&rbd_client_list_lock);
782
783 return found ? client_node : NULL;
602adf40
YS
784}
785
59c2be1e 786/*
210c104c 787 * (Per device) rbd map options
59c2be1e
YS
788 */
789enum {
b5584180 790 Opt_queue_depth,
0c93e1b7 791 Opt_alloc_size,
34f55d0b 792 Opt_lock_timeout,
59c2be1e
YS
793 Opt_last_int,
794 /* int args above */
b26c047b 795 Opt_pool_ns,
59c2be1e
YS
796 Opt_last_string,
797 /* string args above */
cc0538b6
AE
798 Opt_read_only,
799 Opt_read_write,
80de1912 800 Opt_lock_on_read,
e010dd0a 801 Opt_exclusive,
d9360540 802 Opt_notrim,
210c104c 803 Opt_err
59c2be1e
YS
804};
805
43ae4701 806static match_table_t rbd_opts_tokens = {
b5584180 807 {Opt_queue_depth, "queue_depth=%d"},
0c93e1b7 808 {Opt_alloc_size, "alloc_size=%d"},
34f55d0b 809 {Opt_lock_timeout, "lock_timeout=%d"},
59c2be1e 810 /* int args above */
b26c047b 811 {Opt_pool_ns, "_pool_ns=%s"},
59c2be1e 812 /* string args above */
be466c1c 813 {Opt_read_only, "read_only"},
cc0538b6
AE
814 {Opt_read_only, "ro"}, /* Alternate spelling */
815 {Opt_read_write, "read_write"},
816 {Opt_read_write, "rw"}, /* Alternate spelling */
80de1912 817 {Opt_lock_on_read, "lock_on_read"},
e010dd0a 818 {Opt_exclusive, "exclusive"},
d9360540 819 {Opt_notrim, "notrim"},
210c104c 820 {Opt_err, NULL}
59c2be1e
YS
821};
822
98571b5a 823struct rbd_options {
b5584180 824 int queue_depth;
0c93e1b7 825 int alloc_size;
34f55d0b 826 unsigned long lock_timeout;
98571b5a 827 bool read_only;
80de1912 828 bool lock_on_read;
e010dd0a 829 bool exclusive;
d9360540 830 bool trim;
98571b5a
AE
831};
832
b5584180 833#define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ
0c93e1b7 834#define RBD_ALLOC_SIZE_DEFAULT (64 * 1024)
34f55d0b 835#define RBD_LOCK_TIMEOUT_DEFAULT 0 /* no timeout */
98571b5a 836#define RBD_READ_ONLY_DEFAULT false
80de1912 837#define RBD_LOCK_ON_READ_DEFAULT false
e010dd0a 838#define RBD_EXCLUSIVE_DEFAULT false
d9360540 839#define RBD_TRIM_DEFAULT true
98571b5a 840
c300156b
ID
841struct parse_rbd_opts_ctx {
842 struct rbd_spec *spec;
843 struct rbd_options *opts;
844};
845
59c2be1e
YS
/*
 * Parse a single map-option token into the rbd_options / rbd_spec held
 * in the parse_rbd_opts_ctx passed as @private.  Returns 0 on success
 * or a negative errno (bad value, out of range, or unknown option).
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct parse_rbd_opts_ctx *pctx = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < Opt_last_int) {
		/* Integer-valued option: decode the argument up front. */
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad option arg (not int) at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token, argstr[0].from);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_queue_depth:
		if (intval < 1) {
			pr_err("queue_depth out of range\n");
			return -EINVAL;
		}
		pctx->opts->queue_depth = intval;
		break;
	case Opt_alloc_size:
		if (intval < SECTOR_SIZE) {
			pr_err("alloc_size out of range\n");
			return -EINVAL;
		}
		if (!is_power_of_2(intval)) {
			pr_err("alloc_size must be a power of 2\n");
			return -EINVAL;
		}
		pctx->opts->alloc_size = intval;
		break;
	case Opt_lock_timeout:
		/* 0 is "wait forever" (i.e. infinite timeout) */
		if (intval < 0 || intval > INT_MAX / 1000) {
			pr_err("lock_timeout out of range\n");
			return -EINVAL;
		}
		pctx->opts->lock_timeout = msecs_to_jiffies(intval * 1000);
		break;
	case Opt_pool_ns:
		/* Replace any namespace set by an earlier token. */
		kfree(pctx->spec->pool_ns);
		pctx->spec->pool_ns = match_strdup(argstr);
		if (!pctx->spec->pool_ns)
			return -ENOMEM;
		break;
	case Opt_read_only:
		pctx->opts->read_only = true;
		break;
	case Opt_read_write:
		pctx->opts->read_only = false;
		break;
	case Opt_lock_on_read:
		pctx->opts->lock_on_read = true;
		break;
	case Opt_exclusive:
		pctx->opts->exclusive = true;
		break;
	case Opt_notrim:
		pctx->opts->trim = false;
		break;
	default:
		/* libceph prints "bad option" msg */
		return -EINVAL;
	}

	return 0;
}
921
6d2940c8
GZ
922static char* obj_op_name(enum obj_operation_type op_type)
923{
924 switch (op_type) {
925 case OBJ_OP_READ:
926 return "read";
927 case OBJ_OP_WRITE:
928 return "write";
90e98c52
GZ
929 case OBJ_OP_DISCARD:
930 return "discard";
6484cbe9
ID
931 case OBJ_OP_ZEROOUT:
932 return "zeroout";
6d2940c8
GZ
933 default:
934 return "???";
935 }
936}
937
602adf40
YS
/*
 * Destroy ceph client
 *
 * Takes rbd_client_list_lock itself to unlink the client, so the
 * caller must NOT already hold it.  Called via kref_put() when the
 * last reference is dropped.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
955
/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.  NULL is tolerated as a no-op.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}
965
5feb0d8d
ID
966/*
967 * Get a ceph client with specific addr and configuration, if one does
968 * not exist create it. Either way, ceph_opts is consumed by this
969 * function.
970 */
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function (destroyed here when reusing a client, otherwise handed to
 * rbd_client_create()).  Serialized by client_mutex.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret;

	mutex_lock(&client_mutex);
	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		ceph_destroy_options(ceph_opts);

		/*
		 * Using an existing client.  Make sure ->pg_pools is up to
		 * date before we look up the pool id in do_rbd_add().
		 */
		ret = ceph_wait_for_latest_osdmap(rbdc->client,
					rbdc->client->options->mount_timeout);
		if (ret) {
			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
			rbd_put_client(rbdc);
			rbdc = ERR_PTR(ret);
		}
	} else {
		rbdc = rbd_client_create(ceph_opts);
	}
	mutex_unlock(&client_mutex);

	return rbdc;
}
999
a30b71b9
AE
1000static bool rbd_image_format_valid(u32 image_format)
1001{
1002 return image_format == 1 || image_format == 2;
1003}
1004
8e94af8e
AE
/*
 * Sanity-check a format 1 on-disk header before trusting its fields:
 * magic text, object order bounds, and snapshot counts/lengths that
 * would otherwise overflow size_t arithmetic later on.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
1043
5bc3fb17
ID
/*
 * Returns the size (in bytes) of an object in the image; objects are
 * always a power of two, 1 << obj_order.
 */
static u32 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1U << header->obj_order;
}
1051
263423f8
ID
/*
 * Initialize rbd_dev->layout from the image header.  Images without
 * explicit striping (v1, or v2 without STRIPINGV2) get the trivial
 * layout: one whole object per stripe unit.
 */
static void rbd_init_layout(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.stripe_unit == 0 ||
	    rbd_dev->header.stripe_count == 0) {
		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
		rbd_dev->header.stripe_count = 1;
	}

	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
	/* data may live in a separate pool (DATA_POOL feature) */
	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
			rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}
1067
/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 *
 * On first call (object_prefix still NULL) the invariant fields
 * (object prefix, order, layout) are set up as well; on refresh only
 * the mutable fields (size, snapshot context/names/sizes) are
 * replaced.  Returns 0 on success, -ENOMEM or -EIO on failure; on
 * failure the existing header is left untouched.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				 struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	int ret = -ENOMEM;
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		/* bounded copy: ondisk->object_prefix may not be terminated */
		object_prefix = kstrndup(ondisk->object_prefix,
					 sizeof(ondisk->object_prefix),
					 GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */
		snap_sizes = kmalloc_array(snap_count,
					   sizeof(*header->snap_sizes),
					   GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;

		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		rbd_init_layout(rbd_dev);
	} else {
		/* refresh: drop the previous snapshot data */
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}
1168
9682fc6d
AE
1169static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
1170{
1171 const char *snap_name;
1172
1173 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
1174
1175 /* Skip over names until we find the one we are looking for */
1176
1177 snap_name = rbd_dev->header.snap_names;
1178 while (which--)
1179 snap_name += strlen(snap_name) + 1;
1180
1181 return kstrdup(snap_name, GFP_KERNEL);
1182}
1183
30d1cff8
AE
/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * Note that result is for snapshots in *descending* order.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
	u64 snap_id1 = *(u64 *)s1;
	u64 snap_id2 = *(u64 *)s2;

	if (snap_id1 < snap_id2)
		return 1;
	return snap_id1 == snap_id2 ? 0 : -1;
}

/*
 * Search a snapshot context to see if the given snapshot id is
 * present.
 *
 * Returns the position of the snapshot id in the array if it's found,
 * or BAD_SNAP_INDEX otherwise.
 *
 * Note: The snapshot array is kept sorted (by the osd) in
 * reverse order, highest snapshot id first.
 */
static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u64 *found;

	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
			sizeof (snap_id), snapid_compare_reverse);

	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
}
1218
2ad3d716
AE
/*
 * Look up the name of format 1 snapshot @snap_id.  Returns a freshly
 * allocated name, or ERR_PTR(-ENOENT) if the id is unknown /
 * ERR_PTR(-ENOMEM) on allocation failure.
 */
static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	u32 which;
	const char *snap_name;

	which = rbd_dev_snap_index(rbd_dev, snap_id);
	if (which == BAD_SNAP_INDEX)
		return ERR_PTR(-ENOENT);

	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
}

/*
 * Format-independent snapshot name lookup.  CEPH_NOSNAP maps to the
 * static HEAD name; otherwise dispatch on the image format.
 */
static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (rbd_dev->image_format == 1)
		return rbd_dev_v1_snap_name(rbd_dev, snap_id);

	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
}
1244
2ad3d716
AE
/*
 * Fetch the image size as of snapshot @snap_id (CEPH_NOSNAP means the
 * live image).  Format 1 sizes come from the cached header arrays;
 * format 2 queries the osd.  Returns 0 and fills *snap_size, or a
 * negative errno.
 */
static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
			 u64 *snap_size)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_size = rbd_dev->header.image_size;
	} else if (rbd_dev->image_format == 1) {
		u32 which;

		which = rbd_dev_snap_index(rbd_dev, snap_id);
		if (which == BAD_SNAP_INDEX)
			return -ENOENT;

		*snap_size = rbd_dev->header.snap_sizes[which];
	} else {
		u64 size = 0;
		int ret;

		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
		if (ret)
			return ret;

		*snap_size = size;
	}
	return 0;
}
1271
2ad3d716
AE
/*
 * Fetch the feature bits as of snapshot @snap_id (CEPH_NOSNAP means
 * the live image).  Format 1 images have no feature bits; format 2
 * queries the osd.  Returns 0 and fills *snap_features, or a negative
 * errno.
 */
static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
			     u64 *snap_features)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_features = rbd_dev->header.features;
	} else if (rbd_dev->image_format == 1) {
		*snap_features = 0;	/* No features for format 1 */
	} else {
		u64 features = 0;
		int ret;

		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
		if (ret)
			return ret;

		*snap_features = features;
	}
	return 0;
}
1292
/*
 * Populate rbd_dev->mapping (size and features) for the mapped
 * snapshot.  Both lookups are done before either field is written so
 * a failure leaves the mapping unchanged.  Returns 0 or a negative
 * errno.
 */
static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
{
	u64 snap_id = rbd_dev->spec->snap_id;
	u64 size = 0;
	u64 features = 0;
	int ret;

	ret = rbd_snap_size(rbd_dev, snap_id, &size);
	if (ret)
		return ret;
	ret = rbd_snap_features(rbd_dev, snap_id, &features);
	if (ret)
		return ret;

	rbd_dev->mapping.size = size;
	rbd_dev->mapping.features = features;

	return 0;
}
1312
d1cf5788
AE
/* Reset the mapping fields set up by rbd_dev_mapping_set(). */
static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
{
	rbd_dev->mapping.size = 0;
	rbd_dev->mapping.features = 0;
}
1318
5359a17d 1319static void zero_bvec(struct bio_vec *bv)
602adf40 1320{
602adf40 1321 void *buf;
5359a17d 1322 unsigned long flags;
602adf40 1323
5359a17d
ID
1324 buf = bvec_kmap_irq(bv, &flags);
1325 memset(buf, 0, bv->bv_len);
1326 flush_dcache_page(bv->bv_page);
1327 bvec_kunmap_irq(buf, &flags);
602adf40
YS
1328}
1329
5359a17d 1330static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
b9434c5b 1331{
5359a17d 1332 struct ceph_bio_iter it = *bio_pos;
b9434c5b 1333
5359a17d
ID
1334 ceph_bio_iter_advance(&it, off);
1335 ceph_bio_iter_advance_step(&it, bytes, ({
1336 zero_bvec(&bv);
1337 }));
b9434c5b
AE
1338}
1339
7e07efb1 1340static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
602adf40 1341{
7e07efb1 1342 struct ceph_bvec_iter it = *bvec_pos;
602adf40 1343
7e07efb1
ID
1344 ceph_bvec_iter_advance(&it, off);
1345 ceph_bvec_iter_advance_step(&it, bytes, ({
1346 zero_bvec(&bv);
1347 }));
f7760dad
AE
1348}
1349
/*
 * Zero a range in @obj_req data buffer defined by a bio (list) or
 * (private) bio_vec array.
 *
 * @off is relative to the start of the data buffer.
 */
static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
			       u32 bytes)
{
	dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);

	switch (obj_req->img_request->data_type) {
	case OBJ_REQUEST_BIO:
		zero_bios(&obj_req->bio_pos, off, bytes);
		break;
	case OBJ_REQUEST_BVECS:
	case OBJ_REQUEST_OWN_BVECS:
		zero_bvecs(&obj_req->bvec_pos, off, bytes);
		break;
	default:
		BUG();	/* NODATA requests have nothing to zero */
	}
}
1373
bf0d5f50
AE
/* Drop a reference; rbd_obj_request_destroy() runs on the last put. */
static void rbd_obj_request_destroy(struct kref *kref);
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		kref_read(&obj_request->kref));
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}

/* Drop a reference; rbd_img_request_destroy() runs on the last put. */
static void rbd_img_request_destroy(struct kref *kref);
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	dout("%s: img %p (was %d)\n", __func__, img_request,
		kref_read(&img_request->kref));
	kref_put(&img_request->kref, rbd_img_request_destroy);
}
1391
/*
 * Attach an object request to its parent image request.  The image
 * request takes over the object request's initial reference, so no
 * kref_get() here.
 */
static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->img_request == NULL);

	/* Image request now owns object's original reference */
	obj_request->img_request = img_request;
	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
}

/*
 * Detach an object request from its image request and drop the
 * reference the image request held (usually freeing the object
 * request).
 */
static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
	list_del(&obj_request->ex.oe_item);
	rbd_assert(obj_request->img_request == img_request);
	rbd_obj_request_put(obj_request);
}
1410
/*
 * Submit the most recently added OSD request for this object request
 * (the tail of obj_request->osd_reqs) to the OSD client.
 */
static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
{
	struct ceph_osd_request *osd_req =
	    list_last_entry(&obj_request->osd_reqs, struct ceph_osd_request,
			    r_private_item);

	dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
	     obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off,
	     obj_request->ex.oe_len, osd_req);
	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
}
1422
0c425248
AE
/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time
 * and currently never change thereafter.
 */
static void img_request_layered_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_LAYERED, &img_request->flags);
	smp_mb();	/* pairs with the barrier in img_request_layered_test() */
}

static void img_request_layered_clear(struct rbd_img_request *img_request)
{
	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
	smp_mb();	/* pairs with the barrier in img_request_layered_test() */
}

/* Test whether this request is against a layered (cloned) image. */
static bool img_request_layered_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
}
1445
/* Does this object request cover the whole backing object? */
static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

	return !obj_req->ex.oe_off &&
	       obj_req->ex.oe_len == rbd_dev->layout.object_size;
}

/* Does this object request end exactly at the end of the object? */
static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

	return obj_req->ex.oe_off + obj_req->ex.oe_len ==
					rbd_dev->layout.object_size;
}
1461
13488d53
ID
1462/*
1463 * Must be called after rbd_obj_calc_img_extents().
1464 */
1465static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req)
1466{
1467 if (!obj_req->num_img_extents ||
9b17eb2c
ID
1468 (rbd_obj_is_entire(obj_req) &&
1469 !obj_req->img_request->snapc->num_snaps))
13488d53
ID
1470 return false;
1471
1472 return true;
1473}
1474
/* Total number of bytes covered by this request's parent image extents. */
static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
{
	return ceph_file_extents_bytes(obj_req->img_extents,
				       obj_req->num_img_extents);
}
1480
/*
 * Whether this image request modifies data on the OSDs (write,
 * discard and zeroout all do; read does not).
 */
static bool rbd_img_is_write(struct rbd_img_request *img_req)
{
	switch (img_req->op_type) {
	case OBJ_OP_READ:
		return false;
	case OBJ_OP_WRITE:
	case OBJ_OP_DISCARD:
	case OBJ_OP_ZEROOUT:
		return true;
	default:
		BUG();	/* op_type is fixed at creation time */
	}
}
1494
/*
 * OSD request completion callback: normalize the result and hand it
 * to the object request state machine.
 */
static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;
	int result;

	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
	     osd_req->r_result, obj_req);

	/*
	 * Writes aren't allowed to return a data payload.  In some
	 * guarded write cases (e.g. stat + zero on an empty object)
	 * a stat response makes it through, but we don't care.
	 */
	if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
		result = 0;
	else
		result = osd_req->r_result;

	rbd_obj_handle_request(obj_req, result);
}
1515
/* Finalize an OSD request for a read: flag it and set the snapshot id. */
static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;

	osd_req->r_flags = CEPH_OSD_FLAG_READ;
	osd_req->r_snapid = obj_request->img_request->snap_id;
}

/* Finalize an OSD request for a write: flag it, stamp mtime and offset. */
static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;

	osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
	ktime_get_real_ts64(&osd_req->r_mtime);
	osd_req->r_data_offset = obj_request->ex.oe_off;
}
1532
/*
 * Allocate an OSD request with @num_ops op slots for @obj_req, link
 * it onto obj_req->osd_reqs and set up its target (oloc/oid) and
 * callback.  @snapc may be NULL for reads.
 *
 * Returns the request or an ERR_PTR.  Note: once the request is on
 * the osd_reqs list it is owned by the object request and will be
 * freed by rbd_obj_request_destroy() even if a later step here fails.
 */
static struct ceph_osd_request *
__rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
			  struct ceph_snap_context *snapc, int num_ops)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;
	const char *name_format = rbd_dev->image_format == 1 ?
				      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
	int ret;

	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
	if (!req)
		return ERR_PTR(-ENOMEM);

	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
	req->r_callback = rbd_osd_req_callback;
	req->r_priv = obj_req;

	/*
	 * Data objects may be stored in a separate pool, but always in
	 * the same namespace in that pool as the header in its pool.
	 */
	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
	req->r_base_oloc.pool = rbd_dev->layout.pool_id;

	ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
			       rbd_dev->header.object_prefix,
			       obj_req->ex.oe_objno);
	if (ret)
		return ERR_PTR(ret);	/* req stays on osd_reqs for cleanup */

	return req;
}
1567
/*
 * Convenience wrapper: add an OSD request using the image request's
 * snapshot context (the common case for writes).
 */
static struct ceph_osd_request *
rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
{
	return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
					 num_ops);
}
1574
/*
 * Allocate and initialize an object request from the slab cache.
 * Returns NULL on allocation failure.  The caller holds the initial
 * reference.
 */
static struct rbd_obj_request *rbd_obj_request_create(void)
{
	struct rbd_obj_request *obj_request;

	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
	if (!obj_request)
		return NULL;

	ceph_object_extent_init(&obj_request->ex);
	INIT_LIST_HEAD(&obj_request->osd_reqs);
	mutex_init(&obj_request->state_mutex);
	kref_init(&obj_request->kref);

	dout("%s %p\n", __func__, obj_request);
	return obj_request;
}
1591
/*
 * Final kref release callback for an object request: put every OSD
 * request still on the osd_reqs list, free owned data buffers and
 * copyup pages, then return the request to the slab cache.
 */
static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;
	struct ceph_osd_request *osd_req;
	u32 i;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	dout("%s: obj %p\n", __func__, obj_request);

	while (!list_empty(&obj_request->osd_reqs)) {
		osd_req = list_first_entry(&obj_request->osd_reqs,
				    struct ceph_osd_request, r_private_item);
		list_del_init(&osd_req->r_private_item);
		ceph_osdc_put_request(osd_req);
	}

	switch (obj_request->img_request->data_type) {
	case OBJ_REQUEST_NODATA:
	case OBJ_REQUEST_BIO:
	case OBJ_REQUEST_BVECS:
		break;		/* Nothing to do */
	case OBJ_REQUEST_OWN_BVECS:
		/* only OWN_BVECS owns its bio_vec array */
		kfree(obj_request->bvec_pos.bvecs);
		break;
	default:
		BUG();
	}

	kfree(obj_request->img_extents);
	if (obj_request->copyup_bvecs) {
		for (i = 0; i < obj_request->copyup_bvec_count; i++) {
			if (obj_request->copyup_bvecs[i].bv_page)
				__free_page(obj_request->copyup_bvecs[i].bv_page);
		}
		kfree(obj_request->copyup_bvecs);
	}

	kmem_cache_free(rbd_obj_request_cache, obj_request);
}
1632
fb65d228
AE
/* It's OK to call this for a device with no parent */

static void rbd_spec_put(struct rbd_spec *spec);
/* Tear down the parent linkage: device, spec reference and overlap. */
static void rbd_dev_unparent(struct rbd_device *rbd_dev)
{
	rbd_dev_remove_parent(rbd_dev);
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;
}
1643
a2acd00e
AE
/*
 * Parent image reference counting is used to determine when an
 * image's parent fields can be safely torn down--after there are no
 * more in-flight requests to the parent image.  When the last
 * reference is dropped, cleaning them up is safe.
 */
static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
{
	int counter;

	if (!rbd_dev->parent_spec)
		return;

	/* _safe variant won't go below zero; negative means underflow */
	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
	if (counter > 0)
		return;

	/* Last reference; clean up parent data structures */

	if (!counter)
		rbd_dev_unparent(rbd_dev);
	else
		rbd_warn(rbd_dev, "parent reference underflow");
}
1668
/*
 * If an image has a non-zero parent overlap, get a reference to its
 * parent.
 *
 * Returns true if the rbd device has a parent with a non-zero
 * overlap and a reference for it was successfully taken, or
 * false otherwise.
 */
static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
{
	int counter = 0;

	if (!rbd_dev->parent_spec)
		return false;

	/* header_rwsem guards parent_overlap against concurrent refresh */
	down_read(&rbd_dev->header_rwsem);
	if (rbd_dev->parent_overlap)
		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
	up_read(&rbd_dev->header_rwsem);

	if (counter < 0)
		rbd_warn(rbd_dev, "parent reference overflow");

	return counter > 0;
}
1694
bf0d5f50
AE
/*
 * Caller is responsible for filling in the list of object requests
 * that comprises the image request, and the Linux request pointer
 * (if there is one).
 *
 * For writes the image request takes over @snapc; for reads the
 * mapped snapshot id is recorded instead and @snapc is unused.
 * Returns NULL on allocation failure.
 */
static struct rbd_img_request *rbd_img_request_create(
					struct rbd_device *rbd_dev,
					enum obj_operation_type op_type,
					struct ceph_snap_context *snapc)
{
	struct rbd_img_request *img_request;

	img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
	if (!img_request)
		return NULL;

	img_request->rbd_dev = rbd_dev;
	img_request->op_type = op_type;
	if (!rbd_img_is_write(img_request))
		img_request->snap_id = rbd_dev->spec->snap_id;
	else
		img_request->snapc = snapc;

	/* pin the parent while requests may reach it */
	if (rbd_dev_parent_get(rbd_dev))
		img_request_layered_set(img_request);

	INIT_LIST_HEAD(&img_request->object_extents);
	mutex_init(&img_request->state_mutex);
	kref_init(&img_request->kref);

	dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
	     obj_op_name(op_type), img_request);
	return img_request;
}
1729
/*
 * Final kref release callback for an image request: drop all object
 * requests, release the parent reference taken at creation and, for
 * writes, the snapshot context.
 */
static void rbd_img_request_destroy(struct kref *kref)
{
	struct rbd_img_request *img_request;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	img_request = container_of(kref, struct rbd_img_request, kref);

	dout("%s: img %p\n", __func__, img_request);

	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);

	if (img_request_layered_test(img_request)) {
		img_request_layered_clear(img_request);
		rbd_dev_parent_put(img_request->rbd_dev);
	}

	if (rbd_img_is_write(img_request))
		ceph_put_snap_context(img_request->snapc);

	kmem_cache_free(rbd_img_request_cache, img_request);
}
1753
86bd7998
ID
/*
 * Clip a sorted array of parent-image extents to the parent overlap:
 * extents entirely beyond @overlap are dropped, a straddling final
 * extent is shortened.  *num_img_extents is updated in place.
 */
static void prune_extents(struct ceph_file_extent *img_extents,
			  u32 *num_img_extents, u64 overlap)
{
	u32 cnt = *num_img_extents;

	/* drop extents completely beyond the overlap */
	while (cnt && img_extents[cnt - 1].fe_off >= overlap)
		cnt--;

	if (cnt) {
		struct ceph_file_extent *ex = &img_extents[cnt - 1];

		/* trim final overlapping extent */
		if (ex->fe_off + ex->fe_len > overlap)
			ex->fe_len = overlap - ex->fe_off;
	}

	*num_img_extents = cnt;
}
1773
86bd7998
ID
/*
 * Determine the byte range(s) covered by either just the object extent
 * or the entire object in the parent image.
 *
 * Fills obj_req->img_extents / num_img_extents, clipped to the parent
 * overlap.  A device with no parent overlap gets no extents.  Returns
 * 0 or a negative errno.
 */
static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
				    bool entire)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	int ret;

	if (!rbd_dev->parent_overlap)
		return 0;

	ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
				  entire ? 0 : obj_req->ex.oe_off,
				  entire ? rbd_dev->layout.object_size :
							obj_req->ex.oe_len,
				  &obj_req->img_extents,
				  &obj_req->num_img_extents);
	if (ret)
		return ret;

	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
		      rbd_dev->parent_overlap);
	return 0;
}
1800
/*
 * Attach the object request's data buffer (bio chain or bio_vec
 * array) to op @which of @osd_req.
 */
static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;

	switch (obj_req->img_request->data_type) {
	case OBJ_REQUEST_BIO:
		osd_req_op_extent_osd_data_bio(osd_req, which,
					       &obj_req->bio_pos,
					       obj_req->ex.oe_len);
		break;
	case OBJ_REQUEST_BVECS:
	case OBJ_REQUEST_OWN_BVECS:
		rbd_assert(obj_req->bvec_pos.iter.bi_size ==
							obj_req->ex.oe_len);
		rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
		osd_req_op_extent_osd_data_bvec_pos(osd_req, which,
						    &obj_req->bvec_pos);
		break;
	default:
		BUG();	/* NODATA ops never carry a data payload */
	}
}
1217857f 1823
3da691bf
ID
/*
 * Build the single-op OSD read request for this object request.
 * Reads use a NULL snap context (snapc is a write-side concept).
 * Returns 0 or a negative errno.
 */
static int rbd_obj_setup_read(struct rbd_obj_request *obj_req)
{
	struct ceph_osd_request *osd_req;

	osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1);
	if (IS_ERR(osd_req))
		return PTR_ERR(osd_req);

	osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ,
			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
	rbd_osd_setup_data(osd_req, 0);

	rbd_osd_format_read(osd_req);
	obj_req->read_state = RBD_OBJ_READ_START;
	return 0;
}
1840
/*
 * Add a STAT op at slot @which, used as the existence guard for
 * copyup-enabled writes.  Allocates one page for the response;
 * ownership passes to the OSD request (own_pages = true).  Returns 0
 * or a negative errno.
 */
static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which)
{
	struct page **pages;

	/*
	 * The response data for a STAT call consists of:
	 *     le64 length;
	 *     struct {
	 *         le32 tv_sec;
	 *         le32 tv_nsec;
	 *     } mtime;
	 */
	pages = ceph_alloc_page_vector(1, GFP_NOIO);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0);
	osd_req_op_raw_data_in_pages(osd_req, which, pages,
				     8 + sizeof(struct ceph_timespec),
				     0, false, true);
	return 0;
}
1863
13488d53
ID
/* Number of OSD ops a plain write needs (see __rbd_osd_setup_write_ops()). */
static int count_write_ops(struct rbd_obj_request *obj_req)
{
	return 2; /* setallochint + write/writefull */
}
1868
bcbab1db
ID
/*
 * Populate the write ops starting at slot @which: an allocation hint
 * followed by WRITEFULL (whole object) or WRITE (partial), then
 * finalize the request for a write.
 */
static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
				      int which)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	u16 opcode;

	osd_req_op_alloc_hint_init(osd_req, which++,
				   rbd_dev->layout.object_size,
				   rbd_dev->layout.object_size);

	if (rbd_obj_is_entire(obj_req))
		opcode = CEPH_OSD_OP_WRITEFULL;
	else
		opcode = CEPH_OSD_OP_WRITE;

	osd_req_op_extent_init(osd_req, which, opcode,
			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
	rbd_osd_setup_data(osd_req, which);

	rbd_osd_format_write(osd_req);
}
2169238d 1891
3da691bf
ID
/*
 * Build the OSD request for a write: optional stat guard (when copyup
 * may be needed) plus alloc-hint and write ops.  Returns 0 or a
 * negative errno; on error any allocated OSD request remains on
 * obj_req->osd_reqs and is cleaned up with the object request.
 */
static int rbd_obj_setup_write(struct rbd_obj_request *obj_req)
{
	struct ceph_osd_request *osd_req;
	unsigned int num_osd_ops, which = 0;
	int ret;

	/* reverse map the entire object onto the parent */
	ret = rbd_obj_calc_img_extents(obj_req, true);
	if (ret)
		return ret;

	if (rbd_obj_copyup_enabled(obj_req))
		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;

	num_osd_ops = count_write_ops(obj_req);
	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
		num_osd_ops++; /* stat */

	osd_req = rbd_obj_add_osd_request(obj_req, num_osd_ops);
	if (IS_ERR(osd_req))
		return PTR_ERR(osd_req);

	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
		ret = rbd_osd_setup_stat(osd_req, which++);
		if (ret)
			return ret;
	}

	obj_req->write_state = RBD_OBJ_WRITE_START;
	__rbd_osd_setup_write_ops(osd_req, which);
	return 0;
}
1924
6484cbe9
ID
1925static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
1926{
1927 return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
1928 CEPH_OSD_OP_ZERO;
1929}
1930
/*
 * Build the OSD request for a discard.  The range is aligned to the
 * configured alloc_size; discards too small to free space return 1
 * (caller treats the object request as a no-op).  A whole-object
 * discard with no parent data becomes DELETE, otherwise
 * TRUNCATE/ZERO.  Returns 0, 1, or a negative errno.
 */
static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	struct ceph_osd_request *osd_req;
	u64 off = obj_req->ex.oe_off;
	u64 next_off = obj_req->ex.oe_off + obj_req->ex.oe_len;
	int ret;

	/*
	 * Align the range to alloc_size boundary and punt on discards
	 * that are too small to free up any space.
	 *
	 * alloc_size == object_size && is_tail() is a special case for
	 * filestore with filestore_punch_hole = false, needed to allow
	 * truncate (in addition to delete).
	 */
	if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
	    !rbd_obj_is_tail(obj_req)) {
		off = round_up(off, rbd_dev->opts->alloc_size);
		next_off = round_down(next_off, rbd_dev->opts->alloc_size);
		if (off >= next_off)
			return 1;
	}

	/* reverse map the entire object onto the parent */
	ret = rbd_obj_calc_img_extents(obj_req, true);
	if (ret)
		return ret;

	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
		obj_req->flags |= RBD_OBJ_FLAG_DELETION;

	osd_req = rbd_obj_add_osd_request(obj_req, 1);
	if (IS_ERR(osd_req))
		return PTR_ERR(osd_req);

	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
		rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
		osd_req_op_init(osd_req, 0, CEPH_OSD_OP_DELETE, 0);
	} else {
		dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
		     obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
		     off, next_off - off);
		osd_req_op_extent_init(osd_req, 0,
				       truncate_or_zero_opcode(obj_req),
				       off, next_off - off, 0, 0);
	}

	obj_req->write_state = RBD_OBJ_WRITE_START;
	rbd_osd_format_write(osd_req);
	return 0;
}
1983
13488d53
ID
1984static int count_zeroout_ops(struct rbd_obj_request *obj_req)
1985{
1986 int num_osd_ops;
1987
9b17eb2c
ID
1988 if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
1989 !rbd_obj_copyup_enabled(obj_req))
13488d53
ID
1990 num_osd_ops = 2; /* create + truncate */
1991 else
1992 num_osd_ops = 1; /* delete/truncate/zero */
1993
1994 return num_osd_ops;
1995}
1996
bcbab1db
ID
/*
 * Append the actual zeroout ops to @osd_req starting at index @which.
 * The choice between create+truncate, delete, truncate and zero
 * depends on whether the entire object is covered and whether parent
 * data (image extents) exists.
 */
static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req,
					int which)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;
	u16 opcode;

	if (rbd_obj_is_entire(obj_req)) {
		if (obj_req->num_img_extents) {
			/* without copyup, ensure the object exists first */
			if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
				osd_req_op_init(osd_req, which++,
						CEPH_OSD_OP_CREATE, 0);
			opcode = CEPH_OSD_OP_TRUNCATE;
		} else {
			rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
			osd_req_op_init(osd_req, which++,
					CEPH_OSD_OP_DELETE, 0);
			opcode = 0;	/* delete is the only op needed */
		}
	} else {
		opcode = truncate_or_zero_opcode(obj_req);
	}

	if (opcode)
		osd_req_op_extent_init(osd_req, which, opcode,
				       obj_req->ex.oe_off, obj_req->ex.oe_len,
				       0, 0);

	rbd_osd_format_write(osd_req);
}
2026
/*
 * Allocate and populate the OSD request for a zeroout of @obj_req.
 *
 * Returns 0 on success or a negative errno.
 */
static int rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req)
{
	struct ceph_osd_request *osd_req;
	unsigned int num_osd_ops, which = 0;
	int ret;

	/* reverse map the entire object onto the parent */
	ret = rbd_obj_calc_img_extents(obj_req, true);
	if (ret)
		return ret;

	if (rbd_obj_copyup_enabled(obj_req))
		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
	if (!obj_req->num_img_extents) {
		/* no parent data -- a whole-object zeroout is a delete */
		if (rbd_obj_is_entire(obj_req))
			obj_req->flags |= RBD_OBJ_FLAG_DELETION;
	}

	num_osd_ops = count_zeroout_ops(obj_req);
	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
		num_osd_ops++; /* stat */

	osd_req = rbd_obj_add_osd_request(obj_req, num_osd_ops);
	if (IS_ERR(osd_req))
		return PTR_ERR(osd_req);

	/* a stat op precedes the zeroout ops when copyup may be needed */
	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
		ret = rbd_osd_setup_stat(osd_req, which++);
		if (ret)
			return ret;
	}

	obj_req->write_state = RBD_OBJ_WRITE_START;
	__rbd_osd_setup_zeroout_ops(osd_req, which);
	return 0;
}
9d4df01f 2063
3da691bf
ID
/*
 * For each object request in @img_req, allocate an OSD request, add
 * individual OSD ops and prepare them for submission. The number of
 * OSD ops depends on op_type and the overlap point (if any).
 *
 * Returns 0 on success or a negative errno.  An object request whose
 * setup returns a positive value is a no-op and is deleted from the
 * image request.
 */
static int __rbd_img_fill_request(struct rbd_img_request *img_req)
{
	struct rbd_obj_request *obj_req, *next_obj_req;
	struct ceph_osd_request *osd_req;
	int ret;

	for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
		switch (img_req->op_type) {
		case OBJ_OP_READ:
			ret = rbd_obj_setup_read(obj_req);
			break;
		case OBJ_OP_WRITE:
			ret = rbd_obj_setup_write(obj_req);
			break;
		case OBJ_OP_DISCARD:
			ret = rbd_obj_setup_discard(obj_req);
			break;
		case OBJ_OP_ZEROOUT:
			ret = rbd_obj_setup_zeroout(obj_req);
			break;
		default:
			BUG();
		}
		if (ret < 0)
			return ret;
		if (ret > 0) {
			/* no-op object request -- drop it */
			rbd_img_obj_request_del(img_req, obj_req);
			continue;
		}

		/* allocate messages for the OSD request setup just added */
		osd_req = list_last_entry(&obj_req->osd_reqs,
					  struct ceph_osd_request,
					  r_private_item);
		ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
		if (ret)
			return ret;
	}

	img_req->state = RBD_IMG_START;
	return 0;
}
bf0d5f50 2110
5a237819
ID
/* iterator over the data position, in either a bio chain or a bio_vec array */
union rbd_img_fill_iter {
	struct ceph_bio_iter bio_iter;
	struct ceph_bvec_iter bvec_iter;
};
bf0d5f50 2115
5a237819
ID
/*
 * Context for mapping image extents onto object requests; used by
 * rbd_img_fill_request() and rbd_img_fill_request_nocopy().
 */
struct rbd_img_fill_ctx {
	enum obj_request_type pos_type;		/* what @pos refers to */
	union rbd_img_fill_iter *pos;		/* start of the data buffer */
	union rbd_img_fill_iter iter;		/* working copy of *@pos */
	ceph_object_extent_fn_t set_pos_fn;	/* nocopy path */
	ceph_object_extent_fn_t count_fn;	/* fancy-layout copy path */
	ceph_object_extent_fn_t copy_fn;	/* fancy-layout copy path */
};
bf0d5f50 2124
5a237819 2125static struct ceph_object_extent *alloc_object_extent(void *arg)
0eefd470 2126{
5a237819
ID
2127 struct rbd_img_request *img_req = arg;
2128 struct rbd_obj_request *obj_req;
0eefd470 2129
5a237819
ID
2130 obj_req = rbd_obj_request_create();
2131 if (!obj_req)
2132 return NULL;
2761713d 2133
5a237819
ID
2134 rbd_img_obj_request_add(img_req, obj_req);
2135 return &obj_req->ex;
2136}
0eefd470 2137
afb97888
ID
2138/*
2139 * While su != os && sc == 1 is technically not fancy (it's the same
2140 * layout as su == os && sc == 1), we can't use the nocopy path for it
2141 * because ->set_pos_fn() should be called only once per object.
2142 * ceph_file_to_extents() invokes action_fn once per stripe unit, so
2143 * treat su != os && sc == 1 as fancy.
2144 */
2145static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
2146{
2147 return l->stripe_unit != l->object_size;
2148}
0eefd470 2149
afb97888
ID
/*
 * Map @img_extents onto object requests without copying bio_vecs:
 * each object request's data descriptor points directly into the
 * provided bio (list) or bio_vec array.
 *
 * Returns 0 on success or a negative errno.
 */
static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
				       struct ceph_file_extent *img_extents,
				       u32 num_img_extents,
				       struct rbd_img_fill_ctx *fctx)
{
	u32 i;
	int ret;

	img_req->data_type = fctx->pos_type;

	/*
	 * Create object requests and set each object request's starting
	 * position in the provided bio (list) or bio_vec array.
	 */
	fctx->iter = *fctx->pos;
	for (i = 0; i < num_img_extents; i++) {
		ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
					   img_extents[i].fe_off,
					   img_extents[i].fe_len,
					   &img_req->object_extents,
					   alloc_object_extent, img_req,
					   fctx->set_pos_fn, &fctx->iter);
		if (ret)
			return ret;
	}

	return __rbd_img_fill_request(img_req);
}
2178
5a237819
ID
/*
 * Map a list of image extents to a list of object extents, create the
 * corresponding object requests (normally each to a different object,
 * but not always) and add them to @img_req.  For each object request,
 * set up its data descriptor to point to the corresponding chunk(s) of
 * @fctx->pos data buffer.
 *
 * Because ceph_file_to_extents() will merge adjacent object extents
 * together, each object request's data descriptor may point to multiple
 * different chunks of @fctx->pos data buffer.
 *
 * @fctx->pos data buffer is assumed to be large enough.
 */
static int rbd_img_fill_request(struct rbd_img_request *img_req,
				struct ceph_file_extent *img_extents,
				u32 num_img_extents,
				struct rbd_img_fill_ctx *fctx)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	struct rbd_obj_request *obj_req;
	u32 i;
	int ret;

	/* simple layouts (and no-data requests) avoid copying entirely */
	if (fctx->pos_type == OBJ_REQUEST_NODATA ||
	    !rbd_layout_is_fancy(&rbd_dev->layout))
		return rbd_img_fill_request_nocopy(img_req, img_extents,
						   num_img_extents, fctx);

	img_req->data_type = OBJ_REQUEST_OWN_BVECS;

	/*
	 * Create object requests and determine ->bvec_count for each object
	 * request.  Note that ->bvec_count sum over all object requests may
	 * be greater than the number of bio_vecs in the provided bio (list)
	 * or bio_vec array because when mapped, those bio_vecs can straddle
	 * stripe unit boundaries.
	 */
	fctx->iter = *fctx->pos;
	for (i = 0; i < num_img_extents; i++) {
		ret = ceph_file_to_extents(&rbd_dev->layout,
					   img_extents[i].fe_off,
					   img_extents[i].fe_len,
					   &img_req->object_extents,
					   alloc_object_extent, img_req,
					   fctx->count_fn, &fctx->iter);
		if (ret)
			return ret;
	}

	for_each_obj_request(img_req, obj_req) {
		obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
					      sizeof(*obj_req->bvec_pos.bvecs),
					      GFP_NOIO);
		if (!obj_req->bvec_pos.bvecs)
			return -ENOMEM;
	}

	/*
	 * Fill in each object request's private bio_vec array, splitting and
	 * rearranging the provided bio_vecs in stripe unit chunks as needed.
	 */
	fctx->iter = *fctx->pos;
	for (i = 0; i < num_img_extents; i++) {
		ret = ceph_iterate_extents(&rbd_dev->layout,
					   img_extents[i].fe_off,
					   img_extents[i].fe_len,
					   &img_req->object_extents,
					   fctx->copy_fn, &fctx->iter);
		if (ret)
			return ret;
	}

	return __rbd_img_fill_request(img_req);
}
2253
2254static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
2255 u64 off, u64 len)
2256{
2257 struct ceph_file_extent ex = { off, len };
2258 union rbd_img_fill_iter dummy;
2259 struct rbd_img_fill_ctx fctx = {
2260 .pos_type = OBJ_REQUEST_NODATA,
2261 .pos = &dummy,
2262 };
2263
2264 return rbd_img_fill_request(img_req, &ex, 1, &fctx);
2265}
2266
/* ->set_pos_fn for bios: record where this object's data starts. */
static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bio_iter *it = arg;

	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
	obj_req->bio_pos = *it;
	ceph_bio_iter_advance(it, bytes);
}
3d7efd18 2277
afb97888
ID
/* ->count_fn for bios: tally bio_vecs this object's chunk spans. */
static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bio_iter *it = arg;

	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
	ceph_bio_iter_advance_step(it, bytes, ({
		obj_req->bvec_count++;
	}));
}
0eefd470 2290
afb97888
ID
/* ->copy_fn for bios: copy bio_vecs into the object's private array. */
static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bio_iter *it = arg;

	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
	ceph_bio_iter_advance_step(it, bytes, ({
		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
	}));
}
2303
5a237819
ID
/* Fill @img_req with data supplied at @bio_pos for @img_extents. */
static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
				   struct ceph_file_extent *img_extents,
				   u32 num_img_extents,
				   struct ceph_bio_iter *bio_pos)
{
	struct rbd_img_fill_ctx fctx = {
		.pos_type = OBJ_REQUEST_BIO,
		.pos = (union rbd_img_fill_iter *)bio_pos,
		.set_pos_fn = set_bio_pos,
		.count_fn = count_bio_bvecs,
		.copy_fn = copy_bio_bvecs,
	};

	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
				    &fctx);
}
3d7efd18 2320
5a237819
ID
/* Fill @img_req for a single extent [@off, @off + @len) backed by @bio. */
static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
				 u64 off, u64 len, struct bio *bio)
{
	struct ceph_file_extent ex = { off, len };
	struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };

	return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
}
a9e8ba2c 2329
5a237819
ID
/* ->set_pos_fn for bio_vec arrays: record where this object's data starts. */
static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bvec_iter *it = arg;

	obj_req->bvec_pos = *it;
	ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
	ceph_bvec_iter_advance(it, bytes);
}
3d7efd18 2340
afb97888
ID
/* ->count_fn for bio_vec arrays: tally bio_vecs this object's chunk spans. */
static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bvec_iter *it = arg;

	ceph_bvec_iter_advance_step(it, bytes, ({
		obj_req->bvec_count++;
	}));
}
058aa991 2351
afb97888
ID
/* ->copy_fn for bio_vec arrays: copy into the object's private array. */
static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bvec_iter *it = arg;

	ceph_bvec_iter_advance_step(it, bytes, ({
		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
	}));
}
2363
5a237819
ID
/* Fill @img_req with data supplied at @bvec_pos for @img_extents. */
static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
				     struct ceph_file_extent *img_extents,
				     u32 num_img_extents,
				     struct ceph_bvec_iter *bvec_pos)
{
	struct rbd_img_fill_ctx fctx = {
		.pos_type = OBJ_REQUEST_BVECS,
		.pos = (union rbd_img_fill_iter *)bvec_pos,
		.set_pos_fn = set_bvec_pos,
		.count_fn = count_bvecs,
		.copy_fn = copy_bvecs,
	};

	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
				    &fctx);
}
c5b5ef6c 2380
5a237819
ID
/* Fill @img_req from a bare bio_vec array sized to cover @img_extents. */
static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
				   struct ceph_file_extent *img_extents,
				   u32 num_img_extents,
				   struct bio_vec *bvecs)
{
	struct ceph_bvec_iter it = {
		.bvecs = bvecs,
		.iter = { .bi_size = ceph_file_extents_bytes(img_extents,
							     num_img_extents) },
	};

	return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
					 &it);
}
c5b5ef6c 2395
/* Workqueue callback: resume the image request state machine. */
static void rbd_img_handle_request_work(struct work_struct *work)
{
	struct rbd_img_request *img_req =
	    container_of(work, struct rbd_img_request, work);

	rbd_img_handle_request(img_req, img_req->work_result);
}
c2e82414 2403
0192ce2e
ID
/* Defer rbd_img_handle_request(@img_req, @result) to the rbd workqueue. */
static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
{
	INIT_WORK(&img_req->work, rbd_img_handle_request_work);
	img_req->work_result = result;
	queue_work(rbd_wq, &img_req->work);
}
2410
85b5e6d1
ID
/* Put the prepared read OSD request for @obj_req on the wire. */
static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
{
	rbd_obj_request_submit(obj_req);
	return 0;
}
2416
/*
 * Read the data backing @obj_req's image extents from the parent image
 * by issuing a child read image request against it.  The child is
 * marked IMG_REQ_CHILD so completion funnels back into @obj_req.
 *
 * Returns 0 on success or a negative errno.
 */
static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
{
	struct rbd_img_request *img_req = obj_req->img_request;
	struct rbd_img_request *child_img_req;
	int ret;

	child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
					       OBJ_OP_READ, NULL);
	if (!child_img_req)
		return -ENOMEM;

	__set_bit(IMG_REQ_CHILD, &child_img_req->flags);
	child_img_req->obj_request = obj_req;

	if (!rbd_img_is_write(img_req)) {
		/* plain read -- parent data lands in the original buffer */
		switch (img_req->data_type) {
		case OBJ_REQUEST_BIO:
			ret = __rbd_img_fill_from_bio(child_img_req,
						      obj_req->img_extents,
						      obj_req->num_img_extents,
						      &obj_req->bio_pos);
			break;
		case OBJ_REQUEST_BVECS:
		case OBJ_REQUEST_OWN_BVECS:
			ret = __rbd_img_fill_from_bvecs(child_img_req,
						      obj_req->img_extents,
						      obj_req->num_img_extents,
						      &obj_req->bvec_pos);
			break;
		default:
			BUG();
		}
	} else {
		/* write path -- parent data lands in the copyup buffer */
		ret = rbd_img_fill_from_bvecs(child_img_req,
					      obj_req->img_extents,
					      obj_req->num_img_extents,
					      obj_req->copyup_bvecs);
	}
	if (ret) {
		rbd_img_request_put(child_img_req);
		return ret;
	}

	/* avoid parent chain recursion */
	rbd_img_schedule(child_img_req, 0);
	return 0;
}
2464
/*
 * Advance the read state machine for @obj_req.  *@result carries the
 * outcome of the previous step in and the final outcome out.
 *
 * Returns true when the object request is done, false when it is
 * waiting on an asynchronous step (OSD reply or parent read).
 */
static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	int ret;

	switch (obj_req->read_state) {
	case RBD_OBJ_READ_START:
		rbd_assert(!*result);

		ret = rbd_obj_read_object(obj_req);
		if (ret) {
			*result = ret;
			return true;
		}
		obj_req->read_state = RBD_OBJ_READ_OBJECT;
		return false;
	case RBD_OBJ_READ_OBJECT:
		if (*result == -ENOENT && rbd_dev->parent_overlap) {
			/* reverse map this object extent onto the parent */
			ret = rbd_obj_calc_img_extents(obj_req, false);
			if (ret) {
				*result = ret;
				return true;
			}
			if (obj_req->num_img_extents) {
				ret = rbd_obj_read_from_parent(obj_req);
				if (ret) {
					*result = ret;
					return true;
				}
				obj_req->read_state = RBD_OBJ_READ_PARENT;
				return false;
			}
		}

		/*
		 * -ENOENT means a hole in the image -- zero-fill the entire
		 * length of the request.  A short read also implies zero-fill
		 * to the end of the request.
		 */
		if (*result == -ENOENT) {
			rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len);
			*result = 0;
		} else if (*result >= 0) {
			if (*result < obj_req->ex.oe_len)
				rbd_obj_zero_range(obj_req, *result,
						   obj_req->ex.oe_len - *result);
			else
				rbd_assert(*result == obj_req->ex.oe_len);
			*result = 0;
		}
		return true;
	case RBD_OBJ_READ_PARENT:
		/* parent read finished -- *result stands as-is */
		return true;
	default:
		BUG();
	}
}
c5b5ef6c 2523
85b5e6d1
ID
/* Put the prepared write OSD request for @obj_req on the wire. */
static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
{
	rbd_obj_request_submit(obj_req);
	return 0;
}
2529
3da691bf
ID
/*
 * Return true iff the first @bytes of @bvecs are all zero.
 *
 * copyup_bvecs pages are never highmem pages, so page_address() is
 * safe here.
 */
static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
{
	struct ceph_bvec_iter it = {
		.bvecs = bvecs,
		.iter = { .bi_size = bytes },
	};

	ceph_bvec_iter_advance_step(&it, bytes, ({
		if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
			       bv.bv_len))
			return false;
	}));
	return true;
}
2547
3a482501
ID
2548#define MODS_ONLY U32_MAX
2549
89a59c1c
ID
/*
 * Issue a copyup request with an empty snapshot context so the object
 * is deep-copyuped through all existing snapshots; the modification
 * itself follows in a second request (see rbd_obj_issue_copyup()).
 *
 * Returns 0 on success or a negative errno.
 */
static int rbd_obj_issue_copyup_empty_snapc(struct rbd_obj_request *obj_req,
					    u32 bytes)
{
	struct ceph_osd_request *osd_req;
	int ret;

	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
	rbd_assert(bytes > 0 && bytes != MODS_ONLY);

	osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1);
	if (IS_ERR(osd_req))
		return PTR_ERR(osd_req);

	ret = osd_req_op_cls_init(osd_req, 0, "rbd", "copyup");
	if (ret)
		return ret;

	osd_req_op_cls_request_data_bvecs(osd_req, 0,
					  obj_req->copyup_bvecs,
					  obj_req->copyup_bvec_count,
					  bytes);
	rbd_osd_format_write(osd_req);

	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
	if (ret)
		return ret;

	rbd_obj_request_submit(obj_req);
	return 0;
}
2580
/*
 * Issue the copyup cls call (unless @bytes == MODS_ONLY) followed by
 * the actual write or zeroout ops, all in a single OSD request.
 *
 * Returns 0 on success or a negative errno.
 */
static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes)
{
	struct rbd_img_request *img_req = obj_req->img_request;
	struct ceph_osd_request *osd_req;
	unsigned int num_osd_ops = (bytes != MODS_ONLY);
	unsigned int which = 0;
	int ret;

	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);

	switch (img_req->op_type) {
	case OBJ_OP_WRITE:
		num_osd_ops += count_write_ops(obj_req);
		break;
	case OBJ_OP_ZEROOUT:
		num_osd_ops += count_zeroout_ops(obj_req);
		break;
	default:
		BUG();
	}

	osd_req = rbd_obj_add_osd_request(obj_req, num_osd_ops);
	if (IS_ERR(osd_req))
		return PTR_ERR(osd_req);

	if (bytes != MODS_ONLY) {
		ret = osd_req_op_cls_init(osd_req, which, "rbd",
					  "copyup");
		if (ret)
			return ret;

		osd_req_op_cls_request_data_bvecs(osd_req, which++,
						  obj_req->copyup_bvecs,
						  obj_req->copyup_bvec_count,
						  bytes);
	}

	/* append the actual modification ops after the copyup call */
	switch (img_req->op_type) {
	case OBJ_OP_WRITE:
		__rbd_osd_setup_write_ops(osd_req, which);
		break;
	case OBJ_OP_ZEROOUT:
		__rbd_osd_setup_zeroout_ops(osd_req, which);
		break;
	default:
		BUG();
	}

	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
	if (ret)
		return ret;

	rbd_obj_request_submit(obj_req);
	return 0;
}
2636
3a482501
ID
/*
 * Start the copyup sequence for @obj_req with @bytes of parent data,
 * setting ->write_state accordingly.  Returns 0 or a negative errno.
 */
static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes)
{
	/*
	 * Only send non-zero copyup data to save some I/O and network
	 * bandwidth -- zero copyup data is equivalent to the object not
	 * existing.
	 */
	if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) {
		dout("%s obj_req %p detected zeroes\n", __func__, obj_req);
		bytes = 0;
	}

	if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
		/*
		 * Send a copyup request with an empty snapshot context to
		 * deep-copyup the object through all existing snapshots.
		 * A second request with the current snapshot context will be
		 * sent for the actual modification.
		 */
		obj_req->write_state = RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC;
		return rbd_obj_issue_copyup_empty_snapc(obj_req, bytes);
	}

	obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS;
	return rbd_obj_issue_copyup_ops(obj_req, bytes);
}
2663
/*
 * Allocate a zero-offset, page-sized bio_vec array large enough to
 * hold @obj_overlap bytes of parent data for copyup.
 *
 * Returns 0 on success or -ENOMEM; partially allocated pages are
 * cleaned up when the object request is destroyed.
 */
static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
{
	u32 i;

	rbd_assert(!obj_req->copyup_bvecs);
	obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
	obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
					sizeof(*obj_req->copyup_bvecs),
					GFP_NOIO);
	if (!obj_req->copyup_bvecs)
		return -ENOMEM;

	for (i = 0; i < obj_req->copyup_bvec_count; i++) {
		unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);

		obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
		if (!obj_req->copyup_bvecs[i].bv_page)
			return -ENOMEM;

		obj_req->copyup_bvecs[i].bv_offset = 0;
		obj_req->copyup_bvecs[i].bv_len = len;
		obj_overlap -= len;
	}

	/* all of the overlap must be covered by the allocated pages */
	rbd_assert(!obj_overlap);
	return 0;
}
2691
0ad5d953
ID
/*
 * The target object doesn't exist.  Read the data for the entire
 * target object up to the overlap point (if any) from the parent,
 * so we can use it for a copyup.
 *
 * Returns 0 on success or a negative errno.
 */
static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	int ret;

	rbd_assert(obj_req->num_img_extents);
	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
		      rbd_dev->parent_overlap);
	if (!obj_req->num_img_extents) {
		/*
		 * The overlap has become 0 (most likely because the
		 * image has been flattened).  Re-submit the original write
		 * request -- pass MODS_ONLY since the copyup isn't needed
		 * anymore.
		 */
		obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS;
		return rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY);
	}

	ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
	if (ret)
		return ret;

	obj_req->write_state = RBD_OBJ_WRITE_READ_FROM_PARENT;
	return rbd_obj_read_from_parent(obj_req);
}
8b3e1a56 2723
/*
 * Advance the write state machine for @obj_req.  *@result carries the
 * outcome of the previous step in and the final outcome out.
 *
 * Returns true when the object request is done, false when it is
 * waiting on an asynchronous step (OSD reply, parent read or copyup).
 */
static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
{
	int ret;

	switch (obj_req->write_state) {
	case RBD_OBJ_WRITE_START:
		rbd_assert(!*result);

		ret = rbd_obj_write_object(obj_req);
		if (ret) {
			*result = ret;
			return true;
		}
		obj_req->write_state = RBD_OBJ_WRITE_OBJECT;
		return false;
	case RBD_OBJ_WRITE_OBJECT:
		if (*result == -ENOENT) {
			if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
				/* stat op failed -- go read from the parent */
				ret = rbd_obj_handle_write_guard(obj_req);
				if (ret) {
					*result = ret;
					return true;
				}
				return false;
			}
			/*
			 * On a non-existent object:
			 *   delete - -ENOENT, truncate/zero - 0
			 */
			if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
				*result = 0;
		}
		/* fall through */
	case RBD_OBJ_WRITE_COPYUP_OPS:
		return true;
	case RBD_OBJ_WRITE_READ_FROM_PARENT:
		if (*result)
			return true;

		ret = rbd_obj_issue_copyup(obj_req,
					   rbd_obj_img_extents_bytes(obj_req));
		if (ret) {
			*result = ret;
			return true;
		}
		return false;
	case RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC:
		if (*result)
			return true;

		/* deep-copyup done -- issue the actual modification */
		obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS;
		ret = rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY);
		if (ret) {
			*result = ret;
			return true;
		}
		return false;
	default:
		BUG();
	}
}
02c74fba 2785
/*
 * Advance @obj_req's state machine (read or write) under its state
 * mutex and warn on a final error.
 *
 * Return true if @obj_req is completed.
 */
static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req,
				     int *result)
{
	struct rbd_img_request *img_req = obj_req->img_request;
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	bool done;

	mutex_lock(&obj_req->state_mutex);
	if (!rbd_img_is_write(img_req))
		done = rbd_obj_advance_read(obj_req, result);
	else
		done = rbd_obj_advance_write(obj_req, result);
	mutex_unlock(&obj_req->state_mutex);

	if (done && *result) {
		rbd_assert(*result < 0);
		rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d",
			 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
			 obj_req->ex.oe_off, obj_req->ex.oe_len, *result);
	}
	return done;
}
02c74fba 2811
0192ce2e
ID
/*
 * This is open-coded in rbd_img_handle_request() to avoid parent chain
 * recursion.
 */
static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result)
{
	if (__rbd_obj_handle_request(obj_req, &result))
		rbd_img_handle_request(obj_req->img_request, result);
}
2821
/*
 * Kick off all object requests of @img_req, tracking how many are
 * still pending; stops at (and records) the first immediate error.
 */
static void rbd_img_object_requests(struct rbd_img_request *img_req)
{
	struct rbd_obj_request *obj_req;

	rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);

	for_each_obj_request(img_req, obj_req) {
		int result = 0;

		if (__rbd_obj_handle_request(obj_req, &result)) {
			if (result) {
				img_req->pending.result = result;
				return;
			}
		} else {
			img_req->pending.num_pending++;
		}
	}
}
2841
/*
 * Advance the image request state machine.  *@result carries the
 * outcome of the previous step in and the final outcome out.
 *
 * Returns true when @img_req is done, false when object requests are
 * still pending.
 */
static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
{
again:
	switch (img_req->state) {
	case RBD_IMG_START:
		rbd_assert(!*result);

		rbd_img_object_requests(img_req);
		if (!img_req->pending.num_pending) {
			/* everything completed (or failed) synchronously */
			*result = img_req->pending.result;
			img_req->state = RBD_IMG_OBJECT_REQUESTS;
			goto again;
		}
		img_req->state = __RBD_IMG_OBJECT_REQUESTS;
		return false;
	case __RBD_IMG_OBJECT_REQUESTS:
		if (!pending_result_dec(&img_req->pending, result))
			return false;
		/* fall through */
	case RBD_IMG_OBJECT_REQUESTS:
		return true;
	default:
		BUG();
	}
}
8b3e1a56 2867
0192ce2e
ID
/*
 * Advance @img_req's state machine under its state mutex and warn on
 * a final error.
 *
 * Return true if @img_req is completed.
 */
static bool __rbd_img_handle_request(struct rbd_img_request *img_req,
				     int *result)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	bool done;

	mutex_lock(&img_req->state_mutex);
	done = rbd_img_advance(img_req, result);
	mutex_unlock(&img_req->state_mutex);

	if (done && *result) {
		rbd_assert(*result < 0);
		rbd_warn(rbd_dev, "%s%s result %d",
		      test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "",
		      obj_op_name(img_req->op_type), *result);
	}
	return done;
}
2889
/*
 * Drive @img_req to completion.  A completed child image request is
 * fed back into its parent object request iteratively (goto again)
 * rather than recursively, to bound stack usage on long parent chains.
 * A completed top-level request ends the originating block request.
 */
static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
{
again:
	if (!__rbd_img_handle_request(img_req, &result))
		return;

	if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
		struct rbd_obj_request *obj_req = img_req->obj_request;

		rbd_img_request_put(img_req);
		if (__rbd_obj_handle_request(obj_req, &result)) {
			img_req = obj_req->img_request;
			goto again;
		}
	} else {
		struct request *rq = img_req->rq;

		rbd_img_request_put(img_req);
		blk_mq_end_request(rq, errno_to_blk_status(result));
	}
}
bf0d5f50 2911
ed95b21a 2912static const struct rbd_client_id rbd_empty_cid;
b8d70035 2913
ed95b21a
ID
2914static bool rbd_cid_equal(const struct rbd_client_id *lhs,
2915 const struct rbd_client_id *rhs)
2916{
2917 return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
2918}
2919
/*
 * Return this client's id: the ceph global id plus the current watch
 * cookie.  watch_mutex is taken so the cookie is read consistently.
 */
static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
{
	struct rbd_client_id cid;

	mutex_lock(&rbd_dev->watch_mutex);
	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
	cid.handle = rbd_dev->watch_cookie;
	mutex_unlock(&rbd_dev->watch_mutex);
	return cid;
}
2930
/*
 * Record @cid as the current exclusive lock owner.
 *
 * lock_rwsem must be held for write
 */
static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
			      const struct rbd_client_id *cid)
{
	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
	     cid->gid, cid->handle);
	rbd_dev->owner_cid = *cid; /* struct */
}
2942
/*
 * Format the lock cookie ("<prefix> <watch cookie>") into @buf.
 * @buf must be large enough -- callers pass a 32-byte buffer.
 */
static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
{
	mutex_lock(&rbd_dev->watch_mutex);
	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
	mutex_unlock(&rbd_dev->watch_mutex);
}
2949
edd8ca80
FM
/*
 * Record that we now hold the exclusive lock under @cookie: remember
 * the cookie, mark ourselves as owner and queue the ACQUIRED_LOCK
 * notification to peers.
 */
static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
{
	struct rbd_client_id cid = rbd_get_cid(rbd_dev);

	strcpy(rbd_dev->lock_cookie, cookie);
	rbd_set_owner_cid(rbd_dev, &cid);
	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
}
2958
ed95b21a
ID
/*
 * Take the exclusive lock on the header object.
 *
 * lock_rwsem must be held for write
 */
static int rbd_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char cookie[32];
	int ret;

	/* must not already own the lock or have a stale cookie */
	WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
		rbd_dev->lock_cookie[0] != '\0');

	format_lock_cookie(rbd_dev, cookie);
	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
			    RBD_LOCK_TAG, "", 0);
	if (ret)
		return ret;

	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
	__rbd_lock(rbd_dev, cookie);
	return 0;
}
2982
ed95b21a
ID
/*
 * Release the exclusive lock on the header object.
 *
 * lock_rwsem must be held for write
 */
static void rbd_unlock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	/* must currently own the lock with a recorded cookie */
	WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
		rbd_dev->lock_cookie[0] == '\0');

	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			      RBD_LOCK_NAME, rbd_dev->lock_cookie);
	if (ret && ret != -ENOENT)
		rbd_warn(rbd_dev, "failed to unlock: %d", ret);

	/* treat errors as the image is unlocked */
	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	rbd_dev->lock_cookie[0] = '\0';
	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
}
3005
ed95b21a
ID
/*
 * Send a lock-related notify on the header object and wait for the
 * watchers' acks.  The per-watcher responses are returned through
 * @preply_pages/@preply_len; the caller must release the pages.
 */
static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
				enum rbd_notify_op notify_op,
				struct page ***preply_pages,
				size_t *preply_len)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
	/* op (u32) + gid (u64) + handle (u64) + encoding header */
	char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
	int buf_size = sizeof(buf);
	void *p = buf;

	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);

	/* encode *LockPayload NotifyMessage (op + ClientId) */
	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
	ceph_encode_32(&p, notify_op);
	ceph_encode_64(&p, cid.gid);
	ceph_encode_64(&p, cid.handle);

	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
				&rbd_dev->header_oloc, buf, buf_size,
				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
}
3029
ed95b21a
ID
/*
 * Fire-and-forget variant of __rbd_notify_op_lock(): the notify result
 * is ignored and the reply pages are simply released.
 */
static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
			       enum rbd_notify_op notify_op)
{
	struct page **reply_pages;
	size_t reply_len;

	__rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
}
b30a01f2 3039
ed95b21a
ID
3040static void rbd_notify_acquired_lock(struct work_struct *work)
3041{
3042 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3043 acquired_lock_work);
76756a51 3044
ed95b21a 3045 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
c525f036
ID
3046}
3047
ed95b21a 3048static void rbd_notify_released_lock(struct work_struct *work)
c525f036 3049{
ed95b21a
ID
3050 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3051 released_lock_work);
811c6688 3052
ed95b21a 3053 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
fca27065
ID
3054}
3055
/*
 * Ask the current lock owner to release the exclusive lock: send a
 * REQUEST_LOCK notify and decode the per-watcher ResponseMessage acks.
 *
 * Returns the owner's response code (0 or negative, e.g. -EROFS when
 * the peer refuses), -ETIMEDOUT if no owner responded, -EIO on
 * duplicate owners, or another negative error code.
 */
static int rbd_request_lock(struct rbd_device *rbd_dev)
{
	struct page **reply_pages;
	size_t reply_len;
	bool lock_owner_responded = false;
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
				   &reply_pages, &reply_len);
	if (ret && ret != -ETIMEDOUT) {
		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
		goto out;
	}

	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
		void *p = page_address(reply_pages[0]);
		void *const end = p + reply_len;
		u32 n;

		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
		while (n--) {
			u8 struct_v;
			u32 len;

			ceph_decode_need(&p, end, 8 + 8, e_inval);
			p += 8 + 8; /* skip gid and cookie */

			ceph_decode_32_safe(&p, end, len, e_inval);
			if (!len)
				continue;	/* watcher sent an empty payload */

			/* only the lock owner sends a non-empty payload */
			if (lock_owner_responded) {
				rbd_warn(rbd_dev,
					 "duplicate lock owners detected");
				ret = -EIO;
				goto out;
			}

			lock_owner_responded = true;
			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
						  &struct_v, &len);
			if (ret) {
				rbd_warn(rbd_dev,
					 "failed to decode ResponseMessage: %d",
					 ret);
				goto e_inval;
			}

			/* the owner's response code */
			ret = ceph_decode_32(&p);
		}
	}

	if (!lock_owner_responded) {
		rbd_warn(rbd_dev, "no lock owners detected");
		ret = -ETIMEDOUT;
	}

out:
	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
	return ret;

e_inval:
	ret = -EINVAL;
	goto out;
}
3123
3124static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
3125{
3126 dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);
3127
3128 cancel_delayed_work(&rbd_dev->lock_dwork);
3129 if (wake_all)
3130 wake_up_all(&rbd_dev->lock_waitq);
3131 else
3132 wake_up(&rbd_dev->lock_waitq);
3133}
3134
/*
 * Fetch the current holders of the header object's exclusive lock.
 * Returns -EBUSY if the lock is held by a non-rbd mechanism (wrong tag
 * or cookie format) or is a shared lock.  On success the caller owns
 * *lockers and must free it with ceph_free_lockers().
 */
static int get_lock_owner_info(struct rbd_device *rbd_dev,
			       struct ceph_locker **lockers, u32 *num_lockers)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	u8 lock_type;
	char *lock_tag;
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
				 &lock_type, &lock_tag, lockers, num_lockers);
	if (ret)
		return ret;

	if (*num_lockers == 0) {
		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
		goto out;
	}

	/* an rbd-taken lock always carries RBD_LOCK_TAG */
	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
			 lock_tag);
		ret = -EBUSY;
		goto out;
	}

	if (lock_type == CEPH_CLS_LOCK_SHARED) {
		rbd_warn(rbd_dev, "shared lock type detected");
		ret = -EBUSY;
		goto out;
	}

	/* rbd cookies are "<prefix> <number>" */
	if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
		    strlen(RBD_LOCK_COOKIE_PREFIX))) {
		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
			 (*lockers)[0].id.cookie);
		ret = -EBUSY;
		goto out;
	}

out:
	kfree(lock_tag);
	return ret;
}
3181
3182static int find_watcher(struct rbd_device *rbd_dev,
3183 const struct ceph_locker *locker)
3184{
3185 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3186 struct ceph_watch_item *watchers;
3187 u32 num_watchers;
3188 u64 cookie;
3189 int i;
3190 int ret;
3191
3192 ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
3193 &rbd_dev->header_oloc, &watchers,
3194 &num_watchers);
3195 if (ret)
3196 return ret;
3197
3198 sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
3199 for (i = 0; i < num_watchers; i++) {
3200 if (!memcmp(&watchers[i].addr, &locker->info.addr,
3201 sizeof(locker->info.addr)) &&
3202 watchers[i].cookie == cookie) {
3203 struct rbd_client_id cid = {
3204 .gid = le64_to_cpu(watchers[i].name.num),
3205 .handle = cookie,
3206 };
3207
3208 dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
3209 rbd_dev, cid.gid, cid.handle);
3210 rbd_set_owner_cid(rbd_dev, &cid);
3211 ret = 1;
3212 goto out;
3213 }
3214 }
3215
3216 dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
3217 ret = 0;
3218out:
3219 kfree(watchers);
3220 return ret;
3221}
3222
/*
 * Try to take the exclusive lock, breaking it if the current holder
 * appears dead (no matching watch on the header object).  A dead
 * holder is blacklisted before its lock is broken.
 *
 * lock_rwsem must be held for write
 */
static int rbd_try_lock(struct rbd_device *rbd_dev)
{
	struct ceph_client *client = rbd_dev->rbd_client->client;
	struct ceph_locker *lockers;
	u32 num_lockers;
	int ret;

	for (;;) {
		ret = rbd_lock(rbd_dev);
		if (ret != -EBUSY)
			return ret;

		/* determine if the current lock holder is still alive */
		ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
		if (ret)
			return ret;

		if (num_lockers == 0)
			goto again;	/* lock disappeared - retry */

		ret = find_watcher(rbd_dev, lockers);
		if (ret) {
			if (ret > 0)
				ret = 0; /* have to request lock */
			goto out;
		}

		rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
			 ENTITY_NAME(lockers[0].id.name));

		/* fence the dead client before stealing its lock */
		ret = ceph_monc_blacklist_add(&client->monc,
					      &lockers[0].info.addr);
		if (ret) {
			rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
				 ENTITY_NAME(lockers[0].id.name), ret);
			goto out;
		}

		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
					  lockers[0].id.cookie,
					  &lockers[0].id.name);
		if (ret && ret != -ENOENT)
			goto out;

again:
		ceph_free_lockers(lockers, num_lockers);
	}

out:
	ceph_free_lockers(lockers, num_lockers);
	return ret;
}
3279
/*
 * Try to become the exclusive lock owner, taking the write side of
 * lock_rwsem only when a fast read-side check shows we don't already
 * own the lock.
 *
 * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
 */
static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
						int *pret)
{
	enum rbd_lock_state lock_state;

	down_read(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (__rbd_is_lock_owner(rbd_dev)) {
		lock_state = rbd_dev->lock_state;
		up_read(&rbd_dev->lock_rwsem);
		return lock_state;
	}

	up_read(&rbd_dev->lock_rwsem);
	down_write(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	/* recheck - state may have changed while lock_rwsem was dropped */
	if (!__rbd_is_lock_owner(rbd_dev)) {
		*pret = rbd_try_lock(rbd_dev);
		if (*pret)
			rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
	}

	lock_state = rbd_dev->lock_state;
	up_write(&rbd_dev->lock_rwsem);
	return lock_state;
}
3311
/*
 * Delayed work to acquire the exclusive lock.  If a live peer holds
 * the lock, ask it to release via REQUEST_LOCK and requeue ourselves
 * until it does (an unresponsive owner is treated as dead and retried
 * via rbd_try_acquire_lock(), which may break its lock).
 */
static void rbd_acquire_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
					    struct rbd_device, lock_dwork);
	enum rbd_lock_state lock_state;
	int ret = 0;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);
again:
	lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
	if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
		if (lock_state == RBD_LOCK_STATE_LOCKED)
			wake_requests(rbd_dev, true);
		dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
		     rbd_dev, lock_state, ret);
		return;
	}

	ret = rbd_request_lock(rbd_dev);
	if (ret == -ETIMEDOUT) {
		goto again; /* treat this as a dead client */
	} else if (ret == -EROFS) {
		rbd_warn(rbd_dev, "peer will not release lock");
		/*
		 * If this is rbd_add_acquire_lock(), we want to fail
		 * immediately -- reuse BLACKLISTED flag.  Otherwise we
		 * want to block.
		 */
		if (!(rbd_dev->disk->flags & GENHD_FL_UP)) {
			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
			/* wake "rbd map --exclusive" process */
			wake_requests(rbd_dev, false);
		}
	} else if (ret < 0) {
		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
				 RBD_RETRY_DELAY);
	} else {
		/*
		 * lock owner acked, but resend if we don't see them
		 * release the lock
		 */
		dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
		     rbd_dev);
		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
	}
}
3360
/*
 * Release the exclusive lock after flushing in-flight IO.  Returns
 * false if the lock wasn't held, or if the lock state changed while
 * lock_rwsem was temporarily dropped for the flush.
 *
 * lock_rwsem must be held for write
 */
static bool rbd_release_lock(struct rbd_device *rbd_dev)
{
	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
		return false;

	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
	downgrade_write(&rbd_dev->lock_rwsem);
	/*
	 * Ensure that all in-flight IO is flushed.
	 *
	 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
	 * may be shared with other devices.
	 */
	ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
	up_read(&rbd_dev->lock_rwsem);

	down_write(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	/* lock_rwsem was dropped above - recheck the state */
	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
		return false;

	rbd_unlock(rbd_dev);
	/*
	 * Give others a chance to grab the lock - we would re-acquire
	 * almost immediately if we got new IO during ceph_osdc_sync()
	 * otherwise.  We need to ack our own notifications, so this
	 * lock_dwork will be requeued from rbd_wait_state_locked()
	 * after wake_requests() in rbd_handle_released_lock().
	 */
	cancel_delayed_work(&rbd_dev->lock_dwork);
	return true;
}
3399
3400static void rbd_release_lock_work(struct work_struct *work)
3401{
3402 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3403 unlock_work);
3404
3405 down_write(&rbd_dev->lock_rwsem);
3406 rbd_release_lock(rbd_dev);
3407 up_write(&rbd_dev->lock_rwsem);
3408}
3409
/*
 * Handle an ACQUIRED_LOCK notification: record the new owner (a client
 * id is only present in struct_v >= 2 payloads) and, if we are not the
 * owner, wake a waiter so it can requeue lock acquisition.
 */
static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
		down_write(&rbd_dev->lock_rwsem);
		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
			/*
			 * we already know that the remote client is
			 * the owner
			 */
			up_write(&rbd_dev->lock_rwsem);
			return;
		}

		rbd_set_owner_cid(rbd_dev, &cid);
		downgrade_write(&rbd_dev->lock_rwsem);
	} else {
		down_read(&rbd_dev->lock_rwsem);
	}

	if (!__rbd_is_lock_owner(rbd_dev))
		wake_requests(rbd_dev, false);
	up_read(&rbd_dev->lock_rwsem);
}
3443
/*
 * Handle a RELEASED_LOCK notification: clear the recorded owner (only
 * if it matches the notifying client) and, if we are not the owner,
 * wake a waiter so it can requeue lock acquisition.
 */
static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	/* client id is only present in struct_v >= 2 payloads */
	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
		down_write(&rbd_dev->lock_rwsem);
		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
			dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
			     __func__, rbd_dev, cid.gid, cid.handle,
			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
			up_write(&rbd_dev->lock_rwsem);
			return;
		}

		rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
		downgrade_write(&rbd_dev->lock_rwsem);
	} else {
		down_read(&rbd_dev->lock_rwsem);
	}

	if (!__rbd_is_lock_owner(rbd_dev))
		wake_requests(rbd_dev, false);
	up_read(&rbd_dev->lock_rwsem);
}
3476
3b77faa0
ID
/*
 * Handle a REQUEST_LOCK notification from a peer that wants the lock.
 * If we own it, either queue an unlock or refuse (-EROFS) when mapped
 * with the exclusive option.
 *
 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
 * ResponseMessage is needed.
 */
static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
				   void **p)
{
	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
	struct rbd_client_id cid = { 0 };
	int result = 1;

	/* client id is only present in struct_v >= 2 payloads */
	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (rbd_cid_equal(&cid, &my_cid))
		return result;	/* ignore our own request notify */

	down_read(&rbd_dev->lock_rwsem);
	if (__rbd_is_lock_owner(rbd_dev)) {
		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
		    rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
			goto out_unlock;

		/*
		 * encode ResponseMessage(0) so the peer can detect
		 * a missing owner
		 */
		result = 0;

		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
			if (!rbd_dev->opts->exclusive) {
				dout("%s rbd_dev %p queueing unlock_work\n",
				     __func__, rbd_dev);
				queue_work(rbd_dev->task_wq,
					   &rbd_dev->unlock_work);
			} else {
				/* refuse to release the lock */
				result = -EROFS;
			}
		}
	}

out_unlock:
	up_read(&rbd_dev->lock_rwsem);
	return result;
}
3527
/*
 * Ack a notify on the header object.  If @result is non-NULL, a
 * ResponseMessage carrying *result is encoded into the ack payload;
 * otherwise the ack carries no payload.
 */
static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
				     u64 notify_id, u64 cookie, s32 *result)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	/* result (s32) + encoding header */
	char buf[4 + CEPH_ENCODING_START_BLK_LEN];
	int buf_size = sizeof(buf);
	int ret;

	if (result) {
		void *p = buf;

		/* encode ResponseMessage */
		ceph_start_encoding(&p, 1, 1,
				    buf_size - CEPH_ENCODING_START_BLK_LEN);
		ceph_encode_32(&p, *result);
	} else {
		buf_size = 0;	/* ack with an empty payload */
	}

	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
				   &rbd_dev->header_oloc, notify_id, cookie,
				   buf, buf_size);
	if (ret)
		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
}
3553
/* Ack a notify without a ResponseMessage payload. */
static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
				   u64 cookie)
{
	dout("%s rbd_dev %p\n", __func__, rbd_dev);
	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
}
3560
/* Ack a notify with a ResponseMessage payload carrying @result. */
static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
					  u64 notify_id, u64 cookie, s32 result)
{
	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
}
3567
/*
 * Watch callback for the header object: decode the NotifyMessage,
 * dispatch lock-related ops and header updates, and ack every notify.
 * @arg is the rbd_device registered in __rbd_register_watch().
 */
static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
			 u64 notifier_id, void *data, size_t data_len)
{
	struct rbd_device *rbd_dev = arg;
	void *p = data;
	void *const end = p + data_len;
	u8 struct_v = 0;
	u32 len;
	u32 notify_op;
	int ret;

	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
	     __func__, rbd_dev, cookie, notify_id, data_len);
	if (data_len) {
		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
					  &struct_v, &len);
		if (ret) {
			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
				 ret);
			return;
		}

		notify_op = ceph_decode_32(&p);
	} else {
		/* legacy notification for header updates */
		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
		len = 0;
	}

	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
	switch (notify_op) {
	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_RELEASED_LOCK:
		rbd_handle_released_lock(rbd_dev, struct_v, &p);
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_REQUEST_LOCK:
		ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
		if (ret <= 0)
			/* <= 0 means a ResponseMessage must be sent */
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, ret);
		else
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_HEADER_UPDATE:
		ret = rbd_dev_refresh(rbd_dev);
		if (ret)
			rbd_warn(rbd_dev, "refresh failed: %d", ret);

		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	default:
		/* unknown op: only the lock owner reports an error */
		if (rbd_is_lock_owner(rbd_dev))
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, -EOPNOTSUPP);
		else
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	}
}
3631
99d16943
ID
3632static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
3633
/*
 * Watch error callback: the watch is broken, so the lock owner is no
 * longer known; if the watch was registered, tear it down and schedule
 * an immediate re-registration (rbd_reregister_watch()).
 */
static void rbd_watch_errcb(void *arg, u64 cookie, int err)
{
	struct rbd_device *rbd_dev = arg;

	rbd_warn(rbd_dev, "encountered watch error: %d", err);

	down_write(&rbd_dev->lock_rwsem);
	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
	up_write(&rbd_dev->lock_rwsem);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
		__rbd_unregister_watch(rbd_dev);
		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;

		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
	}
	mutex_unlock(&rbd_dev->watch_mutex);
}
3653
/*
 * Establish a watch on the header object and stash the linger handle.
 *
 * watch_mutex must be locked
 */
static int __rbd_register_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_linger_request *handle;

	rbd_assert(!rbd_dev->watch_handle);
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, rbd_watch_cb,
				 rbd_watch_errcb, rbd_dev);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	rbd_dev->watch_handle = handle;
	return 0;
}
3674
99d16943
ID
/*
 * Tear down the watch on the header object.  An unwatch failure is
 * only warned about; the handle is cleared regardless.
 *
 * watch_mutex must be locked
 */
static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	rbd_assert(rbd_dev->watch_handle);
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
	if (ret)
		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);

	rbd_dev->watch_handle = NULL;
}
3692
99d16943
ID
/*
 * Register the header watch and record the watch cookie (the linger
 * id), which is used for lock cookies and client ids.
 */
static int rbd_register_watch(struct rbd_device *rbd_dev)
{
	int ret;

	mutex_lock(&rbd_dev->watch_mutex);
	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
	ret = __rbd_register_watch(rbd_dev);
	if (ret)
		goto out;

	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;

out:
	mutex_unlock(&rbd_dev->watch_mutex);
	return ret;
}
3710
/* Synchronously cancel all exclusive-lock related work items. */
static void cancel_tasks_sync(struct rbd_device *rbd_dev)
{
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	cancel_work_sync(&rbd_dev->acquired_lock_work);
	cancel_work_sync(&rbd_dev->released_lock_work);
	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
	cancel_work_sync(&rbd_dev->unlock_work);
}
3720
/*
 * Cancel outstanding lock work, unregister the header watch and flush
 * any in-flight notify callbacks.
 */
static void rbd_unregister_watch(struct rbd_device *rbd_dev)
{
	/* nobody should be waiting on the lock at teardown time */
	WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
	cancel_tasks_sync(rbd_dev);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
		__rbd_unregister_watch(rbd_dev);
	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
	mutex_unlock(&rbd_dev->watch_mutex);

	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
}
3735
14bb211d
ID
/*
 * Update the lock cookie in place after the watch has been
 * re-established (the watch cookie, and therefore the lock cookie,
 * changed).  Older OSDs don't support this, in which case fall back to
 * a full release + delayed reacquire.
 *
 * lock_rwsem must be held for write
 */
static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char cookie[32];
	int ret;

	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);

	format_lock_cookie(rbd_dev, cookie);
	ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, RBD_LOCK_NAME,
				  CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
				  RBD_LOCK_TAG, cookie);
	if (ret) {
		if (ret != -EOPNOTSUPP)
			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
				 ret);

		/*
		 * Lock cookie cannot be updated on older OSDs, so do
		 * a manual release and queue an acquire.
		 */
		if (rbd_release_lock(rbd_dev))
			queue_delayed_work(rbd_dev->task_wq,
					   &rbd_dev->lock_dwork, 0);
	} else {
		__rbd_lock(rbd_dev, cookie);
	}
}
3768
99d16943
ID
/*
 * Delayed work to re-establish the header watch after a watch error.
 * Retries with RBD_RETRY_DELAY on transient errors; on success, updates
 * the lock cookie (if we hold the lock) and refreshes the header.
 */
static void rbd_reregister_watch(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
					    struct rbd_device, watch_dwork);
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
		mutex_unlock(&rbd_dev->watch_mutex);
		return;
	}

	ret = __rbd_register_watch(rbd_dev);
	if (ret) {
		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
		if (ret == -EBLACKLISTED || ret == -ENOENT) {
			/* unrecoverable - fail outstanding lock waiters */
			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
			wake_requests(rbd_dev, true);
		} else {
			queue_delayed_work(rbd_dev->task_wq,
					   &rbd_dev->watch_dwork,
					   RBD_RETRY_DELAY);
		}
		mutex_unlock(&rbd_dev->watch_mutex);
		return;
	}

	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
	mutex_unlock(&rbd_dev->watch_mutex);

	/* the watch cookie changed - the lock cookie must follow */
	down_write(&rbd_dev->lock_rwsem);
	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
		rbd_reacquire_lock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);

	ret = rbd_dev_refresh(rbd_dev);
	if (ret)
		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
}
3811
/*
 * Synchronous osd object method call.  Returns the number of bytes
 * returned in the outbound buffer, or a negative error code.
 */
static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
			     struct ceph_object_id *oid,
			     struct ceph_object_locator *oloc,
			     const char *method_name,
			     const void *outbound,
			     size_t outbound_size,
			     void *inbound,
			     size_t inbound_size)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct page *req_page = NULL;
	struct page *reply_page;
	int ret;

	/*
	 * Method calls are ultimately read operations.  The result
	 * should placed into the inbound buffer provided.  They
	 * also supply outbound data--parameters for the object
	 * method.  Currently if this is present it will be a
	 * snapshot id.
	 */
	if (outbound) {
		if (outbound_size > PAGE_SIZE)
			return -E2BIG;	/* params limited to one page */

		req_page = alloc_page(GFP_KERNEL);
		if (!req_page)
			return -ENOMEM;

		memcpy(page_address(req_page), outbound, outbound_size);
	}

	reply_page = alloc_page(GFP_KERNEL);
	if (!reply_page) {
		if (req_page)
			__free_page(req_page);
		return -ENOMEM;
	}

	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
			     reply_page, &inbound_size);
	if (!ret) {
		/* inbound_size was updated to the actual reply length */
		memcpy(inbound, page_address(reply_page), inbound_size);
		ret = inbound_size;
	}

	if (req_page)
		__free_page(req_page);
	__free_page(reply_page);
	return ret;
}
3868
ed95b21a
ID
/*
 * Wait (bounded by opts->lock_timeout) until this client owns the
 * exclusive lock, kicking lock_dwork to drive acquisition.  If
 * @may_acquire is false, fail with -EROFS rather than acquiring.
 *
 * lock_rwsem must be held for read
 */
static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire)
{
	DEFINE_WAIT(wait);
	unsigned long timeout;
	int ret = 0;

	if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags))
		return -EBLACKLISTED;

	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
		return 0;

	if (!may_acquire) {
		rbd_warn(rbd_dev, "exclusive lock required");
		return -EROFS;
	}

	do {
		/*
		 * Note the use of mod_delayed_work() in rbd_acquire_lock()
		 * and cancel_delayed_work() in wake_requests().
		 */
		dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
		prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
					  TASK_UNINTERRUPTIBLE);
		/* drop lock_rwsem while asleep so the lock can change hands */
		up_read(&rbd_dev->lock_rwsem);
		timeout = schedule_timeout(ceph_timeout_jiffies(
						rbd_dev->opts->lock_timeout));
		down_read(&rbd_dev->lock_rwsem);
		if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
			ret = -EBLACKLISTED;
			break;
		}
		if (!timeout) {
			rbd_warn(rbd_dev, "timed out waiting for lock");
			ret = -ETIMEDOUT;
			break;
		}
	} while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);

	finish_wait(&rbd_dev->lock_waitq, &wait);
	return ret;
}
3916
/*
 * Per-request work function: validate a block-layer request, build the
 * corresponding image request and submit it to the OSDs.  Runs in
 * workqueue context (queued by rbd_queue_rq()); completion is signalled
 * via blk_mq_end_request(), either here on an early error or later by
 * the image request machinery.
 */
static void rbd_queue_workfn(struct work_struct *work)
{
	struct request *rq = blk_mq_rq_from_pdu(work);
	struct rbd_device *rbd_dev = rq->q->queuedata;
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc = NULL;
	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
	u64 length = blk_rq_bytes(rq);
	enum obj_operation_type op_type;
	u64 mapping_size;
	bool must_be_locked;
	int result;

	switch (req_op(rq)) {
	case REQ_OP_DISCARD:
		op_type = OBJ_OP_DISCARD;
		break;
	case REQ_OP_WRITE_ZEROES:
		op_type = OBJ_OP_ZEROOUT;
		break;
	case REQ_OP_WRITE:
		op_type = OBJ_OP_WRITE;
		break;
	case REQ_OP_READ:
		op_type = OBJ_OP_READ;
		break;
	default:
		dout("%s: non-fs request type %d\n", __func__, req_op(rq));
		result = -EIO;
		goto err;
	}

	/* Ignore/skip any zero-length requests */

	if (!length) {
		dout("%s: zero-length request\n", __func__);
		result = 0;
		goto err_rq;
	}

	/* modifying ops are not allowed on a mapped snapshot */
	if (op_type != OBJ_OP_READ && rbd_dev->spec->snap_id != CEPH_NOSNAP) {
		rbd_warn(rbd_dev, "%s on read-only snapshot",
			 obj_op_name(op_type));
		result = -EIO;
		goto err;
	}

	/*
	 * Quit early if the mapped snapshot no longer exists.  It's
	 * still possible the snapshot will have disappeared by the
	 * time our request arrives at the osd, but there's no sense in
	 * sending it if we already know.
	 */
	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
		dout("request for non-existent snapshot");
		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
		result = -ENXIO;
		goto err_rq;
	}

	if (offset && length > U64_MAX - offset + 1) {
		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
			 length);
		result = -EINVAL;
		goto err_rq;	/* Shouldn't happen */
	}

	blk_mq_start_request(rq);

	/* snapshot mapping size and (for writes) the snap context */
	down_read(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;
	if (op_type != OBJ_OP_READ) {
		snapc = rbd_dev->header.snapc;
		ceph_get_snap_context(snapc);
	}
	up_read(&rbd_dev->header_rwsem);

	if (offset + length > mapping_size) {
		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
			 length, mapping_size);
		result = -EIO;
		goto err_rq;
	}

	/* exclusive lock needed for writes, and for reads if lock_on_read */
	must_be_locked =
	    (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
	    (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
	if (must_be_locked) {
		down_read(&rbd_dev->lock_rwsem);
		result = rbd_wait_state_locked(rbd_dev,
					       !rbd_dev->opts->exclusive);
		if (result)
			goto err_unlock;
	}

	img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
	if (!img_request) {
		result = -ENOMEM;
		goto err_unlock;
	}
	img_request->rq = rq;
	snapc = NULL; /* img_request consumes a ref */

	if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
		result = rbd_img_fill_nodata(img_request, offset, length);
	else
		result = rbd_img_fill_from_bio(img_request, offset, length,
					       rq->bio);
	if (result)
		goto err_img_request;

	rbd_img_handle_request(img_request, 0);
	if (must_be_locked)
		up_read(&rbd_dev->lock_rwsem);
	return;

err_img_request:
	rbd_img_request_put(img_request);
err_unlock:
	if (must_be_locked)
		up_read(&rbd_dev->lock_rwsem);
err_rq:
	if (result)
		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
			 obj_op_name(op_type), length, offset, result);
	ceph_put_snap_context(snapc);
err:
	blk_mq_end_request(rq, errno_to_blk_status(result));
}
bf0d5f50 4046
fc17b653 4047static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
7ad18afa 4048 const struct blk_mq_queue_data *bd)
bc1ecc65 4049{
7ad18afa
CH
4050 struct request *rq = bd->rq;
4051 struct work_struct *work = blk_mq_rq_to_pdu(rq);
bf0d5f50 4052
7ad18afa 4053 queue_work(rbd_wq, work);
fc17b653 4054 return BLK_STS_OK;
bf0d5f50
AE
4055}
4056
602adf40
YS
4057static void rbd_free_disk(struct rbd_device *rbd_dev)
4058{
5769ed0c
ID
4059 blk_cleanup_queue(rbd_dev->disk->queue);
4060 blk_mq_free_tag_set(&rbd_dev->tag_set);
4061 put_disk(rbd_dev->disk);
a0cab924 4062 rbd_dev->disk = NULL;
602adf40
YS
4063}
4064
/*
 * Synchronously read up to @buf_len bytes from object @oid in @oloc
 * into @buf.
 *
 * Returns the number of bytes read on success (may be short), or a
 * negative errno.
 */
static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
			     struct ceph_object_id *oid,
			     struct ceph_object_locator *oloc,
			     void *buf, int buf_len)

{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;
	struct page **pages;
	int num_pages = calc_pages_for(0, buf_len);
	int ret;

	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
	if (!req)
		return -ENOMEM;

	ceph_oid_copy(&req->r_base_oid, oid);
	ceph_oloc_copy(&req->r_base_oloc, oloc);
	req->r_flags = CEPH_OSD_FLAG_READ;

	/* the reply lands in a temporary page vector, copied out below */
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		goto out_req;
	}

	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
	/* own_pages=true: the page vector is freed with the request */
	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
					 true);

	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
	if (ret)
		goto out_req;

	ceph_osdc_start_request(osdc, req, false);
	ret = ceph_osdc_wait_request(osdc, req);
	if (ret >= 0)
		ceph_copy_from_page_vector(pages, buf, 0, ret);

out_req:
	ceph_osdc_put_request(req);
	return ret;
}
4108
602adf40 4109/*
662518b1
AE
4110 * Read the complete header for the given rbd device. On successful
4111 * return, the rbd_dev->header field will contain up-to-date
4112 * information about the image.
602adf40 4113 */
99a41ebc 4114static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
602adf40 4115{
4156d998 4116 struct rbd_image_header_ondisk *ondisk = NULL;
50f7c4c9 4117 u32 snap_count = 0;
4156d998
AE
4118 u64 names_size = 0;
4119 u32 want_count;
4120 int ret;
602adf40 4121
00f1f36f 4122 /*
4156d998
AE
4123 * The complete header will include an array of its 64-bit
4124 * snapshot ids, followed by the names of those snapshots as
4125 * a contiguous block of NUL-terminated strings. Note that
4126 * the number of snapshots could change by the time we read
4127 * it in, in which case we re-read it.
00f1f36f 4128 */
4156d998
AE
4129 do {
4130 size_t size;
4131
4132 kfree(ondisk);
4133
4134 size = sizeof (*ondisk);
4135 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
4136 size += names_size;
4137 ondisk = kmalloc(size, GFP_KERNEL);
4138 if (!ondisk)
662518b1 4139 return -ENOMEM;
4156d998 4140
fe5478e0
ID
4141 ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
4142 &rbd_dev->header_oloc, ondisk, size);
4156d998 4143 if (ret < 0)
662518b1 4144 goto out;
c0cd10db 4145 if ((size_t)ret < size) {
4156d998 4146 ret = -ENXIO;
06ecc6cb
AE
4147 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
4148 size, ret);
662518b1 4149 goto out;
4156d998
AE
4150 }
4151 if (!rbd_dev_ondisk_valid(ondisk)) {
4152 ret = -ENXIO;
06ecc6cb 4153 rbd_warn(rbd_dev, "invalid header");
662518b1 4154 goto out;
81e759fb 4155 }
602adf40 4156
4156d998
AE
4157 names_size = le64_to_cpu(ondisk->snap_names_len);
4158 want_count = snap_count;
4159 snap_count = le32_to_cpu(ondisk->snap_count);
4160 } while (snap_count != want_count);
00f1f36f 4161
662518b1
AE
4162 ret = rbd_header_from_disk(rbd_dev, ondisk);
4163out:
4156d998
AE
4164 kfree(ondisk);
4165
4166 return ret;
602adf40
YS
4167}
4168
15228ede
AE
4169/*
4170 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
4171 * has disappeared from the (just updated) snapshot context.
4172 */
4173static void rbd_exists_validate(struct rbd_device *rbd_dev)
4174{
4175 u64 snap_id;
4176
4177 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
4178 return;
4179
4180 snap_id = rbd_dev->spec->snap_id;
4181 if (snap_id == CEPH_NOSNAP)
4182 return;
4183
4184 if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
4185 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
4186}
4187
9875201e
JD
4188static void rbd_dev_update_size(struct rbd_device *rbd_dev)
4189{
4190 sector_t size;
9875201e
JD
4191
4192 /*
811c6688
ID
4193 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
4194 * try to update its size. If REMOVING is set, updating size
4195 * is just useless work since the device can't be opened.
9875201e 4196 */
811c6688
ID
4197 if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
4198 !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
9875201e
JD
4199 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
4200 dout("setting size to %llu sectors", (unsigned long long)size);
4201 set_capacity(rbd_dev->disk, size);
4202 revalidate_disk(rbd_dev->disk);
4203 }
4204}
4205
/*
 * Re-read the image header and bring the mapping up to date, updating
 * the block device capacity if the mapping size changed.
 *
 * Returns 0 on success or a negative errno.
 */
static int rbd_dev_refresh(struct rbd_device *rbd_dev)
{
	u64 mapping_size;
	int ret;

	down_write(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;	/* remember pre-refresh size */

	ret = rbd_dev_header_info(rbd_dev);
	if (ret)
		goto out;

	/*
	 * If there is a parent, see if it has disappeared due to the
	 * mapped image getting flattened.
	 */
	if (rbd_dev->parent) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto out;
	}

	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
		rbd_dev->mapping.size = rbd_dev->header.image_size;
	} else {
		/* validate mapped snapshot's EXISTS flag */
		rbd_exists_validate(rbd_dev);
	}

out:
	up_write(&rbd_dev->header_rwsem);
	/* resize the gendisk only after dropping header_rwsem */
	if (!ret && mapping_size != rbd_dev->mapping.size)
		rbd_dev_update_size(rbd_dev);

	return ret;
}
4242
d6296d39
CH
4243static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
4244 unsigned int hctx_idx, unsigned int numa_node)
7ad18afa
CH
4245{
4246 struct work_struct *work = blk_mq_rq_to_pdu(rq);
4247
4248 INIT_WORK(work, rbd_queue_workfn);
4249 return 0;
4250}
4251
/* blk-mq ops: requests are handed off to a workqueue in rbd_queue_rq() */
static const struct blk_mq_ops rbd_mq_ops = {
	.queue_rq	= rbd_queue_rq,
	.init_request	= rbd_init_request,
};
4256
/*
 * Allocate and configure the gendisk, blk-mq tag set and request queue
 * for an rbd device.  The disk is not yet added (add_disk() happens
 * later); on success rbd_dev->disk is set.
 *
 * Returns 0 on success or a negative errno.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	unsigned int objset_bytes =
	    rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
	int err;

	/* create gendisk info */
	disk = alloc_disk(single_major ?
			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
			  RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = rbd_dev->minor;
	if (single_major)
		disk->flags |= GENHD_FL_EXT_DEVT;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* one hw queue; each request carries a work_struct as its pdu */
	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
	rbd_dev->tag_set.ops = &rbd_mq_ops;
	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
	rbd_dev->tag_set.nr_hw_queues = 1;
	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);

	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
	if (err)
		goto out_disk;

	q = blk_mq_init_queue(&rbd_dev->tag_set);
	if (IS_ERR(q)) {
		err = PTR_ERR(q);
		goto out_tag_set;
	}

	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */

	/* cap a single request at one complete object set */
	blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
	q->limits.max_sectors = queue_max_hw_sectors(q);
	blk_queue_max_segments(q, USHRT_MAX);
	blk_queue_max_segment_size(q, UINT_MAX);
	blk_queue_io_min(q, rbd_dev->opts->alloc_size);
	blk_queue_io_opt(q, rbd_dev->opts->alloc_size);

	if (rbd_dev->opts->trim) {
		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
		q->limits.discard_granularity = rbd_dev->opts->alloc_size;
		blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
		blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
	}

	/* without data CRCs, pages must stay stable while in flight */
	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
		q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;

	/*
	 * disk_release() expects a queue ref from add_disk() and will
	 * put it.  Hold an extra ref until add_disk() is called.
	 */
	WARN_ON(!blk_get_queue(q));
	disk->queue = q;
	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	return 0;
out_tag_set:
	blk_mq_free_tag_set(&rbd_dev->tag_set);
out_disk:
	put_disk(disk);
	return err;
}
4336
dfc5606d
YS
4337/*
4338 sysfs
4339*/
4340
/* Map a struct device embedded in an rbd_device back to the rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
4345
dfc5606d
YS
4346static ssize_t rbd_size_show(struct device *dev,
4347 struct device_attribute *attr, char *buf)
4348{
593a9e7b 4349 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
a51aa0c0 4350
fc71d833
AE
4351 return sprintf(buf, "%llu\n",
4352 (unsigned long long)rbd_dev->mapping.size);
dfc5606d
YS
4353}
4354
34b13184
AE
4355/*
4356 * Note this shows the features for whatever's mapped, which is not
4357 * necessarily the base image.
4358 */
4359static ssize_t rbd_features_show(struct device *dev,
4360 struct device_attribute *attr, char *buf)
4361{
4362 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4363
4364 return sprintf(buf, "0x%016llx\n",
fc71d833 4365 (unsigned long long)rbd_dev->mapping.features);
34b13184
AE
4366}
4367
dfc5606d
YS
4368static ssize_t rbd_major_show(struct device *dev,
4369 struct device_attribute *attr, char *buf)
4370{
593a9e7b 4371 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 4372
fc71d833
AE
4373 if (rbd_dev->major)
4374 return sprintf(buf, "%d\n", rbd_dev->major);
4375
4376 return sprintf(buf, "(none)\n");
dd82fff1
ID
4377}
4378
4379static ssize_t rbd_minor_show(struct device *dev,
4380 struct device_attribute *attr, char *buf)
4381{
4382 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
fc71d833 4383
dd82fff1 4384 return sprintf(buf, "%d\n", rbd_dev->minor);
dfc5606d
YS
4385}
4386
005a07bf
ID
4387static ssize_t rbd_client_addr_show(struct device *dev,
4388 struct device_attribute *attr, char *buf)
4389{
4390 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4391 struct ceph_entity_addr *client_addr =
4392 ceph_client_addr(rbd_dev->rbd_client->client);
4393
4394 return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
4395 le32_to_cpu(client_addr->nonce));
4396}
4397
dfc5606d
YS
4398static ssize_t rbd_client_id_show(struct device *dev,
4399 struct device_attribute *attr, char *buf)
602adf40 4400{
593a9e7b 4401 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 4402
1dbb4399 4403 return sprintf(buf, "client%lld\n",
033268a5 4404 ceph_client_gid(rbd_dev->rbd_client->client));
602adf40
YS
4405}
4406
267fb90b
MC
4407static ssize_t rbd_cluster_fsid_show(struct device *dev,
4408 struct device_attribute *attr, char *buf)
4409{
4410 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4411
4412 return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
4413}
4414
0d6d1e9c
MC
4415static ssize_t rbd_config_info_show(struct device *dev,
4416 struct device_attribute *attr, char *buf)
4417{
4418 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4419
4420 return sprintf(buf, "%s\n", rbd_dev->config_info);
602adf40
YS
4421}
4422
dfc5606d
YS
4423static ssize_t rbd_pool_show(struct device *dev,
4424 struct device_attribute *attr, char *buf)
602adf40 4425{
593a9e7b 4426 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 4427
0d7dbfce 4428 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
dfc5606d
YS
4429}
4430
9bb2f334
AE
4431static ssize_t rbd_pool_id_show(struct device *dev,
4432 struct device_attribute *attr, char *buf)
4433{
4434 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4435
0d7dbfce 4436 return sprintf(buf, "%llu\n",
fc71d833 4437 (unsigned long long) rbd_dev->spec->pool_id);
9bb2f334
AE
4438}
4439
b26c047b
ID
4440static ssize_t rbd_pool_ns_show(struct device *dev,
4441 struct device_attribute *attr, char *buf)
4442{
4443 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4444
4445 return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
4446}
4447
dfc5606d
YS
4448static ssize_t rbd_name_show(struct device *dev,
4449 struct device_attribute *attr, char *buf)
4450{
593a9e7b 4451 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 4452
a92ffdf8
AE
4453 if (rbd_dev->spec->image_name)
4454 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
4455
4456 return sprintf(buf, "(unknown)\n");
dfc5606d
YS
4457}
4458
589d30e0
AE
4459static ssize_t rbd_image_id_show(struct device *dev,
4460 struct device_attribute *attr, char *buf)
4461{
4462 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4463
0d7dbfce 4464 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
589d30e0
AE
4465}
4466
34b13184
AE
4467/*
4468 * Shows the name of the currently-mapped snapshot (or
4469 * RBD_SNAP_HEAD_NAME for the base image).
4470 */
dfc5606d
YS
4471static ssize_t rbd_snap_show(struct device *dev,
4472 struct device_attribute *attr,
4473 char *buf)
4474{
593a9e7b 4475 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 4476
0d7dbfce 4477 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
dfc5606d
YS
4478}
4479
92a58671
MC
4480static ssize_t rbd_snap_id_show(struct device *dev,
4481 struct device_attribute *attr, char *buf)
4482{
4483 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4484
4485 return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
4486}
4487
/*
 * For a v2 image, shows the chain of parent images, separated by empty
 * lines.  For v1 images or if there is no parent, shows "(no parent
 * image)".
 */
static ssize_t rbd_parent_show(struct device *dev,
			       struct device_attribute *attr,
			       char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	ssize_t count = 0;

	if (!rbd_dev->parent)
		return sprintf(buf, "(no parent image)\n");

	/* walk up the parent chain, appending one record per ancestor */
	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
		struct rbd_spec *spec = rbd_dev->parent_spec;

		count += sprintf(&buf[count], "%s"
			    "pool_id %llu\npool_name %s\n"
			    "pool_ns %s\n"
			    "image_id %s\nimage_name %s\n"
			    "snap_id %llu\nsnap_name %s\n"
			    "overlap %llu\n",
			    !count ? "" : "\n", /* first? */
			    spec->pool_id, spec->pool_name,
			    spec->pool_ns ?: "",
			    spec->image_id, spec->image_name ?: "(unknown)",
			    spec->snap_id, spec->snap_name,
			    rbd_dev->parent_overlap);
	}

	return count;
}
4522
dfc5606d
YS
4523static ssize_t rbd_image_refresh(struct device *dev,
4524 struct device_attribute *attr,
4525 const char *buf,
4526 size_t size)
4527{
593a9e7b 4528 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 4529 int ret;
602adf40 4530
cc4a38bd 4531 ret = rbd_dev_refresh(rbd_dev);
e627db08 4532 if (ret)
52bb1f9b 4533 return ret;
b813623a 4534
52bb1f9b 4535 return size;
dfc5606d 4536}
602adf40 4537
/* sysfs attributes; all read-only except config_info (0400, root-only)
 * and refresh (0200, write-only trigger) */
static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
dfc5606d
YS
4555
/* attribute group wired into rbd_device_type below */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_minor.attr,
	&dev_attr_client_addr.attr,
	&dev_attr_client_id.attr,
	&dev_attr_cluster_fsid.attr,
	&dev_attr_config_info.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_pool_ns.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};
4585
static void rbd_dev_release(struct device *dev);

/* device type for rbd devices; release frees the rbd_device */
static const struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_dev_release,
};
4593
8b8fb99c
AE
4594static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
4595{
4596 kref_get(&spec->kref);
4597
4598 return spec;
4599}
4600
4601static void rbd_spec_free(struct kref *kref);
4602static void rbd_spec_put(struct rbd_spec *spec)
4603{
4604 if (spec)
4605 kref_put(&spec->kref, rbd_spec_free);
4606}
4607
4608static struct rbd_spec *rbd_spec_alloc(void)
4609{
4610 struct rbd_spec *spec;
4611
4612 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
4613 if (!spec)
4614 return NULL;
04077599
ID
4615
4616 spec->pool_id = CEPH_NOPOOL;
4617 spec->snap_id = CEPH_NOSNAP;
8b8fb99c
AE
4618 kref_init(&spec->kref);
4619
8b8fb99c
AE
4620 return spec;
4621}
4622
4623static void rbd_spec_free(struct kref *kref)
4624{
4625 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
4626
4627 kfree(spec->pool_name);
b26c047b 4628 kfree(spec->pool_ns);
8b8fb99c
AE
4629 kfree(spec->image_id);
4630 kfree(spec->image_name);
4631 kfree(spec->snap_name);
4632 kfree(spec);
4633}
4634
/*
 * Free an rbd_device and drop everything it references.  The watch
 * must already be unregistered and the exclusive lock released; the
 * WARN_ONs catch violations of that.
 */
static void rbd_dev_free(struct rbd_device *rbd_dev)
{
	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);

	ceph_oid_destroy(&rbd_dev->header_oid);
	ceph_oloc_destroy(&rbd_dev->header_oloc);
	kfree(rbd_dev->config_info);

	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev->opts);
	kfree(rbd_dev);
}
4649
/*
 * Device-model release callback, run when the last device reference is
 * dropped.  opts is only set by rbd_dev_create() (mapped devices), so
 * need_put distinguishes mapped devices -- which own a dev_id, a task
 * workqueue and a module ref -- from bare parent devices.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	bool need_put = !!rbd_dev->opts;

	if (need_put) {
		destroy_workqueue(rbd_dev->task_wq);
		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
	}

	rbd_dev_free(rbd_dev);

	/*
	 * This is racy, but way better than putting module outside of
	 * the release callback.  The race window is pretty small, so
	 * doing something similar to dm (dm-builtin.c) is overkill.
	 */
	if (need_put)
		module_put(THIS_MODULE);
}
4670
/*
 * Allocate and minimally initialize an rbd_device: locks, work items,
 * header object id/locator and the embedded struct device.  Consumes
 * nothing on failure; on success takes over @rbdc and @spec references.
 * Used both for mapped devices (via rbd_dev_create()) and bare parent
 * devices.
 */
static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
					   struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	init_rwsem(&rbd_dev->header_rwsem);

	rbd_dev->header.data_pool_id = CEPH_NOPOOL;
	ceph_oid_init(&rbd_dev->header_oid);
	rbd_dev->header_oloc.pool = spec->pool_id;
	if (spec->pool_ns) {
		WARN_ON(!*spec->pool_ns);	/* empty ns should be NULL */
		rbd_dev->header_oloc.pool_ns =
		    ceph_find_or_create_string(spec->pool_ns,
					       strlen(spec->pool_ns));
	}

	mutex_init(&rbd_dev->watch_mutex);
	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);

	/* exclusive-lock state machine */
	init_rwsem(&rbd_dev->lock_rwsem);
	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
	init_waitqueue_head(&rbd_dev->lock_waitq);

	rbd_dev->dev.bus = &rbd_bus_type;
	rbd_dev->dev.type = &rbd_device_type;
	rbd_dev->dev.parent = &rbd_root_dev;
	device_initialize(&rbd_dev->dev);

	rbd_dev->rbd_client = rbdc;
	rbd_dev->spec = spec;

	return rbd_dev;
}
4716
/*
 * Create a mapping rbd_dev.
 *
 * On top of __rbd_dev_create() this assigns a device id, creates the
 * per-device ordered task workqueue and pins the module.  Returns NULL
 * on failure (the partially constructed device is freed).
 */
static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
					 struct rbd_spec *spec,
					 struct rbd_options *opts)
{
	struct rbd_device *rbd_dev;

	rbd_dev = __rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		return NULL;

	rbd_dev->opts = opts;

	/* get an id and fill in device name */
	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
					 minor_to_rbd_dev_id(1 << MINORBITS),
					 GFP_KERNEL);
	if (rbd_dev->dev_id < 0)
		goto fail_rbd_dev;

	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
						   rbd_dev->name);
	if (!rbd_dev->task_wq)
		goto fail_dev_id;

	/* we have a ref from do_rbd_add() */
	__module_get(THIS_MODULE);

	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
	return rbd_dev;

fail_dev_id:
	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
fail_rbd_dev:
	rbd_dev_free(rbd_dev);
	return NULL;
}
4757
4758static void rbd_dev_destroy(struct rbd_device *rbd_dev)
4759{
dd5ac32d
ID
4760 if (rbd_dev)
4761 put_device(&rbd_dev->dev);
c53d5893
AE
4762}
4763
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 *
 * @order may be NULL if the caller doesn't need it.  Returns 0 on
 * success, -ERANGE on a truncated reply, or a negative errno.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	/* cls "get_size" call on the header object */
	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_size",
				  &snapid, sizeof(snapid),
				  &size_buf, sizeof(size_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < sizeof (size_buf))
		return -ERANGE;

	if (order) {
		*order = size_buf.order;
		dout(" order %u", (unsigned int)*order);
	}
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx snap_size = %llu\n",
		(unsigned long long)snap_id,
		(unsigned long long)*snap_size);

	return 0;
}
4801
4802static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
4803{
4804 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
4805 &rbd_dev->header.obj_order,
4806 &rbd_dev->header.image_size);
4807}
4808
/*
 * Fetch the image's object prefix via the "get_object_prefix" cls
 * method and store a freshly allocated copy in
 * rbd_dev->header.object_prefix.
 *
 * Returns 0 on success or a negative errno.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_object_prefix",
				  NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	/* decode a length-prefixed string from the reply */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + ret, NULL, GFP_NOIO);
	ret = 0;

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}
out:
	kfree(reply_buf);

	return ret;
}
4842
/*
 * Fetch the feature bits for @snap_id (CEPH_NOSNAP for the base image)
 * via the "get_features" cls method.
 *
 * Returns 0 and stores the features in *snap_features, -ERANGE on a
 * truncated reply, -ENXIO if the image uses incompatible features we
 * don't support, or another negative errno.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	struct {
		__le64 features;
		__le64 incompat;
	} __attribute__ ((packed)) features_buf = { 0 };
	u64 unsup;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_features",
				  &snapid, sizeof(snapid),
				  &features_buf, sizeof(features_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < sizeof (features_buf))
		return -ERANGE;

	/* refuse to map images needing features we don't implement */
	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
	if (unsup) {
		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
			 unsup);
		return -ENXIO;
	}

	*snap_features = le64_to_cpu(features_buf.features);

	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long)snap_id,
		(unsigned long long)*snap_features,
		(unsigned long long)le64_to_cpu(features_buf.incompat));

	return 0;
}
4880
4881static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
4882{
4883 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
4884 &rbd_dev->header.features);
4885}
4886
eb3b2d6b
ID
/*
 * Decoded result of a parent-image query: identifies the parent image
 * (pool, optional pool namespace, image id, snap id) and the size of
 * the overlap region shared with the clone.  pool_ns and image_id are
 * dynamically allocated; the holder must kfree() them.
 */
struct parent_image_info {
	u64		pool_id;
	const char	*pool_ns;	/* may be NULL (default namespace) */
	const char	*image_id;
	u64		snap_id;

	bool		has_overlap;	/* false => parent spec not applicable */
	u64		overlap;	/* bytes shared with the parent */
};
4896
e92c0eaf
ID
4897/*
4898 * The caller is responsible for @pii.
4899 */
4900static int decode_parent_image_spec(void **p, void *end,
4901 struct parent_image_info *pii)
4902{
4903 u8 struct_v;
4904 u32 struct_len;
4905 int ret;
4906
4907 ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
4908 &struct_v, &struct_len);
4909 if (ret)
4910 return ret;
4911
4912 ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
4913 pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
4914 if (IS_ERR(pii->pool_ns)) {
4915 ret = PTR_ERR(pii->pool_ns);
4916 pii->pool_ns = NULL;
4917 return ret;
4918 }
4919 pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
4920 if (IS_ERR(pii->image_id)) {
4921 ret = PTR_ERR(pii->image_id);
4922 pii->image_id = NULL;
4923 return ret;
4924 }
4925 ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
4926 return 0;
4927
4928e_inval:
4929 return -EINVAL;
4930}
4931
4932static int __get_parent_info(struct rbd_device *rbd_dev,
4933 struct page *req_page,
4934 struct page *reply_page,
4935 struct parent_image_info *pii)
4936{
4937 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4938 size_t reply_len = PAGE_SIZE;
4939 void *p, *end;
4940 int ret;
4941
4942 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
4943 "rbd", "parent_get", CEPH_OSD_FLAG_READ,
4944 req_page, sizeof(u64), reply_page, &reply_len);
4945 if (ret)
4946 return ret == -EOPNOTSUPP ? 1 : ret;
4947
4948 p = page_address(reply_page);
4949 end = p + reply_len;
4950 ret = decode_parent_image_spec(&p, end, pii);
4951 if (ret)
4952 return ret;
4953
4954 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
4955 "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
4956 req_page, sizeof(u64), reply_page, &reply_len);
4957 if (ret)
4958 return ret;
4959
4960 p = page_address(reply_page);
4961 end = p + reply_len;
4962 ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
4963 if (pii->has_overlap)
4964 ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
4965
4966 return 0;
4967
4968e_inval:
4969 return -EINVAL;
4970}
4971
eb3b2d6b
ID
4972/*
4973 * The caller is responsible for @pii.
4974 */
4975static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
4976 struct page *req_page,
4977 struct page *reply_page,
4978 struct parent_image_info *pii)
4979{
4980 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4981 size_t reply_len = PAGE_SIZE;
4982 void *p, *end;
4983 int ret;
4984
4985 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
4986 "rbd", "get_parent", CEPH_OSD_FLAG_READ,
4987 req_page, sizeof(u64), reply_page, &reply_len);
4988 if (ret)
4989 return ret;
4990
4991 p = page_address(reply_page);
4992 end = p + reply_len;
4993 ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
4994 pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4995 if (IS_ERR(pii->image_id)) {
4996 ret = PTR_ERR(pii->image_id);
4997 pii->image_id = NULL;
4998 return ret;
4999 }
5000 ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
e92c0eaf 5001 pii->has_overlap = true;
eb3b2d6b
ID
5002 ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5003
5004 return 0;
5005
5006e_inval:
5007 return -EINVAL;
5008}
5009
5010static int get_parent_info(struct rbd_device *rbd_dev,
5011 struct parent_image_info *pii)
5012{
5013 struct page *req_page, *reply_page;
5014 void *p;
5015 int ret;
5016
5017 req_page = alloc_page(GFP_KERNEL);
5018 if (!req_page)
5019 return -ENOMEM;
5020
5021 reply_page = alloc_page(GFP_KERNEL);
5022 if (!reply_page) {
5023 __free_page(req_page);
5024 return -ENOMEM;
5025 }
5026
5027 p = page_address(req_page);
5028 ceph_encode_64(&p, rbd_dev->spec->snap_id);
e92c0eaf
ID
5029 ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
5030 if (ret > 0)
5031 ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
5032 pii);
eb3b2d6b
ID
5033
5034 __free_page(req_page);
5035 __free_page(reply_page);
5036 return ret;
5037}
5038
86b00e0d
AE
/*
 * Refresh the parent image information for a format 2 clone: records
 * rbd_dev->parent_spec once (it never changes except via flatten) and
 * always updates rbd_dev->parent_overlap.  Detects flattening on
 * refresh and drops the parent reference when it happens.
 */
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	struct parent_image_info pii = { 0 };
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	ret = get_parent_info(rbd_dev, &pii);
	if (ret)
		goto out_err;

	dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
	     __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
	     pii.has_overlap, pii.overlap);

	if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
		/*
		 * Either the parent never existed, or we have
		 * record of it but the image got flattened so it no
		 * longer has a parent.  When the parent of a
		 * layered image disappears we immediately set the
		 * overlap to 0.  The effect of this is that all new
		 * requests will be treated as if the image had no
		 * parent.
		 *
		 * If !pii.has_overlap, the parent image spec is not
		 * applicable.  It's there to avoid duplication in each
		 * snapshot record.
		 */
		if (rbd_dev->parent_overlap) {
			rbd_dev->parent_overlap = 0;
			rbd_dev_parent_put(rbd_dev);
			pr_info("%s: clone image has been flattened\n",
				rbd_dev->disk->disk_name);
		}

		goto out;	/* No parent?  No problem. */
	}

	/* The ceph file layout needs to fit pool id in 32 bits */

	ret = -EIO;
	if (pii.pool_id > (u64)U32_MAX) {
		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
			 (unsigned long long)pii.pool_id, U32_MAX);
		goto out_err;
	}

	/*
	 * The parent won't change (except when the clone is
	 * flattened, already handled that).  So we only need to
	 * record the parent spec we have not already done so.
	 */
	if (!rbd_dev->parent_spec) {
		parent_spec->pool_id = pii.pool_id;
		if (pii.pool_ns && *pii.pool_ns) {
			/* transfer ownership of the namespace string */
			parent_spec->pool_ns = pii.pool_ns;
			pii.pool_ns = NULL;
		}
		parent_spec->image_id = pii.image_id;
		pii.image_id = NULL;
		parent_spec->snap_id = pii.snap_id;

		rbd_dev->parent_spec = parent_spec;
		parent_spec = NULL;	/* rbd_dev now owns this */
	}

	/*
	 * We always update the parent overlap.  If it's zero we issue
	 * a warning, as we will proceed as if there was no parent.
	 * parent_spec is non-NULL here only on refresh (spec was
	 * already recorded by a previous probe).
	 */
	if (!pii.overlap) {
		if (parent_spec) {
			/* refresh, careful to warn just once */
			if (rbd_dev->parent_overlap)
				rbd_warn(rbd_dev,
				    "clone now standalone (overlap became 0)");
		} else {
			/* initial probe */
			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
		}
	}
	rbd_dev->parent_overlap = pii.overlap;

out:
	ret = 0;
out_err:
	kfree(pii.pool_ns);
	kfree(pii.image_id);
	rbd_spec_put(parent_spec);
	return ret;
}
5134
cc070d59
AE
5135static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
5136{
5137 struct {
5138 __le64 stripe_unit;
5139 __le64 stripe_count;
5140 } __attribute__ ((packed)) striping_info_buf = { 0 };
5141 size_t size = sizeof (striping_info_buf);
5142 void *p;
cc070d59
AE
5143 int ret;
5144
ecd4a68a
ID
5145 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5146 &rbd_dev->header_oloc, "get_stripe_unit_count",
5147 NULL, 0, &striping_info_buf, size);
cc070d59
AE
5148 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5149 if (ret < 0)
5150 return ret;
5151 if (ret < size)
5152 return -ERANGE;
5153
cc070d59 5154 p = &striping_info_buf;
b1331852
ID
5155 rbd_dev->header.stripe_unit = ceph_decode_64(&p);
5156 rbd_dev->header.stripe_count = ceph_decode_64(&p);
cc070d59
AE
5157 return 0;
5158}
5159
7e97332e
ID
5160static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
5161{
5162 __le64 data_pool_id;
5163 int ret;
5164
5165 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5166 &rbd_dev->header_oloc, "get_data_pool",
5167 NULL, 0, &data_pool_id, sizeof(data_pool_id));
5168 if (ret < 0)
5169 return ret;
5170 if (ret < sizeof(data_pool_id))
5171 return -EBADMSG;
5172
5173 rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
5174 WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
5175 return 0;
5176}
5177
9e15b77d
AE
5178static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
5179{
ecd4a68a 5180 CEPH_DEFINE_OID_ONSTACK(oid);
9e15b77d
AE
5181 size_t image_id_size;
5182 char *image_id;
5183 void *p;
5184 void *end;
5185 size_t size;
5186 void *reply_buf = NULL;
5187 size_t len = 0;
5188 char *image_name = NULL;
5189 int ret;
5190
5191 rbd_assert(!rbd_dev->spec->image_name);
5192
69e7a02f
AE
5193 len = strlen(rbd_dev->spec->image_id);
5194 image_id_size = sizeof (__le32) + len;
9e15b77d
AE
5195 image_id = kmalloc(image_id_size, GFP_KERNEL);
5196 if (!image_id)
5197 return NULL;
5198
5199 p = image_id;
4157976b 5200 end = image_id + image_id_size;
57385b51 5201 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
9e15b77d
AE
5202
5203 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
5204 reply_buf = kmalloc(size, GFP_KERNEL);
5205 if (!reply_buf)
5206 goto out;
5207
ecd4a68a
ID
5208 ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
5209 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5210 "dir_get_name", image_id, image_id_size,
5211 reply_buf, size);
9e15b77d
AE
5212 if (ret < 0)
5213 goto out;
5214 p = reply_buf;
f40eb349
AE
5215 end = reply_buf + ret;
5216
9e15b77d
AE
5217 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
5218 if (IS_ERR(image_name))
5219 image_name = NULL;
5220 else
5221 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
5222out:
5223 kfree(reply_buf);
5224 kfree(image_id);
5225
5226 return image_name;
5227}
5228
2ad3d716
AE
5229static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5230{
5231 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5232 const char *snap_name;
5233 u32 which = 0;
5234
5235 /* Skip over names until we find the one we are looking for */
5236
5237 snap_name = rbd_dev->header.snap_names;
5238 while (which < snapc->num_snaps) {
5239 if (!strcmp(name, snap_name))
5240 return snapc->snaps[which];
5241 snap_name += strlen(snap_name) + 1;
5242 which++;
5243 }
5244 return CEPH_NOSNAP;
5245}
5246
5247static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5248{
5249 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5250 u32 which;
5251 bool found = false;
5252 u64 snap_id;
5253
5254 for (which = 0; !found && which < snapc->num_snaps; which++) {
5255 const char *snap_name;
5256
5257 snap_id = snapc->snaps[which];
5258 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
efadc98a
JD
5259 if (IS_ERR(snap_name)) {
5260 /* ignore no-longer existing snapshots */
5261 if (PTR_ERR(snap_name) == -ENOENT)
5262 continue;
5263 else
5264 break;
5265 }
2ad3d716
AE
5266 found = !strcmp(name, snap_name);
5267 kfree(snap_name);
5268 }
5269 return found ? snap_id : CEPH_NOSNAP;
5270}
5271
5272/*
5273 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
5274 * no snapshot by that name is found, or if an error occurs.
5275 */
5276static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5277{
5278 if (rbd_dev->image_format == 1)
5279 return rbd_v1_snap_id_by_name(rbd_dev, name);
5280
5281 return rbd_v2_snap_id_by_name(rbd_dev, name);
5282}
5283
9e15b77d 5284/*
04077599
ID
5285 * An image being mapped will have everything but the snap id.
5286 */
5287static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
5288{
5289 struct rbd_spec *spec = rbd_dev->spec;
5290
5291 rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
5292 rbd_assert(spec->image_id && spec->image_name);
5293 rbd_assert(spec->snap_name);
5294
5295 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
5296 u64 snap_id;
5297
5298 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
5299 if (snap_id == CEPH_NOSNAP)
5300 return -ENOENT;
5301
5302 spec->snap_id = snap_id;
5303 } else {
5304 spec->snap_id = CEPH_NOSNAP;
5305 }
5306
5307 return 0;
5308}
5309
5310/*
5311 * A parent image will have all ids but none of the names.
e1d4213f 5312 *
04077599
ID
5313 * All names in an rbd spec are dynamically allocated. It's OK if we
5314 * can't figure out the name for an image id.
9e15b77d 5315 */
04077599 5316static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
9e15b77d 5317{
2e9f7f1c
AE
5318 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5319 struct rbd_spec *spec = rbd_dev->spec;
5320 const char *pool_name;
5321 const char *image_name;
5322 const char *snap_name;
9e15b77d
AE
5323 int ret;
5324
04077599
ID
5325 rbd_assert(spec->pool_id != CEPH_NOPOOL);
5326 rbd_assert(spec->image_id);
5327 rbd_assert(spec->snap_id != CEPH_NOSNAP);
9e15b77d 5328
2e9f7f1c 5329 /* Get the pool name; we have to make our own copy of this */
9e15b77d 5330
2e9f7f1c
AE
5331 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
5332 if (!pool_name) {
5333 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
935dc89f
AE
5334 return -EIO;
5335 }
2e9f7f1c
AE
5336 pool_name = kstrdup(pool_name, GFP_KERNEL);
5337 if (!pool_name)
9e15b77d
AE
5338 return -ENOMEM;
5339
5340 /* Fetch the image name; tolerate failure here */
5341
2e9f7f1c
AE
5342 image_name = rbd_dev_image_name(rbd_dev);
5343 if (!image_name)
06ecc6cb 5344 rbd_warn(rbd_dev, "unable to get image name");
9e15b77d 5345
04077599 5346 /* Fetch the snapshot name */
9e15b77d 5347
2e9f7f1c 5348 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
da6a6b63
JD
5349 if (IS_ERR(snap_name)) {
5350 ret = PTR_ERR(snap_name);
9e15b77d 5351 goto out_err;
2e9f7f1c
AE
5352 }
5353
5354 spec->pool_name = pool_name;
5355 spec->image_name = image_name;
5356 spec->snap_name = snap_name;
9e15b77d
AE
5357
5358 return 0;
04077599 5359
9e15b77d 5360out_err:
2e9f7f1c
AE
5361 kfree(image_name);
5362 kfree(pool_name);
9e15b77d
AE
5363 return ret;
5364}
5365
/*
 * Fetch the image's snapshot context (seq + snapshot id array) via the
 * "get_snapcontext" class method and install it as header.snapc,
 * dropping the reference to the previous context.
 */
static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	void *reply_buf;
	void *p;
	void *end;
	u64 seq;
	u32 snap_count;
	struct ceph_snap_context *snapc;
	u32 i;

	/*
	 * We'll need room for the seq value (maximum snapshot id),
	 * snapshot count, and array of that many snapshot ids.
	 * For now we have a fixed upper limit on the number we're
	 * prepared to receive.
	 */
	size = sizeof (__le64) + sizeof (__le32) +
			RBD_MAX_SNAP_COUNT * sizeof (__le64);
	reply_buf = kzalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_snapcontext",
				  NULL, 0, reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	end = reply_buf + ret;
	ret = -ERANGE;
	ceph_decode_64_safe(&p, end, seq, out);
	ceph_decode_32_safe(&p, end, snap_count, out);

	/*
	 * Make sure the reported number of snapshot ids wouldn't go
	 * beyond the end of our buffer.  But before checking that,
	 * make sure the computed size of the snapshot context we
	 * allocate is representable in a size_t.
	 */
	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
				 / sizeof (u64)) {
		ret = -EINVAL;
		goto out;
	}
	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
		goto out;
	ret = 0;

	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc) {
		ret = -ENOMEM;
		goto out;
	}
	snapc->seq = seq;
	for (i = 0; i < snap_count; i++)
		snapc->snaps[i] = ceph_decode_64(&p);

	/* swap in the new context, releasing the old one */
	ceph_put_snap_context(rbd_dev->header.snapc);
	rbd_dev->header.snapc = snapc;

	dout(" snap context seq = %llu, snap_count = %u\n",
	     (unsigned long long)seq, (unsigned int)snap_count);
out:
	kfree(reply_buf);

	return ret;
}
5437
54cac61f
AE
5438static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
5439 u64 snap_id)
b8b1e2db
AE
5440{
5441 size_t size;
5442 void *reply_buf;
54cac61f 5443 __le64 snapid;
b8b1e2db
AE
5444 int ret;
5445 void *p;
5446 void *end;
b8b1e2db
AE
5447 char *snap_name;
5448
5449 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
5450 reply_buf = kmalloc(size, GFP_KERNEL);
5451 if (!reply_buf)
5452 return ERR_PTR(-ENOMEM);
5453
54cac61f 5454 snapid = cpu_to_le64(snap_id);
ecd4a68a
ID
5455 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5456 &rbd_dev->header_oloc, "get_snapshot_name",
5457 &snapid, sizeof(snapid), reply_buf, size);
36be9a76 5458 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
f40eb349
AE
5459 if (ret < 0) {
5460 snap_name = ERR_PTR(ret);
b8b1e2db 5461 goto out;
f40eb349 5462 }
b8b1e2db
AE
5463
5464 p = reply_buf;
f40eb349 5465 end = reply_buf + ret;
e5c35534 5466 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
f40eb349 5467 if (IS_ERR(snap_name))
b8b1e2db 5468 goto out;
b8b1e2db 5469
f40eb349 5470 dout(" snap_id 0x%016llx snap_name = %s\n",
54cac61f 5471 (unsigned long long)snap_id, snap_name);
b8b1e2db
AE
5472out:
5473 kfree(reply_buf);
5474
f40eb349 5475 return snap_name;
b8b1e2db
AE
5476}
5477
2df3fac7 5478static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
117973fb 5479{
2df3fac7 5480 bool first_time = rbd_dev->header.object_prefix == NULL;
117973fb 5481 int ret;
117973fb 5482
1617e40c
JD
5483 ret = rbd_dev_v2_image_size(rbd_dev);
5484 if (ret)
cfbf6377 5485 return ret;
1617e40c 5486
2df3fac7
AE
5487 if (first_time) {
5488 ret = rbd_dev_v2_header_onetime(rbd_dev);
5489 if (ret)
cfbf6377 5490 return ret;
2df3fac7
AE
5491 }
5492
cc4a38bd 5493 ret = rbd_dev_v2_snap_context(rbd_dev);
d194cd1d
ID
5494 if (ret && first_time) {
5495 kfree(rbd_dev->header.object_prefix);
5496 rbd_dev->header.object_prefix = NULL;
5497 }
117973fb
AE
5498
5499 return ret;
5500}
5501
a720ae09
ID
5502static int rbd_dev_header_info(struct rbd_device *rbd_dev)
5503{
5504 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5505
5506 if (rbd_dev->image_format == 1)
5507 return rbd_dev_v1_header_info(rbd_dev);
5508
5509 return rbd_dev_v2_header_info(rbd_dev);
5510}
5511
e28fff26
AE
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.  The recognized space
 * characters are those for which isspace() is nonzero in the "C"
 * and "POSIX" locales.
 */
static inline size_t next_token(const char **buf)
{
	static const char spaces[] = " \f\n\r\t\v";
	const char *p = *buf;

	p += strspn(p, spaces);		/* find start of token */
	*buf = p;

	return strcspn(p, spaces);	/* return token length */
}
5530
ea3352f4
AE
5531/*
5532 * Finds the next token in *buf, dynamically allocates a buffer big
5533 * enough to hold a copy of it, and copies the token into the new
5534 * buffer. The copy is guaranteed to be terminated with '\0'. Note
5535 * that a duplicate buffer is created even for a zero-length token.
5536 *
5537 * Returns a pointer to the newly-allocated duplicate, or a null
5538 * pointer if memory for the duplicate was not available. If
5539 * the lenp argument is a non-null pointer, the length of the token
5540 * (not including the '\0') is returned in *lenp.
5541 *
5542 * If successful, the *buf pointer will be updated to point beyond
5543 * the end of the found token.
5544 *
5545 * Note: uses GFP_KERNEL for allocation.
5546 */
5547static inline char *dup_token(const char **buf, size_t *lenp)
5548{
5549 char *dup;
5550 size_t len;
5551
5552 len = next_token(buf);
4caf35f9 5553 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
ea3352f4
AE
5554 if (!dup)
5555 return NULL;
ea3352f4
AE
5556 *(dup + len) = '\0';
5557 *buf += len;
5558
5559 if (lenp)
5560 *lenp = len;
5561
5562 return dup;
5563}
5564
/*
 * Parse the options provided for an "rbd add" (i.e., rbd image
 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
 * and the data written is passed here via a NUL-terminated buffer.
 * Returns 0 if successful or an error code otherwise.
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *      The address of a pointer that will refer to a ceph options
 *      structure.  Caller must release the returned pointer using
 *      ceph_destroy_options() when it is no longer needed.
 *  opts
 *      Address of an rbd options pointer.  Fully initialized by
 *      this function; caller must release with kfree().
 *  rbd_spec
 *      Address of an rbd image specification pointer.  Fully
 *      initialized by this function based on parsed options.
 *      Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *      I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_id>
 *      An optional snapshot id.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot id is
 *      provided.  Snapshot mappings are always read-only.
 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	char *snap_name;
	size_t mon_addrs_size;
	struct parse_rbd_opts_ctx pctx = { 0 };
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	/* mon_addrs is not copied; it is sliced out of @buf below */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	pctx.spec = rbd_spec_alloc();
	if (!pctx.spec)
		goto out_mem;

	pctx.spec->pool_name = dup_token(&buf, NULL);
	if (!pctx.spec->pool_name)
		goto out_mem;
	if (!*pctx.spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	pctx.spec->image_name = dup_token(&buf, NULL);
	if (!pctx.spec->image_name)
		goto out_mem;
	if (!*pctx.spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_mem;
	*(snap_name + len) = '\0';
	pctx.spec->snap_name = snap_name;

	/* Initialize all rbd options to the defaults */

	pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
	if (!pctx.opts)
		goto out_mem;

	pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
	pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
	pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
	pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
	pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
	pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
	pctx.opts->trim = RBD_TRIM_DEFAULT;

	/* rbd-specific option tokens are routed to parse_rbd_opts_token */
	copts = ceph_parse_options(options, mon_addrs,
				   mon_addrs + mon_addrs_size - 1,
				   parse_rbd_opts_token, &pctx);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	*ceph_opts = copts;
	*opts = pctx.opts;
	*rbd_spec = pctx.spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	kfree(pctx.opts);
	rbd_spec_put(pctx.spec);
	kfree(options);

	return ret;
}
5715
e010dd0a
ID
/*
 * Release the exclusive lock on unmap if this client currently owns
 * it.  Taken and checked under lock_rwsem held for write.
 */
static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
{
	down_write(&rbd_dev->lock_rwsem);
	if (__rbd_is_lock_owner(rbd_dev))
		rbd_unlock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);
}
5723
/*
 * Acquire the exclusive lock at map time ("rbd map --exclusive").
 * Requires the exclusive-lock image feature; maps failure to acquire
 * to -EROFS since the mapping would not be writable.
 */
static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
{
	int ret;

	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
		rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
		return -EINVAL;
	}

	/* FIXME: "rbd map --exclusive" should be in interruptible */
	down_read(&rbd_dev->lock_rwsem);
	ret = rbd_wait_state_locked(rbd_dev, true);
	up_read(&rbd_dev->lock_rwsem);
	if (ret) {
		rbd_warn(rbd_dev, "failed to acquire exclusive lock");
		return -EROFS;
	}

	return 0;
}
5744
589d30e0
AE
5745/*
5746 * An rbd format 2 image has a unique identifier, distinct from the
5747 * name given to it by the user. Internally, that identifier is
5748 * what's used to specify the names of objects related to the image.
5749 *
5750 * A special "rbd id" object is used to map an rbd image name to its
5751 * id. If that object doesn't exist, then there is no v2 rbd image
5752 * with the supplied name.
5753 *
5754 * This function will record the given rbd_dev's image_id field if
5755 * it can be determined, and in that case will return 0. If any
5756 * errors occur a negative errno will be returned and the rbd_dev's
5757 * image_id field will be unchanged (and should be NULL).
5758 */
5759static int rbd_dev_image_id(struct rbd_device *rbd_dev)
5760{
5761 int ret;
5762 size_t size;
ecd4a68a 5763 CEPH_DEFINE_OID_ONSTACK(oid);
589d30e0 5764 void *response;
c0fba368 5765 char *image_id;
2f82ee54 5766
2c0d0a10
AE
5767 /*
5768 * When probing a parent image, the image id is already
5769 * known (and the image name likely is not). There's no
c0fba368
AE
5770 * need to fetch the image id again in this case. We
5771 * do still need to set the image format though.
2c0d0a10 5772 */
c0fba368
AE
5773 if (rbd_dev->spec->image_id) {
5774 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
5775
2c0d0a10 5776 return 0;
c0fba368 5777 }
2c0d0a10 5778
589d30e0
AE
5779 /*
5780 * First, see if the format 2 image id file exists, and if
5781 * so, get the image's persistent id from it.
5782 */
ecd4a68a
ID
5783 ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
5784 rbd_dev->spec->image_name);
5785 if (ret)
5786 return ret;
5787
5788 dout("rbd id object name is %s\n", oid.name);
589d30e0
AE
5789
5790 /* Response will be an encoded string, which includes a length */
5791
5792 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
5793 response = kzalloc(size, GFP_NOIO);
5794 if (!response) {
5795 ret = -ENOMEM;
5796 goto out;
5797 }
5798
c0fba368
AE
5799 /* If it doesn't exist we'll assume it's a format 1 image */
5800
ecd4a68a
ID
5801 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5802 "get_id", NULL, 0,
5803 response, RBD_IMAGE_ID_LEN_MAX);
36be9a76 5804 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
c0fba368
AE
5805 if (ret == -ENOENT) {
5806 image_id = kstrdup("", GFP_KERNEL);
5807 ret = image_id ? 0 : -ENOMEM;
5808 if (!ret)
5809 rbd_dev->image_format = 1;
7dd440c9 5810 } else if (ret >= 0) {
c0fba368
AE
5811 void *p = response;
5812
5813 image_id = ceph_extract_encoded_string(&p, p + ret,
979ed480 5814 NULL, GFP_NOIO);
461f758a 5815 ret = PTR_ERR_OR_ZERO(image_id);
c0fba368
AE
5816 if (!ret)
5817 rbd_dev->image_format = 2;
c0fba368
AE
5818 }
5819
5820 if (!ret) {
5821 rbd_dev->spec->image_id = image_id;
5822 dout("image_id is %s\n", image_id);
589d30e0
AE
5823 }
5824out:
5825 kfree(response);
ecd4a68a 5826 ceph_oid_destroy(&oid);
589d30e0
AE
5827 return ret;
5828}
5829
3abef3b3
AE
5830/*
5831 * Undo whatever state changes are made by v1 or v2 header info
5832 * call.
5833 */
6fd48b3b
AE
5834static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
5835{
5836 struct rbd_image_header *header;
5837
e69b8d41 5838 rbd_dev_parent_put(rbd_dev);
6fd48b3b
AE
5839
5840 /* Free dynamic fields from the header, then zero it out */
5841
5842 header = &rbd_dev->header;
812164f8 5843 ceph_put_snap_context(header->snapc);
6fd48b3b
AE
5844 kfree(header->snap_sizes);
5845 kfree(header->snap_names);
5846 kfree(header->object_prefix);
5847 memset(header, 0, sizeof (*header));
5848}
5849
2df3fac7 5850static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
a30b71b9
AE
5851{
5852 int ret;
a30b71b9 5853
1e130199 5854 ret = rbd_dev_v2_object_prefix(rbd_dev);
57385b51 5855 if (ret)
b1b5402a
AE
5856 goto out_err;
5857
2df3fac7
AE
5858 /*
5859 * Get the and check features for the image. Currently the
5860 * features are assumed to never change.
5861 */
b1b5402a 5862 ret = rbd_dev_v2_features(rbd_dev);
57385b51 5863 if (ret)
9d475de5 5864 goto out_err;
35d489f9 5865
cc070d59
AE
5866 /* If the image supports fancy striping, get its parameters */
5867
5868 if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
5869 ret = rbd_dev_v2_striping_info(rbd_dev);
5870 if (ret < 0)
5871 goto out_err;
5872 }
a30b71b9 5873
7e97332e
ID
5874 if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
5875 ret = rbd_dev_v2_data_pool(rbd_dev);
5876 if (ret)
5877 goto out_err;
5878 }
5879
263423f8 5880 rbd_init_layout(rbd_dev);
35152979 5881 return 0;
263423f8 5882
9d475de5 5883out_err:
642a2537 5884 rbd_dev->header.features = 0;
1e130199
AE
5885 kfree(rbd_dev->header.object_prefix);
5886 rbd_dev->header.object_prefix = NULL;
9d475de5 5887 return ret;
a30b71b9
AE
5888}
5889
/*
 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
 * rbd_dev_image_probe() recursion depth, which means it's also the
 * length of the already discovered part of the parent chain.
 */
static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
{
	struct rbd_device *parent = NULL;
	int ret;

	/* No parent spec means this image is not layered - nothing to do */
	if (!rbd_dev->parent_spec)
		return 0;

	/* Bound the recursion so a corrupt/cyclic chain can't blow the stack */
	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
		pr_info("parent chain is too long (%d)\n", depth);
		ret = -EINVAL;
		goto out_err;
	}

	parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
	if (!parent) {
		ret = -ENOMEM;
		goto out_err;
	}

	/*
	 * Images related by parent/child relationships always share
	 * rbd_client and spec/parent_spec, so bump their refcounts.
	 */
	__rbd_get_client(rbd_dev->rbd_client);
	rbd_spec_get(rbd_dev->parent_spec);

	/* Recursively probe the parent (which may itself be layered) */
	ret = rbd_dev_image_probe(parent, depth);
	if (ret < 0)
		goto out_err;

	rbd_dev->parent = parent;
	atomic_set(&rbd_dev->parent_ref, 1);
	return 0;

out_err:
	/*
	 * rbd_dev_unparent() drops the parent spec reference;
	 * rbd_dev_destroy() tolerates a NULL parent (pre-create failure).
	 */
	rbd_dev_unparent(rbd_dev);
	rbd_dev_destroy(parent);
	return ret;
}
5935
/*
 * Tear down the block-device side of a mapped image: mark it gone,
 * drop the mapping, free the gendisk, and (unless all devices share
 * the single major) give back the major number taken in
 * rbd_dev_device_setup().  Order mirrors setup in reverse.
 */
static void rbd_dev_device_release(struct rbd_device *rbd_dev)
{
	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	rbd_dev_mapping_clear(rbd_dev);
	rbd_free_disk(rbd_dev);
	if (!single_major)
		unregister_blkdev(rbd_dev->major, rbd_dev->name);
}
5944
/*
 * rbd_dev->header_rwsem must be locked for write and will be unlocked
 * upon return.
 */
static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
{
	int ret;

	/* Record our major and minor device numbers. */

	if (!single_major) {
		/* register_blkdev(0, ...) dynamically allocates a major */
		ret = register_blkdev(0, rbd_dev->name);
		if (ret < 0)
			goto err_out_unlock;

		rbd_dev->major = ret;
		rbd_dev->minor = 0;
	} else {
		/* All devices share one major; minor derives from dev_id */
		rbd_dev->major = rbd_major;
		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
	}

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_dev_mapping_set(rbd_dev);
	if (ret)
		goto err_out_disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
	set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);

	/* Sysfs name is just the numeric device id */
	ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
	if (ret)
		goto err_out_mapping;

	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	up_write(&rbd_dev->header_rwsem);
	return 0;

err_out_mapping:
	rbd_dev_mapping_clear(rbd_dev);
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	if (!single_major)
		unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_unlock:
	/* Per the contract above, the rwsem is released on every path */
	up_write(&rbd_dev->header_rwsem);
	return ret;
}
5999
332bb12d
AE
6000static int rbd_dev_header_name(struct rbd_device *rbd_dev)
6001{
6002 struct rbd_spec *spec = rbd_dev->spec;
c41d13a3 6003 int ret;
332bb12d
AE
6004
6005 /* Record the header object name for this rbd image. */
6006
6007 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
332bb12d 6008 if (rbd_dev->image_format == 1)
c41d13a3
ID
6009 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6010 spec->image_name, RBD_SUFFIX);
332bb12d 6011 else
c41d13a3
ID
6012 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6013 RBD_HEADER_PREFIX, spec->image_id);
332bb12d 6014
c41d13a3 6015 return ret;
332bb12d
AE
6016}
6017
/*
 * Undo rbd_dev_image_probe(): drop header state, tear down the watch
 * (only mapped devices - those created with opts - register one), and
 * forget the image id/format so a future probe starts clean.
 */
static void rbd_dev_image_release(struct rbd_device *rbd_dev)
{
	rbd_dev_unprobe(rbd_dev);
	/* opts is only set for the mapped (top-level) device, not parents */
	if (rbd_dev->opts)
		rbd_unregister_watch(rbd_dev);
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;
}
6027
/*
 * Probe for the existence of the header object for the given rbd
 * device.  If this image is the one being mapped (i.e., not a
 * parent), initiate a watch on its header object before using that
 * object to get detailed information about the rbd image.
 */
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
{
	int ret;

	/*
	 * Get the id from the image id object.  Unless there's an
	 * error, rbd_dev->spec->image_id will be filled in with
	 * a dynamically-allocated string, and rbd_dev->image_format
	 * will be set to either 1 or 2.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_header_name(rbd_dev);
	if (ret)
		goto err_out_format;

	/* depth == 0 means this is the mapped image, not a parent */
	if (!depth) {
		ret = rbd_register_watch(rbd_dev);
		if (ret) {
			if (ret == -ENOENT)
				pr_info("image %s/%s%s%s does not exist\n",
					rbd_dev->spec->pool_name,
					rbd_dev->spec->pool_ns ?: "",
					rbd_dev->spec->pool_ns ? "/" : "",
					rbd_dev->spec->image_name);
			goto err_out_format;
		}
	}

	ret = rbd_dev_header_info(rbd_dev);
	if (ret)
		goto err_out_watch;

	/*
	 * If this image is the one being mapped, we have pool name and
	 * id, image name and id, and snap name - need to fill snap id.
	 * Otherwise this is a parent image, identified by pool, image
	 * and snap ids - need to fill in names for those ids.
	 */
	if (!depth)
		ret = rbd_spec_fill_snap_id(rbd_dev);
	else
		ret = rbd_spec_fill_names(rbd_dev);
	if (ret) {
		if (ret == -ENOENT)
			pr_info("snap %s/%s%s%s@%s does not exist\n",
				rbd_dev->spec->pool_name,
				rbd_dev->spec->pool_ns ?: "",
				rbd_dev->spec->pool_ns ? "/" : "",
				rbd_dev->spec->image_name,
				rbd_dev->spec->snap_name);
		goto err_out_probe;
	}

	/* Layered images need their parent's identity resolved too */
	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto err_out_probe;
	}

	/* Recursively probe the parent chain (bounded by @depth) */
	ret = rbd_dev_probe_parent(rbd_dev, depth);
	if (ret)
		goto err_out_probe;

	dout("discovered format %u image, header name is %s\n",
		rbd_dev->image_format, rbd_dev->header_oid.name);
	return 0;

err_out_probe:
	rbd_dev_unprobe(rbd_dev);
err_out_watch:
	/* Only the mapped image registered a watch above */
	if (!depth)
		rbd_unregister_watch(rbd_dev);
err_out_format:
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;
	return ret;
}
6115
/*
 * Handle a sysfs "add" request: parse the map string in @buf, connect
 * to the cluster, probe the image (and its parent chain), set up the
 * block device, and announce the disk.  Returns @count on success or
 * a negative errno.
 */
static ssize_t do_rbd_add(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	int rc;

	/* Pin the module for the duration of the add operation */
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto out;

	/* rbd_get_client() consumes ceph_opts (shared or freed) */
	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}

	/* pick the pool */
	rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
	if (rc < 0) {
		if (rc == -ENOENT)
			pr_info("pool %s does not exist\n", spec->pool_name);
		goto err_out_client;
	}
	spec->pool_id = (u64)rc;

	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
	if (!rbd_dev) {
		rc = -ENOMEM;
		goto err_out_client;
	}
	/* Ownership transfers; NULL these so the error paths don't double-put */
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */
	rbd_opts = NULL;	/* rbd_dev now owns this */

	/* Keep a copy of the add string for the config_info sysfs attr */
	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
	if (!rbd_dev->config_info) {
		rc = -ENOMEM;
		goto err_out_rbd_dev;
	}

	/* rbd_dev_device_setup() unlocks header_rwsem on success */
	down_write(&rbd_dev->header_rwsem);
	rc = rbd_dev_image_probe(rbd_dev, 0);
	if (rc < 0) {
		up_write(&rbd_dev->header_rwsem);
		goto err_out_rbd_dev;
	}

	/* If we are mapping a snapshot it must be marked read-only */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
		rbd_dev->opts->read_only = true;

	/* alloc_size larger than the object size makes no sense; clamp it */
	if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
		rbd_warn(rbd_dev, "alloc_size adjusted to %u",
			 rbd_dev->layout.object_size);
		rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
	}

	rc = rbd_dev_device_setup(rbd_dev);
	if (rc)
		goto err_out_image_probe;

	/* With -o exclusive, take the exclusive lock up front */
	if (rbd_dev->opts->exclusive) {
		rc = rbd_add_acquire_lock(rbd_dev);
		if (rc)
			goto err_out_device_setup;
	}

	/* Everything's ready.  Announce the disk to the world. */

	rc = device_add(&rbd_dev->dev);
	if (rc)
		goto err_out_image_lock;

	add_disk(rbd_dev->disk);
	/* see rbd_init_disk() */
	blk_put_queue(rbd_dev->disk->queue);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);

	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
		rbd_dev->header.features);
	rc = count;
out:
	module_put(THIS_MODULE);
	return rc;

err_out_image_lock:
	rbd_dev_image_unlock(rbd_dev);
err_out_device_setup:
	rbd_dev_device_release(rbd_dev);
err_out_image_probe:
	rbd_dev_image_release(rbd_dev);
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	rbd_spec_put(spec);
	kfree(rbd_opts);
	goto out;
}
6229
7e9586ba 6230static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
9b60e70b
ID
6231{
6232 if (single_major)
6233 return -EINVAL;
6234
6235 return do_rbd_add(bus, buf, count);
6236}
6237
/* sysfs "add_single_major" handler - thin wrapper around do_rbd_add() */
static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
				      size_t count)
{
	return do_rbd_add(bus, buf, count);
}
6243
05a46afd
AE
6244static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
6245{
ad945fc1 6246 while (rbd_dev->parent) {
05a46afd
AE
6247 struct rbd_device *first = rbd_dev;
6248 struct rbd_device *second = first->parent;
6249 struct rbd_device *third;
6250
6251 /*
6252 * Follow to the parent with no grandparent and
6253 * remove it.
6254 */
6255 while (second && (third = second->parent)) {
6256 first = second;
6257 second = third;
6258 }
ad945fc1 6259 rbd_assert(second);
8ad42cd0 6260 rbd_dev_image_release(second);
8b679ec5 6261 rbd_dev_destroy(second);
ad945fc1
AE
6262 first->parent = NULL;
6263 first->parent_overlap = 0;
6264
6265 rbd_assert(first->parent_spec);
05a46afd
AE
6266 rbd_spec_put(first->parent_spec);
6267 first->parent_spec = NULL;
05a46afd
AE
6268 }
6269}
6270
/*
 * Handle a sysfs "remove" request.  @buf is "<dev_id> [force]"; with
 * "force" the device is removed even while open, after failing all
 * outstanding I/O.  Returns @count on success or a negative errno
 * (-ENOENT unknown id, -EBUSY open without force, -EINPROGRESS if a
 * removal is already underway).
 */
static ssize_t do_rbd_remove(struct bus_type *bus,
			     const char *buf,
			     size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct list_head *tmp;
	int dev_id;
	char opt_buf[6];
	bool force = false;
	int ret;

	dev_id = -1;
	opt_buf[0] = '\0';
	sscanf(buf, "%d %5s", &dev_id, opt_buf);
	if (dev_id < 0) {
		pr_err("dev_id out of range\n");
		return -EINVAL;
	}
	if (opt_buf[0] != '\0') {
		if (!strcmp(opt_buf, "force")) {
			force = true;
		} else {
			pr_err("bad remove option at '%s'\n", opt_buf);
			return -EINVAL;
		}
	}

	/*
	 * Find the device and, under its lock, mark it REMOVING.  The
	 * REMOVING bit keeps concurrent removals (and new opens) out.
	 */
	ret = -ENOENT;
	spin_lock(&rbd_dev_list_lock);
	list_for_each(tmp, &rbd_dev_list) {
		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id == dev_id) {
			ret = 0;
			break;
		}
	}
	if (!ret) {
		spin_lock_irq(&rbd_dev->lock);
		if (rbd_dev->open_count && !force)
			ret = -EBUSY;
		else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
					  &rbd_dev->flags))
			ret = -EINPROGRESS;
		spin_unlock_irq(&rbd_dev->lock);
	}
	spin_unlock(&rbd_dev_list_lock);
	if (ret)
		return ret;

	if (force) {
		/*
		 * Prevent new IO from being queued and wait for existing
		 * IO to complete/fail.
		 */
		blk_mq_freeze_queue(rbd_dev->disk->queue);
		blk_set_queue_dying(rbd_dev->disk->queue);
	}

	/* Unpublish the disk and drop it from the global device list */
	del_gendisk(rbd_dev->disk);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);
	spin_unlock(&rbd_dev_list_lock);
	device_del(&rbd_dev->dev);

	rbd_dev_image_unlock(rbd_dev);
	rbd_dev_device_release(rbd_dev);
	rbd_dev_image_release(rbd_dev);
	rbd_dev_destroy(rbd_dev);
	return count;
}
6341
7e9586ba 6342static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
9b60e70b
ID
6343{
6344 if (single_major)
6345 return -EINVAL;
6346
6347 return do_rbd_remove(bus, buf, count);
6348}
6349
/* sysfs "remove_single_major" handler - thin wrapper around do_rbd_remove() */
static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
					 size_t count)
{
	return do_rbd_remove(bus, buf, count);
}
6355
602adf40
YS
6356/*
6357 * create control files in sysfs
dfc5606d 6358 * /sys/bus/rbd/...
602adf40 6359 */
7d8dc534 6360static int __init rbd_sysfs_init(void)
602adf40 6361{
dfc5606d 6362 int ret;
602adf40 6363
fed4c143 6364 ret = device_register(&rbd_root_dev);
21079786 6365 if (ret < 0)
dfc5606d 6366 return ret;
602adf40 6367
fed4c143
AE
6368 ret = bus_register(&rbd_bus_type);
6369 if (ret < 0)
6370 device_unregister(&rbd_root_dev);
602adf40 6371
602adf40
YS
6372 return ret;
6373}
6374
/* Remove the sysfs control files - reverse order of rbd_sysfs_init() */
static void __exit rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
6380
7d8dc534 6381static int __init rbd_slab_init(void)
1c2a9dfe
AE
6382{
6383 rbd_assert(!rbd_img_request_cache);
03d94406 6384 rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
868311b1
AE
6385 if (!rbd_img_request_cache)
6386 return -ENOMEM;
6387
6388 rbd_assert(!rbd_obj_request_cache);
03d94406 6389 rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
78c2a44a
AE
6390 if (!rbd_obj_request_cache)
6391 goto out_err;
6392
6c696d85 6393 return 0;
1c2a9dfe 6394
6c696d85 6395out_err:
868311b1
AE
6396 kmem_cache_destroy(rbd_img_request_cache);
6397 rbd_img_request_cache = NULL;
1c2a9dfe
AE
6398 return -ENOMEM;
6399}
6400
6401static void rbd_slab_exit(void)
6402{
868311b1
AE
6403 rbd_assert(rbd_obj_request_cache);
6404 kmem_cache_destroy(rbd_obj_request_cache);
6405 rbd_obj_request_cache = NULL;
6406
1c2a9dfe
AE
6407 rbd_assert(rbd_img_request_cache);
6408 kmem_cache_destroy(rbd_img_request_cache);
6409 rbd_img_request_cache = NULL;
6410}
6411
/*
 * Module init: check libceph compatibility, create slab caches and the
 * I/O workqueue, optionally grab the shared major, and register the
 * sysfs bus.  Unwinds in reverse on any failure.
 */
static int __init rbd_init(void)
{
	int rc;

	if (!libceph_compatible(NULL)) {
		rbd_warn(NULL, "libceph incompatibility (quitting)");
		return -EINVAL;
	}

	rc = rbd_slab_init();
	if (rc)
		return rc;

	/*
	 * The number of active work items is limited by the number of
	 * rbd devices * queue depth, so leave @max_active at default.
	 */
	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
	if (!rbd_wq) {
		rc = -ENOMEM;
		goto err_out_slab;
	}

	/* In single-major mode one major covers all rbd devices */
	if (single_major) {
		rbd_major = register_blkdev(0, RBD_DRV_NAME);
		if (rbd_major < 0) {
			rc = rbd_major;
			goto err_out_wq;
		}
	}

	rc = rbd_sysfs_init();
	if (rc)
		goto err_out_blkdev;

	if (single_major)
		pr_info("loaded (major %d)\n", rbd_major);
	else
		pr_info("loaded\n");

	return 0;

err_out_blkdev:
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
err_out_wq:
	destroy_workqueue(rbd_wq);
err_out_slab:
	rbd_slab_exit();
	return rc;
}
6463
/* Module exit: tear down everything rbd_init() set up, in reverse */
static void __exit rbd_exit(void)
{
	ida_destroy(&rbd_dev_id_ida);
	rbd_sysfs_cleanup();
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
	destroy_workqueue(rbd_wq);
	rbd_slab_exit();
}
6473
6474module_init(rbd_init);
6475module_exit(rbd_exit);
6476
d552c619 6477MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
602adf40
YS
6478MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
6479MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
602adf40
YS
6480/* following authorship retained from original osdblk.c */
6481MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
6482
90da258b 6483MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
602adf40 6484MODULE_LICENSE("GPL");