rbd: document rbd_spec structure
[linux-2.6-block.git] / drivers / block / rbd.c
CommitLineData
602adf40
YS
1/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
dfc5606d 24 For usage instructions, please refer to:
602adf40 25
dfc5606d 26 Documentation/ABI/testing/sysfs-bus-rbd
602adf40
YS
27
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
59c2be1e 34#include <linux/parser.h>
602adf40
YS
35
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
aafb230e
AE
44#define RBD_DEBUG /* Activate rbd_assert() calls */
45
593a9e7b
AE
46/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
df111be6
AE
55/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
f0f8cef5
AE
59#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
602adf40
YS
61
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
d4b125e9
AE
64#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
65#define RBD_MAX_SNAP_NAME_LEN \
66 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
67
35d489f9 68#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
602adf40
YS
69#define RBD_MAX_OPT_LEN 1024
70
71#define RBD_SNAP_HEAD_NAME "-"
72
9e15b77d
AE
73/* This allows a single page to hold an image name sent by OSD */
74#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
1e130199 75#define RBD_IMAGE_ID_LEN_MAX 64
9e15b77d 76
1e130199 77#define RBD_OBJ_PREFIX_LEN_MAX 64
589d30e0 78
d889140c
AE
79/* Feature bits */
80
81#define RBD_FEATURE_LAYERING 1
82
83/* Features supported by this (client software) implementation. */
84
85#define RBD_FEATURES_ALL (0)
86
81a89793
AE
87/*
88 * An RBD device name will be "rbd#", where the "rbd" comes from
89 * RBD_DRV_NAME above, and # is a unique integer identifier.
90 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
91 * enough to hold all possible device names.
92 */
602adf40 93#define DEV_NAME_LEN 32
81a89793 94#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
602adf40 95
cc0538b6 96#define RBD_READ_ONLY_DEFAULT false
59c2be1e 97
602adf40
YS
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These fields never change for a given rbd image */
	char *object_prefix;	/* prefix used to name data objects */
	u64 features;		/* feature bits (always 0 for v1 images) */
	__u8 obj_order;		/* log2 of the object (segment) size */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* image size, in bytes */
	struct ceph_snap_context *snapc;
	char *snap_names;	/* copy of the on-disk snapshot name area */
	u64 *snap_sizes;	/* per-snapshot image sizes */

	u64 obj_version;	/* version of the header object on the OSD */
};
117
/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64 pool_id;
	char *pool_name;

	char *image_id;
	size_t image_id_len;
	char *image_name;	/* may be NULL (see note above) */
	size_t image_name_len;

	u64 snap_id;		/* CEPH_NOSNAP when mapping the base image */
	char *snap_name;

	struct kref kref;	/* may be shared between parent and child */
};
157
/* Options parsed from a map request (see rbd_opts_tokens) */
struct rbd_options {
	bool	read_only;	/* map the device read-only */
};
161
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;	/* sharing refcount; release drops list entry */
	struct list_head	node;	/* entry in global rbd_client_list */
};
170
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;	/* nonzero once this request has completed */
	int rc;		/* completion result code */
	u64 bytes;	/* number of bytes transferred */
};
179
/*
 * a collection of requests
 */
struct rbd_req_coll {
	int			total;		/* number of status slots */
	int			num_done;	/* slots completed in order so far */
	struct kref		kref;
	struct rbd_req_status	status[0];	/* trailing array, one per request */
};
189
/*
 * a single io request
 */
struct rbd_request {
	struct request		*rq;		/* blk layer request */
	struct bio		*bio;		/* cloned bio */
	struct page		**pages;	/* list of used pages */
	u64			len;		/* request length, in bytes */
	int			coll_index;	/* slot in coll->status[] */
	struct rbd_req_coll	*coll;		/* owning collection (may be NULL) */
};
201
/* In-memory record of one image snapshot */
struct rbd_snap {
	struct device		dev;		/* sysfs entry for this snapshot */
	const char		*name;
	u64			size;		/* image size at this snapshot */
	struct list_head	node;		/* entry in rbd_dev->snaps */
	u64			id;
	u64			features;
};
210
/* Properties of the image (or snapshot) currently mapped by a device */
struct rbd_mapping {
	u64                     size;		/* mapped size, in bytes */
	u64                     features;
	bool			read_only;
};
216
/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue lock */

	struct rbd_image_header	header;
	bool                    exists;		/* set once the mapping is established */
	struct rbd_spec		*spec;		/* identity of the mapped image */

	char			*header_name;	/* name of the image's header object */

	struct ceph_osd_event   *watch_event;
	struct ceph_osd_request *watch_request;

	/* layering: identity of the parent image, if any */
	struct rbd_spec		*parent_spec;
	u64			parent_overlap;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;		/* entry in global rbd_dev_list */

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* number of active opens */
};
259
602adf40 260static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
e124a82f 261
602adf40 262static LIST_HEAD(rbd_dev_list); /* devices */
e124a82f
AE
263static DEFINE_SPINLOCK(rbd_dev_list_lock);
264
432b8587
AE
265static LIST_HEAD(rbd_client_list); /* clients */
266static DEFINE_SPINLOCK(rbd_client_list_lock);
602adf40 267
304f6808
AE
268static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
269static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
270
dfc5606d 271static void rbd_dev_release(struct device *dev);
41f38c2b 272static void rbd_remove_snap_dev(struct rbd_snap *snap);
dfc5606d 273
/* sysfs bus interface: images are mapped/unmapped by writing to the
 * write-only "add" and "remove" bus attributes. */
static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);

static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* Empty release: rbd_root_dev is static, nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Parent device for all rbd devices in sysfs */
static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
298
/*
 * rbd_assert(): when RBD_DEBUG is defined, log the failed expression
 * with function and line, then BUG().  Compiles away to nothing
 * otherwise, so the expression must have no needed side effects.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
dfc5606d 311
117973fb
AE
312static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
313static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
59c2be1e 314
602adf40
YS
/*
 * Open the block device.  Writable opens of read-only mappings are
 * refused with -EROFS; otherwise take a reference on the rbd device
 * and bump its open count under ctl_mutex.
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	(void) get_device(&rbd_dev->dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);
	rbd_dev->open_count++;
	mutex_unlock(&ctl_mutex);

	return 0;
}
330
dfc5606d
YS
/*
 * Release the block device: drop the open count and the device
 * reference taken in rbd_open(), under ctl_mutex.
 */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rbd_assert(rbd_dev->open_count > 0);
	rbd_dev->open_count--;
	put_device(&rbd_dev->dev);
	mutex_unlock(&ctl_mutex);

	return 0;
}
343
602adf40
YS
/* block_device_operations for rbd disks */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
349
350/*
351 * Initialize an rbd client instance.
43ae4701 352 * We own *ceph_opts.
602adf40 353 */
f8c38929 354static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
602adf40
YS
355{
356 struct rbd_client *rbdc;
357 int ret = -ENOMEM;
358
359 dout("rbd_client_create\n");
360 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
361 if (!rbdc)
362 goto out_opt;
363
364 kref_init(&rbdc->kref);
365 INIT_LIST_HEAD(&rbdc->node);
366
bc534d86
AE
367 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
368
43ae4701 369 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
602adf40 370 if (IS_ERR(rbdc->client))
bc534d86 371 goto out_mutex;
43ae4701 372 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
602adf40
YS
373
374 ret = ceph_open_session(rbdc->client);
375 if (ret < 0)
376 goto out_err;
377
432b8587 378 spin_lock(&rbd_client_list_lock);
602adf40 379 list_add_tail(&rbdc->node, &rbd_client_list);
432b8587 380 spin_unlock(&rbd_client_list_lock);
602adf40 381
bc534d86
AE
382 mutex_unlock(&ctl_mutex);
383
602adf40
YS
384 dout("rbd_client_create created %p\n", rbdc);
385 return rbdc;
386
387out_err:
388 ceph_destroy_client(rbdc->client);
bc534d86
AE
389out_mutex:
390 mutex_unlock(&ctl_mutex);
602adf40
YS
391 kfree(rbdc);
392out_opt:
43ae4701
AE
393 if (ceph_opts)
394 ceph_destroy_options(ceph_opts);
28f259b7 395 return ERR_PTR(ret);
602adf40
YS
396}
397
/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 *
 * Returns NULL if no matching client exists, or if ceph_opts has
 * CEPH_OPT_NOSHARE set (the caller then creates a private client).
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		/* ceph_compare_options() returns 0 on a match */
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			/* Take the reference before dropping the lock */
			kref_get(&client_node->kref);
			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}
422
59c2be1e
YS
/*
 * mount options
 *
 * The enum doubles as a classifier: tokens below Opt_last_int take an
 * integer argument, tokens between Opt_last_int and Opt_last_string
 * take a string argument, and tokens between Opt_last_string and
 * Opt_last_bool are Boolean flags.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
447
/*
 * Parse a single mount-option token into *private (a struct
 * rbd_options).  Callback for ceph_parse_options().  Returns 0 on
 * success, -EINVAL for an unrecognized token, or the match_int()
 * error for a malformed integer argument.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* Classify the token and extract its argument, if any */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* match_token() succeeded, so this can't happen */
		rbd_assert(false);
		break;
	}
	return 0;
}
488
602adf40
YS
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.
 *
 * Consumes ceph_opts either way: an existing client means the options
 * are no longer needed; otherwise rbd_client_create() takes ownership.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc)	/* using an existing client */
		ceph_destroy_options(ceph_opts);
	else
		rbdc = rbd_client_create(ceph_opts);

	return rbdc;
}
505
/*
 * Destroy ceph client.  kref release callback; unlinks the client
 * from rbd_client_list (taking rbd_client_list_lock itself, so the
 * caller must NOT hold that lock) and tears down the ceph client.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
523
524/*
525 * Drop reference to ceph client node. If it's not referenced anymore, release
526 * it.
527 */
9d3997fd 528static void rbd_put_client(struct rbd_client *rbdc)
602adf40 529{
c53d5893
AE
530 if (rbdc)
531 kref_put(&rbdc->kref, rbd_client_release);
602adf40
YS
532}
533
1fec7093
YS
534/*
535 * Destroy requests collection
536 */
537static void rbd_coll_release(struct kref *kref)
538{
539 struct rbd_req_coll *coll =
540 container_of(kref, struct rbd_req_coll, kref);
541
542 dout("rbd_coll_release %p\n", coll);
543 kfree(coll);
544}
602adf40 545
a30b71b9
AE
546static bool rbd_image_format_valid(u32 image_format)
547{
548 return image_format == 1 || image_format == 2;
549}
550
8e94af8e
AE
/*
 * Sanity-check an on-disk (format 1) image header before any of its
 * contents are trusted.  Returns false if the header could not have
 * come from a well-formed rbd image.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
589
602adf40
YS
590/*
591 * Create a new header structure, translate header format from the on-disk
592 * header.
593 */
594static int rbd_header_from_disk(struct rbd_image_header *header,
4156d998 595 struct rbd_image_header_ondisk *ondisk)
602adf40 596{
ccece235 597 u32 snap_count;
58c17b0e 598 size_t len;
d2bb24e5 599 size_t size;
621901d6 600 u32 i;
602adf40 601
6a52325f
AE
602 memset(header, 0, sizeof (*header));
603
103a150f
AE
604 snap_count = le32_to_cpu(ondisk->snap_count);
605
58c17b0e
AE
606 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
607 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6a52325f 608 if (!header->object_prefix)
602adf40 609 return -ENOMEM;
58c17b0e
AE
610 memcpy(header->object_prefix, ondisk->object_prefix, len);
611 header->object_prefix[len] = '\0';
00f1f36f 612
602adf40 613 if (snap_count) {
f785cc1d
AE
614 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
615
621901d6
AE
616 /* Save a copy of the snapshot names */
617
f785cc1d
AE
618 if (snap_names_len > (u64) SIZE_MAX)
619 return -EIO;
620 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
602adf40 621 if (!header->snap_names)
6a52325f 622 goto out_err;
f785cc1d
AE
623 /*
624 * Note that rbd_dev_v1_header_read() guarantees
625 * the ondisk buffer we're working with has
626 * snap_names_len bytes beyond the end of the
627 * snapshot id array, this memcpy() is safe.
628 */
629 memcpy(header->snap_names, &ondisk->snaps[snap_count],
630 snap_names_len);
6a52325f 631
621901d6
AE
632 /* Record each snapshot's size */
633
d2bb24e5
AE
634 size = snap_count * sizeof (*header->snap_sizes);
635 header->snap_sizes = kmalloc(size, GFP_KERNEL);
602adf40 636 if (!header->snap_sizes)
6a52325f 637 goto out_err;
621901d6
AE
638 for (i = 0; i < snap_count; i++)
639 header->snap_sizes[i] =
640 le64_to_cpu(ondisk->snaps[i].image_size);
602adf40 641 } else {
ccece235 642 WARN_ON(ondisk->snap_names_len);
602adf40
YS
643 header->snap_names = NULL;
644 header->snap_sizes = NULL;
645 }
849b4260 646
34b13184 647 header->features = 0; /* No features support in v1 images */
602adf40
YS
648 header->obj_order = ondisk->options.order;
649 header->crypt_type = ondisk->options.crypt_type;
650 header->comp_type = ondisk->options.comp_type;
6a52325f 651
621901d6
AE
652 /* Allocate and fill in the snapshot context */
653
f84344f3 654 header->image_size = le64_to_cpu(ondisk->image_size);
6a52325f
AE
655 size = sizeof (struct ceph_snap_context);
656 size += snap_count * sizeof (header->snapc->snaps[0]);
657 header->snapc = kzalloc(size, GFP_KERNEL);
658 if (!header->snapc)
659 goto out_err;
602adf40
YS
660
661 atomic_set(&header->snapc->nref, 1);
505cbb9b 662 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
602adf40 663 header->snapc->num_snaps = snap_count;
621901d6
AE
664 for (i = 0; i < snap_count; i++)
665 header->snapc->snaps[i] =
666 le64_to_cpu(ondisk->snaps[i].id);
602adf40
YS
667
668 return 0;
669
6a52325f 670out_err:
849b4260 671 kfree(header->snap_sizes);
ccece235 672 header->snap_sizes = NULL;
602adf40 673 kfree(header->snap_names);
ccece235 674 header->snap_names = NULL;
6a52325f
AE
675 kfree(header->object_prefix);
676 header->object_prefix = NULL;
ccece235 677
00f1f36f 678 return -ENOMEM;
602adf40
YS
679}
680
9e15b77d
AE
/*
 * Return the name of the snapshot with the given id, or the special
 * head name for CEPH_NOSNAP.  Returns NULL if no snapshot in the
 * device's snapshot list has that id.  The returned pointer refers
 * into the rbd_snap entry and must not be freed by the caller.
 */
static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct rbd_snap *snap;

	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	list_for_each_entry(snap, &rbd_dev->snaps, node)
		if (snap_id == snap->id)
			return snap->name;

	return NULL;
}
694
/*
 * Look up a snapshot by name and, if found, record its id in the
 * device's spec and its size/features in the current mapping.
 * Returns 0 on success, -ENOENT if no snapshot has that name.
 */
static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
{

	struct rbd_snap *snap;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!strcmp(snap_name, snap->name)) {
			rbd_dev->spec->snap_id = snap->id;
			rbd_dev->mapping.size = snap->size;
			rbd_dev->mapping.features = snap->features;

			return 0;
		}
	}

	return -ENOENT;
}
712
/*
 * Establish the mapping parameters (snap id, size, features) for the
 * image or snapshot named in rbd_dev->spec->snap_name, and mark the
 * device as existing.  Snapshot mappings are forced read-only.
 * Returns 0 on success or the snap_by_name() error (-ENOENT).
 */
static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
{
	int ret;

	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		/* Mapping the base image: take values from the header */
		rbd_dev->spec->snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.features = rbd_dev->header.features;
		ret = 0;
	} else {
		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
		if (ret < 0)
			goto done;
		rbd_dev->mapping.read_only = true;
	}
	rbd_dev->exists = true;
done:
	return ret;
}
733
/*
 * Free everything allocated by rbd_header_from_disk(), nulling each
 * pointer so a subsequent free or refresh is safe.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);	/* refcounted, not kfree'd */
	header->snapc = NULL;
}
745
/*
 * Build the name of the object that holds the image segment
 * containing byte @offset: "<object_prefix>.<segment, 12 hex digits>".
 * Returns a buffer the caller must kfree(), or NULL on allocation or
 * formatting failure.
 */
static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
	char *name;
	u64 segment;
	int ret;

	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
	if (!name)
		return NULL;
	segment = offset >> rbd_dev->header.obj_order;
	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
			rbd_dev->header.object_prefix, segment);
	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
		/* snprintf error or truncated name: report and fail */
		pr_err("error formatting segment name for #%llu (%d)\n",
			segment, ret);
		kfree(name);
		name = NULL;
	}

	return name;
}
602adf40 767
65ccfe21
AE
768static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
769{
770 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
602adf40 771
65ccfe21
AE
772 return offset & (segment_size - 1);
773}
774
/*
 * Return how many bytes of the range [offset, offset + length) fall
 * within the single image segment containing @offset — i.e. @length
 * clipped to the end of that segment.
 */
static u64 rbd_segment_length(struct rbd_device *rbd_dev,
				u64 offset, u64 length)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	offset &= segment_size - 1;	/* offset within the segment */

	rbd_assert(length <= U64_MAX - offset);
	if (offset + length > segment_size)
		length = segment_size - offset;

	return length;
}
788
1fec7093
YS
/*
 * Return the number of image segments spanned by the byte range
 * [ofs, ofs + len): 0 for an empty range, -ERANGE if the range would
 * wrap past U64_MAX.
 *
 * NOTE(review): the int return truncates for ranges spanning more
 * than INT_MAX segments — presumably callers never pass such ranges;
 * confirm before widening use.
 */
static int rbd_get_num_segments(struct rbd_image_header *header,
				u64 ofs, u64 len)
{
	u64 start_seg;
	u64 end_seg;

	if (!len)
		return 0;
	if (len - 1 > U64_MAX - ofs)
		return -ERANGE;

	start_seg = ofs >> header->obj_order;
	end_seg = (ofs + len - 1) >> header->obj_order;

	return end_seg - start_seg + 1;
}
805
029bcbd8
JD
806/*
807 * returns the size of an object in the image
808 */
809static u64 rbd_obj_bytes(struct rbd_image_header *header)
810{
811 return 1 << header->obj_order;
812}
813
602adf40
YS
814/*
815 * bio helpers
816 */
817
818static void bio_chain_put(struct bio *chain)
819{
820 struct bio *tmp;
821
822 while (chain) {
823 tmp = chain;
824 chain = chain->bi_next;
825 bio_put(tmp);
826 }
827}
828
/*
 * zeros a bio chain, starting at specific offset
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* byte position within the whole chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* Zero from start_ofs (or the segment
				 * start, whichever is later) to the
				 * end of this segment. */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
855
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 *
 * Returns NULL on invalid arguments (zero or out-of-range length)
 * or allocation failure.  The clone shares pages with the source
 * bio; only the bio_vec table is copied and trimmed.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;		/* offset into the first cloned segment */
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		/* resid is what remains of len in the last segment */
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
936
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;	/* where to link the next clone */

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi)
			goto out_err;	/* EINVAL; ran out of bio's */
		/* Clone whatever this bio can contribute, up to len */
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			/* Consumed this bio entirely; advance */
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	/* Undo any clones made so far */
	bio_chain_put(chain);

	return NULL;
}
997
/*
 * helpers for osd request op vectors.
 */
/*
 * Allocate a zeroed vector of num_ops + 1 osd request ops (the extra
 * zeroed entry terminates the vector) and set up the first entry's
 * opcode and payload length.  Returns NULL on allocation failure;
 * free with rbd_destroy_ops().
 */
static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
					int opcode, u32 payload_len)
{
	struct ceph_osd_req_op *ops;

	ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
	if (!ops)
		return NULL;

	ops[0].op = opcode;

	/*
	 * op extent offset and length will be set later on
	 * in calc_raw_layout()
	 */
	ops[0].payload_len = payload_len;

	return ops;
}
1020
/* Free an op vector allocated by rbd_create_rw_ops() */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
1025
1fec7093
YS
/*
 * Record completion of one request in a collection and complete, in
 * order, as many consecutive finished slots as possible against the
 * block layer request.  With no collection the block request is
 * completed directly.  Each completed slot drops one collection
 * reference.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		/* Single request, no collection bookkeeping needed */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	/* Advance past the run of contiguous completed slots */
	max = min = coll->num_done;
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i < max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
1063
/* Complete the collection slot recorded in the given rbd request */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
1069
602adf40
YS
/*
 * Send ceph osd request
 *
 * Builds and submits a single OSD request for @object_name covering
 * [@ofs, @ofs + @len).  Data is carried either by @bio or by @pages.
 *
 * If @rbd_cb is NULL the call is synchronous: we wait for the reply,
 * optionally return the reassert version via @ver, and drop the
 * request reference here.  Otherwise @rbd_cb runs on completion and
 * owns the cleanup.
 *
 * If @linger_req is non-NULL the request is registered as lingering
 * (used for watch) and returned through it.
 *
 * Returns 0 or the osd client's status; -ENOMEM on allocation failure
 * (in which case the collection slot, if any, is completed with an
 * error).
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		/* Report the failure for this segment before bailing */
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
		object_name, (unsigned long long) ofs,
		(unsigned long long) len, coll, coll_index);

	osdc = &rbd_dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
					false, GFP_NOIO, pages, bio);
	if (!req) {
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	reqhead = req->r_request->front.iov_base;
	/* Per-op snap ids are set up in ceph_calc_raw_layout() below */
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	/*
	 * NOTE(review): strncpy() does not NUL-terminate if object_name
	 * is exactly sizeof(req->r_oid) long, which would make the
	 * following strlen() read past the buffer — presumably all
	 * callers pass shorter names; confirm.
	 */
	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	/* One object per stripe unit: stripe unit == object size */
	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
	ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
				   req, ops);
	rbd_assert(ret == 0);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);

	if (linger_req) {
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* Synchronous: wait here and clean up ourselves */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%llu\n",
			(unsigned long long)
				le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	/* rbd_coll_end_req_index() ignores req_data->rq when it is NULL */
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
1183
/*
 * Ceph osd op callback
 *
 * Completion handler for asynchronous requests issued by rbd_do_op().
 * Parses the reply, fills short/missing reads with zeros, completes
 * the collection slot, and releases the request resources.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);	/* first op follows the header */
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
		(unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* Reading a hole (object doesn't exist): return zeros */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* Short read: zero-fill the tail up to the requested length */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1223
59c2be1e
YS
/*
 * Minimal completion callback: just drop the request reference.
 * Used for fire-and-forget requests such as notify acks.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1228
602adf40
YS
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector to receive any reply data, runs the request
 * to completion via rbd_do_request() (no callback => synchronous), and
 * for reads copies up to @ret bytes back into @inbound.
 *
 * @inbound_size: capacity of @inbound (may be 0 for data-less ops)
 * Returns the number of bytes handled, or a negative errno.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int flags,
			   struct ceph_osd_req_op *ops,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(ops != NULL);

	num_pages = calc_pages_for(ofs, inbound_size);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			  object_name, ofs, inbound_size, NULL,
			  pages, num_pages,
			  flags,
			  ops,
			  NULL, 0,
			  NULL,		/* no callback: wait for completion */
			  linger_req, ver);
	if (ret < 0)
		goto done;

	/* ret is the number of bytes the osd returned */
	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1272
/*
 * Do an asynchronous ceph osd operation
 *
 * Issues one read or write (direction taken from @rq) against the
 * single rbd object backing the byte range [@ofs, @ofs + @len) of the
 * image.  The range must already lie within one object — the bios were
 * cloned at segment boundaries by the caller.  Completion is reported
 * through @coll slot @coll_index via rbd_req_cb().
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;
	int opcode;
	int flags;
	u64 snapid;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	if (rq_data_dir(rq) == WRITE) {
		opcode = CEPH_OSD_OP_WRITE;
		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
		snapid = CEPH_NOSNAP;	/* writes always go to the head */
		payload_len = seg_len;
	} else {
		opcode = CEPH_OSD_OP_READ;
		flags = CEPH_OSD_FLAG_READ;
		snapc = NULL;		/* reads don't carry a snap context */
		snapid = rbd_dev->spec->snap_id;
		payload_len = 0;
	}

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1337
602adf40
YS
1338/*
1339 * Request sync osd read
1340 */
0ce1a794 1341static int rbd_req_sync_read(struct rbd_device *rbd_dev,
602adf40 1342 u64 snapid,
aded07ea 1343 const char *object_name,
602adf40 1344 u64 ofs, u64 len,
59c2be1e
YS
1345 char *buf,
1346 u64 *ver)
602adf40 1347{
913d2fdc
AE
1348 struct ceph_osd_req_op *ops;
1349 int ret;
1350
1351 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1352 if (!ops)
1353 return -ENOMEM;
1354
1355 ret = rbd_req_sync_op(rbd_dev, NULL,
b06e6a6b 1356 snapid,
602adf40 1357 CEPH_OSD_FLAG_READ,
913d2fdc
AE
1358 ops, object_name, ofs, len, buf, NULL, ver);
1359 rbd_destroy_ops(ops);
1360
1361 return ret;
602adf40
YS
1362}
1363
1364/*
59c2be1e
YS
1365 * Request sync osd watch
1366 */
0ce1a794 1367static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
59c2be1e 1368 u64 ver,
7f0a24d8 1369 u64 notify_id)
59c2be1e
YS
1370{
1371 struct ceph_osd_req_op *ops;
11f77002
SW
1372 int ret;
1373
57cfc106
AE
1374 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1375 if (!ops)
1376 return -ENOMEM;
59c2be1e 1377
a71b891b 1378 ops[0].watch.ver = cpu_to_le64(ver);
59c2be1e
YS
1379 ops[0].watch.cookie = notify_id;
1380 ops[0].watch.flag = 0;
1381
0ce1a794 1382 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
7f0a24d8 1383 rbd_dev->header_name, 0, 0, NULL,
ad4f232f 1384 NULL, 0,
59c2be1e
YS
1385 CEPH_OSD_FLAG_READ,
1386 ops,
1fec7093 1387 NULL, 0,
59c2be1e
YS
1388 rbd_simple_req_cb, 0, NULL);
1389
1390 rbd_destroy_ops(ops);
1391 return ret;
1392}
1393
1394static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1395{
0ce1a794 1396 struct rbd_device *rbd_dev = (struct rbd_device *)data;
a71b891b 1397 u64 hver;
13143d2d
SW
1398 int rc;
1399
0ce1a794 1400 if (!rbd_dev)
59c2be1e
YS
1401 return;
1402
bd919d45
AE
1403 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1404 rbd_dev->header_name, (unsigned long long) notify_id,
1405 (unsigned int) opcode);
117973fb 1406 rc = rbd_dev_refresh(rbd_dev, &hver);
13143d2d 1407 if (rc)
f0f8cef5 1408 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
0ce1a794 1409 " update snaps: %d\n", rbd_dev->major, rc);
59c2be1e 1410
7f0a24d8 1411 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
59c2be1e
YS
1412}
1413
/*
 * Request sync osd watch
 *
 * Registers a watch on the image header object: creates an osd event
 * whose callback is rbd_watch_cb(), then issues a lingering WATCH op
 * so the osd will deliver notifications.  On failure the event is
 * unwound; the ops vector is freed on all paths.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 == establish the watch */

	/* Lingering request: kept registered via rbd_dev->watch_request */
	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1457
79e3057c
YS
1458/*
1459 * Request sync osd unwatch
1460 */
070c633f 1461static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
79e3057c
YS
1462{
1463 struct ceph_osd_req_op *ops;
57cfc106 1464 int ret;
79e3057c 1465
57cfc106
AE
1466 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1467 if (!ops)
1468 return -ENOMEM;
79e3057c
YS
1469
1470 ops[0].watch.ver = 0;
0ce1a794 1471 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
79e3057c
YS
1472 ops[0].watch.flag = 0;
1473
0ce1a794 1474 ret = rbd_req_sync_op(rbd_dev, NULL,
79e3057c 1475 CEPH_NOSNAP,
79e3057c
YS
1476 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1477 ops,
070c633f
AE
1478 rbd_dev->header_name,
1479 0, 0, NULL, NULL, NULL);
1480
79e3057c
YS
1481
1482 rbd_destroy_ops(ops);
0ce1a794
AE
1483 ceph_osdc_cancel_event(rbd_dev->watch_event);
1484 rbd_dev->watch_event = NULL;
79e3057c
YS
1485 return ret;
1486}
1487
/*
 * Synchronous osd object method call
 *
 * Invokes @class_name.@method_name on @object_name via a CALL op.
 * @outbound/@outbound_size carry the method's input parameters;
 * @inbound/@inbound_size receive its reply payload (may be NULL/0).
 * Returns bytes received or -errno.
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     int flags,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int class_name_len = strlen(class_name);
	int method_name_len = strlen(method_name);
	int payload_size;
	int ret;

	/*
	 * Any input parameters required by the method we're calling
	 * will be sent along with the class and method names as
	 * part of the message payload.  That data and its size are
	 * supplied via the indata and indata_len fields (named from
	 * the perspective of the server side) in the OSD request
	 * operation.
	 */
	payload_size = class_name_len + method_name_len + outbound_size;
	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
	if (!ops)
		return -ENOMEM;

	ops[0].cls.class_name = class_name;
	ops[0].cls.class_len = (__u8) class_name_len;
	ops[0].cls.method_name = method_name;
	ops[0].cls.method_len = (__u8) method_name_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = outbound;
	ops[0].cls.indata_len = outbound_size;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      flags, ops,
			      object_name, 0, inbound_size, inbound,
			      NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1540
1fec7093
YS
1541static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1542{
1543 struct rbd_req_coll *coll =
1544 kzalloc(sizeof(struct rbd_req_coll) +
1545 sizeof(struct rbd_req_status) * num_reqs,
1546 GFP_ATOMIC);
1547
1548 if (!coll)
1549 return NULL;
1550 coll->total = num_reqs;
1551 kref_init(&coll->kref);
1552 return coll;
1553}
1554
602adf40
YS
/*
 * block device queue callback
 *
 * Called with q->queue_lock held.  For each fetched request: validate
 * it, drop the queue lock while issuing the (sleeping) osd I/O per
 * image segment, then reacquire the lock before looping.  Each request
 * is split at object boundaries; completions are gathered through an
 * rbd_req_coll so the block request is ended in order.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		bool do_write;
		unsigned int size;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;
		unsigned int bio_offset;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);
		if (do_write && rbd_dev->mapping.read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		/* Issuing osd requests sleeps; can't hold the queue lock */
		spin_unlock_irq(q->queue_lock);

		down_read(&rbd_dev->header_rwsem);

		if (!rbd_dev->exists) {
			/* Only a mapped snapshot can disappear under us */
			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* Pin the snap context for the duration of the request */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		bio = rq->bio;

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		if (num_segs <= 0) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, num_segs);
			ceph_put_snap_context(snapc);
			continue;
		}
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		bio_offset = 0;
		do {
			u64 limit = rbd_segment_length(rbd_dev, ofs, size);
			unsigned int chain_size;
			struct bio *bio_chain;

			BUG_ON(limit > (u64) UINT_MAX);
			chain_size = (unsigned int) limit;
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);

			/* Reference dropped when this segment completes */
			kref_get(&coll->kref);

			/* Pass a cloned bio chain via an osd request */

			bio_chain = bio_chain_clone_range(&bio,
						&bio_offset, chain_size,
						GFP_ATOMIC);
			if (bio_chain)
				(void) rbd_do_op(rq, rbd_dev, snapc,
						ofs, chain_size,
						bio_chain, coll, cur_seg);
			else
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, chain_size);
			size -= chain_size;
			ofs += chain_size;

			cur_seg++;
		} while (size > 0);
		/* Drop the initial reference from rbd_alloc_coll() */
		kref_put(&coll->kref, rbd_coll_release);

		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1664
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone_range()
 *
 * Returns the number of bytes of @bvec that may be merged into the bio
 * described by @bmd without crossing an object boundary.
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is to offset relative to the enclosing
	 * device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object.  Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for.  And if the bio
	 * was empty, let the whole thing through because:  "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
1710
1711static void rbd_free_disk(struct rbd_device *rbd_dev)
1712{
1713 struct gendisk *disk = rbd_dev->disk;
1714
1715 if (!disk)
1716 return;
1717
602adf40
YS
1718 if (disk->flags & GENHD_FL_UP)
1719 del_gendisk(disk);
1720 if (disk->queue)
1721 blk_cleanup_queue(disk->queue);
1722 put_disk(disk);
1723}
1724
/*
 * Read the complete header for the given rbd device.
 *
 * Returns a pointer to a dynamically-allocated buffer containing
 * the complete and validated header. Caller can pass the address
 * of a variable that will be filled in with the version of the
 * header object at the time it was read.
 *
 * Returns a pointer-coded errno if a failure occurs.
 */
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		/* Free the previous (too small) attempt, if any */
		kfree(ondisk);

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return ERR_PTR(-ENOMEM);

		ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
				       rbd_dev->header_name,
				       0, size,
				       (char *) ondisk, version);

		if (ret < 0)
			goto out_err;
		if (WARN_ON((size_t) ret < size)) {
			ret = -ENXIO;
			pr_warning("short header read for image %s"
					" (want %zd got %d)\n",
				rbd_dev->spec->image_name, size, ret);
			goto out_err;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			pr_warning("invalid header for image %s\n",
				rbd_dev->spec->image_name);
			goto out_err;
		}

		/* Retry if the snapshot count changed during the read */
		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	return ondisk;

out_err:
	kfree(ondisk);

	return ERR_PTR(ret);
}
1796
1797/*
1798 * reload the ondisk the header
1799 */
1800static int rbd_read_header(struct rbd_device *rbd_dev,
1801 struct rbd_image_header *header)
1802{
1803 struct rbd_image_header_ondisk *ondisk;
1804 u64 ver = 0;
1805 int ret;
602adf40 1806
4156d998
AE
1807 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1808 if (IS_ERR(ondisk))
1809 return PTR_ERR(ondisk);
1810 ret = rbd_header_from_disk(header, ondisk);
1811 if (ret >= 0)
1812 header->obj_version = ver;
1813 kfree(ondisk);
1814
1815 return ret;
602adf40
YS
1816}
1817
41f38c2b 1818static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
dfc5606d
YS
1819{
1820 struct rbd_snap *snap;
a0593290 1821 struct rbd_snap *next;
dfc5606d 1822
a0593290 1823 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
41f38c2b 1824 rbd_remove_snap_dev(snap);
dfc5606d
YS
1825}
1826
9478554a
AE
1827static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1828{
1829 sector_t size;
1830
0d7dbfce 1831 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
9478554a
AE
1832 return;
1833
1834 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1835 dout("setting size to %llu sectors", (unsigned long long) size);
1836 rbd_dev->mapping.size = (u64) size;
1837 set_capacity(rbd_dev->disk, size);
1838}
1839
602adf40
YS
1840/*
1841 * only read the first part of the ondisk header, without the snaps info
1842 */
117973fb 1843static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
602adf40
YS
1844{
1845 int ret;
1846 struct rbd_image_header h;
602adf40
YS
1847
1848 ret = rbd_read_header(rbd_dev, &h);
1849 if (ret < 0)
1850 return ret;
1851
a51aa0c0
JD
1852 down_write(&rbd_dev->header_rwsem);
1853
9478554a
AE
1854 /* Update image size, and check for resize of mapped image */
1855 rbd_dev->header.image_size = h.image_size;
1856 rbd_update_mapping_size(rbd_dev);
9db4b3e3 1857
849b4260 1858 /* rbd_dev->header.object_prefix shouldn't change */
602adf40 1859 kfree(rbd_dev->header.snap_sizes);
849b4260 1860 kfree(rbd_dev->header.snap_names);
d1d25646
JD
1861 /* osd requests may still refer to snapc */
1862 ceph_put_snap_context(rbd_dev->header.snapc);
602adf40 1863
b813623a
AE
1864 if (hver)
1865 *hver = h.obj_version;
a71b891b 1866 rbd_dev->header.obj_version = h.obj_version;
93a24e08 1867 rbd_dev->header.image_size = h.image_size;
602adf40
YS
1868 rbd_dev->header.snapc = h.snapc;
1869 rbd_dev->header.snap_names = h.snap_names;
1870 rbd_dev->header.snap_sizes = h.snap_sizes;
849b4260
AE
1871 /* Free the extra copy of the object prefix */
1872 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1873 kfree(h.object_prefix);
1874
304f6808
AE
1875 ret = rbd_dev_snaps_update(rbd_dev);
1876 if (!ret)
1877 ret = rbd_dev_snaps_register(rbd_dev);
dfc5606d 1878
c666601a 1879 up_write(&rbd_dev->header_rwsem);
602adf40 1880
dfc5606d 1881 return ret;
602adf40
YS
1882}
1883
117973fb 1884static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
1fe5e993
AE
1885{
1886 int ret;
1887
117973fb 1888 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1fe5e993 1889 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
117973fb
AE
1890 if (rbd_dev->image_format == 1)
1891 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1892 else
1893 ret = rbd_dev_v2_refresh(rbd_dev, hver);
1fe5e993
AE
1894 mutex_unlock(&ctl_mutex);
1895
1896 return ret;
1897}
1898
602adf40
YS
/*
 * Allocate and configure the gendisk and request queue for an rbd
 * device: I/O limits are sized to the rbd object size so requests
 * tend to map to whole objects.  Returns 0 or -ENOMEM.  The disk is
 * not yet added here (no add_disk()).
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* Keep bios within a single object (see rbd_merge_bvec) */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
1947
dfc5606d
YS
1948/*
1949 sysfs
1950*/
1951
593a9e7b
AE
/* Map an embedded struct device back to its owning rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1956
dfc5606d
YS
1957static ssize_t rbd_size_show(struct device *dev,
1958 struct device_attribute *attr, char *buf)
1959{
593a9e7b 1960 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
a51aa0c0
JD
1961 sector_t size;
1962
1963 down_read(&rbd_dev->header_rwsem);
1964 size = get_capacity(rbd_dev->disk);
1965 up_read(&rbd_dev->header_rwsem);
dfc5606d 1966
a51aa0c0 1967 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
dfc5606d
YS
1968}
1969
34b13184
AE
1970/*
1971 * Note this shows the features for whatever's mapped, which is not
1972 * necessarily the base image.
1973 */
1974static ssize_t rbd_features_show(struct device *dev,
1975 struct device_attribute *attr, char *buf)
1976{
1977 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1978
1979 return sprintf(buf, "0x%016llx\n",
1980 (unsigned long long) rbd_dev->mapping.features);
1981}
1982
dfc5606d
YS
1983static ssize_t rbd_major_show(struct device *dev,
1984 struct device_attribute *attr, char *buf)
1985{
593a9e7b 1986 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 1987
dfc5606d
YS
1988 return sprintf(buf, "%d\n", rbd_dev->major);
1989}
1990
1991static ssize_t rbd_client_id_show(struct device *dev,
1992 struct device_attribute *attr, char *buf)
602adf40 1993{
593a9e7b 1994 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1995
1dbb4399
AE
1996 return sprintf(buf, "client%lld\n",
1997 ceph_client_id(rbd_dev->rbd_client->client));
602adf40
YS
1998}
1999
dfc5606d
YS
2000static ssize_t rbd_pool_show(struct device *dev,
2001 struct device_attribute *attr, char *buf)
602adf40 2002{
593a9e7b 2003 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2004
0d7dbfce 2005 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
dfc5606d
YS
2006}
2007
9bb2f334
AE
2008static ssize_t rbd_pool_id_show(struct device *dev,
2009 struct device_attribute *attr, char *buf)
2010{
2011 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2012
0d7dbfce
AE
2013 return sprintf(buf, "%llu\n",
2014 (unsigned long long) rbd_dev->spec->pool_id);
9bb2f334
AE
2015}
2016
dfc5606d
YS
2017static ssize_t rbd_name_show(struct device *dev,
2018 struct device_attribute *attr, char *buf)
2019{
593a9e7b 2020 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2021
a92ffdf8
AE
2022 if (rbd_dev->spec->image_name)
2023 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2024
2025 return sprintf(buf, "(unknown)\n");
dfc5606d
YS
2026}
2027
589d30e0
AE
2028static ssize_t rbd_image_id_show(struct device *dev,
2029 struct device_attribute *attr, char *buf)
2030{
2031 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2032
0d7dbfce 2033 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
589d30e0
AE
2034}
2035
34b13184
AE
2036/*
2037 * Shows the name of the currently-mapped snapshot (or
2038 * RBD_SNAP_HEAD_NAME for the base image).
2039 */
dfc5606d
YS
2040static ssize_t rbd_snap_show(struct device *dev,
2041 struct device_attribute *attr,
2042 char *buf)
2043{
593a9e7b 2044 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2045
0d7dbfce 2046 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
dfc5606d
YS
2047}
2048
86b00e0d
AE
/*
 * For an rbd v2 image, shows the pool id, image id, and snapshot id
 * for the parent image.  If there is no parent, simply shows
 * "(no parent image)".
 *
 * Output is built incrementally; each sprintf() result is checked and
 * any negative value is returned as-is.
 */
static ssize_t rbd_parent_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	struct rbd_spec *spec = rbd_dev->parent_spec;
	int count;
	char *bufp = buf;

	if (!spec)
		return sprintf(buf, "(no parent image)\n");

	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
			(unsigned long long) spec->pool_id, spec->pool_name);
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
			spec->image_name ? spec->image_name : "(unknown)");
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
			(unsigned long long) spec->snap_id, spec->snap_name);
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
	if (count < 0)
		return count;
	bufp += count;

	return (ssize_t) (bufp - buf);
}
2091
dfc5606d
YS
2092static ssize_t rbd_image_refresh(struct device *dev,
2093 struct device_attribute *attr,
2094 const char *buf,
2095 size_t size)
2096{
593a9e7b 2097 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 2098 int ret;
602adf40 2099
117973fb 2100 ret = rbd_dev_refresh(rbd_dev, NULL);
b813623a
AE
2101
2102 return ret < 0 ? ret : size;
dfc5606d 2103}
602adf40 2104
/* Per-device sysfs attributes; all read-only except "refresh". */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
dfc5606d
YS
2116
2117static struct attribute *rbd_attrs[] = {
2118 &dev_attr_size.attr,
34b13184 2119 &dev_attr_features.attr,
dfc5606d
YS
2120 &dev_attr_major.attr,
2121 &dev_attr_client_id.attr,
2122 &dev_attr_pool.attr,
9bb2f334 2123 &dev_attr_pool_id.attr,
dfc5606d 2124 &dev_attr_name.attr,
589d30e0 2125 &dev_attr_image_id.attr,
dfc5606d 2126 &dev_attr_current_snap.attr,
86b00e0d 2127 &dev_attr_parent.attr,
dfc5606d 2128 &dev_attr_refresh.attr,
dfc5606d
YS
2129 NULL
2130};
2131
2132static struct attribute_group rbd_attr_group = {
2133 .attrs = rbd_attrs,
2134};
2135
2136static const struct attribute_group *rbd_attr_groups[] = {
2137 &rbd_attr_group,
2138 NULL
2139};
2140
2141static void rbd_sysfs_dev_release(struct device *dev)
2142{
2143}
2144
2145static struct device_type rbd_device_type = {
2146 .name = "rbd",
2147 .groups = rbd_attr_groups,
2148 .release = rbd_sysfs_dev_release,
2149};
2150
2151
2152/*
2153 sysfs - snapshots
2154*/
2155
2156static ssize_t rbd_snap_size_show(struct device *dev,
2157 struct device_attribute *attr,
2158 char *buf)
2159{
2160 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2161
3591538f 2162 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
dfc5606d
YS
2163}
2164
2165static ssize_t rbd_snap_id_show(struct device *dev,
2166 struct device_attribute *attr,
2167 char *buf)
2168{
2169 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2170
3591538f 2171 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
dfc5606d
YS
2172}
2173
34b13184
AE
2174static ssize_t rbd_snap_features_show(struct device *dev,
2175 struct device_attribute *attr,
2176 char *buf)
2177{
2178 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2179
2180 return sprintf(buf, "0x%016llx\n",
2181 (unsigned long long) snap->features);
2182}
2183
/* Per-snapshot sysfs attributes */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* Device release callback: frees the rbd_snap and its name copy */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2215
8b8fb99c
AE
2216static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2217{
2218 kref_get(&spec->kref);
2219
2220 return spec;
2221}
2222
2223static void rbd_spec_free(struct kref *kref);
2224static void rbd_spec_put(struct rbd_spec *spec)
2225{
2226 if (spec)
2227 kref_put(&spec->kref, rbd_spec_free);
2228}
2229
/*
 * Allocate a new, zero-filled rbd image spec with one reference held.
 * Returns NULL on allocation failure.
 */
static struct rbd_spec *rbd_spec_alloc(void)
{
	struct rbd_spec *spec;

	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
	if (!spec)
		return NULL;
	kref_init(&spec->kref);

	/* No-op get/put pair; keeps rbd_spec_get() referenced for now */
	rbd_spec_put(rbd_spec_get(spec));	/* TEMPORARY */

	return spec;
}
2243
2244static void rbd_spec_free(struct kref *kref)
2245{
2246 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2247
2248 kfree(spec->pool_name);
2249 kfree(spec->image_id);
2250 kfree(spec->image_name);
2251 kfree(spec->snap_name);
2252 kfree(spec);
2253}
2254
c53d5893
AE
2255struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2256 struct rbd_spec *spec)
2257{
2258 struct rbd_device *rbd_dev;
2259
2260 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2261 if (!rbd_dev)
2262 return NULL;
2263
2264 spin_lock_init(&rbd_dev->lock);
2265 INIT_LIST_HEAD(&rbd_dev->node);
2266 INIT_LIST_HEAD(&rbd_dev->snaps);
2267 init_rwsem(&rbd_dev->header_rwsem);
2268
2269 rbd_dev->spec = spec;
2270 rbd_dev->rbd_client = rbdc;
2271
2272 return rbd_dev;
2273}
2274
/*
 * Free an rbd device, dropping its references to the parent spec,
 * the ceph client, and its own image spec.
 */
static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	rbd_spec_put(rbd_dev->parent_spec);
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev);
}
2283
304f6808
AE
2284static bool rbd_snap_registered(struct rbd_snap *snap)
2285{
2286 bool ret = snap->dev.type == &rbd_snap_device_type;
2287 bool reg = device_is_registered(&snap->dev);
2288
2289 rbd_assert(!ret ^ reg);
2290
2291 return ret;
2292}
2293
/*
 * Take a snapshot off its rbd device's list and, if its device was
 * registered, unregister it (which frees the snap via the device
 * release callback).
 */
static void rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	if (device_is_registered(&snap->dev))
		device_unregister(&snap->dev);
}
2300
14e7085d 2301static int rbd_register_snap_dev(struct rbd_snap *snap,
dfc5606d
YS
2302 struct device *parent)
2303{
2304 struct device *dev = &snap->dev;
2305 int ret;
2306
2307 dev->type = &rbd_snap_device_type;
2308 dev->parent = parent;
2309 dev->release = rbd_snap_dev_release;
d4b125e9 2310 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
304f6808
AE
2311 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2312
dfc5606d
YS
2313 ret = device_register(dev);
2314
2315 return ret;
2316}
2317
/*
 * Allocate and fill in a new snapshot structure, duplicating the
 * given name.  The snapshot is not added to any list or registered
 * here.  Returns the new snapshot or ERR_PTR(-ENOMEM) on allocation
 * failure.  (rbd_dev is currently unused.)
 */
static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
						const char *snap_name,
						u64 snap_id, u64 snap_size,
						u64 snap_features)
{
	struct rbd_snap *snap;
	int ret;

	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
	if (!snap)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	snap->name = kstrdup(snap_name, GFP_KERNEL);
	if (!snap->name)
		goto err;

	snap->id = snap_id;
	snap->size = snap_size;
	snap->features = snap_features;

	return snap;

err:
	kfree(snap->name);
	kfree(snap);

	return ERR_PTR(ret);
}
2347
cd892126
AE
2348static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2349 u64 *snap_size, u64 *snap_features)
2350{
2351 char *snap_name;
2352
2353 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2354
2355 *snap_size = rbd_dev->header.snap_sizes[which];
2356 *snap_features = 0; /* No features for v1 */
2357
2358 /* Skip over names until we find the one we are looking for */
2359
2360 snap_name = rbd_dev->header.snap_names;
2361 while (which--)
2362 snap_name += strlen(snap_name) + 1;
2363
2364 return snap_name;
2365}
2366
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* Wire format of the "get_size" reply: order byte then size */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2400
/* Get the size and object order for the base image (CEPH_NOSNAP) */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2407
/*
 * Fetch the image's object prefix via the "get_object_prefix" class
 * method and record it in the header.  Returns 0 on success or a
 * negative errno.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed string; the header owns the copy */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2445
/*
 * Get the feature bits for an image snapshot, or for the base image
 * if snap_id is CEPH_NOSNAP.  Fails with -ENXIO if the image has
 * incompatible features this client does not implement.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	/* Wire format of the "get_features" reply */
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse images requiring features beyond RBD_FEATURES_ALL */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENXIO;

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2479
/* Get the feature bits for the base image (CEPH_NOSNAP) */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
2485
/*
 * Fetch layering information for a format 2 image: the parent's
 * pool id, image id, and snapshot id, plus the overlap with the
 * parent, recording them in the rbd device.  A pool id of
 * CEPH_NOPOOL in the reply means the image has no parent, which is
 * not an error.
 */
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	size_t size;
	void *reply_buf = NULL;
	__le64 snapid;
	void *p;
	void *end;
	char *image_id;
	u64 overlap;
	size_t len = 0;
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	/* Worst-case size of the "get_parent" reply */
	size = sizeof (__le64) +				/* pool_id */
		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
		sizeof (__le64) +				/* snap_id */
		sizeof (__le64);				/* overlap */
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf) {
		ret = -ENOMEM;
		goto out_err;
	}

	snapid = cpu_to_le64(CEPH_NOSNAP);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_parent",
				(char *) &snapid, sizeof (snapid),
				(char *) reply_buf, size,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out_err;

	ret = -ERANGE;
	p = reply_buf;
	end = (char *) reply_buf + size;
	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
	if (parent_spec->pool_id == CEPH_NOPOOL)
		goto out;	/* No parent?  No problem. */

	image_id = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_id)) {
		ret = PTR_ERR(image_id);
		goto out_err;
	}
	parent_spec->image_id = image_id;	/* spec owns the string now */
	parent_spec->image_id_len = len;
	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
	ceph_decode_64_safe(&p, end, overlap, out_err);

	rbd_dev->parent_overlap = overlap;
	rbd_dev->parent_spec = parent_spec;
	parent_spec = NULL;	/* rbd_dev now owns this */
out:
	ret = 0;
out_err:
	kfree(reply_buf);
	rbd_spec_put(parent_spec);

	return ret;
}
2551
/*
 * Look up an image's name given its id by asking the rbd directory
 * object (RBD_DIRECTORY).  Returns a newly-allocated name string
 * (caller must free), or NULL if the name could not be determined
 * (callers tolerate failure).
 */
static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	/* Encode the image id as a length-prefixed ceph string */
	image_id_size = sizeof (__le32) + rbd_dev->spec->image_id_len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = (char *) image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id,
				(u32) rbd_dev->spec->image_id_len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
				"rbd", "dir_get_name",
				image_id, image_id_size,
				(char *) reply_buf, size,
				CEPH_OSD_FLAG_READ, NULL);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = (char *) reply_buf + size;
	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;	/* NULL signals "name unknown" */
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}
2601
2602/*
2603 * When a parent image gets probed, we only have the pool, image,
2604 * and snapshot ids but not the names of any of them. This call
2605 * is made later to fill in those names. It has to be done after
2606 * rbd_dev_snaps_update() has completed because some of the
2607 * information (in particular, snapshot name) is not available
2608 * until then.
2609 */
2610static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2611{
2612 struct ceph_osd_client *osdc;
2613 const char *name;
2614 void *reply_buf = NULL;
2615 int ret;
2616
2617 if (rbd_dev->spec->pool_name)
2618 return 0; /* Already have the names */
2619
2620 /* Look up the pool name */
2621
2622 osdc = &rbd_dev->rbd_client->client->osdc;
2623 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
2624 if (!name)
2625 return -EIO; /* pool id too large (>= 2^31) */
2626
2627 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
2628 if (!rbd_dev->spec->pool_name)
2629 return -ENOMEM;
2630
2631 /* Fetch the image name; tolerate failure here */
2632
2633 name = rbd_dev_image_name(rbd_dev);
2634 if (name) {
2635 rbd_dev->spec->image_name_len = strlen(name);
2636 rbd_dev->spec->image_name = (char *) name;
2637 } else {
2638 pr_warning(RBD_DRV_NAME "%d "
2639 "unable to get image name for image id %s\n",
2640 rbd_dev->major, rbd_dev->spec->image_id);
2641 }
2642
2643 /* Look up the snapshot name. */
2644
2645 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
2646 if (!name) {
2647 ret = -EIO;
2648 goto out_err;
2649 }
2650 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
2651 if(!rbd_dev->spec->snap_name)
2652 goto out_err;
2653
2654 return 0;
2655out_err:
2656 kfree(reply_buf);
2657 kfree(rbd_dev->spec->pool_name);
2658 rbd_dev->spec->pool_name = NULL;
2659
2660 return ret;
2661}
2662
6e14b1a6 2663static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
35d489f9
AE
2664{
2665 size_t size;
2666 int ret;
2667 void *reply_buf;
2668 void *p;
2669 void *end;
2670 u64 seq;
2671 u32 snap_count;
2672 struct ceph_snap_context *snapc;
2673 u32 i;
2674
2675 /*
2676 * We'll need room for the seq value (maximum snapshot id),
2677 * snapshot count, and array of that many snapshot ids.
2678 * For now we have a fixed upper limit on the number we're
2679 * prepared to receive.
2680 */
2681 size = sizeof (__le64) + sizeof (__le32) +
2682 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2683 reply_buf = kzalloc(size, GFP_KERNEL);
2684 if (!reply_buf)
2685 return -ENOMEM;
2686
2687 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2688 "rbd", "get_snapcontext",
2689 NULL, 0,
2690 reply_buf, size,
6e14b1a6 2691 CEPH_OSD_FLAG_READ, ver);
35d489f9
AE
2692 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2693 if (ret < 0)
2694 goto out;
2695
2696 ret = -ERANGE;
2697 p = reply_buf;
2698 end = (char *) reply_buf + size;
2699 ceph_decode_64_safe(&p, end, seq, out);
2700 ceph_decode_32_safe(&p, end, snap_count, out);
2701
2702 /*
2703 * Make sure the reported number of snapshot ids wouldn't go
2704 * beyond the end of our buffer. But before checking that,
2705 * make sure the computed size of the snapshot context we
2706 * allocate is representable in a size_t.
2707 */
2708 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2709 / sizeof (u64)) {
2710 ret = -EINVAL;
2711 goto out;
2712 }
2713 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2714 goto out;
2715
2716 size = sizeof (struct ceph_snap_context) +
2717 snap_count * sizeof (snapc->snaps[0]);
2718 snapc = kmalloc(size, GFP_KERNEL);
2719 if (!snapc) {
2720 ret = -ENOMEM;
2721 goto out;
2722 }
2723
2724 atomic_set(&snapc->nref, 1);
2725 snapc->seq = seq;
2726 snapc->num_snaps = snap_count;
2727 for (i = 0; i < snap_count; i++)
2728 snapc->snaps[i] = ceph_decode_64(&p);
2729
2730 rbd_dev->header.snapc = snapc;
2731
2732 dout(" snap context seq = %llu, snap_count = %u\n",
2733 (unsigned long long) seq, (unsigned int) snap_count);
2734
2735out:
2736 kfree(reply_buf);
2737
2738 return 0;
2739}
2740
/*
 * Fetch the name of the snapshot at the given position in a format 2
 * image's snapshot context.  Returns a newly-allocated name string
 * (caller must free) or an ERR_PTR() on failure.
 */
static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	size_t size;
	void *reply_buf;
	__le64 snap_id;
	int ret;
	void *p;
	void *end;
	char *snap_name;

	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapshot_name",
				(char *) &snap_id, sizeof (snap_id),
				reply_buf, size,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	end = (char *) reply_buf + size;
	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out;
	} else {
		dout(" snap_id 0x%016llx snap_name = %s\n",
			(unsigned long long) le64_to_cpu(snap_id), snap_name);
	}
	kfree(reply_buf);

	return snap_name;
out:
	kfree(reply_buf);

	return ERR_PTR(ret);
}
2784
2785static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2786 u64 *snap_size, u64 *snap_features)
2787{
2788 __le64 snap_id;
2789 u8 order;
2790 int ret;
2791
2792 snap_id = rbd_dev->header.snapc->snaps[which];
2793 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2794 if (ret)
2795 return ERR_PTR(ret);
2796 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2797 if (ret)
2798 return ERR_PTR(ret);
2799
2800 return rbd_dev_v2_snap_name(rbd_dev, which);
2801}
2802
2803static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2804 u64 *snap_size, u64 *snap_features)
2805{
2806 if (rbd_dev->image_format == 1)
2807 return rbd_dev_v1_snap_info(rbd_dev, which,
2808 snap_size, snap_features);
2809 if (rbd_dev->image_format == 2)
2810 return rbd_dev_v2_snap_info(rbd_dev, which,
2811 snap_size, snap_features);
2812 return ERR_PTR(-EINVAL);
2813}
2814
117973fb
AE
2815static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2816{
2817 int ret;
2818 __u8 obj_order;
2819
2820 down_write(&rbd_dev->header_rwsem);
2821
2822 /* Grab old order first, to see if it changes */
2823
2824 obj_order = rbd_dev->header.obj_order,
2825 ret = rbd_dev_v2_image_size(rbd_dev);
2826 if (ret)
2827 goto out;
2828 if (rbd_dev->header.obj_order != obj_order) {
2829 ret = -EIO;
2830 goto out;
2831 }
2832 rbd_update_mapping_size(rbd_dev);
2833
2834 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2835 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2836 if (ret)
2837 goto out;
2838 ret = rbd_dev_snaps_update(rbd_dev);
2839 dout("rbd_dev_snaps_update returned %d\n", ret);
2840 if (ret)
2841 goto out;
2842 ret = rbd_dev_snaps_register(rbd_dev);
2843 dout("rbd_dev_snaps_register returned %d\n", ret);
2844out:
2845 up_write(&rbd_dev->header_rwsem);
2846
2847 return ret;
2848}
2849
dfc5606d 2850/*
35938150
AE
2851 * Scan the rbd device's current snapshot list and compare it to the
2852 * newly-received snapshot context. Remove any existing snapshots
2853 * not present in the new snapshot context. Add a new snapshot for
2854 * any snaphots in the snapshot context not in the current list.
2855 * And verify there are no changes to snapshots we already know
2856 * about.
2857 *
2858 * Assumes the snapshots in the snapshot context are sorted by
2859 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2860 * are also maintained in that order.)
dfc5606d 2861 */
304f6808 2862static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
dfc5606d 2863{
35938150
AE
2864 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2865 const u32 snap_count = snapc->num_snaps;
35938150
AE
2866 struct list_head *head = &rbd_dev->snaps;
2867 struct list_head *links = head->next;
2868 u32 index = 0;
dfc5606d 2869
9fcbb800 2870 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
35938150
AE
2871 while (index < snap_count || links != head) {
2872 u64 snap_id;
2873 struct rbd_snap *snap;
cd892126
AE
2874 char *snap_name;
2875 u64 snap_size = 0;
2876 u64 snap_features = 0;
dfc5606d 2877
35938150
AE
2878 snap_id = index < snap_count ? snapc->snaps[index]
2879 : CEPH_NOSNAP;
2880 snap = links != head ? list_entry(links, struct rbd_snap, node)
2881 : NULL;
aafb230e 2882 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
dfc5606d 2883
35938150
AE
2884 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2885 struct list_head *next = links->next;
dfc5606d 2886
35938150 2887 /* Existing snapshot not in the new snap context */
dfc5606d 2888
0d7dbfce 2889 if (rbd_dev->spec->snap_id == snap->id)
daba5fdb 2890 rbd_dev->exists = false;
41f38c2b 2891 rbd_remove_snap_dev(snap);
9fcbb800 2892 dout("%ssnap id %llu has been removed\n",
0d7dbfce
AE
2893 rbd_dev->spec->snap_id == snap->id ?
2894 "mapped " : "",
9fcbb800 2895 (unsigned long long) snap->id);
35938150
AE
2896
2897 /* Done with this list entry; advance */
2898
2899 links = next;
dfc5606d
YS
2900 continue;
2901 }
35938150 2902
b8b1e2db
AE
2903 snap_name = rbd_dev_snap_info(rbd_dev, index,
2904 &snap_size, &snap_features);
cd892126
AE
2905 if (IS_ERR(snap_name))
2906 return PTR_ERR(snap_name);
2907
9fcbb800
AE
2908 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2909 (unsigned long long) snap_id);
35938150
AE
2910 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2911 struct rbd_snap *new_snap;
2912
2913 /* We haven't seen this snapshot before */
2914
c8d18425 2915 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
cd892126 2916 snap_id, snap_size, snap_features);
9fcbb800
AE
2917 if (IS_ERR(new_snap)) {
2918 int err = PTR_ERR(new_snap);
2919
2920 dout(" failed to add dev, error %d\n", err);
2921
2922 return err;
2923 }
35938150
AE
2924
2925 /* New goes before existing, or at end of list */
2926
9fcbb800 2927 dout(" added dev%s\n", snap ? "" : " at end\n");
35938150
AE
2928 if (snap)
2929 list_add_tail(&new_snap->node, &snap->node);
2930 else
523f3258 2931 list_add_tail(&new_snap->node, head);
35938150
AE
2932 } else {
2933 /* Already have this one */
2934
9fcbb800
AE
2935 dout(" already present\n");
2936
cd892126 2937 rbd_assert(snap->size == snap_size);
aafb230e 2938 rbd_assert(!strcmp(snap->name, snap_name));
cd892126 2939 rbd_assert(snap->features == snap_features);
35938150
AE
2940
2941 /* Done with this list entry; advance */
2942
2943 links = links->next;
dfc5606d 2944 }
35938150
AE
2945
2946 /* Advance to the next entry in the snapshot context */
2947
2948 index++;
dfc5606d 2949 }
9fcbb800 2950 dout("%s: done\n", __func__);
dfc5606d
YS
2951
2952 return 0;
2953}
2954
/*
 * Scan the list of snapshots and register the devices for any that
 * have not already been registered.  The rbd device itself must
 * already be registered.
 */
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
{
	struct rbd_snap *snap;
	int ret = 0;

	dout("%s called\n", __func__);
	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
		return -EIO;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!rbd_snap_registered(snap)) {
			ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
			if (ret < 0)
				break;	/* stop at the first failure */
		}
	}
	dout("%s: returning %d\n", __func__, ret);

	return ret;
}
2979
/*
 * Register the rbd device on the rbd bus, creating its sysfs entry
 * (named after its device id).  The ctl_mutex is taken with
 * SINGLE_DEPTH_NESTING, a lockdep nesting annotation.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	struct device *dev;
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	dev = &rbd_dev->dev;
	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);

	mutex_unlock(&ctl_mutex);

	return ret;
}
2999
/* Tear down the rbd device's sysfs presence */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
3004
/*
 * Set up the watch on the image's header object.  An -ERANGE result
 * means our cached header version is out of date; refresh the header
 * and retry until the watch is established or a different error
 * occurs.
 */
static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
{
	int ret, rc;

	do {
		ret = rbd_req_sync_watch(rbd_dev);
		if (ret == -ERANGE) {
			rc = rbd_dev_refresh(rbd_dev, NULL);
			if (rc < 0)
				return rc;
		}
	} while (ret == -ERANGE);

	return ret;
}
3020
/* Highest device id handed out so far; ids start at 1 */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
b7f23c36 3037
/*
 * Remove an rbd_dev from the global list, and record that its
 * identifier is no longer in use.
 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	int rbd_id = rbd_dev->dev_id;
	int max_id;

	rbd_assert(rbd_id > 0);

	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		struct rbd_device *rbd_dev;	/* shadows the parameter */

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id > max_id)
			max_id = rbd_dev->dev_id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_dev_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case.
	 */
	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
	dout(" max dev id has been reset\n");
}
3088
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* Find start of token */

	return strcspn(*buf, spaces);	/* Return token length */
}
3107
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
3137
/*
 * Finds the next token in *buf, dynamically allocates a buffer big
 * enough to hold a copy of it, and copies the token into the new
 * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
 * that a duplicate buffer is created even for a zero-length token.
 *
 * Returns a pointer to the newly-allocated duplicate, or a null
 * pointer if memory for the duplicate was not available.  If
 * the lenp argument is a non-null pointer, the length of the token
 * (not including the '\0') is returned in *lenp.
 *
 * If successful, the *buf pointer will be updated to point beyond
 * the end of the found token.
 *
 * Note: uses GFP_KERNEL for allocation.
 */
static inline char *dup_token(const char **buf, size_t *lenp)
{
	char *dup;
	size_t len;

	len = next_token(buf);
	dup = kmalloc(len + 1, GFP_KERNEL);
	if (!dup)
		return NULL;

	memcpy(dup, *buf, len);
	*(dup + len) = '\0';	/* NUL-terminate the copy */
	*buf += len;

	if (lenp)
		*lenp = len;

	return dup;
}
3173
/*
 * Parse the options provided for an "rbd add" (i.e., rbd image
 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
 * and the data written is passed here via a NUL-terminated buffer.
 * Returns 0 if successful or an error code otherwise.
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *      The address of a pointer that will refer to a ceph options
 *      structure.  Caller must release the returned pointer using
 *      ceph_destroy_options() when it is no longer needed.
 *  rbd_opts
 *	Address of an rbd options pointer.  Fully initialized by
 *	this function; caller must release with kfree().
 *  spec
 *	Address of an rbd image specification pointer.  Fully
 *	initialized by this function based on parsed options.
 *	Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_id>
 *      An optional snapshot id.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot id is
 *      provided.  Snapshot mappings are always read-only.
 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len)
		return -EINVAL;	/* Missing monitor address(es) */
	/*
	 * The monitor address list is not duplicated; mon_addrs points
	 * into the caller's buffer, and mon_addrs_size includes the
	 * delimiter so ceph_parse_options() sees the full span.
	 */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	/* Default error for the empty-token checks below */
	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options)
		goto out_err;	/* Missing options */

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name)
		goto out_err;	/* Missing pool name */

	spec->image_name = dup_token(&buf, &spec->image_name_len);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name)
		goto out_err;	/* Missing image name */

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	spec->snap_name = kmalloc(len + 1, GFP_KERNEL);
	if (!spec->snap_name)
		goto out_mem;
	memcpy(spec->snap_name, buf, len);
	*(spec->snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	/*
	 * ceph_parse_options() consumes the options string and also
	 * invokes parse_rbd_opts_token for rbd-specific options,
	 * filling in rbd_opts as a side effect.
	 */
	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	/* Success: transfer ownership of all three results to caller */
	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
3310
589d30e0
AE
/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	/* sizeof (RBD_ID_PREFIX) accounts for the trailing '\0' */
	size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len;
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Call the "get_id" method of class "rbd" on the id object */
	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed id string from the response */
	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						&rbd_dev->spec->image_id_len,
						GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		/* Keep image_id NULL on failure, per the contract above */
		rbd_dev->spec->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
3388
a30b71b9
AE
/*
 * Probe the given device as a format 1 (original) rbd image:
 * record an empty image id (v1 images have none), derive the
 * header object name from the image name, and read the on-disk
 * header into rbd_dev->header.  Returns 0 on success or a
 * negative errno; on failure the fields set here are reset.
 */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->spec->image_id)
		return -ENOMEM;
	rbd_dev->spec->image_id_len = 0;

	/* Record the header object name for this rbd image. */

	/* sizeof (RBD_SUFFIX) accounts for the trailing '\0' */
	size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s",
		rbd_dev->spec->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;

	/* Version 1 images have no parent (no layering) */

	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;

	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	return ret;
}
3438
/*
 * Probe the given device as a format 2 rbd image.  The image id
 * must already have been filled in by the caller (rbd_dev_image_id()).
 * Derives the header object name from the image id, then gathers the
 * image size/order, object prefix, features, parent info (if the
 * image supports layering), and snapshot context plus header version.
 * Returns 0 on success or a negative errno; on failure everything
 * allocated here is released again.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	/* sizeof (RBD_HEADER_PREFIX) accounts for the trailing '\0' */
	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len;
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	/* Undo, in reverse order, everything set up above */
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3511
83a06263
AE
/*
 * Complete device setup after a successful format probe: update
 * snapshot data, fill in the device mapping, allocate a device id,
 * register the block device major, set up the disk, add the device
 * to the rbd bus, register snapshots, start the watch request, and
 * finally announce the disk.  Returns 0 on success or a negative
 * errno; on failure everything acquired here is undone.
 */
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	/* register_blkdev(0, ...) dynamically allocates a major */
	ret = register_blkdev(0, rbd_dev->name);
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	ret = rbd_init_watch_dev(rbd_dev);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
3593
a30b71b9
AE
3594/*
3595 * Probe for the existence of the header object for the given rbd
3596 * device. For format 2 images this includes determining the image
3597 * id.
3598 */
3599static int rbd_dev_probe(struct rbd_device *rbd_dev)
3600{
3601 int ret;
3602
3603 /*
3604 * Get the id from the image id object. If it's not a
3605 * format 2 image, we'll get ENOENT back, and we'll assume
3606 * it's a format 1 image.
3607 */
3608 ret = rbd_dev_image_id(rbd_dev);
3609 if (ret)
3610 ret = rbd_dev_v1_probe(rbd_dev);
3611 else
3612 ret = rbd_dev_v2_probe(rbd_dev);
83a06263 3613 if (ret) {
a30b71b9
AE
3614 dout("probe failed, returning %d\n", ret);
3615
83a06263
AE
3616 return ret;
3617 }
3618
3619 ret = rbd_dev_probe_finish(rbd_dev);
3620 if (ret)
3621 rbd_header_free(&rbd_dev->header);
3622
a30b71b9
AE
3623 return ret;
3624}
3625
59c2be1e
YS
/*
 * Handler for writes to /sys/bus/rbd/add: parse the user-supplied
 * mapping specification, get (or share) a ceph client, look up the
 * pool id, create the rbd_device and probe/activate it.  Returns
 * count on success or a negative errno.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	/* Hold a module reference for the life of the mapped device */
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	/*
	 * Ownership of ceph_opts passes to the client; NULL the local
	 * so the error path below doesn't destroy it a second time.
	 */
	ceph_opts = NULL;	/* rbd_dev client now owns this */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64) rc;

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		goto err_out_client;
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	rbd_dev->mapping.read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_rbd_dev;

	return count;
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	/* rbd_put_client() handles a NULL rbdc */
	rbd_put_client(rbdc);
err_out_args:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t) rc;
}
3691
de71a297 3692static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
602adf40
YS
3693{
3694 struct list_head *tmp;
3695 struct rbd_device *rbd_dev;
3696
e124a82f 3697 spin_lock(&rbd_dev_list_lock);
602adf40
YS
3698 list_for_each(tmp, &rbd_dev_list) {
3699 rbd_dev = list_entry(tmp, struct rbd_device, node);
de71a297 3700 if (rbd_dev->dev_id == dev_id) {
e124a82f 3701 spin_unlock(&rbd_dev_list_lock);
602adf40 3702 return rbd_dev;
e124a82f 3703 }
602adf40 3704 }
e124a82f 3705 spin_unlock(&rbd_dev_list_lock);
602adf40
YS
3706 return NULL;
3707}
3708
/*
 * Release callback for an rbd device (standard struct device
 * release signature): tear down the watch request and event,
 * free the disk and block device registration, release the header
 * data and device id, destroy the rbd_dev, and drop the module
 * reference taken in rbd_add().
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
3738
dfc5606d
YS
/*
 * Handler for writes to /sys/bus/rbd/remove: parse the target
 * device id, refuse removal (-EBUSY) while the device is open,
 * otherwise remove its snapshots and delete the device from the
 * bus.  Returns count on success or a negative errno.
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	/* A mapped device that is still open cannot be removed */
	if (rbd_dev->open_count) {
		ret = -EBUSY;
		goto done;
	}

	rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);

	return ret;
}
3778
602adf40
YS
3779/*
3780 * create control files in sysfs
dfc5606d 3781 * /sys/bus/rbd/...
602adf40
YS
3782 */
3783static int rbd_sysfs_init(void)
3784{
dfc5606d 3785 int ret;
602adf40 3786
fed4c143 3787 ret = device_register(&rbd_root_dev);
21079786 3788 if (ret < 0)
dfc5606d 3789 return ret;
602adf40 3790
fed4c143
AE
3791 ret = bus_register(&rbd_bus_type);
3792 if (ret < 0)
3793 device_unregister(&rbd_root_dev);
602adf40 3794
602adf40
YS
3795 return ret;
3796}
3797
/*
 * Remove the rbd sysfs control files, unregistering the bus and the
 * root device in the reverse of the order rbd_sysfs_init() set them up.
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3803
3804int __init rbd_init(void)
3805{
3806 int rc;
3807
3808 rc = rbd_sysfs_init();
3809 if (rc)
3810 return rc;
f0f8cef5 3811 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
602adf40
YS
3812 return 0;
3813}
3814
/*
 * Module teardown: remove the sysfs interface created by rbd_init().
 */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3819
3820module_init(rbd_init);
3821module_exit(rbd_exit);
3822
3823MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
3824MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
3825MODULE_DESCRIPTION("rados block device");
3826
3827/* following authorship retained from original osdblk.c */
3828MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
3829
3830MODULE_LICENSE("GPL");