/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

                 Documentation/ABI/testing/sysfs-bus-rbd

 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
59c2be1e 34#include <linux/parser.h>
602adf40
YS
35
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* It might be useful to have this defined elsewhere too */

#define U64_MAX	((u64) (~0ULL))

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN \
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
#define RBD_MAX_OPT_LEN		1024

#define RBD_SNAP_HEAD_NAME	"-"

#define RBD_IMAGE_ID_LEN_MAX	64
#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING	1

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_ALL	(0)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

#define RBD_READ_ONLY_DEFAULT	false
602adf40
YS
95/*
96 * block device image metadata (in-memory version)
97 */
98struct rbd_image_header {
f84344f3 99 /* These four fields never change for a given rbd image */
849b4260 100 char *object_prefix;
34b13184 101 u64 features;
602adf40
YS
102 __u8 obj_order;
103 __u8 crypt_type;
104 __u8 comp_type;
602adf40 105
f84344f3
AE
106 /* The remaining fields need to be updated occasionally */
107 u64 image_size;
108 struct ceph_snap_context *snapc;
602adf40
YS
109 char *snap_names;
110 u64 *snap_sizes;
59c2be1e
YS
111
112 u64 obj_version;
113};
114
0d7dbfce
AE
115/*
116 * An rbd image specification.
117 *
118 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
119 * identify an image.
120 */
121struct rbd_spec {
122 u64 pool_id;
123 char *pool_name;
124
125 char *image_id;
126 size_t image_id_len;
127 char *image_name;
128 size_t image_name_len;
129
130 u64 snap_id;
131 char *snap_name;
132
133 struct kref kref;
134};
135
59c2be1e 136struct rbd_options {
cc0538b6 137 bool read_only;
602adf40
YS
138};
139
140/*
f0f8cef5 141 * an instance of the client. multiple devices may share an rbd client.
602adf40
YS
142 */
143struct rbd_client {
144 struct ceph_client *client;
145 struct kref kref;
146 struct list_head node;
147};
148
149/*
f0f8cef5 150 * a request completion status
602adf40 151 */
1fec7093
YS
152struct rbd_req_status {
153 int done;
154 int rc;
155 u64 bytes;
156};
157
158/*
159 * a collection of requests
160 */
161struct rbd_req_coll {
162 int total;
163 int num_done;
164 struct kref kref;
165 struct rbd_req_status status[0];
602adf40
YS
166};
167
f0f8cef5
AE
168/*
169 * a single io request
170 */
171struct rbd_request {
172 struct request *rq; /* blk layer request */
173 struct bio *bio; /* cloned bio */
174 struct page **pages; /* list of used pages */
175 u64 len;
176 int coll_index;
177 struct rbd_req_coll *coll;
178};
179
dfc5606d
YS
180struct rbd_snap {
181 struct device dev;
182 const char *name;
3591538f 183 u64 size;
dfc5606d
YS
184 struct list_head node;
185 u64 id;
34b13184 186 u64 features;
dfc5606d
YS
187};
188
f84344f3 189struct rbd_mapping {
99c1f08f 190 u64 size;
34b13184 191 u64 features;
f84344f3
AE
192 bool read_only;
193};
194
602adf40
YS
195/*
196 * a single device
197 */
198struct rbd_device {
de71a297 199 int dev_id; /* blkdev unique id */
602adf40
YS
200
201 int major; /* blkdev assigned major */
202 struct gendisk *disk; /* blkdev's gendisk and rq */
602adf40 203
a30b71b9 204 u32 image_format; /* Either 1 or 2 */
602adf40
YS
205 struct rbd_client *rbd_client;
206
207 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
208
209 spinlock_t lock; /* queue lock */
210
211 struct rbd_image_header header;
daba5fdb 212 bool exists;
0d7dbfce 213 struct rbd_spec *spec;
602adf40 214
0d7dbfce 215 char *header_name;
971f839a 216
59c2be1e
YS
217 struct ceph_osd_event *watch_event;
218 struct ceph_osd_request *watch_request;
219
c666601a
JD
220 /* protects updating the header */
221 struct rw_semaphore header_rwsem;
f84344f3
AE
222
223 struct rbd_mapping mapping;
602adf40
YS
224
225 struct list_head node;
dfc5606d
YS
226
227 /* list of snapshots */
228 struct list_head snaps;
229
230 /* sysfs related */
231 struct device dev;
232};
233
602adf40 234static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
e124a82f 235
602adf40 236static LIST_HEAD(rbd_dev_list); /* devices */
e124a82f
AE
237static DEFINE_SPINLOCK(rbd_dev_list_lock);
238
432b8587
AE
239static LIST_HEAD(rbd_client_list); /* clients */
240static DEFINE_SPINLOCK(rbd_client_list_lock);
602adf40 241
304f6808
AE
242static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
243static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
244
dfc5606d 245static void rbd_dev_release(struct device *dev);
41f38c2b 246static void rbd_remove_snap_dev(struct rbd_snap *snap);
dfc5606d 247
f0f8cef5
AE
248static ssize_t rbd_add(struct bus_type *bus, const char *buf,
249 size_t count);
250static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
251 size_t count);
252
253static struct bus_attribute rbd_bus_attrs[] = {
254 __ATTR(add, S_IWUSR, NULL, rbd_add),
255 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
256 __ATTR_NULL
257};
258
259static struct bus_type rbd_bus_type = {
260 .name = "rbd",
261 .bus_attrs = rbd_bus_attrs,
262};
263
264static void rbd_root_dev_release(struct device *dev)
265{
266}
267
268static struct device rbd_root_dev = {
269 .init_name = "rbd",
270 .release = rbd_root_dev_release,
271};
272
#ifdef RBD_DEBUG
/*
 * rbd_assert() - BUG if @expr is false.
 *
 * Wrapped in do { } while (0) so the macro expands to a single
 * statement; the previous bare "if" form suffered from the classic
 * dangling-else hazard ("if (x) rbd_assert(y); else ..." would bind
 * the else to the macro's internal if) and required callers to omit
 * the trailing semicolon in some contexts.
 */
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
					"at line %d:\n\n"		\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
dfc5606d 285
dfc5606d
YS
286static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
287{
288 return get_device(&rbd_dev->dev);
289}
290
291static void rbd_put_dev(struct rbd_device *rbd_dev)
292{
293 put_device(&rbd_dev->dev);
294}
602adf40 295
117973fb
AE
296static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
297static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
59c2be1e 298
602adf40
YS
299static int rbd_open(struct block_device *bdev, fmode_t mode)
300{
f0f8cef5 301 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
602adf40 302
f84344f3 303 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
602adf40
YS
304 return -EROFS;
305
340c7a2b 306 rbd_get_dev(rbd_dev);
f84344f3 307 set_device_ro(bdev, rbd_dev->mapping.read_only);
340c7a2b 308
602adf40
YS
309 return 0;
310}
311
dfc5606d
YS
312static int rbd_release(struct gendisk *disk, fmode_t mode)
313{
314 struct rbd_device *rbd_dev = disk->private_data;
315
316 rbd_put_dev(rbd_dev);
317
318 return 0;
319}
320
602adf40
YS
321static const struct block_device_operations rbd_bd_ops = {
322 .owner = THIS_MODULE,
323 .open = rbd_open,
dfc5606d 324 .release = rbd_release,
602adf40
YS
325};
326
327/*
328 * Initialize an rbd client instance.
43ae4701 329 * We own *ceph_opts.
602adf40 330 */
f8c38929 331static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
602adf40
YS
332{
333 struct rbd_client *rbdc;
334 int ret = -ENOMEM;
335
336 dout("rbd_client_create\n");
337 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
338 if (!rbdc)
339 goto out_opt;
340
341 kref_init(&rbdc->kref);
342 INIT_LIST_HEAD(&rbdc->node);
343
bc534d86
AE
344 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
345
43ae4701 346 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
602adf40 347 if (IS_ERR(rbdc->client))
bc534d86 348 goto out_mutex;
43ae4701 349 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
602adf40
YS
350
351 ret = ceph_open_session(rbdc->client);
352 if (ret < 0)
353 goto out_err;
354
432b8587 355 spin_lock(&rbd_client_list_lock);
602adf40 356 list_add_tail(&rbdc->node, &rbd_client_list);
432b8587 357 spin_unlock(&rbd_client_list_lock);
602adf40 358
bc534d86
AE
359 mutex_unlock(&ctl_mutex);
360
602adf40
YS
361 dout("rbd_client_create created %p\n", rbdc);
362 return rbdc;
363
364out_err:
365 ceph_destroy_client(rbdc->client);
bc534d86
AE
366out_mutex:
367 mutex_unlock(&ctl_mutex);
602adf40
YS
368 kfree(rbdc);
369out_opt:
43ae4701
AE
370 if (ceph_opts)
371 ceph_destroy_options(ceph_opts);
28f259b7 372 return ERR_PTR(ret);
602adf40
YS
373}
374
375/*
1f7ba331
AE
376 * Find a ceph client with specific addr and configuration. If
377 * found, bump its reference count.
602adf40 378 */
1f7ba331 379static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
602adf40
YS
380{
381 struct rbd_client *client_node;
1f7ba331 382 bool found = false;
602adf40 383
43ae4701 384 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
602adf40
YS
385 return NULL;
386
1f7ba331
AE
387 spin_lock(&rbd_client_list_lock);
388 list_for_each_entry(client_node, &rbd_client_list, node) {
389 if (!ceph_compare_options(ceph_opts, client_node->client)) {
390 kref_get(&client_node->kref);
391 found = true;
392 break;
393 }
394 }
395 spin_unlock(&rbd_client_list_lock);
396
397 return found ? client_node : NULL;
602adf40
YS
398}
399
59c2be1e
YS
400/*
401 * mount options
402 */
403enum {
59c2be1e
YS
404 Opt_last_int,
405 /* int args above */
406 Opt_last_string,
407 /* string args above */
cc0538b6
AE
408 Opt_read_only,
409 Opt_read_write,
410 /* Boolean args above */
411 Opt_last_bool,
59c2be1e
YS
412};
413
43ae4701 414static match_table_t rbd_opts_tokens = {
59c2be1e
YS
415 /* int args above */
416 /* string args above */
be466c1c 417 {Opt_read_only, "read_only"},
cc0538b6
AE
418 {Opt_read_only, "ro"}, /* Alternate spelling */
419 {Opt_read_write, "read_write"},
420 {Opt_read_write, "rw"}, /* Alternate spelling */
421 /* Boolean args above */
59c2be1e
YS
422 {-1, NULL}
423};
424
425static int parse_rbd_opts_token(char *c, void *private)
426{
43ae4701 427 struct rbd_options *rbd_opts = private;
59c2be1e
YS
428 substring_t argstr[MAX_OPT_ARGS];
429 int token, intval, ret;
430
43ae4701 431 token = match_token(c, rbd_opts_tokens, argstr);
59c2be1e
YS
432 if (token < 0)
433 return -EINVAL;
434
435 if (token < Opt_last_int) {
436 ret = match_int(&argstr[0], &intval);
437 if (ret < 0) {
438 pr_err("bad mount option arg (not int) "
439 "at '%s'\n", c);
440 return ret;
441 }
442 dout("got int token %d val %d\n", token, intval);
443 } else if (token > Opt_last_int && token < Opt_last_string) {
444 dout("got string token %d val %s\n", token,
445 argstr[0].from);
cc0538b6
AE
446 } else if (token > Opt_last_string && token < Opt_last_bool) {
447 dout("got Boolean token %d\n", token);
59c2be1e
YS
448 } else {
449 dout("got token %d\n", token);
450 }
451
452 switch (token) {
cc0538b6
AE
453 case Opt_read_only:
454 rbd_opts->read_only = true;
455 break;
456 case Opt_read_write:
457 rbd_opts->read_only = false;
458 break;
59c2be1e 459 default:
aafb230e
AE
460 rbd_assert(false);
461 break;
59c2be1e
YS
462 }
463 return 0;
464}
465
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Consumes @ceph_opts either way.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		/* Using an existing client; the options are not needed */
		ceph_destroy_options(ceph_opts);
		return rbdc;
	}

	return rbd_client_create(ceph_opts);
}
482
483/*
484 * Destroy ceph client
d23a4b3f 485 *
432b8587 486 * Caller must hold rbd_client_list_lock.
602adf40
YS
487 */
488static void rbd_client_release(struct kref *kref)
489{
490 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
491
492 dout("rbd_release_client %p\n", rbdc);
cd9d9f5d 493 spin_lock(&rbd_client_list_lock);
602adf40 494 list_del(&rbdc->node);
cd9d9f5d 495 spin_unlock(&rbd_client_list_lock);
602adf40
YS
496
497 ceph_destroy_client(rbdc->client);
498 kfree(rbdc);
499}
500
501/*
502 * Drop reference to ceph client node. If it's not referenced anymore, release
503 * it.
504 */
9d3997fd 505static void rbd_put_client(struct rbd_client *rbdc)
602adf40 506{
c53d5893
AE
507 if (rbdc)
508 kref_put(&rbdc->kref, rbd_client_release);
602adf40
YS
509}
510
1fec7093
YS
511/*
512 * Destroy requests collection
513 */
514static void rbd_coll_release(struct kref *kref)
515{
516 struct rbd_req_coll *coll =
517 container_of(kref, struct rbd_req_coll, kref);
518
519 dout("rbd_coll_release %p\n", coll);
520 kfree(coll);
521}
602adf40 522
a30b71b9
AE
523static bool rbd_image_format_valid(u32 image_format)
524{
525 return image_format == 1 || image_format == 2;
526}
527
8e94af8e
AE
528static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
529{
103a150f
AE
530 size_t size;
531 u32 snap_count;
532
533 /* The header has to start with the magic rbd header text */
534 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
535 return false;
536
db2388b6
AE
537 /* The bio layer requires at least sector-sized I/O */
538
539 if (ondisk->options.order < SECTOR_SHIFT)
540 return false;
541
542 /* If we use u64 in a few spots we may be able to loosen this */
543
544 if (ondisk->options.order > 8 * sizeof (int) - 1)
545 return false;
546
103a150f
AE
547 /*
548 * The size of a snapshot header has to fit in a size_t, and
549 * that limits the number of snapshots.
550 */
551 snap_count = le32_to_cpu(ondisk->snap_count);
552 size = SIZE_MAX - sizeof (struct ceph_snap_context);
553 if (snap_count > size / sizeof (__le64))
554 return false;
555
556 /*
557 * Not only that, but the size of the entire the snapshot
558 * header must also be representable in a size_t.
559 */
560 size -= snap_count * sizeof (__le64);
561 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
562 return false;
563
564 return true;
8e94af8e
AE
565}
566
602adf40
YS
567/*
568 * Create a new header structure, translate header format from the on-disk
569 * header.
570 */
571static int rbd_header_from_disk(struct rbd_image_header *header,
4156d998 572 struct rbd_image_header_ondisk *ondisk)
602adf40 573{
ccece235 574 u32 snap_count;
58c17b0e 575 size_t len;
d2bb24e5 576 size_t size;
621901d6 577 u32 i;
602adf40 578
6a52325f
AE
579 memset(header, 0, sizeof (*header));
580
103a150f
AE
581 snap_count = le32_to_cpu(ondisk->snap_count);
582
58c17b0e
AE
583 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
584 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6a52325f 585 if (!header->object_prefix)
602adf40 586 return -ENOMEM;
58c17b0e
AE
587 memcpy(header->object_prefix, ondisk->object_prefix, len);
588 header->object_prefix[len] = '\0';
00f1f36f 589
602adf40 590 if (snap_count) {
f785cc1d
AE
591 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
592
621901d6
AE
593 /* Save a copy of the snapshot names */
594
f785cc1d
AE
595 if (snap_names_len > (u64) SIZE_MAX)
596 return -EIO;
597 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
602adf40 598 if (!header->snap_names)
6a52325f 599 goto out_err;
f785cc1d
AE
600 /*
601 * Note that rbd_dev_v1_header_read() guarantees
602 * the ondisk buffer we're working with has
603 * snap_names_len bytes beyond the end of the
604 * snapshot id array, this memcpy() is safe.
605 */
606 memcpy(header->snap_names, &ondisk->snaps[snap_count],
607 snap_names_len);
6a52325f 608
621901d6
AE
609 /* Record each snapshot's size */
610
d2bb24e5
AE
611 size = snap_count * sizeof (*header->snap_sizes);
612 header->snap_sizes = kmalloc(size, GFP_KERNEL);
602adf40 613 if (!header->snap_sizes)
6a52325f 614 goto out_err;
621901d6
AE
615 for (i = 0; i < snap_count; i++)
616 header->snap_sizes[i] =
617 le64_to_cpu(ondisk->snaps[i].image_size);
602adf40 618 } else {
ccece235 619 WARN_ON(ondisk->snap_names_len);
602adf40
YS
620 header->snap_names = NULL;
621 header->snap_sizes = NULL;
622 }
849b4260 623
34b13184 624 header->features = 0; /* No features support in v1 images */
602adf40
YS
625 header->obj_order = ondisk->options.order;
626 header->crypt_type = ondisk->options.crypt_type;
627 header->comp_type = ondisk->options.comp_type;
6a52325f 628
621901d6
AE
629 /* Allocate and fill in the snapshot context */
630
f84344f3 631 header->image_size = le64_to_cpu(ondisk->image_size);
6a52325f
AE
632 size = sizeof (struct ceph_snap_context);
633 size += snap_count * sizeof (header->snapc->snaps[0]);
634 header->snapc = kzalloc(size, GFP_KERNEL);
635 if (!header->snapc)
636 goto out_err;
602adf40
YS
637
638 atomic_set(&header->snapc->nref, 1);
505cbb9b 639 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
602adf40 640 header->snapc->num_snaps = snap_count;
621901d6
AE
641 for (i = 0; i < snap_count; i++)
642 header->snapc->snaps[i] =
643 le64_to_cpu(ondisk->snaps[i].id);
602adf40
YS
644
645 return 0;
646
6a52325f 647out_err:
849b4260 648 kfree(header->snap_sizes);
ccece235 649 header->snap_sizes = NULL;
602adf40 650 kfree(header->snap_names);
ccece235 651 header->snap_names = NULL;
6a52325f
AE
652 kfree(header->object_prefix);
653 header->object_prefix = NULL;
ccece235 654
00f1f36f 655 return -ENOMEM;
602adf40
YS
656}
657
8836b995 658static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
602adf40 659{
602adf40 660
e86924a8 661 struct rbd_snap *snap;
602adf40 662
e86924a8
AE
663 list_for_each_entry(snap, &rbd_dev->snaps, node) {
664 if (!strcmp(snap_name, snap->name)) {
0d7dbfce 665 rbd_dev->spec->snap_id = snap->id;
e86924a8 666 rbd_dev->mapping.size = snap->size;
34b13184 667 rbd_dev->mapping.features = snap->features;
602adf40 668
e86924a8 669 return 0;
00f1f36f 670 }
00f1f36f 671 }
e86924a8 672
00f1f36f 673 return -ENOENT;
602adf40
YS
674}
675
819d52bf 676static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
602adf40 677{
78dc447d 678 int ret;
602adf40 679
0d7dbfce 680 if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
cc9d734c 681 sizeof (RBD_SNAP_HEAD_NAME))) {
0d7dbfce 682 rbd_dev->spec->snap_id = CEPH_NOSNAP;
99c1f08f 683 rbd_dev->mapping.size = rbd_dev->header.image_size;
34b13184 684 rbd_dev->mapping.features = rbd_dev->header.features;
e86924a8 685 ret = 0;
602adf40 686 } else {
0d7dbfce 687 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
602adf40
YS
688 if (ret < 0)
689 goto done;
f84344f3 690 rbd_dev->mapping.read_only = true;
602adf40 691 }
daba5fdb 692 rbd_dev->exists = true;
602adf40 693done:
602adf40
YS
694 return ret;
695}
696
697static void rbd_header_free(struct rbd_image_header *header)
698{
849b4260 699 kfree(header->object_prefix);
d78fd7ae 700 header->object_prefix = NULL;
602adf40 701 kfree(header->snap_sizes);
d78fd7ae 702 header->snap_sizes = NULL;
849b4260 703 kfree(header->snap_names);
d78fd7ae 704 header->snap_names = NULL;
d1d25646 705 ceph_put_snap_context(header->snapc);
d78fd7ae 706 header->snapc = NULL;
602adf40
YS
707}
708
65ccfe21 709static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
602adf40 710{
65ccfe21
AE
711 char *name;
712 u64 segment;
713 int ret;
602adf40 714
65ccfe21
AE
715 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
716 if (!name)
717 return NULL;
718 segment = offset >> rbd_dev->header.obj_order;
719 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
720 rbd_dev->header.object_prefix, segment);
721 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
722 pr_err("error formatting segment name for #%llu (%d)\n",
723 segment, ret);
724 kfree(name);
725 name = NULL;
726 }
602adf40 727
65ccfe21
AE
728 return name;
729}
602adf40 730
65ccfe21
AE
731static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
732{
733 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
602adf40 734
65ccfe21
AE
735 return offset & (segment_size - 1);
736}
737
738static u64 rbd_segment_length(struct rbd_device *rbd_dev,
739 u64 offset, u64 length)
740{
741 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
742
743 offset &= segment_size - 1;
744
aafb230e 745 rbd_assert(length <= U64_MAX - offset);
65ccfe21
AE
746 if (offset + length > segment_size)
747 length = segment_size - offset;
748
749 return length;
602adf40
YS
750}
751
1fec7093
YS
752static int rbd_get_num_segments(struct rbd_image_header *header,
753 u64 ofs, u64 len)
754{
df111be6
AE
755 u64 start_seg;
756 u64 end_seg;
757
758 if (!len)
759 return 0;
760 if (len - 1 > U64_MAX - ofs)
761 return -ERANGE;
762
763 start_seg = ofs >> header->obj_order;
764 end_seg = (ofs + len - 1) >> header->obj_order;
765
1fec7093
YS
766 return end_seg - start_seg + 1;
767}
768
029bcbd8
JD
769/*
770 * returns the size of an object in the image
771 */
772static u64 rbd_obj_bytes(struct rbd_image_header *header)
773{
774 return 1 << header->obj_order;
775}
776
602adf40
YS
777/*
778 * bio helpers
779 */
780
781static void bio_chain_put(struct bio *chain)
782{
783 struct bio *tmp;
784
785 while (chain) {
786 tmp = chain;
787 chain = chain->bi_next;
788 bio_put(tmp);
789 }
790}
791
792/*
793 * zeros a bio chain, starting at specific offset
794 */
795static void zero_bio_chain(struct bio *chain, int start_ofs)
796{
797 struct bio_vec *bv;
798 unsigned long flags;
799 void *buf;
800 int i;
801 int pos = 0;
802
803 while (chain) {
804 bio_for_each_segment(bv, chain, i) {
805 if (pos + bv->bv_len > start_ofs) {
806 int remainder = max(start_ofs - pos, 0);
807 buf = bvec_kmap_irq(bv, &flags);
808 memset(buf + remainder, 0,
809 bv->bv_len - remainder);
85b5aaa6 810 bvec_kunmap_irq(buf, &flags);
602adf40
YS
811 }
812 pos += bv->bv_len;
813 }
814
815 chain = chain->bi_next;
816 }
817}
818
819/*
f7760dad
AE
820 * Clone a portion of a bio, starting at the given byte offset
821 * and continuing for the number of bytes indicated.
602adf40 822 */
f7760dad
AE
823static struct bio *bio_clone_range(struct bio *bio_src,
824 unsigned int offset,
825 unsigned int len,
826 gfp_t gfpmask)
602adf40 827{
f7760dad
AE
828 struct bio_vec *bv;
829 unsigned int resid;
830 unsigned short idx;
831 unsigned int voff;
832 unsigned short end_idx;
833 unsigned short vcnt;
834 struct bio *bio;
835
836 /* Handle the easy case for the caller */
837
838 if (!offset && len == bio_src->bi_size)
839 return bio_clone(bio_src, gfpmask);
840
841 if (WARN_ON_ONCE(!len))
842 return NULL;
843 if (WARN_ON_ONCE(len > bio_src->bi_size))
844 return NULL;
845 if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
846 return NULL;
847
848 /* Find first affected segment... */
849
850 resid = offset;
851 __bio_for_each_segment(bv, bio_src, idx, 0) {
852 if (resid < bv->bv_len)
853 break;
854 resid -= bv->bv_len;
602adf40 855 }
f7760dad 856 voff = resid;
602adf40 857
f7760dad 858 /* ...and the last affected segment */
602adf40 859
f7760dad
AE
860 resid += len;
861 __bio_for_each_segment(bv, bio_src, end_idx, idx) {
862 if (resid <= bv->bv_len)
863 break;
864 resid -= bv->bv_len;
865 }
866 vcnt = end_idx - idx + 1;
867
868 /* Build the clone */
869
870 bio = bio_alloc(gfpmask, (unsigned int) vcnt);
871 if (!bio)
872 return NULL; /* ENOMEM */
602adf40 873
f7760dad
AE
874 bio->bi_bdev = bio_src->bi_bdev;
875 bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
876 bio->bi_rw = bio_src->bi_rw;
877 bio->bi_flags |= 1 << BIO_CLONED;
878
879 /*
880 * Copy over our part of the bio_vec, then update the first
881 * and last (or only) entries.
882 */
883 memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
884 vcnt * sizeof (struct bio_vec));
885 bio->bi_io_vec[0].bv_offset += voff;
886 if (vcnt > 1) {
887 bio->bi_io_vec[0].bv_len -= voff;
888 bio->bi_io_vec[vcnt - 1].bv_len = resid;
889 } else {
890 bio->bi_io_vec[0].bv_len = len;
602adf40
YS
891 }
892
f7760dad
AE
893 bio->bi_vcnt = vcnt;
894 bio->bi_size = len;
895 bio->bi_idx = 0;
896
897 return bio;
898}
899
900/*
901 * Clone a portion of a bio chain, starting at the given byte offset
902 * into the first bio in the source chain and continuing for the
903 * number of bytes indicated. The result is another bio chain of
904 * exactly the given length, or a null pointer on error.
905 *
906 * The bio_src and offset parameters are both in-out. On entry they
907 * refer to the first source bio and the offset into that bio where
908 * the start of data to be cloned is located.
909 *
910 * On return, bio_src is updated to refer to the bio in the source
911 * chain that contains first un-cloned byte, and *offset will
912 * contain the offset of that byte within that bio.
913 */
914static struct bio *bio_chain_clone_range(struct bio **bio_src,
915 unsigned int *offset,
916 unsigned int len,
917 gfp_t gfpmask)
918{
919 struct bio *bi = *bio_src;
920 unsigned int off = *offset;
921 struct bio *chain = NULL;
922 struct bio **end;
923
924 /* Build up a chain of clone bios up to the limit */
925
926 if (!bi || off >= bi->bi_size || !len)
927 return NULL; /* Nothing to clone */
602adf40 928
f7760dad
AE
929 end = &chain;
930 while (len) {
931 unsigned int bi_size;
932 struct bio *bio;
933
934 if (!bi)
935 goto out_err; /* EINVAL; ran out of bio's */
936 bi_size = min_t(unsigned int, bi->bi_size - off, len);
937 bio = bio_clone_range(bi, off, bi_size, gfpmask);
938 if (!bio)
939 goto out_err; /* ENOMEM */
940
941 *end = bio;
942 end = &bio->bi_next;
602adf40 943
f7760dad
AE
944 off += bi_size;
945 if (off == bi->bi_size) {
946 bi = bi->bi_next;
947 off = 0;
948 }
949 len -= bi_size;
950 }
951 *bio_src = bi;
952 *offset = off;
953
954 return chain;
955out_err:
956 bio_chain_put(chain);
602adf40 957
602adf40
YS
958 return NULL;
959}
960
961/*
962 * helpers for osd request op vectors.
963 */
57cfc106
AE
964static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
965 int opcode, u32 payload_len)
602adf40 966{
57cfc106
AE
967 struct ceph_osd_req_op *ops;
968
969 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
970 if (!ops)
971 return NULL;
972
973 ops[0].op = opcode;
974
602adf40
YS
975 /*
976 * op extent offset and length will be set later on
977 * in calc_raw_layout()
978 */
57cfc106
AE
979 ops[0].payload_len = payload_len;
980
981 return ops;
602adf40
YS
982}
983
984static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
985{
986 kfree(ops);
987}
988
1fec7093
YS
989static void rbd_coll_end_req_index(struct request *rq,
990 struct rbd_req_coll *coll,
991 int index,
992 int ret, u64 len)
993{
994 struct request_queue *q;
995 int min, max, i;
996
bd919d45
AE
997 dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
998 coll, index, ret, (unsigned long long) len);
1fec7093
YS
999
1000 if (!rq)
1001 return;
1002
1003 if (!coll) {
1004 blk_end_request(rq, ret, len);
1005 return;
1006 }
1007
1008 q = rq->q;
1009
1010 spin_lock_irq(q->queue_lock);
1011 coll->status[index].done = 1;
1012 coll->status[index].rc = ret;
1013 coll->status[index].bytes = len;
1014 max = min = coll->num_done;
1015 while (max < coll->total && coll->status[max].done)
1016 max++;
1017
1018 for (i = min; i<max; i++) {
1019 __blk_end_request(rq, coll->status[i].rc,
1020 coll->status[i].bytes);
1021 coll->num_done++;
1022 kref_put(&coll->kref, rbd_coll_release);
1023 }
1024 spin_unlock_irq(q->queue_lock);
1025}
1026
1027static void rbd_coll_end_req(struct rbd_request *req,
1028 int ret, u64 len)
1029{
1030 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
1031}
1032
602adf40
YS
1033/*
1034 * Send ceph osd request
1035 */
1036static int rbd_do_request(struct request *rq,
0ce1a794 1037 struct rbd_device *rbd_dev,
602adf40
YS
1038 struct ceph_snap_context *snapc,
1039 u64 snapid,
aded07ea 1040 const char *object_name, u64 ofs, u64 len,
602adf40
YS
1041 struct bio *bio,
1042 struct page **pages,
1043 int num_pages,
1044 int flags,
1045 struct ceph_osd_req_op *ops,
1fec7093
YS
1046 struct rbd_req_coll *coll,
1047 int coll_index,
602adf40 1048 void (*rbd_cb)(struct ceph_osd_request *req,
59c2be1e
YS
1049 struct ceph_msg *msg),
1050 struct ceph_osd_request **linger_req,
1051 u64 *ver)
602adf40
YS
1052{
1053 struct ceph_osd_request *req;
1054 struct ceph_file_layout *layout;
1055 int ret;
1056 u64 bno;
1057 struct timespec mtime = CURRENT_TIME;
1058 struct rbd_request *req_data;
1059 struct ceph_osd_request_head *reqhead;
1dbb4399 1060 struct ceph_osd_client *osdc;
602adf40 1061
602adf40 1062 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
1fec7093
YS
1063 if (!req_data) {
1064 if (coll)
1065 rbd_coll_end_req_index(rq, coll, coll_index,
1066 -ENOMEM, len);
1067 return -ENOMEM;
1068 }
1069
1070 if (coll) {
1071 req_data->coll = coll;
1072 req_data->coll_index = coll_index;
1073 }
602adf40 1074
f7760dad
AE
1075 dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
1076 object_name, (unsigned long long) ofs,
1077 (unsigned long long) len, coll, coll_index);
602adf40 1078
0ce1a794 1079 osdc = &rbd_dev->rbd_client->client->osdc;
1dbb4399
AE
1080 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
1081 false, GFP_NOIO, pages, bio);
4ad12621 1082 if (!req) {
4ad12621 1083 ret = -ENOMEM;
602adf40
YS
1084 goto done_pages;
1085 }
1086
1087 req->r_callback = rbd_cb;
1088
1089 req_data->rq = rq;
1090 req_data->bio = bio;
1091 req_data->pages = pages;
1092 req_data->len = len;
1093
1094 req->r_priv = req_data;
1095
1096 reqhead = req->r_request->front.iov_base;
1097 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1098
aded07ea 1099 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
602adf40
YS
1100 req->r_oid_len = strlen(req->r_oid);
1101
1102 layout = &req->r_file_layout;
1103 memset(layout, 0, sizeof(*layout));
1104 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1105 layout->fl_stripe_count = cpu_to_le32(1);
1106 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
0d7dbfce 1107 layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
6cae3717
SW
1108 ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
1109 req, ops);
1110 rbd_assert(ret == 0);
602adf40
YS
1111
1112 ceph_osdc_build_request(req, ofs, &len,
1113 ops,
1114 snapc,
1115 &mtime,
1116 req->r_oid, req->r_oid_len);
602adf40 1117
59c2be1e 1118 if (linger_req) {
1dbb4399 1119 ceph_osdc_set_request_linger(osdc, req);
59c2be1e
YS
1120 *linger_req = req;
1121 }
1122
1dbb4399 1123 ret = ceph_osdc_start_request(osdc, req, false);
602adf40
YS
1124 if (ret < 0)
1125 goto done_err;
1126
1127 if (!rbd_cb) {
1dbb4399 1128 ret = ceph_osdc_wait_request(osdc, req);
59c2be1e
YS
1129 if (ver)
1130 *ver = le64_to_cpu(req->r_reassert_version.version);
bd919d45
AE
1131 dout("reassert_ver=%llu\n",
1132 (unsigned long long)
1133 le64_to_cpu(req->r_reassert_version.version));
602adf40
YS
1134 ceph_osdc_put_request(req);
1135 }
1136 return ret;
1137
1138done_err:
1139 bio_chain_put(req_data->bio);
1140 ceph_osdc_put_request(req);
1141done_pages:
1fec7093 1142 rbd_coll_end_req(req_data, ret, len);
602adf40 1143 kfree(req_data);
602adf40
YS
1144 return ret;
1145}
1146
1147/*
1148 * Ceph osd op callback
1149 */
1150static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1151{
1152 struct rbd_request *req_data = req->r_priv;
1153 struct ceph_osd_reply_head *replyhead;
1154 struct ceph_osd_op *op;
1155 __s32 rc;
1156 u64 bytes;
1157 int read_op;
1158
1159 /* parse reply */
1160 replyhead = msg->front.iov_base;
1161 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1162 op = (void *)(replyhead + 1);
1163 rc = le32_to_cpu(replyhead->result);
1164 bytes = le64_to_cpu(op->extent.length);
895cfcc8 1165 read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
602adf40 1166
bd919d45
AE
1167 dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1168 (unsigned long long) bytes, read_op, (int) rc);
602adf40
YS
1169
1170 if (rc == -ENOENT && read_op) {
1171 zero_bio_chain(req_data->bio, 0);
1172 rc = 0;
1173 } else if (rc == 0 && read_op && bytes < req_data->len) {
1174 zero_bio_chain(req_data->bio, bytes);
1175 bytes = req_data->len;
1176 }
1177
1fec7093 1178 rbd_coll_end_req(req_data, rc, bytes);
602adf40
YS
1179
1180 if (req_data->bio)
1181 bio_chain_put(req_data->bio);
1182
1183 ceph_osdc_put_request(req);
1184 kfree(req_data);
1185}
1186
/*
 * Minimal osd completion callback: nothing to report, just release
 * the request reference.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1191
602adf40
YS
1192/*
1193 * Do a synchronous ceph osd operation
1194 */
0ce1a794 1195static int rbd_req_sync_op(struct rbd_device *rbd_dev,
602adf40
YS
1196 struct ceph_snap_context *snapc,
1197 u64 snapid,
602adf40 1198 int flags,
913d2fdc 1199 struct ceph_osd_req_op *ops,
aded07ea 1200 const char *object_name,
f8d4de6e
AE
1201 u64 ofs, u64 inbound_size,
1202 char *inbound,
59c2be1e
YS
1203 struct ceph_osd_request **linger_req,
1204 u64 *ver)
602adf40
YS
1205{
1206 int ret;
1207 struct page **pages;
1208 int num_pages;
913d2fdc 1209
aafb230e 1210 rbd_assert(ops != NULL);
602adf40 1211
f8d4de6e 1212 num_pages = calc_pages_for(ofs, inbound_size);
602adf40 1213 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
b8d0638a
DC
1214 if (IS_ERR(pages))
1215 return PTR_ERR(pages);
602adf40 1216
0ce1a794 1217 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
f8d4de6e 1218 object_name, ofs, inbound_size, NULL,
602adf40
YS
1219 pages, num_pages,
1220 flags,
1221 ops,
1fec7093 1222 NULL, 0,
59c2be1e
YS
1223 NULL,
1224 linger_req, ver);
602adf40 1225 if (ret < 0)
913d2fdc 1226 goto done;
602adf40 1227
f8d4de6e
AE
1228 if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1229 ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
602adf40 1230
602adf40
YS
1231done:
1232 ceph_release_page_vector(pages, num_pages);
1233 return ret;
1234}
1235
1236/*
1237 * Do an asynchronous ceph osd operation
1238 */
1239static int rbd_do_op(struct request *rq,
0ce1a794 1240 struct rbd_device *rbd_dev,
602adf40 1241 struct ceph_snap_context *snapc,
602adf40 1242 u64 ofs, u64 len,
1fec7093
YS
1243 struct bio *bio,
1244 struct rbd_req_coll *coll,
1245 int coll_index)
602adf40
YS
1246{
1247 char *seg_name;
1248 u64 seg_ofs;
1249 u64 seg_len;
1250 int ret;
1251 struct ceph_osd_req_op *ops;
1252 u32 payload_len;
ff2e4bb5
AE
1253 int opcode;
1254 int flags;
4634246d 1255 u64 snapid;
602adf40 1256
65ccfe21 1257 seg_name = rbd_segment_name(rbd_dev, ofs);
602adf40
YS
1258 if (!seg_name)
1259 return -ENOMEM;
65ccfe21
AE
1260 seg_len = rbd_segment_length(rbd_dev, ofs, len);
1261 seg_ofs = rbd_segment_offset(rbd_dev, ofs);
602adf40 1262
ff2e4bb5
AE
1263 if (rq_data_dir(rq) == WRITE) {
1264 opcode = CEPH_OSD_OP_WRITE;
1265 flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
4634246d 1266 snapid = CEPH_NOSNAP;
ff2e4bb5
AE
1267 payload_len = seg_len;
1268 } else {
1269 opcode = CEPH_OSD_OP_READ;
1270 flags = CEPH_OSD_FLAG_READ;
4634246d 1271 snapc = NULL;
0d7dbfce 1272 snapid = rbd_dev->spec->snap_id;
ff2e4bb5
AE
1273 payload_len = 0;
1274 }
602adf40 1275
57cfc106
AE
1276 ret = -ENOMEM;
1277 ops = rbd_create_rw_ops(1, opcode, payload_len);
1278 if (!ops)
602adf40
YS
1279 goto done;
1280
1281 /* we've taken care of segment sizes earlier when we
1282 cloned the bios. We should never have a segment
1283 truncated at this point */
aafb230e 1284 rbd_assert(seg_len == len);
602adf40
YS
1285
1286 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1287 seg_name, seg_ofs, seg_len,
1288 bio,
1289 NULL, 0,
1290 flags,
1291 ops,
1fec7093 1292 coll, coll_index,
59c2be1e 1293 rbd_req_cb, 0, NULL);
11f77002
SW
1294
1295 rbd_destroy_ops(ops);
602adf40
YS
1296done:
1297 kfree(seg_name);
1298 return ret;
1299}
1300
602adf40
YS
1301/*
1302 * Request sync osd read
1303 */
0ce1a794 1304static int rbd_req_sync_read(struct rbd_device *rbd_dev,
602adf40 1305 u64 snapid,
aded07ea 1306 const char *object_name,
602adf40 1307 u64 ofs, u64 len,
59c2be1e
YS
1308 char *buf,
1309 u64 *ver)
602adf40 1310{
913d2fdc
AE
1311 struct ceph_osd_req_op *ops;
1312 int ret;
1313
1314 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1315 if (!ops)
1316 return -ENOMEM;
1317
1318 ret = rbd_req_sync_op(rbd_dev, NULL,
b06e6a6b 1319 snapid,
602adf40 1320 CEPH_OSD_FLAG_READ,
913d2fdc
AE
1321 ops, object_name, ofs, len, buf, NULL, ver);
1322 rbd_destroy_ops(ops);
1323
1324 return ret;
602adf40
YS
1325}
1326
1327/*
59c2be1e
YS
1328 * Request sync osd watch
1329 */
0ce1a794 1330static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
59c2be1e 1331 u64 ver,
7f0a24d8 1332 u64 notify_id)
59c2be1e
YS
1333{
1334 struct ceph_osd_req_op *ops;
11f77002
SW
1335 int ret;
1336
57cfc106
AE
1337 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1338 if (!ops)
1339 return -ENOMEM;
59c2be1e 1340
a71b891b 1341 ops[0].watch.ver = cpu_to_le64(ver);
59c2be1e
YS
1342 ops[0].watch.cookie = notify_id;
1343 ops[0].watch.flag = 0;
1344
0ce1a794 1345 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
7f0a24d8 1346 rbd_dev->header_name, 0, 0, NULL,
ad4f232f 1347 NULL, 0,
59c2be1e
YS
1348 CEPH_OSD_FLAG_READ,
1349 ops,
1fec7093 1350 NULL, 0,
59c2be1e
YS
1351 rbd_simple_req_cb, 0, NULL);
1352
1353 rbd_destroy_ops(ops);
1354 return ret;
1355}
1356
1357static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1358{
0ce1a794 1359 struct rbd_device *rbd_dev = (struct rbd_device *)data;
a71b891b 1360 u64 hver;
13143d2d
SW
1361 int rc;
1362
0ce1a794 1363 if (!rbd_dev)
59c2be1e
YS
1364 return;
1365
bd919d45
AE
1366 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1367 rbd_dev->header_name, (unsigned long long) notify_id,
1368 (unsigned int) opcode);
117973fb 1369 rc = rbd_dev_refresh(rbd_dev, &hver);
13143d2d 1370 if (rc)
f0f8cef5 1371 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
0ce1a794 1372 " update snaps: %d\n", rbd_dev->major, rc);
59c2be1e 1373
7f0a24d8 1374 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
59c2be1e
YS
1375}
1376
1377/*
1378 * Request sync osd watch
1379 */
0e6f322d 1380static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
59c2be1e
YS
1381{
1382 struct ceph_osd_req_op *ops;
0ce1a794 1383 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
57cfc106 1384 int ret;
59c2be1e 1385
57cfc106
AE
1386 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1387 if (!ops)
1388 return -ENOMEM;
59c2be1e
YS
1389
1390 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
0ce1a794 1391 (void *)rbd_dev, &rbd_dev->watch_event);
59c2be1e
YS
1392 if (ret < 0)
1393 goto fail;
1394
0e6f322d 1395 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
0ce1a794 1396 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
59c2be1e
YS
1397 ops[0].watch.flag = 1;
1398
0ce1a794 1399 ret = rbd_req_sync_op(rbd_dev, NULL,
59c2be1e 1400 CEPH_NOSNAP,
59c2be1e
YS
1401 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1402 ops,
0e6f322d
AE
1403 rbd_dev->header_name,
1404 0, 0, NULL,
0ce1a794 1405 &rbd_dev->watch_request, NULL);
59c2be1e
YS
1406
1407 if (ret < 0)
1408 goto fail_event;
1409
1410 rbd_destroy_ops(ops);
1411 return 0;
1412
1413fail_event:
0ce1a794
AE
1414 ceph_osdc_cancel_event(rbd_dev->watch_event);
1415 rbd_dev->watch_event = NULL;
59c2be1e
YS
1416fail:
1417 rbd_destroy_ops(ops);
1418 return ret;
1419}
1420
79e3057c
YS
1421/*
1422 * Request sync osd unwatch
1423 */
070c633f 1424static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
79e3057c
YS
1425{
1426 struct ceph_osd_req_op *ops;
57cfc106 1427 int ret;
79e3057c 1428
57cfc106
AE
1429 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1430 if (!ops)
1431 return -ENOMEM;
79e3057c
YS
1432
1433 ops[0].watch.ver = 0;
0ce1a794 1434 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
79e3057c
YS
1435 ops[0].watch.flag = 0;
1436
0ce1a794 1437 ret = rbd_req_sync_op(rbd_dev, NULL,
79e3057c 1438 CEPH_NOSNAP,
79e3057c
YS
1439 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1440 ops,
070c633f
AE
1441 rbd_dev->header_name,
1442 0, 0, NULL, NULL, NULL);
1443
79e3057c
YS
1444
1445 rbd_destroy_ops(ops);
0ce1a794
AE
1446 ceph_osdc_cancel_event(rbd_dev->watch_event);
1447 rbd_dev->watch_event = NULL;
79e3057c
YS
1448 return ret;
1449}
1450
602adf40 1451/*
3cb4a687 1452 * Synchronous osd object method call
602adf40 1453 */
0ce1a794 1454static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
aded07ea
AE
1455 const char *object_name,
1456 const char *class_name,
1457 const char *method_name,
3cb4a687
AE
1458 const char *outbound,
1459 size_t outbound_size,
f8d4de6e
AE
1460 char *inbound,
1461 size_t inbound_size,
3cb4a687 1462 int flags,
59c2be1e 1463 u64 *ver)
602adf40
YS
1464{
1465 struct ceph_osd_req_op *ops;
aded07ea
AE
1466 int class_name_len = strlen(class_name);
1467 int method_name_len = strlen(method_name);
3cb4a687 1468 int payload_size;
57cfc106
AE
1469 int ret;
1470
3cb4a687
AE
1471 /*
1472 * Any input parameters required by the method we're calling
1473 * will be sent along with the class and method names as
1474 * part of the message payload. That data and its size are
1475 * supplied via the indata and indata_len fields (named from
1476 * the perspective of the server side) in the OSD request
1477 * operation.
1478 */
1479 payload_size = class_name_len + method_name_len + outbound_size;
1480 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
57cfc106
AE
1481 if (!ops)
1482 return -ENOMEM;
602adf40 1483
aded07ea
AE
1484 ops[0].cls.class_name = class_name;
1485 ops[0].cls.class_len = (__u8) class_name_len;
1486 ops[0].cls.method_name = method_name;
1487 ops[0].cls.method_len = (__u8) method_name_len;
602adf40 1488 ops[0].cls.argc = 0;
3cb4a687
AE
1489 ops[0].cls.indata = outbound;
1490 ops[0].cls.indata_len = outbound_size;
602adf40 1491
0ce1a794 1492 ret = rbd_req_sync_op(rbd_dev, NULL,
602adf40 1493 CEPH_NOSNAP,
3cb4a687 1494 flags, ops,
f8d4de6e
AE
1495 object_name, 0, inbound_size, inbound,
1496 NULL, ver);
602adf40
YS
1497
1498 rbd_destroy_ops(ops);
1499
1500 dout("cls_exec returned %d\n", ret);
1501 return ret;
1502}
1503
1fec7093
YS
1504static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1505{
1506 struct rbd_req_coll *coll =
1507 kzalloc(sizeof(struct rbd_req_coll) +
1508 sizeof(struct rbd_req_status) * num_reqs,
1509 GFP_ATOMIC);
1510
1511 if (!coll)
1512 return NULL;
1513 coll->total = num_reqs;
1514 kref_init(&coll->kref);
1515 return coll;
1516}
1517
602adf40
YS
1518/*
1519 * block device queue callback
1520 */
1521static void rbd_rq_fn(struct request_queue *q)
1522{
1523 struct rbd_device *rbd_dev = q->queuedata;
1524 struct request *rq;
602adf40 1525
00f1f36f 1526 while ((rq = blk_fetch_request(q))) {
602adf40 1527 struct bio *bio;
602adf40 1528 bool do_write;
bd919d45 1529 unsigned int size;
602adf40 1530 u64 ofs;
1fec7093
YS
1531 int num_segs, cur_seg = 0;
1532 struct rbd_req_coll *coll;
d1d25646 1533 struct ceph_snap_context *snapc;
f7760dad 1534 unsigned int bio_offset;
602adf40 1535
602adf40
YS
1536 dout("fetched request\n");
1537
1538 /* filter out block requests we don't understand */
1539 if ((rq->cmd_type != REQ_TYPE_FS)) {
1540 __blk_end_request_all(rq, 0);
00f1f36f 1541 continue;
602adf40
YS
1542 }
1543
1544 /* deduce our operation (read, write) */
1545 do_write = (rq_data_dir(rq) == WRITE);
f84344f3 1546 if (do_write && rbd_dev->mapping.read_only) {
602adf40 1547 __blk_end_request_all(rq, -EROFS);
00f1f36f 1548 continue;
602adf40
YS
1549 }
1550
1551 spin_unlock_irq(q->queue_lock);
1552
d1d25646 1553 down_read(&rbd_dev->header_rwsem);
e88a36ec 1554
daba5fdb 1555 if (!rbd_dev->exists) {
0d7dbfce 1556 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
e88a36ec 1557 up_read(&rbd_dev->header_rwsem);
d1d25646
JD
1558 dout("request for non-existent snapshot");
1559 spin_lock_irq(q->queue_lock);
1560 __blk_end_request_all(rq, -ENXIO);
1561 continue;
e88a36ec
JD
1562 }
1563
d1d25646
JD
1564 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1565
1566 up_read(&rbd_dev->header_rwsem);
1567
f7760dad
AE
1568 size = blk_rq_bytes(rq);
1569 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
1570 bio = rq->bio;
1571
602adf40
YS
1572 dout("%s 0x%x bytes at 0x%llx\n",
1573 do_write ? "write" : "read",
bd919d45 1574 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
602adf40 1575
1fec7093 1576 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
df111be6
AE
1577 if (num_segs <= 0) {
1578 spin_lock_irq(q->queue_lock);
1579 __blk_end_request_all(rq, num_segs);
1580 ceph_put_snap_context(snapc);
1581 continue;
1582 }
1fec7093
YS
1583 coll = rbd_alloc_coll(num_segs);
1584 if (!coll) {
1585 spin_lock_irq(q->queue_lock);
1586 __blk_end_request_all(rq, -ENOMEM);
d1d25646 1587 ceph_put_snap_context(snapc);
00f1f36f 1588 continue;
1fec7093
YS
1589 }
1590
f7760dad 1591 bio_offset = 0;
602adf40 1592 do {
f7760dad
AE
1593 u64 limit = rbd_segment_length(rbd_dev, ofs, size);
1594 unsigned int chain_size;
1595 struct bio *bio_chain;
1596
1597 BUG_ON(limit > (u64) UINT_MAX);
1598 chain_size = (unsigned int) limit;
bd919d45 1599 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
f7760dad 1600
1fec7093 1601 kref_get(&coll->kref);
f7760dad
AE
1602
1603 /* Pass a cloned bio chain via an osd request */
1604
1605 bio_chain = bio_chain_clone_range(&bio,
1606 &bio_offset, chain_size,
1607 GFP_ATOMIC);
1608 if (bio_chain)
4634246d 1609 (void) rbd_do_op(rq, rbd_dev, snapc,
f7760dad
AE
1610 ofs, chain_size,
1611 bio_chain, coll, cur_seg);
4634246d 1612 else
1fec7093 1613 rbd_coll_end_req_index(rq, coll, cur_seg,
f7760dad
AE
1614 -ENOMEM, chain_size);
1615 size -= chain_size;
1616 ofs += chain_size;
602adf40 1617
1fec7093 1618 cur_seg++;
602adf40 1619 } while (size > 0);
1fec7093 1620 kref_put(&coll->kref, rbd_coll_release);
602adf40 1621
602adf40 1622 spin_lock_irq(q->queue_lock);
d1d25646
JD
1623
1624 ceph_put_snap_context(snapc);
602adf40
YS
1625 }
1626}
1627
1628/*
1629 * a queue callback. Makes sure that we don't create a bio that spans across
1630 * multiple osd objects. One exception would be with a single page bios,
f7760dad 1631 * which we handle later at bio_chain_clone_range()
602adf40
YS
1632 */
1633static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1634 struct bio_vec *bvec)
1635{
1636 struct rbd_device *rbd_dev = q->queuedata;
e5cfeed2
AE
1637 sector_t sector_offset;
1638 sector_t sectors_per_obj;
1639 sector_t obj_sector_offset;
1640 int ret;
1641
1642 /*
1643 * Find how far into its rbd object the partition-relative
1644 * bio start sector is to offset relative to the enclosing
1645 * device.
1646 */
1647 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
1648 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1649 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
1650
1651 /*
1652 * Compute the number of bytes from that offset to the end
1653 * of the object. Account for what's already used by the bio.
1654 */
1655 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
1656 if (ret > bmd->bi_size)
1657 ret -= bmd->bi_size;
1658 else
1659 ret = 0;
1660
1661 /*
1662 * Don't send back more than was asked for. And if the bio
1663 * was empty, let the whole thing through because: "Note
1664 * that a block device *must* allow a single page to be
1665 * added to an empty bio."
1666 */
1667 rbd_assert(bvec->bv_len <= PAGE_SIZE);
1668 if (ret > (int) bvec->bv_len || !bmd->bi_size)
1669 ret = (int) bvec->bv_len;
1670
1671 return ret;
602adf40
YS
1672}
1673
1674static void rbd_free_disk(struct rbd_device *rbd_dev)
1675{
1676 struct gendisk *disk = rbd_dev->disk;
1677
1678 if (!disk)
1679 return;
1680
602adf40
YS
1681 if (disk->flags & GENHD_FL_UP)
1682 del_gendisk(disk);
1683 if (disk->queue)
1684 blk_cleanup_queue(disk->queue);
1685 put_disk(disk);
1686}
1687
1688/*
4156d998
AE
1689 * Read the complete header for the given rbd device.
1690 *
1691 * Returns a pointer to a dynamically-allocated buffer containing
1692 * the complete and validated header. Caller can pass the address
1693 * of a variable that will be filled in with the version of the
1694 * header object at the time it was read.
1695 *
1696 * Returns a pointer-coded errno if a failure occurs.
602adf40 1697 */
4156d998
AE
1698static struct rbd_image_header_ondisk *
1699rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
602adf40 1700{
4156d998 1701 struct rbd_image_header_ondisk *ondisk = NULL;
50f7c4c9 1702 u32 snap_count = 0;
4156d998
AE
1703 u64 names_size = 0;
1704 u32 want_count;
1705 int ret;
602adf40 1706
00f1f36f 1707 /*
4156d998
AE
1708 * The complete header will include an array of its 64-bit
1709 * snapshot ids, followed by the names of those snapshots as
1710 * a contiguous block of NUL-terminated strings. Note that
1711 * the number of snapshots could change by the time we read
1712 * it in, in which case we re-read it.
00f1f36f 1713 */
4156d998
AE
1714 do {
1715 size_t size;
1716
1717 kfree(ondisk);
1718
1719 size = sizeof (*ondisk);
1720 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1721 size += names_size;
1722 ondisk = kmalloc(size, GFP_KERNEL);
1723 if (!ondisk)
1724 return ERR_PTR(-ENOMEM);
1725
1726 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
0bed54dc 1727 rbd_dev->header_name,
4156d998
AE
1728 0, size,
1729 (char *) ondisk, version);
1730
1731 if (ret < 0)
1732 goto out_err;
1733 if (WARN_ON((size_t) ret < size)) {
1734 ret = -ENXIO;
1735 pr_warning("short header read for image %s"
1736 " (want %zd got %d)\n",
0d7dbfce 1737 rbd_dev->spec->image_name, size, ret);
4156d998
AE
1738 goto out_err;
1739 }
1740 if (!rbd_dev_ondisk_valid(ondisk)) {
1741 ret = -ENXIO;
1742 pr_warning("invalid header for image %s\n",
0d7dbfce 1743 rbd_dev->spec->image_name);
4156d998 1744 goto out_err;
81e759fb 1745 }
602adf40 1746
4156d998
AE
1747 names_size = le64_to_cpu(ondisk->snap_names_len);
1748 want_count = snap_count;
1749 snap_count = le32_to_cpu(ondisk->snap_count);
1750 } while (snap_count != want_count);
00f1f36f 1751
4156d998 1752 return ondisk;
00f1f36f 1753
4156d998
AE
1754out_err:
1755 kfree(ondisk);
1756
1757 return ERR_PTR(ret);
1758}
1759
1760/*
1761 * reload the ondisk the header
1762 */
1763static int rbd_read_header(struct rbd_device *rbd_dev,
1764 struct rbd_image_header *header)
1765{
1766 struct rbd_image_header_ondisk *ondisk;
1767 u64 ver = 0;
1768 int ret;
602adf40 1769
4156d998
AE
1770 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1771 if (IS_ERR(ondisk))
1772 return PTR_ERR(ondisk);
1773 ret = rbd_header_from_disk(header, ondisk);
1774 if (ret >= 0)
1775 header->obj_version = ver;
1776 kfree(ondisk);
1777
1778 return ret;
602adf40
YS
1779}
1780
41f38c2b 1781static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
dfc5606d
YS
1782{
1783 struct rbd_snap *snap;
a0593290 1784 struct rbd_snap *next;
dfc5606d 1785
a0593290 1786 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
41f38c2b 1787 rbd_remove_snap_dev(snap);
dfc5606d
YS
1788}
1789
9478554a
AE
1790static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1791{
1792 sector_t size;
1793
0d7dbfce 1794 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
9478554a
AE
1795 return;
1796
1797 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1798 dout("setting size to %llu sectors", (unsigned long long) size);
1799 rbd_dev->mapping.size = (u64) size;
1800 set_capacity(rbd_dev->disk, size);
1801}
1802
602adf40
YS
1803/*
1804 * only read the first part of the ondisk header, without the snaps info
1805 */
117973fb 1806static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
602adf40
YS
1807{
1808 int ret;
1809 struct rbd_image_header h;
602adf40
YS
1810
1811 ret = rbd_read_header(rbd_dev, &h);
1812 if (ret < 0)
1813 return ret;
1814
a51aa0c0
JD
1815 down_write(&rbd_dev->header_rwsem);
1816
9478554a
AE
1817 /* Update image size, and check for resize of mapped image */
1818 rbd_dev->header.image_size = h.image_size;
1819 rbd_update_mapping_size(rbd_dev);
9db4b3e3 1820
849b4260 1821 /* rbd_dev->header.object_prefix shouldn't change */
602adf40 1822 kfree(rbd_dev->header.snap_sizes);
849b4260 1823 kfree(rbd_dev->header.snap_names);
d1d25646
JD
1824 /* osd requests may still refer to snapc */
1825 ceph_put_snap_context(rbd_dev->header.snapc);
602adf40 1826
b813623a
AE
1827 if (hver)
1828 *hver = h.obj_version;
a71b891b 1829 rbd_dev->header.obj_version = h.obj_version;
93a24e08 1830 rbd_dev->header.image_size = h.image_size;
602adf40
YS
1831 rbd_dev->header.snapc = h.snapc;
1832 rbd_dev->header.snap_names = h.snap_names;
1833 rbd_dev->header.snap_sizes = h.snap_sizes;
849b4260
AE
1834 /* Free the extra copy of the object prefix */
1835 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1836 kfree(h.object_prefix);
1837
304f6808
AE
1838 ret = rbd_dev_snaps_update(rbd_dev);
1839 if (!ret)
1840 ret = rbd_dev_snaps_register(rbd_dev);
dfc5606d 1841
c666601a 1842 up_write(&rbd_dev->header_rwsem);
602adf40 1843
dfc5606d 1844 return ret;
602adf40
YS
1845}
1846
117973fb 1847static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
1fe5e993
AE
1848{
1849 int ret;
1850
117973fb 1851 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1fe5e993 1852 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
117973fb
AE
1853 if (rbd_dev->image_format == 1)
1854 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1855 else
1856 ret = rbd_dev_v2_refresh(rbd_dev, hver);
1fe5e993
AE
1857 mutex_unlock(&ctl_mutex);
1858
1859 return ret;
1860}
1861
602adf40
YS
1862static int rbd_init_disk(struct rbd_device *rbd_dev)
1863{
1864 struct gendisk *disk;
1865 struct request_queue *q;
593a9e7b 1866 u64 segment_size;
602adf40 1867
602adf40 1868 /* create gendisk info */
602adf40
YS
1869 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1870 if (!disk)
1fcdb8aa 1871 return -ENOMEM;
602adf40 1872
f0f8cef5 1873 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
de71a297 1874 rbd_dev->dev_id);
602adf40
YS
1875 disk->major = rbd_dev->major;
1876 disk->first_minor = 0;
1877 disk->fops = &rbd_bd_ops;
1878 disk->private_data = rbd_dev;
1879
1880 /* init rq */
602adf40
YS
1881 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1882 if (!q)
1883 goto out_disk;
029bcbd8 1884
593a9e7b
AE
1885 /* We use the default size, but let's be explicit about it. */
1886 blk_queue_physical_block_size(q, SECTOR_SIZE);
1887
029bcbd8 1888 /* set io sizes to object size */
593a9e7b
AE
1889 segment_size = rbd_obj_bytes(&rbd_dev->header);
1890 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1891 blk_queue_max_segment_size(q, segment_size);
1892 blk_queue_io_min(q, segment_size);
1893 blk_queue_io_opt(q, segment_size);
029bcbd8 1894
602adf40
YS
1895 blk_queue_merge_bvec(q, rbd_merge_bvec);
1896 disk->queue = q;
1897
1898 q->queuedata = rbd_dev;
1899
1900 rbd_dev->disk = disk;
602adf40 1901
12f02944
AE
1902 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
1903
602adf40 1904 return 0;
602adf40
YS
1905out_disk:
1906 put_disk(disk);
1fcdb8aa
AE
1907
1908 return -ENOMEM;
602adf40
YS
1909}
1910
dfc5606d
YS
1911/*
1912 sysfs
1913*/
1914
593a9e7b
AE
1915static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1916{
1917 return container_of(dev, struct rbd_device, dev);
1918}
1919
dfc5606d
YS
1920static ssize_t rbd_size_show(struct device *dev,
1921 struct device_attribute *attr, char *buf)
1922{
593a9e7b 1923 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
a51aa0c0
JD
1924 sector_t size;
1925
1926 down_read(&rbd_dev->header_rwsem);
1927 size = get_capacity(rbd_dev->disk);
1928 up_read(&rbd_dev->header_rwsem);
dfc5606d 1929
a51aa0c0 1930 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
dfc5606d
YS
1931}
1932
34b13184
AE
1933/*
1934 * Note this shows the features for whatever's mapped, which is not
1935 * necessarily the base image.
1936 */
1937static ssize_t rbd_features_show(struct device *dev,
1938 struct device_attribute *attr, char *buf)
1939{
1940 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1941
1942 return sprintf(buf, "0x%016llx\n",
1943 (unsigned long long) rbd_dev->mapping.features);
1944}
1945
dfc5606d
YS
1946static ssize_t rbd_major_show(struct device *dev,
1947 struct device_attribute *attr, char *buf)
1948{
593a9e7b 1949 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 1950
dfc5606d
YS
1951 return sprintf(buf, "%d\n", rbd_dev->major);
1952}
1953
1954static ssize_t rbd_client_id_show(struct device *dev,
1955 struct device_attribute *attr, char *buf)
602adf40 1956{
593a9e7b 1957 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1958
1dbb4399
AE
1959 return sprintf(buf, "client%lld\n",
1960 ceph_client_id(rbd_dev->rbd_client->client));
602adf40
YS
1961}
1962
dfc5606d
YS
1963static ssize_t rbd_pool_show(struct device *dev,
1964 struct device_attribute *attr, char *buf)
602adf40 1965{
593a9e7b 1966 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1967
0d7dbfce 1968 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
dfc5606d
YS
1969}
1970
9bb2f334
AE
1971static ssize_t rbd_pool_id_show(struct device *dev,
1972 struct device_attribute *attr, char *buf)
1973{
1974 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1975
0d7dbfce
AE
1976 return sprintf(buf, "%llu\n",
1977 (unsigned long long) rbd_dev->spec->pool_id);
9bb2f334
AE
1978}
1979
dfc5606d
YS
1980static ssize_t rbd_name_show(struct device *dev,
1981 struct device_attribute *attr, char *buf)
1982{
593a9e7b 1983 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1984
0d7dbfce 1985 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
dfc5606d
YS
1986}
1987
589d30e0
AE
1988static ssize_t rbd_image_id_show(struct device *dev,
1989 struct device_attribute *attr, char *buf)
1990{
1991 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1992
0d7dbfce 1993 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
589d30e0
AE
1994}
1995
34b13184
AE
1996/*
1997 * Shows the name of the currently-mapped snapshot (or
1998 * RBD_SNAP_HEAD_NAME for the base image).
1999 */
dfc5606d
YS
2000static ssize_t rbd_snap_show(struct device *dev,
2001 struct device_attribute *attr,
2002 char *buf)
2003{
593a9e7b 2004 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2005
0d7dbfce 2006 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
dfc5606d
YS
2007}
2008
2009static ssize_t rbd_image_refresh(struct device *dev,
2010 struct device_attribute *attr,
2011 const char *buf,
2012 size_t size)
2013{
593a9e7b 2014 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 2015 int ret;
602adf40 2016
117973fb 2017 ret = rbd_dev_refresh(rbd_dev, NULL);
b813623a
AE
2018
2019 return ret < 0 ? ret : size;
dfc5606d 2020}
602adf40 2021
dfc5606d 2022static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
34b13184 2023static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
dfc5606d
YS
2024static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2025static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2026static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
9bb2f334 2027static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
dfc5606d 2028static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
589d30e0 2029static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
dfc5606d
YS
2030static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2031static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
dfc5606d
YS
2032
2033static struct attribute *rbd_attrs[] = {
2034 &dev_attr_size.attr,
34b13184 2035 &dev_attr_features.attr,
dfc5606d
YS
2036 &dev_attr_major.attr,
2037 &dev_attr_client_id.attr,
2038 &dev_attr_pool.attr,
9bb2f334 2039 &dev_attr_pool_id.attr,
dfc5606d 2040 &dev_attr_name.attr,
589d30e0 2041 &dev_attr_image_id.attr,
dfc5606d
YS
2042 &dev_attr_current_snap.attr,
2043 &dev_attr_refresh.attr,
dfc5606d
YS
2044 NULL
2045};
2046
2047static struct attribute_group rbd_attr_group = {
2048 .attrs = rbd_attrs,
2049};
2050
2051static const struct attribute_group *rbd_attr_groups[] = {
2052 &rbd_attr_group,
2053 NULL
2054};
2055
2056static void rbd_sysfs_dev_release(struct device *dev)
2057{
2058}
2059
2060static struct device_type rbd_device_type = {
2061 .name = "rbd",
2062 .groups = rbd_attr_groups,
2063 .release = rbd_sysfs_dev_release,
2064};
2065
2066
2067/*
2068 sysfs - snapshots
2069*/
2070
2071static ssize_t rbd_snap_size_show(struct device *dev,
2072 struct device_attribute *attr,
2073 char *buf)
2074{
2075 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2076
3591538f 2077 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
dfc5606d
YS
2078}
2079
2080static ssize_t rbd_snap_id_show(struct device *dev,
2081 struct device_attribute *attr,
2082 char *buf)
2083{
2084 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2085
3591538f 2086 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
dfc5606d
YS
2087}
2088
34b13184
AE
2089static ssize_t rbd_snap_features_show(struct device *dev,
2090 struct device_attribute *attr,
2091 char *buf)
2092{
2093 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2094
2095 return sprintf(buf, "0x%016llx\n",
2096 (unsigned long long) snap->features);
2097}
2098
dfc5606d
YS
2099static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2100static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
34b13184 2101static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
dfc5606d
YS
2102
2103static struct attribute *rbd_snap_attrs[] = {
2104 &dev_attr_snap_size.attr,
2105 &dev_attr_snap_id.attr,
34b13184 2106 &dev_attr_snap_features.attr,
dfc5606d
YS
2107 NULL,
2108};
2109
2110static struct attribute_group rbd_snap_attr_group = {
2111 .attrs = rbd_snap_attrs,
2112};
2113
2114static void rbd_snap_dev_release(struct device *dev)
2115{
2116 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2117 kfree(snap->name);
2118 kfree(snap);
2119}
2120
2121static const struct attribute_group *rbd_snap_attr_groups[] = {
2122 &rbd_snap_attr_group,
2123 NULL
2124};
2125
2126static struct device_type rbd_snap_device_type = {
2127 .groups = rbd_snap_attr_groups,
2128 .release = rbd_snap_dev_release,
2129};
2130
8b8fb99c
AE
2131static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2132{
2133 kref_get(&spec->kref);
2134
2135 return spec;
2136}
2137
2138static void rbd_spec_free(struct kref *kref);
2139static void rbd_spec_put(struct rbd_spec *spec)
2140{
2141 if (spec)
2142 kref_put(&spec->kref, rbd_spec_free);
2143}
2144
2145static struct rbd_spec *rbd_spec_alloc(void)
2146{
2147 struct rbd_spec *spec;
2148
2149 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2150 if (!spec)
2151 return NULL;
2152 kref_init(&spec->kref);
2153
2154 rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */
2155
2156 return spec;
2157}
2158
2159static void rbd_spec_free(struct kref *kref)
2160{
2161 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2162
2163 kfree(spec->pool_name);
2164 kfree(spec->image_id);
2165 kfree(spec->image_name);
2166 kfree(spec->snap_name);
2167 kfree(spec);
2168}
2169
c53d5893
AE
2170struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2171 struct rbd_spec *spec)
2172{
2173 struct rbd_device *rbd_dev;
2174
2175 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2176 if (!rbd_dev)
2177 return NULL;
2178
2179 spin_lock_init(&rbd_dev->lock);
2180 INIT_LIST_HEAD(&rbd_dev->node);
2181 INIT_LIST_HEAD(&rbd_dev->snaps);
2182 init_rwsem(&rbd_dev->header_rwsem);
2183
2184 rbd_dev->spec = spec;
2185 rbd_dev->rbd_client = rbdc;
2186
2187 return rbd_dev;
2188}
2189
2190static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2191{
2192 kfree(rbd_dev->header_name);
2193 rbd_put_client(rbd_dev->rbd_client);
2194 rbd_spec_put(rbd_dev->spec);
2195 kfree(rbd_dev);
2196}
2197
304f6808
AE
2198static bool rbd_snap_registered(struct rbd_snap *snap)
2199{
2200 bool ret = snap->dev.type == &rbd_snap_device_type;
2201 bool reg = device_is_registered(&snap->dev);
2202
2203 rbd_assert(!ret ^ reg);
2204
2205 return ret;
2206}
2207
41f38c2b 2208static void rbd_remove_snap_dev(struct rbd_snap *snap)
dfc5606d
YS
2209{
2210 list_del(&snap->node);
304f6808
AE
2211 if (device_is_registered(&snap->dev))
2212 device_unregister(&snap->dev);
dfc5606d
YS
2213}
2214
14e7085d 2215static int rbd_register_snap_dev(struct rbd_snap *snap,
dfc5606d
YS
2216 struct device *parent)
2217{
2218 struct device *dev = &snap->dev;
2219 int ret;
2220
2221 dev->type = &rbd_snap_device_type;
2222 dev->parent = parent;
2223 dev->release = rbd_snap_dev_release;
d4b125e9 2224 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
304f6808
AE
2225 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2226
dfc5606d
YS
2227 ret = device_register(dev);
2228
2229 return ret;
2230}
2231
4e891e0a 2232static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
c8d18425 2233 const char *snap_name,
34b13184
AE
2234 u64 snap_id, u64 snap_size,
2235 u64 snap_features)
dfc5606d 2236{
4e891e0a 2237 struct rbd_snap *snap;
dfc5606d 2238 int ret;
4e891e0a
AE
2239
2240 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
dfc5606d 2241 if (!snap)
4e891e0a
AE
2242 return ERR_PTR(-ENOMEM);
2243
2244 ret = -ENOMEM;
c8d18425 2245 snap->name = kstrdup(snap_name, GFP_KERNEL);
4e891e0a
AE
2246 if (!snap->name)
2247 goto err;
2248
c8d18425
AE
2249 snap->id = snap_id;
2250 snap->size = snap_size;
34b13184 2251 snap->features = snap_features;
4e891e0a
AE
2252
2253 return snap;
2254
dfc5606d
YS
2255err:
2256 kfree(snap->name);
2257 kfree(snap);
4e891e0a
AE
2258
2259 return ERR_PTR(ret);
dfc5606d
YS
2260}
2261
cd892126
AE
2262static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2263 u64 *snap_size, u64 *snap_features)
2264{
2265 char *snap_name;
2266
2267 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2268
2269 *snap_size = rbd_dev->header.snap_sizes[which];
2270 *snap_features = 0; /* No features for v1 */
2271
2272 /* Skip over names until we find the one we are looking for */
2273
2274 snap_name = rbd_dev->header.snap_names;
2275 while (which--)
2276 snap_name += strlen(snap_name) + 1;
2277
2278 return snap_name;
2279}
2280
9d475de5
AE
2281/*
2282 * Get the size and object order for an image snapshot, or if
2283 * snap_id is CEPH_NOSNAP, gets this information for the base
2284 * image.
2285 */
2286static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2287 u8 *order, u64 *snap_size)
2288{
2289 __le64 snapid = cpu_to_le64(snap_id);
2290 int ret;
2291 struct {
2292 u8 order;
2293 __le64 size;
2294 } __attribute__ ((packed)) size_buf = { 0 };
2295
2296 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2297 "rbd", "get_size",
2298 (char *) &snapid, sizeof (snapid),
2299 (char *) &size_buf, sizeof (size_buf),
2300 CEPH_OSD_FLAG_READ, NULL);
2301 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2302 if (ret < 0)
2303 return ret;
2304
2305 *order = size_buf.order;
2306 *snap_size = le64_to_cpu(size_buf.size);
2307
2308 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
2309 (unsigned long long) snap_id, (unsigned int) *order,
2310 (unsigned long long) *snap_size);
2311
2312 return 0;
2313}
2314
2315static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2316{
2317 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2318 &rbd_dev->header.obj_order,
2319 &rbd_dev->header.image_size);
2320}
2321
1e130199
AE
2322static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2323{
2324 void *reply_buf;
2325 int ret;
2326 void *p;
2327
2328 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
2329 if (!reply_buf)
2330 return -ENOMEM;
2331
2332 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2333 "rbd", "get_object_prefix",
2334 NULL, 0,
2335 reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
2336 CEPH_OSD_FLAG_READ, NULL);
2337 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2338 if (ret < 0)
2339 goto out;
a0ea3a40 2340 ret = 0; /* rbd_req_sync_exec() can return positive */
1e130199
AE
2341
2342 p = reply_buf;
2343 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2344 p + RBD_OBJ_PREFIX_LEN_MAX,
2345 NULL, GFP_NOIO);
2346
2347 if (IS_ERR(rbd_dev->header.object_prefix)) {
2348 ret = PTR_ERR(rbd_dev->header.object_prefix);
2349 rbd_dev->header.object_prefix = NULL;
2350 } else {
2351 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
2352 }
2353
2354out:
2355 kfree(reply_buf);
2356
2357 return ret;
2358}
2359
b1b5402a
AE
2360static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2361 u64 *snap_features)
2362{
2363 __le64 snapid = cpu_to_le64(snap_id);
2364 struct {
2365 __le64 features;
2366 __le64 incompat;
2367 } features_buf = { 0 };
d889140c 2368 u64 incompat;
b1b5402a
AE
2369 int ret;
2370
2371 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2372 "rbd", "get_features",
2373 (char *) &snapid, sizeof (snapid),
2374 (char *) &features_buf, sizeof (features_buf),
2375 CEPH_OSD_FLAG_READ, NULL);
2376 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2377 if (ret < 0)
2378 return ret;
d889140c
AE
2379
2380 incompat = le64_to_cpu(features_buf.incompat);
2381 if (incompat & ~RBD_FEATURES_ALL)
2382 return -ENOTSUPP;
2383
b1b5402a
AE
2384 *snap_features = le64_to_cpu(features_buf.features);
2385
2386 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2387 (unsigned long long) snap_id,
2388 (unsigned long long) *snap_features,
2389 (unsigned long long) le64_to_cpu(features_buf.incompat));
2390
2391 return 0;
2392}
2393
2394static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2395{
2396 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2397 &rbd_dev->header.features);
2398}
2399
6e14b1a6 2400static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
35d489f9
AE
2401{
2402 size_t size;
2403 int ret;
2404 void *reply_buf;
2405 void *p;
2406 void *end;
2407 u64 seq;
2408 u32 snap_count;
2409 struct ceph_snap_context *snapc;
2410 u32 i;
2411
2412 /*
2413 * We'll need room for the seq value (maximum snapshot id),
2414 * snapshot count, and array of that many snapshot ids.
2415 * For now we have a fixed upper limit on the number we're
2416 * prepared to receive.
2417 */
2418 size = sizeof (__le64) + sizeof (__le32) +
2419 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2420 reply_buf = kzalloc(size, GFP_KERNEL);
2421 if (!reply_buf)
2422 return -ENOMEM;
2423
2424 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2425 "rbd", "get_snapcontext",
2426 NULL, 0,
2427 reply_buf, size,
6e14b1a6 2428 CEPH_OSD_FLAG_READ, ver);
35d489f9
AE
2429 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2430 if (ret < 0)
2431 goto out;
2432
2433 ret = -ERANGE;
2434 p = reply_buf;
2435 end = (char *) reply_buf + size;
2436 ceph_decode_64_safe(&p, end, seq, out);
2437 ceph_decode_32_safe(&p, end, snap_count, out);
2438
2439 /*
2440 * Make sure the reported number of snapshot ids wouldn't go
2441 * beyond the end of our buffer. But before checking that,
2442 * make sure the computed size of the snapshot context we
2443 * allocate is representable in a size_t.
2444 */
2445 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2446 / sizeof (u64)) {
2447 ret = -EINVAL;
2448 goto out;
2449 }
2450 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2451 goto out;
2452
2453 size = sizeof (struct ceph_snap_context) +
2454 snap_count * sizeof (snapc->snaps[0]);
2455 snapc = kmalloc(size, GFP_KERNEL);
2456 if (!snapc) {
2457 ret = -ENOMEM;
2458 goto out;
2459 }
2460
2461 atomic_set(&snapc->nref, 1);
2462 snapc->seq = seq;
2463 snapc->num_snaps = snap_count;
2464 for (i = 0; i < snap_count; i++)
2465 snapc->snaps[i] = ceph_decode_64(&p);
2466
2467 rbd_dev->header.snapc = snapc;
2468
2469 dout(" snap context seq = %llu, snap_count = %u\n",
2470 (unsigned long long) seq, (unsigned int) snap_count);
2471
2472out:
2473 kfree(reply_buf);
2474
2475 return 0;
2476}
2477
b8b1e2db
AE
2478static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2479{
2480 size_t size;
2481 void *reply_buf;
2482 __le64 snap_id;
2483 int ret;
2484 void *p;
2485 void *end;
b8b1e2db
AE
2486 char *snap_name;
2487
2488 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
2489 reply_buf = kmalloc(size, GFP_KERNEL);
2490 if (!reply_buf)
2491 return ERR_PTR(-ENOMEM);
2492
2493 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2494 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2495 "rbd", "get_snapshot_name",
2496 (char *) &snap_id, sizeof (snap_id),
2497 reply_buf, size,
2498 CEPH_OSD_FLAG_READ, NULL);
2499 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2500 if (ret < 0)
2501 goto out;
2502
2503 p = reply_buf;
2504 end = (char *) reply_buf + size;
e5c35534 2505 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
b8b1e2db
AE
2506 if (IS_ERR(snap_name)) {
2507 ret = PTR_ERR(snap_name);
2508 goto out;
2509 } else {
2510 dout(" snap_id 0x%016llx snap_name = %s\n",
2511 (unsigned long long) le64_to_cpu(snap_id), snap_name);
2512 }
2513 kfree(reply_buf);
2514
2515 return snap_name;
2516out:
2517 kfree(reply_buf);
2518
2519 return ERR_PTR(ret);
2520}
2521
2522static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2523 u64 *snap_size, u64 *snap_features)
2524{
2525 __le64 snap_id;
2526 u8 order;
2527 int ret;
2528
2529 snap_id = rbd_dev->header.snapc->snaps[which];
2530 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2531 if (ret)
2532 return ERR_PTR(ret);
2533 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2534 if (ret)
2535 return ERR_PTR(ret);
2536
2537 return rbd_dev_v2_snap_name(rbd_dev, which);
2538}
2539
2540static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2541 u64 *snap_size, u64 *snap_features)
2542{
2543 if (rbd_dev->image_format == 1)
2544 return rbd_dev_v1_snap_info(rbd_dev, which,
2545 snap_size, snap_features);
2546 if (rbd_dev->image_format == 2)
2547 return rbd_dev_v2_snap_info(rbd_dev, which,
2548 snap_size, snap_features);
2549 return ERR_PTR(-EINVAL);
2550}
2551
117973fb
AE
2552static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2553{
2554 int ret;
2555 __u8 obj_order;
2556
2557 down_write(&rbd_dev->header_rwsem);
2558
2559 /* Grab old order first, to see if it changes */
2560
2561 obj_order = rbd_dev->header.obj_order,
2562 ret = rbd_dev_v2_image_size(rbd_dev);
2563 if (ret)
2564 goto out;
2565 if (rbd_dev->header.obj_order != obj_order) {
2566 ret = -EIO;
2567 goto out;
2568 }
2569 rbd_update_mapping_size(rbd_dev);
2570
2571 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2572 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2573 if (ret)
2574 goto out;
2575 ret = rbd_dev_snaps_update(rbd_dev);
2576 dout("rbd_dev_snaps_update returned %d\n", ret);
2577 if (ret)
2578 goto out;
2579 ret = rbd_dev_snaps_register(rbd_dev);
2580 dout("rbd_dev_snaps_register returned %d\n", ret);
2581out:
2582 up_write(&rbd_dev->header_rwsem);
2583
2584 return ret;
2585}
2586
dfc5606d 2587/*
35938150
AE
2588 * Scan the rbd device's current snapshot list and compare it to the
2589 * newly-received snapshot context. Remove any existing snapshots
2590 * not present in the new snapshot context. Add a new snapshot for
2591 * any snaphots in the snapshot context not in the current list.
2592 * And verify there are no changes to snapshots we already know
2593 * about.
2594 *
2595 * Assumes the snapshots in the snapshot context are sorted by
2596 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2597 * are also maintained in that order.)
dfc5606d 2598 */
304f6808 2599static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
dfc5606d 2600{
35938150
AE
2601 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2602 const u32 snap_count = snapc->num_snaps;
35938150
AE
2603 struct list_head *head = &rbd_dev->snaps;
2604 struct list_head *links = head->next;
2605 u32 index = 0;
dfc5606d 2606
9fcbb800 2607 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
35938150
AE
2608 while (index < snap_count || links != head) {
2609 u64 snap_id;
2610 struct rbd_snap *snap;
cd892126
AE
2611 char *snap_name;
2612 u64 snap_size = 0;
2613 u64 snap_features = 0;
dfc5606d 2614
35938150
AE
2615 snap_id = index < snap_count ? snapc->snaps[index]
2616 : CEPH_NOSNAP;
2617 snap = links != head ? list_entry(links, struct rbd_snap, node)
2618 : NULL;
aafb230e 2619 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
dfc5606d 2620
35938150
AE
2621 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2622 struct list_head *next = links->next;
dfc5606d 2623
35938150 2624 /* Existing snapshot not in the new snap context */
dfc5606d 2625
0d7dbfce 2626 if (rbd_dev->spec->snap_id == snap->id)
daba5fdb 2627 rbd_dev->exists = false;
41f38c2b 2628 rbd_remove_snap_dev(snap);
9fcbb800 2629 dout("%ssnap id %llu has been removed\n",
0d7dbfce
AE
2630 rbd_dev->spec->snap_id == snap->id ?
2631 "mapped " : "",
9fcbb800 2632 (unsigned long long) snap->id);
35938150
AE
2633
2634 /* Done with this list entry; advance */
2635
2636 links = next;
dfc5606d
YS
2637 continue;
2638 }
35938150 2639
b8b1e2db
AE
2640 snap_name = rbd_dev_snap_info(rbd_dev, index,
2641 &snap_size, &snap_features);
cd892126
AE
2642 if (IS_ERR(snap_name))
2643 return PTR_ERR(snap_name);
2644
9fcbb800
AE
2645 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2646 (unsigned long long) snap_id);
35938150
AE
2647 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2648 struct rbd_snap *new_snap;
2649
2650 /* We haven't seen this snapshot before */
2651
c8d18425 2652 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
cd892126 2653 snap_id, snap_size, snap_features);
9fcbb800
AE
2654 if (IS_ERR(new_snap)) {
2655 int err = PTR_ERR(new_snap);
2656
2657 dout(" failed to add dev, error %d\n", err);
2658
2659 return err;
2660 }
35938150
AE
2661
2662 /* New goes before existing, or at end of list */
2663
9fcbb800 2664 dout(" added dev%s\n", snap ? "" : " at end\n");
35938150
AE
2665 if (snap)
2666 list_add_tail(&new_snap->node, &snap->node);
2667 else
523f3258 2668 list_add_tail(&new_snap->node, head);
35938150
AE
2669 } else {
2670 /* Already have this one */
2671
9fcbb800
AE
2672 dout(" already present\n");
2673
cd892126 2674 rbd_assert(snap->size == snap_size);
aafb230e 2675 rbd_assert(!strcmp(snap->name, snap_name));
cd892126 2676 rbd_assert(snap->features == snap_features);
35938150
AE
2677
2678 /* Done with this list entry; advance */
2679
2680 links = links->next;
dfc5606d 2681 }
35938150
AE
2682
2683 /* Advance to the next entry in the snapshot context */
2684
2685 index++;
dfc5606d 2686 }
9fcbb800 2687 dout("%s: done\n", __func__);
dfc5606d
YS
2688
2689 return 0;
2690}
2691
304f6808
AE
2692/*
2693 * Scan the list of snapshots and register the devices for any that
2694 * have not already been registered.
2695 */
2696static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2697{
2698 struct rbd_snap *snap;
2699 int ret = 0;
2700
2701 dout("%s called\n", __func__);
86ff77bb
AE
2702 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2703 return -EIO;
304f6808
AE
2704
2705 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2706 if (!rbd_snap_registered(snap)) {
2707 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2708 if (ret < 0)
2709 break;
2710 }
2711 }
2712 dout("%s: returning %d\n", __func__, ret);
2713
2714 return ret;
2715}
2716
dfc5606d
YS
2717static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2718{
dfc5606d 2719 struct device *dev;
cd789ab9 2720 int ret;
dfc5606d
YS
2721
2722 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
dfc5606d 2723
cd789ab9 2724 dev = &rbd_dev->dev;
dfc5606d
YS
2725 dev->bus = &rbd_bus_type;
2726 dev->type = &rbd_device_type;
2727 dev->parent = &rbd_root_dev;
2728 dev->release = rbd_dev_release;
de71a297 2729 dev_set_name(dev, "%d", rbd_dev->dev_id);
dfc5606d 2730 ret = device_register(dev);
dfc5606d 2731
dfc5606d 2732 mutex_unlock(&ctl_mutex);
cd789ab9 2733
dfc5606d 2734 return ret;
602adf40
YS
2735}
2736
dfc5606d
YS
2737static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2738{
2739 device_unregister(&rbd_dev->dev);
2740}
2741
59c2be1e
YS
2742static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2743{
2744 int ret, rc;
2745
2746 do {
0e6f322d 2747 ret = rbd_req_sync_watch(rbd_dev);
59c2be1e 2748 if (ret == -ERANGE) {
117973fb 2749 rc = rbd_dev_refresh(rbd_dev, NULL);
59c2be1e
YS
2750 if (rc < 0)
2751 return rc;
2752 }
2753 } while (ret == -ERANGE);
2754
2755 return ret;
2756}
2757
e2839308 2758static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
1ddbe94e
AE
2759
2760/*
499afd5b
AE
2761 * Get a unique rbd identifier for the given new rbd_dev, and add
2762 * the rbd_dev to the global list. The minimum rbd id is 1.
1ddbe94e 2763 */
e2839308 2764static void rbd_dev_id_get(struct rbd_device *rbd_dev)
b7f23c36 2765{
e2839308 2766 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
499afd5b
AE
2767
2768 spin_lock(&rbd_dev_list_lock);
2769 list_add_tail(&rbd_dev->node, &rbd_dev_list);
2770 spin_unlock(&rbd_dev_list_lock);
e2839308
AE
2771 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
2772 (unsigned long long) rbd_dev->dev_id);
1ddbe94e 2773}
b7f23c36 2774
1ddbe94e 2775/*
499afd5b
AE
2776 * Remove an rbd_dev from the global list, and record that its
2777 * identifier is no longer in use.
1ddbe94e 2778 */
e2839308 2779static void rbd_dev_id_put(struct rbd_device *rbd_dev)
1ddbe94e 2780{
d184f6bf 2781 struct list_head *tmp;
de71a297 2782 int rbd_id = rbd_dev->dev_id;
d184f6bf
AE
2783 int max_id;
2784
aafb230e 2785 rbd_assert(rbd_id > 0);
499afd5b 2786
e2839308
AE
2787 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2788 (unsigned long long) rbd_dev->dev_id);
499afd5b
AE
2789 spin_lock(&rbd_dev_list_lock);
2790 list_del_init(&rbd_dev->node);
d184f6bf
AE
2791
2792 /*
2793 * If the id being "put" is not the current maximum, there
2794 * is nothing special we need to do.
2795 */
e2839308 2796 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
d184f6bf
AE
2797 spin_unlock(&rbd_dev_list_lock);
2798 return;
2799 }
2800
2801 /*
2802 * We need to update the current maximum id. Search the
2803 * list to find out what it is. We're more likely to find
2804 * the maximum at the end, so search the list backward.
2805 */
2806 max_id = 0;
2807 list_for_each_prev(tmp, &rbd_dev_list) {
2808 struct rbd_device *rbd_dev;
2809
2810 rbd_dev = list_entry(tmp, struct rbd_device, node);
b213e0b1
AE
2811 if (rbd_dev->dev_id > max_id)
2812 max_id = rbd_dev->dev_id;
d184f6bf 2813 }
499afd5b 2814 spin_unlock(&rbd_dev_list_lock);
b7f23c36 2815
1ddbe94e 2816 /*
e2839308 2817 * The max id could have been updated by rbd_dev_id_get(), in
d184f6bf
AE
2818 * which case it now accurately reflects the new maximum.
2819 * Be careful not to overwrite the maximum value in that
2820 * case.
1ddbe94e 2821 */
e2839308
AE
2822 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2823 dout(" max dev id has been reset\n");
b7f23c36
AE
2824}
2825
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* Find start of token */

	return strcspn(*buf, spaces);	/* Return token length */
}
2844
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2874
ea3352f4
AE
2875/*
2876 * Finds the next token in *buf, dynamically allocates a buffer big
2877 * enough to hold a copy of it, and copies the token into the new
2878 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2879 * that a duplicate buffer is created even for a zero-length token.
2880 *
2881 * Returns a pointer to the newly-allocated duplicate, or a null
2882 * pointer if memory for the duplicate was not available. If
2883 * the lenp argument is a non-null pointer, the length of the token
2884 * (not including the '\0') is returned in *lenp.
2885 *
2886 * If successful, the *buf pointer will be updated to point beyond
2887 * the end of the found token.
2888 *
2889 * Note: uses GFP_KERNEL for allocation.
2890 */
2891static inline char *dup_token(const char **buf, size_t *lenp)
2892{
2893 char *dup;
2894 size_t len;
2895
2896 len = next_token(buf);
2897 dup = kmalloc(len + 1, GFP_KERNEL);
2898 if (!dup)
2899 return NULL;
2900
2901 memcpy(dup, *buf, len);
2902 *(dup + len) = '\0';
2903 *buf += len;
2904
2905 if (lenp)
2906 *lenp = len;
2907
2908 return dup;
2909}
2910
a725f65e 2911/*
859c31df
AE
2912 * Parse the options provided for an "rbd add" (i.e., rbd image
2913 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
2914 * and the data written is passed here via a NUL-terminated buffer.
2915 * Returns 0 if successful or an error code otherwise.
d22f76e7 2916 *
859c31df
AE
2917 * The information extracted from these options is recorded in
2918 * the other parameters which return dynamically-allocated
2919 * structures:
2920 * ceph_opts
2921 * The address of a pointer that will refer to a ceph options
2922 * structure. Caller must release the returned pointer using
2923 * ceph_destroy_options() when it is no longer needed.
2924 * rbd_opts
2925 * Address of an rbd options pointer. Fully initialized by
2926 * this function; caller must release with kfree().
2927 * spec
2928 * Address of an rbd image specification pointer. Fully
2929 * initialized by this function based on parsed options.
2930 * Caller must release with rbd_spec_put().
2931 *
2932 * The options passed take this form:
2933 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
2934 * where:
2935 * <mon_addrs>
2936 * A comma-separated list of one or more monitor addresses.
2937 * A monitor address is an ip address, optionally followed
2938 * by a port number (separated by a colon).
2939 * I.e.: ip1[:port1][,ip2[:port2]...]
2940 * <options>
2941 * A comma-separated list of ceph and/or rbd options.
2942 * <pool_name>
2943 * The name of the rados pool containing the rbd image.
2944 * <image_name>
2945 * The name of the image in that pool to map.
2946 * <snap_id>
2947 * An optional snapshot id. If provided, the mapping will
2948 * present data from the image at the time that snapshot was
2949 * created. The image head is used if no snapshot id is
2950 * provided. Snapshot mappings are always read-only.
a725f65e 2951 */
859c31df 2952static int rbd_add_parse_args(const char *buf,
dc79b113 2953 struct ceph_options **ceph_opts,
859c31df
AE
2954 struct rbd_options **opts,
2955 struct rbd_spec **rbd_spec)
e28fff26 2956{
d22f76e7 2957 size_t len;
859c31df 2958 char *options;
0ddebc0c
AE
2959 const char *mon_addrs;
2960 size_t mon_addrs_size;
859c31df 2961 struct rbd_spec *spec = NULL;
4e9afeba 2962 struct rbd_options *rbd_opts = NULL;
859c31df 2963 struct ceph_options *copts;
dc79b113 2964 int ret;
e28fff26
AE
2965
2966 /* The first four tokens are required */
2967
7ef3214a
AE
2968 len = next_token(&buf);
2969 if (!len)
dc79b113 2970 return -EINVAL; /* Missing monitor address(es) */
0ddebc0c 2971 mon_addrs = buf;
f28e565a 2972 mon_addrs_size = len + 1;
7ef3214a 2973 buf += len;
a725f65e 2974
dc79b113 2975 ret = -EINVAL;
f28e565a
AE
2976 options = dup_token(&buf, NULL);
2977 if (!options)
dc79b113 2978 return -ENOMEM;
f28e565a
AE
2979 if (!*options)
2980 goto out_err; /* Missing options */
e28fff26 2981
859c31df
AE
2982 spec = rbd_spec_alloc();
2983 if (!spec)
f28e565a 2984 goto out_mem;
859c31df
AE
2985
2986 spec->pool_name = dup_token(&buf, NULL);
2987 if (!spec->pool_name)
2988 goto out_mem;
2989 if (!*spec->pool_name)
f28e565a 2990 goto out_err; /* Missing pool name */
e28fff26 2991
859c31df
AE
2992 spec->image_name = dup_token(&buf, &spec->image_name_len);
2993 if (!spec->image_name)
f28e565a 2994 goto out_mem;
859c31df 2995 if (!*spec->image_name)
f28e565a 2996 goto out_err; /* Missing image name */
d4b125e9 2997
f28e565a
AE
2998 /*
2999 * Snapshot name is optional; default is to use "-"
3000 * (indicating the head/no snapshot).
3001 */
3feeb894 3002 len = next_token(&buf);
820a5f3e 3003 if (!len) {
3feeb894
AE
3004 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
3005 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
f28e565a 3006 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
dc79b113 3007 ret = -ENAMETOOLONG;
f28e565a 3008 goto out_err;
849b4260 3009 }
859c31df
AE
3010 spec->snap_name = kmalloc(len + 1, GFP_KERNEL);
3011 if (!spec->snap_name)
f28e565a 3012 goto out_mem;
859c31df
AE
3013 memcpy(spec->snap_name, buf, len);
3014 *(spec->snap_name + len) = '\0';
e5c35534 3015
0ddebc0c 3016 /* Initialize all rbd options to the defaults */
e28fff26 3017
4e9afeba
AE
3018 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
3019 if (!rbd_opts)
3020 goto out_mem;
3021
3022 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
d22f76e7 3023
859c31df 3024 copts = ceph_parse_options(options, mon_addrs,
0ddebc0c 3025 mon_addrs + mon_addrs_size - 1,
4e9afeba 3026 parse_rbd_opts_token, rbd_opts);
859c31df
AE
3027 if (IS_ERR(copts)) {
3028 ret = PTR_ERR(copts);
dc79b113
AE
3029 goto out_err;
3030 }
859c31df
AE
3031 kfree(options);
3032
3033 *ceph_opts = copts;
4e9afeba 3034 *opts = rbd_opts;
859c31df 3035 *rbd_spec = spec;
0ddebc0c 3036
dc79b113 3037 return 0;
f28e565a 3038out_mem:
dc79b113 3039 ret = -ENOMEM;
d22f76e7 3040out_err:
859c31df
AE
3041 kfree(rbd_opts);
3042 rbd_spec_put(spec);
f28e565a 3043 kfree(options);
d22f76e7 3044
dc79b113 3045 return ret;
a725f65e
AE
3046}
3047
589d30e0
AE
3048/*
3049 * An rbd format 2 image has a unique identifier, distinct from the
3050 * name given to it by the user. Internally, that identifier is
3051 * what's used to specify the names of objects related to the image.
3052 *
3053 * A special "rbd id" object is used to map an rbd image name to its
3054 * id. If that object doesn't exist, then there is no v2 rbd image
3055 * with the supplied name.
3056 *
3057 * This function will record the given rbd_dev's image_id field if
3058 * it can be determined, and in that case will return 0. If any
3059 * errors occur a negative errno will be returned and the rbd_dev's
3060 * image_id field will be unchanged (and should be NULL).
3061 */
3062static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3063{
3064 int ret;
3065 size_t size;
3066 char *object_name;
3067 void *response;
3068 void *p;
3069
2c0d0a10
AE
3070 /*
3071 * When probing a parent image, the image id is already
3072 * known (and the image name likely is not). There's no
3073 * need to fetch the image id again in this case.
3074 */
3075 if (rbd_dev->spec->image_id)
3076 return 0;
3077
589d30e0
AE
3078 /*
3079 * First, see if the format 2 image id file exists, and if
3080 * so, get the image's persistent id from it.
3081 */
0d7dbfce 3082 size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len;
589d30e0
AE
3083 object_name = kmalloc(size, GFP_NOIO);
3084 if (!object_name)
3085 return -ENOMEM;
0d7dbfce 3086 sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
589d30e0
AE
3087 dout("rbd id object name is %s\n", object_name);
3088
3089 /* Response will be an encoded string, which includes a length */
3090
3091 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
3092 response = kzalloc(size, GFP_NOIO);
3093 if (!response) {
3094 ret = -ENOMEM;
3095 goto out;
3096 }
3097
3098 ret = rbd_req_sync_exec(rbd_dev, object_name,
3099 "rbd", "get_id",
3100 NULL, 0,
3101 response, RBD_IMAGE_ID_LEN_MAX,
3102 CEPH_OSD_FLAG_READ, NULL);
3103 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
3104 if (ret < 0)
3105 goto out;
a0ea3a40 3106 ret = 0; /* rbd_req_sync_exec() can return positive */
589d30e0
AE
3107
3108 p = response;
0d7dbfce 3109 rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
589d30e0 3110 p + RBD_IMAGE_ID_LEN_MAX,
0d7dbfce 3111 &rbd_dev->spec->image_id_len,
589d30e0 3112 GFP_NOIO);
0d7dbfce
AE
3113 if (IS_ERR(rbd_dev->spec->image_id)) {
3114 ret = PTR_ERR(rbd_dev->spec->image_id);
3115 rbd_dev->spec->image_id = NULL;
589d30e0 3116 } else {
0d7dbfce 3117 dout("image_id is %s\n", rbd_dev->spec->image_id);
589d30e0
AE
3118 }
3119out:
3120 kfree(response);
3121 kfree(object_name);
3122
3123 return ret;
3124}
3125
a30b71b9
AE
3126static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3127{
3128 int ret;
3129 size_t size;
3130
3131 /* Version 1 images have no id; empty string is used */
3132
0d7dbfce
AE
3133 rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
3134 if (!rbd_dev->spec->image_id)
a30b71b9 3135 return -ENOMEM;
0d7dbfce 3136 rbd_dev->spec->image_id_len = 0;
a30b71b9
AE
3137
3138 /* Record the header object name for this rbd image. */
3139
0d7dbfce 3140 size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX);
a30b71b9
AE
3141 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3142 if (!rbd_dev->header_name) {
3143 ret = -ENOMEM;
3144 goto out_err;
3145 }
0d7dbfce
AE
3146 sprintf(rbd_dev->header_name, "%s%s",
3147 rbd_dev->spec->image_name, RBD_SUFFIX);
a30b71b9
AE
3148
3149 /* Populate rbd image metadata */
3150
3151 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3152 if (ret < 0)
3153 goto out_err;
3154 rbd_dev->image_format = 1;
3155
3156 dout("discovered version 1 image, header name is %s\n",
3157 rbd_dev->header_name);
3158
3159 return 0;
3160
3161out_err:
3162 kfree(rbd_dev->header_name);
3163 rbd_dev->header_name = NULL;
0d7dbfce
AE
3164 kfree(rbd_dev->spec->image_id);
3165 rbd_dev->spec->image_id = NULL;
a30b71b9
AE
3166
3167 return ret;
3168}
3169
/*
 * Probe the device assuming it is a format 2 rbd image.  The image
 * id has already been filled in by the caller; this records the
 * header object name and fetches the image metadata (size/order,
 * object prefix, features, and snapshot context) from the osds.
 *
 * Returns 0 on success or a negative errno; on failure the header
 * name and object prefix allocated here are freed again.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len;
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
			RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3231
83a06263
AE
/*
 * Complete a successful probe: update the snapshot list, set up the
 * mapping, allocate a device id, register the block device and disk,
 * hook the device into sysfs, and finally announce the disk.
 *
 * On failure everything acquired here is unwound in reverse order
 * via the goto ladder below; once the device is on the bus, cleanup
 * becomes the sysfs code's responsibility instead.
 */
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	ret = register_blkdev(0, rbd_dev->name);
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;	/* register_blkdev(0, ...) returns the major */

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	ret = rbd_init_watch_dev(rbd_dev);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
3309
a30b71b9
AE
3310/*
3311 * Probe for the existence of the header object for the given rbd
3312 * device. For format 2 images this includes determining the image
3313 * id.
3314 */
3315static int rbd_dev_probe(struct rbd_device *rbd_dev)
3316{
3317 int ret;
3318
3319 /*
3320 * Get the id from the image id object. If it's not a
3321 * format 2 image, we'll get ENOENT back, and we'll assume
3322 * it's a format 1 image.
3323 */
3324 ret = rbd_dev_image_id(rbd_dev);
3325 if (ret)
3326 ret = rbd_dev_v1_probe(rbd_dev);
3327 else
3328 ret = rbd_dev_v2_probe(rbd_dev);
83a06263 3329 if (ret) {
a30b71b9
AE
3330 dout("probe failed, returning %d\n", ret);
3331
83a06263
AE
3332 return ret;
3333 }
3334
3335 ret = rbd_dev_probe_finish(rbd_dev);
3336 if (ret)
3337 rbd_header_free(&rbd_dev->header);
3338
a30b71b9
AE
3339 return ret;
3340}
3341
59c2be1e
YS
3342static ssize_t rbd_add(struct bus_type *bus,
3343 const char *buf,
3344 size_t count)
602adf40 3345{
cb8627c7 3346 struct rbd_device *rbd_dev = NULL;
dc79b113 3347 struct ceph_options *ceph_opts = NULL;
4e9afeba 3348 struct rbd_options *rbd_opts = NULL;
859c31df 3349 struct rbd_spec *spec = NULL;
9d3997fd 3350 struct rbd_client *rbdc;
27cc2594
AE
3351 struct ceph_osd_client *osdc;
3352 int rc = -ENOMEM;
602adf40
YS
3353
3354 if (!try_module_get(THIS_MODULE))
3355 return -ENODEV;
3356
602adf40 3357 /* parse add command */
859c31df 3358 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
dc79b113 3359 if (rc < 0)
bd4ba655 3360 goto err_out_module;
78cea76e 3361
9d3997fd
AE
3362 rbdc = rbd_get_client(ceph_opts);
3363 if (IS_ERR(rbdc)) {
3364 rc = PTR_ERR(rbdc);
0ddebc0c 3365 goto err_out_args;
9d3997fd 3366 }
c53d5893 3367 ceph_opts = NULL; /* rbd_dev client now owns this */
602adf40 3368
602adf40 3369 /* pick the pool */
9d3997fd 3370 osdc = &rbdc->client->osdc;
859c31df 3371 rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
602adf40
YS
3372 if (rc < 0)
3373 goto err_out_client;
859c31df
AE
3374 spec->pool_id = (u64) rc;
3375
c53d5893 3376 rbd_dev = rbd_dev_create(rbdc, spec);
bd4ba655
AE
3377 if (!rbd_dev)
3378 goto err_out_client;
c53d5893
AE
3379 rbdc = NULL; /* rbd_dev now owns this */
3380 spec = NULL; /* rbd_dev now owns this */
602adf40 3381
bd4ba655 3382 rbd_dev->mapping.read_only = rbd_opts->read_only;
c53d5893
AE
3383 kfree(rbd_opts);
3384 rbd_opts = NULL; /* done with this */
bd4ba655 3385
a30b71b9
AE
3386 rc = rbd_dev_probe(rbd_dev);
3387 if (rc < 0)
c53d5893 3388 goto err_out_rbd_dev;
05fd6f6f 3389
602adf40 3390 return count;
c53d5893
AE
3391err_out_rbd_dev:
3392 rbd_dev_destroy(rbd_dev);
bd4ba655 3393err_out_client:
9d3997fd 3394 rbd_put_client(rbdc);
0ddebc0c 3395err_out_args:
78cea76e
AE
3396 if (ceph_opts)
3397 ceph_destroy_options(ceph_opts);
4e9afeba 3398 kfree(rbd_opts);
859c31df 3399 rbd_spec_put(spec);
bd4ba655
AE
3400err_out_module:
3401 module_put(THIS_MODULE);
27cc2594 3402
602adf40 3403 dout("Error adding device %s\n", buf);
27cc2594
AE
3404
3405 return (ssize_t) rc;
602adf40
YS
3406}
3407
de71a297 3408static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
602adf40
YS
3409{
3410 struct list_head *tmp;
3411 struct rbd_device *rbd_dev;
3412
e124a82f 3413 spin_lock(&rbd_dev_list_lock);
602adf40
YS
3414 list_for_each(tmp, &rbd_dev_list) {
3415 rbd_dev = list_entry(tmp, struct rbd_device, node);
de71a297 3416 if (rbd_dev->dev_id == dev_id) {
e124a82f 3417 spin_unlock(&rbd_dev_list_lock);
602adf40 3418 return rbd_dev;
e124a82f 3419 }
602adf40 3420 }
e124a82f 3421 spin_unlock(&rbd_dev_list_lock);
602adf40
YS
3422 return NULL;
3423}
3424
dfc5606d 3425static void rbd_dev_release(struct device *dev)
602adf40 3426{
593a9e7b 3427 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 3428
1dbb4399
AE
3429 if (rbd_dev->watch_request) {
3430 struct ceph_client *client = rbd_dev->rbd_client->client;
3431
3432 ceph_osdc_unregister_linger_request(&client->osdc,
59c2be1e 3433 rbd_dev->watch_request);
1dbb4399 3434 }
59c2be1e 3435 if (rbd_dev->watch_event)
070c633f 3436 rbd_req_sync_unwatch(rbd_dev);
59c2be1e 3437
602adf40
YS
3438
3439 /* clean up and free blkdev */
3440 rbd_free_disk(rbd_dev);
3441 unregister_blkdev(rbd_dev->major, rbd_dev->name);
32eec68d 3442
2ac4e75d
AE
3443 /* release allocated disk header fields */
3444 rbd_header_free(&rbd_dev->header);
3445
32eec68d 3446 /* done with the id, and with the rbd_dev */
e2839308 3447 rbd_dev_id_put(rbd_dev);
c53d5893
AE
3448 rbd_assert(rbd_dev->rbd_client != NULL);
3449 rbd_dev_destroy(rbd_dev);
602adf40
YS
3450
3451 /* release module ref */
3452 module_put(THIS_MODULE);
602adf40
YS
3453}
3454
dfc5606d
YS
/*
 * Handle a "remove" request written to /sys/bus/rbd/remove.  The
 * buffer holds the decimal id of the device to tear down.  Returns
 * count on success, or a negative errno (-ENOENT if no device with
 * that id exists).
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;

	/* NOTE(review): strict_strtoul() is deprecated in later
	 * kernels in favor of kstrtoul() — confirm against the
	 * target tree before switching. */
	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	rbd_remove_all_snaps(rbd_dev);
	/* remaining teardown is driven by the sysfs release path */
	rbd_bus_del_dev(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);

	return ret;
}
3489
602adf40
YS
3490/*
3491 * create control files in sysfs
dfc5606d 3492 * /sys/bus/rbd/...
602adf40
YS
3493 */
3494static int rbd_sysfs_init(void)
3495{
dfc5606d 3496 int ret;
602adf40 3497
fed4c143 3498 ret = device_register(&rbd_root_dev);
21079786 3499 if (ret < 0)
dfc5606d 3500 return ret;
602adf40 3501
fed4c143
AE
3502 ret = bus_register(&rbd_bus_type);
3503 if (ret < 0)
3504 device_unregister(&rbd_root_dev);
602adf40 3505
602adf40
YS
3506 return ret;
3507}
3508
/*
 * Remove the sysfs control files, undoing rbd_sysfs_init() in
 * reverse order.
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3514
3515int __init rbd_init(void)
3516{
3517 int rc;
3518
3519 rc = rbd_sysfs_init();
3520 if (rc)
3521 return rc;
f0f8cef5 3522 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
602adf40
YS
3523 return 0;
3524}
3525
/* Module exit: tear down the sysfs bus and root device. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3530
3531module_init(rbd_init);
3532module_exit(rbd_exit);
3533
3534MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
3535MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
3536MODULE_DESCRIPTION("rados block device");
3537
3538/* following authorship retained from original osdblk.c */
3539MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
3540
3541MODULE_LICENSE("GPL");