rbd: increase maximum snapshot name length
[linux-2.6-block.git] / drivers / block / rbd.c
CommitLineData
602adf40
YS
1/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
dfc5606d 24 For usage instructions, please refer to:
602adf40 25
dfc5606d 26 Documentation/ABI/testing/sysfs-bus-rbd
602adf40
YS
27
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
59c2be1e 34#include <linux/parser.h>
602adf40
YS
35
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
aafb230e
AE
44#define RBD_DEBUG /* Activate rbd_assert() calls */
45
593a9e7b
AE
46/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
df111be6
AE
55/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
f0f8cef5
AE
59#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
602adf40
YS
61
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
d4b125e9
AE
64#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
65#define RBD_MAX_SNAP_NAME_LEN \
66 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
67
35d489f9 68#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
602adf40
YS
69#define RBD_MAX_OPT_LEN 1024
70
71#define RBD_SNAP_HEAD_NAME "-"
72
1e130199
AE
73#define RBD_IMAGE_ID_LEN_MAX 64
74#define RBD_OBJ_PREFIX_LEN_MAX 64
589d30e0 75
d889140c
AE
76/* Feature bits */
77
78#define RBD_FEATURE_LAYERING 1
79
80/* Features supported by this (client software) implementation. */
81
82#define RBD_FEATURES_ALL (0)
83
81a89793
AE
84/*
85 * An RBD device name will be "rbd#", where the "rbd" comes from
86 * RBD_DRV_NAME above, and # is a unique integer identifier.
87 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
88 * enough to hold all possible device names.
89 */
602adf40 90#define DEV_NAME_LEN 32
81a89793 91#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
602adf40 92
cc0538b6 93#define RBD_READ_ONLY_DEFAULT false
59c2be1e 94
602adf40
YS
95/*
96 * block device image metadata (in-memory version)
97 */
98struct rbd_image_header {
f84344f3 99 /* These four fields never change for a given rbd image */
849b4260 100 char *object_prefix;
34b13184 101 u64 features;
602adf40
YS
102 __u8 obj_order;
103 __u8 crypt_type;
104 __u8 comp_type;
602adf40 105
f84344f3
AE
106 /* The remaining fields need to be updated occasionally */
107 u64 image_size;
108 struct ceph_snap_context *snapc;
602adf40
YS
109 char *snap_names;
110 u64 *snap_sizes;
59c2be1e
YS
111
112 u64 obj_version;
113};
114
115struct rbd_options {
cc0538b6 116 bool read_only;
602adf40
YS
117};
118
119/*
f0f8cef5 120 * an instance of the client. multiple devices may share an rbd client.
602adf40
YS
121 */
122struct rbd_client {
123 struct ceph_client *client;
124 struct kref kref;
125 struct list_head node;
126};
127
128/*
f0f8cef5 129 * a request completion status
602adf40 130 */
1fec7093
YS
131struct rbd_req_status {
132 int done;
133 int rc;
134 u64 bytes;
135};
136
137/*
138 * a collection of requests
139 */
140struct rbd_req_coll {
141 int total;
142 int num_done;
143 struct kref kref;
144 struct rbd_req_status status[0];
602adf40
YS
145};
146
f0f8cef5
AE
147/*
148 * a single io request
149 */
150struct rbd_request {
151 struct request *rq; /* blk layer request */
152 struct bio *bio; /* cloned bio */
153 struct page **pages; /* list of used pages */
154 u64 len;
155 int coll_index;
156 struct rbd_req_coll *coll;
157};
158
dfc5606d
YS
159struct rbd_snap {
160 struct device dev;
161 const char *name;
3591538f 162 u64 size;
dfc5606d
YS
163 struct list_head node;
164 u64 id;
34b13184 165 u64 features;
dfc5606d
YS
166};
167
f84344f3
AE
168struct rbd_mapping {
169 char *snap_name;
170 u64 snap_id;
99c1f08f 171 u64 size;
34b13184 172 u64 features;
f84344f3
AE
173 bool snap_exists;
174 bool read_only;
175};
176
602adf40
YS
177/*
178 * a single device
179 */
180struct rbd_device {
de71a297 181 int dev_id; /* blkdev unique id */
602adf40
YS
182
183 int major; /* blkdev assigned major */
184 struct gendisk *disk; /* blkdev's gendisk and rq */
602adf40 185
a30b71b9 186 u32 image_format; /* Either 1 or 2 */
f8c38929 187 struct rbd_options rbd_opts;
602adf40
YS
188 struct rbd_client *rbd_client;
189
190 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
191
192 spinlock_t lock; /* queue lock */
193
194 struct rbd_image_header header;
589d30e0
AE
195 char *image_id;
196 size_t image_id_len;
0bed54dc
AE
197 char *image_name;
198 size_t image_name_len;
199 char *header_name;
d22f76e7 200 char *pool_name;
9bb2f334 201 int pool_id;
602adf40 202
59c2be1e
YS
203 struct ceph_osd_event *watch_event;
204 struct ceph_osd_request *watch_request;
205
c666601a
JD
206 /* protects updating the header */
207 struct rw_semaphore header_rwsem;
f84344f3
AE
208
209 struct rbd_mapping mapping;
602adf40
YS
210
211 struct list_head node;
dfc5606d
YS
212
213 /* list of snapshots */
214 struct list_head snaps;
215
216 /* sysfs related */
217 struct device dev;
218};
219
602adf40 220static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
e124a82f 221
602adf40 222static LIST_HEAD(rbd_dev_list); /* devices */
e124a82f
AE
223static DEFINE_SPINLOCK(rbd_dev_list_lock);
224
432b8587
AE
225static LIST_HEAD(rbd_client_list); /* clients */
226static DEFINE_SPINLOCK(rbd_client_list_lock);
602adf40 227
304f6808
AE
228static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
229static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
230
dfc5606d 231static void rbd_dev_release(struct device *dev);
14e7085d 232static void __rbd_remove_snap_dev(struct rbd_snap *snap);
dfc5606d 233
f0f8cef5
AE
234static ssize_t rbd_add(struct bus_type *bus, const char *buf,
235 size_t count);
236static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
237 size_t count);
238
239static struct bus_attribute rbd_bus_attrs[] = {
240 __ATTR(add, S_IWUSR, NULL, rbd_add),
241 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
242 __ATTR_NULL
243};
244
245static struct bus_type rbd_bus_type = {
246 .name = "rbd",
247 .bus_attrs = rbd_bus_attrs,
248};
249
250static void rbd_root_dev_release(struct device *dev)
251{
252}
253
254static struct device rbd_root_dev = {
255 .init_name = "rbd",
256 .release = rbd_root_dev_release,
257};
258
aafb230e
AE
259#ifdef RBD_DEBUG
260#define rbd_assert(expr) \
261 if (unlikely(!(expr))) { \
262 printk(KERN_ERR "\nAssertion failure in %s() " \
263 "at line %d:\n\n" \
264 "\trbd_assert(%s);\n\n", \
265 __func__, __LINE__, #expr); \
266 BUG(); \
267 }
268#else /* !RBD_DEBUG */
269# define rbd_assert(expr) ((void) 0)
270#endif /* !RBD_DEBUG */
dfc5606d 271
dfc5606d
YS
272static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
273{
274 return get_device(&rbd_dev->dev);
275}
276
277static void rbd_put_dev(struct rbd_device *rbd_dev)
278{
279 put_device(&rbd_dev->dev);
280}
602adf40 281
117973fb
AE
282static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
283static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
59c2be1e 284
602adf40
YS
285static int rbd_open(struct block_device *bdev, fmode_t mode)
286{
f0f8cef5 287 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
602adf40 288
f84344f3 289 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
602adf40
YS
290 return -EROFS;
291
340c7a2b 292 rbd_get_dev(rbd_dev);
f84344f3 293 set_device_ro(bdev, rbd_dev->mapping.read_only);
340c7a2b 294
602adf40
YS
295 return 0;
296}
297
dfc5606d
YS
298static int rbd_release(struct gendisk *disk, fmode_t mode)
299{
300 struct rbd_device *rbd_dev = disk->private_data;
301
302 rbd_put_dev(rbd_dev);
303
304 return 0;
305}
306
602adf40
YS
307static const struct block_device_operations rbd_bd_ops = {
308 .owner = THIS_MODULE,
309 .open = rbd_open,
dfc5606d 310 .release = rbd_release,
602adf40
YS
311};
312
313/*
314 * Initialize an rbd client instance.
43ae4701 315 * We own *ceph_opts.
602adf40 316 */
f8c38929 317static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
602adf40
YS
318{
319 struct rbd_client *rbdc;
320 int ret = -ENOMEM;
321
322 dout("rbd_client_create\n");
323 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
324 if (!rbdc)
325 goto out_opt;
326
327 kref_init(&rbdc->kref);
328 INIT_LIST_HEAD(&rbdc->node);
329
bc534d86
AE
330 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
331
43ae4701 332 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
602adf40 333 if (IS_ERR(rbdc->client))
bc534d86 334 goto out_mutex;
43ae4701 335 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
602adf40
YS
336
337 ret = ceph_open_session(rbdc->client);
338 if (ret < 0)
339 goto out_err;
340
432b8587 341 spin_lock(&rbd_client_list_lock);
602adf40 342 list_add_tail(&rbdc->node, &rbd_client_list);
432b8587 343 spin_unlock(&rbd_client_list_lock);
602adf40 344
bc534d86
AE
345 mutex_unlock(&ctl_mutex);
346
602adf40
YS
347 dout("rbd_client_create created %p\n", rbdc);
348 return rbdc;
349
350out_err:
351 ceph_destroy_client(rbdc->client);
bc534d86
AE
352out_mutex:
353 mutex_unlock(&ctl_mutex);
602adf40
YS
354 kfree(rbdc);
355out_opt:
43ae4701
AE
356 if (ceph_opts)
357 ceph_destroy_options(ceph_opts);
28f259b7 358 return ERR_PTR(ret);
602adf40
YS
359}
360
361/*
1f7ba331
AE
362 * Find a ceph client with specific addr and configuration. If
363 * found, bump its reference count.
602adf40 364 */
1f7ba331 365static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
602adf40
YS
366{
367 struct rbd_client *client_node;
1f7ba331 368 bool found = false;
602adf40 369
43ae4701 370 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
602adf40
YS
371 return NULL;
372
1f7ba331
AE
373 spin_lock(&rbd_client_list_lock);
374 list_for_each_entry(client_node, &rbd_client_list, node) {
375 if (!ceph_compare_options(ceph_opts, client_node->client)) {
376 kref_get(&client_node->kref);
377 found = true;
378 break;
379 }
380 }
381 spin_unlock(&rbd_client_list_lock);
382
383 return found ? client_node : NULL;
602adf40
YS
384}
385
59c2be1e
YS
386/*
387 * mount options
388 */
389enum {
59c2be1e
YS
390 Opt_last_int,
391 /* int args above */
392 Opt_last_string,
393 /* string args above */
cc0538b6
AE
394 Opt_read_only,
395 Opt_read_write,
396 /* Boolean args above */
397 Opt_last_bool,
59c2be1e
YS
398};
399
43ae4701 400static match_table_t rbd_opts_tokens = {
59c2be1e
YS
401 /* int args above */
402 /* string args above */
be466c1c 403 {Opt_read_only, "read_only"},
cc0538b6
AE
404 {Opt_read_only, "ro"}, /* Alternate spelling */
405 {Opt_read_write, "read_write"},
406 {Opt_read_write, "rw"}, /* Alternate spelling */
407 /* Boolean args above */
59c2be1e
YS
408 {-1, NULL}
409};
410
411static int parse_rbd_opts_token(char *c, void *private)
412{
43ae4701 413 struct rbd_options *rbd_opts = private;
59c2be1e
YS
414 substring_t argstr[MAX_OPT_ARGS];
415 int token, intval, ret;
416
43ae4701 417 token = match_token(c, rbd_opts_tokens, argstr);
59c2be1e
YS
418 if (token < 0)
419 return -EINVAL;
420
421 if (token < Opt_last_int) {
422 ret = match_int(&argstr[0], &intval);
423 if (ret < 0) {
424 pr_err("bad mount option arg (not int) "
425 "at '%s'\n", c);
426 return ret;
427 }
428 dout("got int token %d val %d\n", token, intval);
429 } else if (token > Opt_last_int && token < Opt_last_string) {
430 dout("got string token %d val %s\n", token,
431 argstr[0].from);
cc0538b6
AE
432 } else if (token > Opt_last_string && token < Opt_last_bool) {
433 dout("got Boolean token %d\n", token);
59c2be1e
YS
434 } else {
435 dout("got token %d\n", token);
436 }
437
438 switch (token) {
cc0538b6
AE
439 case Opt_read_only:
440 rbd_opts->read_only = true;
441 break;
442 case Opt_read_write:
443 rbd_opts->read_only = false;
444 break;
59c2be1e 445 default:
aafb230e
AE
446 rbd_assert(false);
447 break;
59c2be1e
YS
448 }
449 return 0;
450}
451
602adf40
YS
452/*
453 * Get a ceph client with specific addr and configuration, if one does
454 * not exist create it.
455 */
f8c38929
AE
456static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
457 size_t mon_addr_len, char *options)
602adf40 458{
f8c38929 459 struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
43ae4701 460 struct ceph_options *ceph_opts;
f8c38929 461 struct rbd_client *rbdc;
59c2be1e 462
cc0538b6 463 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
602adf40 464
43ae4701
AE
465 ceph_opts = ceph_parse_options(options, mon_addr,
466 mon_addr + mon_addr_len,
467 parse_rbd_opts_token, rbd_opts);
f8c38929
AE
468 if (IS_ERR(ceph_opts))
469 return PTR_ERR(ceph_opts);
602adf40 470
1f7ba331 471 rbdc = rbd_client_find(ceph_opts);
602adf40 472 if (rbdc) {
602adf40 473 /* using an existing client */
43ae4701 474 ceph_destroy_options(ceph_opts);
f8c38929
AE
475 } else {
476 rbdc = rbd_client_create(ceph_opts);
477 if (IS_ERR(rbdc))
478 return PTR_ERR(rbdc);
602adf40 479 }
f8c38929 480 rbd_dev->rbd_client = rbdc;
602adf40 481
f8c38929 482 return 0;
602adf40
YS
483}
484
485/*
486 * Destroy ceph client
d23a4b3f 487 *
432b8587 488 * Caller must hold rbd_client_list_lock.
602adf40
YS
489 */
490static void rbd_client_release(struct kref *kref)
491{
492 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
493
494 dout("rbd_release_client %p\n", rbdc);
cd9d9f5d 495 spin_lock(&rbd_client_list_lock);
602adf40 496 list_del(&rbdc->node);
cd9d9f5d 497 spin_unlock(&rbd_client_list_lock);
602adf40
YS
498
499 ceph_destroy_client(rbdc->client);
500 kfree(rbdc);
501}
502
503/*
504 * Drop reference to ceph client node. If it's not referenced anymore, release
505 * it.
506 */
507static void rbd_put_client(struct rbd_device *rbd_dev)
508{
509 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
510 rbd_dev->rbd_client = NULL;
602adf40
YS
511}
512
1fec7093
YS
513/*
514 * Destroy requests collection
515 */
516static void rbd_coll_release(struct kref *kref)
517{
518 struct rbd_req_coll *coll =
519 container_of(kref, struct rbd_req_coll, kref);
520
521 dout("rbd_coll_release %p\n", coll);
522 kfree(coll);
523}
602adf40 524
a30b71b9
AE
525static bool rbd_image_format_valid(u32 image_format)
526{
527 return image_format == 1 || image_format == 2;
528}
529
8e94af8e
AE
530static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
531{
103a150f
AE
532 size_t size;
533 u32 snap_count;
534
535 /* The header has to start with the magic rbd header text */
536 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
537 return false;
538
db2388b6
AE
539 /* The bio layer requires at least sector-sized I/O */
540
541 if (ondisk->options.order < SECTOR_SHIFT)
542 return false;
543
544 /* If we use u64 in a few spots we may be able to loosen this */
545
546 if (ondisk->options.order > 8 * sizeof (int) - 1)
547 return false;
548
103a150f
AE
549 /*
550 * The size of a snapshot header has to fit in a size_t, and
551 * that limits the number of snapshots.
552 */
553 snap_count = le32_to_cpu(ondisk->snap_count);
554 size = SIZE_MAX - sizeof (struct ceph_snap_context);
555 if (snap_count > size / sizeof (__le64))
556 return false;
557
558 /*
559 * Not only that, but the size of the entire the snapshot
560 * header must also be representable in a size_t.
561 */
562 size -= snap_count * sizeof (__le64);
563 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
564 return false;
565
566 return true;
8e94af8e
AE
567}
568
602adf40
YS
569/*
570 * Create a new header structure, translate header format from the on-disk
571 * header.
572 */
573static int rbd_header_from_disk(struct rbd_image_header *header,
4156d998 574 struct rbd_image_header_ondisk *ondisk)
602adf40 575{
ccece235 576 u32 snap_count;
58c17b0e 577 size_t len;
d2bb24e5 578 size_t size;
621901d6 579 u32 i;
602adf40 580
6a52325f
AE
581 memset(header, 0, sizeof (*header));
582
103a150f
AE
583 snap_count = le32_to_cpu(ondisk->snap_count);
584
58c17b0e
AE
585 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
586 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6a52325f 587 if (!header->object_prefix)
602adf40 588 return -ENOMEM;
58c17b0e
AE
589 memcpy(header->object_prefix, ondisk->object_prefix, len);
590 header->object_prefix[len] = '\0';
00f1f36f 591
602adf40 592 if (snap_count) {
f785cc1d
AE
593 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
594
621901d6
AE
595 /* Save a copy of the snapshot names */
596
f785cc1d
AE
597 if (snap_names_len > (u64) SIZE_MAX)
598 return -EIO;
599 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
602adf40 600 if (!header->snap_names)
6a52325f 601 goto out_err;
f785cc1d
AE
602 /*
603 * Note that rbd_dev_v1_header_read() guarantees
604 * the ondisk buffer we're working with has
605 * snap_names_len bytes beyond the end of the
606 * snapshot id array, this memcpy() is safe.
607 */
608 memcpy(header->snap_names, &ondisk->snaps[snap_count],
609 snap_names_len);
6a52325f 610
621901d6
AE
611 /* Record each snapshot's size */
612
d2bb24e5
AE
613 size = snap_count * sizeof (*header->snap_sizes);
614 header->snap_sizes = kmalloc(size, GFP_KERNEL);
602adf40 615 if (!header->snap_sizes)
6a52325f 616 goto out_err;
621901d6
AE
617 for (i = 0; i < snap_count; i++)
618 header->snap_sizes[i] =
619 le64_to_cpu(ondisk->snaps[i].image_size);
602adf40 620 } else {
ccece235 621 WARN_ON(ondisk->snap_names_len);
602adf40
YS
622 header->snap_names = NULL;
623 header->snap_sizes = NULL;
624 }
849b4260 625
34b13184 626 header->features = 0; /* No features support in v1 images */
602adf40
YS
627 header->obj_order = ondisk->options.order;
628 header->crypt_type = ondisk->options.crypt_type;
629 header->comp_type = ondisk->options.comp_type;
6a52325f 630
621901d6
AE
631 /* Allocate and fill in the snapshot context */
632
f84344f3 633 header->image_size = le64_to_cpu(ondisk->image_size);
6a52325f
AE
634 size = sizeof (struct ceph_snap_context);
635 size += snap_count * sizeof (header->snapc->snaps[0]);
636 header->snapc = kzalloc(size, GFP_KERNEL);
637 if (!header->snapc)
638 goto out_err;
602adf40
YS
639
640 atomic_set(&header->snapc->nref, 1);
505cbb9b 641 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
602adf40 642 header->snapc->num_snaps = snap_count;
621901d6
AE
643 for (i = 0; i < snap_count; i++)
644 header->snapc->snaps[i] =
645 le64_to_cpu(ondisk->snaps[i].id);
602adf40
YS
646
647 return 0;
648
6a52325f 649out_err:
849b4260 650 kfree(header->snap_sizes);
ccece235 651 header->snap_sizes = NULL;
602adf40 652 kfree(header->snap_names);
ccece235 653 header->snap_names = NULL;
6a52325f
AE
654 kfree(header->object_prefix);
655 header->object_prefix = NULL;
ccece235 656
00f1f36f 657 return -ENOMEM;
602adf40
YS
658}
659
8836b995 660static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
602adf40 661{
602adf40 662
e86924a8 663 struct rbd_snap *snap;
602adf40 664
e86924a8
AE
665 list_for_each_entry(snap, &rbd_dev->snaps, node) {
666 if (!strcmp(snap_name, snap->name)) {
667 rbd_dev->mapping.snap_id = snap->id;
668 rbd_dev->mapping.size = snap->size;
34b13184 669 rbd_dev->mapping.features = snap->features;
602adf40 670
e86924a8 671 return 0;
00f1f36f 672 }
00f1f36f 673 }
e86924a8 674
00f1f36f 675 return -ENOENT;
602adf40
YS
676}
677
5ed16177 678static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
602adf40 679{
78dc447d 680 int ret;
602adf40 681
4e1105a2 682 if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
cc9d734c 683 sizeof (RBD_SNAP_HEAD_NAME))) {
f84344f3 684 rbd_dev->mapping.snap_id = CEPH_NOSNAP;
99c1f08f 685 rbd_dev->mapping.size = rbd_dev->header.image_size;
34b13184 686 rbd_dev->mapping.features = rbd_dev->header.features;
f84344f3
AE
687 rbd_dev->mapping.snap_exists = false;
688 rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
e86924a8 689 ret = 0;
602adf40 690 } else {
8836b995 691 ret = snap_by_name(rbd_dev, snap_name);
602adf40
YS
692 if (ret < 0)
693 goto done;
f84344f3
AE
694 rbd_dev->mapping.snap_exists = true;
695 rbd_dev->mapping.read_only = true;
602adf40 696 }
4e1105a2 697 rbd_dev->mapping.snap_name = snap_name;
602adf40 698done:
602adf40
YS
699 return ret;
700}
701
702static void rbd_header_free(struct rbd_image_header *header)
703{
849b4260 704 kfree(header->object_prefix);
d78fd7ae 705 header->object_prefix = NULL;
602adf40 706 kfree(header->snap_sizes);
d78fd7ae 707 header->snap_sizes = NULL;
849b4260 708 kfree(header->snap_names);
d78fd7ae 709 header->snap_names = NULL;
d1d25646 710 ceph_put_snap_context(header->snapc);
d78fd7ae 711 header->snapc = NULL;
602adf40
YS
712}
713
65ccfe21 714static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
602adf40 715{
65ccfe21
AE
716 char *name;
717 u64 segment;
718 int ret;
602adf40 719
65ccfe21
AE
720 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
721 if (!name)
722 return NULL;
723 segment = offset >> rbd_dev->header.obj_order;
724 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
725 rbd_dev->header.object_prefix, segment);
726 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
727 pr_err("error formatting segment name for #%llu (%d)\n",
728 segment, ret);
729 kfree(name);
730 name = NULL;
731 }
602adf40 732
65ccfe21
AE
733 return name;
734}
602adf40 735
65ccfe21
AE
736static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
737{
738 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
602adf40 739
65ccfe21
AE
740 return offset & (segment_size - 1);
741}
742
743static u64 rbd_segment_length(struct rbd_device *rbd_dev,
744 u64 offset, u64 length)
745{
746 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
747
748 offset &= segment_size - 1;
749
aafb230e 750 rbd_assert(length <= U64_MAX - offset);
65ccfe21
AE
751 if (offset + length > segment_size)
752 length = segment_size - offset;
753
754 return length;
602adf40
YS
755}
756
1fec7093
YS
757static int rbd_get_num_segments(struct rbd_image_header *header,
758 u64 ofs, u64 len)
759{
df111be6
AE
760 u64 start_seg;
761 u64 end_seg;
762
763 if (!len)
764 return 0;
765 if (len - 1 > U64_MAX - ofs)
766 return -ERANGE;
767
768 start_seg = ofs >> header->obj_order;
769 end_seg = (ofs + len - 1) >> header->obj_order;
770
1fec7093
YS
771 return end_seg - start_seg + 1;
772}
773
029bcbd8
JD
774/*
775 * returns the size of an object in the image
776 */
777static u64 rbd_obj_bytes(struct rbd_image_header *header)
778{
779 return 1 << header->obj_order;
780}
781
602adf40
YS
782/*
783 * bio helpers
784 */
785
786static void bio_chain_put(struct bio *chain)
787{
788 struct bio *tmp;
789
790 while (chain) {
791 tmp = chain;
792 chain = chain->bi_next;
793 bio_put(tmp);
794 }
795}
796
797/*
798 * zeros a bio chain, starting at specific offset
799 */
800static void zero_bio_chain(struct bio *chain, int start_ofs)
801{
802 struct bio_vec *bv;
803 unsigned long flags;
804 void *buf;
805 int i;
806 int pos = 0;
807
808 while (chain) {
809 bio_for_each_segment(bv, chain, i) {
810 if (pos + bv->bv_len > start_ofs) {
811 int remainder = max(start_ofs - pos, 0);
812 buf = bvec_kmap_irq(bv, &flags);
813 memset(buf + remainder, 0,
814 bv->bv_len - remainder);
85b5aaa6 815 bvec_kunmap_irq(buf, &flags);
602adf40
YS
816 }
817 pos += bv->bv_len;
818 }
819
820 chain = chain->bi_next;
821 }
822}
823
824/*
825 * bio_chain_clone - clone a chain of bios up to a certain length.
826 * might return a bio_pair that will need to be released.
827 */
828static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
829 struct bio_pair **bp,
830 int len, gfp_t gfpmask)
831{
542582fc
AE
832 struct bio *old_chain = *old;
833 struct bio *new_chain = NULL;
834 struct bio *tail;
602adf40
YS
835 int total = 0;
836
837 if (*bp) {
838 bio_pair_release(*bp);
839 *bp = NULL;
840 }
841
842 while (old_chain && (total < len)) {
542582fc
AE
843 struct bio *tmp;
844
602adf40
YS
845 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
846 if (!tmp)
847 goto err_out;
542582fc 848 gfpmask &= ~__GFP_WAIT; /* can't wait after the first */
602adf40
YS
849
850 if (total + old_chain->bi_size > len) {
851 struct bio_pair *bp;
852
853 /*
854 * this split can only happen with a single paged bio,
855 * split_bio will BUG_ON if this is not the case
856 */
857 dout("bio_chain_clone split! total=%d remaining=%d"
bd919d45
AE
858 "bi_size=%u\n",
859 total, len - total, old_chain->bi_size);
602adf40
YS
860
861 /* split the bio. We'll release it either in the next
862 call, or it will have to be released outside */
593a9e7b 863 bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
602adf40
YS
864 if (!bp)
865 goto err_out;
866
867 __bio_clone(tmp, &bp->bio1);
868
869 *next = &bp->bio2;
870 } else {
871 __bio_clone(tmp, old_chain);
872 *next = old_chain->bi_next;
873 }
874
875 tmp->bi_bdev = NULL;
602adf40 876 tmp->bi_next = NULL;
542582fc 877 if (new_chain)
602adf40 878 tail->bi_next = tmp;
542582fc
AE
879 else
880 new_chain = tmp;
881 tail = tmp;
602adf40
YS
882 old_chain = old_chain->bi_next;
883
884 total += tmp->bi_size;
885 }
886
aafb230e 887 rbd_assert(total == len);
602adf40 888
602adf40
YS
889 *old = old_chain;
890
891 return new_chain;
892
893err_out:
894 dout("bio_chain_clone with err\n");
895 bio_chain_put(new_chain);
896 return NULL;
897}
898
899/*
900 * helpers for osd request op vectors.
901 */
57cfc106
AE
902static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
903 int opcode, u32 payload_len)
602adf40 904{
57cfc106
AE
905 struct ceph_osd_req_op *ops;
906
907 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
908 if (!ops)
909 return NULL;
910
911 ops[0].op = opcode;
912
602adf40
YS
913 /*
914 * op extent offset and length will be set later on
915 * in calc_raw_layout()
916 */
57cfc106
AE
917 ops[0].payload_len = payload_len;
918
919 return ops;
602adf40
YS
920}
921
922static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
923{
924 kfree(ops);
925}
926
1fec7093
YS
927static void rbd_coll_end_req_index(struct request *rq,
928 struct rbd_req_coll *coll,
929 int index,
930 int ret, u64 len)
931{
932 struct request_queue *q;
933 int min, max, i;
934
bd919d45
AE
935 dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
936 coll, index, ret, (unsigned long long) len);
1fec7093
YS
937
938 if (!rq)
939 return;
940
941 if (!coll) {
942 blk_end_request(rq, ret, len);
943 return;
944 }
945
946 q = rq->q;
947
948 spin_lock_irq(q->queue_lock);
949 coll->status[index].done = 1;
950 coll->status[index].rc = ret;
951 coll->status[index].bytes = len;
952 max = min = coll->num_done;
953 while (max < coll->total && coll->status[max].done)
954 max++;
955
956 for (i = min; i<max; i++) {
957 __blk_end_request(rq, coll->status[i].rc,
958 coll->status[i].bytes);
959 coll->num_done++;
960 kref_put(&coll->kref, rbd_coll_release);
961 }
962 spin_unlock_irq(q->queue_lock);
963}
964
965static void rbd_coll_end_req(struct rbd_request *req,
966 int ret, u64 len)
967{
968 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
969}
970
602adf40
YS
971/*
972 * Send ceph osd request
973 */
974static int rbd_do_request(struct request *rq,
0ce1a794 975 struct rbd_device *rbd_dev,
602adf40
YS
976 struct ceph_snap_context *snapc,
977 u64 snapid,
aded07ea 978 const char *object_name, u64 ofs, u64 len,
602adf40
YS
979 struct bio *bio,
980 struct page **pages,
981 int num_pages,
982 int flags,
983 struct ceph_osd_req_op *ops,
1fec7093
YS
984 struct rbd_req_coll *coll,
985 int coll_index,
602adf40 986 void (*rbd_cb)(struct ceph_osd_request *req,
59c2be1e
YS
987 struct ceph_msg *msg),
988 struct ceph_osd_request **linger_req,
989 u64 *ver)
602adf40
YS
990{
991 struct ceph_osd_request *req;
992 struct ceph_file_layout *layout;
993 int ret;
994 u64 bno;
995 struct timespec mtime = CURRENT_TIME;
996 struct rbd_request *req_data;
997 struct ceph_osd_request_head *reqhead;
1dbb4399 998 struct ceph_osd_client *osdc;
602adf40 999
602adf40 1000 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
1fec7093
YS
1001 if (!req_data) {
1002 if (coll)
1003 rbd_coll_end_req_index(rq, coll, coll_index,
1004 -ENOMEM, len);
1005 return -ENOMEM;
1006 }
1007
1008 if (coll) {
1009 req_data->coll = coll;
1010 req_data->coll_index = coll_index;
1011 }
602adf40 1012
bd919d45
AE
1013 dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
1014 (unsigned long long) ofs, (unsigned long long) len);
602adf40 1015
0ce1a794 1016 osdc = &rbd_dev->rbd_client->client->osdc;
1dbb4399
AE
1017 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
1018 false, GFP_NOIO, pages, bio);
4ad12621 1019 if (!req) {
4ad12621 1020 ret = -ENOMEM;
602adf40
YS
1021 goto done_pages;
1022 }
1023
1024 req->r_callback = rbd_cb;
1025
1026 req_data->rq = rq;
1027 req_data->bio = bio;
1028 req_data->pages = pages;
1029 req_data->len = len;
1030
1031 req->r_priv = req_data;
1032
1033 reqhead = req->r_request->front.iov_base;
1034 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1035
aded07ea 1036 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
602adf40
YS
1037 req->r_oid_len = strlen(req->r_oid);
1038
1039 layout = &req->r_file_layout;
1040 memset(layout, 0, sizeof(*layout));
1041 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1042 layout->fl_stripe_count = cpu_to_le32(1);
1043 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
0ce1a794 1044 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
6cae3717
SW
1045 ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
1046 req, ops);
1047 rbd_assert(ret == 0);
602adf40
YS
1048
1049 ceph_osdc_build_request(req, ofs, &len,
1050 ops,
1051 snapc,
1052 &mtime,
1053 req->r_oid, req->r_oid_len);
602adf40 1054
59c2be1e 1055 if (linger_req) {
1dbb4399 1056 ceph_osdc_set_request_linger(osdc, req);
59c2be1e
YS
1057 *linger_req = req;
1058 }
1059
1dbb4399 1060 ret = ceph_osdc_start_request(osdc, req, false);
602adf40
YS
1061 if (ret < 0)
1062 goto done_err;
1063
1064 if (!rbd_cb) {
1dbb4399 1065 ret = ceph_osdc_wait_request(osdc, req);
59c2be1e
YS
1066 if (ver)
1067 *ver = le64_to_cpu(req->r_reassert_version.version);
bd919d45
AE
1068 dout("reassert_ver=%llu\n",
1069 (unsigned long long)
1070 le64_to_cpu(req->r_reassert_version.version));
602adf40
YS
1071 ceph_osdc_put_request(req);
1072 }
1073 return ret;
1074
1075done_err:
1076 bio_chain_put(req_data->bio);
1077 ceph_osdc_put_request(req);
1078done_pages:
1fec7093 1079 rbd_coll_end_req(req_data, ret, len);
602adf40 1080 kfree(req_data);
602adf40
YS
1081 return ret;
1082}
1083
1084/*
1085 * Ceph osd op callback
1086 */
1087static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1088{
1089 struct rbd_request *req_data = req->r_priv;
1090 struct ceph_osd_reply_head *replyhead;
1091 struct ceph_osd_op *op;
1092 __s32 rc;
1093 u64 bytes;
1094 int read_op;
1095
1096 /* parse reply */
1097 replyhead = msg->front.iov_base;
1098 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1099 op = (void *)(replyhead + 1);
1100 rc = le32_to_cpu(replyhead->result);
1101 bytes = le64_to_cpu(op->extent.length);
895cfcc8 1102 read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
602adf40 1103
bd919d45
AE
1104 dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1105 (unsigned long long) bytes, read_op, (int) rc);
602adf40
YS
1106
1107 if (rc == -ENOENT && read_op) {
1108 zero_bio_chain(req_data->bio, 0);
1109 rc = 0;
1110 } else if (rc == 0 && read_op && bytes < req_data->len) {
1111 zero_bio_chain(req_data->bio, bytes);
1112 bytes = req_data->len;
1113 }
1114
1fec7093 1115 rbd_coll_end_req(req_data, rc, bytes);
602adf40
YS
1116
1117 if (req_data->bio)
1118 bio_chain_put(req_data->bio);
1119
1120 ceph_osdc_put_request(req);
1121 kfree(req_data);
1122}
1123
/*
 * Minimal completion callback: used for requests (e.g. notify acks)
 * whose reply needs no processing; just drop the request reference.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1128
602adf40
YS
1129/*
1130 * Do a synchronous ceph osd operation
1131 */
0ce1a794 1132static int rbd_req_sync_op(struct rbd_device *rbd_dev,
602adf40
YS
1133 struct ceph_snap_context *snapc,
1134 u64 snapid,
602adf40 1135 int flags,
913d2fdc 1136 struct ceph_osd_req_op *ops,
aded07ea 1137 const char *object_name,
f8d4de6e
AE
1138 u64 ofs, u64 inbound_size,
1139 char *inbound,
59c2be1e
YS
1140 struct ceph_osd_request **linger_req,
1141 u64 *ver)
602adf40
YS
1142{
1143 int ret;
1144 struct page **pages;
1145 int num_pages;
913d2fdc 1146
aafb230e 1147 rbd_assert(ops != NULL);
602adf40 1148
f8d4de6e 1149 num_pages = calc_pages_for(ofs, inbound_size);
602adf40 1150 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
b8d0638a
DC
1151 if (IS_ERR(pages))
1152 return PTR_ERR(pages);
602adf40 1153
0ce1a794 1154 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
f8d4de6e 1155 object_name, ofs, inbound_size, NULL,
602adf40
YS
1156 pages, num_pages,
1157 flags,
1158 ops,
1fec7093 1159 NULL, 0,
59c2be1e
YS
1160 NULL,
1161 linger_req, ver);
602adf40 1162 if (ret < 0)
913d2fdc 1163 goto done;
602adf40 1164
f8d4de6e
AE
1165 if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1166 ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
602adf40 1167
602adf40
YS
1168done:
1169 ceph_release_page_vector(pages, num_pages);
1170 return ret;
1171}
1172
1173/*
1174 * Do an asynchronous ceph osd operation
1175 */
1176static int rbd_do_op(struct request *rq,
0ce1a794 1177 struct rbd_device *rbd_dev,
602adf40 1178 struct ceph_snap_context *snapc,
602adf40 1179 u64 ofs, u64 len,
1fec7093
YS
1180 struct bio *bio,
1181 struct rbd_req_coll *coll,
1182 int coll_index)
602adf40
YS
1183{
1184 char *seg_name;
1185 u64 seg_ofs;
1186 u64 seg_len;
1187 int ret;
1188 struct ceph_osd_req_op *ops;
1189 u32 payload_len;
ff2e4bb5
AE
1190 int opcode;
1191 int flags;
4634246d 1192 u64 snapid;
602adf40 1193
65ccfe21 1194 seg_name = rbd_segment_name(rbd_dev, ofs);
602adf40
YS
1195 if (!seg_name)
1196 return -ENOMEM;
65ccfe21
AE
1197 seg_len = rbd_segment_length(rbd_dev, ofs, len);
1198 seg_ofs = rbd_segment_offset(rbd_dev, ofs);
602adf40 1199
ff2e4bb5
AE
1200 if (rq_data_dir(rq) == WRITE) {
1201 opcode = CEPH_OSD_OP_WRITE;
1202 flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
4634246d 1203 snapid = CEPH_NOSNAP;
ff2e4bb5
AE
1204 payload_len = seg_len;
1205 } else {
1206 opcode = CEPH_OSD_OP_READ;
1207 flags = CEPH_OSD_FLAG_READ;
4634246d
AE
1208 snapc = NULL;
1209 snapid = rbd_dev->mapping.snap_id;
ff2e4bb5
AE
1210 payload_len = 0;
1211 }
602adf40 1212
57cfc106
AE
1213 ret = -ENOMEM;
1214 ops = rbd_create_rw_ops(1, opcode, payload_len);
1215 if (!ops)
602adf40
YS
1216 goto done;
1217
1218 /* we've taken care of segment sizes earlier when we
1219 cloned the bios. We should never have a segment
1220 truncated at this point */
aafb230e 1221 rbd_assert(seg_len == len);
602adf40
YS
1222
1223 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1224 seg_name, seg_ofs, seg_len,
1225 bio,
1226 NULL, 0,
1227 flags,
1228 ops,
1fec7093 1229 coll, coll_index,
59c2be1e 1230 rbd_req_cb, 0, NULL);
11f77002
SW
1231
1232 rbd_destroy_ops(ops);
602adf40
YS
1233done:
1234 kfree(seg_name);
1235 return ret;
1236}
1237
602adf40
YS
1238/*
1239 * Request sync osd read
1240 */
0ce1a794 1241static int rbd_req_sync_read(struct rbd_device *rbd_dev,
602adf40 1242 u64 snapid,
aded07ea 1243 const char *object_name,
602adf40 1244 u64 ofs, u64 len,
59c2be1e
YS
1245 char *buf,
1246 u64 *ver)
602adf40 1247{
913d2fdc
AE
1248 struct ceph_osd_req_op *ops;
1249 int ret;
1250
1251 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1252 if (!ops)
1253 return -ENOMEM;
1254
1255 ret = rbd_req_sync_op(rbd_dev, NULL,
b06e6a6b 1256 snapid,
602adf40 1257 CEPH_OSD_FLAG_READ,
913d2fdc
AE
1258 ops, object_name, ofs, len, buf, NULL, ver);
1259 rbd_destroy_ops(ops);
1260
1261 return ret;
602adf40
YS
1262}
1263
1264/*
59c2be1e
YS
1265 * Request sync osd watch
1266 */
0ce1a794 1267static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
59c2be1e 1268 u64 ver,
7f0a24d8 1269 u64 notify_id)
59c2be1e
YS
1270{
1271 struct ceph_osd_req_op *ops;
11f77002
SW
1272 int ret;
1273
57cfc106
AE
1274 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1275 if (!ops)
1276 return -ENOMEM;
59c2be1e 1277
a71b891b 1278 ops[0].watch.ver = cpu_to_le64(ver);
59c2be1e
YS
1279 ops[0].watch.cookie = notify_id;
1280 ops[0].watch.flag = 0;
1281
0ce1a794 1282 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
7f0a24d8 1283 rbd_dev->header_name, 0, 0, NULL,
ad4f232f 1284 NULL, 0,
59c2be1e
YS
1285 CEPH_OSD_FLAG_READ,
1286 ops,
1fec7093 1287 NULL, 0,
59c2be1e
YS
1288 rbd_simple_req_cb, 0, NULL);
1289
1290 rbd_destroy_ops(ops);
1291 return ret;
1292}
1293
1294static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1295{
0ce1a794 1296 struct rbd_device *rbd_dev = (struct rbd_device *)data;
a71b891b 1297 u64 hver;
13143d2d
SW
1298 int rc;
1299
0ce1a794 1300 if (!rbd_dev)
59c2be1e
YS
1301 return;
1302
bd919d45
AE
1303 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1304 rbd_dev->header_name, (unsigned long long) notify_id,
1305 (unsigned int) opcode);
117973fb 1306 rc = rbd_dev_refresh(rbd_dev, &hver);
13143d2d 1307 if (rc)
f0f8cef5 1308 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
0ce1a794 1309 " update snaps: %d\n", rbd_dev->major, rc);
59c2be1e 1310
7f0a24d8 1311 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
59c2be1e
YS
1312}
1313
1314/*
1315 * Request sync osd watch
1316 */
0e6f322d 1317static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
59c2be1e
YS
1318{
1319 struct ceph_osd_req_op *ops;
0ce1a794 1320 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
57cfc106 1321 int ret;
59c2be1e 1322
57cfc106
AE
1323 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1324 if (!ops)
1325 return -ENOMEM;
59c2be1e
YS
1326
1327 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
0ce1a794 1328 (void *)rbd_dev, &rbd_dev->watch_event);
59c2be1e
YS
1329 if (ret < 0)
1330 goto fail;
1331
0e6f322d 1332 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
0ce1a794 1333 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
59c2be1e
YS
1334 ops[0].watch.flag = 1;
1335
0ce1a794 1336 ret = rbd_req_sync_op(rbd_dev, NULL,
59c2be1e 1337 CEPH_NOSNAP,
59c2be1e
YS
1338 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1339 ops,
0e6f322d
AE
1340 rbd_dev->header_name,
1341 0, 0, NULL,
0ce1a794 1342 &rbd_dev->watch_request, NULL);
59c2be1e
YS
1343
1344 if (ret < 0)
1345 goto fail_event;
1346
1347 rbd_destroy_ops(ops);
1348 return 0;
1349
1350fail_event:
0ce1a794
AE
1351 ceph_osdc_cancel_event(rbd_dev->watch_event);
1352 rbd_dev->watch_event = NULL;
59c2be1e
YS
1353fail:
1354 rbd_destroy_ops(ops);
1355 return ret;
1356}
1357
79e3057c
YS
1358/*
1359 * Request sync osd unwatch
1360 */
070c633f 1361static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
79e3057c
YS
1362{
1363 struct ceph_osd_req_op *ops;
57cfc106 1364 int ret;
79e3057c 1365
57cfc106
AE
1366 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1367 if (!ops)
1368 return -ENOMEM;
79e3057c
YS
1369
1370 ops[0].watch.ver = 0;
0ce1a794 1371 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
79e3057c
YS
1372 ops[0].watch.flag = 0;
1373
0ce1a794 1374 ret = rbd_req_sync_op(rbd_dev, NULL,
79e3057c 1375 CEPH_NOSNAP,
79e3057c
YS
1376 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1377 ops,
070c633f
AE
1378 rbd_dev->header_name,
1379 0, 0, NULL, NULL, NULL);
1380
79e3057c
YS
1381
1382 rbd_destroy_ops(ops);
0ce1a794
AE
1383 ceph_osdc_cancel_event(rbd_dev->watch_event);
1384 rbd_dev->watch_event = NULL;
79e3057c
YS
1385 return ret;
1386}
1387
602adf40 1388/*
3cb4a687 1389 * Synchronous osd object method call
602adf40 1390 */
0ce1a794 1391static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
aded07ea
AE
1392 const char *object_name,
1393 const char *class_name,
1394 const char *method_name,
3cb4a687
AE
1395 const char *outbound,
1396 size_t outbound_size,
f8d4de6e
AE
1397 char *inbound,
1398 size_t inbound_size,
3cb4a687 1399 int flags,
59c2be1e 1400 u64 *ver)
602adf40
YS
1401{
1402 struct ceph_osd_req_op *ops;
aded07ea
AE
1403 int class_name_len = strlen(class_name);
1404 int method_name_len = strlen(method_name);
3cb4a687 1405 int payload_size;
57cfc106
AE
1406 int ret;
1407
3cb4a687
AE
1408 /*
1409 * Any input parameters required by the method we're calling
1410 * will be sent along with the class and method names as
1411 * part of the message payload. That data and its size are
1412 * supplied via the indata and indata_len fields (named from
1413 * the perspective of the server side) in the OSD request
1414 * operation.
1415 */
1416 payload_size = class_name_len + method_name_len + outbound_size;
1417 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
57cfc106
AE
1418 if (!ops)
1419 return -ENOMEM;
602adf40 1420
aded07ea
AE
1421 ops[0].cls.class_name = class_name;
1422 ops[0].cls.class_len = (__u8) class_name_len;
1423 ops[0].cls.method_name = method_name;
1424 ops[0].cls.method_len = (__u8) method_name_len;
602adf40 1425 ops[0].cls.argc = 0;
3cb4a687
AE
1426 ops[0].cls.indata = outbound;
1427 ops[0].cls.indata_len = outbound_size;
602adf40 1428
0ce1a794 1429 ret = rbd_req_sync_op(rbd_dev, NULL,
602adf40 1430 CEPH_NOSNAP,
3cb4a687 1431 flags, ops,
f8d4de6e
AE
1432 object_name, 0, inbound_size, inbound,
1433 NULL, ver);
602adf40
YS
1434
1435 rbd_destroy_ops(ops);
1436
1437 dout("cls_exec returned %d\n", ret);
1438 return ret;
1439}
1440
1fec7093
YS
1441static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1442{
1443 struct rbd_req_coll *coll =
1444 kzalloc(sizeof(struct rbd_req_coll) +
1445 sizeof(struct rbd_req_status) * num_reqs,
1446 GFP_ATOMIC);
1447
1448 if (!coll)
1449 return NULL;
1450 coll->total = num_reqs;
1451 kref_init(&coll->kref);
1452 return coll;
1453}
1454
602adf40
YS
1455/*
1456 * block device queue callback
1457 */
1458static void rbd_rq_fn(struct request_queue *q)
1459{
1460 struct rbd_device *rbd_dev = q->queuedata;
1461 struct request *rq;
1462 struct bio_pair *bp = NULL;
1463
00f1f36f 1464 while ((rq = blk_fetch_request(q))) {
602adf40
YS
1465 struct bio *bio;
1466 struct bio *rq_bio, *next_bio = NULL;
1467 bool do_write;
bd919d45
AE
1468 unsigned int size;
1469 u64 op_size = 0;
602adf40 1470 u64 ofs;
1fec7093
YS
1471 int num_segs, cur_seg = 0;
1472 struct rbd_req_coll *coll;
d1d25646 1473 struct ceph_snap_context *snapc;
602adf40 1474
602adf40
YS
1475 dout("fetched request\n");
1476
1477 /* filter out block requests we don't understand */
1478 if ((rq->cmd_type != REQ_TYPE_FS)) {
1479 __blk_end_request_all(rq, 0);
00f1f36f 1480 continue;
602adf40
YS
1481 }
1482
1483 /* deduce our operation (read, write) */
1484 do_write = (rq_data_dir(rq) == WRITE);
1485
1486 size = blk_rq_bytes(rq);
593a9e7b 1487 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
602adf40 1488 rq_bio = rq->bio;
f84344f3 1489 if (do_write && rbd_dev->mapping.read_only) {
602adf40 1490 __blk_end_request_all(rq, -EROFS);
00f1f36f 1491 continue;
602adf40
YS
1492 }
1493
1494 spin_unlock_irq(q->queue_lock);
1495
d1d25646 1496 down_read(&rbd_dev->header_rwsem);
e88a36ec 1497
f84344f3
AE
1498 if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
1499 !rbd_dev->mapping.snap_exists) {
e88a36ec 1500 up_read(&rbd_dev->header_rwsem);
d1d25646
JD
1501 dout("request for non-existent snapshot");
1502 spin_lock_irq(q->queue_lock);
1503 __blk_end_request_all(rq, -ENXIO);
1504 continue;
e88a36ec
JD
1505 }
1506
d1d25646
JD
1507 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1508
1509 up_read(&rbd_dev->header_rwsem);
1510
602adf40
YS
1511 dout("%s 0x%x bytes at 0x%llx\n",
1512 do_write ? "write" : "read",
bd919d45 1513 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
602adf40 1514
1fec7093 1515 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
df111be6
AE
1516 if (num_segs <= 0) {
1517 spin_lock_irq(q->queue_lock);
1518 __blk_end_request_all(rq, num_segs);
1519 ceph_put_snap_context(snapc);
1520 continue;
1521 }
1fec7093
YS
1522 coll = rbd_alloc_coll(num_segs);
1523 if (!coll) {
1524 spin_lock_irq(q->queue_lock);
1525 __blk_end_request_all(rq, -ENOMEM);
d1d25646 1526 ceph_put_snap_context(snapc);
00f1f36f 1527 continue;
1fec7093
YS
1528 }
1529
602adf40
YS
1530 do {
1531 /* a bio clone to be passed down to OSD req */
bd919d45 1532 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
65ccfe21 1533 op_size = rbd_segment_length(rbd_dev, ofs, size);
1fec7093 1534 kref_get(&coll->kref);
602adf40
YS
1535 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1536 op_size, GFP_ATOMIC);
4634246d
AE
1537 if (bio)
1538 (void) rbd_do_op(rq, rbd_dev, snapc,
1539 ofs, op_size,
1540 bio, coll, cur_seg);
1541 else
1fec7093
YS
1542 rbd_coll_end_req_index(rq, coll, cur_seg,
1543 -ENOMEM, op_size);
602adf40
YS
1544 size -= op_size;
1545 ofs += op_size;
1546
1fec7093 1547 cur_seg++;
602adf40
YS
1548 rq_bio = next_bio;
1549 } while (size > 0);
1fec7093 1550 kref_put(&coll->kref, rbd_coll_release);
602adf40
YS
1551
1552 if (bp)
1553 bio_pair_release(bp);
602adf40 1554 spin_lock_irq(q->queue_lock);
d1d25646
JD
1555
1556 ceph_put_snap_context(snapc);
602adf40
YS
1557 }
1558}
1559
1560/*
1561 * a queue callback. Makes sure that we don't create a bio that spans across
1562 * multiple osd objects. One exception would be with a single page bios,
1563 * which we handle later at bio_chain_clone
1564 */
1565static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1566 struct bio_vec *bvec)
1567{
1568 struct rbd_device *rbd_dev = q->queuedata;
593a9e7b
AE
1569 unsigned int chunk_sectors;
1570 sector_t sector;
1571 unsigned int bio_sectors;
602adf40
YS
1572 int max;
1573
593a9e7b
AE
1574 chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1575 sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1576 bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1577
602adf40 1578 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
593a9e7b 1579 + bio_sectors)) << SECTOR_SHIFT;
602adf40
YS
1580 if (max < 0)
1581 max = 0; /* bio_add cannot handle a negative return */
1582 if (max <= bvec->bv_len && bio_sectors == 0)
1583 return bvec->bv_len;
1584 return max;
1585}
1586
1587static void rbd_free_disk(struct rbd_device *rbd_dev)
1588{
1589 struct gendisk *disk = rbd_dev->disk;
1590
1591 if (!disk)
1592 return;
1593
602adf40
YS
1594 if (disk->flags & GENHD_FL_UP)
1595 del_gendisk(disk);
1596 if (disk->queue)
1597 blk_cleanup_queue(disk->queue);
1598 put_disk(disk);
1599}
1600
1601/*
4156d998
AE
1602 * Read the complete header for the given rbd device.
1603 *
1604 * Returns a pointer to a dynamically-allocated buffer containing
1605 * the complete and validated header. Caller can pass the address
1606 * of a variable that will be filled in with the version of the
1607 * header object at the time it was read.
1608 *
1609 * Returns a pointer-coded errno if a failure occurs.
602adf40 1610 */
4156d998
AE
1611static struct rbd_image_header_ondisk *
1612rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
602adf40 1613{
4156d998 1614 struct rbd_image_header_ondisk *ondisk = NULL;
50f7c4c9 1615 u32 snap_count = 0;
4156d998
AE
1616 u64 names_size = 0;
1617 u32 want_count;
1618 int ret;
602adf40 1619
00f1f36f 1620 /*
4156d998
AE
1621 * The complete header will include an array of its 64-bit
1622 * snapshot ids, followed by the names of those snapshots as
1623 * a contiguous block of NUL-terminated strings. Note that
1624 * the number of snapshots could change by the time we read
1625 * it in, in which case we re-read it.
00f1f36f 1626 */
4156d998
AE
1627 do {
1628 size_t size;
1629
1630 kfree(ondisk);
1631
1632 size = sizeof (*ondisk);
1633 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1634 size += names_size;
1635 ondisk = kmalloc(size, GFP_KERNEL);
1636 if (!ondisk)
1637 return ERR_PTR(-ENOMEM);
1638
1639 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
0bed54dc 1640 rbd_dev->header_name,
4156d998
AE
1641 0, size,
1642 (char *) ondisk, version);
1643
1644 if (ret < 0)
1645 goto out_err;
1646 if (WARN_ON((size_t) ret < size)) {
1647 ret = -ENXIO;
1648 pr_warning("short header read for image %s"
1649 " (want %zd got %d)\n",
1650 rbd_dev->image_name, size, ret);
1651 goto out_err;
1652 }
1653 if (!rbd_dev_ondisk_valid(ondisk)) {
1654 ret = -ENXIO;
1655 pr_warning("invalid header for image %s\n",
1656 rbd_dev->image_name);
1657 goto out_err;
81e759fb 1658 }
602adf40 1659
4156d998
AE
1660 names_size = le64_to_cpu(ondisk->snap_names_len);
1661 want_count = snap_count;
1662 snap_count = le32_to_cpu(ondisk->snap_count);
1663 } while (snap_count != want_count);
00f1f36f 1664
4156d998 1665 return ondisk;
00f1f36f 1666
4156d998
AE
1667out_err:
1668 kfree(ondisk);
1669
1670 return ERR_PTR(ret);
1671}
1672
1673/*
1674 * reload the ondisk the header
1675 */
1676static int rbd_read_header(struct rbd_device *rbd_dev,
1677 struct rbd_image_header *header)
1678{
1679 struct rbd_image_header_ondisk *ondisk;
1680 u64 ver = 0;
1681 int ret;
602adf40 1682
4156d998
AE
1683 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1684 if (IS_ERR(ondisk))
1685 return PTR_ERR(ondisk);
1686 ret = rbd_header_from_disk(header, ondisk);
1687 if (ret >= 0)
1688 header->obj_version = ver;
1689 kfree(ondisk);
1690
1691 return ret;
602adf40
YS
1692}
1693
dfc5606d
YS
1694static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1695{
1696 struct rbd_snap *snap;
a0593290 1697 struct rbd_snap *next;
dfc5606d 1698
a0593290 1699 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
14e7085d 1700 __rbd_remove_snap_dev(snap);
dfc5606d
YS
1701}
1702
9478554a
AE
1703static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1704{
1705 sector_t size;
1706
1707 if (rbd_dev->mapping.snap_id != CEPH_NOSNAP)
1708 return;
1709
1710 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1711 dout("setting size to %llu sectors", (unsigned long long) size);
1712 rbd_dev->mapping.size = (u64) size;
1713 set_capacity(rbd_dev->disk, size);
1714}
1715
602adf40
YS
1716/*
1717 * only read the first part of the ondisk header, without the snaps info
1718 */
117973fb 1719static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
602adf40
YS
1720{
1721 int ret;
1722 struct rbd_image_header h;
602adf40
YS
1723
1724 ret = rbd_read_header(rbd_dev, &h);
1725 if (ret < 0)
1726 return ret;
1727
a51aa0c0
JD
1728 down_write(&rbd_dev->header_rwsem);
1729
9478554a
AE
1730 /* Update image size, and check for resize of mapped image */
1731 rbd_dev->header.image_size = h.image_size;
1732 rbd_update_mapping_size(rbd_dev);
9db4b3e3 1733
849b4260 1734 /* rbd_dev->header.object_prefix shouldn't change */
602adf40 1735 kfree(rbd_dev->header.snap_sizes);
849b4260 1736 kfree(rbd_dev->header.snap_names);
d1d25646
JD
1737 /* osd requests may still refer to snapc */
1738 ceph_put_snap_context(rbd_dev->header.snapc);
602adf40 1739
b813623a
AE
1740 if (hver)
1741 *hver = h.obj_version;
a71b891b 1742 rbd_dev->header.obj_version = h.obj_version;
93a24e08 1743 rbd_dev->header.image_size = h.image_size;
602adf40
YS
1744 rbd_dev->header.snapc = h.snapc;
1745 rbd_dev->header.snap_names = h.snap_names;
1746 rbd_dev->header.snap_sizes = h.snap_sizes;
849b4260
AE
1747 /* Free the extra copy of the object prefix */
1748 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1749 kfree(h.object_prefix);
1750
304f6808
AE
1751 ret = rbd_dev_snaps_update(rbd_dev);
1752 if (!ret)
1753 ret = rbd_dev_snaps_register(rbd_dev);
dfc5606d 1754
c666601a 1755 up_write(&rbd_dev->header_rwsem);
602adf40 1756
dfc5606d 1757 return ret;
602adf40
YS
1758}
1759
117973fb 1760static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
1fe5e993
AE
1761{
1762 int ret;
1763
117973fb 1764 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1fe5e993 1765 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
117973fb
AE
1766 if (rbd_dev->image_format == 1)
1767 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1768 else
1769 ret = rbd_dev_v2_refresh(rbd_dev, hver);
1fe5e993
AE
1770 mutex_unlock(&ctl_mutex);
1771
1772 return ret;
1773}
1774
602adf40
YS
1775static int rbd_init_disk(struct rbd_device *rbd_dev)
1776{
1777 struct gendisk *disk;
1778 struct request_queue *q;
593a9e7b 1779 u64 segment_size;
602adf40 1780
602adf40 1781 /* create gendisk info */
602adf40
YS
1782 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1783 if (!disk)
1fcdb8aa 1784 return -ENOMEM;
602adf40 1785
f0f8cef5 1786 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
de71a297 1787 rbd_dev->dev_id);
602adf40
YS
1788 disk->major = rbd_dev->major;
1789 disk->first_minor = 0;
1790 disk->fops = &rbd_bd_ops;
1791 disk->private_data = rbd_dev;
1792
1793 /* init rq */
602adf40
YS
1794 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1795 if (!q)
1796 goto out_disk;
029bcbd8 1797
593a9e7b
AE
1798 /* We use the default size, but let's be explicit about it. */
1799 blk_queue_physical_block_size(q, SECTOR_SIZE);
1800
029bcbd8 1801 /* set io sizes to object size */
593a9e7b
AE
1802 segment_size = rbd_obj_bytes(&rbd_dev->header);
1803 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1804 blk_queue_max_segment_size(q, segment_size);
1805 blk_queue_io_min(q, segment_size);
1806 blk_queue_io_opt(q, segment_size);
029bcbd8 1807
602adf40
YS
1808 blk_queue_merge_bvec(q, rbd_merge_bvec);
1809 disk->queue = q;
1810
1811 q->queuedata = rbd_dev;
1812
1813 rbd_dev->disk = disk;
602adf40 1814
12f02944
AE
1815 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
1816
602adf40 1817 return 0;
602adf40
YS
1818out_disk:
1819 put_disk(disk);
1fcdb8aa
AE
1820
1821 return -ENOMEM;
602adf40
YS
1822}
1823
dfc5606d
YS
1824/*
1825 sysfs
1826*/
1827
593a9e7b
AE
1828static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1829{
1830 return container_of(dev, struct rbd_device, dev);
1831}
1832
dfc5606d
YS
1833static ssize_t rbd_size_show(struct device *dev,
1834 struct device_attribute *attr, char *buf)
1835{
593a9e7b 1836 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
a51aa0c0
JD
1837 sector_t size;
1838
1839 down_read(&rbd_dev->header_rwsem);
1840 size = get_capacity(rbd_dev->disk);
1841 up_read(&rbd_dev->header_rwsem);
dfc5606d 1842
a51aa0c0 1843 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
dfc5606d
YS
1844}
1845
34b13184
AE
1846/*
1847 * Note this shows the features for whatever's mapped, which is not
1848 * necessarily the base image.
1849 */
1850static ssize_t rbd_features_show(struct device *dev,
1851 struct device_attribute *attr, char *buf)
1852{
1853 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1854
1855 return sprintf(buf, "0x%016llx\n",
1856 (unsigned long long) rbd_dev->mapping.features);
1857}
1858
dfc5606d
YS
1859static ssize_t rbd_major_show(struct device *dev,
1860 struct device_attribute *attr, char *buf)
1861{
593a9e7b 1862 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 1863
dfc5606d
YS
1864 return sprintf(buf, "%d\n", rbd_dev->major);
1865}
1866
1867static ssize_t rbd_client_id_show(struct device *dev,
1868 struct device_attribute *attr, char *buf)
602adf40 1869{
593a9e7b 1870 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1871
1dbb4399
AE
1872 return sprintf(buf, "client%lld\n",
1873 ceph_client_id(rbd_dev->rbd_client->client));
602adf40
YS
1874}
1875
dfc5606d
YS
1876static ssize_t rbd_pool_show(struct device *dev,
1877 struct device_attribute *attr, char *buf)
602adf40 1878{
593a9e7b 1879 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d
YS
1880
1881 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1882}
1883
9bb2f334
AE
1884static ssize_t rbd_pool_id_show(struct device *dev,
1885 struct device_attribute *attr, char *buf)
1886{
1887 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1888
1889 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1890}
1891
dfc5606d
YS
1892static ssize_t rbd_name_show(struct device *dev,
1893 struct device_attribute *attr, char *buf)
1894{
593a9e7b 1895 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1896
0bed54dc 1897 return sprintf(buf, "%s\n", rbd_dev->image_name);
dfc5606d
YS
1898}
1899
589d30e0
AE
1900static ssize_t rbd_image_id_show(struct device *dev,
1901 struct device_attribute *attr, char *buf)
1902{
1903 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1904
1905 return sprintf(buf, "%s\n", rbd_dev->image_id);
1906}
1907
34b13184
AE
1908/*
1909 * Shows the name of the currently-mapped snapshot (or
1910 * RBD_SNAP_HEAD_NAME for the base image).
1911 */
dfc5606d
YS
1912static ssize_t rbd_snap_show(struct device *dev,
1913 struct device_attribute *attr,
1914 char *buf)
1915{
593a9e7b 1916 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1917
f84344f3 1918 return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
dfc5606d
YS
1919}
1920
1921static ssize_t rbd_image_refresh(struct device *dev,
1922 struct device_attribute *attr,
1923 const char *buf,
1924 size_t size)
1925{
593a9e7b 1926 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 1927 int ret;
602adf40 1928
117973fb 1929 ret = rbd_dev_refresh(rbd_dev, NULL);
b813623a
AE
1930
1931 return ret < 0 ? ret : size;
dfc5606d 1932}
602adf40 1933
dfc5606d 1934static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
34b13184 1935static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
dfc5606d
YS
1936static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
1937static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
1938static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
9bb2f334 1939static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
dfc5606d 1940static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
589d30e0 1941static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
dfc5606d
YS
1942static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
1943static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
dfc5606d
YS
1944
1945static struct attribute *rbd_attrs[] = {
1946 &dev_attr_size.attr,
34b13184 1947 &dev_attr_features.attr,
dfc5606d
YS
1948 &dev_attr_major.attr,
1949 &dev_attr_client_id.attr,
1950 &dev_attr_pool.attr,
9bb2f334 1951 &dev_attr_pool_id.attr,
dfc5606d 1952 &dev_attr_name.attr,
589d30e0 1953 &dev_attr_image_id.attr,
dfc5606d
YS
1954 &dev_attr_current_snap.attr,
1955 &dev_attr_refresh.attr,
dfc5606d
YS
1956 NULL
1957};
1958
1959static struct attribute_group rbd_attr_group = {
1960 .attrs = rbd_attrs,
1961};
1962
1963static const struct attribute_group *rbd_attr_groups[] = {
1964 &rbd_attr_group,
1965 NULL
1966};
1967
1968static void rbd_sysfs_dev_release(struct device *dev)
1969{
1970}
1971
1972static struct device_type rbd_device_type = {
1973 .name = "rbd",
1974 .groups = rbd_attr_groups,
1975 .release = rbd_sysfs_dev_release,
1976};
1977
1978
1979/*
1980 sysfs - snapshots
1981*/
1982
1983static ssize_t rbd_snap_size_show(struct device *dev,
1984 struct device_attribute *attr,
1985 char *buf)
1986{
1987 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1988
3591538f 1989 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
dfc5606d
YS
1990}
1991
1992static ssize_t rbd_snap_id_show(struct device *dev,
1993 struct device_attribute *attr,
1994 char *buf)
1995{
1996 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1997
3591538f 1998 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
dfc5606d
YS
1999}
2000
34b13184
AE
2001static ssize_t rbd_snap_features_show(struct device *dev,
2002 struct device_attribute *attr,
2003 char *buf)
2004{
2005 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2006
2007 return sprintf(buf, "0x%016llx\n",
2008 (unsigned long long) snap->features);
2009}
2010
dfc5606d
YS
2011static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2012static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
34b13184 2013static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
dfc5606d
YS
2014
2015static struct attribute *rbd_snap_attrs[] = {
2016 &dev_attr_snap_size.attr,
2017 &dev_attr_snap_id.attr,
34b13184 2018 &dev_attr_snap_features.attr,
dfc5606d
YS
2019 NULL,
2020};
2021
2022static struct attribute_group rbd_snap_attr_group = {
2023 .attrs = rbd_snap_attrs,
2024};
2025
2026static void rbd_snap_dev_release(struct device *dev)
2027{
2028 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2029 kfree(snap->name);
2030 kfree(snap);
2031}
2032
2033static const struct attribute_group *rbd_snap_attr_groups[] = {
2034 &rbd_snap_attr_group,
2035 NULL
2036};
2037
2038static struct device_type rbd_snap_device_type = {
2039 .groups = rbd_snap_attr_groups,
2040 .release = rbd_snap_dev_release,
2041};
2042
304f6808
AE
2043static bool rbd_snap_registered(struct rbd_snap *snap)
2044{
2045 bool ret = snap->dev.type == &rbd_snap_device_type;
2046 bool reg = device_is_registered(&snap->dev);
2047
2048 rbd_assert(!ret ^ reg);
2049
2050 return ret;
2051}
2052
14e7085d 2053static void __rbd_remove_snap_dev(struct rbd_snap *snap)
dfc5606d
YS
2054{
2055 list_del(&snap->node);
304f6808
AE
2056 if (device_is_registered(&snap->dev))
2057 device_unregister(&snap->dev);
dfc5606d
YS
2058}
2059
14e7085d 2060static int rbd_register_snap_dev(struct rbd_snap *snap,
dfc5606d
YS
2061 struct device *parent)
2062{
2063 struct device *dev = &snap->dev;
2064 int ret;
2065
2066 dev->type = &rbd_snap_device_type;
2067 dev->parent = parent;
2068 dev->release = rbd_snap_dev_release;
d4b125e9 2069 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
304f6808
AE
2070 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2071
dfc5606d
YS
2072 ret = device_register(dev);
2073
2074 return ret;
2075}
2076
4e891e0a 2077static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
c8d18425 2078 const char *snap_name,
34b13184
AE
2079 u64 snap_id, u64 snap_size,
2080 u64 snap_features)
dfc5606d 2081{
4e891e0a 2082 struct rbd_snap *snap;
dfc5606d 2083 int ret;
4e891e0a
AE
2084
2085 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
dfc5606d 2086 if (!snap)
4e891e0a
AE
2087 return ERR_PTR(-ENOMEM);
2088
2089 ret = -ENOMEM;
c8d18425 2090 snap->name = kstrdup(snap_name, GFP_KERNEL);
4e891e0a
AE
2091 if (!snap->name)
2092 goto err;
2093
c8d18425
AE
2094 snap->id = snap_id;
2095 snap->size = snap_size;
34b13184 2096 snap->features = snap_features;
4e891e0a
AE
2097
2098 return snap;
2099
dfc5606d
YS
2100err:
2101 kfree(snap->name);
2102 kfree(snap);
4e891e0a
AE
2103
2104 return ERR_PTR(ret);
dfc5606d
YS
2105}
2106
cd892126
AE
2107static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2108 u64 *snap_size, u64 *snap_features)
2109{
2110 char *snap_name;
2111
2112 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2113
2114 *snap_size = rbd_dev->header.snap_sizes[which];
2115 *snap_features = 0; /* No features for v1 */
2116
2117 /* Skip over names until we find the one we are looking for */
2118
2119 snap_name = rbd_dev->header.snap_names;
2120 while (which--)
2121 snap_name += strlen(snap_name) + 1;
2122
2123 return snap_name;
2124}
2125
9d475de5
AE
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* On-wire reply layout of the "get_size" class method */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	/* Synchronous "rbd"/"get_size" call against the header object */
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2159
/* Fetch the base image's size and object order into the in-core header */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2166
1e130199
AE
/*
 * Fetch the object name prefix (a.k.a. block_name) for a format 2
 * image from its header object and record it in the in-core
 * header.  Returns 0 on success or a negative errno.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;	/* rbd_req_sync_exec() can return positive */

	/* The reply is a length-prefixed string; duplicate it */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2204
b1b5402a
AE
/*
 * Fetch the feature bits for the given snapshot (or for the base
 * image when snap_id is CEPH_NOSNAP).  Fails with -ENOTSUPP if the
 * image requires any incompatible feature this client does not
 * implement.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	struct {
		__le64 features;	/* all features in use */
		__le64 incompat;	/* subset required to use the image */
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse the image if it needs features we don't support */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENOTSUPP;

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2238
/* Fetch the base image's feature mask into the in-core header */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
2244
6e14b1a6 2245static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
35d489f9
AE
2246{
2247 size_t size;
2248 int ret;
2249 void *reply_buf;
2250 void *p;
2251 void *end;
2252 u64 seq;
2253 u32 snap_count;
2254 struct ceph_snap_context *snapc;
2255 u32 i;
2256
2257 /*
2258 * We'll need room for the seq value (maximum snapshot id),
2259 * snapshot count, and array of that many snapshot ids.
2260 * For now we have a fixed upper limit on the number we're
2261 * prepared to receive.
2262 */
2263 size = sizeof (__le64) + sizeof (__le32) +
2264 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2265 reply_buf = kzalloc(size, GFP_KERNEL);
2266 if (!reply_buf)
2267 return -ENOMEM;
2268
2269 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2270 "rbd", "get_snapcontext",
2271 NULL, 0,
2272 reply_buf, size,
6e14b1a6 2273 CEPH_OSD_FLAG_READ, ver);
35d489f9
AE
2274 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2275 if (ret < 0)
2276 goto out;
2277
2278 ret = -ERANGE;
2279 p = reply_buf;
2280 end = (char *) reply_buf + size;
2281 ceph_decode_64_safe(&p, end, seq, out);
2282 ceph_decode_32_safe(&p, end, snap_count, out);
2283
2284 /*
2285 * Make sure the reported number of snapshot ids wouldn't go
2286 * beyond the end of our buffer. But before checking that,
2287 * make sure the computed size of the snapshot context we
2288 * allocate is representable in a size_t.
2289 */
2290 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2291 / sizeof (u64)) {
2292 ret = -EINVAL;
2293 goto out;
2294 }
2295 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2296 goto out;
2297
2298 size = sizeof (struct ceph_snap_context) +
2299 snap_count * sizeof (snapc->snaps[0]);
2300 snapc = kmalloc(size, GFP_KERNEL);
2301 if (!snapc) {
2302 ret = -ENOMEM;
2303 goto out;
2304 }
2305
2306 atomic_set(&snapc->nref, 1);
2307 snapc->seq = seq;
2308 snapc->num_snaps = snap_count;
2309 for (i = 0; i < snap_count; i++)
2310 snapc->snaps[i] = ceph_decode_64(&p);
2311
2312 rbd_dev->header.snapc = snapc;
2313
2314 dout(" snap context seq = %llu, snap_count = %u\n",
2315 (unsigned long long) seq, (unsigned int) snap_count);
2316
2317out:
2318 kfree(reply_buf);
2319
2320 return 0;
2321}
2322
b8b1e2db
AE
/*
 * Look up the name of the snapshot at position "which" in the
 * snapshot context by asking the OSD.  Returns a dynamically
 * allocated copy of the name (owned by the caller) or a
 * pointer-coded errno.
 */
static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	size_t size;
	void *reply_buf;
	__le64 snap_id;
	int ret;
	void *p;
	void *end;
	size_t snap_name_len;
	char *snap_name;

	/* Room for a length prefix plus a maximum-length name */
	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapshot_name",
				(char *) &snap_id, sizeof (snap_id),
				reply_buf, size,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	/* Reply is a length-prefixed string; duplicate it */
	p = reply_buf;
	end = (char *) reply_buf + size;
	snap_name_len = 0;
	snap_name = ceph_extract_encoded_string(&p, end, &snap_name_len,
						GFP_KERNEL);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out;
	} else {
		dout(" snap_id 0x%016llx snap_name = %s\n",
			(unsigned long long) le64_to_cpu(snap_id), snap_name);
	}
	kfree(reply_buf);

	return snap_name;
out:
	kfree(reply_buf);

	return ERR_PTR(ret);
}
2369
2370static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2371 u64 *snap_size, u64 *snap_features)
2372{
2373 __le64 snap_id;
2374 u8 order;
2375 int ret;
2376
2377 snap_id = rbd_dev->header.snapc->snaps[which];
2378 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2379 if (ret)
2380 return ERR_PTR(ret);
2381 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2382 if (ret)
2383 return ERR_PTR(ret);
2384
2385 return rbd_dev_v2_snap_name(rbd_dev, which);
2386}
2387
2388static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2389 u64 *snap_size, u64 *snap_features)
2390{
2391 if (rbd_dev->image_format == 1)
2392 return rbd_dev_v1_snap_info(rbd_dev, which,
2393 snap_size, snap_features);
2394 if (rbd_dev->image_format == 2)
2395 return rbd_dev_v2_snap_info(rbd_dev, which,
2396 snap_size, snap_features);
2397 return ERR_PTR(-EINVAL);
2398}
2399
117973fb
AE
2400static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2401{
2402 int ret;
2403 __u8 obj_order;
2404
2405 down_write(&rbd_dev->header_rwsem);
2406
2407 /* Grab old order first, to see if it changes */
2408
2409 obj_order = rbd_dev->header.obj_order,
2410 ret = rbd_dev_v2_image_size(rbd_dev);
2411 if (ret)
2412 goto out;
2413 if (rbd_dev->header.obj_order != obj_order) {
2414 ret = -EIO;
2415 goto out;
2416 }
2417 rbd_update_mapping_size(rbd_dev);
2418
2419 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2420 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2421 if (ret)
2422 goto out;
2423 ret = rbd_dev_snaps_update(rbd_dev);
2424 dout("rbd_dev_snaps_update returned %d\n", ret);
2425 if (ret)
2426 goto out;
2427 ret = rbd_dev_snaps_register(rbd_dev);
2428 dout("rbd_dev_snaps_register returned %d\n", ret);
2429out:
2430 up_write(&rbd_dev->header_rwsem);
2431
2432 return ret;
2433}
2434
/*
 * Scan the rbd device's current snapshot list and compare it to the
 * newly-received snapshot context.  Remove any existing snapshots
 * not present in the new snapshot context.  Add a new snapshot for
 * any snaphots in the snapshot context not in the current list.
 * And verify there are no changes to snapshots we already know
 * about.
 *
 * Assumes the snapshots in the snapshot context are sorted by
 * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
 * are also maintained in that order.)
 */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;

	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
	/*
	 * Classic merge of two sorted sequences: "index" walks the
	 * new snapshot context, "links" walks the existing list.
	 * CEPH_NOSNAP acts as the sentinel for an exhausted context.
	 */
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;
		char *snap_name;
		u64 snap_size = 0;
		u64 snap_features = 0;

		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		rbd_assert(!snap || snap->id != CEPH_NOSNAP);

		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/* Existing snapshot not in the new snap context */

			/* If it was the mapped snapshot, mark it gone */
			if (rbd_dev->mapping.snap_id == snap->id)
				rbd_dev->mapping.snap_exists = false;
			__rbd_remove_snap_dev(snap);
			dout("%ssnap id %llu has been removed\n",
				rbd_dev->mapping.snap_id == snap->id ?
							"mapped " : "",
				(unsigned long long) snap->id);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		snap_name = rbd_dev_snap_info(rbd_dev, index,
					&snap_size, &snap_features);
		if (IS_ERR(snap_name))
			return PTR_ERR(snap_name);

		/* NOTE(review): prints snap_count, not index — looks like it
		 * was meant to be the entry number; confirm before relying
		 * on this debug output. */
		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
			(unsigned long long) snap_id);
		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
					snap_id, snap_size, snap_features);
			if (IS_ERR(new_snap)) {
				int err = PTR_ERR(new_snap);

				dout(" failed to add dev, error %d\n", err);

				return err;
			}

			/* New goes before existing, or at end of list */

			dout(" added dev%s\n", snap ? "" : " at end\n");
			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add_tail(&new_snap->node, head);
		} else {
			/* Already have this one */

			dout(" already present\n");

			/* Known snapshots must not change behind our back */
			rbd_assert(snap->size == snap_size);
			rbd_assert(!strcmp(snap->name, snap_name));
			rbd_assert(snap->features == snap_features);

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
	}
	dout("%s: done\n", __func__);

	return 0;
}
2539
304f6808
AE
2540/*
2541 * Scan the list of snapshots and register the devices for any that
2542 * have not already been registered.
2543 */
2544static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2545{
2546 struct rbd_snap *snap;
2547 int ret = 0;
2548
2549 dout("%s called\n", __func__);
86ff77bb
AE
2550 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2551 return -EIO;
304f6808
AE
2552
2553 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2554 if (!rbd_snap_registered(snap)) {
2555 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2556 if (ret < 0)
2557 break;
2558 }
2559 }
2560 dout("%s: returning %d\n", __func__, ret);
2561
2562 return ret;
2563}
2564
dfc5606d
YS
/*
 * Register the rbd device in sysfs under the rbd bus.  The device
 * is named by its numeric id and parented under rbd_root_dev.
 * Serialized against other control operations via ctl_mutex.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	struct device *dev;
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	dev = &rbd_dev->dev;
	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;	/* called when last ref drops */
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);

	mutex_unlock(&ctl_mutex);

	return ret;
}
2584
dfc5606d
YS
/*
 * Unregister the rbd device from sysfs; rbd_dev_release() (set as
 * the device's release callback) runs once the last reference is
 * dropped.
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2589
59c2be1e
YS
/*
 * Register a watch on the image's header object.  If the OSD
 * rejects the watch with -ERANGE, refresh our cached header and
 * retry; any refresh failure aborts the loop.
 */
static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
{
	int ret, rc;

	do {
		ret = rbd_req_sync_watch(rbd_dev);
		if (ret == -ERANGE) {
			rc = rbd_dev_refresh(rbd_dev, NULL);
			if (rc < 0)
				return rc;
		}
	} while (ret == -ERANGE);

	return ret;
}
2605
/* Highest device id handed out so far; ids start at 1 */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* Atomic increment makes the id unique without the list lock */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
b7f23c36 2622
/*
 * Remove an rbd_dev from the global list, and record that its
 * identifier is no longer in use.
 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	/* NOTE(review): dev_id is 64-bit but narrowed to int here —
	 * assumes ids stay small; confirm. */
	int rbd_id = rbd_dev->dev_id;
	int max_id;

	rbd_assert(rbd_id > 0);

	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		struct rbd_device *rbd_dev;

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id > max_id)
			max_id = rbd_dev->dev_id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_dev_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case.
	 */
	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
	dout(" max dev id has been reset\n");
}
2673
e28fff26
AE
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * Characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* skip leading whitespace */

	return strcspn(*buf, spaces);	/* length of the token found */
}
2692
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	/* Copy only when the caller's buffer can hold token + NUL */
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2722
ea3352f4
AE
2723/*
2724 * Finds the next token in *buf, dynamically allocates a buffer big
2725 * enough to hold a copy of it, and copies the token into the new
2726 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2727 * that a duplicate buffer is created even for a zero-length token.
2728 *
2729 * Returns a pointer to the newly-allocated duplicate, or a null
2730 * pointer if memory for the duplicate was not available. If
2731 * the lenp argument is a non-null pointer, the length of the token
2732 * (not including the '\0') is returned in *lenp.
2733 *
2734 * If successful, the *buf pointer will be updated to point beyond
2735 * the end of the found token.
2736 *
2737 * Note: uses GFP_KERNEL for allocation.
2738 */
2739static inline char *dup_token(const char **buf, size_t *lenp)
2740{
2741 char *dup;
2742 size_t len;
2743
2744 len = next_token(buf);
2745 dup = kmalloc(len + 1, GFP_KERNEL);
2746 if (!dup)
2747 return NULL;
2748
2749 memcpy(dup, *buf, len);
2750 *(dup + len) = '\0';
2751 *buf += len;
2752
2753 if (lenp)
2754 *lenp = len;
2755
2756 return dup;
2757}
2758
/*
 * This fills in the pool_name, image_name, image_name_len, rbd_dev,
 * rbd_md_name, and name fields of the given rbd_dev, based on the
 * list of monitor addresses and other options provided via
 * /sys/bus/rbd/add.  Returns a pointer to a dynamically-allocated
 * copy of the snapshot name to map if successful, or a
 * pointer-coded error otherwise.
 *
 * Note: rbd_dev is assumed to have been initially zero-filled.
 */
static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
				const char *buf,
				const char **mon_addrs,
				size_t *mon_addrs_size,
				char *options,
				size_t options_size)
{
	size_t len;
	char *err_ptr = ERR_PTR(-EINVAL);
	char *snap_name;

	/* The first four tokens are required */

	/* Monitor addresses: returned as a pointer into buf, not a copy */
	len = next_token(&buf);
	if (!len)
		return err_ptr;
	*mon_addrs_size = len + 1;	/* includes room for terminator */
	*mon_addrs = buf;

	buf += len;

	/* Options string is copied into the caller-supplied buffer */
	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return err_ptr;

	/* Remaining failures below are allocation failures */
	err_ptr = ERR_PTR(-ENOMEM);
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Snapshot name is optional; default is to use "head" */

	len = next_token(&buf);
	if (len > RBD_MAX_SNAP_NAME_LEN) {
		err_ptr = ERR_PTR(-ENAMETOOLONG);
		goto out_err;
	}
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	}
	snap_name = kmalloc(len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_err;
	memcpy(snap_name, buf, len);
	*(snap_name + len) = '\0';

	return snap_name;

out_err:
	/* Undo partial initialization so the fields stay NULL/0 on error */
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return err_ptr;
}
2831
589d30e0
AE
/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;	/* rbd_req_sync_exec() can return positive */

	/* Duplicate the length-prefixed id string from the response */
	p = response;
	rbd_dev->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						&rbd_dev->image_id_len,
						GFP_NOIO);
	if (IS_ERR(rbd_dev->image_id)) {
		ret = PTR_ERR(rbd_dev->image_id);
		rbd_dev->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
2901
a30b71b9
AE
/*
 * Probe the image assuming rbd format 1: record an empty image id,
 * derive the header object name ("<image_name><RBD_SUFFIX>"), and
 * read the on-disk header into the in-core one.  Returns 0 on
 * success; on error the fields set here are reset to NULL.
 */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->image_id)
		return -ENOMEM;
	rbd_dev->image_id_len = 0;

	/* Record the header object name for this rbd image. */

	size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;
	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->image_id);
	rbd_dev->image_id = NULL;

	return ret;
}
2944
/*
 * Probe the image assuming rbd format 2.  The image id must
 * already have been filled in by the caller.  Derives the header
 * object name, then pulls size/order, object prefix, features, and
 * the snapshot context (plus header object version) from the
 * header object.  Returns 0 on success; on error the fields set
 * here are reset to NULL.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len;
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3006
/*
 * Probe for the existence of the header object for the given rbd
 * device.  For format 2 images this includes determining the image
 * id.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * Get the id from the image id object.  If it's not a
	 * format 2 image, we'll get ENOENT back, and we'll assume
	 * it's a format 1 image.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	ret = ret ? rbd_dev_v1_probe(rbd_dev) : rbd_dev_v2_probe(rbd_dev);
	if (ret)
		dout("probe failed, returning %d\n", ret);

	return ret;
}
3031
59c2be1e
YS
/*
 * sysfs "add" handler (/sys/bus/rbd/add): parse the user-supplied
 * specification, connect to the cluster, probe the image, and
 * bring up the block device.  Returns "count" on success or a
 * negative errno.  Error unwinding is via the chain of labels at
 * the bottom, in reverse order of setup; once rbd_bus_add_dev()
 * has succeeded, teardown is delegated to the sysfs release path.
 */
static ssize_t rbd_add(struct bus_type *bus,
			const char *buf,
			size_t count)
{
	char *options;
	struct rbd_device *rbd_dev = NULL;
	const char *mon_addrs = NULL;
	size_t mon_addrs_size = 0;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;
	char *snap_name;

	/* Hold a module reference while the device exists */
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	options = kmalloc(count, GFP_KERNEL);
	if (!options)
		goto err_out_mem;
	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		goto err_out_mem;

	/* static rbd_device initialization */
	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	/* parse add command */
	snap_name = rbd_add_parse_args(rbd_dev, buf,
				&mon_addrs, &mon_addrs_size, options, count);
	if (IS_ERR(snap_name)) {
		rc = PTR_ERR(snap_name);
		goto err_out_mem;
	}

	rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
	if (rc < 0)
		goto err_out_args;

	/* pick the pool */
	osdc = &rbd_dev->rbd_client->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
	if (rc < 0)
		goto err_out_client;
	rbd_dev->pool_id = rc;

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_client;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = rbd_dev_snaps_update(rbd_dev);
	if (rc)
		goto err_out_header;

	/* NOTE(review): presumably snap_name ownership passes to the
	 * mapping here (freed later as mapping.snap_name); confirm. */
	rc = rbd_dev_set_mapping(rbd_dev, snap_name);
	if (rc)
		goto err_out_header;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	rc = register_blkdev(0, rbd_dev->name);
	if (rc < 0)
		goto err_out_id;
	rbd_dev->major = rc;

	/* Set up the blkdev mapping. */

	rc = rbd_init_disk(rbd_dev);
	if (rc)
		goto err_out_blkdev;

	rc = rbd_bus_add_dev(rbd_dev);
	if (rc)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */

	down_write(&rbd_dev->header_rwsem);
	rc = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (rc)
		goto err_out_bus;

	rc = rbd_init_watch_dev(rbd_dev);
	if (rc)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return count;

err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);
	kfree(options);
	return rc;

err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_header:
	rbd_header_free(&rbd_dev->header);
err_out_client:
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev);
	kfree(rbd_dev->image_id);
err_out_args:
	kfree(rbd_dev->mapping.snap_name);
	kfree(rbd_dev->image_name);
	kfree(rbd_dev->pool_name);
err_out_mem:
	kfree(rbd_dev);
	kfree(options);

	dout("Error adding device %s\n", buf);
	module_put(THIS_MODULE);

	return (ssize_t) rc;
}
3173
de71a297 3174static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
602adf40
YS
3175{
3176 struct list_head *tmp;
3177 struct rbd_device *rbd_dev;
3178
e124a82f 3179 spin_lock(&rbd_dev_list_lock);
602adf40
YS
3180 list_for_each(tmp, &rbd_dev_list) {
3181 rbd_dev = list_entry(tmp, struct rbd_device, node);
de71a297 3182 if (rbd_dev->dev_id == dev_id) {
e124a82f 3183 spin_unlock(&rbd_dev_list_lock);
602adf40 3184 return rbd_dev;
e124a82f 3185 }
602adf40 3186 }
e124a82f 3187 spin_unlock(&rbd_dev_list_lock);
602adf40
YS
3188 return NULL;
3189}
3190
/*
 * Device-model release callback for an rbd device, invoked when the
 * last reference to the embedded struct device is dropped.
 *
 * Teardown order matters: the lingering watch request and watch event
 * are torn down while the ceph client is still valid, the client is
 * dropped before the block device goes away, and the device id is
 * released only after everything named by it has been freed.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop the lingering OSD watch request, if one was registered */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						rbd_dev->watch_request);
	}
	/* Tear down the watch event itself */
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	/* Drop our reference on the ceph client */
	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->mapping.snap_name);
	kfree(rbd_dev->image_id);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_dev_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
3225
dfc5606d
YS
3226static ssize_t rbd_remove(struct bus_type *bus,
3227 const char *buf,
3228 size_t count)
602adf40
YS
3229{
3230 struct rbd_device *rbd_dev = NULL;
3231 int target_id, rc;
3232 unsigned long ul;
3233 int ret = count;
3234
3235 rc = strict_strtoul(buf, 10, &ul);
3236 if (rc)
3237 return rc;
3238
3239 /* convert to int; abort if we lost anything in the conversion */
3240 target_id = (int) ul;
3241 if (target_id != ul)
3242 return -EINVAL;
3243
3244 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3245
3246 rbd_dev = __rbd_get_dev(target_id);
3247 if (!rbd_dev) {
3248 ret = -ENOENT;
3249 goto done;
3250 }
3251
dfc5606d
YS
3252 __rbd_remove_all_snaps(rbd_dev);
3253 rbd_bus_del_dev(rbd_dev);
602adf40
YS
3254
3255done:
3256 mutex_unlock(&ctl_mutex);
aafb230e 3257
602adf40
YS
3258 return ret;
3259}
3260
602adf40
YS
3261/*
3262 * create control files in sysfs
dfc5606d 3263 * /sys/bus/rbd/...
602adf40
YS
3264 */
3265static int rbd_sysfs_init(void)
3266{
dfc5606d 3267 int ret;
602adf40 3268
fed4c143 3269 ret = device_register(&rbd_root_dev);
21079786 3270 if (ret < 0)
dfc5606d 3271 return ret;
602adf40 3272
fed4c143
AE
3273 ret = bus_register(&rbd_bus_type);
3274 if (ret < 0)
3275 device_unregister(&rbd_root_dev);
602adf40 3276
602adf40
YS
3277 return ret;
3278}
3279
/*
 * Remove the sysfs control files: unregister the bus before the root
 * device it hangs off of (reverse order of rbd_sysfs_init()).
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3285
3286int __init rbd_init(void)
3287{
3288 int rc;
3289
3290 rc = rbd_sysfs_init();
3291 if (rc)
3292 return rc;
f0f8cef5 3293 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
602adf40
YS
3294 return 0;
3295}
3296
/* Module teardown: remove the sysfs bus and root device. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3301
/* Module entry/exit points */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");