rbd: rename snap_exists field
[linux-block.git] / drivers / block / rbd.c
CommitLineData
602adf40
YS
1/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
dfc5606d 24 For usage instructions, please refer to:
602adf40 25
dfc5606d 26 Documentation/ABI/testing/sysfs-bus-rbd
602adf40
YS
27
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
59c2be1e 34#include <linux/parser.h>
602adf40
YS
35
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
aafb230e
AE
44#define RBD_DEBUG /* Activate rbd_assert() calls */
45
593a9e7b
AE
46/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
df111be6
AE
55/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
f0f8cef5
AE
59#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
602adf40
YS
61
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
d4b125e9
AE
64#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
65#define RBD_MAX_SNAP_NAME_LEN \
66 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
67
35d489f9 68#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
602adf40
YS
69#define RBD_MAX_OPT_LEN 1024
70
71#define RBD_SNAP_HEAD_NAME "-"
72
1e130199
AE
73#define RBD_IMAGE_ID_LEN_MAX 64
74#define RBD_OBJ_PREFIX_LEN_MAX 64
589d30e0 75
d889140c
AE
76/* Feature bits */
77
78#define RBD_FEATURE_LAYERING 1
79
80/* Features supported by this (client software) implementation. */
81
82#define RBD_FEATURES_ALL (0)
83
81a89793
AE
84/*
85 * An RBD device name will be "rbd#", where the "rbd" comes from
86 * RBD_DRV_NAME above, and # is a unique integer identifier.
87 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
88 * enough to hold all possible device names.
89 */
602adf40 90#define DEV_NAME_LEN 32
81a89793 91#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
602adf40 92
cc0538b6 93#define RBD_READ_ONLY_DEFAULT false
59c2be1e 94
602adf40
YS
95/*
96 * block device image metadata (in-memory version)
97 */
98struct rbd_image_header {
f84344f3 99 /* These four fields never change for a given rbd image */
849b4260 100 char *object_prefix;
34b13184 101 u64 features;
602adf40
YS
102 __u8 obj_order;
103 __u8 crypt_type;
104 __u8 comp_type;
602adf40 105
f84344f3
AE
106 /* The remaining fields need to be updated occasionally */
107 u64 image_size;
108 struct ceph_snap_context *snapc;
602adf40
YS
109 char *snap_names;
110 u64 *snap_sizes;
59c2be1e
YS
111
112 u64 obj_version;
113};
114
115struct rbd_options {
cc0538b6 116 bool read_only;
602adf40
YS
117};
118
119/*
f0f8cef5 120 * an instance of the client. multiple devices may share an rbd client.
602adf40
YS
121 */
122struct rbd_client {
123 struct ceph_client *client;
124 struct kref kref;
125 struct list_head node;
126};
127
128/*
f0f8cef5 129 * a request completion status
602adf40 130 */
1fec7093
YS
131struct rbd_req_status {
132 int done;
133 int rc;
134 u64 bytes;
135};
136
137/*
138 * a collection of requests
139 */
140struct rbd_req_coll {
141 int total;
142 int num_done;
143 struct kref kref;
144 struct rbd_req_status status[0];
602adf40
YS
145};
146
f0f8cef5
AE
147/*
148 * a single io request
149 */
150struct rbd_request {
151 struct request *rq; /* blk layer request */
152 struct bio *bio; /* cloned bio */
153 struct page **pages; /* list of used pages */
154 u64 len;
155 int coll_index;
156 struct rbd_req_coll *coll;
157};
158
dfc5606d
YS
159struct rbd_snap {
160 struct device dev;
161 const char *name;
3591538f 162 u64 size;
dfc5606d
YS
163 struct list_head node;
164 u64 id;
34b13184 165 u64 features;
dfc5606d
YS
166};
167
f84344f3 168struct rbd_mapping {
99c1f08f 169 u64 size;
34b13184 170 u64 features;
f84344f3
AE
171 bool read_only;
172};
173
602adf40
YS
174/*
175 * a single device
176 */
177struct rbd_device {
de71a297 178 int dev_id; /* blkdev unique id */
602adf40
YS
179
180 int major; /* blkdev assigned major */
181 struct gendisk *disk; /* blkdev's gendisk and rq */
602adf40 182
a30b71b9 183 u32 image_format; /* Either 1 or 2 */
602adf40
YS
184 struct rbd_client *rbd_client;
185
186 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
187
188 spinlock_t lock; /* queue lock */
189
190 struct rbd_image_header header;
daba5fdb 191 bool exists;
589d30e0
AE
192 char *image_id;
193 size_t image_id_len;
0bed54dc
AE
194 char *image_name;
195 size_t image_name_len;
196 char *header_name;
d22f76e7 197 char *pool_name;
86992098 198 u64 pool_id;
602adf40 199
971f839a
AE
200 char *snap_name;
201 u64 snap_id;
202
59c2be1e
YS
203 struct ceph_osd_event *watch_event;
204 struct ceph_osd_request *watch_request;
205
c666601a
JD
206 /* protects updating the header */
207 struct rw_semaphore header_rwsem;
f84344f3
AE
208
209 struct rbd_mapping mapping;
602adf40
YS
210
211 struct list_head node;
dfc5606d
YS
212
213 /* list of snapshots */
214 struct list_head snaps;
215
216 /* sysfs related */
217 struct device dev;
218};
219
602adf40 220static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
e124a82f 221
602adf40 222static LIST_HEAD(rbd_dev_list); /* devices */
e124a82f
AE
223static DEFINE_SPINLOCK(rbd_dev_list_lock);
224
432b8587
AE
225static LIST_HEAD(rbd_client_list); /* clients */
226static DEFINE_SPINLOCK(rbd_client_list_lock);
602adf40 227
304f6808
AE
228static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
229static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
230
dfc5606d 231static void rbd_dev_release(struct device *dev);
41f38c2b 232static void rbd_remove_snap_dev(struct rbd_snap *snap);
dfc5606d 233
f0f8cef5
AE
234static ssize_t rbd_add(struct bus_type *bus, const char *buf,
235 size_t count);
236static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
237 size_t count);
238
239static struct bus_attribute rbd_bus_attrs[] = {
240 __ATTR(add, S_IWUSR, NULL, rbd_add),
241 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
242 __ATTR_NULL
243};
244
245static struct bus_type rbd_bus_type = {
246 .name = "rbd",
247 .bus_attrs = rbd_bus_attrs,
248};
249
250static void rbd_root_dev_release(struct device *dev)
251{
252}
253
254static struct device rbd_root_dev = {
255 .init_name = "rbd",
256 .release = rbd_root_dev_release,
257};
258
aafb230e
AE
259#ifdef RBD_DEBUG
260#define rbd_assert(expr) \
261 if (unlikely(!(expr))) { \
262 printk(KERN_ERR "\nAssertion failure in %s() " \
263 "at line %d:\n\n" \
264 "\trbd_assert(%s);\n\n", \
265 __func__, __LINE__, #expr); \
266 BUG(); \
267 }
268#else /* !RBD_DEBUG */
269# define rbd_assert(expr) ((void) 0)
270#endif /* !RBD_DEBUG */
dfc5606d 271
dfc5606d
YS
272static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
273{
274 return get_device(&rbd_dev->dev);
275}
276
277static void rbd_put_dev(struct rbd_device *rbd_dev)
278{
279 put_device(&rbd_dev->dev);
280}
602adf40 281
117973fb
AE
282static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
283static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
59c2be1e 284
602adf40
YS
285static int rbd_open(struct block_device *bdev, fmode_t mode)
286{
f0f8cef5 287 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
602adf40 288
f84344f3 289 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
602adf40
YS
290 return -EROFS;
291
340c7a2b 292 rbd_get_dev(rbd_dev);
f84344f3 293 set_device_ro(bdev, rbd_dev->mapping.read_only);
340c7a2b 294
602adf40
YS
295 return 0;
296}
297
dfc5606d
YS
298static int rbd_release(struct gendisk *disk, fmode_t mode)
299{
300 struct rbd_device *rbd_dev = disk->private_data;
301
302 rbd_put_dev(rbd_dev);
303
304 return 0;
305}
306
602adf40
YS
307static const struct block_device_operations rbd_bd_ops = {
308 .owner = THIS_MODULE,
309 .open = rbd_open,
dfc5606d 310 .release = rbd_release,
602adf40
YS
311};
312
313/*
314 * Initialize an rbd client instance.
43ae4701 315 * We own *ceph_opts.
602adf40 316 */
f8c38929 317static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
602adf40
YS
318{
319 struct rbd_client *rbdc;
320 int ret = -ENOMEM;
321
322 dout("rbd_client_create\n");
323 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
324 if (!rbdc)
325 goto out_opt;
326
327 kref_init(&rbdc->kref);
328 INIT_LIST_HEAD(&rbdc->node);
329
bc534d86
AE
330 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
331
43ae4701 332 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
602adf40 333 if (IS_ERR(rbdc->client))
bc534d86 334 goto out_mutex;
43ae4701 335 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
602adf40
YS
336
337 ret = ceph_open_session(rbdc->client);
338 if (ret < 0)
339 goto out_err;
340
432b8587 341 spin_lock(&rbd_client_list_lock);
602adf40 342 list_add_tail(&rbdc->node, &rbd_client_list);
432b8587 343 spin_unlock(&rbd_client_list_lock);
602adf40 344
bc534d86
AE
345 mutex_unlock(&ctl_mutex);
346
602adf40
YS
347 dout("rbd_client_create created %p\n", rbdc);
348 return rbdc;
349
350out_err:
351 ceph_destroy_client(rbdc->client);
bc534d86
AE
352out_mutex:
353 mutex_unlock(&ctl_mutex);
602adf40
YS
354 kfree(rbdc);
355out_opt:
43ae4701
AE
356 if (ceph_opts)
357 ceph_destroy_options(ceph_opts);
28f259b7 358 return ERR_PTR(ret);
602adf40
YS
359}
360
361/*
1f7ba331
AE
362 * Find a ceph client with specific addr and configuration. If
363 * found, bump its reference count.
602adf40 364 */
1f7ba331 365static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
602adf40
YS
366{
367 struct rbd_client *client_node;
1f7ba331 368 bool found = false;
602adf40 369
43ae4701 370 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
602adf40
YS
371 return NULL;
372
1f7ba331
AE
373 spin_lock(&rbd_client_list_lock);
374 list_for_each_entry(client_node, &rbd_client_list, node) {
375 if (!ceph_compare_options(ceph_opts, client_node->client)) {
376 kref_get(&client_node->kref);
377 found = true;
378 break;
379 }
380 }
381 spin_unlock(&rbd_client_list_lock);
382
383 return found ? client_node : NULL;
602adf40
YS
384}
385
59c2be1e
YS
386/*
387 * mount options
388 */
389enum {
59c2be1e
YS
390 Opt_last_int,
391 /* int args above */
392 Opt_last_string,
393 /* string args above */
cc0538b6
AE
394 Opt_read_only,
395 Opt_read_write,
396 /* Boolean args above */
397 Opt_last_bool,
59c2be1e
YS
398};
399
43ae4701 400static match_table_t rbd_opts_tokens = {
59c2be1e
YS
401 /* int args above */
402 /* string args above */
be466c1c 403 {Opt_read_only, "read_only"},
cc0538b6
AE
404 {Opt_read_only, "ro"}, /* Alternate spelling */
405 {Opt_read_write, "read_write"},
406 {Opt_read_write, "rw"}, /* Alternate spelling */
407 /* Boolean args above */
59c2be1e
YS
408 {-1, NULL}
409};
410
411static int parse_rbd_opts_token(char *c, void *private)
412{
43ae4701 413 struct rbd_options *rbd_opts = private;
59c2be1e
YS
414 substring_t argstr[MAX_OPT_ARGS];
415 int token, intval, ret;
416
43ae4701 417 token = match_token(c, rbd_opts_tokens, argstr);
59c2be1e
YS
418 if (token < 0)
419 return -EINVAL;
420
421 if (token < Opt_last_int) {
422 ret = match_int(&argstr[0], &intval);
423 if (ret < 0) {
424 pr_err("bad mount option arg (not int) "
425 "at '%s'\n", c);
426 return ret;
427 }
428 dout("got int token %d val %d\n", token, intval);
429 } else if (token > Opt_last_int && token < Opt_last_string) {
430 dout("got string token %d val %s\n", token,
431 argstr[0].from);
cc0538b6
AE
432 } else if (token > Opt_last_string && token < Opt_last_bool) {
433 dout("got Boolean token %d\n", token);
59c2be1e
YS
434 } else {
435 dout("got token %d\n", token);
436 }
437
438 switch (token) {
cc0538b6
AE
439 case Opt_read_only:
440 rbd_opts->read_only = true;
441 break;
442 case Opt_read_write:
443 rbd_opts->read_only = false;
444 break;
59c2be1e 445 default:
aafb230e
AE
446 rbd_assert(false);
447 break;
59c2be1e
YS
448 }
449 return 0;
450}
451
602adf40
YS
452/*
453 * Get a ceph client with specific addr and configuration, if one does
454 * not exist create it.
455 */
f8c38929
AE
456static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
457 size_t mon_addr_len, char *options)
602adf40 458{
069a4b56 459 struct rbd_options rbd_opts;
43ae4701 460 struct ceph_options *ceph_opts;
f8c38929 461 struct rbd_client *rbdc;
59c2be1e 462
069a4b56
AE
463 /* Initialize all rbd options to the defaults */
464
465 rbd_opts.read_only = RBD_READ_ONLY_DEFAULT;
602adf40 466
43ae4701
AE
467 ceph_opts = ceph_parse_options(options, mon_addr,
468 mon_addr + mon_addr_len,
069a4b56 469 parse_rbd_opts_token, &rbd_opts);
f8c38929
AE
470 if (IS_ERR(ceph_opts))
471 return PTR_ERR(ceph_opts);
602adf40 472
069a4b56
AE
473 /* Record the parsed rbd options */
474
475 rbd_dev->mapping.read_only = rbd_opts.read_only;
476
1f7ba331 477 rbdc = rbd_client_find(ceph_opts);
602adf40 478 if (rbdc) {
602adf40 479 /* using an existing client */
43ae4701 480 ceph_destroy_options(ceph_opts);
f8c38929
AE
481 } else {
482 rbdc = rbd_client_create(ceph_opts);
483 if (IS_ERR(rbdc))
484 return PTR_ERR(rbdc);
602adf40 485 }
f8c38929 486 rbd_dev->rbd_client = rbdc;
602adf40 487
f8c38929 488 return 0;
602adf40
YS
489}
490
491/*
492 * Destroy ceph client
d23a4b3f 493 *
432b8587 494 * Caller must hold rbd_client_list_lock.
602adf40
YS
495 */
496static void rbd_client_release(struct kref *kref)
497{
498 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
499
500 dout("rbd_release_client %p\n", rbdc);
cd9d9f5d 501 spin_lock(&rbd_client_list_lock);
602adf40 502 list_del(&rbdc->node);
cd9d9f5d 503 spin_unlock(&rbd_client_list_lock);
602adf40
YS
504
505 ceph_destroy_client(rbdc->client);
506 kfree(rbdc);
507}
508
509/*
510 * Drop reference to ceph client node. If it's not referenced anymore, release
511 * it.
512 */
513static void rbd_put_client(struct rbd_device *rbd_dev)
514{
515 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
516 rbd_dev->rbd_client = NULL;
602adf40
YS
517}
518
1fec7093
YS
519/*
520 * Destroy requests collection
521 */
522static void rbd_coll_release(struct kref *kref)
523{
524 struct rbd_req_coll *coll =
525 container_of(kref, struct rbd_req_coll, kref);
526
527 dout("rbd_coll_release %p\n", coll);
528 kfree(coll);
529}
602adf40 530
a30b71b9
AE
531static bool rbd_image_format_valid(u32 image_format)
532{
533 return image_format == 1 || image_format == 2;
534}
535
8e94af8e
AE
536static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
537{
103a150f
AE
538 size_t size;
539 u32 snap_count;
540
541 /* The header has to start with the magic rbd header text */
542 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
543 return false;
544
db2388b6
AE
545 /* The bio layer requires at least sector-sized I/O */
546
547 if (ondisk->options.order < SECTOR_SHIFT)
548 return false;
549
550 /* If we use u64 in a few spots we may be able to loosen this */
551
552 if (ondisk->options.order > 8 * sizeof (int) - 1)
553 return false;
554
103a150f
AE
555 /*
556 * The size of a snapshot header has to fit in a size_t, and
557 * that limits the number of snapshots.
558 */
559 snap_count = le32_to_cpu(ondisk->snap_count);
560 size = SIZE_MAX - sizeof (struct ceph_snap_context);
561 if (snap_count > size / sizeof (__le64))
562 return false;
563
564 /*
565 * Not only that, but the size of the entire the snapshot
566 * header must also be representable in a size_t.
567 */
568 size -= snap_count * sizeof (__le64);
569 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
570 return false;
571
572 return true;
8e94af8e
AE
573}
574
602adf40
YS
575/*
576 * Create a new header structure, translate header format from the on-disk
577 * header.
578 */
579static int rbd_header_from_disk(struct rbd_image_header *header,
4156d998 580 struct rbd_image_header_ondisk *ondisk)
602adf40 581{
ccece235 582 u32 snap_count;
58c17b0e 583 size_t len;
d2bb24e5 584 size_t size;
621901d6 585 u32 i;
602adf40 586
6a52325f
AE
587 memset(header, 0, sizeof (*header));
588
103a150f
AE
589 snap_count = le32_to_cpu(ondisk->snap_count);
590
58c17b0e
AE
591 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
592 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6a52325f 593 if (!header->object_prefix)
602adf40 594 return -ENOMEM;
58c17b0e
AE
595 memcpy(header->object_prefix, ondisk->object_prefix, len);
596 header->object_prefix[len] = '\0';
00f1f36f 597
602adf40 598 if (snap_count) {
f785cc1d
AE
599 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
600
621901d6
AE
601 /* Save a copy of the snapshot names */
602
f785cc1d
AE
603 if (snap_names_len > (u64) SIZE_MAX)
604 return -EIO;
605 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
602adf40 606 if (!header->snap_names)
6a52325f 607 goto out_err;
f785cc1d
AE
608 /*
609 * Note that rbd_dev_v1_header_read() guarantees
610 * the ondisk buffer we're working with has
611 * snap_names_len bytes beyond the end of the
612 * snapshot id array, this memcpy() is safe.
613 */
614 memcpy(header->snap_names, &ondisk->snaps[snap_count],
615 snap_names_len);
6a52325f 616
621901d6
AE
617 /* Record each snapshot's size */
618
d2bb24e5
AE
619 size = snap_count * sizeof (*header->snap_sizes);
620 header->snap_sizes = kmalloc(size, GFP_KERNEL);
602adf40 621 if (!header->snap_sizes)
6a52325f 622 goto out_err;
621901d6
AE
623 for (i = 0; i < snap_count; i++)
624 header->snap_sizes[i] =
625 le64_to_cpu(ondisk->snaps[i].image_size);
602adf40 626 } else {
ccece235 627 WARN_ON(ondisk->snap_names_len);
602adf40
YS
628 header->snap_names = NULL;
629 header->snap_sizes = NULL;
630 }
849b4260 631
34b13184 632 header->features = 0; /* No features support in v1 images */
602adf40
YS
633 header->obj_order = ondisk->options.order;
634 header->crypt_type = ondisk->options.crypt_type;
635 header->comp_type = ondisk->options.comp_type;
6a52325f 636
621901d6
AE
637 /* Allocate and fill in the snapshot context */
638
f84344f3 639 header->image_size = le64_to_cpu(ondisk->image_size);
6a52325f
AE
640 size = sizeof (struct ceph_snap_context);
641 size += snap_count * sizeof (header->snapc->snaps[0]);
642 header->snapc = kzalloc(size, GFP_KERNEL);
643 if (!header->snapc)
644 goto out_err;
602adf40
YS
645
646 atomic_set(&header->snapc->nref, 1);
505cbb9b 647 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
602adf40 648 header->snapc->num_snaps = snap_count;
621901d6
AE
649 for (i = 0; i < snap_count; i++)
650 header->snapc->snaps[i] =
651 le64_to_cpu(ondisk->snaps[i].id);
602adf40
YS
652
653 return 0;
654
6a52325f 655out_err:
849b4260 656 kfree(header->snap_sizes);
ccece235 657 header->snap_sizes = NULL;
602adf40 658 kfree(header->snap_names);
ccece235 659 header->snap_names = NULL;
6a52325f
AE
660 kfree(header->object_prefix);
661 header->object_prefix = NULL;
ccece235 662
00f1f36f 663 return -ENOMEM;
602adf40
YS
664}
665
8836b995 666static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
602adf40 667{
602adf40 668
e86924a8 669 struct rbd_snap *snap;
602adf40 670
e86924a8
AE
671 list_for_each_entry(snap, &rbd_dev->snaps, node) {
672 if (!strcmp(snap_name, snap->name)) {
971f839a 673 rbd_dev->snap_id = snap->id;
e86924a8 674 rbd_dev->mapping.size = snap->size;
34b13184 675 rbd_dev->mapping.features = snap->features;
602adf40 676
e86924a8 677 return 0;
00f1f36f 678 }
00f1f36f 679 }
e86924a8 680
00f1f36f 681 return -ENOENT;
602adf40
YS
682}
683
5ed16177 684static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
602adf40 685{
78dc447d 686 int ret;
602adf40 687
4e1105a2 688 if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
cc9d734c 689 sizeof (RBD_SNAP_HEAD_NAME))) {
971f839a 690 rbd_dev->snap_id = CEPH_NOSNAP;
99c1f08f 691 rbd_dev->mapping.size = rbd_dev->header.image_size;
34b13184 692 rbd_dev->mapping.features = rbd_dev->header.features;
e86924a8 693 ret = 0;
602adf40 694 } else {
8836b995 695 ret = snap_by_name(rbd_dev, snap_name);
602adf40
YS
696 if (ret < 0)
697 goto done;
f84344f3 698 rbd_dev->mapping.read_only = true;
602adf40 699 }
971f839a 700 rbd_dev->snap_name = snap_name;
daba5fdb 701 rbd_dev->exists = true;
602adf40 702done:
602adf40
YS
703 return ret;
704}
705
706static void rbd_header_free(struct rbd_image_header *header)
707{
849b4260 708 kfree(header->object_prefix);
d78fd7ae 709 header->object_prefix = NULL;
602adf40 710 kfree(header->snap_sizes);
d78fd7ae 711 header->snap_sizes = NULL;
849b4260 712 kfree(header->snap_names);
d78fd7ae 713 header->snap_names = NULL;
d1d25646 714 ceph_put_snap_context(header->snapc);
d78fd7ae 715 header->snapc = NULL;
602adf40
YS
716}
717
65ccfe21 718static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
602adf40 719{
65ccfe21
AE
720 char *name;
721 u64 segment;
722 int ret;
602adf40 723
65ccfe21
AE
724 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
725 if (!name)
726 return NULL;
727 segment = offset >> rbd_dev->header.obj_order;
728 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
729 rbd_dev->header.object_prefix, segment);
730 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
731 pr_err("error formatting segment name for #%llu (%d)\n",
732 segment, ret);
733 kfree(name);
734 name = NULL;
735 }
602adf40 736
65ccfe21
AE
737 return name;
738}
602adf40 739
65ccfe21
AE
740static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
741{
742 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
602adf40 743
65ccfe21
AE
744 return offset & (segment_size - 1);
745}
746
747static u64 rbd_segment_length(struct rbd_device *rbd_dev,
748 u64 offset, u64 length)
749{
750 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
751
752 offset &= segment_size - 1;
753
aafb230e 754 rbd_assert(length <= U64_MAX - offset);
65ccfe21
AE
755 if (offset + length > segment_size)
756 length = segment_size - offset;
757
758 return length;
602adf40
YS
759}
760
1fec7093
YS
761static int rbd_get_num_segments(struct rbd_image_header *header,
762 u64 ofs, u64 len)
763{
df111be6
AE
764 u64 start_seg;
765 u64 end_seg;
766
767 if (!len)
768 return 0;
769 if (len - 1 > U64_MAX - ofs)
770 return -ERANGE;
771
772 start_seg = ofs >> header->obj_order;
773 end_seg = (ofs + len - 1) >> header->obj_order;
774
1fec7093
YS
775 return end_seg - start_seg + 1;
776}
777
029bcbd8
JD
778/*
779 * returns the size of an object in the image
780 */
781static u64 rbd_obj_bytes(struct rbd_image_header *header)
782{
783 return 1 << header->obj_order;
784}
785
602adf40
YS
786/*
787 * bio helpers
788 */
789
790static void bio_chain_put(struct bio *chain)
791{
792 struct bio *tmp;
793
794 while (chain) {
795 tmp = chain;
796 chain = chain->bi_next;
797 bio_put(tmp);
798 }
799}
800
801/*
802 * zeros a bio chain, starting at specific offset
803 */
804static void zero_bio_chain(struct bio *chain, int start_ofs)
805{
806 struct bio_vec *bv;
807 unsigned long flags;
808 void *buf;
809 int i;
810 int pos = 0;
811
812 while (chain) {
813 bio_for_each_segment(bv, chain, i) {
814 if (pos + bv->bv_len > start_ofs) {
815 int remainder = max(start_ofs - pos, 0);
816 buf = bvec_kmap_irq(bv, &flags);
817 memset(buf + remainder, 0,
818 bv->bv_len - remainder);
85b5aaa6 819 bvec_kunmap_irq(buf, &flags);
602adf40
YS
820 }
821 pos += bv->bv_len;
822 }
823
824 chain = chain->bi_next;
825 }
826}
827
828/*
f7760dad
AE
829 * Clone a portion of a bio, starting at the given byte offset
830 * and continuing for the number of bytes indicated.
602adf40 831 */
f7760dad
AE
832static struct bio *bio_clone_range(struct bio *bio_src,
833 unsigned int offset,
834 unsigned int len,
835 gfp_t gfpmask)
602adf40 836{
f7760dad
AE
837 struct bio_vec *bv;
838 unsigned int resid;
839 unsigned short idx;
840 unsigned int voff;
841 unsigned short end_idx;
842 unsigned short vcnt;
843 struct bio *bio;
844
845 /* Handle the easy case for the caller */
846
847 if (!offset && len == bio_src->bi_size)
848 return bio_clone(bio_src, gfpmask);
849
850 if (WARN_ON_ONCE(!len))
851 return NULL;
852 if (WARN_ON_ONCE(len > bio_src->bi_size))
853 return NULL;
854 if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
855 return NULL;
856
857 /* Find first affected segment... */
858
859 resid = offset;
860 __bio_for_each_segment(bv, bio_src, idx, 0) {
861 if (resid < bv->bv_len)
862 break;
863 resid -= bv->bv_len;
602adf40 864 }
f7760dad 865 voff = resid;
602adf40 866
f7760dad 867 /* ...and the last affected segment */
602adf40 868
f7760dad
AE
869 resid += len;
870 __bio_for_each_segment(bv, bio_src, end_idx, idx) {
871 if (resid <= bv->bv_len)
872 break;
873 resid -= bv->bv_len;
874 }
875 vcnt = end_idx - idx + 1;
876
877 /* Build the clone */
878
879 bio = bio_alloc(gfpmask, (unsigned int) vcnt);
880 if (!bio)
881 return NULL; /* ENOMEM */
602adf40 882
f7760dad
AE
883 bio->bi_bdev = bio_src->bi_bdev;
884 bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
885 bio->bi_rw = bio_src->bi_rw;
886 bio->bi_flags |= 1 << BIO_CLONED;
887
888 /*
889 * Copy over our part of the bio_vec, then update the first
890 * and last (or only) entries.
891 */
892 memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
893 vcnt * sizeof (struct bio_vec));
894 bio->bi_io_vec[0].bv_offset += voff;
895 if (vcnt > 1) {
896 bio->bi_io_vec[0].bv_len -= voff;
897 bio->bi_io_vec[vcnt - 1].bv_len = resid;
898 } else {
899 bio->bi_io_vec[0].bv_len = len;
602adf40
YS
900 }
901
f7760dad
AE
902 bio->bi_vcnt = vcnt;
903 bio->bi_size = len;
904 bio->bi_idx = 0;
905
906 return bio;
907}
908
909/*
910 * Clone a portion of a bio chain, starting at the given byte offset
911 * into the first bio in the source chain and continuing for the
912 * number of bytes indicated. The result is another bio chain of
913 * exactly the given length, or a null pointer on error.
914 *
915 * The bio_src and offset parameters are both in-out. On entry they
916 * refer to the first source bio and the offset into that bio where
917 * the start of data to be cloned is located.
918 *
919 * On return, bio_src is updated to refer to the bio in the source
920 * chain that contains first un-cloned byte, and *offset will
921 * contain the offset of that byte within that bio.
922 */
923static struct bio *bio_chain_clone_range(struct bio **bio_src,
924 unsigned int *offset,
925 unsigned int len,
926 gfp_t gfpmask)
927{
928 struct bio *bi = *bio_src;
929 unsigned int off = *offset;
930 struct bio *chain = NULL;
931 struct bio **end;
932
933 /* Build up a chain of clone bios up to the limit */
934
935 if (!bi || off >= bi->bi_size || !len)
936 return NULL; /* Nothing to clone */
602adf40 937
f7760dad
AE
938 end = &chain;
939 while (len) {
940 unsigned int bi_size;
941 struct bio *bio;
942
943 if (!bi)
944 goto out_err; /* EINVAL; ran out of bio's */
945 bi_size = min_t(unsigned int, bi->bi_size - off, len);
946 bio = bio_clone_range(bi, off, bi_size, gfpmask);
947 if (!bio)
948 goto out_err; /* ENOMEM */
949
950 *end = bio;
951 end = &bio->bi_next;
602adf40 952
f7760dad
AE
953 off += bi_size;
954 if (off == bi->bi_size) {
955 bi = bi->bi_next;
956 off = 0;
957 }
958 len -= bi_size;
959 }
960 *bio_src = bi;
961 *offset = off;
962
963 return chain;
964out_err:
965 bio_chain_put(chain);
602adf40 966
602adf40
YS
967 return NULL;
968}
969
970/*
971 * helpers for osd request op vectors.
972 */
57cfc106
AE
973static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
974 int opcode, u32 payload_len)
602adf40 975{
57cfc106
AE
976 struct ceph_osd_req_op *ops;
977
978 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
979 if (!ops)
980 return NULL;
981
982 ops[0].op = opcode;
983
602adf40
YS
984 /*
985 * op extent offset and length will be set later on
986 * in calc_raw_layout()
987 */
57cfc106
AE
988 ops[0].payload_len = payload_len;
989
990 return ops;
602adf40
YS
991}
992
/* Free an op vector allocated by rbd_create_rw_ops(). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
997
1fec7093
YS
998static void rbd_coll_end_req_index(struct request *rq,
999 struct rbd_req_coll *coll,
1000 int index,
1001 int ret, u64 len)
1002{
1003 struct request_queue *q;
1004 int min, max, i;
1005
bd919d45
AE
1006 dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
1007 coll, index, ret, (unsigned long long) len);
1fec7093
YS
1008
1009 if (!rq)
1010 return;
1011
1012 if (!coll) {
1013 blk_end_request(rq, ret, len);
1014 return;
1015 }
1016
1017 q = rq->q;
1018
1019 spin_lock_irq(q->queue_lock);
1020 coll->status[index].done = 1;
1021 coll->status[index].rc = ret;
1022 coll->status[index].bytes = len;
1023 max = min = coll->num_done;
1024 while (max < coll->total && coll->status[max].done)
1025 max++;
1026
1027 for (i = min; i<max; i++) {
1028 __blk_end_request(rq, coll->status[i].rc,
1029 coll->status[i].bytes);
1030 coll->num_done++;
1031 kref_put(&coll->kref, rbd_coll_release);
1032 }
1033 spin_unlock_irq(q->queue_lock);
1034}
1035
1036static void rbd_coll_end_req(struct rbd_request *req,
1037 int ret, u64 len)
1038{
1039 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
1040}
1041
602adf40
YS
1042/*
1043 * Send ceph osd request
1044 */
/*
 * Send an osd request targeting a single object.  This is the common
 * workhorse for both the asynchronous block-I/O path and the
 * synchronous helpers.
 *
 * @rq:		originating block request (NULL for sync callers)
 * @snapc:	snapshot context for writes (NULL for reads)
 * @snapid:	snapshot to read from (CEPH_NOSNAP for writes/head)
 * @object_name: name of the target rados object
 * @ofs/@len:	byte range within the object
 * @bio/@pages/@num_pages: data payload, either a bio chain or a page vector
 * @ops:	pre-built osd op vector describing the operation
 * @coll/@coll_index: completion collection slot for multi-segment requests
 * @rbd_cb:	completion callback; if NULL the call blocks until done
 * @linger_req:	if non-NULL, mark the request lingering (watch) and return it
 * @ver:	if non-NULL (sync case), filled with the reassert version
 *
 * Returns 0 or the osd result on success, negative errno on failure.
 * On error the bio chain reference and req_data are released here.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct ceph_osd_client *osdc;

	/* GFP_NOIO: we may be called on the block I/O submission path */
	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		/* The collection slot must still be completed on failure */
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
		object_name, (unsigned long long) ofs,
		(unsigned long long) len, coll, coll_index);

	osdc = &rbd_dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
					false, GFP_NOIO, pages, bio);
	if (!req) {
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	reqhead = req->r_request->front.iov_base;
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	/*
	 * NOTE(review): strncpy() does not NUL-terminate if object_name
	 * is >= sizeof(req->r_oid); the strlen() below would then read
	 * past the buffer.  Presumably object names are always shorter
	 * than r_oid -- verify at the callers.
	 */
	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	/* One object per stripe unit: stripe unit == object size */
	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->pool_id);
	ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
				   req, ops);
	rbd_assert(ret == 0);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);

	if (linger_req) {
		/* Keep the request registered for resend (watch requests) */
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* Synchronous caller: wait here and drop our reference */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%llu\n",
			(unsigned long long)
				le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
1155
1156/*
1157 * Ceph osd op callback
1158 */
/*
 * Ceph osd op callback
 *
 * Completion callback for asynchronous block-I/O requests issued by
 * rbd_do_op().  Parses the osd reply, fixes up short/ENOENT reads by
 * zero-filling the bio chain, records the result in the request
 * collection, and releases the per-request resources.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply; the op array immediately follows the reply head */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
		(unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* Reading a hole (object doesn't exist): return zeroes */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* Short read: zero the tail and report the full length */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1195
59c2be1e
YS
/*
 * Minimal completion callback: just drop the request reference.
 * Used for fire-and-forget requests (e.g. notify acks) that carry
 * no data and need no result processing.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1200
602adf40
YS
1201/*
1202 * Do a synchronous ceph osd operation
1203 */
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector large enough for @inbound_size bytes at
 * offset @ofs, issues the request via rbd_do_request() with no
 * callback (so it blocks until completion), and for reads copies the
 * returned data into @inbound.
 *
 * Returns the number of bytes transferred (osd result) on success,
 * negative errno on failure.  The page vector is always released
 * before returning.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int flags,
			   struct ceph_osd_req_op *ops,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(ops != NULL);

	num_pages = calc_pages_for(ofs, inbound_size);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* rbd_cb == NULL makes rbd_do_request() wait for completion */
	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			  object_name, ofs, inbound_size, NULL,
			  pages, num_pages,
			  flags,
			  ops,
			  NULL, 0,
			  NULL,
			  linger_req, ver);
	if (ret < 0)
		goto done;

	/* ret is the number of bytes the osd actually returned */
	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1244
1245/*
1246 * Do an asynchronous ceph osd operation
1247 */
/*
 * Do an asynchronous ceph osd operation
 *
 * Issues a single read or write (direction taken from @rq) covering
 * the byte range [@ofs, @ofs+@len) of the mapped image.  The range
 * must already lie within one rados object; completion is reported
 * through @coll slot @coll_index via rbd_req_cb().
 *
 * Returns 0 if the request was submitted, negative errno otherwise.
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;
	int opcode;
	int flags;
	u64 snapid;

	/* Map the image offset to an object name + offset within it */
	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	if (rq_data_dir(rq) == WRITE) {
		opcode = CEPH_OSD_OP_WRITE;
		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
		snapid = CEPH_NOSNAP;
		payload_len = seg_len;
	} else {
		opcode = CEPH_OSD_OP_READ;
		flags = CEPH_OSD_FLAG_READ;
		/* Reads need no snap context, just the mapped snap id */
		snapc = NULL;
		snapid = rbd_dev->snap_id;
		payload_len = 0;
	}

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1309
602adf40
YS
1310/*
1311 * Request sync osd read
1312 */
/*
 * Request sync osd read
 *
 * Synchronously read @len bytes at offset @ofs of @object_name (as it
 * existed at @snapid) into @buf.  Optionally returns the object
 * version via @ver.  Returns bytes read or negative errno.
 */
static int rbd_req_sync_read(struct rbd_device *rbd_dev,
			  u64 snapid,
			  const char *object_name,
			  u64 ofs, u64 len,
			  char *buf,
			  u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
	if (!ops)
		return -ENOMEM;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			       snapid,
			       CEPH_OSD_FLAG_READ,
			       ops, object_name, ofs, len, buf, NULL, ver);
	rbd_destroy_ops(ops);

	return ret;
}
1335
1336/*
59c2be1e
YS
1337 * Request sync osd watch
1338 */
/*
 * Acknowledge a header-object notification.
 *
 * Sends a NOTIFY_ACK op for @notify_id against the image header
 * object.  Fire-and-forget: completion only drops the request
 * reference (rbd_simple_req_cb).
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = cpu_to_le64(ver);
	/*
	 * NOTE(review): unlike ver, notify_id is stored without a
	 * cpu_to_le64() conversion -- presumably it arrives from the
	 * messenger already in wire order; verify against rbd_watch_cb().
	 */
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  rbd_dev->header_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1365
1366static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1367{
0ce1a794 1368 struct rbd_device *rbd_dev = (struct rbd_device *)data;
a71b891b 1369 u64 hver;
13143d2d
SW
1370 int rc;
1371
0ce1a794 1372 if (!rbd_dev)
59c2be1e
YS
1373 return;
1374
bd919d45
AE
1375 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1376 rbd_dev->header_name, (unsigned long long) notify_id,
1377 (unsigned int) opcode);
117973fb 1378 rc = rbd_dev_refresh(rbd_dev, &hver);
13143d2d 1379 if (rc)
f0f8cef5 1380 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
0ce1a794 1381 " update snaps: %d\n", rbd_dev->major, rc);
59c2be1e 1382
7f0a24d8 1383 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
59c2be1e
YS
1384}
1385
1386/*
1387 * Request sync osd watch
1388 */
/*
 * Request sync osd watch
 *
 * Registers a watch on the image header object so that header changes
 * (snapshots, resize) trigger rbd_watch_cb().  The osd request is
 * made lingering so it survives osd map changes.
 *
 * Returns 0 on success, negative errno on failure (event and ops are
 * cleaned up on every error path).
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	/* The event delivers notifications to rbd_watch_cb */
	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 == register the watch */

	/* linger_req keeps the watch request registered with the osdc */
	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1429
79e3057c
YS
1430/*
1431 * Request sync osd unwatch
1432 */
/*
 * Request sync osd unwatch
 *
 * Tears down the header-object watch set up by rbd_req_sync_watch():
 * sends a WATCH op with flag == 0 (unregister) and cancels the local
 * notification event.  The event is cancelled even if the osd request
 * fails.
 */
static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* 0 == unregister the watch */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL, NULL, NULL);


	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	return ret;
}
1459
602adf40 1460/*
3cb4a687 1461 * Synchronous osd object method call
602adf40 1462 */
/*
 * Synchronous osd object method call
 *
 * Invokes @class_name.@method_name on @object_name via a CALL op.
 * @outbound/@outbound_size carry the method's input parameters;
 * the method's reply (up to @inbound_size bytes) is copied into
 * @inbound when @flags includes CEPH_OSD_FLAG_READ.
 *
 * Returns the osd result (bytes returned) or negative errno.
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     int flags,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int class_name_len = strlen(class_name);
	int method_name_len = strlen(method_name);
	int payload_size;
	int ret;

	/*
	 * Any input parameters required by the method we're calling
	 * will be sent along with the class and method names as
	 * part of the message payload.  That data and its size are
	 * supplied via the indata and indata_len fields (named from
	 * the perspective of the server side) in the OSD request
	 * operation.
	 */
	payload_size = class_name_len + method_name_len + outbound_size;
	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
	if (!ops)
		return -ENOMEM;

	ops[0].cls.class_name = class_name;
	ops[0].cls.class_len = (__u8) class_name_len;
	ops[0].cls.method_name = method_name;
	ops[0].cls.method_len = (__u8) method_name_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = outbound;
	ops[0].cls.indata_len = outbound_size;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      flags, ops,
			      object_name, 0, inbound_size, inbound,
			      NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1512
1fec7093
YS
1513static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1514{
1515 struct rbd_req_coll *coll =
1516 kzalloc(sizeof(struct rbd_req_coll) +
1517 sizeof(struct rbd_req_status) * num_reqs,
1518 GFP_ATOMIC);
1519
1520 if (!coll)
1521 return NULL;
1522 coll->total = num_reqs;
1523 kref_init(&coll->kref);
1524 return coll;
1525}
1526
602adf40
YS
1527/*
1528 * block device queue callback
1529 */
/*
 * block device queue callback
 *
 * Pulls requests off the queue, splits each into per-object segments,
 * and submits one asynchronous osd op per segment.  Completion of the
 * whole request is tracked by an rbd_req_coll shared by the segments.
 *
 * Called with q->queue_lock held; the lock is dropped while issuing
 * osd requests and reacquired before fetching the next request.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		bool do_write;
		unsigned int size;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;
		unsigned int bio_offset;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);
		if (do_write && rbd_dev->mapping.read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		spin_unlock_irq(q->queue_lock);

		down_read(&rbd_dev->header_rwsem);

		/* A mapped snapshot may have been removed underneath us */
		if (!rbd_dev->exists) {
			rbd_assert(rbd_dev->snap_id != CEPH_NOSNAP);
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* Pin the snap context for the duration of this request */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		bio = rq->bio;

		dout("%s 0x%x bytes at 0x%llx\n",
			do_write ? "write" : "read",
			size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		if (num_segs <= 0) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, num_segs);
			ceph_put_snap_context(snapc);
			continue;
		}
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		/* One osd request per object segment */
		bio_offset = 0;
		do {
			u64 limit = rbd_segment_length(rbd_dev, ofs, size);
			unsigned int chain_size;
			struct bio *bio_chain;

			BUG_ON(limit > (u64) UINT_MAX);
			chain_size = (unsigned int) limit;
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);

			kref_get(&coll->kref);

			/* Pass a cloned bio chain via an osd request */

			bio_chain = bio_chain_clone_range(&bio,
						&bio_offset, chain_size,
						GFP_ATOMIC);
			if (bio_chain)
				(void) rbd_do_op(rq, rbd_dev, snapc,
						ofs, chain_size,
						bio_chain, coll, cur_seg);
			else
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, chain_size);
			size -= chain_size;
			ofs += chain_size;

			cur_seg++;
		} while (size > 0);
		/* Drop the submission reference; segments hold their own */
		kref_put(&coll->kref, rbd_coll_release);

		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1636
1637/*
1638 * a queue callback. Makes sure that we don't create a bio that spans across
1639 * multiple osd objects. One exception would be with a single page bios,
f7760dad 1640 * which we handle later at bio_chain_clone_range()
602adf40
YS
1641 */
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone_range()
 *
 * Returns the number of bytes of @bvec that may be added to the bio
 * described by @bmd without crossing an object boundary.
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is to offset relative to the enclosing
	 * device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object.  Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for.  And if the bio
	 * was empty, let the whole thing through because:  "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
1682
1683static void rbd_free_disk(struct rbd_device *rbd_dev)
1684{
1685 struct gendisk *disk = rbd_dev->disk;
1686
1687 if (!disk)
1688 return;
1689
602adf40
YS
1690 if (disk->flags & GENHD_FL_UP)
1691 del_gendisk(disk);
1692 if (disk->queue)
1693 blk_cleanup_queue(disk->queue);
1694 put_disk(disk);
1695}
1696
1697/*
4156d998
AE
1698 * Read the complete header for the given rbd device.
1699 *
1700 * Returns a pointer to a dynamically-allocated buffer containing
1701 * the complete and validated header. Caller can pass the address
1702 * of a variable that will be filled in with the version of the
1703 * header object at the time it was read.
1704 *
1705 * Returns a pointer-coded errno if a failure occurs.
602adf40 1706 */
4156d998
AE
/*
 * Read the complete header for the given rbd device.
 *
 * Returns a pointer to a dynamically-allocated buffer containing
 * the complete and validated header. Caller can pass the address
 * of a variable that will be filled in with the version of the
 * header object at the time it was read.
 *
 * Returns a pointer-coded errno if a failure occurs.
 */
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		/* Free the previous (too small) attempt, if any */
		kfree(ondisk);

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return ERR_PTR(-ENOMEM);

		ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
				       rbd_dev->header_name,
				       0, size,
				       (char *) ondisk, version);

		if (ret < 0)
			goto out_err;
		if (WARN_ON((size_t) ret < size)) {
			ret = -ENXIO;
			pr_warning("short header read for image %s"
					" (want %zd got %d)\n",
				rbd_dev->image_name, size, ret);
			goto out_err;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			pr_warning("invalid header for image %s\n",
				rbd_dev->image_name);
			goto out_err;
		}

		/* Retry until the snapshot count is stable */
		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	return ondisk;

out_err:
	kfree(ondisk);

	return ERR_PTR(ret);
}
1768
1769/*
1770 * reload the ondisk the header
1771 */
1772static int rbd_read_header(struct rbd_device *rbd_dev,
1773 struct rbd_image_header *header)
1774{
1775 struct rbd_image_header_ondisk *ondisk;
1776 u64 ver = 0;
1777 int ret;
602adf40 1778
4156d998
AE
1779 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1780 if (IS_ERR(ondisk))
1781 return PTR_ERR(ondisk);
1782 ret = rbd_header_from_disk(header, ondisk);
1783 if (ret >= 0)
1784 header->obj_version = ver;
1785 kfree(ondisk);
1786
1787 return ret;
602adf40
YS
1788}
1789
41f38c2b 1790static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
dfc5606d
YS
1791{
1792 struct rbd_snap *snap;
a0593290 1793 struct rbd_snap *next;
dfc5606d 1794
a0593290 1795 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
41f38c2b 1796 rbd_remove_snap_dev(snap);
dfc5606d
YS
1797}
1798
9478554a
AE
1799static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1800{
1801 sector_t size;
1802
971f839a 1803 if (rbd_dev->snap_id != CEPH_NOSNAP)
9478554a
AE
1804 return;
1805
1806 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1807 dout("setting size to %llu sectors", (unsigned long long) size);
1808 rbd_dev->mapping.size = (u64) size;
1809 set_capacity(rbd_dev->disk, size);
1810}
1811
602adf40
YS
1812/*
1813 * only read the first part of the ondisk header, without the snaps info
1814 */
117973fb 1815static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
602adf40
YS
1816{
1817 int ret;
1818 struct rbd_image_header h;
602adf40
YS
1819
1820 ret = rbd_read_header(rbd_dev, &h);
1821 if (ret < 0)
1822 return ret;
1823
a51aa0c0
JD
1824 down_write(&rbd_dev->header_rwsem);
1825
9478554a
AE
1826 /* Update image size, and check for resize of mapped image */
1827 rbd_dev->header.image_size = h.image_size;
1828 rbd_update_mapping_size(rbd_dev);
9db4b3e3 1829
849b4260 1830 /* rbd_dev->header.object_prefix shouldn't change */
602adf40 1831 kfree(rbd_dev->header.snap_sizes);
849b4260 1832 kfree(rbd_dev->header.snap_names);
d1d25646
JD
1833 /* osd requests may still refer to snapc */
1834 ceph_put_snap_context(rbd_dev->header.snapc);
602adf40 1835
b813623a
AE
1836 if (hver)
1837 *hver = h.obj_version;
a71b891b 1838 rbd_dev->header.obj_version = h.obj_version;
93a24e08 1839 rbd_dev->header.image_size = h.image_size;
602adf40
YS
1840 rbd_dev->header.snapc = h.snapc;
1841 rbd_dev->header.snap_names = h.snap_names;
1842 rbd_dev->header.snap_sizes = h.snap_sizes;
849b4260
AE
1843 /* Free the extra copy of the object prefix */
1844 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1845 kfree(h.object_prefix);
1846
304f6808
AE
1847 ret = rbd_dev_snaps_update(rbd_dev);
1848 if (!ret)
1849 ret = rbd_dev_snaps_register(rbd_dev);
dfc5606d 1850
c666601a 1851 up_write(&rbd_dev->header_rwsem);
602adf40 1852
dfc5606d 1853 return ret;
602adf40
YS
1854}
1855
/*
 * Refresh the device's cached image header, dispatching on the image
 * format (1 or 2).  Runs under ctl_mutex; optionally returns the new
 * header object version via @hver.
 */
static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	if (rbd_dev->image_format == 1)
		ret = rbd_dev_v1_refresh(rbd_dev, hver);
	else
		ret = rbd_dev_v2_refresh(rbd_dev, hver);
	mutex_unlock(&ctl_mutex);

	return ret;
}
1870
602adf40
YS
/*
 * Allocate and configure the gendisk and request queue for a mapped
 * rbd device: names the disk rbd<dev_id>, sizes I/O limits to the
 * rados object size, installs the merge callback, and sets the
 * initial capacity from the mapping.  The disk is not yet added
 * (no add_disk() here).
 *
 * Returns 0 on success, -ENOMEM on allocation failure.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* Keep bios from spanning object boundaries */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
1919
dfc5606d
YS
1920/*
1921 sysfs
1922*/
1923
593a9e7b
AE
/* Map an embedded struct device back to its owning rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1928
dfc5606d
YS
/*
 * sysfs "size" attribute: mapped image size in bytes.  The capacity
 * is sampled under header_rwsem so it is consistent with refreshes.
 */
static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	sector_t size;

	down_read(&rbd_dev->header_rwsem);
	size = get_capacity(rbd_dev->disk);
	up_read(&rbd_dev->header_rwsem);

	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
}
1941
34b13184
AE
1942/*
1943 * Note this shows the features for whatever's mapped, which is not
1944 * necessarily the base image.
1945 */
1946static ssize_t rbd_features_show(struct device *dev,
1947 struct device_attribute *attr, char *buf)
1948{
1949 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1950
1951 return sprintf(buf, "0x%016llx\n",
1952 (unsigned long long) rbd_dev->mapping.features);
1953}
1954
dfc5606d
YS
1955static ssize_t rbd_major_show(struct device *dev,
1956 struct device_attribute *attr, char *buf)
1957{
593a9e7b 1958 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 1959
dfc5606d
YS
1960 return sprintf(buf, "%d\n", rbd_dev->major);
1961}
1962
1963static ssize_t rbd_client_id_show(struct device *dev,
1964 struct device_attribute *attr, char *buf)
602adf40 1965{
593a9e7b 1966 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1967
1dbb4399
AE
1968 return sprintf(buf, "client%lld\n",
1969 ceph_client_id(rbd_dev->rbd_client->client));
602adf40
YS
1970}
1971
dfc5606d
YS
1972static ssize_t rbd_pool_show(struct device *dev,
1973 struct device_attribute *attr, char *buf)
602adf40 1974{
593a9e7b 1975 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d
YS
1976
1977 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1978}
1979
9bb2f334
AE
1980static ssize_t rbd_pool_id_show(struct device *dev,
1981 struct device_attribute *attr, char *buf)
1982{
1983 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1984
86992098 1985 return sprintf(buf, "%llu\n", (unsigned long long) rbd_dev->pool_id);
9bb2f334
AE
1986}
1987
dfc5606d
YS
1988static ssize_t rbd_name_show(struct device *dev,
1989 struct device_attribute *attr, char *buf)
1990{
593a9e7b 1991 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1992
0bed54dc 1993 return sprintf(buf, "%s\n", rbd_dev->image_name);
dfc5606d
YS
1994}
1995
589d30e0
AE
1996static ssize_t rbd_image_id_show(struct device *dev,
1997 struct device_attribute *attr, char *buf)
1998{
1999 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2000
2001 return sprintf(buf, "%s\n", rbd_dev->image_id);
2002}
2003
34b13184
AE
2004/*
2005 * Shows the name of the currently-mapped snapshot (or
2006 * RBD_SNAP_HEAD_NAME for the base image).
2007 */
dfc5606d
YS
2008static ssize_t rbd_snap_show(struct device *dev,
2009 struct device_attribute *attr,
2010 char *buf)
2011{
593a9e7b 2012 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2013
971f839a 2014 return sprintf(buf, "%s\n", rbd_dev->snap_name);
dfc5606d
YS
2015}
2016
2017static ssize_t rbd_image_refresh(struct device *dev,
2018 struct device_attribute *attr,
2019 const char *buf,
2020 size_t size)
2021{
593a9e7b 2022 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 2023 int ret;
602adf40 2024
117973fb 2025 ret = rbd_dev_refresh(rbd_dev, NULL);
b813623a
AE
2026
2027 return ret < 0 ? ret : size;
dfc5606d 2028}
602adf40 2029
/* Per-device sysfs attributes; all read-only except "refresh". */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* Empty release: rbd_device lifetime is managed elsewhere. */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2073
2074
2075/*
2076 sysfs - snapshots
2077*/
2078
2079static ssize_t rbd_snap_size_show(struct device *dev,
2080 struct device_attribute *attr,
2081 char *buf)
2082{
2083 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2084
3591538f 2085 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
dfc5606d
YS
2086}
2087
2088static ssize_t rbd_snap_id_show(struct device *dev,
2089 struct device_attribute *attr,
2090 char *buf)
2091{
2092 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2093
3591538f 2094 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
dfc5606d
YS
2095}
2096
34b13184
AE
2097static ssize_t rbd_snap_features_show(struct device *dev,
2098 struct device_attribute *attr,
2099 char *buf)
2100{
2101 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2102
2103 return sprintf(buf, "0x%016llx\n",
2104 (unsigned long long) snap->features);
2105}
2106
dfc5606d
YS
/* Per-snapshot sysfs attributes (all read-only). */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* Device release: frees the rbd_snap (and its name) it embeds. */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2138
304f6808
AE
2139static bool rbd_snap_registered(struct rbd_snap *snap)
2140{
2141 bool ret = snap->dev.type == &rbd_snap_device_type;
2142 bool reg = device_is_registered(&snap->dev);
2143
2144 rbd_assert(!ret ^ reg);
2145
2146 return ret;
2147}
2148
41f38c2b 2149static void rbd_remove_snap_dev(struct rbd_snap *snap)
dfc5606d
YS
2150{
2151 list_del(&snap->node);
304f6808
AE
2152 if (device_is_registered(&snap->dev))
2153 device_unregister(&snap->dev);
dfc5606d
YS
2154}
2155
14e7085d 2156static int rbd_register_snap_dev(struct rbd_snap *snap,
dfc5606d
YS
2157 struct device *parent)
2158{
2159 struct device *dev = &snap->dev;
2160 int ret;
2161
2162 dev->type = &rbd_snap_device_type;
2163 dev->parent = parent;
2164 dev->release = rbd_snap_dev_release;
d4b125e9 2165 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
304f6808
AE
2166 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2167
dfc5606d
YS
2168 ret = device_register(dev);
2169
2170 return ret;
2171}
2172
4e891e0a 2173static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
c8d18425 2174 const char *snap_name,
34b13184
AE
2175 u64 snap_id, u64 snap_size,
2176 u64 snap_features)
dfc5606d 2177{
4e891e0a 2178 struct rbd_snap *snap;
dfc5606d 2179 int ret;
4e891e0a
AE
2180
2181 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
dfc5606d 2182 if (!snap)
4e891e0a
AE
2183 return ERR_PTR(-ENOMEM);
2184
2185 ret = -ENOMEM;
c8d18425 2186 snap->name = kstrdup(snap_name, GFP_KERNEL);
4e891e0a
AE
2187 if (!snap->name)
2188 goto err;
2189
c8d18425
AE
2190 snap->id = snap_id;
2191 snap->size = snap_size;
34b13184 2192 snap->features = snap_features;
4e891e0a
AE
2193
2194 return snap;
2195
dfc5606d
YS
2196err:
2197 kfree(snap->name);
2198 kfree(snap);
4e891e0a
AE
2199
2200 return ERR_PTR(ret);
dfc5606d
YS
2201}
2202
cd892126
AE
2203static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2204 u64 *snap_size, u64 *snap_features)
2205{
2206 char *snap_name;
2207
2208 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2209
2210 *snap_size = rbd_dev->header.snap_sizes[which];
2211 *snap_features = 0; /* No features for v1 */
2212
2213 /* Skip over names until we find the one we are looking for */
2214
2215 snap_name = rbd_dev->header.snap_names;
2216 while (which--)
2217 snap_name += strlen(snap_name) + 1;
2218
2219 return snap_name;
2220}
2221
9d475de5
AE
2222/*
2223 * Get the size and object order for an image snapshot, or if
2224 * snap_id is CEPH_NOSNAP, gets this information for the base
2225 * image.
2226 */
2227static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2228 u8 *order, u64 *snap_size)
2229{
2230 __le64 snapid = cpu_to_le64(snap_id);
2231 int ret;
2232 struct {
2233 u8 order;
2234 __le64 size;
2235 } __attribute__ ((packed)) size_buf = { 0 };
2236
2237 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2238 "rbd", "get_size",
2239 (char *) &snapid, sizeof (snapid),
2240 (char *) &size_buf, sizeof (size_buf),
2241 CEPH_OSD_FLAG_READ, NULL);
2242 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2243 if (ret < 0)
2244 return ret;
2245
2246 *order = size_buf.order;
2247 *snap_size = le64_to_cpu(size_buf.size);
2248
2249 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
2250 (unsigned long long) snap_id, (unsigned int) *order,
2251 (unsigned long long) *snap_size);
2252
2253 return 0;
2254}
2255
/*
 * Fetch the current size and object order of the base image
 * (snap_id CEPH_NOSNAP) into the in-core header.
 */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2262
1e130199
AE
2263static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2264{
2265 void *reply_buf;
2266 int ret;
2267 void *p;
2268
2269 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
2270 if (!reply_buf)
2271 return -ENOMEM;
2272
2273 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2274 "rbd", "get_object_prefix",
2275 NULL, 0,
2276 reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
2277 CEPH_OSD_FLAG_READ, NULL);
2278 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2279 if (ret < 0)
2280 goto out;
a0ea3a40 2281 ret = 0; /* rbd_req_sync_exec() can return positive */
1e130199
AE
2282
2283 p = reply_buf;
2284 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2285 p + RBD_OBJ_PREFIX_LEN_MAX,
2286 NULL, GFP_NOIO);
2287
2288 if (IS_ERR(rbd_dev->header.object_prefix)) {
2289 ret = PTR_ERR(rbd_dev->header.object_prefix);
2290 rbd_dev->header.object_prefix = NULL;
2291 } else {
2292 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
2293 }
2294
2295out:
2296 kfree(reply_buf);
2297
2298 return ret;
2299}
2300
b1b5402a
AE
2301static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2302 u64 *snap_features)
2303{
2304 __le64 snapid = cpu_to_le64(snap_id);
2305 struct {
2306 __le64 features;
2307 __le64 incompat;
2308 } features_buf = { 0 };
d889140c 2309 u64 incompat;
b1b5402a
AE
2310 int ret;
2311
2312 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2313 "rbd", "get_features",
2314 (char *) &snapid, sizeof (snapid),
2315 (char *) &features_buf, sizeof (features_buf),
2316 CEPH_OSD_FLAG_READ, NULL);
2317 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2318 if (ret < 0)
2319 return ret;
d889140c
AE
2320
2321 incompat = le64_to_cpu(features_buf.incompat);
2322 if (incompat & ~RBD_FEATURES_ALL)
2323 return -ENOTSUPP;
2324
b1b5402a
AE
2325 *snap_features = le64_to_cpu(features_buf.features);
2326
2327 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2328 (unsigned long long) snap_id,
2329 (unsigned long long) *snap_features,
2330 (unsigned long long) le64_to_cpu(features_buf.incompat));
2331
2332 return 0;
2333}
2334
/* Fetch the feature bits for the base image (snap_id CEPH_NOSNAP) */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
2340
6e14b1a6 2341static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
35d489f9
AE
2342{
2343 size_t size;
2344 int ret;
2345 void *reply_buf;
2346 void *p;
2347 void *end;
2348 u64 seq;
2349 u32 snap_count;
2350 struct ceph_snap_context *snapc;
2351 u32 i;
2352
2353 /*
2354 * We'll need room for the seq value (maximum snapshot id),
2355 * snapshot count, and array of that many snapshot ids.
2356 * For now we have a fixed upper limit on the number we're
2357 * prepared to receive.
2358 */
2359 size = sizeof (__le64) + sizeof (__le32) +
2360 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2361 reply_buf = kzalloc(size, GFP_KERNEL);
2362 if (!reply_buf)
2363 return -ENOMEM;
2364
2365 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2366 "rbd", "get_snapcontext",
2367 NULL, 0,
2368 reply_buf, size,
6e14b1a6 2369 CEPH_OSD_FLAG_READ, ver);
35d489f9
AE
2370 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2371 if (ret < 0)
2372 goto out;
2373
2374 ret = -ERANGE;
2375 p = reply_buf;
2376 end = (char *) reply_buf + size;
2377 ceph_decode_64_safe(&p, end, seq, out);
2378 ceph_decode_32_safe(&p, end, snap_count, out);
2379
2380 /*
2381 * Make sure the reported number of snapshot ids wouldn't go
2382 * beyond the end of our buffer. But before checking that,
2383 * make sure the computed size of the snapshot context we
2384 * allocate is representable in a size_t.
2385 */
2386 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2387 / sizeof (u64)) {
2388 ret = -EINVAL;
2389 goto out;
2390 }
2391 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2392 goto out;
2393
2394 size = sizeof (struct ceph_snap_context) +
2395 snap_count * sizeof (snapc->snaps[0]);
2396 snapc = kmalloc(size, GFP_KERNEL);
2397 if (!snapc) {
2398 ret = -ENOMEM;
2399 goto out;
2400 }
2401
2402 atomic_set(&snapc->nref, 1);
2403 snapc->seq = seq;
2404 snapc->num_snaps = snap_count;
2405 for (i = 0; i < snap_count; i++)
2406 snapc->snaps[i] = ceph_decode_64(&p);
2407
2408 rbd_dev->header.snapc = snapc;
2409
2410 dout(" snap context seq = %llu, snap_count = %u\n",
2411 (unsigned long long) seq, (unsigned int) snap_count);
2412
2413out:
2414 kfree(reply_buf);
2415
2416 return 0;
2417}
2418
b8b1e2db
AE
2419static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2420{
2421 size_t size;
2422 void *reply_buf;
2423 __le64 snap_id;
2424 int ret;
2425 void *p;
2426 void *end;
2427 size_t snap_name_len;
2428 char *snap_name;
2429
2430 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
2431 reply_buf = kmalloc(size, GFP_KERNEL);
2432 if (!reply_buf)
2433 return ERR_PTR(-ENOMEM);
2434
2435 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2436 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2437 "rbd", "get_snapshot_name",
2438 (char *) &snap_id, sizeof (snap_id),
2439 reply_buf, size,
2440 CEPH_OSD_FLAG_READ, NULL);
2441 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2442 if (ret < 0)
2443 goto out;
2444
2445 p = reply_buf;
2446 end = (char *) reply_buf + size;
2447 snap_name_len = 0;
2448 snap_name = ceph_extract_encoded_string(&p, end, &snap_name_len,
2449 GFP_KERNEL);
2450 if (IS_ERR(snap_name)) {
2451 ret = PTR_ERR(snap_name);
2452 goto out;
2453 } else {
2454 dout(" snap_id 0x%016llx snap_name = %s\n",
2455 (unsigned long long) le64_to_cpu(snap_id), snap_name);
2456 }
2457 kfree(reply_buf);
2458
2459 return snap_name;
2460out:
2461 kfree(reply_buf);
2462
2463 return ERR_PTR(ret);
2464}
2465
2466static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2467 u64 *snap_size, u64 *snap_features)
2468{
2469 __le64 snap_id;
2470 u8 order;
2471 int ret;
2472
2473 snap_id = rbd_dev->header.snapc->snaps[which];
2474 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2475 if (ret)
2476 return ERR_PTR(ret);
2477 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2478 if (ret)
2479 return ERR_PTR(ret);
2480
2481 return rbd_dev_v2_snap_name(rbd_dev, which);
2482}
2483
2484static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2485 u64 *snap_size, u64 *snap_features)
2486{
2487 if (rbd_dev->image_format == 1)
2488 return rbd_dev_v1_snap_info(rbd_dev, which,
2489 snap_size, snap_features);
2490 if (rbd_dev->image_format == 2)
2491 return rbd_dev_v2_snap_info(rbd_dev, which,
2492 snap_size, snap_features);
2493 return ERR_PTR(-EINVAL);
2494}
2495
117973fb
AE
2496static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2497{
2498 int ret;
2499 __u8 obj_order;
2500
2501 down_write(&rbd_dev->header_rwsem);
2502
2503 /* Grab old order first, to see if it changes */
2504
2505 obj_order = rbd_dev->header.obj_order,
2506 ret = rbd_dev_v2_image_size(rbd_dev);
2507 if (ret)
2508 goto out;
2509 if (rbd_dev->header.obj_order != obj_order) {
2510 ret = -EIO;
2511 goto out;
2512 }
2513 rbd_update_mapping_size(rbd_dev);
2514
2515 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2516 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2517 if (ret)
2518 goto out;
2519 ret = rbd_dev_snaps_update(rbd_dev);
2520 dout("rbd_dev_snaps_update returned %d\n", ret);
2521 if (ret)
2522 goto out;
2523 ret = rbd_dev_snaps_register(rbd_dev);
2524 dout("rbd_dev_snaps_register returned %d\n", ret);
2525out:
2526 up_write(&rbd_dev->header_rwsem);
2527
2528 return ret;
2529}
2530
dfc5606d 2531/*
35938150
AE
2532 * Scan the rbd device's current snapshot list and compare it to the
2533 * newly-received snapshot context. Remove any existing snapshots
2534 * not present in the new snapshot context. Add a new snapshot for
2535 * any snaphots in the snapshot context not in the current list.
2536 * And verify there are no changes to snapshots we already know
2537 * about.
2538 *
2539 * Assumes the snapshots in the snapshot context are sorted by
2540 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2541 * are also maintained in that order.)
dfc5606d 2542 */
304f6808 2543static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
dfc5606d 2544{
35938150
AE
2545 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2546 const u32 snap_count = snapc->num_snaps;
35938150
AE
2547 struct list_head *head = &rbd_dev->snaps;
2548 struct list_head *links = head->next;
2549 u32 index = 0;
dfc5606d 2550
9fcbb800 2551 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
35938150
AE
2552 while (index < snap_count || links != head) {
2553 u64 snap_id;
2554 struct rbd_snap *snap;
cd892126
AE
2555 char *snap_name;
2556 u64 snap_size = 0;
2557 u64 snap_features = 0;
dfc5606d 2558
35938150
AE
2559 snap_id = index < snap_count ? snapc->snaps[index]
2560 : CEPH_NOSNAP;
2561 snap = links != head ? list_entry(links, struct rbd_snap, node)
2562 : NULL;
aafb230e 2563 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
dfc5606d 2564
35938150
AE
2565 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2566 struct list_head *next = links->next;
dfc5606d 2567
35938150 2568 /* Existing snapshot not in the new snap context */
dfc5606d 2569
971f839a 2570 if (rbd_dev->snap_id == snap->id)
daba5fdb 2571 rbd_dev->exists = false;
41f38c2b 2572 rbd_remove_snap_dev(snap);
9fcbb800 2573 dout("%ssnap id %llu has been removed\n",
971f839a 2574 rbd_dev->snap_id == snap->id ? "mapped " : "",
9fcbb800 2575 (unsigned long long) snap->id);
35938150
AE
2576
2577 /* Done with this list entry; advance */
2578
2579 links = next;
dfc5606d
YS
2580 continue;
2581 }
35938150 2582
b8b1e2db
AE
2583 snap_name = rbd_dev_snap_info(rbd_dev, index,
2584 &snap_size, &snap_features);
cd892126
AE
2585 if (IS_ERR(snap_name))
2586 return PTR_ERR(snap_name);
2587
9fcbb800
AE
2588 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2589 (unsigned long long) snap_id);
35938150
AE
2590 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2591 struct rbd_snap *new_snap;
2592
2593 /* We haven't seen this snapshot before */
2594
c8d18425 2595 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
cd892126 2596 snap_id, snap_size, snap_features);
9fcbb800
AE
2597 if (IS_ERR(new_snap)) {
2598 int err = PTR_ERR(new_snap);
2599
2600 dout(" failed to add dev, error %d\n", err);
2601
2602 return err;
2603 }
35938150
AE
2604
2605 /* New goes before existing, or at end of list */
2606
9fcbb800 2607 dout(" added dev%s\n", snap ? "" : " at end\n");
35938150
AE
2608 if (snap)
2609 list_add_tail(&new_snap->node, &snap->node);
2610 else
523f3258 2611 list_add_tail(&new_snap->node, head);
35938150
AE
2612 } else {
2613 /* Already have this one */
2614
9fcbb800
AE
2615 dout(" already present\n");
2616
cd892126 2617 rbd_assert(snap->size == snap_size);
aafb230e 2618 rbd_assert(!strcmp(snap->name, snap_name));
cd892126 2619 rbd_assert(snap->features == snap_features);
35938150
AE
2620
2621 /* Done with this list entry; advance */
2622
2623 links = links->next;
dfc5606d 2624 }
35938150
AE
2625
2626 /* Advance to the next entry in the snapshot context */
2627
2628 index++;
dfc5606d 2629 }
9fcbb800 2630 dout("%s: done\n", __func__);
dfc5606d
YS
2631
2632 return 0;
2633}
2634
304f6808
AE
2635/*
2636 * Scan the list of snapshots and register the devices for any that
2637 * have not already been registered.
2638 */
2639static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2640{
2641 struct rbd_snap *snap;
2642 int ret = 0;
2643
2644 dout("%s called\n", __func__);
86ff77bb
AE
2645 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2646 return -EIO;
304f6808
AE
2647
2648 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2649 if (!rbd_snap_registered(snap)) {
2650 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2651 if (ret < 0)
2652 break;
2653 }
2654 }
2655 dout("%s: returning %d\n", __func__, ret);
2656
2657 return ret;
2658}
2659
dfc5606d
YS
2660static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2661{
dfc5606d 2662 struct device *dev;
cd789ab9 2663 int ret;
dfc5606d
YS
2664
2665 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
dfc5606d 2666
cd789ab9 2667 dev = &rbd_dev->dev;
dfc5606d
YS
2668 dev->bus = &rbd_bus_type;
2669 dev->type = &rbd_device_type;
2670 dev->parent = &rbd_root_dev;
2671 dev->release = rbd_dev_release;
de71a297 2672 dev_set_name(dev, "%d", rbd_dev->dev_id);
dfc5606d 2673 ret = device_register(dev);
dfc5606d 2674
dfc5606d 2675 mutex_unlock(&ctl_mutex);
cd789ab9 2676
dfc5606d 2677 return ret;
602adf40
YS
2678}
2679
dfc5606d
YS
/* Undo rbd_bus_add_dev(); final cleanup runs via the release callback */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2684
59c2be1e
YS
2685static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2686{
2687 int ret, rc;
2688
2689 do {
0e6f322d 2690 ret = rbd_req_sync_watch(rbd_dev);
59c2be1e 2691 if (ret == -ERANGE) {
117973fb 2692 rc = rbd_dev_refresh(rbd_dev, NULL);
59c2be1e
YS
2693 if (rc < 0)
2694 return rc;
2695 }
2696 } while (ret == -ERANGE);
2697
2698 return ret;
2699}
2700
/* Highest device id handed out so far; ids start at 1 */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
b7f23c36 2717
1ddbe94e 2718/*
499afd5b
AE
2719 * Remove an rbd_dev from the global list, and record that its
2720 * identifier is no longer in use.
1ddbe94e 2721 */
/*
 * Remove an rbd_dev from the global list, and record that its
 * identifier is no longer in use.  Counterpart of rbd_dev_id_get().
 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	int rbd_id = rbd_dev->dev_id;
	int max_id;

	rbd_assert(rbd_id > 0);

	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		struct rbd_device *rbd_dev;	/* shadows the parameter intentionally */

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id > max_id)
			max_id = rbd_dev->dev_id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_dev_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case.  (cmpxchg only resets the max if it still equals the
	 * id we just released.)
	 */
	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
	dout("  max dev id has been reset\n");
}
2768
e28fff26
AE
2769/*
2770 * Skips over white space at *buf, and updates *buf to point to the
2771 * first found non-space character (if any). Returns the length of
593a9e7b
AE
2772 * the token (string of non-white space characters) found. Note
2773 * that *buf must be terminated with '\0'.
e28fff26
AE
2774 */
2775static inline size_t next_token(const char **buf)
2776{
2777 /*
2778 * These are the characters that produce nonzero for
2779 * isspace() in the "C" and "POSIX" locales.
2780 */
2781 const char *spaces = " \f\n\r\t\v";
2782
2783 *buf += strspn(*buf, spaces); /* Find start of token */
2784
2785 return strcspn(*buf, spaces); /* Return token length */
2786}
2787
2788/*
2789 * Finds the next token in *buf, and if the provided token buffer is
2790 * big enough, copies the found token into it. The result, if
593a9e7b
AE
2791 * copied, is guaranteed to be terminated with '\0'. Note that *buf
2792 * must be terminated with '\0' on entry.
e28fff26
AE
2793 *
2794 * Returns the length of the token found (not including the '\0').
2795 * Return value will be 0 if no token is found, and it will be >=
2796 * token_size if the token would not fit.
2797 *
593a9e7b 2798 * The *buf pointer will be updated to point beyond the end of the
e28fff26
AE
2799 * found token. Note that this occurs even if the token buffer is
2800 * too small to hold it.
2801 */
2802static inline size_t copy_token(const char **buf,
2803 char *token,
2804 size_t token_size)
2805{
2806 size_t len;
2807
2808 len = next_token(buf);
2809 if (len < token_size) {
2810 memcpy(token, *buf, len);
2811 *(token + len) = '\0';
2812 }
2813 *buf += len;
2814
2815 return len;
2816}
2817
ea3352f4
AE
2818/*
2819 * Finds the next token in *buf, dynamically allocates a buffer big
2820 * enough to hold a copy of it, and copies the token into the new
2821 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2822 * that a duplicate buffer is created even for a zero-length token.
2823 *
2824 * Returns a pointer to the newly-allocated duplicate, or a null
2825 * pointer if memory for the duplicate was not available. If
2826 * the lenp argument is a non-null pointer, the length of the token
2827 * (not including the '\0') is returned in *lenp.
2828 *
2829 * If successful, the *buf pointer will be updated to point beyond
2830 * the end of the found token.
2831 *
2832 * Note: uses GFP_KERNEL for allocation.
2833 */
2834static inline char *dup_token(const char **buf, size_t *lenp)
2835{
2836 char *dup;
2837 size_t len;
2838
2839 len = next_token(buf);
2840 dup = kmalloc(len + 1, GFP_KERNEL);
2841 if (!dup)
2842 return NULL;
2843
2844 memcpy(dup, *buf, len);
2845 *(dup + len) = '\0';
2846 *buf += len;
2847
2848 if (lenp)
2849 *lenp = len;
2850
2851 return dup;
2852}
2853
/*
 * This fills in the pool_name, image_name, image_name_len, rbd_dev,
 * rbd_md_name, and name fields of the given rbd_dev, based on the
 * list of monitor addresses and other options provided via
 * /sys/bus/rbd/add.  Returns a pointer to a dynamically-allocated
 * copy of the snapshot name to map if successful, or a
 * pointer-coded error otherwise.
 *
 * Note: rbd_dev is assumed to have been initially zero-filled.
 */
static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
				const char *buf,
				const char **mon_addrs,
				size_t *mon_addrs_size,
				char *options,
				size_t options_size)
{
	size_t len;
	char *err_ptr = ERR_PTR(-EINVAL);
	char *snap_name;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len)
		return err_ptr;
	/* mon_addrs points into the caller's buffer; not copied here */
	*mon_addrs_size = len + 1;
	*mon_addrs = buf;

	buf += len;

	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return err_ptr;

	/* Remaining failures below are allocation failures */
	err_ptr = ERR_PTR(-ENOMEM);
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Snapshot name is optional; default is to use "head" */

	len = next_token(&buf);
	if (len > RBD_MAX_SNAP_NAME_LEN) {
		err_ptr = ERR_PTR(-ENAMETOOLONG);
		goto out_err;
	}
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	}
	snap_name = kmalloc(len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_err;
	memcpy(snap_name, buf, len);
	*(snap_name + len) = '\0';

	return snap_name;

out_err:
	/* Undo any allocations recorded in rbd_dev (kfree(NULL) is a no-op) */
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return err_ptr;
}
2926
589d30e0
AE
2927/*
2928 * An rbd format 2 image has a unique identifier, distinct from the
2929 * name given to it by the user. Internally, that identifier is
2930 * what's used to specify the names of objects related to the image.
2931 *
2932 * A special "rbd id" object is used to map an rbd image name to its
2933 * id. If that object doesn't exist, then there is no v2 rbd image
2934 * with the supplied name.
2935 *
2936 * This function will record the given rbd_dev's image_id field if
2937 * it can be determined, and in that case will return 0. If any
2938 * errors occur a negative errno will be returned and the rbd_dev's
2939 * image_id field will be unchanged (and should be NULL).
2940 */
2941static int rbd_dev_image_id(struct rbd_device *rbd_dev)
2942{
2943 int ret;
2944 size_t size;
2945 char *object_name;
2946 void *response;
2947 void *p;
2948
2949 /*
2950 * First, see if the format 2 image id file exists, and if
2951 * so, get the image's persistent id from it.
2952 */
2953 size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
2954 object_name = kmalloc(size, GFP_NOIO);
2955 if (!object_name)
2956 return -ENOMEM;
2957 sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
2958 dout("rbd id object name is %s\n", object_name);
2959
2960 /* Response will be an encoded string, which includes a length */
2961
2962 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
2963 response = kzalloc(size, GFP_NOIO);
2964 if (!response) {
2965 ret = -ENOMEM;
2966 goto out;
2967 }
2968
2969 ret = rbd_req_sync_exec(rbd_dev, object_name,
2970 "rbd", "get_id",
2971 NULL, 0,
2972 response, RBD_IMAGE_ID_LEN_MAX,
2973 CEPH_OSD_FLAG_READ, NULL);
2974 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2975 if (ret < 0)
2976 goto out;
a0ea3a40 2977 ret = 0; /* rbd_req_sync_exec() can return positive */
589d30e0
AE
2978
2979 p = response;
2980 rbd_dev->image_id = ceph_extract_encoded_string(&p,
2981 p + RBD_IMAGE_ID_LEN_MAX,
2982 &rbd_dev->image_id_len,
2983 GFP_NOIO);
2984 if (IS_ERR(rbd_dev->image_id)) {
2985 ret = PTR_ERR(rbd_dev->image_id);
2986 rbd_dev->image_id = NULL;
2987 } else {
2988 dout("image_id is %s\n", rbd_dev->image_id);
2989 }
2990out:
2991 kfree(response);
2992 kfree(object_name);
2993
2994 return ret;
2995}
2996
a30b71b9
AE
2997static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
2998{
2999 int ret;
3000 size_t size;
3001
3002 /* Version 1 images have no id; empty string is used */
3003
3004 rbd_dev->image_id = kstrdup("", GFP_KERNEL);
3005 if (!rbd_dev->image_id)
3006 return -ENOMEM;
3007 rbd_dev->image_id_len = 0;
3008
3009 /* Record the header object name for this rbd image. */
3010
3011 size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX);
3012 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3013 if (!rbd_dev->header_name) {
3014 ret = -ENOMEM;
3015 goto out_err;
3016 }
3017 sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
3018
3019 /* Populate rbd image metadata */
3020
3021 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3022 if (ret < 0)
3023 goto out_err;
3024 rbd_dev->image_format = 1;
3025
3026 dout("discovered version 1 image, header name is %s\n",
3027 rbd_dev->header_name);
3028
3029 return 0;
3030
3031out_err:
3032 kfree(rbd_dev->header_name);
3033 rbd_dev->header_name = NULL;
3034 kfree(rbd_dev->image_id);
3035 rbd_dev->image_id = NULL;
3036
3037 return ret;
3038}
3039
/*
 * Probe routine for format 2 images: record the header object name
 * (derived from the image id the caller already looked up), then
 * pull size, object prefix, features, and the snapshot context from
 * the header object.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len;
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3101
/*
 * Probe for the existence of the header object for the given rbd
 * device.  For format 2 images this includes determining the image
 * id.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * A format 2 image records its id in a special object; if
	 * looking that up fails (ENOENT) we assume a format 1 image.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	ret = ret ? rbd_dev_v1_probe(rbd_dev) : rbd_dev_v2_probe(rbd_dev);
	if (ret)
		dout("probe failed, returning %d\n", ret);

	return ret;
}
3126
59c2be1e
YS
/*
 * "add" store method for the rbd bus (/sys/bus/rbd/add): parse the
 * user-supplied mapping specification in @buf, connect to the ceph
 * cluster, probe the image, and register a new block device for it.
 *
 * Returns @count on success, or a negative errno on failure.  Before
 * rbd_bus_add_dev() succeeds, failures are unwound through the
 * err_out_* label ladder at the bottom, which (below err_out_bus)
 * falls through label-to-label so cleanup runs in reverse order of
 * setup.  After rbd_bus_add_dev(), cleanup is the job of the sysfs
 * release path (see the comment in the body).
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	char *options;
	struct rbd_device *rbd_dev = NULL;
	const char *mon_addrs = NULL;
	size_t mon_addrs_size = 0;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;
	char *snap_name;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	options = kmalloc(count, GFP_KERNEL);
	if (!options)
		goto err_out_mem;
	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		goto err_out_mem;

	/* static rbd_device initialization */
	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	/* parse add command */
	snap_name = rbd_add_parse_args(rbd_dev, buf,
				&mon_addrs, &mon_addrs_size, options, count);
	if (IS_ERR(snap_name)) {
		rc = PTR_ERR(snap_name);
		goto err_out_mem;
	}

	rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
	if (rc < 0)
		goto err_out_args;

	/* pick the pool */
	osdc = &rbd_dev->rbd_client->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
	if (rc < 0)
		goto err_out_client;
	rbd_dev->pool_id = (u64) rc;

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_client;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = rbd_dev_snaps_update(rbd_dev);
	if (rc)
		goto err_out_probe;

	/*
	 * NOTE(review): ownership of snap_name appears to pass to
	 * rbd_dev_set_mapping(); verify the err_out_* paths above this
	 * call cannot leak it (err_out_args frees rbd_dev->snap_name,
	 * not snap_name itself).
	 */
	rc = rbd_dev_set_mapping(rbd_dev, snap_name);
	if (rc)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	rc = register_blkdev(0, rbd_dev->name);
	if (rc < 0)
		goto err_out_id;
	rbd_dev->major = rc;

	/* Set up the blkdev mapping. */

	rc = rbd_init_disk(rbd_dev);
	if (rc)
		goto err_out_blkdev;

	rc = rbd_bus_add_dev(rbd_dev);
	if (rc)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */

	/* hold header_rwsem across snapshot registration */
	down_write(&rbd_dev->header_rwsem);
	rc = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (rc)
		goto err_out_bus;

	rc = rbd_init_watch_dev(rbd_dev);
	if (rc)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return count;

err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);
	kfree(options);
	return rc;

	/* Labels below unwind in reverse order of setup, falling through. */
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);
err_out_probe:
	rbd_header_free(&rbd_dev->header);
err_out_client:
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev);
	kfree(rbd_dev->image_id);
err_out_args:
	kfree(rbd_dev->snap_name);
	kfree(rbd_dev->image_name);
	kfree(rbd_dev->pool_name);
err_out_mem:
	kfree(rbd_dev);
	kfree(options);

	dout("Error adding device %s\n", buf);
	module_put(THIS_MODULE);

	return (ssize_t) rc;
}
3270
de71a297 3271static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
602adf40
YS
3272{
3273 struct list_head *tmp;
3274 struct rbd_device *rbd_dev;
3275
e124a82f 3276 spin_lock(&rbd_dev_list_lock);
602adf40
YS
3277 list_for_each(tmp, &rbd_dev_list) {
3278 rbd_dev = list_entry(tmp, struct rbd_device, node);
de71a297 3279 if (rbd_dev->dev_id == dev_id) {
e124a82f 3280 spin_unlock(&rbd_dev_list_lock);
602adf40 3281 return rbd_dev;
e124a82f 3282 }
602adf40 3283 }
e124a82f 3284 spin_unlock(&rbd_dev_list_lock);
602adf40
YS
3285 return NULL;
3286}
3287
/*
 * Release callback for the struct device embedded in an rbd device.
 * Undoes the setup performed by rbd_add(): tears down the watch
 * request/event, drops the client, removes the disk and blkdev
 * registration, frees the header and name strings, releases the id,
 * frees the device itself, and finally drops the module reference.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* tear down the lingering watch request and watch event, if set */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->snap_name);
	kfree(rbd_dev->image_id);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_dev_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
3322
dfc5606d
YS
3323static ssize_t rbd_remove(struct bus_type *bus,
3324 const char *buf,
3325 size_t count)
602adf40
YS
3326{
3327 struct rbd_device *rbd_dev = NULL;
3328 int target_id, rc;
3329 unsigned long ul;
3330 int ret = count;
3331
3332 rc = strict_strtoul(buf, 10, &ul);
3333 if (rc)
3334 return rc;
3335
3336 /* convert to int; abort if we lost anything in the conversion */
3337 target_id = (int) ul;
3338 if (target_id != ul)
3339 return -EINVAL;
3340
3341 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3342
3343 rbd_dev = __rbd_get_dev(target_id);
3344 if (!rbd_dev) {
3345 ret = -ENOENT;
3346 goto done;
3347 }
3348
41f38c2b 3349 rbd_remove_all_snaps(rbd_dev);
dfc5606d 3350 rbd_bus_del_dev(rbd_dev);
602adf40
YS
3351
3352done:
3353 mutex_unlock(&ctl_mutex);
aafb230e 3354
602adf40
YS
3355 return ret;
3356}
3357
602adf40
YS
3358/*
3359 * create control files in sysfs
dfc5606d 3360 * /sys/bus/rbd/...
602adf40
YS
3361 */
3362static int rbd_sysfs_init(void)
3363{
dfc5606d 3364 int ret;
602adf40 3365
fed4c143 3366 ret = device_register(&rbd_root_dev);
21079786 3367 if (ret < 0)
dfc5606d 3368 return ret;
602adf40 3369
fed4c143
AE
3370 ret = bus_register(&rbd_bus_type);
3371 if (ret < 0)
3372 device_unregister(&rbd_root_dev);
602adf40 3373
602adf40
YS
3374 return ret;
3375}
3376
/*
 * Tear down the sysfs entries created by rbd_sysfs_init(), in
 * reverse order of their registration.
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3382
3383int __init rbd_init(void)
3384{
3385 int rc;
3386
3387 rc = rbd_sysfs_init();
3388 if (rc)
3389 return rc;
f0f8cef5 3390 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
602adf40
YS
3391 return 0;
3392}
3393
/* Module exit: remove the sysfs bus and root device. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3398
/* Module entry/exit points and module metadata. */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");