Merge tag 'for-5.7/dm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/device...
[linux-block.git] / fs / block_dev.c
CommitLineData
457c8996 1// SPDX-License-Identifier: GPL-2.0-only
1da177e4
LT
2/*
3 * linux/fs/block_dev.c
4 *
5 * Copyright (C) 1991, 1992 Linus Torvalds
6 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
7 */
8
1da177e4
LT
9#include <linux/init.h>
10#include <linux/mm.h>
11#include <linux/fcntl.h>
12#include <linux/slab.h>
13#include <linux/kmod.h>
14#include <linux/major.h>
7db9cfd3 15#include <linux/device_cgroup.h>
1da177e4
LT
16#include <linux/highmem.h>
17#include <linux/blkdev.h>
66114cad 18#include <linux/backing-dev.h>
1da177e4
LT
19#include <linux/module.h>
20#include <linux/blkpg.h>
b502bd11 21#include <linux/magic.h>
b0686260 22#include <linux/dax.h>
1da177e4 23#include <linux/buffer_head.h>
ff01bb48 24#include <linux/swap.h>
585d3bc0 25#include <linux/pagevec.h>
811d736f 26#include <linux/writeback.h>
1da177e4
LT
27#include <linux/mpage.h>
28#include <linux/mount.h>
9030d16e 29#include <linux/pseudo_fs.h>
1da177e4
LT
30#include <linux/uio.h>
31#include <linux/namei.h>
1368c4f2 32#include <linux/log2.h>
ff01bb48 33#include <linux/cleancache.h>
189ce2b9 34#include <linux/task_io_accounting_ops.h>
25f4c414 35#include <linux/falloc.h>
7c0f6ba6 36#include <linux/uaccess.h>
56939e01 37#include <linux/suspend.h>
07f3f05c 38#include "internal.h"
1da177e4
LT
39
40struct bdev_inode {
41 struct block_device bdev;
42 struct inode vfs_inode;
43};
44
4c54ac62
AB
45static const struct address_space_operations def_blk_aops;
46
1da177e4
LT
47static inline struct bdev_inode *BDEV_I(struct inode *inode)
48{
49 return container_of(inode, struct bdev_inode, vfs_inode);
50}
51
ff5053f6 52struct block_device *I_BDEV(struct inode *inode)
1da177e4
LT
53{
54 return &BDEV_I(inode)->bdev;
55}
1da177e4
LT
56EXPORT_SYMBOL(I_BDEV);
57
dbd3ca50 58static void bdev_write_inode(struct block_device *bdev)
564f00f6 59{
dbd3ca50
VG
60 struct inode *inode = bdev->bd_inode;
61 int ret;
62
564f00f6
CH
63 spin_lock(&inode->i_lock);
64 while (inode->i_state & I_DIRTY) {
65 spin_unlock(&inode->i_lock);
dbd3ca50
VG
66 ret = write_inode_now(inode, true);
67 if (ret) {
68 char name[BDEVNAME_SIZE];
69 pr_warn_ratelimited("VFS: Dirty inode writeback failed "
70 "for block device %s (err=%d).\n",
71 bdevname(bdev, name), ret);
72 }
564f00f6
CH
73 spin_lock(&inode->i_lock);
74 }
75 spin_unlock(&inode->i_lock);
76}
77
f9a14399 78/* Kill _all_ buffers and pagecache , dirty or not.. */
ff01bb48 79void kill_bdev(struct block_device *bdev)
1da177e4 80{
ff01bb48
AV
81 struct address_space *mapping = bdev->bd_inode->i_mapping;
82
f9fe48be 83 if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
f9a14399 84 return;
ff01bb48 85
f9a14399 86 invalidate_bh_lrus();
ff01bb48 87 truncate_inode_pages(mapping, 0);
1da177e4 88}
ff01bb48
AV
89EXPORT_SYMBOL(kill_bdev);
90
91/* Invalidate clean unused buffers and pagecache. */
92void invalidate_bdev(struct block_device *bdev)
93{
94 struct address_space *mapping = bdev->bd_inode->i_mapping;
95
a5f6a6a9
AR
96 if (mapping->nrpages) {
97 invalidate_bh_lrus();
98 lru_add_drain_all(); /* make sure all lru add caches are flushed */
99 invalidate_mapping_pages(mapping, 0, -1);
100 }
ff01bb48
AV
101 /* 99% of the time, we don't need to flush the cleancache on the bdev.
102 * But, for the strange corners, lets be cautious
103 */
3167760f 104 cleancache_invalidate_inode(mapping);
ff01bb48
AV
105}
106EXPORT_SYMBOL(invalidate_bdev);
1da177e4 107
04906b2f
JK
108static void set_init_blocksize(struct block_device *bdev)
109{
110 unsigned bsize = bdev_logical_block_size(bdev);
111 loff_t size = i_size_read(bdev->bd_inode);
112
113 while (bsize < PAGE_SIZE) {
114 if (size & bsize)
115 break;
116 bsize <<= 1;
117 }
118 bdev->bd_block_size = bsize;
119 bdev->bd_inode->i_blkbits = blksize_bits(bsize);
120}
121
1da177e4
LT
122int set_blocksize(struct block_device *bdev, int size)
123{
124 /* Size must be a power of two, and between 512 and PAGE_SIZE */
1368c4f2 125 if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
1da177e4
LT
126 return -EINVAL;
127
128 /* Size cannot be smaller than the size supported by the device */
e1defc4f 129 if (size < bdev_logical_block_size(bdev))
1da177e4
LT
130 return -EINVAL;
131
132 /* Don't change the size if it is same as current */
133 if (bdev->bd_block_size != size) {
134 sync_blockdev(bdev);
135 bdev->bd_block_size = size;
136 bdev->bd_inode->i_blkbits = blksize_bits(size);
137 kill_bdev(bdev);
138 }
139 return 0;
140}
141
142EXPORT_SYMBOL(set_blocksize);
143
144int sb_set_blocksize(struct super_block *sb, int size)
145{
1da177e4
LT
146 if (set_blocksize(sb->s_bdev, size))
147 return 0;
148 /* If we get here, we know size is power of two
149 * and it's value is between 512 and PAGE_SIZE */
150 sb->s_blocksize = size;
38885bd4 151 sb->s_blocksize_bits = blksize_bits(size);
1da177e4
LT
152 return sb->s_blocksize;
153}
154
155EXPORT_SYMBOL(sb_set_blocksize);
156
157int sb_min_blocksize(struct super_block *sb, int size)
158{
e1defc4f 159 int minsize = bdev_logical_block_size(sb->s_bdev);
1da177e4
LT
160 if (size < minsize)
161 size = minsize;
162 return sb_set_blocksize(sb, size);
163}
164
165EXPORT_SYMBOL(sb_min_blocksize);
166
167static int
168blkdev_get_block(struct inode *inode, sector_t iblock,
169 struct buffer_head *bh, int create)
170{
1da177e4
LT
171 bh->b_bdev = I_BDEV(inode);
172 bh->b_blocknr = iblock;
173 set_buffer_mapped(bh);
174 return 0;
175}
176
4ebb16ca
DW
177static struct inode *bdev_file_inode(struct file *file)
178{
179 return file->f_mapping->host;
180}
181
78250c02
JA
182static unsigned int dio_bio_write_op(struct kiocb *iocb)
183{
184 unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
185
186 /* avoid the need for a I/O completion work item */
187 if (iocb->ki_flags & IOCB_DSYNC)
188 op |= REQ_FUA;
189 return op;
190}
191
189ce2b9
CH
/*
 * Number of bio_vecs kept on the stack by __blkdev_direct_IO_simple();
 * larger requests allocate their vector array with kmalloc_array().
 */
#define DIO_INLINE_BIO_VECS 4
193
194static void blkdev_bio_end_io_simple(struct bio *bio)
195{
196 struct task_struct *waiter = bio->bi_private;
197
198 WRITE_ONCE(bio->bi_private, NULL);
0619317f 199 blk_wake_io_task(waiter);
189ce2b9
CH
200}
201
202static ssize_t
203__blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
204 int nr_pages)
205{
206 struct file *file = iocb->ki_filp;
207 struct block_device *bdev = I_BDEV(bdev_file_inode(file));
9fec4a21 208 struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
189ce2b9
CH
209 loff_t pos = iocb->ki_pos;
210 bool should_dirty = false;
211 struct bio bio;
212 ssize_t ret;
213 blk_qc_t qc;
189ce2b9 214
9a794fb9
JA
215 if ((pos | iov_iter_alignment(iter)) &
216 (bdev_logical_block_size(bdev) - 1))
189ce2b9
CH
217 return -EINVAL;
218
72ecad22
JA
219 if (nr_pages <= DIO_INLINE_BIO_VECS)
220 vecs = inline_vecs;
221 else {
6da2ec56
KC
222 vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec),
223 GFP_KERNEL);
72ecad22
JA
224 if (!vecs)
225 return -ENOMEM;
226 }
227
3a83f467 228 bio_init(&bio, vecs, nr_pages);
74d46992 229 bio_set_dev(&bio, bdev);
4d1a4765 230 bio.bi_iter.bi_sector = pos >> 9;
45d06cf7 231 bio.bi_write_hint = iocb->ki_hint;
189ce2b9
CH
232 bio.bi_private = current;
233 bio.bi_end_io = blkdev_bio_end_io_simple;
074111ca 234 bio.bi_ioprio = iocb->ki_ioprio;
189ce2b9
CH
235
236 ret = bio_iov_iter_get_pages(&bio, iter);
237 if (unlikely(ret))
9362dd11 238 goto out;
189ce2b9
CH
239 ret = bio.bi_iter.bi_size;
240
241 if (iov_iter_rw(iter) == READ) {
78250c02 242 bio.bi_opf = REQ_OP_READ;
189ce2b9
CH
243 if (iter_is_iovec(iter))
244 should_dirty = true;
245 } else {
78250c02 246 bio.bi_opf = dio_bio_write_op(iocb);
189ce2b9
CH
247 task_io_account_write(ret);
248 }
d1e36282 249 if (iocb->ki_flags & IOCB_HIPRI)
0bbb280d 250 bio_set_polled(&bio, iocb);
189ce2b9
CH
251
252 qc = submit_bio(&bio);
253 for (;;) {
1ac5cd49 254 set_current_state(TASK_UNINTERRUPTIBLE);
189ce2b9
CH
255 if (!READ_ONCE(bio.bi_private))
256 break;
257 if (!(iocb->ki_flags & IOCB_HIPRI) ||
0a1b8b87 258 !blk_poll(bdev_get_queue(bdev), qc, true))
189ce2b9
CH
259 io_schedule();
260 }
261 __set_current_state(TASK_RUNNING);
262
9fec4a21 263 bio_release_pages(&bio, should_dirty);
4e4cbee9 264 if (unlikely(bio.bi_status))
c6b1e36c 265 ret = blk_status_to_errno(bio.bi_status);
9ae3b3f5 266
9362dd11
MW
267out:
268 if (vecs != inline_vecs)
269 kfree(vecs);
270
9ae3b3f5
JA
271 bio_uninit(&bio);
272
189ce2b9
CH
273 return ret;
274}
275
542ff7bf
CH
276struct blkdev_dio {
277 union {
278 struct kiocb *iocb;
279 struct task_struct *waiter;
280 };
281 size_t size;
282 atomic_t ref;
283 bool multi_bio : 1;
284 bool should_dirty : 1;
285 bool is_sync : 1;
286 struct bio bio;
287};
288
52190f8a 289static struct bio_set blkdev_dio_pool;
542ff7bf 290
eae83ce1
CH
291static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
292{
293 struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
294 struct request_queue *q = bdev_get_queue(bdev);
295
296 return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
297}
298
542ff7bf
CH
299static void blkdev_bio_end_io(struct bio *bio)
300{
301 struct blkdev_dio *dio = bio->bi_private;
302 bool should_dirty = dio->should_dirty;
303
a89afe58
JY
304 if (bio->bi_status && !dio->bio.bi_status)
305 dio->bio.bi_status = bio->bi_status;
306
307 if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) {
542ff7bf
CH
308 if (!dio->is_sync) {
309 struct kiocb *iocb = dio->iocb;
4e4cbee9 310 ssize_t ret;
542ff7bf 311
4e4cbee9 312 if (likely(!dio->bio.bi_status)) {
542ff7bf
CH
313 ret = dio->size;
314 iocb->ki_pos += ret;
4e4cbee9
CH
315 } else {
316 ret = blk_status_to_errno(dio->bio.bi_status);
542ff7bf
CH
317 }
318
319 dio->iocb->ki_complete(iocb, ret, 0);
531724ab
CH
320 if (dio->multi_bio)
321 bio_put(&dio->bio);
542ff7bf
CH
322 } else {
323 struct task_struct *waiter = dio->waiter;
324
325 WRITE_ONCE(dio->waiter, NULL);
0619317f 326 blk_wake_io_task(waiter);
542ff7bf
CH
327 }
328 }
329
330 if (should_dirty) {
331 bio_check_pages_dirty(bio);
332 } else {
57dfe3ce 333 bio_release_pages(bio, false);
542ff7bf
CH
334 bio_put(bio);
335 }
336}
337
b2e895db 338static ssize_t
542ff7bf 339__blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
b2e895db
AM
340{
341 struct file *file = iocb->ki_filp;
4ebb16ca 342 struct inode *inode = bdev_file_inode(file);
542ff7bf 343 struct block_device *bdev = I_BDEV(inode);
64d656a1 344 struct blk_plug plug;
542ff7bf
CH
345 struct blkdev_dio *dio;
346 struct bio *bio;
cb700eb3 347 bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
690e5325 348 bool is_read = (iov_iter_rw(iter) == READ), is_sync;
542ff7bf
CH
349 loff_t pos = iocb->ki_pos;
350 blk_qc_t qc = BLK_QC_T_NONE;
7b6620d7 351 int ret = 0;
542ff7bf 352
9a794fb9
JA
353 if ((pos | iov_iter_alignment(iter)) &
354 (bdev_logical_block_size(bdev) - 1))
542ff7bf
CH
355 return -EINVAL;
356
7b6620d7 357 bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
542ff7bf
CH
358
359 dio = container_of(bio, struct blkdev_dio, bio);
690e5325 360 dio->is_sync = is_sync = is_sync_kiocb(iocb);
531724ab 361 if (dio->is_sync) {
542ff7bf 362 dio->waiter = current;
531724ab
CH
363 bio_get(bio);
364 } else {
542ff7bf 365 dio->iocb = iocb;
531724ab 366 }
542ff7bf
CH
367
368 dio->size = 0;
369 dio->multi_bio = false;
00e23707 370 dio->should_dirty = is_read && iter_is_iovec(iter);
542ff7bf 371
cb700eb3
JA
372 /*
373 * Don't plug for HIPRI/polled IO, as those should go straight
374 * to issue
375 */
376 if (!is_poll)
377 blk_start_plug(&plug);
378
542ff7bf 379 for (;;) {
74d46992 380 bio_set_dev(bio, bdev);
4d1a4765 381 bio->bi_iter.bi_sector = pos >> 9;
45d06cf7 382 bio->bi_write_hint = iocb->ki_hint;
542ff7bf
CH
383 bio->bi_private = dio;
384 bio->bi_end_io = blkdev_bio_end_io;
074111ca 385 bio->bi_ioprio = iocb->ki_ioprio;
542ff7bf 386
e15c2ffa
JA
387 ret = bio_iov_iter_get_pages(bio, iter);
388 if (unlikely(ret)) {
4e4cbee9 389 bio->bi_status = BLK_STS_IOERR;
542ff7bf
CH
390 bio_endio(bio);
391 break;
392 }
393
394 if (is_read) {
395 bio->bi_opf = REQ_OP_READ;
396 if (dio->should_dirty)
397 bio_set_pages_dirty(bio);
398 } else {
399 bio->bi_opf = dio_bio_write_op(iocb);
400 task_io_account_write(bio->bi_iter.bi_size);
401 }
402
7b6620d7 403 dio->size += bio->bi_iter.bi_size;
542ff7bf
CH
404 pos += bio->bi_iter.bi_size;
405
406 nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
407 if (!nr_pages) {
eae83ce1
CH
408 bool polled = false;
409
410 if (iocb->ki_flags & IOCB_HIPRI) {
0bbb280d 411 bio_set_polled(bio, iocb);
eae83ce1
CH
412 polled = true;
413 }
d34513d3 414
542ff7bf 415 qc = submit_bio(bio);
eae83ce1
CH
416
417 if (polled)
418 WRITE_ONCE(iocb->ki_cookie, qc);
542ff7bf
CH
419 break;
420 }
421
422 if (!dio->multi_bio) {
531724ab
CH
423 /*
424 * AIO needs an extra reference to ensure the dio
425 * structure which is embedded into the first bio
426 * stays around.
427 */
428 if (!is_sync)
429 bio_get(bio);
542ff7bf
CH
430 dio->multi_bio = true;
431 atomic_set(&dio->ref, 2);
432 } else {
433 atomic_inc(&dio->ref);
434 }
435
7b6620d7
JA
436 submit_bio(bio);
437 bio = bio_alloc(GFP_KERNEL, nr_pages);
542ff7bf 438 }
cb700eb3
JA
439
440 if (!is_poll)
441 blk_finish_plug(&plug);
542ff7bf 442
690e5325 443 if (!is_sync)
542ff7bf
CH
444 return -EIOCBQUEUED;
445
446 for (;;) {
1ac5cd49 447 set_current_state(TASK_UNINTERRUPTIBLE);
542ff7bf
CH
448 if (!READ_ONCE(dio->waiter))
449 break;
450
451 if (!(iocb->ki_flags & IOCB_HIPRI) ||
0a1b8b87 452 !blk_poll(bdev_get_queue(bdev), qc, true))
542ff7bf
CH
453 io_schedule();
454 }
455 __set_current_state(TASK_RUNNING);
456
36ffc6c1 457 if (!ret)
4e4cbee9 458 ret = blk_status_to_errno(dio->bio.bi_status);
e15c2ffa
JA
459 if (likely(!ret))
460 ret = dio->size;
542ff7bf
CH
461
462 bio_put(&dio->bio);
463 return ret;
464}
465
466static ssize_t
467blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
468{
189ce2b9 469 int nr_pages;
b2e895db 470
72ecad22 471 nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1);
189ce2b9
CH
472 if (!nr_pages)
473 return 0;
72ecad22 474 if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES)
189ce2b9 475 return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
542ff7bf
CH
476
477 return __blkdev_direct_IO(iocb, iter, min(nr_pages, BIO_MAX_PAGES));
478}
479
480static __init int blkdev_init(void)
481{
52190f8a 482 return bioset_init(&blkdev_dio_pool, 4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS);
b2e895db 483}
542ff7bf 484module_init(blkdev_init);
b2e895db 485
5cee5815
JK
486int __sync_blockdev(struct block_device *bdev, int wait)
487{
488 if (!bdev)
489 return 0;
490 if (!wait)
491 return filemap_flush(bdev->bd_inode->i_mapping);
492 return filemap_write_and_wait(bdev->bd_inode->i_mapping);
493}
494
585d3bc0
NP
/*
 * Write out and wait upon all the dirty data associated with a block
 * device via its mapping.  Does not take the superblock lock.
 */
int sync_blockdev(struct block_device *bdev)
{
	return __sync_blockdev(bdev, 1);
}
EXPORT_SYMBOL(sync_blockdev);
504
/*
 * Write out and wait upon all dirty data associated with this
 * device.   Filesystem data as well as the underlying block
 * device.  Takes the superblock lock.
 *
 * If a superblock is found for @bdev the result of sync_filesystem() is
 * returned; otherwise the device mapping alone is synced.
 */
int fsync_bdev(struct block_device *bdev)
{
	struct super_block *sb = get_super(bdev);
	if (sb) {
		int res = sync_filesystem(sb);
		drop_super(sb);
		return res;
	}
	return sync_blockdev(bdev);
}
EXPORT_SYMBOL(fsync_bdev);
585d3bc0
NP
521
522/**
523 * freeze_bdev -- lock a filesystem and force it into a consistent state
524 * @bdev: blockdevice to lock
525 *
585d3bc0
NP
526 * If a superblock is found on this device, we take the s_umount semaphore
527 * on it to make sure nobody unmounts until the snapshot creation is done.
528 * The reference counter (bd_fsfreeze_count) guarantees that only the last
529 * unfreeze process can unfreeze the frozen filesystem actually when multiple
530 * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
531 * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
532 * actually.
533 */
534struct super_block *freeze_bdev(struct block_device *bdev)
535{
536 struct super_block *sb;
537 int error = 0;
538
539 mutex_lock(&bdev->bd_fsfreeze_mutex);
4504230a
CH
540 if (++bdev->bd_fsfreeze_count > 1) {
541 /*
542 * We don't even need to grab a reference - the first call
543 * to freeze_bdev grab an active reference and only the last
544 * thaw_bdev drops it.
545 */
585d3bc0 546 sb = get_super(bdev);
5bb53c0f
AR
547 if (sb)
548 drop_super(sb);
4504230a
CH
549 mutex_unlock(&bdev->bd_fsfreeze_mutex);
550 return sb;
551 }
552
553 sb = get_active_super(bdev);
554 if (!sb)
555 goto out;
48b6bca6
BM
556 if (sb->s_op->freeze_super)
557 error = sb->s_op->freeze_super(sb);
558 else
559 error = freeze_super(sb);
18e9e510
JB
560 if (error) {
561 deactivate_super(sb);
562 bdev->bd_fsfreeze_count--;
585d3bc0 563 mutex_unlock(&bdev->bd_fsfreeze_mutex);
18e9e510 564 return ERR_PTR(error);
585d3bc0 565 }
18e9e510 566 deactivate_super(sb);
4504230a 567 out:
585d3bc0
NP
568 sync_blockdev(bdev);
569 mutex_unlock(&bdev->bd_fsfreeze_mutex);
4fadd7bb 570 return sb; /* thaw_bdev releases s->s_umount */
585d3bc0
NP
571}
572EXPORT_SYMBOL(freeze_bdev);
573
574/**
575 * thaw_bdev -- unlock filesystem
576 * @bdev: blockdevice to unlock
577 * @sb: associated superblock
578 *
579 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
580 */
581int thaw_bdev(struct block_device *bdev, struct super_block *sb)
582{
4504230a 583 int error = -EINVAL;
585d3bc0
NP
584
585 mutex_lock(&bdev->bd_fsfreeze_mutex);
4504230a 586 if (!bdev->bd_fsfreeze_count)
18e9e510 587 goto out;
4504230a
CH
588
589 error = 0;
590 if (--bdev->bd_fsfreeze_count > 0)
18e9e510 591 goto out;
4504230a
CH
592
593 if (!sb)
18e9e510 594 goto out;
4504230a 595
48b6bca6
BM
596 if (sb->s_op->thaw_super)
597 error = sb->s_op->thaw_super(sb);
598 else
599 error = thaw_super(sb);
997198ba 600 if (error)
18e9e510 601 bdev->bd_fsfreeze_count++;
18e9e510 602out:
585d3bc0 603 mutex_unlock(&bdev->bd_fsfreeze_mutex);
997198ba 604 return error;
585d3bc0
NP
605}
606EXPORT_SYMBOL(thaw_bdev);
607
1da177e4
LT
608static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
609{
610 return block_write_full_page(page, blkdev_get_block, wbc);
611}
612
613static int blkdev_readpage(struct file * file, struct page * page)
614{
615 return block_read_full_page(page, blkdev_get_block);
616}
617
447f05bb
AM
618static int blkdev_readpages(struct file *file, struct address_space *mapping,
619 struct list_head *pages, unsigned nr_pages)
620{
621 return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block);
622}
623
6272b5a5
NP
624static int blkdev_write_begin(struct file *file, struct address_space *mapping,
625 loff_t pos, unsigned len, unsigned flags,
626 struct page **pagep, void **fsdata)
1da177e4 627{
155130a4
CH
628 return block_write_begin(mapping, pos, len, flags, pagep,
629 blkdev_get_block);
1da177e4
LT
630}
631
6272b5a5
NP
632static int blkdev_write_end(struct file *file, struct address_space *mapping,
633 loff_t pos, unsigned len, unsigned copied,
634 struct page *page, void *fsdata)
1da177e4 635{
6272b5a5
NP
636 int ret;
637 ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);
638
639 unlock_page(page);
09cbfeaf 640 put_page(page);
6272b5a5
NP
641
642 return ret;
1da177e4
LT
643}
644
645/*
646 * private llseek:
496ad9aa 647 * for a block special file file_inode(file)->i_size is zero
1da177e4
LT
648 * so we compute the size by hand (just as in block_read/write above)
649 */
965c8e59 650static loff_t block_llseek(struct file *file, loff_t offset, int whence)
1da177e4 651{
4ebb16ca 652 struct inode *bd_inode = bdev_file_inode(file);
1da177e4
LT
653 loff_t retval;
654
5955102c 655 inode_lock(bd_inode);
5d48f3a2 656 retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
5955102c 657 inode_unlock(bd_inode);
1da177e4
LT
658 return retval;
659}
660
02c24a82 661int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
1da177e4 662{
4ebb16ca 663 struct inode *bd_inode = bdev_file_inode(filp);
b8af67e2 664 struct block_device *bdev = I_BDEV(bd_inode);
ab0a9735 665 int error;
da5aa861 666
372cf243 667 error = file_write_and_wait_range(filp, start, end);
da5aa861
RW
668 if (error)
669 return error;
ab0a9735 670
b8af67e2
AB
671 /*
672 * There is no need to serialise calls to blkdev_issue_flush with
673 * i_mutex and doing so causes performance issues with concurrent
674 * O_SYNC writers to a block device.
675 */
dd3932ed 676 error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
ab0a9735
CH
677 if (error == -EOPNOTSUPP)
678 error = 0;
b8af67e2 679
ab0a9735 680 return error;
1da177e4 681}
b1dd3b28 682EXPORT_SYMBOL(blkdev_fsync);
1da177e4 683
47a191fd
MW
684/**
685 * bdev_read_page() - Start reading a page from a block device
686 * @bdev: The device to read the page from
687 * @sector: The offset on the device to read the page to (need not be aligned)
688 * @page: The page to read
689 *
690 * On entry, the page should be locked. It will be unlocked when the page
691 * has been read. If the block driver implements rw_page synchronously,
692 * that will be true on exit from this function, but it need not be.
693 *
694 * Errors returned by this function are usually "soft", eg out of memory, or
695 * queue full; callers should try a different route to read this page rather
696 * than propagate an error back up the stack.
697 *
698 * Return: negative errno if an error occurs, 0 if submission was successful.
699 */
700int bdev_read_page(struct block_device *bdev, sector_t sector,
701 struct page *page)
702{
703 const struct block_device_operations *ops = bdev->bd_disk->fops;
2e6edc95
DW
704 int result = -EOPNOTSUPP;
705
f68eb1e7 706 if (!ops->rw_page || bdev_get_integrity(bdev))
2e6edc95
DW
707 return result;
708
3a0a5299 709 result = blk_queue_enter(bdev->bd_queue, 0);
2e6edc95
DW
710 if (result)
711 return result;
3f289dcb
TH
712 result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
713 REQ_OP_READ);
2e6edc95
DW
714 blk_queue_exit(bdev->bd_queue);
715 return result;
47a191fd
MW
716}
717EXPORT_SYMBOL_GPL(bdev_read_page);
718
719/**
720 * bdev_write_page() - Start writing a page to a block device
721 * @bdev: The device to write the page to
722 * @sector: The offset on the device to write the page to (need not be aligned)
723 * @page: The page to write
724 * @wbc: The writeback_control for the write
725 *
726 * On entry, the page should be locked and not currently under writeback.
727 * On exit, if the write started successfully, the page will be unlocked and
728 * under writeback. If the write failed already (eg the driver failed to
729 * queue the page to the device), the page will still be locked. If the
730 * caller is a ->writepage implementation, it will need to unlock the page.
731 *
732 * Errors returned by this function are usually "soft", eg out of memory, or
733 * queue full; callers should try a different route to write this page rather
734 * than propagate an error back up the stack.
735 *
736 * Return: negative errno if an error occurs, 0 if submission was successful.
737 */
738int bdev_write_page(struct block_device *bdev, sector_t sector,
739 struct page *page, struct writeback_control *wbc)
740{
741 int result;
47a191fd 742 const struct block_device_operations *ops = bdev->bd_disk->fops;
2e6edc95 743
f68eb1e7 744 if (!ops->rw_page || bdev_get_integrity(bdev))
47a191fd 745 return -EOPNOTSUPP;
3a0a5299 746 result = blk_queue_enter(bdev->bd_queue, 0);
2e6edc95
DW
747 if (result)
748 return result;
749
47a191fd 750 set_page_writeback(page);
3f289dcb
TH
751 result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
752 REQ_OP_WRITE);
f892760a 753 if (result) {
47a191fd 754 end_page_writeback(page);
f892760a
MW
755 } else {
756 clean_page_buffers(page);
47a191fd 757 unlock_page(page);
f892760a 758 }
2e6edc95 759 blk_queue_exit(bdev->bd_queue);
47a191fd
MW
760 return result;
761}
762EXPORT_SYMBOL_GPL(bdev_write_page);
763
1da177e4
LT
764/*
765 * pseudo-fs
766 */
767
768static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
e18b890b 769static struct kmem_cache * bdev_cachep __read_mostly;
1da177e4
LT
770
771static struct inode *bdev_alloc_inode(struct super_block *sb)
772{
e94b1766 773 struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
1da177e4
LT
774 if (!ei)
775 return NULL;
776 return &ei->vfs_inode;
777}
778
41149cb0 779static void bdev_free_inode(struct inode *inode)
1da177e4 780{
41149cb0 781 kmem_cache_free(bdev_cachep, BDEV_I(inode));
fa0d7e3d
NP
782}
783
51cc5068 784static void init_once(void *foo)
1da177e4
LT
785{
786 struct bdev_inode *ei = (struct bdev_inode *) foo;
787 struct block_device *bdev = &ei->bdev;
788
a35afb83
CL
789 memset(bdev, 0, sizeof(*bdev));
790 mutex_init(&bdev->bd_mutex);
a35afb83 791 INIT_LIST_HEAD(&bdev->bd_list);
49731baa
TH
792#ifdef CONFIG_SYSFS
793 INIT_LIST_HEAD(&bdev->bd_holder_disks);
794#endif
a5a79d00 795 bdev->bd_bdi = &noop_backing_dev_info;
a35afb83 796 inode_init_once(&ei->vfs_inode);
fcccf502
TS
797 /* Initialize mutex for freeze. */
798 mutex_init(&bdev->bd_fsfreeze_mutex);
1da177e4
LT
799}
800
b57922d9 801static void bdev_evict_inode(struct inode *inode)
1da177e4
LT
802{
803 struct block_device *bdev = &BDEV_I(inode)->bdev;
91b0abe3 804 truncate_inode_pages_final(&inode->i_data);
b57922d9 805 invalidate_inode_buffers(inode); /* is it needed here? */
dbd5768f 806 clear_inode(inode);
1da177e4 807 spin_lock(&bdev_lock);
1da177e4
LT
808 list_del_init(&bdev->bd_list);
809 spin_unlock(&bdev_lock);
f759741d
JK
810 /* Detach inode from wb early as bdi_put() may free bdi->wb */
811 inode_detach_wb(inode);
a5a79d00 812 if (bdev->bd_bdi != &noop_backing_dev_info) {
b1d2dc56 813 bdi_put(bdev->bd_bdi);
a5a79d00
JK
814 bdev->bd_bdi = &noop_backing_dev_info;
815 }
1da177e4
LT
816}
817
ee9b6d61 818static const struct super_operations bdev_sops = {
1da177e4
LT
819 .statfs = simple_statfs,
820 .alloc_inode = bdev_alloc_inode,
41149cb0 821 .free_inode = bdev_free_inode,
1da177e4 822 .drop_inode = generic_delete_inode,
b57922d9 823 .evict_inode = bdev_evict_inode,
1da177e4
LT
824};
825
9030d16e 826static int bd_init_fs_context(struct fs_context *fc)
1da177e4 827{
9030d16e
DH
828 struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC);
829 if (!ctx)
830 return -ENOMEM;
831 fc->s_iflags |= SB_I_CGROUPWB;
832 ctx->ops = &bdev_sops;
833 return 0;
1da177e4
LT
834}
835
836static struct file_system_type bd_type = {
837 .name = "bdev",
9030d16e 838 .init_fs_context = bd_init_fs_context,
1da177e4
LT
839 .kill_sb = kill_anon_super,
840};
841
a212b105
TH
842struct super_block *blockdev_superblock __read_mostly;
843EXPORT_SYMBOL_GPL(blockdev_superblock);
1da177e4
LT
844
845void __init bdev_cache_init(void)
846{
847 int err;
ace8577a 848 static struct vfsmount *bd_mnt;
c2acf7b9 849
1da177e4 850 bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
fffb60f9 851 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
5d097056 852 SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
20c2df83 853 init_once);
1da177e4
LT
854 err = register_filesystem(&bd_type);
855 if (err)
856 panic("Cannot register bdev pseudo-fs");
857 bd_mnt = kern_mount(&bd_type);
1da177e4
LT
858 if (IS_ERR(bd_mnt))
859 panic("Cannot create bdev pseudo-fs");
ace8577a 860 blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
1da177e4
LT
861}
862
863/*
864 * Most likely _very_ bad one - but then it's hardly critical for small
865 * /dev and can be fixed when somebody will need really large one.
866 * Keep in mind that it will be fed through icache hash function too.
867 */
868static inline unsigned long hash(dev_t dev)
869{
870 return MAJOR(dev)+MINOR(dev);
871}
872
873static int bdev_test(struct inode *inode, void *data)
874{
875 return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data;
876}
877
878static int bdev_set(struct inode *inode, void *data)
879{
880 BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data;
881 return 0;
882}
883
/* Every block_device created by bdget(), linked via bd_list under bdev_lock. */
static LIST_HEAD(all_bdevs);
885
f44f1ab5
JK
886/*
887 * If there is a bdev inode for this device, unhash it so that it gets evicted
888 * as soon as last inode reference is dropped.
889 */
890void bdev_unhash_inode(dev_t dev)
891{
892 struct inode *inode;
893
894 inode = ilookup5(blockdev_superblock, hash(dev), bdev_test, &dev);
895 if (inode) {
896 remove_inode_hash(inode);
897 iput(inode);
898 }
899}
900
1da177e4
LT
901struct block_device *bdget(dev_t dev)
902{
903 struct block_device *bdev;
904 struct inode *inode;
905
c2acf7b9 906 inode = iget5_locked(blockdev_superblock, hash(dev),
1da177e4
LT
907 bdev_test, bdev_set, &dev);
908
909 if (!inode)
910 return NULL;
911
912 bdev = &BDEV_I(inode)->bdev;
913
914 if (inode->i_state & I_NEW) {
915 bdev->bd_contains = NULL;
782b94cd 916 bdev->bd_super = NULL;
1da177e4 917 bdev->bd_inode = inode;
93407472 918 bdev->bd_block_size = i_blocksize(inode);
1da177e4
LT
919 bdev->bd_part_count = 0;
920 bdev->bd_invalidated = 0;
921 inode->i_mode = S_IFBLK;
922 inode->i_rdev = dev;
923 inode->i_bdev = bdev;
924 inode->i_data.a_ops = &def_blk_aops;
925 mapping_set_gfp_mask(&inode->i_data, GFP_USER);
1da177e4
LT
926 spin_lock(&bdev_lock);
927 list_add(&bdev->bd_list, &all_bdevs);
928 spin_unlock(&bdev_lock);
929 unlock_new_inode(inode);
930 }
931 return bdev;
932}
933
934EXPORT_SYMBOL(bdget);
935
dddac6a7
AJ
936/**
937 * bdgrab -- Grab a reference to an already referenced block device
938 * @bdev: Block device to grab a reference to.
939 */
940struct block_device *bdgrab(struct block_device *bdev)
941{
7de9c6ee 942 ihold(bdev->bd_inode);
dddac6a7
AJ
943 return bdev;
944}
c1681bf8 945EXPORT_SYMBOL(bdgrab);
dddac6a7 946
1da177e4
LT
947long nr_blockdev_pages(void)
948{
203a2935 949 struct block_device *bdev;
1da177e4
LT
950 long ret = 0;
951 spin_lock(&bdev_lock);
203a2935 952 list_for_each_entry(bdev, &all_bdevs, bd_list) {
1da177e4
LT
953 ret += bdev->bd_inode->i_mapping->nrpages;
954 }
955 spin_unlock(&bdev_lock);
956 return ret;
957}
958
959void bdput(struct block_device *bdev)
960{
961 iput(bdev->bd_inode);
962}
963
964EXPORT_SYMBOL(bdput);
965
966static struct block_device *bd_acquire(struct inode *inode)
967{
968 struct block_device *bdev;
09d967c6 969
1da177e4
LT
970 spin_lock(&bdev_lock);
971 bdev = inode->i_bdev;
cccd9fb9 972 if (bdev && !inode_unhashed(bdev->bd_inode)) {
ed8a9d2c 973 bdgrab(bdev);
1da177e4
LT
974 spin_unlock(&bdev_lock);
975 return bdev;
976 }
977 spin_unlock(&bdev_lock);
09d967c6 978
cccd9fb9
JK
979 /*
980 * i_bdev references block device inode that was already shut down
981 * (corresponding device got removed). Remove the reference and look
982 * up block device inode again just in case new device got
983 * reestablished under the same device number.
984 */
985 if (bdev)
986 bd_forget(inode);
987
1da177e4
LT
988 bdev = bdget(inode->i_rdev);
989 if (bdev) {
990 spin_lock(&bdev_lock);
09d967c6
OH
991 if (!inode->i_bdev) {
992 /*
7de9c6ee 993 * We take an additional reference to bd_inode,
09d967c6
OH
994 * and it's released in clear_inode() of inode.
995 * So, we can access it via ->i_mapping always
996 * without igrab().
997 */
ed8a9d2c 998 bdgrab(bdev);
09d967c6
OH
999 inode->i_bdev = bdev;
1000 inode->i_mapping = bdev->bd_inode->i_mapping;
09d967c6 1001 }
1da177e4
LT
1002 spin_unlock(&bdev_lock);
1003 }
1004 return bdev;
1005}
1006
1007/* Call when you free inode */
1008
1009void bd_forget(struct inode *inode)
1010{
09d967c6
OH
1011 struct block_device *bdev = NULL;
1012
1da177e4 1013 spin_lock(&bdev_lock);
b4ea2eaa
YH
1014 if (!sb_is_blkdev_sb(inode->i_sb))
1015 bdev = inode->i_bdev;
a4a4f943
AV
1016 inode->i_bdev = NULL;
1017 inode->i_mapping = &inode->i_data;
1da177e4 1018 spin_unlock(&bdev_lock);
09d967c6
OH
1019
1020 if (bdev)
ed8a9d2c 1021 bdput(bdev);
1da177e4
LT
1022}
1023
1a3cbbc5
TH
1024/**
1025 * bd_may_claim - test whether a block device can be claimed
1026 * @bdev: block device of interest
1027 * @whole: whole block device containing @bdev, may equal @bdev
1028 * @holder: holder trying to claim @bdev
1029 *
25985edc 1030 * Test whether @bdev can be claimed by @holder.
1a3cbbc5
TH
1031 *
1032 * CONTEXT:
1033 * spin_lock(&bdev_lock).
1034 *
1035 * RETURNS:
1036 * %true if @bdev can be claimed, %false otherwise.
1037 */
1038static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
1039 void *holder)
1da177e4 1040{
1da177e4 1041 if (bdev->bd_holder == holder)
1a3cbbc5 1042 return true; /* already a holder */
1da177e4 1043 else if (bdev->bd_holder != NULL)
1a3cbbc5 1044 return false; /* held by someone else */
bcc7f5b4 1045 else if (whole == bdev)
1a3cbbc5 1046 return true; /* is a whole device which isn't held */
1da177e4 1047
e525fd89 1048 else if (whole->bd_holder == bd_may_claim)
1a3cbbc5
TH
1049 return true; /* is a partition of a device that is being partitioned */
1050 else if (whole->bd_holder != NULL)
1051 return false; /* is a partition of a held device */
1da177e4 1052 else
1a3cbbc5
TH
1053 return true; /* is a partition of an un-held device */
1054}
1055
6b4517a7
TH
1056/**
1057 * bd_prepare_to_claim - prepare to claim a block device
1058 * @bdev: block device of interest
1059 * @whole: the whole device containing @bdev, may equal @bdev
1060 * @holder: holder trying to claim @bdev
1061 *
1062 * Prepare to claim @bdev. This function fails if @bdev is already
1063 * claimed by another holder and waits if another claiming is in
1064 * progress. This function doesn't actually claim. On successful
1065 * return, the caller has ownership of bd_claiming and bd_holder[s].
1066 *
1067 * CONTEXT:
1068 * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab
1069 * it multiple times.
1070 *
1071 * RETURNS:
1072 * 0 if @bdev can be claimed, -EBUSY otherwise.
1073 */
1074static int bd_prepare_to_claim(struct block_device *bdev,
1075 struct block_device *whole, void *holder)
1076{
1077retry:
1078 /* if someone else claimed, fail */
1079 if (!bd_may_claim(bdev, whole, holder))
1080 return -EBUSY;
1081
e75aa858
TH
1082 /* if claiming is already in progress, wait for it to finish */
1083 if (whole->bd_claiming) {
6b4517a7
TH
1084 wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
1085 DEFINE_WAIT(wait);
1086
1087 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
1088 spin_unlock(&bdev_lock);
1089 schedule();
1090 finish_wait(wq, &wait);
1091 spin_lock(&bdev_lock);
1092 goto retry;
1093 }
1094
1095 /* yay, all mine */
1096 return 0;
1097}
1098
560e7cb2
JK
1099static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno)
1100{
1101 struct gendisk *disk = get_gendisk(bdev->bd_dev, partno);
1102
1103 if (!disk)
1104 return NULL;
1105 /*
1106 * Now that we hold gendisk reference we make sure bdev we looked up is
1107 * not stale. If it is, it means device got removed and created before
1108 * we looked up gendisk and we fail open in such case. Associating
1109 * unhashed bdev with newly created gendisk could lead to two bdevs
1110 * (and thus two independent caches) being associated with one device
1111 * which is bad.
1112 */
1113 if (inode_unhashed(bdev->bd_inode)) {
1114 put_disk_and_module(disk);
1115 return NULL;
1116 }
1117 return disk;
1118}
1119
6b4517a7
TH
1120/**
1121 * bd_start_claiming - start claiming a block device
1122 * @bdev: block device of interest
1123 * @holder: holder trying to claim @bdev
1124 *
1125 * @bdev is about to be opened exclusively. Check @bdev can be opened
1126 * exclusively and mark that an exclusive open is in progress. Each
1127 * successful call to this function must be matched with a call to
b0018361
NP
1128 * either bd_finish_claiming() or bd_abort_claiming() (which do not
1129 * fail).
1130 *
1131 * This function is used to gain exclusive access to the block device
1132 * without actually causing other exclusive open attempts to fail. It
1133 * should be used when the open sequence itself requires exclusive
1134 * access but may subsequently fail.
6b4517a7
TH
1135 *
1136 * CONTEXT:
1137 * Might sleep.
1138 *
1139 * RETURNS:
1140 * Pointer to the block device containing @bdev on success, ERR_PTR()
1141 * value on failure.
1142 */
89e524c0 1143struct block_device *bd_start_claiming(struct block_device *bdev, void *holder)
6b4517a7
TH
1144{
1145 struct gendisk *disk;
1146 struct block_device *whole;
1147 int partno, err;
1148
1149 might_sleep();
1150
1151 /*
1152 * @bdev might not have been initialized properly yet, look up
1153 * and grab the outer block device the hard way.
1154 */
560e7cb2 1155 disk = bdev_get_gendisk(bdev, &partno);
6b4517a7
TH
1156 if (!disk)
1157 return ERR_PTR(-ENXIO);
1158
d4c208b8
TH
1159 /*
1160 * Normally, @bdev should equal what's returned from bdget_disk()
1161 * if partno is 0; however, some drivers (floppy) use multiple
1162 * bdev's for the same physical device and @bdev may be one of the
1163 * aliases. Keep @bdev if partno is 0. This means claimer
1164 * tracking is broken for those devices but it has always been that
1165 * way.
1166 */
1167 if (partno)
1168 whole = bdget_disk(disk, 0);
1169 else
1170 whole = bdgrab(bdev);
1171
9df6c299 1172 put_disk_and_module(disk);
6b4517a7
TH
1173 if (!whole)
1174 return ERR_PTR(-ENOMEM);
1175
1176 /* prepare to claim, if successful, mark claiming in progress */
1177 spin_lock(&bdev_lock);
1178
1179 err = bd_prepare_to_claim(bdev, whole, holder);
1180 if (err == 0) {
1181 whole->bd_claiming = holder;
1182 spin_unlock(&bdev_lock);
1183 return whole;
1184 } else {
1185 spin_unlock(&bdev_lock);
1186 bdput(whole);
1187 return ERR_PTR(err);
1188 }
1189}
89e524c0
JK
1190EXPORT_SYMBOL(bd_start_claiming);
1191
1192static void bd_clear_claiming(struct block_device *whole, void *holder)
1193{
1194 lockdep_assert_held(&bdev_lock);
1195 /* tell others that we're done */
1196 BUG_ON(whole->bd_claiming != holder);
1197 whole->bd_claiming = NULL;
1198 wake_up_bit(&whole->bd_claiming, 0);
1199}
1200
1201/**
1202 * bd_finish_claiming - finish claiming of a block device
1203 * @bdev: block device of interest
1204 * @whole: whole block device (returned from bd_start_claiming())
1205 * @holder: holder that has claimed @bdev
1206 *
1207 * Finish exclusive open of a block device. Mark the device as exlusively
1208 * open by the holder and wake up all waiters for exclusive open to finish.
1209 */
1210void bd_finish_claiming(struct block_device *bdev, struct block_device *whole,
1211 void *holder)
1212{
1213 spin_lock(&bdev_lock);
1214 BUG_ON(!bd_may_claim(bdev, whole, holder));
1215 /*
1216 * Note that for a whole device bd_holders will be incremented twice,
1217 * and bd_holder will be set to bd_may_claim before being set to holder
1218 */
1219 whole->bd_holders++;
1220 whole->bd_holder = bd_may_claim;
1221 bdev->bd_holders++;
1222 bdev->bd_holder = holder;
1223 bd_clear_claiming(whole, holder);
1224 spin_unlock(&bdev_lock);
1225}
1226EXPORT_SYMBOL(bd_finish_claiming);
1227
1228/**
1229 * bd_abort_claiming - abort claiming of a block device
1230 * @bdev: block device of interest
1231 * @whole: whole block device (returned from bd_start_claiming())
1232 * @holder: holder that has claimed @bdev
1233 *
1234 * Abort claiming of a block device when the exclusive open failed. This can be
1235 * also used when exclusive open is not actually desired and we just needed
1236 * to block other exclusive openers for a while.
1237 */
1238void bd_abort_claiming(struct block_device *bdev, struct block_device *whole,
1239 void *holder)
1240{
1241 spin_lock(&bdev_lock);
1242 bd_clear_claiming(whole, holder);
1243 spin_unlock(&bdev_lock);
1244}
1245EXPORT_SYMBOL(bd_abort_claiming);
6b4517a7 1246
641dc636 1247#ifdef CONFIG_SYSFS
49731baa
TH
/* Bookkeeping for one holder disk linked to a bdev by bd_link_disk_holder(). */
1248struct bd_holder_disk {
/* node on the owning bdev's bd_holder_disks list */
1249 struct list_head list;
/* the holding disk this entry stands for */
1250 struct gendisk *disk;
/* link count: bumped per bd_link_disk_holder(), dropped per unlink */
1251 int refcnt;
1252};
1253
1254static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
1255 struct gendisk *disk)
1256{
1257 struct bd_holder_disk *holder;
1258
1259 list_for_each_entry(holder, &bdev->bd_holder_disks, list)
1260 if (holder->disk == disk)
1261 return holder;
1262 return NULL;
1263}
1264
/* Create a sysfs link inside @from that points at @to and is named after @to. */
static int add_symlink(struct kobject *from, struct kobject *to)
{
	const char *link_name = kobject_name(to);

	return sysfs_create_link(from, to, link_name);
}
1269
/* Remove from @from the sysfs link that is named after @to. */
static void del_symlink(struct kobject *from, struct kobject *to)
{
	const char *link_name = kobject_name(to);

	sysfs_remove_link(from, link_name);
}
1274
df6c0cd9 1275/**
e09b457b
TH
1276 * bd_link_disk_holder - create symlinks between holding disk and slave bdev
1277 * @bdev: the claimed slave bdev
1278 * @disk: the holding disk
df6c0cd9 1279 *
49731baa
TH
1280 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
1281 *
e09b457b 1282 * This functions creates the following sysfs symlinks.
641dc636 1283 *
e09b457b
TH
1284 * - from "slaves" directory of the holder @disk to the claimed @bdev
1285 * - from "holders" directory of the @bdev to the holder @disk
641dc636 1286 *
e09b457b
TH
1287 * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
1288 * passed to bd_link_disk_holder(), then:
641dc636 1289 *
e09b457b
TH
1290 * /sys/block/dm-0/slaves/sda --> /sys/block/sda
1291 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
641dc636 1292 *
e09b457b
TH
1293 * The caller must have claimed @bdev before calling this function and
1294 * ensure that both @bdev and @disk are valid during the creation and
1295 * lifetime of these symlinks.
641dc636 1296 *
e09b457b
TH
1297 * CONTEXT:
1298 * Might sleep.
641dc636 1299 *
e09b457b
TH
1300 * RETURNS:
1301 * 0 on success, -errno on failure.
641dc636 1302 */
e09b457b 1303int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
641dc636 1304{
49731baa 1305 struct bd_holder_disk *holder;
e09b457b 1306 int ret = 0;
641dc636 1307
2e7b651d 1308 mutex_lock(&bdev->bd_mutex);
df6c0cd9 1309
49731baa 1310 WARN_ON_ONCE(!bdev->bd_holder);
4e91672c 1311
e09b457b
TH
1312 /* FIXME: remove the following once add_disk() handles errors */
1313 if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
1314 goto out_unlock;
4e91672c 1315
49731baa
TH
1316 holder = bd_find_holder_disk(bdev, disk);
1317 if (holder) {
1318 holder->refcnt++;
e09b457b 1319 goto out_unlock;
49731baa 1320 }
641dc636 1321
49731baa
TH
1322 holder = kzalloc(sizeof(*holder), GFP_KERNEL);
1323 if (!holder) {
1324 ret = -ENOMEM;
e09b457b
TH
1325 goto out_unlock;
1326 }
641dc636 1327
49731baa
TH
1328 INIT_LIST_HEAD(&holder->list);
1329 holder->disk = disk;
1330 holder->refcnt = 1;
1331
1332 ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
1333 if (ret)
1334 goto out_free;
1335
1336 ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
1337 if (ret)
1338 goto out_del;
e7407d16
TH
1339 /*
1340 * bdev could be deleted beneath us which would implicitly destroy
1341 * the holder directory. Hold on to it.
1342 */
1343 kobject_get(bdev->bd_part->holder_dir);
49731baa
TH
1344
1345 list_add(&holder->list, &bdev->bd_holder_disks);
1346 goto out_unlock;
1347
1348out_del:
1349 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
1350out_free:
1351 kfree(holder);
e09b457b 1352out_unlock:
b4cf1b72 1353 mutex_unlock(&bdev->bd_mutex);
e09b457b 1354 return ret;
641dc636 1355}
e09b457b 1356EXPORT_SYMBOL_GPL(bd_link_disk_holder);
641dc636 1357
49731baa
TH
1358/**
1359 * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
1360 * @bdev: the calimed slave bdev
1361 * @disk: the holding disk
1362 *
1363 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
1364 *
1365 * CONTEXT:
1366 * Might sleep.
1367 */
1368void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
641dc636 1369{
49731baa 1370 struct bd_holder_disk *holder;
641dc636 1371
49731baa 1372 mutex_lock(&bdev->bd_mutex);
641dc636 1373
49731baa
TH
1374 holder = bd_find_holder_disk(bdev, disk);
1375
1376 if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
1377 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
1378 del_symlink(bdev->bd_part->holder_dir,
1379 &disk_to_dev(disk)->kobj);
e7407d16 1380 kobject_put(bdev->bd_part->holder_dir);
49731baa
TH
1381 list_del_init(&holder->list);
1382 kfree(holder);
1383 }
1384
1385 mutex_unlock(&bdev->bd_mutex);
1da177e4 1386}
49731baa 1387EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
641dc636 1388#endif
1da177e4 1389
56ade44b
AP
1390/**
1391 * flush_disk - invalidates all buffer-cache entries on a disk
1392 *
1393 * @bdev: struct block device to be flushed
e6eb5ce1 1394 * @kill_dirty: flag to guide handling of dirty inodes
56ade44b
AP
1395 *
1396 * Invalidates all buffer-cache entries on a disk. It should be called
1397 * when a disk has been changed -- either by a media change or online
1398 * resize.
1399 */
93b270f7 1400static void flush_disk(struct block_device *bdev, bool kill_dirty)
56ade44b 1401{
93b270f7 1402 if (__invalidate_device(bdev, kill_dirty)) {
56ade44b 1403 printk(KERN_WARNING "VFS: busy inodes on changed media or "
424081f3
DM
1404 "resized disk %s\n",
1405 bdev->bd_disk ? bdev->bd_disk->disk_name : "");
56ade44b 1406 }
cba22d86 1407 bdev->bd_invalidated = 1;
56ade44b
AP
1408}
1409
c3279d14 1410/**
57d1b536 1411 * check_disk_size_change - checks for disk size change and adjusts bdev size.
c3279d14
AP
1412 * @disk: struct gendisk to check
1413 * @bdev: struct bdev to adjust.
5afb7835 1414 * @verbose: if %true log a message about a size change if there is any
c3279d14
AP
1415 *
1416 * This routine checks to see if the bdev size does not match the disk size
849cf559 1417 * and adjusts it if it differs. When shrinking the bdev size, its all caches
1418 * are freed.
c3279d14 1419 */
a1548b67
CH
1420static void check_disk_size_change(struct gendisk *disk,
1421 struct block_device *bdev, bool verbose)
c3279d14
AP
1422{
1423 loff_t disk_size, bdev_size;
1424
1425 disk_size = (loff_t)get_capacity(disk) << 9;
1426 bdev_size = i_size_read(bdev->bd_inode);
1427 if (disk_size != bdev_size) {
5afb7835
CH
1428 if (verbose) {
1429 printk(KERN_INFO
1430 "%s: detected capacity change from %lld to %lld\n",
1431 disk->disk_name, bdev_size, disk_size);
1432 }
c3279d14 1433 i_size_write(bdev->bd_inode, disk_size);
849cf559 1434 if (bdev_size > disk_size)
1435 flush_disk(bdev, false);
c3279d14 1436 }
979c690d 1437 bdev->bd_invalidated = 0;
c3279d14 1438}
c3279d14 1439
0c002c2f 1440/**
57d1b536 1441 * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
0c002c2f
AP
1442 * @disk: struct gendisk to be revalidated
1443 *
1444 * This routine is a wrapper for lower-level driver's revalidate_disk
1445 * call-backs. It is used to do common pre and post operations needed
1446 * for all revalidate_disk operations.
1447 */
1448int revalidate_disk(struct gendisk *disk)
1449{
1450 int ret = 0;
1451
1452 if (disk->fops->revalidate_disk)
1453 ret = disk->fops->revalidate_disk(disk);
c3279d14 1454
31cb1d64
JK
1455 /*
1456 * Hidden disks don't have associated bdev so there's no point in
1457 * revalidating it.
1458 */
1459 if (!(disk->flags & GENHD_FL_HIDDEN)) {
1460 struct block_device *bdev = bdget_disk(disk, 0);
1461
1462 if (!bdev)
1463 return ret;
1464
1465 mutex_lock(&bdev->bd_mutex);
1466 check_disk_size_change(disk, bdev, ret == 0);
31cb1d64
JK
1467 mutex_unlock(&bdev->bd_mutex);
1468 bdput(bdev);
1469 }
0c002c2f
AP
1470 return ret;
1471}
1472EXPORT_SYMBOL(revalidate_disk);
1473
1da177e4
LT
1474/*
1475 * This routine checks whether a removable media has been changed,
1476 * and invalidates all buffer-cache-entries in that case. This
1477 * is a relatively slow routine, so we have to try to minimize using
1478 * it. Thus it is called only upon a 'mount' or 'open'. This
1479 * is the best way of combining speed and utility, I think.
1480 * People changing diskettes in the middle of an operation deserve
1481 * to lose :-)
1482 */
1483int check_disk_change(struct block_device *bdev)
1484{
1485 struct gendisk *disk = bdev->bd_disk;
83d5cde4 1486 const struct block_device_operations *bdops = disk->fops;
77ea887e 1487 unsigned int events;
1da177e4 1488
77ea887e
TH
1489 events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
1490 DISK_EVENT_EJECT_REQUEST);
1491 if (!(events & DISK_EVENT_MEDIA_CHANGE))
1da177e4
LT
1492 return 0;
1493
93b270f7 1494 flush_disk(bdev, true);
1da177e4
LT
1495 if (bdops->revalidate_disk)
1496 bdops->revalidate_disk(bdev->bd_disk);
1da177e4
LT
1497 return 1;
1498}
1499
1500EXPORT_SYMBOL(check_disk_change);
1501
1502void bd_set_size(struct block_device *bdev, loff_t size)
1503{
5955102c 1504 inode_lock(bdev->bd_inode);
d646a02a 1505 i_size_write(bdev->bd_inode, size);
5955102c 1506 inode_unlock(bdev->bd_inode);
1da177e4
LT
1507}
1508EXPORT_SYMBOL(bd_set_size);
1509
4385bab1 1510static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
37be4124 1511
142fe8f4 1512int bdev_disk_changed(struct block_device *bdev, bool invalidate)
a1548b67 1513{
142fe8f4 1514 struct gendisk *disk = bdev->bd_disk;
a1548b67
CH
1515 int ret;
1516
f0b870df
CH
1517 lockdep_assert_held(&bdev->bd_mutex);
1518
a1548b67
CH
1519rescan:
1520 ret = blk_drop_partitions(disk, bdev);
1521 if (ret)
1522 return ret;
1523
d981cb5b
CH
1524 /*
1525 * Historically we only set the capacity to zero for devices that
1526 * support partitions (independ of actually having partitions created).
1527 * Doing that is rather inconsistent, but changing it broke legacy
1528 * udisks polling for legacy ide-cdrom devices. Use the crude check
1529 * below to get the sane behavior for most device while not breaking
1530 * userspace for this particular setup.
1531 */
1532 if (invalidate) {
1533 if (disk_part_scan_enabled(disk) ||
1534 !(disk->flags & GENHD_FL_REMOVABLE))
1535 set_capacity(disk, 0);
1536 } else {
1537 if (disk->fops->revalidate_disk)
1538 disk->fops->revalidate_disk(disk);
1539 }
a1548b67
CH
1540
1541 check_disk_size_change(disk, bdev, !invalidate);
a1548b67 1542
142fe8f4
CH
1543 if (get_capacity(disk)) {
1544 ret = blk_add_partitions(disk, bdev);
1545 if (ret == -EAGAIN)
1546 goto rescan;
490547ca 1547 } else if (invalidate) {
a1548b67
CH
1548 /*
1549 * Tell userspace that the media / partition table may have
1550 * changed.
1551 */
1552 kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
a1548b67
CH
1553 }
1554
a1548b67
CH
1555 return ret;
1556}
f0b870df
CH
1557/*
1558 * Only exported for the loop and dasd drivers for historic reasons. Don't use in new
1559 * code!
1560 */
1561EXPORT_SYMBOL_GPL(bdev_disk_changed);
a1548b67 1562
6d740cd5
PZ
1563/*
1564 * bd_mutex locking:
1565 *
1566 * mutex_lock(part->bd_mutex)
1567 * mutex_lock_nested(whole->bd_mutex, 1)
1568 */
1569
/*
 * __blkdev_get - core of opening a block device
 *
 * Opens @bdev with @mode.  @for_part is non-zero only for the recursive
 * call that opens the whole device on behalf of one of its partitions; it
 * is also used as the lockdep nesting level for bd_mutex.
 *
 * Returns 0 on success.  On every failure path @bdev is put with bdput()
 * before the negative errno is returned.
 */
572c4892 1570static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1da177e4 1571{
1da177e4 1572 struct gendisk *disk;
7db9cfd3 1573 int ret;
cf771cb5 1574 int partno;
fe6e9c1f 1575 int perm = 0;
89736653 1576 bool first_open = false;
fe6e9c1f 1577
/* translate the open mode into a MAY_* mask for the device cgroup check */
572c4892 1578 if (mode & FMODE_READ)
fe6e9c1f 1579 perm |= MAY_READ;
572c4892 1580 if (mode & FMODE_WRITE)
fe6e9c1f
AV
1581 perm |= MAY_WRITE;
1582 /*
1583 * hooks: /n/, see "layering violations".
1584 */
/* the permission check is skipped for the internal whole-device open */
b7300b78
CW
1585 if (!for_part) {
1586 ret = devcgroup_inode_permission(bdev->bd_inode, perm);
1587 if (ret != 0) {
1588 bdput(bdev);
1589 return ret;
1590 }
82666020 1591 }
7db9cfd3 1592
d3374825 1593 restart:
0762b8bd 1594
89f97496 1595 ret = -ENXIO;
560e7cb2 1596 disk = bdev_get_gendisk(bdev, &partno);
0762b8bd 1597 if (!disk)
6e9624b8 1598 goto out;
1da177e4 1599
69e02c59 1600 disk_block_events(disk);
6796bf54 1601 mutex_lock_nested(&bdev->bd_mutex, for_part);
/* first opener: wire the bdev up to its disk before opening it */
1da177e4 1602 if (!bdev->bd_openers) {
89736653 1603 first_open = true;
1da177e4 1604 bdev->bd_disk = disk;
87192a2a 1605 bdev->bd_queue = disk->queue;
1da177e4 1606 bdev->bd_contains = bdev;
c2ee070f 1607 bdev->bd_partno = partno;
03cdadb0 1608
/* partno == 0: @bdev is the whole device */
cf771cb5 1609 if (!partno) {
89f97496
TH
1610 ret = -ENXIO;
1611 bdev->bd_part = disk_get_part(disk, partno);
1612 if (!bdev->bd_part)
1613 goto out_clear;
1614
1196f8b8 1615 ret = 0;
1da177e4 1616 if (disk->fops->open) {
572c4892 1617 ret = disk->fops->open(bdev, mode);
d3374825
N
1618 if (ret == -ERESTARTSYS) {
1619 /* Lost a race with 'disk' being
1620 * deleted, try again.
1621 * See md.c
1622 */
1623 disk_put_part(bdev->bd_part);
1624 bdev->bd_part = NULL;
d3374825 1625 bdev->bd_disk = NULL;
87192a2a 1626 bdev->bd_queue = NULL;
d3374825 1627 mutex_unlock(&bdev->bd_mutex);
69e02c59 1628 disk_unblock_events(disk);
9df6c299 1629 put_disk_and_module(disk);
d3374825
N
1630 goto restart;
1631 }
1da177e4 1632 }
7e69723f 1633
/* open succeeded: publish the capacity in bytes (sectors << 9) */
04906b2f 1634 if (!ret) {
7e69723f 1635 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
04906b2f
JK
1636 set_init_blocksize(bdev);
1637 }
7e69723f 1638
1196f8b8
TH
1639 /*
1640 * If the device is invalidated, rescan partition
1641 * if open succeeded or failed with -ENOMEDIUM.
1642 * The latter is necessary to prevent ghost
1643 * partitions on a removed medium.
1644 */
731dc486
JK
1645 if (bdev->bd_invalidated &&
1646 (!ret || ret == -ENOMEDIUM))
1647 bdev_disk_changed(bdev, ret == -ENOMEDIUM);
5a023cdb 1648
1196f8b8
TH
1649 if (ret)
1650 goto out_clear;
/* partno != 0: @bdev is a partition, so open the whole device first */
1da177e4 1651 } else {
1da177e4
LT
1652 struct block_device *whole;
1653 whole = bdget_disk(disk, 0);
1654 ret = -ENOMEM;
1655 if (!whole)
0762b8bd 1656 goto out_clear;
37be4124 1657 BUG_ON(for_part);
572c4892 1658 ret = __blkdev_get(whole, mode, 1);
1da177e4 1659 if (ret)
0762b8bd 1660 goto out_clear;
1da177e4 1661 bdev->bd_contains = whole;
89f97496 1662 bdev->bd_part = disk_get_part(disk, partno);
/* refuse a disk without GENHD_FL_UP and a missing or zero-sized partition */
e71bf0d0 1663 if (!(disk->flags & GENHD_FL_UP) ||
89f97496 1664 !bdev->bd_part || !bdev->bd_part->nr_sects) {
1da177e4 1665 ret = -ENXIO;
0762b8bd 1666 goto out_clear;
1da177e4 1667 }
89f97496 1668 bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
04906b2f 1669 set_init_blocksize(bdev);
1da177e4 1670 }
03e26279
JK
1671
/* replace the placeholder bdi with a reference to the queue's bdi */
1672 if (bdev->bd_bdi == &noop_backing_dev_info)
1673 bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
/* additional opener: only a whole device calls the driver's open again */
1da177e4 1674 } else {
1da177e4 1675 if (bdev->bd_contains == bdev) {
1196f8b8
TH
1676 ret = 0;
1677 if (bdev->bd_disk->fops->open)
572c4892 1678 ret = bdev->bd_disk->fops->open(bdev, mode);
1196f8b8 1679 /* the same as first opener case, read comment there */
731dc486
JK
1680 if (bdev->bd_invalidated &&
1681 (!ret || ret == -ENOMEDIUM))
1682 bdev_disk_changed(bdev, ret == -ENOMEDIUM);
1196f8b8
TH
1683 if (ret)
1684 goto out_unlock_bdev;
1da177e4
LT
1685 }
1686 }
/* success: account for the new opener */
1687 bdev->bd_openers++;
37be4124
N
1688 if (for_part)
1689 bdev->bd_part_count++;
c039e313 1690 mutex_unlock(&bdev->bd_mutex);
69e02c59 1691 disk_unblock_events(disk);
89736653
JK
1692 /* only one opener holds refs to the module and disk */
1693 if (!first_open)
1694 put_disk_and_module(disk);
1da177e4
LT
1695 return 0;
1696
/* error unwind: undo the first-opener setup done above */
0762b8bd 1697 out_clear:
89f97496 1698 disk_put_part(bdev->bd_part);
1da177e4 1699 bdev->bd_disk = NULL;
0762b8bd 1700 bdev->bd_part = NULL;
87192a2a 1701 bdev->bd_queue = NULL;
/* bd_contains differs from bdev only once the whole device was opened */
1da177e4 1702 if (bdev != bdev->bd_contains)
572c4892 1703 __blkdev_put(bdev->bd_contains, mode, 1);
1da177e4 1704 bdev->bd_contains = NULL;
0762b8bd 1705 out_unlock_bdev:
c039e313 1706 mutex_unlock(&bdev->bd_mutex);
69e02c59 1707 disk_unblock_events(disk);
9df6c299 1708 put_disk_and_module(disk);
4345caba 1709 out:
0762b8bd
TH
1710 bdput(bdev);
1711
1da177e4
LT
1712 return ret;
1713}
1714
d4d77629
TH
1715/**
1716 * blkdev_get - open a block device
1717 * @bdev: block_device to open
1718 * @mode: FMODE_* mask
1719 * @holder: exclusive holder identifier
1720 *
1721 * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is
1722 * open with exclusive access. Specifying %FMODE_EXCL with %NULL
1723 * @holder is invalid. Exclusive opens may nest for the same @holder.
1724 *
1725 * On success, the reference count of @bdev is unchanged. On failure,
1726 * @bdev is put.
1727 *
1728 * CONTEXT:
1729 * Might sleep.
1730 *
1731 * RETURNS:
1732 * 0 on success, -errno on failure.
1733 */
e525fd89 1734int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
1da177e4 1735{
e525fd89
TH
1736 struct block_device *whole = NULL;
1737 int res;
1738
1739 WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
1740
1741 if ((mode & FMODE_EXCL) && holder) {
1742 whole = bd_start_claiming(bdev, holder);
1743 if (IS_ERR(whole)) {
1744 bdput(bdev);
1745 return PTR_ERR(whole);
1746 }
1747 }
1748
1749 res = __blkdev_get(bdev, mode, 0);
1750
1751 if (whole) {
d4dc210f
TH
1752 struct gendisk *disk = whole->bd_disk;
1753
6a027eff 1754 /* finish claiming */
77ea887e 1755 mutex_lock(&bdev->bd_mutex);
e91455ba
JK
1756 if (!res)
1757 bd_finish_claiming(bdev, whole, holder);
1758 else
1759 bd_abort_claiming(bdev, whole, holder);
77ea887e 1760 /*
d4dc210f
TH
1761 * Block event polling for write claims if requested. Any
1762 * write holder makes the write_holder state stick until
1763 * all are released. This is good enough and tracking
1764 * individual writeable reference is too fragile given the
1765 * way @mode is used in blkdev_get/put().
77ea887e 1766 */
4c49ff3f
TH
1767 if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
1768 (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
77ea887e 1769 bdev->bd_write_holder = true;
d4dc210f 1770 disk_block_events(disk);
77ea887e
TH
1771 }
1772
1773 mutex_unlock(&bdev->bd_mutex);
6a027eff 1774 bdput(whole);
e525fd89
TH
1775 }
1776
1777 return res;
37be4124 1778}
1da177e4
LT
1779EXPORT_SYMBOL(blkdev_get);
1780
d4d77629
TH
1781/**
1782 * blkdev_get_by_path - open a block device by name
1783 * @path: path to the block device to open
1784 * @mode: FMODE_* mask
1785 * @holder: exclusive holder identifier
1786 *
1787 * Open the blockdevice described by the device file at @path. @mode
1788 * and @holder are identical to blkdev_get().
1789 *
1790 * On success, the returned block_device has reference count of one.
1791 *
1792 * CONTEXT:
1793 * Might sleep.
1794 *
1795 * RETURNS:
1796 * Pointer to block_device on success, ERR_PTR(-errno) on failure.
1797 */
1798struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
1799 void *holder)
1800{
1801 struct block_device *bdev;
1802 int err;
1803
1804 bdev = lookup_bdev(path);
1805 if (IS_ERR(bdev))
1806 return bdev;
1807
1808 err = blkdev_get(bdev, mode, holder);
1809 if (err)
1810 return ERR_PTR(err);
1811
e51900f7
CE
1812 if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) {
1813 blkdev_put(bdev, mode);
1814 return ERR_PTR(-EACCES);
1815 }
1816
d4d77629
TH
1817 return bdev;
1818}
1819EXPORT_SYMBOL(blkdev_get_by_path);
1820
1821/**
1822 * blkdev_get_by_dev - open a block device by device number
1823 * @dev: device number of block device to open
1824 * @mode: FMODE_* mask
1825 * @holder: exclusive holder identifier
1826 *
1827 * Open the blockdevice described by device number @dev. @mode and
1828 * @holder are identical to blkdev_get().
1829 *
1830 * Use it ONLY if you really do not have anything better - i.e. when
1831 * you are behind a truly sucky interface and all you are given is a
1832 * device number. _Never_ to be used for internal purposes. If you
1833 * ever need it - reconsider your API.
1834 *
1835 * On success, the returned block_device has reference count of one.
1836 *
1837 * CONTEXT:
1838 * Might sleep.
1839 *
1840 * RETURNS:
1841 * Pointer to block_device on success, ERR_PTR(-errno) on failure.
1842 */
1843struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
1844{
1845 struct block_device *bdev;
1846 int err;
1847
1848 bdev = bdget(dev);
1849 if (!bdev)
1850 return ERR_PTR(-ENOMEM);
1851
1852 err = blkdev_get(bdev, mode, holder);
1853 if (err)
1854 return ERR_PTR(err);
1855
1856 return bdev;
1857}
1858EXPORT_SYMBOL(blkdev_get_by_dev);
1859
1da177e4
LT
1860static int blkdev_open(struct inode * inode, struct file * filp)
1861{
1862 struct block_device *bdev;
1da177e4
LT
1863
1864 /*
1865 * Preserve backwards compatibility and allow large file access
1866 * even if userspace doesn't ask for it explicitly. Some mkfs
1867 * binary needs it. We might want to drop this workaround
1868 * during an unstable branch.
1869 */
1870 filp->f_flags |= O_LARGEFILE;
1871
c35fc7a5
CH
1872 filp->f_mode |= FMODE_NOWAIT;
1873
572c4892
AV
1874 if (filp->f_flags & O_NDELAY)
1875 filp->f_mode |= FMODE_NDELAY;
1876 if (filp->f_flags & O_EXCL)
1877 filp->f_mode |= FMODE_EXCL;
1878 if ((filp->f_flags & O_ACCMODE) == 3)
1879 filp->f_mode |= FMODE_WRITE_IOCTL;
1880
1da177e4 1881 bdev = bd_acquire(inode);
6a2aae06
PE
1882 if (bdev == NULL)
1883 return -ENOMEM;
1da177e4 1884
572c4892 1885 filp->f_mapping = bdev->bd_inode->i_mapping;
5660e13d 1886 filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
572c4892 1887
e525fd89 1888 return blkdev_get(bdev, filp->f_mode, filp);
1da177e4
LT
1889}
1890
4385bab1 1891static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
2e7b651d 1892{
2e7b651d 1893 struct gendisk *disk = bdev->bd_disk;
37be4124 1894 struct block_device *victim = NULL;
2e7b651d 1895
6796bf54 1896 mutex_lock_nested(&bdev->bd_mutex, for_part);
37be4124
N
1897 if (for_part)
1898 bdev->bd_part_count--;
1899
2e7b651d 1900 if (!--bdev->bd_openers) {
6a027eff 1901 WARN_ON_ONCE(bdev->bd_holders);
2e7b651d
PZ
1902 sync_blockdev(bdev);
1903 kill_bdev(bdev);
43d1c0eb
ID
1904
1905 bdev_write_inode(bdev);
2e7b651d
PZ
1906 }
1907 if (bdev->bd_contains == bdev) {
1908 if (disk->fops->release)
db2a144b 1909 disk->fops->release(disk, mode);
2e7b651d
PZ
1910 }
1911 if (!bdev->bd_openers) {
0762b8bd
TH
1912 disk_put_part(bdev->bd_part);
1913 bdev->bd_part = NULL;
2e7b651d 1914 bdev->bd_disk = NULL;
37be4124
N
1915 if (bdev != bdev->bd_contains)
1916 victim = bdev->bd_contains;
2e7b651d 1917 bdev->bd_contains = NULL;
523e1d39 1918
9df6c299 1919 put_disk_and_module(disk);
2e7b651d 1920 }
2e7b651d
PZ
1921 mutex_unlock(&bdev->bd_mutex);
1922 bdput(bdev);
37be4124 1923 if (victim)
9a1c3542 1924 __blkdev_put(victim, mode, 1);
2e7b651d
PZ
1925}
1926
4385bab1 1927void blkdev_put(struct block_device *bdev, fmode_t mode)
37be4124 1928{
85ef06d1
TH
1929 mutex_lock(&bdev->bd_mutex);
1930
e525fd89 1931 if (mode & FMODE_EXCL) {
6a027eff
TH
1932 bool bdev_free;
1933
1934 /*
1935 * Release a claim on the device. The holder fields
1936 * are protected with bdev_lock. bd_mutex is to
1937 * synchronize disk_holder unlinking.
1938 */
6a027eff
TH
1939 spin_lock(&bdev_lock);
1940
1941 WARN_ON_ONCE(--bdev->bd_holders < 0);
1942 WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0);
1943
1944 /* bd_contains might point to self, check in a separate step */
1945 if ((bdev_free = !bdev->bd_holders))
1946 bdev->bd_holder = NULL;
1947 if (!bdev->bd_contains->bd_holders)
1948 bdev->bd_contains->bd_holder = NULL;
1949
1950 spin_unlock(&bdev_lock);
1951
77ea887e
TH
1952 /*
1953 * If this was the last claim, remove holder link and
1954 * unblock evpoll if it was a write holder.
1955 */
85ef06d1
TH
1956 if (bdev_free && bdev->bd_write_holder) {
1957 disk_unblock_events(bdev->bd_disk);
1958 bdev->bd_write_holder = false;
77ea887e 1959 }
6936217c 1960 }
77ea887e 1961
85ef06d1
TH
1962 /*
1963 * Trigger event checking and tell drivers to flush MEDIA_CHANGE
1964 * event. This is to ensure detection of media removal commanded
1965 * from userland - e.g. eject(1).
1966 */
1967 disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE);
1968
1969 mutex_unlock(&bdev->bd_mutex);
1970
4385bab1 1971 __blkdev_put(bdev, mode, 0);
37be4124 1972}
2e7b651d
PZ
1973EXPORT_SYMBOL(blkdev_put);
1974
1da177e4
LT
1975static int blkdev_close(struct inode * inode, struct file * filp)
1976{
4ebb16ca 1977 struct block_device *bdev = I_BDEV(bdev_file_inode(filp));
4385bab1
AV
1978 blkdev_put(bdev, filp->f_mode);
1979 return 0;
1da177e4
LT
1980}
1981
bb93e3a5 1982static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
1da177e4 1983{
4ebb16ca 1984 struct block_device *bdev = I_BDEV(bdev_file_inode(file));
56b26add 1985 fmode_t mode = file->f_mode;
fd4ce1ac
CH
1986
1987 /*
1988 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
1989 * to updated it before every ioctl.
1990 */
56b26add 1991 if (file->f_flags & O_NDELAY)
fd4ce1ac
CH
1992 mode |= FMODE_NDELAY;
1993 else
1994 mode &= ~FMODE_NDELAY;
1995
56b26add 1996 return blkdev_ioctl(bdev, mode, cmd, arg);
1da177e4
LT
1997}
1998
eef99380
CH
1999/*
2000 * Write data to the block device. Only intended for the block device itself
2001 * and the raw driver which basically is a fake block device.
2002 *
2003 * Does not take i_mutex for the write and thus is not for general purpose
2004 * use.
2005 */
1456c0a8 2006ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
eef99380
CH
2007{
2008 struct file *file = iocb->ki_filp;
4ebb16ca 2009 struct inode *bd_inode = bdev_file_inode(file);
7ec7b94a 2010 loff_t size = i_size_read(bd_inode);
53362a05 2011 struct blk_plug plug;
eef99380 2012 ssize_t ret;
5f380c7f 2013
7ec7b94a
AV
2014 if (bdev_read_only(I_BDEV(bd_inode)))
2015 return -EPERM;
5f380c7f 2016
56939e01
DA
2017 /* uswsusp needs write permission to the swap */
2018 if (IS_SWAPFILE(bd_inode) && !hibernation_available())
dc617f29
DW
2019 return -ETXTBSY;
2020
7ec7b94a 2021 if (!iov_iter_count(from))
5f380c7f
AV
2022 return 0;
2023
7ec7b94a
AV
2024 if (iocb->ki_pos >= size)
2025 return -ENOSPC;
2026
c35fc7a5
CH
2027 if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
2028 return -EOPNOTSUPP;
2029
7ec7b94a 2030 iov_iter_truncate(from, size - iocb->ki_pos);
eef99380 2031
53362a05 2032 blk_start_plug(&plug);
1456c0a8 2033 ret = __generic_file_write_iter(iocb, from);
e2592217
CH
2034 if (ret > 0)
2035 ret = generic_write_sync(iocb, ret);
53362a05 2036 blk_finish_plug(&plug);
eef99380
CH
2037 return ret;
2038}
1456c0a8 2039EXPORT_SYMBOL_GPL(blkdev_write_iter);
eef99380 2040
b2de525f 2041ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
684c9aae
LT
2042{
2043 struct file *file = iocb->ki_filp;
4ebb16ca 2044 struct inode *bd_inode = bdev_file_inode(file);
684c9aae 2045 loff_t size = i_size_read(bd_inode);
a886038b 2046 loff_t pos = iocb->ki_pos;
684c9aae
LT
2047
2048 if (pos >= size)
2049 return 0;
2050
2051 size -= pos;
a886038b
AV
2052 iov_iter_truncate(to, size);
2053 return generic_file_read_iter(iocb, to);
684c9aae 2054}
b2de525f 2055EXPORT_SYMBOL_GPL(blkdev_read_iter);
684c9aae 2056
87d8fe1e
TT
2057/*
2058 * Try to release a page associated with block device when the system
2059 * is under memory pressure.
2060 */
2061static int blkdev_releasepage(struct page *page, gfp_t wait)
2062{
2063 struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
2064
2065 if (super && super->s_op->bdev_try_to_free_page)
2066 return super->s_op->bdev_try_to_free_page(super, page, wait);
2067
2068 return try_to_free_buffers(page);
2069}
2070
7f6d5b52
RZ
/*
 * ->writepages() for the block device mapping: a direct pass-through to
 * generic_writepages(), returning its result unchanged.
 */
static int blkdev_writepages(struct address_space *mapping,
			     struct writeback_control *wbc)
{
	int ret = generic_writepages(mapping, wbc);

	return ret;
}
2076
4c54ac62 2077static const struct address_space_operations def_blk_aops = {
1da177e4 2078 .readpage = blkdev_readpage,
447f05bb 2079 .readpages = blkdev_readpages,
1da177e4 2080 .writepage = blkdev_writepage,
6272b5a5
NP
2081 .write_begin = blkdev_write_begin,
2082 .write_end = blkdev_write_end,
7f6d5b52 2083 .writepages = blkdev_writepages,
87d8fe1e 2084 .releasepage = blkdev_releasepage,
1da177e4 2085 .direct_IO = blkdev_direct_IO,
88dbcbb3 2086 .migratepage = buffer_migrate_page_norefs,
b4597226 2087 .is_dirty_writeback = buffer_check_dirty_writeback,
1da177e4
LT
2088};
2089
25f4c414
DW
2090#define BLKDEV_FALLOC_FL_SUPPORTED \
2091 (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
2092 FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)
2093
2094static long blkdev_fallocate(struct file *file, int mode, loff_t start,
2095 loff_t len)
2096{
2097 struct block_device *bdev = I_BDEV(bdev_file_inode(file));
25f4c414
DW
2098 struct address_space *mapping;
2099 loff_t end = start + len - 1;
2100 loff_t isize;
2101 int error;
2102
2103 /* Fail if we don't recognize the flags. */
2104 if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
2105 return -EOPNOTSUPP;
2106
2107 /* Don't go off the end of the device. */
2108 isize = i_size_read(bdev->bd_inode);
2109 if (start >= isize)
2110 return -EINVAL;
2111 if (end >= isize) {
2112 if (mode & FALLOC_FL_KEEP_SIZE) {
2113 len = isize - start;
2114 end = start + len - 1;
2115 } else
2116 return -EINVAL;
2117 }
2118
2119 /*
2120 * Don't allow IO that isn't aligned to logical block size.
2121 */
2122 if ((start | len) & (bdev_logical_block_size(bdev) - 1))
2123 return -EINVAL;
2124
2125 /* Invalidate the page cache, including dirty pages. */
2126 mapping = bdev->bd_inode->i_mapping;
2127 truncate_inode_pages_range(mapping, start, end);
2128
2129 switch (mode) {
2130 case FALLOC_FL_ZERO_RANGE:
2131 case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
2132 error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
ee472d83 2133 GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
25f4c414
DW
2134 break;
2135 case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
34045129
CH
2136 error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
2137 GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK);
25f4c414
DW
2138 break;
2139 case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
25f4c414
DW
2140 error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
2141 GFP_KERNEL, 0);
2142 break;
2143 default:
2144 return -EOPNOTSUPP;
2145 }
2146 if (error)
2147 return error;
2148
2149 /*
2150 * Invalidate again; if someone wandered in and dirtied a page,
2151 * the caller will be given -EBUSY. The third argument is
2152 * inclusive, so the rounding here is safe.
2153 */
2154 return invalidate_inode_pages2_range(mapping,
2155 start >> PAGE_SHIFT,
2156 end >> PAGE_SHIFT);
2157}
2158
4b6f5d20 2159const struct file_operations def_blk_fops = {
1da177e4
LT
2160 .open = blkdev_open,
2161 .release = blkdev_close,
2162 .llseek = block_llseek,
a886038b 2163 .read_iter = blkdev_read_iter,
1456c0a8 2164 .write_iter = blkdev_write_iter,
eae83ce1 2165 .iopoll = blkdev_iopoll,
acc93d30 2166 .mmap = generic_file_mmap,
b1dd3b28 2167 .fsync = blkdev_fsync,
bb93e3a5 2168 .unlocked_ioctl = block_ioctl,
1da177e4
LT
2169#ifdef CONFIG_COMPAT
2170 .compat_ioctl = compat_blkdev_ioctl,
2171#endif
1e8b3332 2172 .splice_read = generic_file_splice_read,
8d020765 2173 .splice_write = iter_file_splice_write,
25f4c414 2174 .fallocate = blkdev_fallocate,
1da177e4
LT
2175};
2176
2177int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
2178{
2179 int res;
2180 mm_segment_t old_fs = get_fs();
2181 set_fs(KERNEL_DS);
56b26add 2182 res = blkdev_ioctl(bdev, 0, cmd, arg);
1da177e4
LT
2183 set_fs(old_fs);
2184 return res;
2185}
2186
2187EXPORT_SYMBOL(ioctl_by_bdev);
2188
2189/**
2190 * lookup_bdev - lookup a struct block_device by name
94e2959e 2191 * @pathname: special file representing the block device
1da177e4 2192 *
57d1b536 2193 * Get a reference to the blockdevice at @pathname in the current
1da177e4
LT
2194 * namespace if possible and return it. Return ERR_PTR(error)
2195 * otherwise.
2196 */
421748ec 2197struct block_device *lookup_bdev(const char *pathname)
1da177e4
LT
2198{
2199 struct block_device *bdev;
2200 struct inode *inode;
421748ec 2201 struct path path;
1da177e4
LT
2202 int error;
2203
421748ec 2204 if (!pathname || !*pathname)
1da177e4
LT
2205 return ERR_PTR(-EINVAL);
2206
421748ec 2207 error = kern_path(pathname, LOOKUP_FOLLOW, &path);
1da177e4
LT
2208 if (error)
2209 return ERR_PTR(error);
2210
bb668734 2211 inode = d_backing_inode(path.dentry);
1da177e4
LT
2212 error = -ENOTBLK;
2213 if (!S_ISBLK(inode->i_mode))
2214 goto fail;
2215 error = -EACCES;
a2982cc9 2216 if (!may_open_dev(&path))
1da177e4
LT
2217 goto fail;
2218 error = -ENOMEM;
2219 bdev = bd_acquire(inode);
2220 if (!bdev)
2221 goto fail;
2222out:
421748ec 2223 path_put(&path);
1da177e4
LT
2224 return bdev;
2225fail:
2226 bdev = ERR_PTR(error);
2227 goto out;
2228}
d5686b44 2229EXPORT_SYMBOL(lookup_bdev);
1da177e4 2230
93b270f7 2231int __invalidate_device(struct block_device *bdev, bool kill_dirty)
b71e8a4c
DH
2232{
2233 struct super_block *sb = get_super(bdev);
2234 int res = 0;
2235
2236 if (sb) {
2237 /*
2238 * no need to lock the super, get_super holds the
2239 * read mutex so the filesystem cannot go away
2240 * under us (->put_super runs with the write lock
2241 * hold).
2242 */
2243 shrink_dcache_sb(sb);
93b270f7 2244 res = invalidate_inodes(sb, kill_dirty);
b71e8a4c
DH
2245 drop_super(sb);
2246 }
f98393a6 2247 invalidate_bdev(bdev);
b71e8a4c
DH
2248 return res;
2249}
2250EXPORT_SYMBOL(__invalidate_device);
5c0d6b60
JK
2251
2252void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
2253{
2254 struct inode *inode, *old_inode = NULL;
2255
74278da9 2256 spin_lock(&blockdev_superblock->s_inode_list_lock);
5c0d6b60
JK
2257 list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
2258 struct address_space *mapping = inode->i_mapping;
af309226 2259 struct block_device *bdev;
5c0d6b60
JK
2260
2261 spin_lock(&inode->i_lock);
2262 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
2263 mapping->nrpages == 0) {
2264 spin_unlock(&inode->i_lock);
2265 continue;
2266 }
2267 __iget(inode);
2268 spin_unlock(&inode->i_lock);
74278da9 2269 spin_unlock(&blockdev_superblock->s_inode_list_lock);
5c0d6b60
JK
2270 /*
2271 * We hold a reference to 'inode' so it couldn't have been
2272 * removed from s_inodes list while we dropped the
74278da9 2273 * s_inode_list_lock We cannot iput the inode now as we can
5c0d6b60 2274 * be holding the last reference and we cannot iput it under
74278da9 2275 * s_inode_list_lock. So we keep the reference and iput it
5c0d6b60
JK
2276 * later.
2277 */
2278 iput(old_inode);
2279 old_inode = inode;
af309226 2280 bdev = I_BDEV(inode);
5c0d6b60 2281
af309226
RV
2282 mutex_lock(&bdev->bd_mutex);
2283 if (bdev->bd_openers)
2284 func(bdev, arg);
2285 mutex_unlock(&bdev->bd_mutex);
5c0d6b60 2286
74278da9 2287 spin_lock(&blockdev_superblock->s_inode_list_lock);
5c0d6b60 2288 }
74278da9 2289 spin_unlock(&blockdev_superblock->s_inode_list_lock);
5c0d6b60
JK
2290 iput(old_inode);
2291}