/*
 *  linux/fs/block_dev.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
 */

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/device_cgroup.h>
#include <linux/highmem.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/mpage.h>
#include <linux/mount.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/log2.h>
#include <linux/cleancache.h>
#include <linux/dax.h>
#include <linux/badblocks.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/falloc.h>
#include <asm/uaccess.h>
#include "internal.h"

struct bdev_inode {
	struct block_device bdev;
	struct inode vfs_inode;
};

static const struct address_space_operations def_blk_aops;

static inline struct bdev_inode *BDEV_I(struct inode *inode)
{
	return container_of(inode, struct bdev_inode, vfs_inode);
}

struct block_device *I_BDEV(struct inode *inode)
{
	return &BDEV_I(inode)->bdev;
}
EXPORT_SYMBOL(I_BDEV);

void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf);
	va_end(args);
}

static void bdev_write_inode(struct block_device *bdev)
{
	struct inode *inode = bdev->bd_inode;
	int ret;

	spin_lock(&inode->i_lock);
	while (inode->i_state & I_DIRTY) {
		spin_unlock(&inode->i_lock);
		ret = write_inode_now(inode, true);
		if (ret) {
			char name[BDEVNAME_SIZE];
			pr_warn_ratelimited("VFS: Dirty inode writeback failed "
					    "for block device %s (err=%d).\n",
					    bdevname(bdev, name), ret);
		}
		spin_lock(&inode->i_lock);
	}
	spin_unlock(&inode->i_lock);
}

/* Kill _all_ buffers and pagecache, dirty or not. */
void kill_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_inode->i_mapping;

	if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
		return;

	invalidate_bh_lrus();
	truncate_inode_pages(mapping, 0);
}
EXPORT_SYMBOL(kill_bdev);

/* Invalidate clean unused buffers and pagecache. */
void invalidate_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_inode->i_mapping;

	if (mapping->nrpages == 0)
		return;

	invalidate_bh_lrus();
	lru_add_drain_all();	/* make sure all lru add caches are flushed */
	invalidate_mapping_pages(mapping, 0, -1);
	/* 99% of the time, we don't need to flush the cleancache on the bdev.
	 * But, for the strange corners, let's be cautious.
	 */
	cleancache_invalidate_inode(mapping);
}
EXPORT_SYMBOL(invalidate_bdev);

int set_blocksize(struct block_device *bdev, int size)
{
	/* Size must be a power of two, and between 512 and PAGE_SIZE */
	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
		return -EINVAL;

	/* Size cannot be smaller than the size supported by the device */
	if (size < bdev_logical_block_size(bdev))
		return -EINVAL;

	/* Don't change the size if it is same as current */
	if (bdev->bd_block_size != size) {
		sync_blockdev(bdev);
		bdev->bd_block_size = size;
		bdev->bd_inode->i_blkbits = blksize_bits(size);
		kill_bdev(bdev);
	}
	return 0;
}

EXPORT_SYMBOL(set_blocksize);

int sb_set_blocksize(struct super_block *sb, int size)
{
	if (set_blocksize(sb->s_bdev, size))
		return 0;
	/* If we get here, we know size is a power of two
	 * and its value is between 512 and PAGE_SIZE */
	sb->s_blocksize = size;
	sb->s_blocksize_bits = blksize_bits(size);
	return sb->s_blocksize;
}

EXPORT_SYMBOL(sb_set_blocksize);

int sb_min_blocksize(struct super_block *sb, int size)
{
	int minsize = bdev_logical_block_size(sb->s_bdev);
	if (size < minsize)
		size = minsize;
	return sb_set_blocksize(sb, size);
}

EXPORT_SYMBOL(sb_min_blocksize);

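/*
 * Illustrative sketch (not part of the original file): a filesystem's
 * fill_super typically picks its block size with the helpers above.
 * sb_min_blocksize() clamps the request to the device's logical block
 * size, and sb_set_blocksize() rejects anything that is not a power of
 * two between 512 and PAGE_SIZE. The function name and sizes below are
 * hypothetical.
 */
#if 0	/* example only */
static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
{
	/* ask for 1k blocks, but never less than the device can address */
	if (!sb_min_blocksize(sb, 1024))
		return -EINVAL;
	/* later, once the on-disk superblock says what it really wants: */
	if (!sb_set_blocksize(sb, 4096))
		return -EINVAL;
	return 0;
}
#endif
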
static int
blkdev_get_block(struct inode *inode, sector_t iblock,
		struct buffer_head *bh, int create)
{
	bh->b_bdev = I_BDEV(inode);
	bh->b_blocknr = iblock;
	set_buffer_mapped(bh);
	return 0;
}

static struct inode *bdev_file_inode(struct file *file)
{
	return file->f_mapping->host;
}

#define DIO_INLINE_BIO_VECS 4

static void blkdev_bio_end_io_simple(struct bio *bio)
{
	struct task_struct *waiter = bio->bi_private;

	WRITE_ONCE(bio->bi_private, NULL);
	wake_up_process(waiter);
}

static ssize_t
__blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
		int nr_pages)
{
	struct file *file = iocb->ki_filp;
	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
	unsigned blkbits = blksize_bits(bdev_logical_block_size(bdev));
	struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *bvec;
	loff_t pos = iocb->ki_pos;
	bool should_dirty = false;
	struct bio bio;
	ssize_t ret;
	blk_qc_t qc;
	int i;

	if ((pos | iov_iter_alignment(iter)) & ((1 << blkbits) - 1))
		return -EINVAL;

	bio_init(&bio);
	bio.bi_max_vecs = nr_pages;
	bio.bi_io_vec = inline_vecs;
	bio.bi_bdev = bdev;
	bio.bi_iter.bi_sector = pos >> blkbits;
	bio.bi_private = current;
	bio.bi_end_io = blkdev_bio_end_io_simple;

	ret = bio_iov_iter_get_pages(&bio, iter);
	if (unlikely(ret))
		return ret;
	ret = bio.bi_iter.bi_size;

	if (iov_iter_rw(iter) == READ) {
		bio_set_op_attrs(&bio, REQ_OP_READ, 0);
		if (iter_is_iovec(iter))
			should_dirty = true;
	} else {
		bio_set_op_attrs(&bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
		task_io_account_write(ret);
	}

	qc = submit_bio(&bio);
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (!READ_ONCE(bio.bi_private))
			break;
		if (!(iocb->ki_flags & IOCB_HIPRI) ||
		    !blk_mq_poll(bdev_get_queue(bdev), qc))
			io_schedule();
	}
	__set_current_state(TASK_RUNNING);

	bio_for_each_segment_all(bvec, &bio, i) {
		if (should_dirty && !PageCompound(bvec->bv_page))
			set_page_dirty_lock(bvec->bv_page);
		put_page(bvec->bv_page);
	}

	if (unlikely(bio.bi_error))
		return bio.bi_error;
	iocb->ki_pos += ret;
	return ret;
}

static ssize_t
blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = bdev_file_inode(file);
	int nr_pages;

	nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
	if (!nr_pages)
		return 0;
	if (is_sync_kiocb(iocb) && nr_pages <= DIO_INLINE_BIO_VECS)
		return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
	return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter,
				    blkdev_get_block, NULL, NULL,
				    DIO_SKIP_DIO_COUNT);
}

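/*
 * Illustrative sketch (not part of the original file): the inline path
 * above services a small, synchronous O_DIRECT request on a block device
 * node. The submitter either sleeps in io_schedule() until the bio
 * completion clears bi_private, or, when IOCB_HIPRI is set (e.g. via
 * preadv2() with RWF_HIPRI), busy-polls the queue with blk_mq_poll().
 * A user-space trigger looks roughly like this; the device path and
 * sizes are hypothetical.
 */
#if 0	/* example only, user-space code */
#define _GNU_SOURCE		/* for O_DIRECT */
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

int read_first_block(const char *path)	/* e.g. "/dev/sdX" (hypothetical) */
{
	void *buf = NULL;
	ssize_t n = -1;
	int fd = open(path, O_RDONLY | O_DIRECT);

	if (fd < 0)
		return -1;
	/* O_DIRECT wants buffer, offset and length block-aligned */
	if (posix_memalign(&buf, 4096, 4096) == 0)
		n = pread(fd, buf, 4096, 0);
	free(buf);
	close(fd);
	return n == 4096 ? 0 : -1;
}
#endif
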
int __sync_blockdev(struct block_device *bdev, int wait)
{
	if (!bdev)
		return 0;
	if (!wait)
		return filemap_flush(bdev->bd_inode->i_mapping);
	return filemap_write_and_wait(bdev->bd_inode->i_mapping);
}

/*
 * Write out and wait upon all the dirty data associated with a block
 * device via its mapping.  Does not take the superblock lock.
 */
int sync_blockdev(struct block_device *bdev)
{
	return __sync_blockdev(bdev, 1);
}
EXPORT_SYMBOL(sync_blockdev);

/*
 * Write out and wait upon all dirty data associated with this
 * device.  Filesystem data as well as the underlying block
 * device.  Takes the superblock lock.
 */
int fsync_bdev(struct block_device *bdev)
{
	struct super_block *sb = get_super(bdev);
	if (sb) {
		int res = sync_filesystem(sb);
		drop_super(sb);
		return res;
	}
	return sync_blockdev(bdev);
}
EXPORT_SYMBOL(fsync_bdev);

/**
 * freeze_bdev  --  lock a filesystem and force it into a consistent state
 * @bdev:	blockdevice to lock
 *
 * If a superblock is found on this device, we take the s_umount semaphore
 * on it to make sure nobody unmounts until the snapshot creation is done.
 * The reference counter (bd_fsfreeze_count) guarantees that only the last
 * unfreeze process can actually unfreeze the frozen filesystem when multiple
 * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
 * counts down in thaw_bdev(). When it becomes 0, thaw_bdev() actually
 * unfreezes.
 */
struct super_block *freeze_bdev(struct block_device *bdev)
{
	struct super_block *sb;
	int error = 0;

	mutex_lock(&bdev->bd_fsfreeze_mutex);
	if (++bdev->bd_fsfreeze_count > 1) {
		/*
		 * We don't even need to grab a reference - the first call
		 * to freeze_bdev grabs an active reference and only the last
		 * thaw_bdev drops it.
		 */
		sb = get_super(bdev);
		if (sb)
			drop_super(sb);
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return sb;
	}

	sb = get_active_super(bdev);
	if (!sb)
		goto out;
	if (sb->s_op->freeze_super)
		error = sb->s_op->freeze_super(sb);
	else
		error = freeze_super(sb);
	if (error) {
		deactivate_super(sb);
		bdev->bd_fsfreeze_count--;
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return ERR_PTR(error);
	}
	deactivate_super(sb);
 out:
	sync_blockdev(bdev);
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return sb;	/* thaw_bdev releases s->s_umount */
}
EXPORT_SYMBOL(freeze_bdev);

/**
 * thaw_bdev  -- unlock filesystem
 * @bdev:	blockdevice to unlock
 * @sb:		associated superblock
 *
 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
 */
int thaw_bdev(struct block_device *bdev, struct super_block *sb)
{
	int error = -EINVAL;

	mutex_lock(&bdev->bd_fsfreeze_mutex);
	if (!bdev->bd_fsfreeze_count)
		goto out;

	error = 0;
	if (--bdev->bd_fsfreeze_count > 0)
		goto out;

	if (!sb)
		goto out;

	if (sb->s_op->thaw_super)
		error = sb->s_op->thaw_super(sb);
	else
		error = thaw_super(sb);
	if (error)
		bdev->bd_fsfreeze_count++;
out:
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return error;
}
EXPORT_SYMBOL(thaw_bdev);

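/*
 * Illustrative sketch (not part of the original file): a snapshot or
 * backup driver brackets its work with the pair above. freeze_bdev()
 * may return NULL (no filesystem mounted), which must still be handed
 * back to thaw_bdev(); an ERR_PTR() means the freeze failed and no thaw
 * is needed. The function name is hypothetical.
 */
#if 0	/* example only */
static int example_snapshot(struct block_device *bdev)
{
	struct super_block *sb = freeze_bdev(bdev);

	if (IS_ERR(sb))
		return PTR_ERR(sb);

	/* ... take the snapshot while writes are blocked ... */

	return thaw_bdev(bdev, sb);
}
#endif
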
391static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
392{
393 return block_write_full_page(page, blkdev_get_block, wbc);
394}
395
396static int blkdev_readpage(struct file * file, struct page * page)
397{
398 return block_read_full_page(page, blkdev_get_block);
399}
400
447f05bb
AM
401static int blkdev_readpages(struct file *file, struct address_space *mapping,
402 struct list_head *pages, unsigned nr_pages)
403{
404 return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block);
405}
406
6272b5a5
NP
407static int blkdev_write_begin(struct file *file, struct address_space *mapping,
408 loff_t pos, unsigned len, unsigned flags,
409 struct page **pagep, void **fsdata)
1da177e4 410{
155130a4
CH
411 return block_write_begin(mapping, pos, len, flags, pagep,
412 blkdev_get_block);
1da177e4
LT
413}
414
6272b5a5
NP
415static int blkdev_write_end(struct file *file, struct address_space *mapping,
416 loff_t pos, unsigned len, unsigned copied,
417 struct page *page, void *fsdata)
1da177e4 418{
6272b5a5
NP
419 int ret;
420 ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);
421
422 unlock_page(page);
09cbfeaf 423 put_page(page);
6272b5a5
NP
424
425 return ret;
1da177e4
LT
426}
427
428/*
429 * private llseek:
496ad9aa 430 * for a block special file file_inode(file)->i_size is zero
1da177e4
LT
431 * so we compute the size by hand (just as in block_read/write above)
432 */
965c8e59 433static loff_t block_llseek(struct file *file, loff_t offset, int whence)
1da177e4 434{
4ebb16ca 435 struct inode *bd_inode = bdev_file_inode(file);
1da177e4
LT
436 loff_t retval;
437
5955102c 438 inode_lock(bd_inode);
5d48f3a2 439 retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
5955102c 440 inode_unlock(bd_inode);
1da177e4
LT
441 return retval;
442}
443
02c24a82 444int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
1da177e4 445{
4ebb16ca 446 struct inode *bd_inode = bdev_file_inode(filp);
b8af67e2 447 struct block_device *bdev = I_BDEV(bd_inode);
ab0a9735 448 int error;
da5aa861
RW
449
450 error = filemap_write_and_wait_range(filp->f_mapping, start, end);
451 if (error)
452 return error;
ab0a9735 453
b8af67e2
AB
454 /*
455 * There is no need to serialise calls to blkdev_issue_flush with
456 * i_mutex and doing so causes performance issues with concurrent
457 * O_SYNC writers to a block device.
458 */
dd3932ed 459 error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
ab0a9735
CH
460 if (error == -EOPNOTSUPP)
461 error = 0;
b8af67e2 462
ab0a9735 463 return error;
1da177e4 464}
b1dd3b28 465EXPORT_SYMBOL(blkdev_fsync);
1da177e4 466
47a191fd
MW
467/**
468 * bdev_read_page() - Start reading a page from a block device
469 * @bdev: The device to read the page from
470 * @sector: The offset on the device to read the page to (need not be aligned)
471 * @page: The page to read
472 *
473 * On entry, the page should be locked. It will be unlocked when the page
474 * has been read. If the block driver implements rw_page synchronously,
475 * that will be true on exit from this function, but it need not be.
476 *
477 * Errors returned by this function are usually "soft", eg out of memory, or
478 * queue full; callers should try a different route to read this page rather
479 * than propagate an error back up the stack.
480 *
481 * Return: negative errno if an error occurs, 0 if submission was successful.
482 */
483int bdev_read_page(struct block_device *bdev, sector_t sector,
484 struct page *page)
485{
486 const struct block_device_operations *ops = bdev->bd_disk->fops;
2e6edc95
DW
487 int result = -EOPNOTSUPP;
488
f68eb1e7 489 if (!ops->rw_page || bdev_get_integrity(bdev))
2e6edc95
DW
490 return result;
491
6f3b0e8b 492 result = blk_queue_enter(bdev->bd_queue, false);
2e6edc95
DW
493 if (result)
494 return result;
c11f0c0b 495 result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false);
2e6edc95
DW
496 blk_queue_exit(bdev->bd_queue);
497 return result;
47a191fd
MW
498}
499EXPORT_SYMBOL_GPL(bdev_read_page);
500
501/**
502 * bdev_write_page() - Start writing a page to a block device
503 * @bdev: The device to write the page to
504 * @sector: The offset on the device to write the page to (need not be aligned)
505 * @page: The page to write
506 * @wbc: The writeback_control for the write
507 *
508 * On entry, the page should be locked and not currently under writeback.
509 * On exit, if the write started successfully, the page will be unlocked and
510 * under writeback. If the write failed already (eg the driver failed to
511 * queue the page to the device), the page will still be locked. If the
512 * caller is a ->writepage implementation, it will need to unlock the page.
513 *
514 * Errors returned by this function are usually "soft", eg out of memory, or
515 * queue full; callers should try a different route to write this page rather
516 * than propagate an error back up the stack.
517 *
518 * Return: negative errno if an error occurs, 0 if submission was successful.
519 */
520int bdev_write_page(struct block_device *bdev, sector_t sector,
521 struct page *page, struct writeback_control *wbc)
522{
523 int result;
47a191fd 524 const struct block_device_operations *ops = bdev->bd_disk->fops;
2e6edc95 525
f68eb1e7 526 if (!ops->rw_page || bdev_get_integrity(bdev))
47a191fd 527 return -EOPNOTSUPP;
6f3b0e8b 528 result = blk_queue_enter(bdev->bd_queue, false);
2e6edc95
DW
529 if (result)
530 return result;
531
47a191fd 532 set_page_writeback(page);
c11f0c0b 533 result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, true);
47a191fd
MW
534 if (result)
535 end_page_writeback(page);
536 else
537 unlock_page(page);
2e6edc95 538 blk_queue_exit(bdev->bd_queue);
47a191fd
MW
539 return result;
540}
541EXPORT_SYMBOL_GPL(bdev_write_page);
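
/*
 * Illustrative sketch (not part of the original file): callers such as
 * the mpage and swap I/O paths try the ->rw_page fast path first and
 * fall back to a regular bio when it is unavailable. Errors from
 * bdev_read_page() are "soft", so the fallback is taken rather than
 * failing the read. The fallback helper named below is hypothetical.
 */
#if 0	/* example only */
static int example_readpage(struct block_device *bdev, sector_t sector,
			    struct page *page)
{
	if (!bdev_read_page(bdev, sector, page))
		return 0;		/* submitted via ->rw_page */
	return example_submit_read_bio(bdev, sector, page);	/* hypothetical */
}
#endif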
542
dd22f551
MW
543/**
544 * bdev_direct_access() - Get the address for directly-accessibly memory
545 * @bdev: The device containing the memory
b2e0d162 546 * @dax: control and output parameters for ->direct_access
dd22f551
MW
547 *
548 * If a block device is made up of directly addressable memory, this function
549 * will tell the caller the PFN and the address of the memory. The address
550 * may be directly dereferenced within the kernel without the need to call
551 * ioremap(), kmap() or similar. The PFN is suitable for inserting into
552 * page tables.
553 *
554 * Return: negative errno if an error occurs, otherwise the number of bytes
555 * accessible at this address.
556 */
b2e0d162 557long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
dd22f551 558{
b2e0d162
DW
559 sector_t sector = dax->sector;
560 long avail, size = dax->size;
dd22f551
MW
561 const struct block_device_operations *ops = bdev->bd_disk->fops;
562
43c3dd08
MW
563 /*
564 * The device driver is allowed to sleep, in order to make the
565 * memory directly accessible.
566 */
567 might_sleep();
568
dd22f551
MW
569 if (size < 0)
570 return size;
163d4baa 571 if (!blk_queue_dax(bdev_get_queue(bdev)) || !ops->direct_access)
dd22f551
MW
572 return -EOPNOTSUPP;
573 if ((sector + DIV_ROUND_UP(size, 512)) >
574 part_nr_sects_read(bdev->bd_part))
575 return -ERANGE;
576 sector += get_start_sect(bdev);
577 if (sector % (PAGE_SIZE / 512))
578 return -EINVAL;
0a70bd43 579 avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size);
dd22f551
MW
580 if (!avail)
581 return -ERANGE;
fe683ada
DW
582 if (avail > 0 && avail & ~PAGE_MASK)
583 return -ENXIO;
dd22f551
MW
584 return min(avail, size);
585}
586EXPORT_SYMBOL_GPL(bdev_direct_access);
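
/*
 * Illustrative sketch (not part of the original file): DAX code fills a
 * struct blk_dax_ctl with the sector and wanted size; on success ->addr
 * and ->pfn describe directly addressable memory of at least the
 * returned length. The function name is hypothetical.
 */
#if 0	/* example only */
static long example_map_sector(struct block_device *bdev, sector_t sector)
{
	struct blk_dax_ctl dax = {
		.sector = sector,
		.size = PAGE_SIZE,
	};
	long len = bdev_direct_access(bdev, &dax);

	if (len < 0)
		return len;	/* -EOPNOTSUPP, -ERANGE, -EINVAL, ... */
	/* dax.addr may now be dereferenced for 'len' bytes; dax.pfn is
	 * suitable for inserting into page tables. */
	return len;
}
#endif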
587
2d96afc8
TK
588/**
589 * bdev_dax_supported() - Check if the device supports dax for filesystem
590 * @sb: The superblock of the device
591 * @blocksize: The block size of the device
592 *
593 * This is a library function for filesystems to check if the block device
594 * can be mounted with dax option.
595 *
596 * Return: negative errno if unsupported, 0 if supported.
597 */
598int bdev_dax_supported(struct super_block *sb, int blocksize)
599{
600 struct blk_dax_ctl dax = {
601 .sector = 0,
602 .size = PAGE_SIZE,
603 };
604 int err;
605
606 if (blocksize != PAGE_SIZE) {
607 vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax");
608 return -EINVAL;
609 }
610
611 err = bdev_direct_access(sb->s_bdev, &dax);
612 if (err < 0) {
613 switch (err) {
614 case -EOPNOTSUPP:
615 vfs_msg(sb, KERN_ERR,
616 "error: device does not support dax");
617 break;
618 case -EINVAL:
619 vfs_msg(sb, KERN_ERR,
620 "error: unaligned partition for dax");
621 break;
622 default:
623 vfs_msg(sb, KERN_ERR,
624 "error: dax access failed (%d)", err);
625 }
626 return err;
627 }
628
629 return 0;
630}
631EXPORT_SYMBOL_GPL(bdev_dax_supported);
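
/*
 * Illustrative sketch (not part of the original file): a filesystem that
 * honours a "-o dax" mount option calls the helper above from its mount
 * path and either fails the mount or falls back to non-DAX operation on
 * error. The function name is hypothetical.
 */
#if 0	/* example only */
static int example_check_dax(struct super_block *sb)
{
	if (bdev_dax_supported(sb, PAGE_SIZE) < 0)
		return -EINVAL;	/* or clear the fs's DAX option instead */
	return 0;
}
#endif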
632
a8078b1f
TK
633/**
634 * bdev_dax_capable() - Return if the raw device is capable for dax
635 * @bdev: The device for raw block device access
636 */
637bool bdev_dax_capable(struct block_device *bdev)
638{
a8078b1f
TK
639 struct blk_dax_ctl dax = {
640 .size = PAGE_SIZE,
641 };
642
643 if (!IS_ENABLED(CONFIG_FS_DAX))
644 return false;
645
646 dax.sector = 0;
647 if (bdev_direct_access(bdev, &dax) < 0)
648 return false;
649
650 dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512);
651 if (bdev_direct_access(bdev, &dax) < 0)
652 return false;
653
a8078b1f
TK
654 return true;
655}
656
1da177e4
LT
657/*
658 * pseudo-fs
659 */
660
661static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
e18b890b 662static struct kmem_cache * bdev_cachep __read_mostly;
1da177e4
LT
663
664static struct inode *bdev_alloc_inode(struct super_block *sb)
665{
e94b1766 666 struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
1da177e4
LT
667 if (!ei)
668 return NULL;
669 return &ei->vfs_inode;
670}
671
fa0d7e3d 672static void bdev_i_callback(struct rcu_head *head)
1da177e4 673{
fa0d7e3d 674 struct inode *inode = container_of(head, struct inode, i_rcu);
1da177e4
LT
675 struct bdev_inode *bdi = BDEV_I(inode);
676
1da177e4
LT
677 kmem_cache_free(bdev_cachep, bdi);
678}
679
fa0d7e3d
NP
680static void bdev_destroy_inode(struct inode *inode)
681{
682 call_rcu(&inode->i_rcu, bdev_i_callback);
683}
684
51cc5068 685static void init_once(void *foo)
1da177e4
LT
686{
687 struct bdev_inode *ei = (struct bdev_inode *) foo;
688 struct block_device *bdev = &ei->bdev;
689
a35afb83
CL
690 memset(bdev, 0, sizeof(*bdev));
691 mutex_init(&bdev->bd_mutex);
a35afb83 692 INIT_LIST_HEAD(&bdev->bd_list);
49731baa
TH
693#ifdef CONFIG_SYSFS
694 INIT_LIST_HEAD(&bdev->bd_holder_disks);
695#endif
a35afb83 696 inode_init_once(&ei->vfs_inode);
fcccf502
TS
697 /* Initialize mutex for freeze. */
698 mutex_init(&bdev->bd_fsfreeze_mutex);
1da177e4
LT
699}
700
b57922d9 701static void bdev_evict_inode(struct inode *inode)
1da177e4
LT
702{
703 struct block_device *bdev = &BDEV_I(inode)->bdev;
91b0abe3 704 truncate_inode_pages_final(&inode->i_data);
b57922d9 705 invalidate_inode_buffers(inode); /* is it needed here? */
dbd5768f 706 clear_inode(inode);
1da177e4 707 spin_lock(&bdev_lock);
1da177e4
LT
708 list_del_init(&bdev->bd_list);
709 spin_unlock(&bdev_lock);
710}
711
ee9b6d61 712static const struct super_operations bdev_sops = {
1da177e4
LT
713 .statfs = simple_statfs,
714 .alloc_inode = bdev_alloc_inode,
715 .destroy_inode = bdev_destroy_inode,
716 .drop_inode = generic_delete_inode,
b57922d9 717 .evict_inode = bdev_evict_inode,
1da177e4
LT
718};
719
51139ada
AV
720static struct dentry *bd_mount(struct file_system_type *fs_type,
721 int flags, const char *dev_name, void *data)
1da177e4 722{
3684aa70
SL
723 struct dentry *dent;
724 dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
e9e5e3fa 725 if (!IS_ERR(dent))
3684aa70
SL
726 dent->d_sb->s_iflags |= SB_I_CGROUPWB;
727 return dent;
1da177e4
LT
728}
729
730static struct file_system_type bd_type = {
731 .name = "bdev",
51139ada 732 .mount = bd_mount,
1da177e4
LT
733 .kill_sb = kill_anon_super,
734};
735
a212b105
TH
736struct super_block *blockdev_superblock __read_mostly;
737EXPORT_SYMBOL_GPL(blockdev_superblock);
1da177e4
LT
738
739void __init bdev_cache_init(void)
740{
741 int err;
ace8577a 742 static struct vfsmount *bd_mnt;
c2acf7b9 743
1da177e4 744 bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
fffb60f9 745 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
5d097056 746 SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
20c2df83 747 init_once);
1da177e4
LT
748 err = register_filesystem(&bd_type);
749 if (err)
750 panic("Cannot register bdev pseudo-fs");
751 bd_mnt = kern_mount(&bd_type);
1da177e4
LT
752 if (IS_ERR(bd_mnt))
753 panic("Cannot create bdev pseudo-fs");
ace8577a 754 blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
1da177e4
LT
755}
756
757/*
758 * Most likely _very_ bad one - but then it's hardly critical for small
759 * /dev and can be fixed when somebody will need really large one.
760 * Keep in mind that it will be fed through icache hash function too.
761 */
762static inline unsigned long hash(dev_t dev)
763{
764 return MAJOR(dev)+MINOR(dev);
765}
766
767static int bdev_test(struct inode *inode, void *data)
768{
769 return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data;
770}
771
772static int bdev_set(struct inode *inode, void *data)
773{
774 BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data;
775 return 0;
776}
777
778static LIST_HEAD(all_bdevs);
779
780struct block_device *bdget(dev_t dev)
781{
782 struct block_device *bdev;
783 struct inode *inode;
784
c2acf7b9 785 inode = iget5_locked(blockdev_superblock, hash(dev),
1da177e4
LT
786 bdev_test, bdev_set, &dev);
787
788 if (!inode)
789 return NULL;
790
791 bdev = &BDEV_I(inode)->bdev;
792
793 if (inode->i_state & I_NEW) {
794 bdev->bd_contains = NULL;
782b94cd 795 bdev->bd_super = NULL;
1da177e4
LT
796 bdev->bd_inode = inode;
797 bdev->bd_block_size = (1 << inode->i_blkbits);
798 bdev->bd_part_count = 0;
799 bdev->bd_invalidated = 0;
800 inode->i_mode = S_IFBLK;
801 inode->i_rdev = dev;
802 inode->i_bdev = bdev;
803 inode->i_data.a_ops = &def_blk_aops;
804 mapping_set_gfp_mask(&inode->i_data, GFP_USER);
1da177e4
LT
805 spin_lock(&bdev_lock);
806 list_add(&bdev->bd_list, &all_bdevs);
807 spin_unlock(&bdev_lock);
808 unlock_new_inode(inode);
809 }
810 return bdev;
811}
812
813EXPORT_SYMBOL(bdget);
814
dddac6a7
AJ
815/**
816 * bdgrab -- Grab a reference to an already referenced block device
817 * @bdev: Block device to grab a reference to.
818 */
819struct block_device *bdgrab(struct block_device *bdev)
820{
7de9c6ee 821 ihold(bdev->bd_inode);
dddac6a7
AJ
822 return bdev;
823}
c1681bf8 824EXPORT_SYMBOL(bdgrab);
dddac6a7 825
1da177e4
LT
826long nr_blockdev_pages(void)
827{
203a2935 828 struct block_device *bdev;
1da177e4
LT
829 long ret = 0;
830 spin_lock(&bdev_lock);
203a2935 831 list_for_each_entry(bdev, &all_bdevs, bd_list) {
1da177e4
LT
832 ret += bdev->bd_inode->i_mapping->nrpages;
833 }
834 spin_unlock(&bdev_lock);
835 return ret;
836}
837
838void bdput(struct block_device *bdev)
839{
840 iput(bdev->bd_inode);
841}
842
843EXPORT_SYMBOL(bdput);
844
845static struct block_device *bd_acquire(struct inode *inode)
846{
847 struct block_device *bdev;
09d967c6 848
1da177e4
LT
849 spin_lock(&bdev_lock);
850 bdev = inode->i_bdev;
09d967c6 851 if (bdev) {
ed8a9d2c 852 bdgrab(bdev);
1da177e4
LT
853 spin_unlock(&bdev_lock);
854 return bdev;
855 }
856 spin_unlock(&bdev_lock);
09d967c6 857
1da177e4
LT
858 bdev = bdget(inode->i_rdev);
859 if (bdev) {
860 spin_lock(&bdev_lock);
09d967c6
OH
861 if (!inode->i_bdev) {
862 /*
7de9c6ee 863 * We take an additional reference to bd_inode,
09d967c6
OH
864 * and it's released in clear_inode() of inode.
865 * So, we can access it via ->i_mapping always
866 * without igrab().
867 */
ed8a9d2c 868 bdgrab(bdev);
09d967c6
OH
869 inode->i_bdev = bdev;
870 inode->i_mapping = bdev->bd_inode->i_mapping;
09d967c6 871 }
1da177e4
LT
872 spin_unlock(&bdev_lock);
873 }
874 return bdev;
875}
876
877/* Call when you free inode */
878
879void bd_forget(struct inode *inode)
880{
09d967c6
OH
881 struct block_device *bdev = NULL;
882
1da177e4 883 spin_lock(&bdev_lock);
b4ea2eaa
YH
884 if (!sb_is_blkdev_sb(inode->i_sb))
885 bdev = inode->i_bdev;
a4a4f943
AV
886 inode->i_bdev = NULL;
887 inode->i_mapping = &inode->i_data;
1da177e4 888 spin_unlock(&bdev_lock);
09d967c6
OH
889
890 if (bdev)
ed8a9d2c 891 bdput(bdev);
1da177e4
LT
892}
893
1a3cbbc5
TH
894/**
895 * bd_may_claim - test whether a block device can be claimed
896 * @bdev: block device of interest
897 * @whole: whole block device containing @bdev, may equal @bdev
898 * @holder: holder trying to claim @bdev
899 *
25985edc 900 * Test whether @bdev can be claimed by @holder.
1a3cbbc5
TH
901 *
902 * CONTEXT:
903 * spin_lock(&bdev_lock).
904 *
905 * RETURNS:
906 * %true if @bdev can be claimed, %false otherwise.
907 */
908static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
909 void *holder)
1da177e4 910{
1da177e4 911 if (bdev->bd_holder == holder)
1a3cbbc5 912 return true; /* already a holder */
1da177e4 913 else if (bdev->bd_holder != NULL)
1a3cbbc5 914 return false; /* held by someone else */
1da177e4 915 else if (bdev->bd_contains == bdev)
1a3cbbc5 916 return true; /* is a whole device which isn't held */
1da177e4 917
e525fd89 918 else if (whole->bd_holder == bd_may_claim)
1a3cbbc5
TH
919 return true; /* is a partition of a device that is being partitioned */
920 else if (whole->bd_holder != NULL)
921 return false; /* is a partition of a held device */
1da177e4 922 else
1a3cbbc5
TH
923 return true; /* is a partition of an un-held device */
924}
925
6b4517a7
TH
926/**
927 * bd_prepare_to_claim - prepare to claim a block device
928 * @bdev: block device of interest
929 * @whole: the whole device containing @bdev, may equal @bdev
930 * @holder: holder trying to claim @bdev
931 *
932 * Prepare to claim @bdev. This function fails if @bdev is already
933 * claimed by another holder and waits if another claiming is in
934 * progress. This function doesn't actually claim. On successful
935 * return, the caller has ownership of bd_claiming and bd_holder[s].
936 *
937 * CONTEXT:
938 * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab
939 * it multiple times.
940 *
941 * RETURNS:
942 * 0 if @bdev can be claimed, -EBUSY otherwise.
943 */
944static int bd_prepare_to_claim(struct block_device *bdev,
945 struct block_device *whole, void *holder)
946{
947retry:
948 /* if someone else claimed, fail */
949 if (!bd_may_claim(bdev, whole, holder))
950 return -EBUSY;
951
e75aa858
TH
952 /* if claiming is already in progress, wait for it to finish */
953 if (whole->bd_claiming) {
6b4517a7
TH
954 wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
955 DEFINE_WAIT(wait);
956
957 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
958 spin_unlock(&bdev_lock);
959 schedule();
960 finish_wait(wq, &wait);
961 spin_lock(&bdev_lock);
962 goto retry;
963 }
964
965 /* yay, all mine */
966 return 0;
967}
968
969/**
970 * bd_start_claiming - start claiming a block device
971 * @bdev: block device of interest
972 * @holder: holder trying to claim @bdev
973 *
974 * @bdev is about to be opened exclusively. Check @bdev can be opened
975 * exclusively and mark that an exclusive open is in progress. Each
976 * successful call to this function must be matched with a call to
b0018361
NP
977 * either bd_finish_claiming() or bd_abort_claiming() (which do not
978 * fail).
979 *
980 * This function is used to gain exclusive access to the block device
981 * without actually causing other exclusive open attempts to fail. It
982 * should be used when the open sequence itself requires exclusive
983 * access but may subsequently fail.
6b4517a7
TH
984 *
985 * CONTEXT:
986 * Might sleep.
987 *
988 * RETURNS:
989 * Pointer to the block device containing @bdev on success, ERR_PTR()
990 * value on failure.
991 */
992static struct block_device *bd_start_claiming(struct block_device *bdev,
993 void *holder)
994{
995 struct gendisk *disk;
996 struct block_device *whole;
997 int partno, err;
998
999 might_sleep();
1000
1001 /*
1002 * @bdev might not have been initialized properly yet, look up
1003 * and grab the outer block device the hard way.
1004 */
1005 disk = get_gendisk(bdev->bd_dev, &partno);
1006 if (!disk)
1007 return ERR_PTR(-ENXIO);
1008
d4c208b8
TH
1009 /*
1010 * Normally, @bdev should equal what's returned from bdget_disk()
1011 * if partno is 0; however, some drivers (floppy) use multiple
1012 * bdev's for the same physical device and @bdev may be one of the
1013 * aliases. Keep @bdev if partno is 0. This means claimer
1014 * tracking is broken for those devices but it has always been that
1015 * way.
1016 */
1017 if (partno)
1018 whole = bdget_disk(disk, 0);
1019 else
1020 whole = bdgrab(bdev);
1021
cf342570 1022 module_put(disk->fops->owner);
6b4517a7
TH
1023 put_disk(disk);
1024 if (!whole)
1025 return ERR_PTR(-ENOMEM);
1026
1027 /* prepare to claim, if successful, mark claiming in progress */
1028 spin_lock(&bdev_lock);
1029
1030 err = bd_prepare_to_claim(bdev, whole, holder);
1031 if (err == 0) {
1032 whole->bd_claiming = holder;
1033 spin_unlock(&bdev_lock);
1034 return whole;
1035 } else {
1036 spin_unlock(&bdev_lock);
1037 bdput(whole);
1038 return ERR_PTR(err);
1039 }
1040}
1041
641dc636 1042#ifdef CONFIG_SYSFS
49731baa
TH
1043struct bd_holder_disk {
1044 struct list_head list;
1045 struct gendisk *disk;
1046 int refcnt;
1047};
1048
1049static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
1050 struct gendisk *disk)
1051{
1052 struct bd_holder_disk *holder;
1053
1054 list_for_each_entry(holder, &bdev->bd_holder_disks, list)
1055 if (holder->disk == disk)
1056 return holder;
1057 return NULL;
1058}
1059
4d7dd8fd 1060static int add_symlink(struct kobject *from, struct kobject *to)
641dc636 1061{
4d7dd8fd 1062 return sysfs_create_link(from, to, kobject_name(to));
641dc636
JN
1063}
1064
1065static void del_symlink(struct kobject *from, struct kobject *to)
1066{
641dc636
JN
1067 sysfs_remove_link(from, kobject_name(to));
1068}
1069
df6c0cd9 1070/**
e09b457b
TH
1071 * bd_link_disk_holder - create symlinks between holding disk and slave bdev
1072 * @bdev: the claimed slave bdev
1073 * @disk: the holding disk
df6c0cd9 1074 *
49731baa
TH
1075 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
1076 *
e09b457b 1077 * This functions creates the following sysfs symlinks.
641dc636 1078 *
e09b457b
TH
1079 * - from "slaves" directory of the holder @disk to the claimed @bdev
1080 * - from "holders" directory of the @bdev to the holder @disk
641dc636 1081 *
e09b457b
TH
1082 * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
1083 * passed to bd_link_disk_holder(), then:
641dc636 1084 *
e09b457b
TH
1085 * /sys/block/dm-0/slaves/sda --> /sys/block/sda
1086 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
641dc636 1087 *
e09b457b
TH
1088 * The caller must have claimed @bdev before calling this function and
1089 * ensure that both @bdev and @disk are valid during the creation and
1090 * lifetime of these symlinks.
641dc636 1091 *
e09b457b
TH
1092 * CONTEXT:
1093 * Might sleep.
641dc636 1094 *
e09b457b
TH
1095 * RETURNS:
1096 * 0 on success, -errno on failure.
641dc636 1097 */
e09b457b 1098int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
641dc636 1099{
49731baa 1100 struct bd_holder_disk *holder;
e09b457b 1101 int ret = 0;
641dc636 1102
2e7b651d 1103 mutex_lock(&bdev->bd_mutex);
df6c0cd9 1104
49731baa 1105 WARN_ON_ONCE(!bdev->bd_holder);
4e91672c 1106
e09b457b
TH
1107 /* FIXME: remove the following once add_disk() handles errors */
1108 if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
1109 goto out_unlock;
4e91672c 1110
49731baa
TH
1111 holder = bd_find_holder_disk(bdev, disk);
1112 if (holder) {
1113 holder->refcnt++;
e09b457b 1114 goto out_unlock;
49731baa 1115 }
641dc636 1116
49731baa
TH
1117 holder = kzalloc(sizeof(*holder), GFP_KERNEL);
1118 if (!holder) {
1119 ret = -ENOMEM;
e09b457b
TH
1120 goto out_unlock;
1121 }
641dc636 1122
49731baa
TH
1123 INIT_LIST_HEAD(&holder->list);
1124 holder->disk = disk;
1125 holder->refcnt = 1;
1126
1127 ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
1128 if (ret)
1129 goto out_free;
1130
1131 ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
1132 if (ret)
1133 goto out_del;
e7407d16
TH
1134 /*
1135 * bdev could be deleted beneath us which would implicitly destroy
1136 * the holder directory. Hold on to it.
1137 */
1138 kobject_get(bdev->bd_part->holder_dir);
49731baa
TH
1139
1140 list_add(&holder->list, &bdev->bd_holder_disks);
1141 goto out_unlock;
1142
1143out_del:
1144 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
1145out_free:
1146 kfree(holder);
e09b457b 1147out_unlock:
b4cf1b72 1148 mutex_unlock(&bdev->bd_mutex);
e09b457b 1149 return ret;
641dc636 1150}
e09b457b 1151EXPORT_SYMBOL_GPL(bd_link_disk_holder);
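
/*
 * Illustrative sketch (not part of the original file): stacking drivers
 * such as dm and md call bd_link_disk_holder() right after claiming a
 * component device and undo it with bd_unlink_disk_holder() before
 * releasing the claim. The function name is hypothetical and error
 * handling is abbreviated.
 */
#if 0	/* example only */
static int example_attach_component(dev_t dev, void *holder,
				    struct gendisk *my_disk)
{
	struct block_device *bdev;
	int r;

	bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
				 holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	r = bd_link_disk_holder(bdev, my_disk);	/* my_disk: the holder's gendisk */
	if (r)
		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	return r;
}
#endif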
641dc636 1152
49731baa
TH
1153/**
1154 * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
 1155 * @bdev: the claimed slave bdev
1156 * @disk: the holding disk
1157 *
1158 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
1159 *
1160 * CONTEXT:
1161 * Might sleep.
1162 */
1163void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
641dc636 1164{
49731baa 1165 struct bd_holder_disk *holder;
641dc636 1166
49731baa 1167 mutex_lock(&bdev->bd_mutex);
641dc636 1168
49731baa
TH
1169 holder = bd_find_holder_disk(bdev, disk);
1170
1171 if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
1172 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
1173 del_symlink(bdev->bd_part->holder_dir,
1174 &disk_to_dev(disk)->kobj);
e7407d16 1175 kobject_put(bdev->bd_part->holder_dir);
49731baa
TH
1176 list_del_init(&holder->list);
1177 kfree(holder);
1178 }
1179
1180 mutex_unlock(&bdev->bd_mutex);
1da177e4 1181}
49731baa 1182EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
641dc636 1183#endif
1da177e4 1184
56ade44b
AP
1185/**
1186 * flush_disk - invalidates all buffer-cache entries on a disk
1187 *
1188 * @bdev: struct block device to be flushed
e6eb5ce1 1189 * @kill_dirty: flag to guide handling of dirty inodes
56ade44b
AP
1190 *
1191 * Invalidates all buffer-cache entries on a disk. It should be called
1192 * when a disk has been changed -- either by a media change or online
1193 * resize.
1194 */
93b270f7 1195static void flush_disk(struct block_device *bdev, bool kill_dirty)
56ade44b 1196{
93b270f7 1197 if (__invalidate_device(bdev, kill_dirty)) {
56ade44b 1198 printk(KERN_WARNING "VFS: busy inodes on changed media or "
424081f3
DM
1199 "resized disk %s\n",
1200 bdev->bd_disk ? bdev->bd_disk->disk_name : "");
56ade44b
AP
1201 }
1202
1203 if (!bdev->bd_disk)
1204 return;
d27769ec 1205 if (disk_part_scan_enabled(bdev->bd_disk))
56ade44b
AP
1206 bdev->bd_invalidated = 1;
1207}
1208
c3279d14 1209/**
57d1b536 1210 * check_disk_size_change - checks for disk size change and adjusts bdev size.
c3279d14
AP
1211 * @disk: struct gendisk to check
1212 * @bdev: struct bdev to adjust.
1213 *
1214 * This routine checks to see if the bdev size does not match the disk size
1215 * and adjusts it if it differs.
1216 */
1217void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
1218{
1219 loff_t disk_size, bdev_size;
1220
1221 disk_size = (loff_t)get_capacity(disk) << 9;
1222 bdev_size = i_size_read(bdev->bd_inode);
1223 if (disk_size != bdev_size) {
c3279d14
AP
1224 printk(KERN_INFO
1225 "%s: detected capacity change from %lld to %lld\n",
424081f3 1226 disk->disk_name, bdev_size, disk_size);
c3279d14 1227 i_size_write(bdev->bd_inode, disk_size);
93b270f7 1228 flush_disk(bdev, false);
c3279d14
AP
1229 }
1230}
1231EXPORT_SYMBOL(check_disk_size_change);
1232
0c002c2f 1233/**
57d1b536 1234 * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
0c002c2f
AP
1235 * @disk: struct gendisk to be revalidated
1236 *
1237 * This routine is a wrapper for lower-level driver's revalidate_disk
1238 * call-backs. It is used to do common pre and post operations needed
1239 * for all revalidate_disk operations.
1240 */
1241int revalidate_disk(struct gendisk *disk)
1242{
c3279d14 1243 struct block_device *bdev;
0c002c2f
AP
1244 int ret = 0;
1245
1246 if (disk->fops->revalidate_disk)
1247 ret = disk->fops->revalidate_disk(disk);
25520d55 1248 blk_integrity_revalidate(disk);
c3279d14
AP
1249 bdev = bdget_disk(disk, 0);
1250 if (!bdev)
1251 return ret;
1252
1253 mutex_lock(&bdev->bd_mutex);
1254 check_disk_size_change(disk, bdev);
7630b661 1255 bdev->bd_invalidated = 0;
c3279d14
AP
1256 mutex_unlock(&bdev->bd_mutex);
1257 bdput(bdev);
0c002c2f
AP
1258 return ret;
1259}
1260EXPORT_SYMBOL(revalidate_disk);
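
/*
 * Illustrative sketch (not part of the original file): drivers call
 * revalidate_disk() after learning that the medium or exported size
 * changed, typically once the new capacity has been set. The function
 * name and 'new_sectors' are hypothetical.
 */
#if 0	/* example only */
static void example_update_capacity(struct gendisk *disk, sector_t new_sectors)
{
	set_capacity(disk, new_sectors);
	revalidate_disk(disk);	/* re-syncs the bdev inode size, see above */
}
#endif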
1261
1da177e4
LT
1262/*
1263 * This routine checks whether a removable media has been changed,
1264 * and invalidates all buffer-cache-entries in that case. This
1265 * is a relatively slow routine, so we have to try to minimize using
1266 * it. Thus it is called only upon a 'mount' or 'open'. This
1267 * is the best way of combining speed and utility, I think.
1268 * People changing diskettes in the middle of an operation deserve
1269 * to lose :-)
1270 */
1271int check_disk_change(struct block_device *bdev)
1272{
1273 struct gendisk *disk = bdev->bd_disk;
83d5cde4 1274 const struct block_device_operations *bdops = disk->fops;
77ea887e 1275 unsigned int events;
1da177e4 1276
77ea887e
TH
1277 events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
1278 DISK_EVENT_EJECT_REQUEST);
1279 if (!(events & DISK_EVENT_MEDIA_CHANGE))
1da177e4
LT
1280 return 0;
1281
93b270f7 1282 flush_disk(bdev, true);
1da177e4
LT
1283 if (bdops->revalidate_disk)
1284 bdops->revalidate_disk(bdev->bd_disk);
1da177e4
LT
1285 return 1;
1286}
1287
1288EXPORT_SYMBOL(check_disk_change);
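
/*
 * Illustrative sketch (not part of the original file): removable-media
 * drivers call check_disk_change() from their ->open() method so stale
 * buffers are dropped when the user swapped the medium between opens.
 * The function name is hypothetical.
 */
#if 0	/* example only */
static int example_open(struct block_device *bdev, fmode_t mode)
{
	check_disk_change(bdev);	/* flushes caches if the media changed */
	/* ... driver-specific open work ... */
	return 0;
}
#endif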
1289
1290void bd_set_size(struct block_device *bdev, loff_t size)
1291{
e1defc4f 1292 unsigned bsize = bdev_logical_block_size(bdev);
1da177e4 1293
5955102c 1294 inode_lock(bdev->bd_inode);
d646a02a 1295 i_size_write(bdev->bd_inode, size);
5955102c 1296 inode_unlock(bdev->bd_inode);
09cbfeaf 1297 while (bsize < PAGE_SIZE) {
1da177e4
LT
1298 if (size & bsize)
1299 break;
1300 bsize <<= 1;
1301 }
1302 bdev->bd_block_size = bsize;
1303 bdev->bd_inode->i_blkbits = blksize_bits(bsize);
1304}
1305EXPORT_SYMBOL(bd_set_size);
1306
4385bab1 1307static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
37be4124 1308
6d740cd5
PZ
1309/*
1310 * bd_mutex locking:
1311 *
1312 * mutex_lock(part->bd_mutex)
1313 * mutex_lock_nested(whole->bd_mutex, 1)
1314 */
1315
572c4892 1316static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1da177e4 1317{
1da177e4 1318 struct gendisk *disk;
523e1d39 1319 struct module *owner;
7db9cfd3 1320 int ret;
cf771cb5 1321 int partno;
fe6e9c1f
AV
1322 int perm = 0;
1323
572c4892 1324 if (mode & FMODE_READ)
fe6e9c1f 1325 perm |= MAY_READ;
572c4892 1326 if (mode & FMODE_WRITE)
fe6e9c1f
AV
1327 perm |= MAY_WRITE;
1328 /*
1329 * hooks: /n/, see "layering violations".
1330 */
b7300b78
CW
1331 if (!for_part) {
1332 ret = devcgroup_inode_permission(bdev->bd_inode, perm);
1333 if (ret != 0) {
1334 bdput(bdev);
1335 return ret;
1336 }
82666020 1337 }
7db9cfd3 1338
d3374825 1339 restart:
0762b8bd 1340
89f97496 1341 ret = -ENXIO;
cf771cb5 1342 disk = get_gendisk(bdev->bd_dev, &partno);
0762b8bd 1343 if (!disk)
6e9624b8 1344 goto out;
523e1d39 1345 owner = disk->fops->owner;
1da177e4 1346
69e02c59 1347 disk_block_events(disk);
6796bf54 1348 mutex_lock_nested(&bdev->bd_mutex, for_part);
1da177e4
LT
1349 if (!bdev->bd_openers) {
1350 bdev->bd_disk = disk;
87192a2a 1351 bdev->bd_queue = disk->queue;
1da177e4 1352 bdev->bd_contains = bdev;
03cdadb0 1353
cf771cb5 1354 if (!partno) {
89f97496
TH
1355 ret = -ENXIO;
1356 bdev->bd_part = disk_get_part(disk, partno);
1357 if (!bdev->bd_part)
1358 goto out_clear;
1359
1196f8b8 1360 ret = 0;
1da177e4 1361 if (disk->fops->open) {
572c4892 1362 ret = disk->fops->open(bdev, mode);
d3374825
N
1363 if (ret == -ERESTARTSYS) {
1364 /* Lost a race with 'disk' being
1365 * deleted, try again.
1366 * See md.c
1367 */
1368 disk_put_part(bdev->bd_part);
1369 bdev->bd_part = NULL;
d3374825 1370 bdev->bd_disk = NULL;
87192a2a 1371 bdev->bd_queue = NULL;
d3374825 1372 mutex_unlock(&bdev->bd_mutex);
69e02c59 1373 disk_unblock_events(disk);
69e02c59 1374 put_disk(disk);
523e1d39 1375 module_put(owner);
d3374825
N
1376 goto restart;
1377 }
1da177e4 1378 }
7e69723f 1379
22375701 1380 if (!ret)
7e69723f 1381 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
7e69723f 1382
1196f8b8
TH
1383 /*
1384 * If the device is invalidated, rescan partition
1385 * if open succeeded or failed with -ENOMEDIUM.
1386 * The latter is necessary to prevent ghost
1387 * partitions on a removed medium.
1388 */
fe316bf2
JN
1389 if (bdev->bd_invalidated) {
1390 if (!ret)
1391 rescan_partitions(disk, bdev);
1392 else if (ret == -ENOMEDIUM)
1393 invalidate_partitions(disk, bdev);
1394 }
5a023cdb 1395
1196f8b8
TH
1396 if (ret)
1397 goto out_clear;
1da177e4 1398 } else {
1da177e4
LT
1399 struct block_device *whole;
1400 whole = bdget_disk(disk, 0);
1401 ret = -ENOMEM;
1402 if (!whole)
0762b8bd 1403 goto out_clear;
37be4124 1404 BUG_ON(for_part);
572c4892 1405 ret = __blkdev_get(whole, mode, 1);
1da177e4 1406 if (ret)
0762b8bd 1407 goto out_clear;
1da177e4 1408 bdev->bd_contains = whole;
89f97496 1409 bdev->bd_part = disk_get_part(disk, partno);
e71bf0d0 1410 if (!(disk->flags & GENHD_FL_UP) ||
89f97496 1411 !bdev->bd_part || !bdev->bd_part->nr_sects) {
1da177e4 1412 ret = -ENXIO;
0762b8bd 1413 goto out_clear;
1da177e4 1414 }
89f97496 1415 bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
1da177e4
LT
1416 }
1417 } else {
1da177e4 1418 if (bdev->bd_contains == bdev) {
1196f8b8
TH
1419 ret = 0;
1420 if (bdev->bd_disk->fops->open)
572c4892 1421 ret = bdev->bd_disk->fops->open(bdev, mode);
1196f8b8 1422 /* the same as first opener case, read comment there */
fe316bf2
JN
1423 if (bdev->bd_invalidated) {
1424 if (!ret)
1425 rescan_partitions(bdev->bd_disk, bdev);
1426 else if (ret == -ENOMEDIUM)
1427 invalidate_partitions(bdev->bd_disk, bdev);
1428 }
1196f8b8
TH
1429 if (ret)
1430 goto out_unlock_bdev;
1da177e4 1431 }
69e02c59 1432 /* only one opener holds refs to the module and disk */
69e02c59 1433 put_disk(disk);
523e1d39 1434 module_put(owner);
1da177e4
LT
1435 }
1436 bdev->bd_openers++;
37be4124
N
1437 if (for_part)
1438 bdev->bd_part_count++;
c039e313 1439 mutex_unlock(&bdev->bd_mutex);
69e02c59 1440 disk_unblock_events(disk);
1da177e4
LT
1441 return 0;
1442
0762b8bd 1443 out_clear:
89f97496 1444 disk_put_part(bdev->bd_part);
1da177e4 1445 bdev->bd_disk = NULL;
0762b8bd 1446 bdev->bd_part = NULL;
87192a2a 1447 bdev->bd_queue = NULL;
1da177e4 1448 if (bdev != bdev->bd_contains)
572c4892 1449 __blkdev_put(bdev->bd_contains, mode, 1);
1da177e4 1450 bdev->bd_contains = NULL;
0762b8bd 1451 out_unlock_bdev:
c039e313 1452 mutex_unlock(&bdev->bd_mutex);
69e02c59 1453 disk_unblock_events(disk);
0762b8bd 1454 put_disk(disk);
523e1d39 1455 module_put(owner);
4345caba 1456 out:
0762b8bd
TH
1457 bdput(bdev);
1458
1da177e4
LT
1459 return ret;
1460}
1461
d4d77629
TH
1462/**
1463 * blkdev_get - open a block device
1464 * @bdev: block_device to open
1465 * @mode: FMODE_* mask
1466 * @holder: exclusive holder identifier
1467 *
1468 * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is
1469 * open with exclusive access. Specifying %FMODE_EXCL with %NULL
1470 * @holder is invalid. Exclusive opens may nest for the same @holder.
1471 *
1472 * On success, the reference count of @bdev is unchanged. On failure,
1473 * @bdev is put.
1474 *
1475 * CONTEXT:
1476 * Might sleep.
1477 *
1478 * RETURNS:
1479 * 0 on success, -errno on failure.
1480 */
e525fd89 1481int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
1da177e4 1482{
e525fd89
TH
1483 struct block_device *whole = NULL;
1484 int res;
1485
1486 WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
1487
1488 if ((mode & FMODE_EXCL) && holder) {
1489 whole = bd_start_claiming(bdev, holder);
1490 if (IS_ERR(whole)) {
1491 bdput(bdev);
1492 return PTR_ERR(whole);
1493 }
1494 }
1495
1496 res = __blkdev_get(bdev, mode, 0);
1497
1498 if (whole) {
d4dc210f
TH
1499 struct gendisk *disk = whole->bd_disk;
1500
6a027eff 1501 /* finish claiming */
77ea887e 1502 mutex_lock(&bdev->bd_mutex);
6a027eff
TH
1503 spin_lock(&bdev_lock);
1504
77ea887e 1505 if (!res) {
6a027eff
TH
1506 BUG_ON(!bd_may_claim(bdev, whole, holder));
1507 /*
1508 * Note that for a whole device bd_holders
1509 * will be incremented twice, and bd_holder
1510 * will be set to bd_may_claim before being
1511 * set to holder
1512 */
1513 whole->bd_holders++;
1514 whole->bd_holder = bd_may_claim;
1515 bdev->bd_holders++;
1516 bdev->bd_holder = holder;
1517 }
1518
1519 /* tell others that we're done */
1520 BUG_ON(whole->bd_claiming != holder);
1521 whole->bd_claiming = NULL;
1522 wake_up_bit(&whole->bd_claiming, 0);
1523
1524 spin_unlock(&bdev_lock);
77ea887e
TH
1525
1526 /*
d4dc210f
TH
1527 * Block event polling for write claims if requested. Any
1528 * write holder makes the write_holder state stick until
1529 * all are released. This is good enough and tracking
1530 * individual writeable reference is too fragile given the
1531 * way @mode is used in blkdev_get/put().
77ea887e 1532 */
4c49ff3f
TH
1533 if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
1534 (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
77ea887e 1535 bdev->bd_write_holder = true;
d4dc210f 1536 disk_block_events(disk);
77ea887e
TH
1537 }
1538
1539 mutex_unlock(&bdev->bd_mutex);
6a027eff 1540 bdput(whole);
e525fd89
TH
1541 }
1542
1543 return res;
37be4124 1544}
1da177e4
LT
1545EXPORT_SYMBOL(blkdev_get);
1546
d4d77629
TH
1547/**
1548 * blkdev_get_by_path - open a block device by name
1549 * @path: path to the block device to open
1550 * @mode: FMODE_* mask
1551 * @holder: exclusive holder identifier
1552 *
1553 * Open the blockdevice described by the device file at @path. @mode
1554 * and @holder are identical to blkdev_get().
1555 *
1556 * On success, the returned block_device has reference count of one.
1557 *
1558 * CONTEXT:
1559 * Might sleep.
1560 *
1561 * RETURNS:
1562 * Pointer to block_device on success, ERR_PTR(-errno) on failure.
1563 */
1564struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
1565 void *holder)
1566{
1567 struct block_device *bdev;
1568 int err;
1569
1570 bdev = lookup_bdev(path);
1571 if (IS_ERR(bdev))
1572 return bdev;
1573
1574 err = blkdev_get(bdev, mode, holder);
1575 if (err)
1576 return ERR_PTR(err);
1577
e51900f7
CE
1578 if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) {
1579 blkdev_put(bdev, mode);
1580 return ERR_PTR(-EACCES);
1581 }
1582
d4d77629
TH
1583 return bdev;
1584}
1585EXPORT_SYMBOL(blkdev_get_by_path);
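
/*
 * Illustrative sketch (not part of the original file): a filesystem or
 * stacking driver opens an extra device by name, using an identifying
 * pointer as the exclusive holder, and releases it with blkdev_put()
 * using the same mode. The function name is hypothetical.
 */
#if 0	/* example only */
static int example_probe_device(const char *path, void *holder)
{
	struct block_device *bdev;

	bdev = blkdev_get_by_path(path, FMODE_READ | FMODE_EXCL, holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	pr_info("%s: %llu byte device\n", path,
		(unsigned long long)i_size_read(bdev->bd_inode));

	blkdev_put(bdev, FMODE_READ | FMODE_EXCL);	/* same mode as the get */
	return 0;
}
#endif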
1586
1587/**
1588 * blkdev_get_by_dev - open a block device by device number
1589 * @dev: device number of block device to open
1590 * @mode: FMODE_* mask
1591 * @holder: exclusive holder identifier
1592 *
1593 * Open the blockdevice described by device number @dev. @mode and
1594 * @holder are identical to blkdev_get().
1595 *
1596 * Use it ONLY if you really do not have anything better - i.e. when
1597 * you are behind a truly sucky interface and all you are given is a
1598 * device number. _Never_ to be used for internal purposes. If you
1599 * ever need it - reconsider your API.
1600 *
1601 * On success, the returned block_device has reference count of one.
1602 *
1603 * CONTEXT:
1604 * Might sleep.
1605 *
1606 * RETURNS:
1607 * Pointer to block_device on success, ERR_PTR(-errno) on failure.
1608 */
1609struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
1610{
1611 struct block_device *bdev;
1612 int err;
1613
1614 bdev = bdget(dev);
1615 if (!bdev)
1616 return ERR_PTR(-ENOMEM);
1617
1618 err = blkdev_get(bdev, mode, holder);
1619 if (err)
1620 return ERR_PTR(err);
1621
1622 return bdev;
1623}
1624EXPORT_SYMBOL(blkdev_get_by_dev);
1625
1da177e4
LT
1626static int blkdev_open(struct inode * inode, struct file * filp)
1627{
1628 struct block_device *bdev;
1da177e4
LT
1629
1630 /*
1631 * Preserve backwards compatibility and allow large file access
1632 * even if userspace doesn't ask for it explicitly. Some mkfs
1633 * binary needs it. We might want to drop this workaround
1634 * during an unstable branch.
1635 */
1636 filp->f_flags |= O_LARGEFILE;
1637
572c4892
AV
1638 if (filp->f_flags & O_NDELAY)
1639 filp->f_mode |= FMODE_NDELAY;
1640 if (filp->f_flags & O_EXCL)
1641 filp->f_mode |= FMODE_EXCL;
1642 if ((filp->f_flags & O_ACCMODE) == 3)
1643 filp->f_mode |= FMODE_WRITE_IOCTL;
1644
1da177e4 1645 bdev = bd_acquire(inode);
6a2aae06
PE
1646 if (bdev == NULL)
1647 return -ENOMEM;
1da177e4 1648
572c4892
AV
1649 filp->f_mapping = bdev->bd_inode->i_mapping;
1650
e525fd89 1651 return blkdev_get(bdev, filp->f_mode, filp);
1da177e4
LT
1652}
1653
4385bab1 1654static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
2e7b651d 1655{
2e7b651d 1656 struct gendisk *disk = bdev->bd_disk;
37be4124 1657 struct block_device *victim = NULL;
2e7b651d 1658
6796bf54 1659 mutex_lock_nested(&bdev->bd_mutex, for_part);
37be4124
N
1660 if (for_part)
1661 bdev->bd_part_count--;
1662
2e7b651d 1663 if (!--bdev->bd_openers) {
6a027eff 1664 WARN_ON_ONCE(bdev->bd_holders);
2e7b651d
PZ
1665 sync_blockdev(bdev);
1666 kill_bdev(bdev);
43d1c0eb
ID
1667
1668 bdev_write_inode(bdev);
564f00f6 1669 /*
43d1c0eb
ID
1670 * Detaching bdev inode from its wb in __destroy_inode()
1671 * is too late: the queue which embeds its bdi (along with
1672 * root wb) can be gone as soon as we put_disk() below.
94007751 1673 */
43d1c0eb 1674 inode_detach_wb(bdev->bd_inode);
2e7b651d
PZ
1675 }
1676 if (bdev->bd_contains == bdev) {
1677 if (disk->fops->release)
db2a144b 1678 disk->fops->release(disk, mode);
2e7b651d
PZ
1679 }
1680 if (!bdev->bd_openers) {
1681 struct module *owner = disk->fops->owner;
1682
0762b8bd
TH
1683 disk_put_part(bdev->bd_part);
1684 bdev->bd_part = NULL;
2e7b651d 1685 bdev->bd_disk = NULL;
37be4124
N
1686 if (bdev != bdev->bd_contains)
1687 victim = bdev->bd_contains;
2e7b651d 1688 bdev->bd_contains = NULL;
523e1d39
TH
1689
1690 put_disk(disk);
1691 module_put(owner);
2e7b651d 1692 }
2e7b651d
PZ
1693 mutex_unlock(&bdev->bd_mutex);
1694 bdput(bdev);
37be4124 1695 if (victim)
9a1c3542 1696 __blkdev_put(victim, mode, 1);
2e7b651d
PZ
1697}
1698
4385bab1 1699void blkdev_put(struct block_device *bdev, fmode_t mode)
37be4124 1700{
85ef06d1
TH
1701 mutex_lock(&bdev->bd_mutex);
1702
e525fd89 1703 if (mode & FMODE_EXCL) {
6a027eff
TH
1704 bool bdev_free;
1705
1706 /*
1707 * Release a claim on the device. The holder fields
1708 * are protected with bdev_lock. bd_mutex is to
1709 * synchronize disk_holder unlinking.
1710 */
6a027eff
TH
1711 spin_lock(&bdev_lock);
1712
1713 WARN_ON_ONCE(--bdev->bd_holders < 0);
1714 WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0);
1715
1716 /* bd_contains might point to self, check in a separate step */
1717 if ((bdev_free = !bdev->bd_holders))
1718 bdev->bd_holder = NULL;
1719 if (!bdev->bd_contains->bd_holders)
1720 bdev->bd_contains->bd_holder = NULL;
1721
1722 spin_unlock(&bdev_lock);
1723
77ea887e
TH
1724 /*
1725 * If this was the last claim, remove holder link and
1726 * unblock evpoll if it was a write holder.
1727 */
85ef06d1
TH
1728 if (bdev_free && bdev->bd_write_holder) {
1729 disk_unblock_events(bdev->bd_disk);
1730 bdev->bd_write_holder = false;
77ea887e 1731 }
6936217c 1732 }
77ea887e 1733
85ef06d1
TH
1734 /*
1735 * Trigger event checking and tell drivers to flush MEDIA_CHANGE
1736 * event. This is to ensure detection of media removal commanded
1737 * from userland - e.g. eject(1).
1738 */
1739 disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE);
1740
1741 mutex_unlock(&bdev->bd_mutex);
1742
4385bab1 1743 __blkdev_put(bdev, mode, 0);
37be4124 1744}
2e7b651d
PZ
1745EXPORT_SYMBOL(blkdev_put);
1746
1da177e4
LT
1747static int blkdev_close(struct inode * inode, struct file * filp)
1748{
4ebb16ca 1749 struct block_device *bdev = I_BDEV(bdev_file_inode(filp));
4385bab1
AV
1750 blkdev_put(bdev, filp->f_mode);
1751 return 0;
1da177e4
LT
1752}
1753
bb93e3a5 1754static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
1da177e4 1755{
4ebb16ca 1756 struct block_device *bdev = I_BDEV(bdev_file_inode(file));
56b26add 1757 fmode_t mode = file->f_mode;
fd4ce1ac
CH
1758
1759 /*
1760 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
1761 * to updated it before every ioctl.
1762 */
56b26add 1763 if (file->f_flags & O_NDELAY)
fd4ce1ac
CH
1764 mode |= FMODE_NDELAY;
1765 else
1766 mode &= ~FMODE_NDELAY;
1767
56b26add 1768 return blkdev_ioctl(bdev, mode, cmd, arg);
1da177e4
LT
1769}
1770
eef99380
CH
1771/*
1772 * Write data to the block device. Only intended for the block device itself
1773 * and the raw driver which basically is a fake block device.
1774 *
1775 * Does not take i_mutex for the write and thus is not for general purpose
1776 * use.
1777 */
1456c0a8 1778ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
eef99380
CH
1779{
1780 struct file *file = iocb->ki_filp;
4ebb16ca 1781 struct inode *bd_inode = bdev_file_inode(file);
7ec7b94a 1782 loff_t size = i_size_read(bd_inode);
53362a05 1783 struct blk_plug plug;
eef99380 1784 ssize_t ret;
5f380c7f 1785
7ec7b94a
AV
1786 if (bdev_read_only(I_BDEV(bd_inode)))
1787 return -EPERM;
5f380c7f 1788
7ec7b94a 1789 if (!iov_iter_count(from))
5f380c7f
AV
1790 return 0;
1791
7ec7b94a
AV
1792 if (iocb->ki_pos >= size)
1793 return -ENOSPC;
1794
1795 iov_iter_truncate(from, size - iocb->ki_pos);
eef99380 1796
53362a05 1797 blk_start_plug(&plug);
1456c0a8 1798 ret = __generic_file_write_iter(iocb, from);
e2592217
CH
1799 if (ret > 0)
1800 ret = generic_write_sync(iocb, ret);
53362a05 1801 blk_finish_plug(&plug);
eef99380
CH
1802 return ret;
1803}
1456c0a8 1804EXPORT_SYMBOL_GPL(blkdev_write_iter);
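
The size check above is visible from userspace: a write that starts at or past the end of the device fails with ENOSPC instead of extending it. A hedged userspace sketch (the device path is hypothetical):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <unistd.h>

int main(void)
{
	unsigned long long size;
	char buf[512] = { 0 };
	int fd = open("/dev/sdX", O_WRONLY);	/* hypothetical device */

	if (fd < 0)
		return 1;
	ioctl(fd, BLKGETSIZE64, &size);		/* device size in bytes */
	if (pwrite(fd, buf, sizeof(buf), size) < 0 && errno == ENOSPC)
		printf("write at end of device rejected with ENOSPC\n");
	close(fd);
	return 0;
}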
eef99380 1805
b2de525f 1806ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
684c9aae
LT
1807{
1808 struct file *file = iocb->ki_filp;
4ebb16ca 1809 struct inode *bd_inode = bdev_file_inode(file);
684c9aae 1810 loff_t size = i_size_read(bd_inode);
a886038b 1811 loff_t pos = iocb->ki_pos;
684c9aae
LT
1812
1813 if (pos >= size)
1814 return 0;
1815
1816 size -= pos;
a886038b
AV
1817 iov_iter_truncate(to, size);
1818 return generic_file_read_iter(iocb, to);
684c9aae 1819}
b2de525f 1820EXPORT_SYMBOL_GPL(blkdev_read_iter);
684c9aae 1821
87d8fe1e
TT
1822/*
 1823 * Try to release a page associated with the block device when the system
1824 * is under memory pressure.
1825 */
1826static int blkdev_releasepage(struct page *page, gfp_t wait)
1827{
1828 struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
1829
1830 if (super && super->s_op->bdev_try_to_free_page)
1831 return super->s_op->bdev_try_to_free_page(super, page, wait);
1832
1833 return try_to_free_buffers(page);
1834}
1835
7f6d5b52
RZ
1836static int blkdev_writepages(struct address_space *mapping,
1837 struct writeback_control *wbc)
1838{
1839 if (dax_mapping(mapping)) {
1840 struct block_device *bdev = I_BDEV(mapping->host);
1841
1842 return dax_writeback_mapping_range(mapping, bdev, wbc);
1843 }
1844 return generic_writepages(mapping, wbc);
1845}
1846
4c54ac62 1847static const struct address_space_operations def_blk_aops = {
1da177e4 1848 .readpage = blkdev_readpage,
447f05bb 1849 .readpages = blkdev_readpages,
1da177e4 1850 .writepage = blkdev_writepage,
6272b5a5
NP
1851 .write_begin = blkdev_write_begin,
1852 .write_end = blkdev_write_end,
7f6d5b52 1853 .writepages = blkdev_writepages,
87d8fe1e 1854 .releasepage = blkdev_releasepage,
1da177e4 1855 .direct_IO = blkdev_direct_IO,
b4597226 1856 .is_dirty_writeback = buffer_check_dirty_writeback,
1da177e4
LT
1857};
1858
25f4c414
DW
1859#define BLKDEV_FALLOC_FL_SUPPORTED \
1860 (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
1861 FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)
1862
1863static long blkdev_fallocate(struct file *file, int mode, loff_t start,
1864 loff_t len)
1865{
1866 struct block_device *bdev = I_BDEV(bdev_file_inode(file));
1867 struct request_queue *q = bdev_get_queue(bdev);
1868 struct address_space *mapping;
1869 loff_t end = start + len - 1;
1870 loff_t isize;
1871 int error;
1872
1873 /* Fail if we don't recognize the flags. */
1874 if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
1875 return -EOPNOTSUPP;
1876
1877 /* Don't go off the end of the device. */
1878 isize = i_size_read(bdev->bd_inode);
1879 if (start >= isize)
1880 return -EINVAL;
1881 if (end >= isize) {
1882 if (mode & FALLOC_FL_KEEP_SIZE) {
1883 len = isize - start;
1884 end = start + len - 1;
1885 } else
1886 return -EINVAL;
1887 }
1888
1889 /*
1890 * Don't allow IO that isn't aligned to logical block size.
1891 */
1892 if ((start | len) & (bdev_logical_block_size(bdev) - 1))
1893 return -EINVAL;
1894
1895 /* Invalidate the page cache, including dirty pages. */
1896 mapping = bdev->bd_inode->i_mapping;
1897 truncate_inode_pages_range(mapping, start, end);
1898
1899 switch (mode) {
1900 case FALLOC_FL_ZERO_RANGE:
1901 case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
1902 error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
1903 GFP_KERNEL, false);
1904 break;
1905 case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
1906 /* Only punch if the device can do zeroing discard. */
1907 if (!blk_queue_discard(q) || !q->limits.discard_zeroes_data)
1908 return -EOPNOTSUPP;
1909 error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
1910 GFP_KERNEL, 0);
1911 break;
1912 case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
1913 if (!blk_queue_discard(q))
1914 return -EOPNOTSUPP;
1915 error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
1916 GFP_KERNEL, 0);
1917 break;
1918 default:
1919 return -EOPNOTSUPP;
1920 }
1921 if (error)
1922 return error;
1923
1924 /*
1925 * Invalidate again; if someone wandered in and dirtied a page,
1926 * the caller will be given -EBUSY. The third argument is
1927 * inclusive, so the rounding here is safe.
1928 */
1929 return invalidate_inode_pages2_range(mapping,
1930 start >> PAGE_SHIFT,
1931 end >> PAGE_SHIFT);
1932}
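
From userspace this surfaces through fallocate(2) on the device node; start and length must be aligned to the logical block size, and hole punching needs a device whose discard zeroes data. A minimal sketch with a hypothetical device path:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/sdX", O_WRONLY);	/* hypothetical device */

	if (fd < 0)
		return 1;
	/* Zero the first MiB via a zeroing discard; KEEP_SIZE is mandatory. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      0, 1 << 20) < 0)
		perror("fallocate");	/* e.g. EOPNOTSUPP without discard */
	close(fd);
	return 0;
}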
1933
4b6f5d20 1934const struct file_operations def_blk_fops = {
1da177e4
LT
1935 .open = blkdev_open,
1936 .release = blkdev_close,
1937 .llseek = block_llseek,
a886038b 1938 .read_iter = blkdev_read_iter,
1456c0a8 1939 .write_iter = blkdev_write_iter,
acc93d30 1940 .mmap = generic_file_mmap,
b1dd3b28 1941 .fsync = blkdev_fsync,
bb93e3a5 1942 .unlocked_ioctl = block_ioctl,
1da177e4
LT
1943#ifdef CONFIG_COMPAT
1944 .compat_ioctl = compat_blkdev_ioctl,
1945#endif
1e8b3332 1946 .splice_read = generic_file_splice_read,
8d020765 1947 .splice_write = iter_file_splice_write,
25f4c414 1948 .fallocate = blkdev_fallocate,
1da177e4
LT
1949};
1950
1951int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
1952{
1953 int res;
1954 mm_segment_t old_fs = get_fs();
1955 set_fs(KERNEL_DS);
56b26add 1956 res = blkdev_ioctl(bdev, 0, cmd, arg);
1da177e4
LT
1957 set_fs(old_fs);
1958 return res;
1959}
1960
1961EXPORT_SYMBOL(ioctl_by_bdev);
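
Because of the set_fs(KERNEL_DS) switch, in-kernel callers may pass kernel arguments where the ioctl would normally expect user memory. A minimal sketch of a caller requesting a partition rescan; the wrapper name is hypothetical, though some drivers do use ioctl_by_bdev() this way:

#include <linux/blkdev.h>
#include <linux/fs.h>

/* Ask the block layer to re-read the partition table of a whole disk. */
static int example_rescan_partitions(struct block_device *bdev)
{
	return ioctl_by_bdev(bdev, BLKRRPART, 0);
}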
1962
1963/**
1964 * lookup_bdev - lookup a struct block_device by name
94e2959e 1965 * @pathname: special file representing the block device
1da177e4 1966 *
57d1b536 1967 * Get a reference to the block device at @pathname in the current
1da177e4
LT
1968 * namespace if possible and return it. Return ERR_PTR(error)
1969 * otherwise.
1970 */
421748ec 1971struct block_device *lookup_bdev(const char *pathname)
1da177e4
LT
1972{
1973 struct block_device *bdev;
1974 struct inode *inode;
421748ec 1975 struct path path;
1da177e4
LT
1976 int error;
1977
421748ec 1978 if (!pathname || !*pathname)
1da177e4
LT
1979 return ERR_PTR(-EINVAL);
1980
421748ec 1981 error = kern_path(pathname, LOOKUP_FOLLOW, &path);
1da177e4
LT
1982 if (error)
1983 return ERR_PTR(error);
1984
bb668734 1985 inode = d_backing_inode(path.dentry);
1da177e4
LT
1986 error = -ENOTBLK;
1987 if (!S_ISBLK(inode->i_mode))
1988 goto fail;
1989 error = -EACCES;
a2982cc9 1990 if (!may_open_dev(&path))
1da177e4
LT
1991 goto fail;
1992 error = -ENOMEM;
1993 bdev = bd_acquire(inode);
1994 if (!bdev)
1995 goto fail;
1996out:
421748ec 1997 path_put(&path);
1da177e4
LT
1998 return bdev;
1999fail:
2000 bdev = ERR_PTR(error);
2001 goto out;
2002}
d5686b44 2003EXPORT_SYMBOL(lookup_bdev);
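
lookup_bdev() only resolves the name to a referenced struct block_device; it does not open the device. A minimal in-kernel sketch (the helper name is hypothetical) that peeks at the dev_t and drops the reference with bdput():

#include <linux/err.h>
#include <linux/fs.h>

static dev_t example_name_to_devt(const char *path)
{
	struct block_device *bdev = lookup_bdev(path);
	dev_t devt;

	if (IS_ERR(bdev))
		return 0;		/* not found or not a block device */
	devt = bdev->bd_dev;
	bdput(bdev);			/* drop the reference from bd_acquire() */
	return devt;
}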
1da177e4 2004
93b270f7 2005int __invalidate_device(struct block_device *bdev, bool kill_dirty)
b71e8a4c
DH
2006{
2007 struct super_block *sb = get_super(bdev);
2008 int res = 0;
2009
2010 if (sb) {
2011 /*
2012 * no need to lock the super, get_super holds the
2013 * read mutex so the filesystem cannot go away
2014 * under us (->put_super runs with the write lock
 2015		 * held).
2016 */
2017 shrink_dcache_sb(sb);
93b270f7 2018 res = invalidate_inodes(sb, kill_dirty);
b71e8a4c
DH
2019 drop_super(sb);
2020 }
f98393a6 2021 invalidate_bdev(bdev);
b71e8a4c
DH
2022 return res;
2023}
2024EXPORT_SYMBOL(__invalidate_device);
5c0d6b60
JK
2025
2026void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
2027{
2028 struct inode *inode, *old_inode = NULL;
2029
74278da9 2030 spin_lock(&blockdev_superblock->s_inode_list_lock);
5c0d6b60
JK
2031 list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
2032 struct address_space *mapping = inode->i_mapping;
2033
2034 spin_lock(&inode->i_lock);
2035 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
2036 mapping->nrpages == 0) {
2037 spin_unlock(&inode->i_lock);
2038 continue;
2039 }
2040 __iget(inode);
2041 spin_unlock(&inode->i_lock);
74278da9 2042 spin_unlock(&blockdev_superblock->s_inode_list_lock);
5c0d6b60
JK
2043 /*
2044 * We hold a reference to 'inode' so it couldn't have been
2045 * removed from s_inodes list while we dropped the
74278da9 2046		 * s_inode_list_lock. We cannot iput the inode now as we can
5c0d6b60 2047 * be holding the last reference and we cannot iput it under
74278da9 2048 * s_inode_list_lock. So we keep the reference and iput it
5c0d6b60
JK
2049 * later.
2050 */
2051 iput(old_inode);
2052 old_inode = inode;
2053
2054 func(I_BDEV(inode), arg);
2055
74278da9 2056 spin_lock(&blockdev_superblock->s_inode_list_lock);
5c0d6b60 2057 }
74278da9 2058 spin_unlock(&blockdev_superblock->s_inode_list_lock);
5c0d6b60
JK
2059 iput(old_inode);
2060}
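
A minimal sketch of a caller; fs/sync.c uses the same pattern to start writeback on every block device's page cache during sync(2). The helper names here are illustrative:

#include <linux/blkdev.h>
#include <linux/fs.h>

static void example_writeback_one_bdev(struct block_device *bdev, void *arg)
{
	filemap_fdatawrite(bdev->bd_inode->i_mapping);
}

static void example_sync_all_bdevs(void)
{
	iterate_bdevs(example_writeback_one_bdev, NULL);
}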