aio: first support for buffered async writes
[linux-block.git] / fs / buffer.c
1da177e4
LT
1/*
2 * linux/fs/buffer.c
3 *
4 * Copyright (C) 1991, 1992, 2002 Linus Torvalds
5 */
6
7/*
8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9 *
10 * Removed a lot of unnecessary code and simplified things now that
11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12 *
13 * Speed up hash, lru, and free list operations. Use gfp() for allocating
14 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
15 *
16 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
17 *
18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19 */
20
1da177e4
LT
21#include <linux/kernel.h>
22#include <linux/syscalls.h>
23#include <linux/fs.h>
24#include <linux/mm.h>
25#include <linux/percpu.h>
26#include <linux/slab.h>
16f7e0fe 27#include <linux/capability.h>
1da177e4
LT
28#include <linux/blkdev.h>
29#include <linux/file.h>
30#include <linux/quotaops.h>
31#include <linux/highmem.h>
32#include <linux/module.h>
33#include <linux/writeback.h>
34#include <linux/hash.h>
35#include <linux/suspend.h>
36#include <linux/buffer_head.h>
55e829af 37#include <linux/task_io_accounting_ops.h>
1da177e4
LT
38#include <linux/bio.h>
39#include <linux/notifier.h>
40#include <linux/cpu.h>
41#include <linux/bitops.h>
42#include <linux/mpage.h>
fb1c8f93 43#include <linux/bit_spinlock.h>
1da177e4
LT
44
45static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
1da177e4
LT
46
47#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
48
49inline void
50init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
51{
52 bh->b_end_io = handler;
53 bh->b_private = private;
54}
1fe72eaa 55EXPORT_SYMBOL(init_buffer);
1da177e4
LT
56
57static int sync_buffer(void *word)
58{
59 struct block_device *bd;
60 struct buffer_head *bh
61 = container_of(word, struct buffer_head, b_state);
62
63 smp_mb();
64 bd = bh->b_bdev;
65 if (bd)
66 blk_run_address_space(bd->bd_inode->i_mapping);
f80e69e7
JA
67 if (!in_aio(current))
68 io_schedule();
1da177e4
LT
69 return 0;
70}
71
fc9b52cd 72void __lock_buffer(struct buffer_head *bh)
1da177e4
LT
73{
74 wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
75 TASK_UNINTERRUPTIBLE);
76}
77EXPORT_SYMBOL(__lock_buffer);
78
fc9b52cd 79void unlock_buffer(struct buffer_head *bh)
1da177e4 80{
51b07fc3 81 clear_bit_unlock(BH_Lock, &bh->b_state);
1da177e4
LT
82 smp_mb__after_clear_bit();
83 wake_up_bit(&bh->b_state, BH_Lock);
84}
1fe72eaa 85EXPORT_SYMBOL(unlock_buffer);
1da177e4
LT
86
87/*
88 * Block until a buffer comes unlocked. This doesn't stop it
89 * from becoming locked again - you have to lock it yourself
90 * if you want to preserve its state.
91 */
92void __wait_on_buffer(struct buffer_head * bh)
93{
94 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
95}
1fe72eaa 96EXPORT_SYMBOL(__wait_on_buffer);
1da177e4 97
f80e69e7
JA
98int __wait_on_buffer_async(struct buffer_head *bh, struct wait_bit_queue *wait)
99{
100 return wait_on_bit_async(&bh->b_state, BH_Lock, sync_buffer,
101 TASK_UNINTERRUPTIBLE, wait);
102}
103EXPORT_SYMBOL(__wait_on_buffer_async);
104
1da177e4
LT
105static void
106__clear_page_buffers(struct page *page)
107{
108 ClearPagePrivate(page);
4c21e2f2 109 set_page_private(page, 0);
1da177e4
LT
110 page_cache_release(page);
111}
112
08bafc03
KM
113
114static int quiet_error(struct buffer_head *bh)
115{
116 if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
117 return 0;
118 return 1;
119}
120
121
1da177e4
LT
122static void buffer_io_error(struct buffer_head *bh)
123{
124 char b[BDEVNAME_SIZE];
1da177e4
LT
125 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
126 bdevname(bh->b_bdev, b),
127 (unsigned long long)bh->b_blocknr);
128}
129
130/*
68671f35
DM
131 * End-of-IO handler helper function which does not touch the bh after
132 * unlocking it.
133 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
134 * a race there is benign: unlock_buffer() only uses the bh's address for
135 * hashing after unlocking the buffer, so it doesn't actually touch the bh
136 * itself.
1da177e4 137 */
68671f35 138static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
1da177e4
LT
139{
140 if (uptodate) {
141 set_buffer_uptodate(bh);
142 } else {
143 /* This happens, due to failed READA attempts. */
144 clear_buffer_uptodate(bh);
145 }
146 unlock_buffer(bh);
68671f35
DM
147}
148
149/*
150 * Default synchronous end-of-IO handler.. Just mark it up-to-date and
151 * unlock the buffer. This is what ll_rw_block uses too.
152 */
153void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
154{
155 __end_buffer_read_notouch(bh, uptodate);
1da177e4
LT
156 put_bh(bh);
157}
1fe72eaa 158EXPORT_SYMBOL(end_buffer_read_sync);
1da177e4
LT
159
160void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
161{
162 char b[BDEVNAME_SIZE];
163
164 if (uptodate) {
165 set_buffer_uptodate(bh);
166 } else {
08bafc03 167 if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) {
1da177e4
LT
168 buffer_io_error(bh);
169 printk(KERN_WARNING "lost page write due to "
170 "I/O error on %s\n",
171 bdevname(bh->b_bdev, b));
172 }
173 set_buffer_write_io_error(bh);
174 clear_buffer_uptodate(bh);
175 }
176 unlock_buffer(bh);
177 put_bh(bh);
178}
1fe72eaa 179EXPORT_SYMBOL(end_buffer_write_sync);
1da177e4 180
1da177e4
LT
181/*
182 * Various filesystems appear to want __find_get_block to be non-blocking.
183 * But it's the page lock which protects the buffers. To get around this,
184 * we get exclusion from try_to_free_buffers with the blockdev mapping's
185 * private_lock.
186 *
187 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
188 * may be quite high. This code could TryLock the page, and if that
189 * succeeds, there is no need to take private_lock. (But if
190 * private_lock is contended then so is mapping->tree_lock).
191 */
192static struct buffer_head *
385fd4c5 193__find_get_block_slow(struct block_device *bdev, sector_t block)
1da177e4
LT
194{
195 struct inode *bd_inode = bdev->bd_inode;
196 struct address_space *bd_mapping = bd_inode->i_mapping;
197 struct buffer_head *ret = NULL;
198 pgoff_t index;
199 struct buffer_head *bh;
200 struct buffer_head *head;
201 struct page *page;
202 int all_mapped = 1;
203
204 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
205 page = find_get_page(bd_mapping, index);
206 if (!page)
207 goto out;
208
209 spin_lock(&bd_mapping->private_lock);
210 if (!page_has_buffers(page))
211 goto out_unlock;
212 head = page_buffers(page);
213 bh = head;
214 do {
97f76d3d
NK
215 if (!buffer_mapped(bh))
216 all_mapped = 0;
217 else if (bh->b_blocknr == block) {
1da177e4
LT
218 ret = bh;
219 get_bh(bh);
220 goto out_unlock;
221 }
1da177e4
LT
222 bh = bh->b_this_page;
223 } while (bh != head);
224
225 /* we might be here because some of the buffers on this page are
226 * not mapped. This is due to various races between
227 * file io on the block device and getblk. It gets dealt with
228 * elsewhere, don't buffer_error if we had some unmapped buffers
229 */
230 if (all_mapped) {
231 printk("__find_get_block_slow() failed. "
232 "block=%llu, b_blocknr=%llu\n",
205f87f6
BP
233 (unsigned long long)block,
234 (unsigned long long)bh->b_blocknr);
235 printk("b_state=0x%08lx, b_size=%zu\n",
236 bh->b_state, bh->b_size);
1da177e4
LT
237 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
238 }
239out_unlock:
240 spin_unlock(&bd_mapping->private_lock);
241 page_cache_release(page);
242out:
243 return ret;
244}
245
246/* If invalidate_buffers() will trash dirty buffers, it means some kind
247 of fs corruption is going on. Trashing dirty data always implies losing
248 information that was supposed to be just stored on the physical layer
249 by the user.
250
251 Thus invalidate_buffers in general usage is not allowed to trash
252 dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
253 be preserved. These buffers are simply skipped.
254
255 We also skip buffers which are still in use. For example this can
256 happen if a userspace program is reading the block device.
257
258 NOTE: In the case where the user removed a removable-media-disk even if
259 there's still dirty data not synced on disk (due to a bug in the device driver
260 or due to an error by the user), by not destroying the dirty buffers we could
261 generate corruption also on the next medium inserted. Thus a parameter is
262 necessary to handle this case in the safest way possible (trying
263 not to corrupt the newly inserted disk with the data belonging to
264 the old, now corrupted disk). Also, for the ramdisk the natural thing
265 to do in order to release the ramdisk memory is to destroy dirty buffers.
266
267 These are two special cases. Normal usage implies that the device driver
268 issues a sync on the device (without waiting for I/O completion) and
269 then an invalidate_buffers call that doesn't trash dirty buffers.
270
271 For handling cache coherency with the blkdev pagecache the 'update' case
272 has been introduced. It is needed to re-read from disk any pinned
273 buffer. NOTE: re-reading from disk is destructive so we can do it only
274 when we assume nobody is changing the buffercache under our I/O and when
275 we think the disk contains more recent information than the buffercache.
276 The update == 1 pass marks the buffers we need to update, the update == 2
277 pass does the actual I/O. */
f98393a6 278void invalidate_bdev(struct block_device *bdev)
1da177e4 279{
0e1dfc66
AM
280 struct address_space *mapping = bdev->bd_inode->i_mapping;
281
282 if (mapping->nrpages == 0)
283 return;
284
1da177e4 285 invalidate_bh_lrus();
fc0ecff6 286 invalidate_mapping_pages(mapping, 0, -1);
1da177e4 287}
1fe72eaa 288EXPORT_SYMBOL(invalidate_bdev);
1da177e4
LT
289
290/*
5b0830cb 291 * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
1da177e4
LT
292 */
293static void free_more_memory(void)
294{
19770b32 295 struct zone *zone;
0e88460d 296 int nid;
1da177e4 297
03ba3782 298 wakeup_flusher_threads(1024);
1da177e4
LT
299 yield();
300
0e88460d 301 for_each_online_node(nid) {
19770b32
MG
302 (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
303 gfp_zone(GFP_NOFS), NULL,
304 &zone);
305 if (zone)
54a6eb5c 306 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
327c0e96 307 GFP_NOFS, NULL);
1da177e4
LT
308 }
309}
310
311/*
312 * I/O completion handler for block_read_full_page() - pages
313 * which come unlocked at the end of I/O.
314 */
315static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
316{
1da177e4 317 unsigned long flags;
a3972203 318 struct buffer_head *first;
1da177e4
LT
319 struct buffer_head *tmp;
320 struct page *page;
321 int page_uptodate = 1;
322
323 BUG_ON(!buffer_async_read(bh));
324
325 page = bh->b_page;
326 if (uptodate) {
327 set_buffer_uptodate(bh);
328 } else {
329 clear_buffer_uptodate(bh);
08bafc03 330 if (!quiet_error(bh))
1da177e4
LT
331 buffer_io_error(bh);
332 SetPageError(page);
333 }
334
335 /*
336 * Be _very_ careful from here on. Bad things can happen if
337 * two buffer heads end IO at almost the same time and both
338 * decide that the page is now completely done.
339 */
a3972203
NP
340 first = page_buffers(page);
341 local_irq_save(flags);
342 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
1da177e4
LT
343 clear_buffer_async_read(bh);
344 unlock_buffer(bh);
345 tmp = bh;
346 do {
347 if (!buffer_uptodate(tmp))
348 page_uptodate = 0;
349 if (buffer_async_read(tmp)) {
350 BUG_ON(!buffer_locked(tmp));
351 goto still_busy;
352 }
353 tmp = tmp->b_this_page;
354 } while (tmp != bh);
a3972203
NP
355 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
356 local_irq_restore(flags);
1da177e4
LT
357
358 /*
359 * If none of the buffers had errors and they are all
360 * uptodate then we can set the page uptodate.
361 */
362 if (page_uptodate && !PageError(page))
363 SetPageUptodate(page);
364 unlock_page(page);
365 return;
366
367still_busy:
a3972203
NP
368 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
369 local_irq_restore(flags);
1da177e4
LT
370 return;
371}
372
373/*
374 * Completion handler for block_write_full_page() - pages which are unlocked
375 * during I/O, and which have PageWriteback cleared upon I/O completion.
376 */
35c80d5f 377void end_buffer_async_write(struct buffer_head *bh, int uptodate)
1da177e4
LT
378{
379 char b[BDEVNAME_SIZE];
1da177e4 380 unsigned long flags;
a3972203 381 struct buffer_head *first;
1da177e4
LT
382 struct buffer_head *tmp;
383 struct page *page;
384
385 BUG_ON(!buffer_async_write(bh));
386
387 page = bh->b_page;
388 if (uptodate) {
389 set_buffer_uptodate(bh);
390 } else {
08bafc03 391 if (!quiet_error(bh)) {
1da177e4
LT
392 buffer_io_error(bh);
393 printk(KERN_WARNING "lost page write due to "
394 "I/O error on %s\n",
395 bdevname(bh->b_bdev, b));
396 }
397 set_bit(AS_EIO, &page->mapping->flags);
58ff407b 398 set_buffer_write_io_error(bh);
1da177e4
LT
399 clear_buffer_uptodate(bh);
400 SetPageError(page);
401 }
402
a3972203
NP
403 first = page_buffers(page);
404 local_irq_save(flags);
405 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
406
1da177e4
LT
407 clear_buffer_async_write(bh);
408 unlock_buffer(bh);
409 tmp = bh->b_this_page;
410 while (tmp != bh) {
411 if (buffer_async_write(tmp)) {
412 BUG_ON(!buffer_locked(tmp));
413 goto still_busy;
414 }
415 tmp = tmp->b_this_page;
416 }
a3972203
NP
417 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
418 local_irq_restore(flags);
1da177e4
LT
419 end_page_writeback(page);
420 return;
421
422still_busy:
a3972203
NP
423 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
424 local_irq_restore(flags);
1da177e4
LT
425 return;
426}
1fe72eaa 427EXPORT_SYMBOL(end_buffer_async_write);
1da177e4
LT
428
429/*
430 * If a page's buffers are under async read-in (end_buffer_async_read
431 * completion) then there is a possibility that another thread of
432 * control could lock one of the buffers after it has completed
433 * but while some of the other buffers have not completed. This
434 * locked buffer would confuse end_buffer_async_read() into not unlocking
435 * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
436 * that this buffer is not under async I/O.
437 *
438 * The page comes unlocked when it has no locked buffer_async buffers
439 * left.
440 *
441 * PageLocked prevents anyone starting new async I/O reads any of
442 * the buffers.
443 *
444 * PageWriteback is used to prevent simultaneous writeout of the same
445 * page.
446 *
447 * PageLocked prevents anyone from starting writeback of a page which is
448 * under read I/O (PageWriteback is only ever set against a locked page).
449 */
450static void mark_buffer_async_read(struct buffer_head *bh)
451{
452 bh->b_end_io = end_buffer_async_read;
453 set_buffer_async_read(bh);
454}
455
1fe72eaa
HS
456static void mark_buffer_async_write_endio(struct buffer_head *bh,
457 bh_end_io_t *handler)
1da177e4 458{
35c80d5f 459 bh->b_end_io = handler;
1da177e4
LT
460 set_buffer_async_write(bh);
461}
35c80d5f
CM
462
463void mark_buffer_async_write(struct buffer_head *bh)
464{
465 mark_buffer_async_write_endio(bh, end_buffer_async_write);
466}
1da177e4
LT
467EXPORT_SYMBOL(mark_buffer_async_write);
468
469
470/*
471 * fs/buffer.c contains helper functions for buffer-backed address space's
472 * fsync functions. A common requirement for buffer-based filesystems is
473 * that certain data from the backing blockdev needs to be written out for
474 * a successful fsync(). For example, ext2 indirect blocks need to be
475 * written back and waited upon before fsync() returns.
476 *
477 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
478 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
479 * management of a list of dependent buffers at ->i_mapping->private_list.
480 *
481 * Locking is a little subtle: try_to_free_buffers() will remove buffers
482 * from their controlling inode's queue when they are being freed. But
483 * try_to_free_buffers() will be operating against the *blockdev* mapping
484 * at the time, not against the S_ISREG file which depends on those buffers.
485 * So the locking for private_list is via the private_lock in the address_space
486 * which backs the buffers. Which is different from the address_space
487 * against which the buffers are listed. So for a particular address_space,
488 * mapping->private_lock does *not* protect mapping->private_list! In fact,
489 * mapping->private_list will always be protected by the backing blockdev's
490 * ->private_lock.
491 *
492 * Which introduces a requirement: all buffers on an address_space's
493 * ->private_list must be from the same address_space: the blockdev's.
494 *
495 * address_spaces which do not place buffers at ->private_list via these
496 * utility functions are free to use private_lock and private_list for
497 * whatever they want. The only requirement is that list_empty(private_list)
498 * be true at clear_inode() time.
499 *
500 * FIXME: clear_inode should not call invalidate_inode_buffers(). The
501 * filesystems should do that. invalidate_inode_buffers() should just go
502 * BUG_ON(!list_empty).
503 *
504 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
505 * take an address_space, not an inode. And it should be called
506 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
507 * queued up.
508 *
509 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
510 * list if it is already on a list. Because if the buffer is on a list,
511 * it *must* already be on the right one. If not, the filesystem is being
512 * silly. This will save a ton of locking. But first we have to ensure
513 * that buffers are taken *off* the old inode's list when they are freed
514 * (presumably in truncate). That requires careful auditing of all
515 * filesystems (do it inside bforget()). It could also be done by bringing
516 * b_inode back.
517 */
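
/*
 * An illustrative sketch of how a buffer-based filesystem might use the
 * helpers described above.  The example_* names and the fsync prototype are
 * hypothetical; only mark_buffer_dirty_inode() and sync_mapping_buffers()
 * are the real interfaces defined in this file.
 */
static void example_dirty_metadata(struct inode *inode, struct buffer_head *bh)
{
	/* Dirty the buffer and queue it on inode->i_mapping->private_list */
	mark_buffer_dirty_inode(bh, inode);
}

static int example_fsync(struct file *file, struct dentry *dentry, int datasync)
{
	/* Write out and wait upon the buffers queued above */
	return sync_mapping_buffers(dentry->d_inode->i_mapping);
}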
518
519/*
520 * The buffer's backing address_space's private_lock must be held
521 */
dbacefc9 522static void __remove_assoc_queue(struct buffer_head *bh)
1da177e4
LT
523{
524 list_del_init(&bh->b_assoc_buffers);
58ff407b
JK
525 WARN_ON(!bh->b_assoc_map);
526 if (buffer_write_io_error(bh))
527 set_bit(AS_EIO, &bh->b_assoc_map->flags);
528 bh->b_assoc_map = NULL;
1da177e4
LT
529}
530
531int inode_has_buffers(struct inode *inode)
532{
533 return !list_empty(&inode->i_data.private_list);
534}
535
536/*
537 * osync is designed to support O_SYNC io. It waits synchronously for
538 * all already-submitted IO to complete, but does not queue any new
539 * writes to the disk.
540 *
541 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
542 * you dirty the buffers, and then use osync_inode_buffers to wait for
543 * completion. Any other dirty buffers which are not yet queued for
544 * write will not be flushed to disk by the osync.
545 */
546static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
547{
548 struct buffer_head *bh;
549 struct list_head *p;
550 int err = 0;
551
552 spin_lock(lock);
553repeat:
554 list_for_each_prev(p, list) {
555 bh = BH_ENTRY(p);
556 if (buffer_locked(bh)) {
557 get_bh(bh);
558 spin_unlock(lock);
559 wait_on_buffer(bh);
560 if (!buffer_uptodate(bh))
561 err = -EIO;
562 brelse(bh);
563 spin_lock(lock);
564 goto repeat;
565 }
566 }
567 spin_unlock(lock);
568 return err;
569}
570
1fe72eaa 571static void do_thaw_all(struct work_struct *work)
c2d75438
ES
572{
573 struct super_block *sb;
574 char b[BDEVNAME_SIZE];
575
576 spin_lock(&sb_lock);
577restart:
578 list_for_each_entry(sb, &super_blocks, s_list) {
579 sb->s_count++;
580 spin_unlock(&sb_lock);
581 down_read(&sb->s_umount);
582 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
583 printk(KERN_WARNING "Emergency Thaw on %s\n",
584 bdevname(sb->s_bdev, b));
585 up_read(&sb->s_umount);
586 spin_lock(&sb_lock);
587 if (__put_super_and_need_restart(sb))
588 goto restart;
589 }
590 spin_unlock(&sb_lock);
053c525f 591 kfree(work);
c2d75438
ES
592 printk(KERN_WARNING "Emergency Thaw complete\n");
593}
594
595/**
596 * emergency_thaw_all -- forcibly thaw every frozen filesystem
597 *
598 * Used for emergency unfreeze of all filesystems via SysRq
599 */
600void emergency_thaw_all(void)
601{
053c525f
JA
602 struct work_struct *work;
603
604 work = kmalloc(sizeof(*work), GFP_ATOMIC);
605 if (work) {
606 INIT_WORK(work, do_thaw_all);
607 schedule_work(work);
608 }
c2d75438
ES
609}
610
1da177e4 611/**
78a4a50a 612 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
67be2dd1 613 * @mapping: the mapping which wants those buffers written
1da177e4
LT
614 *
615 * Starts I/O against the buffers at mapping->private_list, and waits upon
616 * that I/O.
617 *
67be2dd1
MW
618 * Basically, this is a convenience function for fsync().
619 * @mapping is a file or directory which needs those buffers to be written for
620 * a successful fsync().
1da177e4
LT
621 */
622int sync_mapping_buffers(struct address_space *mapping)
623{
624 struct address_space *buffer_mapping = mapping->assoc_mapping;
625
626 if (buffer_mapping == NULL || list_empty(&mapping->private_list))
627 return 0;
628
629 return fsync_buffers_list(&buffer_mapping->private_lock,
630 &mapping->private_list);
631}
632EXPORT_SYMBOL(sync_mapping_buffers);
633
634/*
635 * Called when we've recently written block `bblock', and it is known that
636 * `bblock' was for a buffer_boundary() buffer. This means that the block at
637 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
638 * dirty, schedule it for IO. So that indirects merge nicely with their data.
639 */
640void write_boundary_block(struct block_device *bdev,
641 sector_t bblock, unsigned blocksize)
642{
643 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
644 if (bh) {
645 if (buffer_dirty(bh))
646 ll_rw_block(WRITE, 1, &bh);
647 put_bh(bh);
648 }
649}
650
651void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
652{
653 struct address_space *mapping = inode->i_mapping;
654 struct address_space *buffer_mapping = bh->b_page->mapping;
655
656 mark_buffer_dirty(bh);
657 if (!mapping->assoc_mapping) {
658 mapping->assoc_mapping = buffer_mapping;
659 } else {
e827f923 660 BUG_ON(mapping->assoc_mapping != buffer_mapping);
1da177e4 661 }
535ee2fb 662 if (!bh->b_assoc_map) {
1da177e4
LT
663 spin_lock(&buffer_mapping->private_lock);
664 list_move_tail(&bh->b_assoc_buffers,
665 &mapping->private_list);
58ff407b 666 bh->b_assoc_map = mapping;
1da177e4
LT
667 spin_unlock(&buffer_mapping->private_lock);
668 }
669}
670EXPORT_SYMBOL(mark_buffer_dirty_inode);
671
787d2214
NP
672/*
673 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
674 * dirty.
675 *
676 * If warn is true, then emit a warning if the page is not uptodate and has
677 * not been truncated.
678 */
a8e7d49a 679static void __set_page_dirty(struct page *page,
787d2214
NP
680 struct address_space *mapping, int warn)
681{
19fd6231 682 spin_lock_irq(&mapping->tree_lock);
787d2214
NP
683 if (page->mapping) { /* Race with truncate? */
684 WARN_ON_ONCE(warn && !PageUptodate(page));
e3a7cca1 685 account_page_dirtied(page, mapping);
787d2214
NP
686 radix_tree_tag_set(&mapping->page_tree,
687 page_index(page), PAGECACHE_TAG_DIRTY);
688 }
19fd6231 689 spin_unlock_irq(&mapping->tree_lock);
787d2214 690 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
787d2214
NP
691}
692
1da177e4
LT
693/*
694 * Add a page to the dirty page list.
695 *
696 * It is a sad fact of life that this function is called from several places
697 * deeply under spinlocking. It may not sleep.
698 *
699 * If the page has buffers, the uptodate buffers are set dirty, to preserve
700 * dirty-state coherency between the page and the buffers. If the page does
701 * not have buffers then when they are later attached they will all be set
702 * dirty.
703 *
704 * The buffers are dirtied before the page is dirtied. There's a small race
705 * window in which a writepage caller may see the page cleanness but not the
706 * buffer dirtiness. That's fine. If this code were to set the page dirty
707 * before the buffers, a concurrent writepage caller could clear the page dirty
708 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
709 * page on the dirty page list.
710 *
711 * We use private_lock to lock against try_to_free_buffers while using the
712 * page's buffer list. Also use this to protect against clean buffers being
713 * added to the page after it was set dirty.
714 *
715 * FIXME: may need to call ->reservepage here as well. That's rather up to the
716 * address_space though.
717 */
718int __set_page_dirty_buffers(struct page *page)
719{
a8e7d49a 720 int newly_dirty;
787d2214 721 struct address_space *mapping = page_mapping(page);
ebf7a227
NP
722
723 if (unlikely(!mapping))
724 return !TestSetPageDirty(page);
1da177e4
LT
725
726 spin_lock(&mapping->private_lock);
727 if (page_has_buffers(page)) {
728 struct buffer_head *head = page_buffers(page);
729 struct buffer_head *bh = head;
730
731 do {
732 set_buffer_dirty(bh);
733 bh = bh->b_this_page;
734 } while (bh != head);
735 }
a8e7d49a 736 newly_dirty = !TestSetPageDirty(page);
1da177e4
LT
737 spin_unlock(&mapping->private_lock);
738
a8e7d49a
LT
739 if (newly_dirty)
740 __set_page_dirty(page, mapping, 1);
741 return newly_dirty;
1da177e4
LT
742}
743EXPORT_SYMBOL(__set_page_dirty_buffers);
744
745/*
746 * Write out and wait upon a list of buffers.
747 *
748 * We have conflicting pressures: we want to make sure that all
749 * initially dirty buffers get waited on, but that any subsequently
750 * dirtied buffers don't. After all, we don't want fsync to last
751 * forever if somebody is actively writing to the file.
752 *
753 * Do this in two main stages: first we copy dirty buffers to a
754 * temporary inode list, queueing the writes as we go. Then we clean
755 * up, waiting for those writes to complete.
756 *
757 * During this second stage, any subsequent updates to the file may end
758 * up refiling the buffer on the original inode's dirty list again, so
759 * there is a chance we will end up with a buffer queued for write but
760 * not yet completed on that list. So, as a final cleanup we go through
761 * the osync code to catch these locked, dirty buffers without requeuing
762 * any newly dirty buffers for write.
763 */
764static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
765{
766 struct buffer_head *bh;
767 struct list_head tmp;
9cf6b720 768 struct address_space *mapping, *prev_mapping = NULL;
1da177e4
LT
769 int err = 0, err2;
770
771 INIT_LIST_HEAD(&tmp);
772
773 spin_lock(lock);
774 while (!list_empty(list)) {
775 bh = BH_ENTRY(list->next);
535ee2fb 776 mapping = bh->b_assoc_map;
58ff407b 777 __remove_assoc_queue(bh);
535ee2fb
JK
778 /* Avoid race with mark_buffer_dirty_inode() which does
779 * a lockless check and we rely on seeing the dirty bit */
780 smp_mb();
1da177e4
LT
781 if (buffer_dirty(bh) || buffer_locked(bh)) {
782 list_add(&bh->b_assoc_buffers, &tmp);
535ee2fb 783 bh->b_assoc_map = mapping;
1da177e4
LT
784 if (buffer_dirty(bh)) {
785 get_bh(bh);
786 spin_unlock(lock);
787 /*
788 * Ensure any pending I/O completes so that
789 * ll_rw_block() actually writes the current
790 * contents - it is a noop if I/O is still in
791 * flight on potentially older contents.
792 */
9cf6b720
JA
793 ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh);
794
795 /*
796 * Kick off IO for the previous mapping. Note
797 * that we will not run the very last mapping,
798 * wait_on_buffer() will do that for us
799 * through sync_buffer().
800 */
801 if (prev_mapping && prev_mapping != mapping)
802 blk_run_address_space(prev_mapping);
803 prev_mapping = mapping;
804
1da177e4
LT
805 brelse(bh);
806 spin_lock(lock);
807 }
808 }
809 }
810
811 while (!list_empty(&tmp)) {
812 bh = BH_ENTRY(tmp.prev);
1da177e4 813 get_bh(bh);
535ee2fb
JK
814 mapping = bh->b_assoc_map;
815 __remove_assoc_queue(bh);
816 /* Avoid race with mark_buffer_dirty_inode() which does
817 * a lockless check and we rely on seeing the dirty bit */
818 smp_mb();
819 if (buffer_dirty(bh)) {
820 list_add(&bh->b_assoc_buffers,
e3892296 821 &mapping->private_list);
535ee2fb
JK
822 bh->b_assoc_map = mapping;
823 }
1da177e4
LT
824 spin_unlock(lock);
825 wait_on_buffer(bh);
826 if (!buffer_uptodate(bh))
827 err = -EIO;
828 brelse(bh);
829 spin_lock(lock);
830 }
831
832 spin_unlock(lock);
833 err2 = osync_buffers_list(lock, list);
834 if (err)
835 return err;
836 else
837 return err2;
838}
839
840/*
841 * Invalidate any and all dirty buffers on a given inode. We are
842 * probably unmounting the fs, but that doesn't mean we have already
843 * done a sync(). Just drop the buffers from the inode list.
844 *
845 * NOTE: we take the inode's blockdev's mapping's private_lock. Which
846 * assumes that all the buffers are against the blockdev. Not true
847 * for reiserfs.
848 */
849void invalidate_inode_buffers(struct inode *inode)
850{
851 if (inode_has_buffers(inode)) {
852 struct address_space *mapping = &inode->i_data;
853 struct list_head *list = &mapping->private_list;
854 struct address_space *buffer_mapping = mapping->assoc_mapping;
855
856 spin_lock(&buffer_mapping->private_lock);
857 while (!list_empty(list))
858 __remove_assoc_queue(BH_ENTRY(list->next));
859 spin_unlock(&buffer_mapping->private_lock);
860 }
861}
52b19ac9 862EXPORT_SYMBOL(invalidate_inode_buffers);
1da177e4
LT
863
864/*
865 * Remove any clean buffers from the inode's buffer list. This is called
866 * when we're trying to free the inode itself. Those buffers can pin it.
867 *
868 * Returns true if all buffers were removed.
869 */
870int remove_inode_buffers(struct inode *inode)
871{
872 int ret = 1;
873
874 if (inode_has_buffers(inode)) {
875 struct address_space *mapping = &inode->i_data;
876 struct list_head *list = &mapping->private_list;
877 struct address_space *buffer_mapping = mapping->assoc_mapping;
878
879 spin_lock(&buffer_mapping->private_lock);
880 while (!list_empty(list)) {
881 struct buffer_head *bh = BH_ENTRY(list->next);
882 if (buffer_dirty(bh)) {
883 ret = 0;
884 break;
885 }
886 __remove_assoc_queue(bh);
887 }
888 spin_unlock(&buffer_mapping->private_lock);
889 }
890 return ret;
891}
892
893/*
894 * Create the appropriate buffers when given a page for the data area and
895 * the size of each buffer. Use the bh->b_this_page linked list to
896 * follow the buffers created. Return NULL if unable to create more
897 * buffers.
898 *
899 * The retry flag is used to differentiate async IO (paging, swapping)
900 * which may not fail from ordinary buffer allocations.
901 */
902struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
903 int retry)
904{
905 struct buffer_head *bh, *head;
906 long offset;
907
908try_again:
909 head = NULL;
910 offset = PAGE_SIZE;
911 while ((offset -= size) >= 0) {
912 bh = alloc_buffer_head(GFP_NOFS);
913 if (!bh)
914 goto no_grow;
915
916 bh->b_bdev = NULL;
917 bh->b_this_page = head;
918 bh->b_blocknr = -1;
919 head = bh;
920
921 bh->b_state = 0;
922 atomic_set(&bh->b_count, 0);
fc5cd582 923 bh->b_private = NULL;
1da177e4
LT
924 bh->b_size = size;
925
926 /* Link the buffer to its page */
927 set_bh_page(bh, page, offset);
928
01ffe339 929 init_buffer(bh, NULL, NULL);
1da177e4
LT
930 }
931 return head;
932/*
933 * In case anything failed, we just free everything we got.
934 */
935no_grow:
936 if (head) {
937 do {
938 bh = head;
939 head = head->b_this_page;
940 free_buffer_head(bh);
941 } while (head);
942 }
943
944 /*
945 * Return failure for non-async IO requests. Async IO requests
946 * are not allowed to fail, so we have to wait until buffer heads
947 * become available. But we don't want tasks sleeping with
948 * partially complete buffers, so all were released above.
949 */
950 if (!retry)
951 return NULL;
952
953 /* We're _really_ low on memory. Now we just
954 * wait for old buffer heads to become free due to
955 * finishing IO. Since this is an async request and
956 * the reserve list is empty, we're sure there are
957 * async buffer heads in use.
958 */
959 free_more_memory();
960 goto try_again;
961}
962EXPORT_SYMBOL_GPL(alloc_page_buffers);
963
964static inline void
965link_dev_buffers(struct page *page, struct buffer_head *head)
966{
967 struct buffer_head *bh, *tail;
968
969 bh = head;
970 do {
971 tail = bh;
972 bh = bh->b_this_page;
973 } while (bh);
974 tail->b_this_page = head;
975 attach_page_buffers(page, head);
976}
977
978/*
979 * Initialise the state of a blockdev page's buffers.
980 */
981static void
982init_page_buffers(struct page *page, struct block_device *bdev,
983 sector_t block, int size)
984{
985 struct buffer_head *head = page_buffers(page);
986 struct buffer_head *bh = head;
987 int uptodate = PageUptodate(page);
988
989 do {
990 if (!buffer_mapped(bh)) {
991 init_buffer(bh, NULL, NULL);
992 bh->b_bdev = bdev;
993 bh->b_blocknr = block;
994 if (uptodate)
995 set_buffer_uptodate(bh);
996 set_buffer_mapped(bh);
997 }
998 block++;
999 bh = bh->b_this_page;
1000 } while (bh != head);
1001}
1002
1003/*
1004 * Create the page-cache page that contains the requested block.
1005 *
1006 * This is used purely for blockdev mappings.
1007 */
1008static struct page *
1009grow_dev_page(struct block_device *bdev, sector_t block,
1010 pgoff_t index, int size)
1011{
1012 struct inode *inode = bdev->bd_inode;
1013 struct page *page;
1014 struct buffer_head *bh;
1015
ea125892 1016 page = find_or_create_page(inode->i_mapping, index,
769848c0 1017 (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
1da177e4
LT
1018 if (!page)
1019 return NULL;
1020
e827f923 1021 BUG_ON(!PageLocked(page));
1da177e4
LT
1022
1023 if (page_has_buffers(page)) {
1024 bh = page_buffers(page);
1025 if (bh->b_size == size) {
1026 init_page_buffers(page, bdev, block, size);
1027 return page;
1028 }
1029 if (!try_to_free_buffers(page))
1030 goto failed;
1031 }
1032
1033 /*
1034 * Allocate some buffers for this page
1035 */
1036 bh = alloc_page_buffers(page, size, 0);
1037 if (!bh)
1038 goto failed;
1039
1040 /*
1041 * Link the page to the buffers and initialise them. Take the
1042 * lock to be atomic wrt __find_get_block(), which does not
1043 * run under the page lock.
1044 */
1045 spin_lock(&inode->i_mapping->private_lock);
1046 link_dev_buffers(page, bh);
1047 init_page_buffers(page, bdev, block, size);
1048 spin_unlock(&inode->i_mapping->private_lock);
1049 return page;
1050
1051failed:
1052 BUG();
1053 unlock_page(page);
1054 page_cache_release(page);
1055 return NULL;
1056}
1057
1058/*
1059 * Create buffers for the specified block device block's page. If
1060 * that page was dirty, the buffers are set dirty also.
1da177e4 1061 */
858119e1 1062static int
1da177e4
LT
1063grow_buffers(struct block_device *bdev, sector_t block, int size)
1064{
1065 struct page *page;
1066 pgoff_t index;
1067 int sizebits;
1068
1069 sizebits = -1;
1070 do {
1071 sizebits++;
1072 } while ((size << sizebits) < PAGE_SIZE);
1073
1074 index = block >> sizebits;
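	/*
	 * Worked example (assuming PAGE_SIZE == 4096): for size == 1024 the
	 * loop above yields sizebits == 2, so each page holds four buffers
	 * and index == block >> 2.
	 */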
1da177e4 1075
e5657933
AM
1076 /*
1077 * Check for a block which wants to lie outside our maximum possible
1078 * pagecache index. (this comparison is done using sector_t types).
1079 */
1080 if (unlikely(index != block >> sizebits)) {
1081 char b[BDEVNAME_SIZE];
1082
1083 printk(KERN_ERR "%s: requested out-of-range block %llu for "
1084 "device %s\n",
8e24eea7 1085 __func__, (unsigned long long)block,
e5657933
AM
1086 bdevname(bdev, b));
1087 return -EIO;
1088 }
1089 block = index << sizebits;
1da177e4
LT
1090 /* Create a page with the proper size buffers.. */
1091 page = grow_dev_page(bdev, block, index, size);
1092 if (!page)
1093 return 0;
1094 unlock_page(page);
1095 page_cache_release(page);
1096 return 1;
1097}
1098
75c96f85 1099static struct buffer_head *
1da177e4
LT
1100__getblk_slow(struct block_device *bdev, sector_t block, int size)
1101{
1102 /* Size must be multiple of hard sectorsize */
e1defc4f 1103 if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1da177e4
LT
1104 (size < 512 || size > PAGE_SIZE))) {
1105 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1106 size);
e1defc4f
MP
1107 printk(KERN_ERR "logical block size: %d\n",
1108 bdev_logical_block_size(bdev));
1da177e4
LT
1109
1110 dump_stack();
1111 return NULL;
1112 }
1113
1114 for (;;) {
1115 struct buffer_head * bh;
e5657933 1116 int ret;
1da177e4
LT
1117
1118 bh = __find_get_block(bdev, block, size);
1119 if (bh)
1120 return bh;
1121
e5657933
AM
1122 ret = grow_buffers(bdev, block, size);
1123 if (ret < 0)
1124 return NULL;
1125 if (ret == 0)
1da177e4
LT
1126 free_more_memory();
1127 }
1128}
1129
1130/*
1131 * The relationship between dirty buffers and dirty pages:
1132 *
1133 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1134 * the page is tagged dirty in its radix tree.
1135 *
1136 * At all times, the dirtiness of the buffers represents the dirtiness of
1137 * subsections of the page. If the page has buffers, the page dirty bit is
1138 * merely a hint about the true dirty state.
1139 *
1140 * When a page is set dirty in its entirety, all its buffers are marked dirty
1141 * (if the page has buffers).
1142 *
1143 * When a buffer is marked dirty, its page is dirtied, but the page's other
1144 * buffers are not.
1145 *
1146 * Also. When blockdev buffers are explicitly read with bread(), they
1147 * individually become uptodate. But their backing page remains not
1148 * uptodate - even if all of its buffers are uptodate. A subsequent
1149 * block_read_full_page() against that page will discover all the uptodate
1150 * buffers, will set the page uptodate and will perform no I/O.
1151 */
1152
1153/**
1154 * mark_buffer_dirty - mark a buffer_head as needing writeout
67be2dd1 1155 * @bh: the buffer_head to mark dirty
1da177e4
LT
1156 *
1157 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1158 * backing page dirty, then tag the page as dirty in its address_space's radix
1159 * tree and then attach the address_space's inode to its superblock's dirty
1160 * inode list.
1161 *
1162 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1163 * mapping->tree_lock and the global inode_lock.
1164 */
fc9b52cd 1165void mark_buffer_dirty(struct buffer_head *bh)
1da177e4 1166{
787d2214 1167 WARN_ON_ONCE(!buffer_uptodate(bh));
1be62dc1
LT
1168
1169 /*
1170 * Very *carefully* optimize the it-is-already-dirty case.
1171 *
1172 * Don't let the final "is it dirty" escape to before we
1173 * perhaps modified the buffer.
1174 */
1175 if (buffer_dirty(bh)) {
1176 smp_mb();
1177 if (buffer_dirty(bh))
1178 return;
1179 }
1180
a8e7d49a
LT
1181 if (!test_set_buffer_dirty(bh)) {
1182 struct page *page = bh->b_page;
8e9d78ed
LT
1183 if (!TestSetPageDirty(page)) {
1184 struct address_space *mapping = page_mapping(page);
1185 if (mapping)
1186 __set_page_dirty(page, mapping, 0);
1187 }
a8e7d49a 1188 }
1da177e4 1189}
1fe72eaa 1190EXPORT_SYMBOL(mark_buffer_dirty);
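
/*
 * An illustrative sketch of the common read-modify-write pattern built on
 * the interface above.  example_update() and its offset/byte parameters are
 * hypothetical; sb_bread(), mark_buffer_dirty(), sync_dirty_buffer() and
 * brelse() are the real interfaces.
 */
static int example_update(struct super_block *sb, sector_t block,
			  unsigned offset, char byte)
{
	struct buffer_head *bh = sb_bread(sb, block);

	if (!bh)
		return -EIO;
	bh->b_data[offset] = byte;	/* modify the block in memory */
	mark_buffer_dirty(bh);		/* dirty the buffer, page and inode */
	sync_dirty_buffer(bh);		/* optionally write it out synchronously */
	brelse(bh);
	return 0;
}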
1da177e4
LT
1191
1192/*
1193 * Decrement a buffer_head's reference count. If all buffers against a page
1194 * have zero reference count, are clean and unlocked, and if the page is clean
1195 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1196 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1197 * a page but it ends up not being freed, and buffers may later be reattached).
1198 */
1199void __brelse(struct buffer_head * buf)
1200{
1201 if (atomic_read(&buf->b_count)) {
1202 put_bh(buf);
1203 return;
1204 }
5c752ad9 1205 WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1da177e4 1206}
1fe72eaa 1207EXPORT_SYMBOL(__brelse);
1da177e4
LT
1208
1209/*
1210 * bforget() is like brelse(), except it discards any
1211 * potentially dirty data.
1212 */
1213void __bforget(struct buffer_head *bh)
1214{
1215 clear_buffer_dirty(bh);
535ee2fb 1216 if (bh->b_assoc_map) {
1da177e4
LT
1217 struct address_space *buffer_mapping = bh->b_page->mapping;
1218
1219 spin_lock(&buffer_mapping->private_lock);
1220 list_del_init(&bh->b_assoc_buffers);
58ff407b 1221 bh->b_assoc_map = NULL;
1da177e4
LT
1222 spin_unlock(&buffer_mapping->private_lock);
1223 }
1224 __brelse(bh);
1225}
1fe72eaa 1226EXPORT_SYMBOL(__bforget);
1da177e4
LT
1227
1228static struct buffer_head *__bread_slow(struct buffer_head *bh)
1229{
1230 lock_buffer(bh);
1231 if (buffer_uptodate(bh)) {
1232 unlock_buffer(bh);
1233 return bh;
1234 } else {
1235 get_bh(bh);
1236 bh->b_end_io = end_buffer_read_sync;
1237 submit_bh(READ, bh);
1238 wait_on_buffer(bh);
1239 if (buffer_uptodate(bh))
1240 return bh;
1241 }
1242 brelse(bh);
1243 return NULL;
1244}
1245
1246/*
1247 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1248 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1249 * refcount elevated by one when they're in an LRU. A buffer can only appear
1250 * once in a particular CPU's LRU. A single buffer can be present in multiple
1251 * CPU's LRUs at the same time.
1252 *
1253 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1254 * sb_find_get_block().
1255 *
1256 * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1257 * a local interrupt disable for that.
1258 */
1259
1260#define BH_LRU_SIZE 8
1261
1262struct bh_lru {
1263 struct buffer_head *bhs[BH_LRU_SIZE];
1264};
1265
1266static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1267
1268#ifdef CONFIG_SMP
1269#define bh_lru_lock() local_irq_disable()
1270#define bh_lru_unlock() local_irq_enable()
1271#else
1272#define bh_lru_lock() preempt_disable()
1273#define bh_lru_unlock() preempt_enable()
1274#endif
1275
1276static inline void check_irqs_on(void)
1277{
1278#ifdef irqs_disabled
1279 BUG_ON(irqs_disabled());
1280#endif
1281}
1282
1283/*
1284 * The LRU management algorithm is dopey-but-simple. Sorry.
1285 */
1286static void bh_lru_install(struct buffer_head *bh)
1287{
1288 struct buffer_head *evictee = NULL;
1289 struct bh_lru *lru;
1290
1291 check_irqs_on();
1292 bh_lru_lock();
1293 lru = &__get_cpu_var(bh_lrus);
1294 if (lru->bhs[0] != bh) {
1295 struct buffer_head *bhs[BH_LRU_SIZE];
1296 int in;
1297 int out = 0;
1298
1299 get_bh(bh);
1300 bhs[out++] = bh;
1301 for (in = 0; in < BH_LRU_SIZE; in++) {
1302 struct buffer_head *bh2 = lru->bhs[in];
1303
1304 if (bh2 == bh) {
1305 __brelse(bh2);
1306 } else {
1307 if (out >= BH_LRU_SIZE) {
1308 BUG_ON(evictee != NULL);
1309 evictee = bh2;
1310 } else {
1311 bhs[out++] = bh2;
1312 }
1313 }
1314 }
1315 while (out < BH_LRU_SIZE)
1316 bhs[out++] = NULL;
1317 memcpy(lru->bhs, bhs, sizeof(bhs));
1318 }
1319 bh_lru_unlock();
1320
1321 if (evictee)
1322 __brelse(evictee);
1323}
1324
1325/*
1326 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1327 */
858119e1 1328static struct buffer_head *
3991d3bd 1329lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1da177e4
LT
1330{
1331 struct buffer_head *ret = NULL;
1332 struct bh_lru *lru;
3991d3bd 1333 unsigned int i;
1da177e4
LT
1334
1335 check_irqs_on();
1336 bh_lru_lock();
1337 lru = &__get_cpu_var(bh_lrus);
1338 for (i = 0; i < BH_LRU_SIZE; i++) {
1339 struct buffer_head *bh = lru->bhs[i];
1340
1341 if (bh && bh->b_bdev == bdev &&
1342 bh->b_blocknr == block && bh->b_size == size) {
1343 if (i) {
1344 while (i) {
1345 lru->bhs[i] = lru->bhs[i - 1];
1346 i--;
1347 }
1348 lru->bhs[0] = bh;
1349 }
1350 get_bh(bh);
1351 ret = bh;
1352 break;
1353 }
1354 }
1355 bh_lru_unlock();
1356 return ret;
1357}
1358
1359/*
1360 * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1361 * it in the LRU and mark it as accessed. If it is not present then return
1362 * NULL
1363 */
1364struct buffer_head *
3991d3bd 1365__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1da177e4
LT
1366{
1367 struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1368
1369 if (bh == NULL) {
385fd4c5 1370 bh = __find_get_block_slow(bdev, block);
1da177e4
LT
1371 if (bh)
1372 bh_lru_install(bh);
1373 }
1374 if (bh)
1375 touch_buffer(bh);
1376 return bh;
1377}
1378EXPORT_SYMBOL(__find_get_block);
1379
1380/*
1381 * __getblk will locate (and, if necessary, create) the buffer_head
1382 * which corresponds to the passed block_device, block and size. The
1383 * returned buffer has its reference count incremented.
1384 *
1385 * __getblk() cannot fail - it just keeps trying. If you pass it an
1386 * illegal block number, __getblk() will happily return a buffer_head
1387 * which represents the non-existent block. Very weird.
1388 *
1389 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1390 * attempt is failing. FIXME, perhaps?
1391 */
1392struct buffer_head *
3991d3bd 1393__getblk(struct block_device *bdev, sector_t block, unsigned size)
1da177e4
LT
1394{
1395 struct buffer_head *bh = __find_get_block(bdev, block, size);
1396
1397 might_sleep();
1398 if (bh == NULL)
1399 bh = __getblk_slow(bdev, block, size);
1400 return bh;
1401}
1402EXPORT_SYMBOL(__getblk);
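
/*
 * An illustrative sketch of initialising a block that will be completely
 * overwritten, so no read from disk is needed.  example_zero_block() is
 * hypothetical; __getblk(), lock_buffer(), set_buffer_uptodate(),
 * mark_buffer_dirty(), unlock_buffer() and brelse() are the real interfaces.
 */
static void example_zero_block(struct block_device *bdev, sector_t block,
			       unsigned size)
{
	struct buffer_head *bh = __getblk(bdev, block, size);

	lock_buffer(bh);
	memset(bh->b_data, 0, size);
	set_buffer_uptodate(bh);	/* contents are now valid in memory */
	mark_buffer_dirty(bh);		/* schedule the block for writeback */
	unlock_buffer(bh);
	brelse(bh);
}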
1403
1404/*
1405 * Do async read-ahead on a buffer..
1406 */
3991d3bd 1407void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1da177e4
LT
1408{
1409 struct buffer_head *bh = __getblk(bdev, block, size);
a3e713b5
AM
1410 if (likely(bh)) {
1411 ll_rw_block(READA, 1, &bh);
1412 brelse(bh);
1413 }
1da177e4
LT
1414}
1415EXPORT_SYMBOL(__breadahead);
1416
1417/**
1418 * __bread() - reads a specified block and returns the bh
67be2dd1 1419 * @bdev: the block_device to read from
1da177e4
LT
1420 * @block: number of block
1421 * @size: size (in bytes) to read
1422 *
1423 * Reads a specified block, and returns buffer head that contains it.
1424 * It returns NULL if the block was unreadable.
1425 */
1426struct buffer_head *
3991d3bd 1427__bread(struct block_device *bdev, sector_t block, unsigned size)
1da177e4
LT
1428{
1429 struct buffer_head *bh = __getblk(bdev, block, size);
1430
a3e713b5 1431 if (likely(bh) && !buffer_uptodate(bh))
1da177e4
LT
1432 bh = __bread_slow(bh);
1433 return bh;
1434}
1435EXPORT_SYMBOL(__bread);
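
/*
 * An illustrative sketch of reading a block through the cached lookup path
 * above.  example_read_block() and its buf parameter are hypothetical;
 * __bread() and brelse() are the real interfaces.
 */
static int example_read_block(struct block_device *bdev, sector_t block,
			      unsigned size, void *buf)
{
	struct buffer_head *bh = __bread(bdev, block, size);

	if (!bh)
		return -EIO;		/* the block was unreadable */
	memcpy(buf, bh->b_data, size);	/* bh->b_data is uptodate here */
	brelse(bh);			/* drop the reference taken by __bread() */
	return 0;
}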
1436
1437/*
1438 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1439 * This doesn't race because it runs in each cpu either in irq
1440 * or with preempt disabled.
1441 */
1442static void invalidate_bh_lru(void *arg)
1443{
1444 struct bh_lru *b = &get_cpu_var(bh_lrus);
1445 int i;
1446
1447 for (i = 0; i < BH_LRU_SIZE; i++) {
1448 brelse(b->bhs[i]);
1449 b->bhs[i] = NULL;
1450 }
1451 put_cpu_var(bh_lrus);
1452}
1453
f9a14399 1454void invalidate_bh_lrus(void)
1da177e4 1455{
15c8b6c1 1456 on_each_cpu(invalidate_bh_lru, NULL, 1);
1da177e4 1457}
9db5579b 1458EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1da177e4
LT
1459
1460void set_bh_page(struct buffer_head *bh,
1461 struct page *page, unsigned long offset)
1462{
1463 bh->b_page = page;
e827f923 1464 BUG_ON(offset >= PAGE_SIZE);
1da177e4
LT
1465 if (PageHighMem(page))
1466 /*
1467 * This catches illegal uses and preserves the offset:
1468 */
1469 bh->b_data = (char *)(0 + offset);
1470 else
1471 bh->b_data = page_address(page) + offset;
1472}
1473EXPORT_SYMBOL(set_bh_page);
1474
1475/*
1476 * Called when truncating a buffer on a page completely.
1477 */
858119e1 1478static void discard_buffer(struct buffer_head * bh)
1da177e4
LT
1479{
1480 lock_buffer(bh);
1481 clear_buffer_dirty(bh);
1482 bh->b_bdev = NULL;
1483 clear_buffer_mapped(bh);
1484 clear_buffer_req(bh);
1485 clear_buffer_new(bh);
1486 clear_buffer_delay(bh);
33a266dd 1487 clear_buffer_unwritten(bh);
1da177e4
LT
1488 unlock_buffer(bh);
1489}
1490
1da177e4
LT
1491/**
1492 * block_invalidatepage - invalidate part or all of a buffer-backed page
1493 *
1494 * @page: the page which is affected
1495 * @offset: the index of the truncation point
1496 *
1497 * block_invalidatepage() is called when all or part of the page has become
1498 * invalidated by a truncate operation.
1499 *
1500 * block_invalidatepage() does not have to release all buffers, but it must
1501 * ensure that no dirty buffer is left outside @offset and that no I/O
1502 * is underway against any of the blocks which are outside the truncation
1503 * point. Because the caller is about to free (and possibly reuse) those
1504 * blocks on-disk.
1505 */
2ff28e22 1506void block_invalidatepage(struct page *page, unsigned long offset)
1da177e4
LT
1507{
1508 struct buffer_head *head, *bh, *next;
1509 unsigned int curr_off = 0;
1da177e4
LT
1510
1511 BUG_ON(!PageLocked(page));
1512 if (!page_has_buffers(page))
1513 goto out;
1514
1515 head = page_buffers(page);
1516 bh = head;
1517 do {
1518 unsigned int next_off = curr_off + bh->b_size;
1519 next = bh->b_this_page;
1520
1521 /*
1522 * is this block fully invalidated?
1523 */
1524 if (offset <= curr_off)
1525 discard_buffer(bh);
1526 curr_off = next_off;
1527 bh = next;
1528 } while (bh != head);
1529
1530 /*
1531 * We release buffers only if the entire page is being invalidated.
1532 * The get_block cached value has been unconditionally invalidated,
1533 * so real IO is not possible anymore.
1534 */
1535 if (offset == 0)
2ff28e22 1536 try_to_release_page(page, 0);
1da177e4 1537out:
2ff28e22 1538 return;
1da177e4
LT
1539}
1540EXPORT_SYMBOL(block_invalidatepage);
1541
1542/*
1543 * We attach and possibly dirty the buffers atomically wrt
1544 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
1545 * is already excluded via the page lock.
1546 */
1547void create_empty_buffers(struct page *page,
1548 unsigned long blocksize, unsigned long b_state)
1549{
1550 struct buffer_head *bh, *head, *tail;
1551
1552 head = alloc_page_buffers(page, blocksize, 1);
1553 bh = head;
1554 do {
1555 bh->b_state |= b_state;
1556 tail = bh;
1557 bh = bh->b_this_page;
1558 } while (bh);
1559 tail->b_this_page = head;
1560
1561 spin_lock(&page->mapping->private_lock);
1562 if (PageUptodate(page) || PageDirty(page)) {
1563 bh = head;
1564 do {
1565 if (PageDirty(page))
1566 set_buffer_dirty(bh);
1567 if (PageUptodate(page))
1568 set_buffer_uptodate(bh);
1569 bh = bh->b_this_page;
1570 } while (bh != head);
1571 }
1572 attach_page_buffers(page, head);
1573 spin_unlock(&page->mapping->private_lock);
1574}
1575EXPORT_SYMBOL(create_empty_buffers);
1576
1577/*
1578 * We are taking a block for data and we don't want any output from any
1579 * buffer-cache aliases starting from the return of that function and
1580 * until the moment when something will explicitly mark the buffer
1581 * dirty (hopefully that will not happen until we will free that block ;-)
1582 * We don't even need to mark it not-uptodate - nobody can expect
1583 * anything from a newly allocated buffer anyway. We used to use
1584 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1585 * don't want to mark the alias unmapped, for example - it would confuse
1586 * anyone who might pick it with bread() afterwards...
1587 *
1588 * Also.. Note that bforget() doesn't lock the buffer. So there can
1589 * be writeout I/O going on against recently-freed buffers. We don't
1590 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1591 * only if we really need to. That happens here.
1592 */
1593void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1594{
1595 struct buffer_head *old_bh;
1596
1597 might_sleep();
1598
385fd4c5 1599 old_bh = __find_get_block_slow(bdev, block);
1da177e4
LT
1600 if (old_bh) {
1601 clear_buffer_dirty(old_bh);
1602 wait_on_buffer(old_bh);
1603 clear_buffer_req(old_bh);
1604 __brelse(old_bh);
1605 }
1606}
1607EXPORT_SYMBOL(unmap_underlying_metadata);
1608
1609/*
1610 * NOTE! All mapped/uptodate combinations are valid:
1611 *
1612 * Mapped Uptodate Meaning
1613 *
1614 * No No "unknown" - must do get_block()
1615 * No Yes "hole" - zero-filled
1616 * Yes No "allocated" - allocated on disk, not read in
1617 * Yes Yes "valid" - allocated and up-to-date in memory.
1618 *
1619 * "Dirty" is valid only with the last case (mapped+uptodate).
1620 */
1621
1622/*
1623 * While block_write_full_page is writing back the dirty buffers under
1624 * the page lock, whoever dirtied the buffers may decide to clean them
1625 * again at any time. We handle that by only looking at the buffer
1626 * state inside lock_buffer().
1627 *
1628 * If block_write_full_page() is called for regular writeback
1629 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1630 * locked buffer. This only can happen if someone has written the buffer
1631 * directly, with submit_bh(). At the address_space level PageWriteback
1632 * prevents this contention from occurring.
6e34eedd
TT
1633 *
1634 * If block_write_full_page() is called with wbc->sync_mode ==
1635 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this
1636 * causes the writes to be flagged as synchronous writes, but the
1637 * block device queue will NOT be unplugged, since usually many pages
1638 * will be pushed out before the higher-level caller actually
1639 * waits for the writes to be completed. The various wait functions,
1640 * such as wait_on_writeback_range() will ultimately call sync_page()
1641 * which will ultimately call blk_run_backing_dev(), which will end up
1642 * unplugging the device queue.
1da177e4
LT
1643 */
1644static int __block_write_full_page(struct inode *inode, struct page *page,
35c80d5f
CM
1645 get_block_t *get_block, struct writeback_control *wbc,
1646 bh_end_io_t *handler)
1da177e4
LT
1647{
1648 int err;
1649 sector_t block;
1650 sector_t last_block;
f0fbd5fc 1651 struct buffer_head *bh, *head;
b0cf2321 1652 const unsigned blocksize = 1 << inode->i_blkbits;
1da177e4 1653 int nr_underway = 0;
6e34eedd
TT
1654 int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
1655 WRITE_SYNC_PLUG : WRITE);
1da177e4
LT
1656
1657 BUG_ON(!PageLocked(page));
1658
1659 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1660
1661 if (!page_has_buffers(page)) {
b0cf2321 1662 create_empty_buffers(page, blocksize,
1da177e4
LT
1663 (1 << BH_Dirty)|(1 << BH_Uptodate));
1664 }
1665
1666 /*
1667 * Be very careful. We have no exclusion from __set_page_dirty_buffers
1668 * here, and the (potentially unmapped) buffers may become dirty at
1669 * any time. If a buffer becomes dirty here after we've inspected it
1670 * then we just miss that fact, and the page stays dirty.
1671 *
1672 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1673 * handle that here by just cleaning them.
1674 */
1675
54b21a79 1676 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1da177e4
LT
1677 head = page_buffers(page);
1678 bh = head;
1679
1680 /*
1681 * Get all the dirty buffers mapped to disk addresses and
1682 * handle any aliases from the underlying blockdev's mapping.
1683 */
1684 do {
1685 if (block > last_block) {
1686 /*
1687 * mapped buffers outside i_size will occur, because
1688 * this page can be outside i_size when there is a
1689 * truncate in progress.
1690 */
1691 /*
1692 * The buffer was zeroed by block_write_full_page()
1693 */
1694 clear_buffer_dirty(bh);
1695 set_buffer_uptodate(bh);
29a814d2
AT
1696 } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1697 buffer_dirty(bh)) {
b0cf2321 1698 WARN_ON(bh->b_size != blocksize);
1da177e4
LT
1699 err = get_block(inode, block, bh, 1);
1700 if (err)
1701 goto recover;
29a814d2 1702 clear_buffer_delay(bh);
1da177e4
LT
1703 if (buffer_new(bh)) {
1704 /* blockdev mappings never come here */
1705 clear_buffer_new(bh);
1706 unmap_underlying_metadata(bh->b_bdev,
1707 bh->b_blocknr);
1708 }
1709 }
1710 bh = bh->b_this_page;
1711 block++;
1712 } while (bh != head);
1713
1714 do {
1da177e4
LT
1715 if (!buffer_mapped(bh))
1716 continue;
1717 /*
1718 * If it's a fully non-blocking write attempt and we cannot
1719 * lock the buffer then redirty the page. Note that this can
5b0830cb
JA
1720 * potentially cause a busy-wait loop from writeback threads
1721 * and kswapd activity, but those code paths have their own
1722 * higher-level throttling.
1da177e4
LT
1723 */
1724 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1725 lock_buffer(bh);
ca5de404 1726 } else if (!trylock_buffer(bh)) {
1da177e4
LT
1727 redirty_page_for_writepage(wbc, page);
1728 continue;
1729 }
1730 if (test_clear_buffer_dirty(bh)) {
35c80d5f 1731 mark_buffer_async_write_endio(bh, handler);
1da177e4
LT
1732 } else {
1733 unlock_buffer(bh);
1734 }
1735 } while ((bh = bh->b_this_page) != head);
1736
1737 /*
1738 * The page and its buffers are protected by PageWriteback(), so we can
1739 * drop the bh refcounts early.
1740 */
1741 BUG_ON(PageWriteback(page));
1742 set_page_writeback(page);
1da177e4
LT
1743
1744 do {
1745 struct buffer_head *next = bh->b_this_page;
1746 if (buffer_async_write(bh)) {
a64c8610 1747 submit_bh(write_op, bh);
1da177e4
LT
1748 nr_underway++;
1749 }
1da177e4
LT
1750 bh = next;
1751 } while (bh != head);
05937baa 1752 unlock_page(page);
1da177e4
LT
1753
1754 err = 0;
1755done:
1756 if (nr_underway == 0) {
1757 /*
1758 * The page was marked dirty, but the buffers were
1759 * clean. Someone wrote them back by hand with
1760 * ll_rw_block/submit_bh. A rare case.
1761 */
1da177e4 1762 end_page_writeback(page);
3d67f2d7 1763
1da177e4
LT
1764 /*
1765 * The page and buffer_heads can be released at any time from
1766 * here on.
1767 */
1da177e4
LT
1768 }
1769 return err;
1770
1771recover:
1772 /*
1773 * ENOSPC, or some other error. We may already have added some
1774 * blocks to the file, so we need to write these out to avoid
1775 * exposing stale data.
1776 * The page is currently locked and not marked for writeback
1777 */
1778 bh = head;
1779 /* Recovery: lock and submit the mapped buffers */
1780 do {
29a814d2
AT
1781 if (buffer_mapped(bh) && buffer_dirty(bh) &&
1782 !buffer_delay(bh)) {
1da177e4 1783 lock_buffer(bh);
35c80d5f 1784 mark_buffer_async_write_endio(bh, handler);
1da177e4
LT
1785 } else {
1786 /*
1787 * The buffer may have been set dirty during
1788 * attachment to a dirty page.
1789 */
1790 clear_buffer_dirty(bh);
1791 }
1792 } while ((bh = bh->b_this_page) != head);
1793 SetPageError(page);
1794 BUG_ON(PageWriteback(page));
7e4c3690 1795 mapping_set_error(page->mapping, err);
1da177e4 1796 set_page_writeback(page);
1da177e4
LT
1797 do {
1798 struct buffer_head *next = bh->b_this_page;
1799 if (buffer_async_write(bh)) {
1800 clear_buffer_dirty(bh);
a64c8610 1801 submit_bh(write_op, bh);
1da177e4
LT
1802 nr_underway++;
1803 }
1da177e4
LT
1804 bh = next;
1805 } while (bh != head);
ffda9d30 1806 unlock_page(page);
1da177e4
LT
1807 goto done;
1808}
1809
afddba49
NP
1810/*
1811 * If a page has any new buffers, zero them out here, and mark them uptodate
1812 * and dirty so they'll be written out (in order to prevent uninitialised
1813 * block data from leaking). And clear the new bit.
1814 */
1815void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1816{
1817 unsigned int block_start, block_end;
1818 struct buffer_head *head, *bh;
1819
1820 BUG_ON(!PageLocked(page));
1821 if (!page_has_buffers(page))
1822 return;
1823
1824 bh = head = page_buffers(page);
1825 block_start = 0;
1826 do {
1827 block_end = block_start + bh->b_size;
1828
1829 if (buffer_new(bh)) {
1830 if (block_end > from && block_start < to) {
1831 if (!PageUptodate(page)) {
1832 unsigned start, size;
1833
1834 start = max(from, block_start);
1835 size = min(to, block_end) - start;
1836
eebd2aa3 1837 zero_user(page, start, size);
afddba49
NP
1838 set_buffer_uptodate(bh);
1839 }
1840
1841 clear_buffer_new(bh);
1842 mark_buffer_dirty(bh);
1843 }
1844 }
1845
1846 block_start = block_end;
1847 bh = bh->b_this_page;
1848 } while (bh != head);
1849}
1850EXPORT_SYMBOL(page_zero_new_buffers);
1851
1da177e4
LT
1852static int __block_prepare_write(struct inode *inode, struct page *page,
1853 unsigned from, unsigned to, get_block_t *get_block)
1854{
1855 unsigned block_start, block_end;
1856 sector_t block;
1857 int err = 0;
1858 unsigned blocksize, bbits;
1859 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1860
1861 BUG_ON(!PageLocked(page));
1862 BUG_ON(from > PAGE_CACHE_SIZE);
1863 BUG_ON(to > PAGE_CACHE_SIZE);
1864 BUG_ON(from > to);
1865
1866 blocksize = 1 << inode->i_blkbits;
1867 if (!page_has_buffers(page))
1868 create_empty_buffers(page, blocksize, 0);
1869 head = page_buffers(page);
1870
1871 bbits = inode->i_blkbits;
1872 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1873
1874 for(bh = head, block_start = 0; bh != head || !block_start;
1875 block++, block_start=block_end, bh = bh->b_this_page) {
1876 block_end = block_start + blocksize;
1877 if (block_end <= from || block_start >= to) {
1878 if (PageUptodate(page)) {
1879 if (!buffer_uptodate(bh))
1880 set_buffer_uptodate(bh);
1881 }
1882 continue;
1883 }
1884 if (buffer_new(bh))
1885 clear_buffer_new(bh);
1886 if (!buffer_mapped(bh)) {
b0cf2321 1887 WARN_ON(bh->b_size != blocksize);
1da177e4
LT
1888 err = get_block(inode, block, bh, 1);
1889 if (err)
f3ddbdc6 1890 break;
1da177e4 1891 if (buffer_new(bh)) {
1da177e4
LT
1892 unmap_underlying_metadata(bh->b_bdev,
1893 bh->b_blocknr);
1894 if (PageUptodate(page)) {
637aff46 1895 clear_buffer_new(bh);
1da177e4 1896 set_buffer_uptodate(bh);
637aff46 1897 mark_buffer_dirty(bh);
1da177e4
LT
1898 continue;
1899 }
eebd2aa3
CL
1900 if (block_end > to || block_start < from)
1901 zero_user_segments(page,
1902 to, block_end,
1903 block_start, from);
1da177e4
LT
1904 continue;
1905 }
1906 }
1907 if (PageUptodate(page)) {
1908 if (!buffer_uptodate(bh))
1909 set_buffer_uptodate(bh);
1910 continue;
1911 }
1912 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
33a266dd 1913 !buffer_unwritten(bh) &&
1da177e4
LT
1914 (block_start < from || block_end > to)) {
1915 ll_rw_block(READ, 1, &bh);
1916 *wait_bh++=bh;
1917 }
1918 }
1919 /*
1920 * If we issued read requests - let them complete.
1921 */
1922 while(wait_bh > wait) {
f80e69e7
JA
1923 int ret;
1924
1925 ret = wait_on_buffer_async(*--wait_bh, current->io_wait);
1be3d0ec
JA
1926 if (ret && !err) {
1927 WARN(1, "%s: ret=%d\n", __func__, ret);
f80e69e7 1928 err = ret;
1be3d0ec 1929 }
1da177e4 1930 if (!buffer_uptodate(*wait_bh))
f3ddbdc6 1931 err = -EIO;
1da177e4 1932 }
afddba49
NP
1933 if (unlikely(err))
1934 page_zero_new_buffers(page, from, to);
1da177e4
LT
1935 return err;
1936}
1937
1938static int __block_commit_write(struct inode *inode, struct page *page,
1939 unsigned from, unsigned to)
1940{
1941 unsigned block_start, block_end;
1942 int partial = 0;
1943 unsigned blocksize;
1944 struct buffer_head *bh, *head;
1945
1946 blocksize = 1 << inode->i_blkbits;
1947
1948 for(bh = head = page_buffers(page), block_start = 0;
1949 bh != head || !block_start;
1950 block_start=block_end, bh = bh->b_this_page) {
1951 block_end = block_start + blocksize;
1952 if (block_end <= from || block_start >= to) {
1953 if (!buffer_uptodate(bh))
1954 partial = 1;
1955 } else {
1956 set_buffer_uptodate(bh);
1957 mark_buffer_dirty(bh);
1958 }
afddba49 1959 clear_buffer_new(bh);
1da177e4
LT
1960 }
1961
1962 /*
1963 * If this is a partial write which happened to make all buffers
1964 * uptodate then we can optimize away a bogus readpage() for
1965 * the next read(). Here we 'discover' whether the page went
1966 * uptodate as a result of this (potentially partial) write.
1967 */
1968 if (!partial)
1969 SetPageUptodate(page);
1970 return 0;
1971}
1972
afddba49
NP
1973/*
1974 * block_write_begin takes care of the basic task of block allocation and
1975 * bringing partial write blocks uptodate first.
1976 *
1977 * If *pagep is not NULL, then block_write_begin uses the locked page
1978 * at *pagep rather than allocating its own. In this case, the page will
1979 * not be unlocked or deallocated on failure.
1980 */
1981int block_write_begin(struct file *file, struct address_space *mapping,
1982 loff_t pos, unsigned len, unsigned flags,
1983 struct page **pagep, void **fsdata,
1984 get_block_t *get_block)
1985{
1986 struct inode *inode = mapping->host;
1987 int status = 0;
1988 struct page *page;
1989 pgoff_t index;
1990 unsigned start, end;
1991 int ownpage = 0;
1992
1993 index = pos >> PAGE_CACHE_SHIFT;
1994 start = pos & (PAGE_CACHE_SIZE - 1);
1995 end = start + len;
1996
1997 page = *pagep;
1998 if (page == NULL) {
1999 ownpage = 1;
54566b2c 2000 page = grab_cache_page_write_begin(mapping, index, flags);
afddba49
NP
2001 if (!page) {
2002 status = -ENOMEM;
2003 goto out;
2004 }
2005 *pagep = page;
2006 } else
2007 BUG_ON(!PageLocked(page));
2008
2009 status = __block_prepare_write(inode, page, start, end, get_block);
2010 if (unlikely(status)) {
2011 ClearPageUptodate(page);
2012
2013 if (ownpage) {
2014 unlock_page(page);
2015 page_cache_release(page);
2016 *pagep = NULL;
2017
2018 /*
2019 * prepare_write() may have instantiated a few blocks
2020 * outside i_size. Trim these off again. Don't need
2021 * i_size_read because we hold i_mutex.
2022 */
2023 if (pos + len > inode->i_size)
2024 vmtruncate(inode, inode->i_size);
2025 }
afddba49
NP
2026 }
2027
2028out:
2029 return status;
2030}
2031EXPORT_SYMBOL(block_write_begin);
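/*
 * Example (illustrative sketch, not part of this file): a minimal
 * ->write_begin built on block_write_begin().  "myfs" and
 * myfs_get_block() are hypothetical; a real get_block callback would
 * look up (or, when create != 0, allocate) the on-disk block and call
 * set_buffer_new() for freshly allocated blocks.
 */
#if 0
static int myfs_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create)
{
	sector_t phys = iblock + 1;		/* fake 1:1 block mapping */

	map_bh(bh_result, inode->i_sb, phys);	/* bdev, blocknr, BH_Mapped */
	return 0;
}

static int myfs_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned flags,
			    struct page **pagep, void **fsdata)
{
	*pagep = NULL;		/* let block_write_begin grab the page */
	return block_write_begin(file, mapping, pos, len, flags,
				 pagep, fsdata, myfs_get_block);
}
#endif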
2032
2033int block_write_end(struct file *file, struct address_space *mapping,
2034 loff_t pos, unsigned len, unsigned copied,
2035 struct page *page, void *fsdata)
2036{
2037 struct inode *inode = mapping->host;
2038 unsigned start;
2039
2040 start = pos & (PAGE_CACHE_SIZE - 1);
2041
2042 if (unlikely(copied < len)) {
2043 /*
2044 * The buffers that were written will now be uptodate, so we
2045 * don't have to worry about a readpage reading them and
2046 * overwriting a partial write. However if we have encountered
2047 * a short write and only partially written into a buffer, it
2048 * will not be marked uptodate, so a readpage might come in and
2049 * destroy our partial write.
2050 *
2051 * Do the simplest thing, and just treat any short write to a
2052 * non-uptodate page as a zero-length write, and force the
2053 * caller to redo the whole thing.
2054 */
2055 if (!PageUptodate(page))
2056 copied = 0;
2057
2058 page_zero_new_buffers(page, start+copied, start+len);
2059 }
2060 flush_dcache_page(page);
2061
2062 /* This could be a short (even 0-length) commit */
2063 __block_commit_write(inode, page, start, start+copied);
2064
2065 return copied;
2066}
2067EXPORT_SYMBOL(block_write_end);
2068
2069int generic_write_end(struct file *file, struct address_space *mapping,
2070 loff_t pos, unsigned len, unsigned copied,
2071 struct page *page, void *fsdata)
2072{
2073 struct inode *inode = mapping->host;
c7d206b3 2074 int i_size_changed = 0;
afddba49
NP
2075
2076 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2077
2078 /*
2079 * No need to use i_size_read() here, the i_size
2080 * cannot change under us because we hold i_mutex.
2081 *
2082 * But it's important to update i_size while still holding page lock:
2083 * page writeout could otherwise come in and zero beyond i_size.
2084 */
2085 if (pos+copied > inode->i_size) {
2086 i_size_write(inode, pos+copied);
c7d206b3 2087 i_size_changed = 1;
afddba49
NP
2088 }
2089
2090 unlock_page(page);
2091 page_cache_release(page);
2092
c7d206b3
JK
2093 /*
2094 * Don't mark the inode dirty under page lock. First, it unnecessarily
2095 * makes the holding time of page lock longer. Second, it forces lock
2096 * ordering of page lock and transaction start for journaling
2097 * filesystems.
2098 */
2099 if (i_size_changed)
2100 mark_inode_dirty(inode);
2101
afddba49
NP
2102 return copied;
2103}
2104EXPORT_SYMBOL(generic_write_end);
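/*
 * Example (illustrative sketch, not part of this file): with
 * block_write_begin()/generic_write_end() a filesystem's buffered-write
 * address_space_operations reduce to a handful of wrappers.  All
 * "myfs_*" names are hypothetical; myfs_readpage, myfs_writepage and
 * myfs_bmap are sketched further down next to the helpers they wrap.
 */
#if 0
static const struct address_space_operations myfs_aops = {
	.readpage		= myfs_readpage,	/* block_read_full_page() */
	.writepage		= myfs_writepage,	/* block_write_full_page() */
	.sync_page		= block_sync_page,
	.write_begin		= myfs_write_begin,	/* block_write_begin() */
	.write_end		= generic_write_end,
	.bmap			= myfs_bmap,		/* generic_block_bmap() */
	.is_partially_uptodate	= block_is_partially_uptodate,
};
#endif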
2105
8ab22b9a
HH
2106/*
2107 * block_is_partially_uptodate checks whether buffers within a page are
2108 * uptodate or not.
2109 *
2110 * Returns true if all buffers which correspond to a file portion
2111 * we want to read are uptodate.
2112 */
2113int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
2114 unsigned long from)
2115{
2116 struct inode *inode = page->mapping->host;
2117 unsigned block_start, block_end, blocksize;
2118 unsigned to;
2119 struct buffer_head *bh, *head;
2120 int ret = 1;
2121
2122 if (!page_has_buffers(page))
2123 return 0;
2124
2125 blocksize = 1 << inode->i_blkbits;
2126 to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
2127 to = from + to;
2128 if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2129 return 0;
2130
2131 head = page_buffers(page);
2132 bh = head;
2133 block_start = 0;
2134 do {
2135 block_end = block_start + blocksize;
2136 if (block_end > from && block_start < to) {
2137 if (!buffer_uptodate(bh)) {
2138 ret = 0;
2139 break;
2140 }
2141 if (block_end >= to)
2142 break;
2143 }
2144 block_start = block_end;
2145 bh = bh->b_this_page;
2146 } while (bh != head);
2147
2148 return ret;
2149}
2150EXPORT_SYMBOL(block_is_partially_uptodate);
2151
1da177e4
LT
2152/*
2153 * Generic "read page" function for block devices that have the normal
2154 * get_block functionality. This is most of the block device filesystems.
2155 * Reads the page asynchronously --- the unlock_buffer() and
2156 * set/clear_buffer_uptodate() functions propagate buffer state into the
2157 * page struct once IO has completed.
2158 */
2159int block_read_full_page(struct page *page, get_block_t *get_block)
2160{
2161 struct inode *inode = page->mapping->host;
2162 sector_t iblock, lblock;
2163 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2164 unsigned int blocksize;
2165 int nr, i;
2166 int fully_mapped = 1;
2167
cd7619d6 2168 BUG_ON(!PageLocked(page));
1da177e4
LT
2169 blocksize = 1 << inode->i_blkbits;
2170 if (!page_has_buffers(page))
2171 create_empty_buffers(page, blocksize, 0);
2172 head = page_buffers(page);
2173
2174 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2175 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2176 bh = head;
2177 nr = 0;
2178 i = 0;
2179
2180 do {
2181 if (buffer_uptodate(bh))
2182 continue;
2183
2184 if (!buffer_mapped(bh)) {
c64610ba
AM
2185 int err = 0;
2186
1da177e4
LT
2187 fully_mapped = 0;
2188 if (iblock < lblock) {
b0cf2321 2189 WARN_ON(bh->b_size != blocksize);
c64610ba
AM
2190 err = get_block(inode, iblock, bh, 0);
2191 if (err)
1da177e4
LT
2192 SetPageError(page);
2193 }
2194 if (!buffer_mapped(bh)) {
eebd2aa3 2195 zero_user(page, i * blocksize, blocksize);
c64610ba
AM
2196 if (!err)
2197 set_buffer_uptodate(bh);
1da177e4
LT
2198 continue;
2199 }
2200 /*
2201 * get_block() might have updated the buffer
2202 * synchronously
2203 */
2204 if (buffer_uptodate(bh))
2205 continue;
2206 }
2207 arr[nr++] = bh;
2208 } while (i++, iblock++, (bh = bh->b_this_page) != head);
2209
2210 if (fully_mapped)
2211 SetPageMappedToDisk(page);
2212
2213 if (!nr) {
2214 /*
2215 * All buffers are uptodate - we can set the page uptodate
2216 * as well. But not if get_block() returned an error.
2217 */
2218 if (!PageError(page))
2219 SetPageUptodate(page);
2220 unlock_page(page);
2221 return 0;
2222 }
2223
2224 /* Stage two: lock the buffers */
2225 for (i = 0; i < nr; i++) {
2226 bh = arr[i];
2227 lock_buffer(bh);
2228 mark_buffer_async_read(bh);
2229 }
2230
2231 /*
2232 * Stage 3: start the IO. Check for uptodateness
2233 * inside the buffer lock in case another process reading
2234 * the underlying blockdev brought it uptodate (the sct fix).
2235 */
2236 for (i = 0; i < nr; i++) {
2237 bh = arr[i];
2238 if (buffer_uptodate(bh))
2239 end_buffer_async_read(bh, 1);
2240 else
2241 submit_bh(READ, bh);
2242 }
2243 return 0;
2244}
1fe72eaa 2245EXPORT_SYMBOL(block_read_full_page);
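/*
 * Example (illustrative sketch, not part of this file): ->readpage is
 * normally a one-line wrapper around block_read_full_page(), passing
 * the filesystem's block-mapping callback (the hypothetical
 * myfs_get_block() from the sketch above).
 */
#if 0
static int myfs_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, myfs_get_block);
}
#endif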
1da177e4
LT
2246
2247/* utility function for filesystems that need to do work on expanding
89e10787 2248 * truncates. Uses filesystem pagecache writes to allow the filesystem to
1da177e4
LT
2249 * deal with the hole.
2250 */
89e10787 2251int generic_cont_expand_simple(struct inode *inode, loff_t size)
1da177e4
LT
2252{
2253 struct address_space *mapping = inode->i_mapping;
2254 struct page *page;
89e10787 2255 void *fsdata;
1da177e4
LT
2256 int err;
2257
c08d3b0e 2258 err = inode_newsize_ok(inode, size);
2259 if (err)
1da177e4
LT
2260 goto out;
2261
89e10787
NP
2262 err = pagecache_write_begin(NULL, mapping, size, 0,
2263 AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2264 &page, &fsdata);
2265 if (err)
05eb0b51 2266 goto out;
05eb0b51 2267
89e10787
NP
2268 err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2269 BUG_ON(err > 0);
05eb0b51 2270
1da177e4
LT
2271out:
2272 return err;
2273}
1fe72eaa 2274EXPORT_SYMBOL(generic_cont_expand_simple);
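/*
 * Example (illustrative sketch, not part of this file): a filesystem
 * that cannot represent holes can call generic_cont_expand_simple()
 * from its ->setattr path before an expanding truncate, so the gap up
 * to the new size gets zero-filled through the page cache.
 * myfs_setattr() is hypothetical.
 */
#if 0
static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int err;

	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) {
		err = generic_cont_expand_simple(inode, attr->ia_size);
		if (err)
			return err;
	}
	return inode_setattr(inode, attr);
}
#endif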
1da177e4 2275
f1e3af72
AB
2276static int cont_expand_zero(struct file *file, struct address_space *mapping,
2277 loff_t pos, loff_t *bytes)
1da177e4 2278{
1da177e4 2279 struct inode *inode = mapping->host;
1da177e4 2280 unsigned blocksize = 1 << inode->i_blkbits;
89e10787
NP
2281 struct page *page;
2282 void *fsdata;
2283 pgoff_t index, curidx;
2284 loff_t curpos;
2285 unsigned zerofrom, offset, len;
2286 int err = 0;
1da177e4 2287
89e10787
NP
2288 index = pos >> PAGE_CACHE_SHIFT;
2289 offset = pos & ~PAGE_CACHE_MASK;
2290
2291 while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2292 zerofrom = curpos & ~PAGE_CACHE_MASK;
1da177e4
LT
2293 if (zerofrom & (blocksize-1)) {
2294 *bytes |= (blocksize-1);
2295 (*bytes)++;
2296 }
89e10787 2297 len = PAGE_CACHE_SIZE - zerofrom;
1da177e4 2298
89e10787
NP
2299 err = pagecache_write_begin(file, mapping, curpos, len,
2300 AOP_FLAG_UNINTERRUPTIBLE,
2301 &page, &fsdata);
2302 if (err)
2303 goto out;
eebd2aa3 2304 zero_user(page, zerofrom, len);
89e10787
NP
2305 err = pagecache_write_end(file, mapping, curpos, len, len,
2306 page, fsdata);
2307 if (err < 0)
2308 goto out;
2309 BUG_ON(err != len);
2310 err = 0;
061e9746
OH
2311
2312 balance_dirty_pages_ratelimited(mapping);
89e10787 2313 }
1da177e4 2314
89e10787
NP
2315 /* page covers the boundary, find the boundary offset */
2316 if (index == curidx) {
2317 zerofrom = curpos & ~PAGE_CACHE_MASK;
1da177e4 2318 /* if we will expand the thing last block will be filled */
89e10787
NP
2319 if (offset <= zerofrom) {
2320 goto out;
2321 }
2322 if (zerofrom & (blocksize-1)) {
1da177e4
LT
2323 *bytes |= (blocksize-1);
2324 (*bytes)++;
2325 }
89e10787 2326 len = offset - zerofrom;
1da177e4 2327
89e10787
NP
2328 err = pagecache_write_begin(file, mapping, curpos, len,
2329 AOP_FLAG_UNINTERRUPTIBLE,
2330 &page, &fsdata);
2331 if (err)
2332 goto out;
eebd2aa3 2333 zero_user(page, zerofrom, len);
89e10787
NP
2334 err = pagecache_write_end(file, mapping, curpos, len, len,
2335 page, fsdata);
2336 if (err < 0)
2337 goto out;
2338 BUG_ON(err != len);
2339 err = 0;
1da177e4 2340 }
89e10787
NP
2341out:
2342 return err;
2343}
2344
2345/*
2346 * For moronic filesystems that do not allow holes in files.
2347 * We may have to extend the file.
2348 */
2349int cont_write_begin(struct file *file, struct address_space *mapping,
2350 loff_t pos, unsigned len, unsigned flags,
2351 struct page **pagep, void **fsdata,
2352 get_block_t *get_block, loff_t *bytes)
2353{
2354 struct inode *inode = mapping->host;
2355 unsigned blocksize = 1 << inode->i_blkbits;
2356 unsigned zerofrom;
2357 int err;
2358
2359 err = cont_expand_zero(file, mapping, pos, bytes);
2360 if (err)
2361 goto out;
2362
2363 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2364 if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2365 *bytes |= (blocksize-1);
2366 (*bytes)++;
1da177e4 2367 }
1da177e4 2368
89e10787
NP
2369 *pagep = NULL;
2370 err = block_write_begin(file, mapping, pos, len,
2371 flags, pagep, fsdata, get_block);
1da177e4 2372out:
89e10787 2373 return err;
1da177e4 2374}
1fe72eaa 2375EXPORT_SYMBOL(cont_write_begin);
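/*
 * Example (illustrative sketch, not part of this file): a no-holes
 * filesystem hands cont_write_begin() a pointer to its "bytes
 * allocated so far" counter so the range between the old end of data
 * and the write position is zero-filled first.  myfs_i() and its
 * i_allocated_bytes field are hypothetical stand-ins for what e.g.
 * FAT keeps in mmu_private.
 */
#if 0
static int myfs_cont_write_begin(struct file *file,
				 struct address_space *mapping,
				 loff_t pos, unsigned len, unsigned flags,
				 struct page **pagep, void **fsdata)
{
	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
				myfs_get_block,
				&myfs_i(mapping->host)->i_allocated_bytes);
}
#endif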
1da177e4
LT
2376
2377int block_prepare_write(struct page *page, unsigned from, unsigned to,
2378 get_block_t *get_block)
2379{
2380 struct inode *inode = page->mapping->host;
2381 int err = __block_prepare_write(inode, page, from, to, get_block);
2382 if (err)
2383 ClearPageUptodate(page);
2384 return err;
2385}
1fe72eaa 2386EXPORT_SYMBOL(block_prepare_write);
1da177e4
LT
2387
2388int block_commit_write(struct page *page, unsigned from, unsigned to)
2389{
2390 struct inode *inode = page->mapping->host;
2391 __block_commit_write(inode,page,from,to);
2392 return 0;
2393}
1fe72eaa 2394EXPORT_SYMBOL(block_commit_write);
1da177e4 2395
54171690
DC
2396/*
2397 * block_page_mkwrite() is not allowed to change the file size as it gets
2398 * called from a page fault handler when a page is first dirtied. Hence we must
2399 * be careful to check for EOF conditions here. We set the page up correctly
2400 * for a written page which means we get ENOSPC checking when writing into
2401 * holes and correct delalloc and unwritten extent mapping on filesystems that
2402 * support these features.
2403 *
2404 * We are not allowed to take the i_mutex here so we have to play games to
2405 * protect against truncate races as the page could now be beyond EOF. Because
2406 * vmtruncate() writes the inode size before removing pages, once we have the
2407 * page lock we can determine safely if the page is beyond EOF. If it is not
2408 * beyond EOF, then the page is guaranteed safe against truncation until we
2409 * unlock the page.
2410 */
2411int
c2ec175c 2412block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
54171690
DC
2413 get_block_t get_block)
2414{
c2ec175c 2415 struct page *page = vmf->page;
54171690
DC
2416 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2417 unsigned long end;
2418 loff_t size;
56a76f82 2419 int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
54171690
DC
2420
2421 lock_page(page);
2422 size = i_size_read(inode);
2423 if ((page->mapping != inode->i_mapping) ||
18336338 2424 (page_offset(page) > size)) {
54171690 2425 /* page got truncated out from underneath us */
b827e496
NP
2426 unlock_page(page);
2427 goto out;
54171690
DC
2428 }
2429
2430 /* page is wholly or partially inside EOF */
2431 if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2432 end = size & ~PAGE_CACHE_MASK;
2433 else
2434 end = PAGE_CACHE_SIZE;
2435
2436 ret = block_prepare_write(page, 0, end, get_block);
2437 if (!ret)
2438 ret = block_commit_write(page, 0, end);
2439
56a76f82 2440 if (unlikely(ret)) {
b827e496 2441 unlock_page(page);
56a76f82
NP
2442 if (ret == -ENOMEM)
2443 ret = VM_FAULT_OOM;
2444 else /* -ENOSPC, -EIO, etc */
2445 ret = VM_FAULT_SIGBUS;
b827e496
NP
2446 } else
2447 ret = VM_FAULT_LOCKED;
c2ec175c 2448
b827e496 2449out:
54171690
DC
2450 return ret;
2451}
1fe72eaa 2452EXPORT_SYMBOL(block_page_mkwrite);
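/*
 * Example (illustrative sketch, not part of this file): hooking
 * block_page_mkwrite() into a filesystem's mmap path.  The read-fault
 * side can stay filemap_fault(); only page_mkwrite needs the
 * filesystem's (hypothetical) myfs_get_block().
 */
#if 0
static int myfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	return block_page_mkwrite(vma, vmf, myfs_get_block);
}

static const struct vm_operations_struct myfs_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= myfs_page_mkwrite,
};

static int myfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	file_accessed(file);
	vma->vm_ops = &myfs_file_vm_ops;
	return 0;
}
#endif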
1da177e4
LT
2453
2454/*
03158cd7 2455 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
1da177e4
LT
2456 * immediately, while under the page lock. So it needs a special end_io
2457 * handler which does not touch the bh after unlocking it.
1da177e4
LT
2458 */
2459static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2460{
68671f35 2461 __end_buffer_read_notouch(bh, uptodate);
1da177e4
LT
2462}
2463
03158cd7
NP
2464/*
2465 * Attach the singly-linked list of buffers created by nobh_write_begin, to
2466 * the page (converting it to circular linked list and taking care of page
2467 * dirty races).
2468 */
2469static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2470{
2471 struct buffer_head *bh;
2472
2473 BUG_ON(!PageLocked(page));
2474
2475 spin_lock(&page->mapping->private_lock);
2476 bh = head;
2477 do {
2478 if (PageDirty(page))
2479 set_buffer_dirty(bh);
2480 if (!bh->b_this_page)
2481 bh->b_this_page = head;
2482 bh = bh->b_this_page;
2483 } while (bh != head);
2484 attach_page_buffers(page, head);
2485 spin_unlock(&page->mapping->private_lock);
2486}
2487
1da177e4
LT
2488/*
2489 * On entry, the page is fully not uptodate.
2490 * On exit the page is fully uptodate in the areas outside (from,to)
2491 */
03158cd7
NP
2492int nobh_write_begin(struct file *file, struct address_space *mapping,
2493 loff_t pos, unsigned len, unsigned flags,
2494 struct page **pagep, void **fsdata,
1da177e4
LT
2495 get_block_t *get_block)
2496{
03158cd7 2497 struct inode *inode = mapping->host;
1da177e4
LT
2498 const unsigned blkbits = inode->i_blkbits;
2499 const unsigned blocksize = 1 << blkbits;
a4b0672d 2500 struct buffer_head *head, *bh;
03158cd7
NP
2501 struct page *page;
2502 pgoff_t index;
2503 unsigned from, to;
1da177e4 2504 unsigned block_in_page;
a4b0672d 2505 unsigned block_start, block_end;
1da177e4 2506 sector_t block_in_file;
1da177e4 2507 int nr_reads = 0;
1da177e4
LT
2508 int ret = 0;
2509 int is_mapped_to_disk = 1;
1da177e4 2510
03158cd7
NP
2511 index = pos >> PAGE_CACHE_SHIFT;
2512 from = pos & (PAGE_CACHE_SIZE - 1);
2513 to = from + len;
2514
54566b2c 2515 page = grab_cache_page_write_begin(mapping, index, flags);
03158cd7
NP
2516 if (!page)
2517 return -ENOMEM;
2518 *pagep = page;
2519 *fsdata = NULL;
2520
2521 if (page_has_buffers(page)) {
2522 unlock_page(page);
2523 page_cache_release(page);
2524 *pagep = NULL;
2525 return block_write_begin(file, mapping, pos, len, flags, pagep,
2526 fsdata, get_block);
2527 }
a4b0672d 2528
1da177e4
LT
2529 if (PageMappedToDisk(page))
2530 return 0;
2531
a4b0672d
NP
2532 /*
2533 * Allocate buffers so that we can keep track of state, and potentially
2534 * attach them to the page if an error occurs. In the common case of
2535 * no error, they will just be freed again without ever being attached
2536 * to the page (which is all OK, because we're under the page lock).
2537 *
2538 * Be careful: the buffer linked list is a NULL terminated one, rather
2539 * than the circular one we're used to.
2540 */
2541 head = alloc_page_buffers(page, blocksize, 0);
03158cd7
NP
2542 if (!head) {
2543 ret = -ENOMEM;
2544 goto out_release;
2545 }
a4b0672d 2546
1da177e4 2547 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
1da177e4
LT
2548
2549 /*
2550 * We loop across all blocks in the page, whether or not they are
2551 * part of the affected region. This is so we can discover if the
2552 * page is fully mapped-to-disk.
2553 */
a4b0672d 2554 for (block_start = 0, block_in_page = 0, bh = head;
1da177e4 2555 block_start < PAGE_CACHE_SIZE;
a4b0672d 2556 block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
1da177e4
LT
2557 int create;
2558
a4b0672d
NP
2559 block_end = block_start + blocksize;
2560 bh->b_state = 0;
1da177e4
LT
2561 create = 1;
2562 if (block_start >= to)
2563 create = 0;
2564 ret = get_block(inode, block_in_file + block_in_page,
a4b0672d 2565 bh, create);
1da177e4
LT
2566 if (ret)
2567 goto failed;
a4b0672d 2568 if (!buffer_mapped(bh))
1da177e4 2569 is_mapped_to_disk = 0;
a4b0672d
NP
2570 if (buffer_new(bh))
2571 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2572 if (PageUptodate(page)) {
2573 set_buffer_uptodate(bh);
1da177e4 2574 continue;
a4b0672d
NP
2575 }
2576 if (buffer_new(bh) || !buffer_mapped(bh)) {
eebd2aa3
CL
2577 zero_user_segments(page, block_start, from,
2578 to, block_end);
1da177e4
LT
2579 continue;
2580 }
a4b0672d 2581 if (buffer_uptodate(bh))
1da177e4
LT
2582 continue; /* reiserfs does this */
2583 if (block_start < from || block_end > to) {
a4b0672d
NP
2584 lock_buffer(bh);
2585 bh->b_end_io = end_buffer_read_nobh;
2586 submit_bh(READ, bh);
2587 nr_reads++;
1da177e4
LT
2588 }
2589 }
2590
2591 if (nr_reads) {
1da177e4
LT
2592 /*
2593 * The page is locked, so these buffers are protected from
2594 * any VM or truncate activity. Hence we don't need to care
2595 * for the buffer_head refcounts.
2596 */
a4b0672d 2597 for (bh = head; bh; bh = bh->b_this_page) {
f80e69e7
JA
2598 int err;
2599
2600 err = wait_on_buffer_async(bh, current->io_wait);
1be3d0ec
JA
2601 if (err && !ret) {
2602 WARN(1, "%s: err=%d\n", __func__, err);
f80e69e7 2603 ret = err;
1be3d0ec 2604 }
1da177e4
LT
2605 if (!buffer_uptodate(bh))
2606 ret = -EIO;
1da177e4
LT
2607 }
2608 if (ret)
2609 goto failed;
2610 }
2611
2612 if (is_mapped_to_disk)
2613 SetPageMappedToDisk(page);
1da177e4 2614
03158cd7 2615 *fsdata = head; /* to be released by nobh_write_end */
a4b0672d 2616
1da177e4
LT
2617 return 0;
2618
2619failed:
03158cd7 2620 BUG_ON(!ret);
1da177e4 2621 /*
a4b0672d
NP
2622 * Error recovery is a bit difficult. We need to zero out blocks that
2623 * were newly allocated, and dirty them to ensure they get written out.
2624 * Buffers need to be attached to the page at this point, otherwise
2625 * the handling of potential IO errors during writeout would be hard
2626 * (could try doing synchronous writeout, but what if that fails too?)
1da177e4 2627 */
03158cd7
NP
2628 attach_nobh_buffers(page, head);
2629 page_zero_new_buffers(page, from, to);
a4b0672d 2630
03158cd7
NP
2631out_release:
2632 unlock_page(page);
2633 page_cache_release(page);
2634 *pagep = NULL;
a4b0672d 2635
03158cd7
NP
2636 if (pos + len > inode->i_size)
2637 vmtruncate(inode, inode->i_size);
a4b0672d 2638
1da177e4
LT
2639 return ret;
2640}
03158cd7 2641EXPORT_SYMBOL(nobh_write_begin);
1da177e4 2642
03158cd7
NP
2643int nobh_write_end(struct file *file, struct address_space *mapping,
2644 loff_t pos, unsigned len, unsigned copied,
2645 struct page *page, void *fsdata)
1da177e4
LT
2646{
2647 struct inode *inode = page->mapping->host;
efdc3131 2648 struct buffer_head *head = fsdata;
03158cd7 2649 struct buffer_head *bh;
5b41e74a 2650 BUG_ON(fsdata != NULL && page_has_buffers(page));
1da177e4 2651
d4cf109f 2652 if (unlikely(copied < len) && head)
5b41e74a
DM
2653 attach_nobh_buffers(page, head);
2654 if (page_has_buffers(page))
2655 return generic_write_end(file, mapping, pos, len,
2656 copied, page, fsdata);
a4b0672d 2657
22c8ca78 2658 SetPageUptodate(page);
1da177e4 2659 set_page_dirty(page);
03158cd7
NP
2660 if (pos+copied > inode->i_size) {
2661 i_size_write(inode, pos+copied);
1da177e4
LT
2662 mark_inode_dirty(inode);
2663 }
03158cd7
NP
2664
2665 unlock_page(page);
2666 page_cache_release(page);
2667
03158cd7
NP
2668 while (head) {
2669 bh = head;
2670 head = head->b_this_page;
2671 free_buffer_head(bh);
2672 }
2673
2674 return copied;
1da177e4 2675}
03158cd7 2676EXPORT_SYMBOL(nobh_write_end);
1da177e4
LT
2677
2678/*
2679 * nobh_writepage() - based on block_write_full_page() except
2680 * that it tries to operate without attaching bufferheads to
2681 * the page.
2682 */
2683int nobh_writepage(struct page *page, get_block_t *get_block,
2684 struct writeback_control *wbc)
2685{
2686 struct inode * const inode = page->mapping->host;
2687 loff_t i_size = i_size_read(inode);
2688 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2689 unsigned offset;
1da177e4
LT
2690 int ret;
2691
2692 /* Is the page fully inside i_size? */
2693 if (page->index < end_index)
2694 goto out;
2695
2696 /* Is the page fully outside i_size? (truncate in progress) */
2697 offset = i_size & (PAGE_CACHE_SIZE-1);
2698 if (page->index >= end_index+1 || !offset) {
2699 /*
2700 * The page may have dirty, unmapped buffers. For example,
2701 * they may have been added in ext3_writepage(). Make them
2702 * freeable here, so the page does not leak.
2703 */
2704#if 0
2705 /* Not really sure about this - do we need this ? */
2706 if (page->mapping->a_ops->invalidatepage)
2707 page->mapping->a_ops->invalidatepage(page, offset);
2708#endif
2709 unlock_page(page);
2710 return 0; /* don't care */
2711 }
2712
2713 /*
2714 * The page straddles i_size. It must be zeroed out on each and every
2715 * writepage invocation because it may be mmapped. "A file is mapped
2716 * in multiples of the page size. For a file that is not a multiple of
2717 * the page size, the remaining memory is zeroed when mapped, and
2718 * writes to that region are not written out to the file."
2719 */
eebd2aa3 2720 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
1da177e4
LT
2721out:
2722 ret = mpage_writepage(page, get_block, wbc);
2723 if (ret == -EAGAIN)
35c80d5f
CM
2724 ret = __block_write_full_page(inode, page, get_block, wbc,
2725 end_buffer_async_write);
1da177e4
LT
2726 return ret;
2727}
2728EXPORT_SYMBOL(nobh_writepage);
2729
03158cd7
NP
2730int nobh_truncate_page(struct address_space *mapping,
2731 loff_t from, get_block_t *get_block)
1da177e4 2732{
1da177e4
LT
2733 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2734 unsigned offset = from & (PAGE_CACHE_SIZE-1);
03158cd7
NP
2735 unsigned blocksize;
2736 sector_t iblock;
2737 unsigned length, pos;
2738 struct inode *inode = mapping->host;
1da177e4 2739 struct page *page;
03158cd7
NP
2740 struct buffer_head map_bh;
2741 int err;
1da177e4 2742
03158cd7
NP
2743 blocksize = 1 << inode->i_blkbits;
2744 length = offset & (blocksize - 1);
2745
2746 /* Block boundary? Nothing to do */
2747 if (!length)
2748 return 0;
2749
2750 length = blocksize - length;
2751 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1da177e4 2752
1da177e4 2753 page = grab_cache_page(mapping, index);
03158cd7 2754 err = -ENOMEM;
1da177e4
LT
2755 if (!page)
2756 goto out;
2757
03158cd7
NP
2758 if (page_has_buffers(page)) {
2759has_buffers:
2760 unlock_page(page);
2761 page_cache_release(page);
2762 return block_truncate_page(mapping, from, get_block);
2763 }
2764
2765 /* Find the buffer that contains "offset" */
2766 pos = blocksize;
2767 while (offset >= pos) {
2768 iblock++;
2769 pos += blocksize;
2770 }
2771
460bcf57
TT
2772 map_bh.b_size = blocksize;
2773 map_bh.b_state = 0;
03158cd7
NP
2774 err = get_block(inode, iblock, &map_bh, 0);
2775 if (err)
2776 goto unlock;
2777 /* unmapped? It's a hole - nothing to do */
2778 if (!buffer_mapped(&map_bh))
2779 goto unlock;
2780
2781 /* Ok, it's mapped. Make sure it's up-to-date */
2782 if (!PageUptodate(page)) {
2783 err = mapping->a_ops->readpage(NULL, page);
2784 if (err) {
2785 page_cache_release(page);
2786 goto out;
2787 }
2788 lock_page(page);
2789 if (!PageUptodate(page)) {
2790 err = -EIO;
2791 goto unlock;
2792 }
2793 if (page_has_buffers(page))
2794 goto has_buffers;
1da177e4 2795 }
eebd2aa3 2796 zero_user(page, offset, length);
03158cd7
NP
2797 set_page_dirty(page);
2798 err = 0;
2799
2800unlock:
1da177e4
LT
2801 unlock_page(page);
2802 page_cache_release(page);
2803out:
03158cd7 2804 return err;
1da177e4
LT
2805}
2806EXPORT_SYMBOL(nobh_truncate_page);
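/*
 * Example (illustrative sketch, not part of this file): the nobh_*
 * helpers slot into the same address_space_operations as the block_*
 * ones for filesystems that prefer not to attach buffer_heads on the
 * common path; nobh_truncate_page() replaces block_truncate_page() in
 * the truncate path.  "myfs_*" names are hypothetical.
 */
#if 0
static int myfs_nobh_write_begin(struct file *file,
				 struct address_space *mapping,
				 loff_t pos, unsigned len, unsigned flags,
				 struct page **pagep, void **fsdata)
{
	return nobh_write_begin(file, mapping, pos, len, flags,
				pagep, fsdata, myfs_get_block);
}

static int myfs_nobh_writepage(struct page *page,
			       struct writeback_control *wbc)
{
	return nobh_writepage(page, myfs_get_block, wbc);
}

static const struct address_space_operations myfs_nobh_aops = {
	.readpage	= myfs_readpage,
	.writepage	= myfs_nobh_writepage,
	.sync_page	= block_sync_page,
	.write_begin	= myfs_nobh_write_begin,
	.write_end	= nobh_write_end,
};
#endif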
2807
2808int block_truncate_page(struct address_space *mapping,
2809 loff_t from, get_block_t *get_block)
2810{
2811 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2812 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2813 unsigned blocksize;
54b21a79 2814 sector_t iblock;
1da177e4
LT
2815 unsigned length, pos;
2816 struct inode *inode = mapping->host;
2817 struct page *page;
2818 struct buffer_head *bh;
1da177e4
LT
2819 int err;
2820
2821 blocksize = 1 << inode->i_blkbits;
2822 length = offset & (blocksize - 1);
2823
2824 /* Block boundary? Nothing to do */
2825 if (!length)
2826 return 0;
2827
2828 length = blocksize - length;
54b21a79 2829 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1da177e4
LT
2830
2831 page = grab_cache_page(mapping, index);
2832 err = -ENOMEM;
2833 if (!page)
2834 goto out;
2835
2836 if (!page_has_buffers(page))
2837 create_empty_buffers(page, blocksize, 0);
2838
2839 /* Find the buffer that contains "offset" */
2840 bh = page_buffers(page);
2841 pos = blocksize;
2842 while (offset >= pos) {
2843 bh = bh->b_this_page;
2844 iblock++;
2845 pos += blocksize;
2846 }
2847
2848 err = 0;
2849 if (!buffer_mapped(bh)) {
b0cf2321 2850 WARN_ON(bh->b_size != blocksize);
1da177e4
LT
2851 err = get_block(inode, iblock, bh, 0);
2852 if (err)
2853 goto unlock;
2854 /* unmapped? It's a hole - nothing to do */
2855 if (!buffer_mapped(bh))
2856 goto unlock;
2857 }
2858
2859 /* Ok, it's mapped. Make sure it's up-to-date */
2860 if (PageUptodate(page))
2861 set_buffer_uptodate(bh);
2862
33a266dd 2863 if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
1da177e4 2864 ll_rw_block(READ, 1, &bh);
f80e69e7 2865 err = wait_on_buffer_async(bh, current->io_wait);
1be3d0ec
JA
2866 if (err) {
2867 WARN(1, "err=%d\n", err);
2868 goto unlock;
2869 }
1da177e4 2870 /* Uhhuh. Read error. Complain and punt. */
f80e69e7 2871 err = -EIO;
1da177e4
LT
2872 if (!buffer_uptodate(bh))
2873 goto unlock;
2874 }
2875
eebd2aa3 2876 zero_user(page, offset, length);
1da177e4
LT
2877 mark_buffer_dirty(bh);
2878 err = 0;
2879
2880unlock:
2881 unlock_page(page);
2882 page_cache_release(page);
2883out:
2884 return err;
2885}
1fe72eaa 2886EXPORT_SYMBOL(block_truncate_page);
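/*
 * Example (illustrative sketch, not part of this file): zeroing the
 * partial last block when truncating to a size that is not block
 * aligned.  myfs_truncate() and myfs_get_block() are hypothetical;
 * freeing the blocks past the new i_size is left out.
 */
#if 0
static void myfs_truncate(struct inode *inode)
{
	/* zero the tail of the block that now straddles i_size */
	block_truncate_page(inode->i_mapping, inode->i_size, myfs_get_block);

	/* ... a real filesystem would release blocks past i_size here ... */
	mark_inode_dirty(inode);
}
#endif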
1da177e4
LT
2887
2888/*
2889 * The generic ->writepage function for buffer-backed address_spaces
35c80d5f 2890 * this form passes in the end_io handler used to finish the IO.
1da177e4 2891 */
35c80d5f
CM
2892int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2893 struct writeback_control *wbc, bh_end_io_t *handler)
1da177e4
LT
2894{
2895 struct inode * const inode = page->mapping->host;
2896 loff_t i_size = i_size_read(inode);
2897 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2898 unsigned offset;
1da177e4
LT
2899
2900 /* Is the page fully inside i_size? */
2901 if (page->index < end_index)
35c80d5f
CM
2902 return __block_write_full_page(inode, page, get_block, wbc,
2903 handler);
1da177e4
LT
2904
2905 /* Is the page fully outside i_size? (truncate in progress) */
2906 offset = i_size & (PAGE_CACHE_SIZE-1);
2907 if (page->index >= end_index+1 || !offset) {
2908 /*
2909 * The page may have dirty, unmapped buffers. For example,
2910 * they may have been added in ext3_writepage(). Make them
2911 * freeable here, so the page does not leak.
2912 */
aaa4059b 2913 do_invalidatepage(page, 0);
1da177e4
LT
2914 unlock_page(page);
2915 return 0; /* don't care */
2916 }
2917
2918 /*
2919 * The page straddles i_size. It must be zeroed out on each and every
2920 * writepage invocation because it may be mmapped. "A file is mapped
2921 * in multiples of the page size. For a file that is not a multiple of
2922 * the page size, the remaining memory is zeroed when mapped, and
2923 * writes to that region are not written out to the file."
2924 */
eebd2aa3 2925 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
35c80d5f 2926 return __block_write_full_page(inode, page, get_block, wbc, handler);
1da177e4 2927}
1fe72eaa 2928EXPORT_SYMBOL(block_write_full_page_endio);
1da177e4 2929
35c80d5f
CM
2930/*
2931 * The generic ->writepage function for buffer-backed address_spaces
2932 */
2933int block_write_full_page(struct page *page, get_block_t *get_block,
2934 struct writeback_control *wbc)
2935{
2936 return block_write_full_page_endio(page, get_block, wbc,
2937 end_buffer_async_write);
2938}
1fe72eaa 2939EXPORT_SYMBOL(block_write_full_page);
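/*
 * Example (illustrative sketch, not part of this file): the matching
 * ->writepage wrapper for the hypothetical "myfs" used in the sketches
 * above.
 */
#if 0
static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, myfs_get_block, wbc);
}
#endif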
35c80d5f 2940
1da177e4
LT
2941sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2942 get_block_t *get_block)
2943{
2944 struct buffer_head tmp;
2945 struct inode *inode = mapping->host;
2946 tmp.b_state = 0;
2947 tmp.b_blocknr = 0;
b0cf2321 2948 tmp.b_size = 1 << inode->i_blkbits;
1da177e4
LT
2949 get_block(inode, block, &tmp, 0);
2950 return tmp.b_blocknr;
2951}
1fe72eaa 2952EXPORT_SYMBOL(generic_block_bmap);
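/*
 * Example (illustrative sketch, not part of this file): ->bmap, which
 * backs the FIBMAP ioctl and swap-file setup, is typically just a
 * wrapper around generic_block_bmap().
 */
#if 0
static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
{
	return generic_block_bmap(mapping, block, myfs_get_block);
}
#endif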
1da177e4 2953
6712ecf8 2954static void end_bio_bh_io_sync(struct bio *bio, int err)
1da177e4
LT
2955{
2956 struct buffer_head *bh = bio->bi_private;
2957
1da177e4
LT
2958 if (err == -EOPNOTSUPP) {
2959 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2960 set_bit(BH_Eopnotsupp, &bh->b_state);
2961 }
2962
08bafc03
KM
2963 if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
2964 set_bit(BH_Quiet, &bh->b_state);
2965
1da177e4
LT
2966 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2967 bio_put(bio);
1da177e4
LT
2968}
2969
2970int submit_bh(int rw, struct buffer_head * bh)
2971{
2972 struct bio *bio;
2973 int ret = 0;
2974
2975 BUG_ON(!buffer_locked(bh));
2976 BUG_ON(!buffer_mapped(bh));
2977 BUG_ON(!bh->b_end_io);
8fb0e342
AK
2978 BUG_ON(buffer_delay(bh));
2979 BUG_ON(buffer_unwritten(bh));
1da177e4 2980
48fd4f93
JA
2981 /*
2982 * Mask in barrier bit for a write (could be either a WRITE or a
2983 * WRITE_SYNC
2984 */
2985 if (buffer_ordered(bh) && (rw & WRITE))
2986 rw |= WRITE_BARRIER;
1da177e4
LT
2987
2988 /*
48fd4f93 2989 * Only clear out a write error when rewriting
1da177e4 2990 */
48fd4f93 2991 if (test_set_buffer_req(bh) && (rw & WRITE))
1da177e4
LT
2992 clear_buffer_write_io_error(bh);
2993
2994 /*
2995 * from here on down, it's all bio -- do the initial mapping,
2996 * submit_bio -> generic_make_request may further map this bio around
2997 */
2998 bio = bio_alloc(GFP_NOIO, 1);
2999
3000 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
3001 bio->bi_bdev = bh->b_bdev;
3002 bio->bi_io_vec[0].bv_page = bh->b_page;
3003 bio->bi_io_vec[0].bv_len = bh->b_size;
3004 bio->bi_io_vec[0].bv_offset = bh_offset(bh);
3005
3006 bio->bi_vcnt = 1;
3007 bio->bi_idx = 0;
3008 bio->bi_size = bh->b_size;
3009
3010 bio->bi_end_io = end_bio_bh_io_sync;
3011 bio->bi_private = bh;
3012
3013 bio_get(bio);
3014 submit_bio(rw, bio);
3015
3016 if (bio_flagged(bio, BIO_EOPNOTSUPP))
3017 ret = -EOPNOTSUPP;
3018
3019 bio_put(bio);
3020 return ret;
3021}
1fe72eaa 3022EXPORT_SYMBOL(submit_bh);
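/*
 * Example (illustrative sketch, not part of this file): reading a
 * single metadata block synchronously with submit_bh().  This mirrors
 * what __bread()/sb_bread() already do and is shown only to
 * illustrate the lock_buffer/submit_bh/wait_on_buffer protocol.
 */
#if 0
static struct buffer_head *example_read_block(struct super_block *sb,
					      sector_t block)
{
	struct buffer_head *bh = sb_getblk(sb, block);

	if (!bh)
		return NULL;
	if (buffer_uptodate(bh))
		return bh;

	lock_buffer(bh);
	if (buffer_uptodate(bh)) {	/* raced with another reader */
		unlock_buffer(bh);
		return bh;
	}
	get_bh(bh);			/* reference for the end_io handler */
	bh->b_end_io = end_buffer_read_sync;
	submit_bh(READ, bh);
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return bh;
	brelse(bh);
	return NULL;
}
#endif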
1da177e4
LT
3023
3024/**
3025 * ll_rw_block: low-level access to block devices (DEPRECATED)
a7662236 3026 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
1da177e4
LT
3027 * @nr: number of &struct buffer_heads in the array
3028 * @bhs: array of pointers to &struct buffer_head
3029 *
a7662236
JK
3030 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
3031 * requests an I/O operation on them, either a %READ or a %WRITE. The third
3032 * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
3033 * are sent to disk. The fourth %READA option is described in the documentation
3034 * for generic_make_request() which ll_rw_block() calls.
1da177e4
LT
3035 *
3036 * This function drops any buffer that it cannot get a lock on (with the
a7662236
JK
3037 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
3038 * clean when doing a write request, and any buffer that appears to be
3039 * up-to-date when doing read request. Further it marks as clean buffers that
3040 * are processed for writing (the buffer cache won't assume that they are
3041 * actually clean until the buffer gets unlocked).
1da177e4
LT
3042 *
3043 * ll_rw_block sets b_end_io to simple completion handler that marks
3044 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
3045 * any waiters.
3046 *
3047 * All of the buffers must be for the same device, and must also be a
3048 * multiple of the current approved size for the device.
3049 */
3050void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
3051{
3052 int i;
3053
3054 for (i = 0; i < nr; i++) {
3055 struct buffer_head *bh = bhs[i];
3056
9cf6b720 3057 if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
a7662236 3058 lock_buffer(bh);
ca5de404 3059 else if (!trylock_buffer(bh))
1da177e4
LT
3060 continue;
3061
9cf6b720
JA
3062 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
3063 rw == SWRITE_SYNC_PLUG) {
1da177e4 3064 if (test_clear_buffer_dirty(bh)) {
76c3073a 3065 bh->b_end_io = end_buffer_write_sync;
e60e5c50 3066 get_bh(bh);
18ce3751
JA
3067 if (rw == SWRITE_SYNC)
3068 submit_bh(WRITE_SYNC, bh);
3069 else
3070 submit_bh(WRITE, bh);
1da177e4
LT
3071 continue;
3072 }
3073 } else {
1da177e4 3074 if (!buffer_uptodate(bh)) {
76c3073a 3075 bh->b_end_io = end_buffer_read_sync;
e60e5c50 3076 get_bh(bh);
1da177e4
LT
3077 submit_bh(rw, bh);
3078 continue;
3079 }
3080 }
3081 unlock_buffer(bh);
1da177e4
LT
3082 }
3083}
1fe72eaa 3084EXPORT_SYMBOL(ll_rw_block);
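/*
 * Example (illustrative sketch, not part of this file): starting reads
 * on a small batch of buffers with ll_rw_block() and then waiting for
 * them, e.g. when prefetching indirect blocks.  Buffers that are
 * already uptodate or locked are simply skipped by ll_rw_block().
 */
#if 0
static int example_read_buffers(struct buffer_head *bhs[], int nr)
{
	int i, err = 0;

	ll_rw_block(READ, nr, bhs);
	for (i = 0; i < nr; i++) {
		wait_on_buffer(bhs[i]);
		if (!buffer_uptodate(bhs[i]))
			err = -EIO;
	}
	return err;
}
#endif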
1da177e4
LT
3085
3086/*
3087 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3088 * and then start new I/O and then wait upon it. The caller must have a ref on
3089 * the buffer_head.
3090 */
3091int sync_dirty_buffer(struct buffer_head *bh)
3092{
3093 int ret = 0;
3094
3095 WARN_ON(atomic_read(&bh->b_count) < 1);
3096 lock_buffer(bh);
3097 if (test_clear_buffer_dirty(bh)) {
3098 get_bh(bh);
3099 bh->b_end_io = end_buffer_write_sync;
1aa2a7cc 3100 ret = submit_bh(WRITE_SYNC, bh);
1da177e4
LT
3101 wait_on_buffer(bh);
3102 if (buffer_eopnotsupp(bh)) {
3103 clear_buffer_eopnotsupp(bh);
3104 ret = -EOPNOTSUPP;
3105 }
3106 if (!ret && !buffer_uptodate(bh))
3107 ret = -EIO;
3108 } else {
3109 unlock_buffer(bh);
3110 }
3111 return ret;
3112}
1fe72eaa 3113EXPORT_SYMBOL(sync_dirty_buffer);
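/*
 * Example (illustrative sketch, not part of this file): the usual
 * pattern when a filesystem must know a metadata block has reached the
 * disk before proceeding (the caller is assumed to hold a reference on
 * the buffer_head, e.g. its in-core superblock buffer).
 */
#if 0
static int example_update_super(struct buffer_head *sb_bh, int do_sync)
{
	mark_buffer_dirty(sb_bh);
	if (do_sync)
		return sync_dirty_buffer(sb_bh);	/* waits for the write */
	return 0;
}
#endif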
1da177e4
LT
3114
3115/*
3116 * try_to_free_buffers() checks if all the buffers on this particular page
3117 * are unused, and releases them if so.
3118 *
3119 * Exclusion against try_to_free_buffers may be obtained by either
3120 * locking the page or by holding its mapping's private_lock.
3121 *
3122 * If the page is dirty but all the buffers are clean then we need to
3123 * be sure to mark the page clean as well. This is because the page
3124 * may be against a block device, and a later reattachment of buffers
3125 * to a dirty page will set *all* buffers dirty. Which would corrupt
3126 * filesystem data on the same device.
3127 *
3128 * The same applies to regular filesystem pages: if all the buffers are
3129 * clean then we set the page clean and proceed. To do that, we require
3130 * total exclusion from __set_page_dirty_buffers(). That is obtained with
3131 * private_lock.
3132 *
3133 * try_to_free_buffers() is non-blocking.
3134 */
3135static inline int buffer_busy(struct buffer_head *bh)
3136{
3137 return atomic_read(&bh->b_count) |
3138 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3139}
3140
3141static int
3142drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3143{
3144 struct buffer_head *head = page_buffers(page);
3145 struct buffer_head *bh;
3146
3147 bh = head;
3148 do {
de7d5a3b 3149 if (buffer_write_io_error(bh) && page->mapping)
1da177e4
LT
3150 set_bit(AS_EIO, &page->mapping->flags);
3151 if (buffer_busy(bh))
3152 goto failed;
3153 bh = bh->b_this_page;
3154 } while (bh != head);
3155
3156 do {
3157 struct buffer_head *next = bh->b_this_page;
3158
535ee2fb 3159 if (bh->b_assoc_map)
1da177e4
LT
3160 __remove_assoc_queue(bh);
3161 bh = next;
3162 } while (bh != head);
3163 *buffers_to_free = head;
3164 __clear_page_buffers(page);
3165 return 1;
3166failed:
3167 return 0;
3168}
3169
3170int try_to_free_buffers(struct page *page)
3171{
3172 struct address_space * const mapping = page->mapping;
3173 struct buffer_head *buffers_to_free = NULL;
3174 int ret = 0;
3175
3176 BUG_ON(!PageLocked(page));
ecdfc978 3177 if (PageWriteback(page))
1da177e4
LT
3178 return 0;
3179
3180 if (mapping == NULL) { /* can this still happen? */
3181 ret = drop_buffers(page, &buffers_to_free);
3182 goto out;
3183 }
3184
3185 spin_lock(&mapping->private_lock);
3186 ret = drop_buffers(page, &buffers_to_free);
ecdfc978
LT
3187
3188 /*
3189 * If the filesystem writes its buffers by hand (eg ext3)
3190 * then we can have clean buffers against a dirty page. We
3191 * clean the page here; otherwise the VM will never notice
3192 * that the filesystem did any IO at all.
3193 *
3194 * Also, during truncate, discard_buffer will have marked all
3195 * the page's buffers clean. We discover that here and clean
3196 * the page also.
87df7241
NP
3197 *
3198 * private_lock must be held over this entire operation in order
3199 * to synchronise against __set_page_dirty_buffers and prevent the
3200 * dirty bit from being lost.
ecdfc978
LT
3201 */
3202 if (ret)
3203 cancel_dirty_page(page, PAGE_CACHE_SIZE);
87df7241 3204 spin_unlock(&mapping->private_lock);
1da177e4
LT
3205out:
3206 if (buffers_to_free) {
3207 struct buffer_head *bh = buffers_to_free;
3208
3209 do {
3210 struct buffer_head *next = bh->b_this_page;
3211 free_buffer_head(bh);
3212 bh = next;
3213 } while (bh != buffers_to_free);
3214 }
3215 return ret;
3216}
3217EXPORT_SYMBOL(try_to_free_buffers);
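/*
 * Example (illustrative sketch, not part of this file): a filesystem
 * with no special journalling constraints effectively gets this as its
 * ->releasepage behaviour; an explicit wrapper would look like the
 * following (the page is locked by the caller, as required).
 */
#if 0
static int example_releasepage(struct page *page, gfp_t gfp_mask)
{
	return try_to_free_buffers(page);
}
#endif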
3218
3978d717 3219void block_sync_page(struct page *page)
1da177e4
LT
3220{
3221 struct address_space *mapping;
3222
3223 smp_mb();
3224 mapping = page_mapping(page);
3225 if (mapping)
3226 blk_run_backing_dev(mapping->backing_dev_info, page);
1da177e4 3227}
1fe72eaa 3228EXPORT_SYMBOL(block_sync_page);
1da177e4
LT
3229
3230/*
3231 * There are no bdflush tunables left. But distributions are
3232 * still running obsolete flush daemons, so we terminate them here.
3233 *
3234 * Use of bdflush() is deprecated and will be removed in a future kernel.
5b0830cb 3235 * The `flush-X' kernel threads fully replace bdflush daemons and this call.
1da177e4 3236 */
bdc480e3 3237SYSCALL_DEFINE2(bdflush, int, func, long, data)
1da177e4
LT
3238{
3239 static int msg_count;
3240
3241 if (!capable(CAP_SYS_ADMIN))
3242 return -EPERM;
3243
3244 if (msg_count < 5) {
3245 msg_count++;
3246 printk(KERN_INFO
3247 "warning: process `%s' used the obsolete bdflush"
3248 " system call\n", current->comm);
3249 printk(KERN_INFO "Fix your initscripts?\n");
3250 }
3251
3252 if (func == 1)
3253 do_exit(0);
3254 return 0;
3255}
3256
3257/*
3258 * Buffer-head allocation
3259 */
e18b890b 3260static struct kmem_cache *bh_cachep;
1da177e4
LT
3261
3262/*
3263 * Once the number of bh's in the machine exceeds this level, we start
3264 * stripping them in writeback.
3265 */
3266static int max_buffer_heads;
3267
3268int buffer_heads_over_limit;
3269
3270struct bh_accounting {
3271 int nr; /* Number of live bh's */
3272 int ratelimit; /* Limit cacheline bouncing */
3273};
3274
3275static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3276
3277static void recalc_bh_state(void)
3278{
3279 int i;
3280 int tot = 0;
3281
3282 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
3283 return;
3284 __get_cpu_var(bh_accounting).ratelimit = 0;
8a143426 3285 for_each_online_cpu(i)
1da177e4
LT
3286 tot += per_cpu(bh_accounting, i).nr;
3287 buffer_heads_over_limit = (tot > max_buffer_heads);
3288}
3289
dd0fc66f 3290struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
1da177e4 3291{
488514d1 3292 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
1da177e4 3293 if (ret) {
a35afb83 3294 INIT_LIST_HEAD(&ret->b_assoc_buffers);
736c7b80 3295 get_cpu_var(bh_accounting).nr++;
1da177e4 3296 recalc_bh_state();
736c7b80 3297 put_cpu_var(bh_accounting);
1da177e4
LT
3298 }
3299 return ret;
3300}
3301EXPORT_SYMBOL(alloc_buffer_head);
3302
3303void free_buffer_head(struct buffer_head *bh)
3304{
3305 BUG_ON(!list_empty(&bh->b_assoc_buffers));
3306 kmem_cache_free(bh_cachep, bh);
736c7b80 3307 get_cpu_var(bh_accounting).nr--;
1da177e4 3308 recalc_bh_state();
736c7b80 3309 put_cpu_var(bh_accounting);
1da177e4
LT
3310}
3311EXPORT_SYMBOL(free_buffer_head);
3312
1da177e4
LT
3313static void buffer_exit_cpu(int cpu)
3314{
3315 int i;
3316 struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3317
3318 for (i = 0; i < BH_LRU_SIZE; i++) {
3319 brelse(b->bhs[i]);
3320 b->bhs[i] = NULL;
3321 }
8a143426
ED
3322 get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
3323 per_cpu(bh_accounting, cpu).nr = 0;
3324 put_cpu_var(bh_accounting);
1da177e4
LT
3325}
3326
3327static int buffer_cpu_notify(struct notifier_block *self,
3328 unsigned long action, void *hcpu)
3329{
8bb78442 3330 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
1da177e4
LT
3331 buffer_exit_cpu((unsigned long)hcpu);
3332 return NOTIFY_OK;
3333}
1da177e4 3334
389d1b08 3335/**
a6b91919 3336 * bh_uptodate_or_lock - Test whether the buffer is uptodate
389d1b08
AK
3337 * @bh: struct buffer_head
3338 *
3339 * Return true if the buffer is up-to-date and false,
3340 * with the buffer locked, if not.
3341 */
3342int bh_uptodate_or_lock(struct buffer_head *bh)
3343{
3344 if (!buffer_uptodate(bh)) {
3345 lock_buffer(bh);
3346 if (!buffer_uptodate(bh))
3347 return 0;
3348 unlock_buffer(bh);
3349 }
3350 return 1;
3351}
3352EXPORT_SYMBOL(bh_uptodate_or_lock);
3353
3354/**
a6b91919 3355 * bh_submit_read - Submit a locked buffer for reading
389d1b08
AK
3356 * @bh: struct buffer_head
3357 *
3358 * Returns zero on success and -EIO on error.
3359 */
3360int bh_submit_read(struct buffer_head *bh)
3361{
3362 BUG_ON(!buffer_locked(bh));
3363
3364 if (buffer_uptodate(bh)) {
3365 unlock_buffer(bh);
3366 return 0;
3367 }
3368
3369 get_bh(bh);
3370 bh->b_end_io = end_buffer_read_sync;
3371 submit_bh(READ, bh);
3c72afb2 3372 wait_on_buffer(bh);
389d1b08
AK
3373 if (buffer_uptodate(bh))
3374 return 0;
3375 return -EIO;
3376}
3377EXPORT_SYMBOL(bh_submit_read);
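/*
 * Example (illustrative sketch, not part of this file): the intended
 * pairing of bh_uptodate_or_lock() and bh_submit_read() - read a block
 * only if it is not already uptodate, without racing other readers.
 */
#if 0
static int example_read_if_needed(struct buffer_head *bh)
{
	if (bh_uptodate_or_lock(bh))
		return 0;		/* already uptodate, never locked */
	return bh_submit_read(bh);	/* bh is locked; submits and waits */
}
#endif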
3378
b98938c3 3379static void
51cc5068 3380init_buffer_head(void *data)
b98938c3
CL
3381{
3382 struct buffer_head *bh = data;
3383
3384 memset(bh, 0, sizeof(*bh));
3385 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3386}
3387
1da177e4
LT
3388void __init buffer_init(void)
3389{
3390 int nrpages;
3391
b98938c3
CL
3392 bh_cachep = kmem_cache_create("buffer_head",
3393 sizeof(struct buffer_head), 0,
3394 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3395 SLAB_MEM_SPREAD),
3396 init_buffer_head);
1da177e4
LT
3397
3398 /*
3399 * Limit the bh occupancy to 10% of ZONE_NORMAL
3400 */
3401 nrpages = (nr_free_buffer_pages() * 10) / 100;
3402 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3403 hotcpu_notifier(buffer_cpu_notify, 0);
3404}