ext2: async get_block and support code
1/*
2 * linux/fs/buffer.c
3 *
4 * Copyright (C) 1991, 1992, 2002 Linus Torvalds
5 */
6
7/*
8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9 *
10 * Removed a lot of unnecessary code and simplified things now that
11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12 *
13 * Speed up hash, lru, and free list operations. Use gfp() for allocating
14 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
15 *
16 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
17 *
18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19 */
20
21#include <linux/kernel.h>
22#include <linux/syscalls.h>
23#include <linux/fs.h>
24#include <linux/mm.h>
25#include <linux/percpu.h>
26#include <linux/slab.h>
16f7e0fe 27#include <linux/capability.h>
28#include <linux/blkdev.h>
29#include <linux/file.h>
30#include <linux/quotaops.h>
31#include <linux/highmem.h>
32#include <linux/module.h>
33#include <linux/writeback.h>
34#include <linux/hash.h>
35#include <linux/suspend.h>
36#include <linux/buffer_head.h>
55e829af 37#include <linux/task_io_accounting_ops.h>
38#include <linux/bio.h>
39#include <linux/notifier.h>
40#include <linux/cpu.h>
41#include <linux/bitops.h>
42#include <linux/mpage.h>
fb1c8f93 43#include <linux/bit_spinlock.h>
44
45static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
46
47#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
48
49inline void
50init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
51{
52 bh->b_end_io = handler;
53 bh->b_private = private;
54}
1fe72eaa 55EXPORT_SYMBOL(init_buffer);
56
57static int sync_buffer(void *word)
58{
59 struct block_device *bd;
60 struct buffer_head *bh
61 = container_of(word, struct buffer_head, b_state);
62
63 smp_mb();
64 bd = bh->b_bdev;
65 if (bd)
66 blk_run_address_space(bd->bd_inode->i_mapping);
67 if (!in_aio(current))
68 io_schedule();
69 return 0;
70}
71
fc9b52cd 72void __lock_buffer(struct buffer_head *bh)
73{
74 wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
75 TASK_UNINTERRUPTIBLE);
76}
77EXPORT_SYMBOL(__lock_buffer);
78
79int __lock_buffer_async(struct buffer_head *bh, struct wait_bit_queue *wait)
80{
81 return wait_on_bit_lock_async(&bh->b_state, BH_Lock, sync_buffer,
82 TASK_UNINTERRUPTIBLE, wait);
83}
84EXPORT_SYMBOL(__lock_buffer_async);
85
fc9b52cd 86void unlock_buffer(struct buffer_head *bh)
1da177e4 87{
51b07fc3 88 clear_bit_unlock(BH_Lock, &bh->b_state);
89 smp_mb__after_clear_bit();
90 wake_up_bit(&bh->b_state, BH_Lock);
91}
1fe72eaa 92EXPORT_SYMBOL(unlock_buffer);
93
94/*
95 * Block until a buffer comes unlocked. This doesn't stop it
96 * from becoming locked again - you have to lock it yourself
97 * if you want to preserve its state.
98 */
99void __wait_on_buffer(struct buffer_head * bh)
100{
101 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
102}
1fe72eaa 103EXPORT_SYMBOL(__wait_on_buffer);
1da177e4 104
105int __wait_on_buffer_async(struct buffer_head *bh, struct wait_bit_queue *wait)
106{
107 return wait_on_bit_async(&bh->b_state, BH_Lock, sync_buffer,
108 TASK_UNINTERRUPTIBLE, wait);
109}
110EXPORT_SYMBOL(__wait_on_buffer_async);
111
112static void
113__clear_page_buffers(struct page *page)
114{
115 ClearPagePrivate(page);
4c21e2f2 116 set_page_private(page, 0);
117 page_cache_release(page);
118}
119
120
121static int quiet_error(struct buffer_head *bh)
122{
123 if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
124 return 0;
125 return 1;
126}
127
128
129static void buffer_io_error(struct buffer_head *bh)
130{
131 char b[BDEVNAME_SIZE];
132 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
133 bdevname(bh->b_bdev, b),
134 (unsigned long long)bh->b_blocknr);
135}
136
137/*
138 * End-of-IO handler helper function which does not touch the bh after
139 * unlocking it.
140 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 141 * a race there is benign: unlock_buffer() only uses the bh's address for
142 * hashing after unlocking the buffer, so it doesn't actually touch the bh
143 * itself.
1da177e4 144 */
68671f35 145static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
146{
147 if (uptodate) {
148 set_buffer_uptodate(bh);
149 } else {
150 /* This happens, due to failed READA attempts. */
151 clear_buffer_uptodate(bh);
152 }
153 unlock_buffer(bh);
154}
155
156/*
157 * Default synchronous end-of-IO handler.. Just mark it up-to-date and
158 * unlock the buffer. This is what ll_rw_block uses too.
159 */
160void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
161{
162 __end_buffer_read_notouch(bh, uptodate);
163 put_bh(bh);
164}
1fe72eaa 165EXPORT_SYMBOL(end_buffer_read_sync);
166
167void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
168{
169 char b[BDEVNAME_SIZE];
170
171 if (uptodate) {
172 set_buffer_uptodate(bh);
173 } else {
08bafc03 174 if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) {
175 buffer_io_error(bh);
176 printk(KERN_WARNING "lost page write due to "
177 "I/O error on %s\n",
178 bdevname(bh->b_bdev, b));
179 }
180 set_buffer_write_io_error(bh);
181 clear_buffer_uptodate(bh);
182 }
183 unlock_buffer(bh);
184 put_bh(bh);
185}
1fe72eaa 186EXPORT_SYMBOL(end_buffer_write_sync);
1da177e4 187
188/*
189 * Various filesystems appear to want __find_get_block to be non-blocking.
190 * But it's the page lock which protects the buffers. To get around this,
191 * we get exclusion from try_to_free_buffers with the blockdev mapping's
192 * private_lock.
193 *
194 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
195 * may be quite high. This code could TryLock the page, and if that
196 * succeeds, there is no need to take private_lock. (But if
197 * private_lock is contended then so is mapping->tree_lock).
198 */
199static struct buffer_head *
385fd4c5 200__find_get_block_slow(struct block_device *bdev, sector_t block)
201{
202 struct inode *bd_inode = bdev->bd_inode;
203 struct address_space *bd_mapping = bd_inode->i_mapping;
204 struct buffer_head *ret = NULL;
205 pgoff_t index;
206 struct buffer_head *bh;
207 struct buffer_head *head;
208 struct page *page;
209 int all_mapped = 1;
210
211 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
212 page = find_get_page(bd_mapping, index);
213 if (!page)
214 goto out;
215
216 spin_lock(&bd_mapping->private_lock);
217 if (!page_has_buffers(page))
218 goto out_unlock;
219 head = page_buffers(page);
220 bh = head;
221 do {
222 if (!buffer_mapped(bh))
223 all_mapped = 0;
224 else if (bh->b_blocknr == block) {
225 ret = bh;
226 get_bh(bh);
227 goto out_unlock;
228 }
229 bh = bh->b_this_page;
230 } while (bh != head);
231
232 /* we might be here because some of the buffers on this page are
233 * not mapped. This is due to various races between
234 * file io on the block device and getblk. It gets dealt with
235 * elsewhere, don't buffer_error if we had some unmapped buffers
236 */
237 if (all_mapped) {
238 printk("__find_get_block_slow() failed. "
239 "block=%llu, b_blocknr=%llu\n",
240 (unsigned long long)block,
241 (unsigned long long)bh->b_blocknr);
242 printk("b_state=0x%08lx, b_size=%zu\n",
243 bh->b_state, bh->b_size);
244 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
245 }
246out_unlock:
247 spin_unlock(&bd_mapping->private_lock);
248 page_cache_release(page);
249out:
250 return ret;
251}
252
253/* If invalidate_buffers() will trash dirty buffers, it means some kind
 254 of fs corruption is going on. Trashing dirty data always implies losing
255 information that was supposed to be just stored on the physical layer
256 by the user.
257
 258 Thus invalidate_buffers in general usage is not allowed to trash
 259 dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
260 be preserved. These buffers are simply skipped.
261
262 We also skip buffers which are still in use. For example this can
263 happen if a userspace program is reading the block device.
264
 265 NOTE: if the user removed a removable-media disk while there was still
 266 dirty data not yet synced to disk (due to a bug in the device driver or
 267 to a user error), then by not destroying the dirty buffers we could
 268 corrupt the next media inserted as well; thus a parameter is necessary
 269 to handle this case in the safest way possible (trying not to corrupt
 270 the newly inserted disk with data belonging to the old, now corrupted,
 271 one). For a ramdisk, on the other hand, the natural way to release the
 272 ramdisk memory is precisely to destroy its dirty buffers.
273
 274 These are two special cases. Normal usage implies that the device
 275 driver issues a sync on the device (without waiting for I/O completion)
 276 and then an invalidate_buffers call that doesn't trash dirty buffers.
277
 278 For handling cache coherency with the blkdev pagecache the 'update' case
 279 has been introduced. It is needed to re-read from disk any pinned
280 buffer. NOTE: re-reading from disk is destructive so we can do it only
281 when we assume nobody is changing the buffercache under our I/O and when
282 we think the disk contains more recent information than the buffercache.
283 The update == 1 pass marks the buffers we need to update, the update == 2
284 pass does the actual I/O. */
f98393a6 285void invalidate_bdev(struct block_device *bdev)
1da177e4 286{
287 struct address_space *mapping = bdev->bd_inode->i_mapping;
288
289 if (mapping->nrpages == 0)
290 return;
291
1da177e4 292 invalidate_bh_lrus();
fc0ecff6 293 invalidate_mapping_pages(mapping, 0, -1);
1da177e4 294}
1fe72eaa 295EXPORT_SYMBOL(invalidate_bdev);
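/*
 * Illustrative sketch (not part of this file): the "normal usage" described
 * in the comment above - sync the device first, then drop the clean buffers.
 * "my_handle_media_change" is a hypothetical caller; fsync_bdev() and
 * invalidate_bdev() are the real interfaces.
 *
 *	static void my_handle_media_change(struct block_device *bdev)
 *	{
 *		fsync_bdev(bdev);	(write back dirty data first)
 *		invalidate_bdev(bdev);	(then drop clean, unpinned pagecache)
 *	}
 */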
296
297/*
5b0830cb 298 * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
299 */
300static void free_more_memory(void)
301{
19770b32 302 struct zone *zone;
0e88460d 303 int nid;
1da177e4 304
03ba3782 305 wakeup_flusher_threads(1024);
306 yield();
307
0e88460d 308 for_each_online_node(nid) {
309 (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
310 gfp_zone(GFP_NOFS), NULL,
311 &zone);
312 if (zone)
54a6eb5c 313 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
327c0e96 314 GFP_NOFS, NULL);
315 }
316}
317
318/*
319 * I/O completion handler for block_read_full_page() - pages
320 * which come unlocked at the end of I/O.
321 */
322static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
323{
1da177e4 324 unsigned long flags;
a3972203 325 struct buffer_head *first;
326 struct buffer_head *tmp;
327 struct page *page;
328 int page_uptodate = 1;
329
330 BUG_ON(!buffer_async_read(bh));
331
332 page = bh->b_page;
333 if (uptodate) {
334 set_buffer_uptodate(bh);
335 } else {
336 clear_buffer_uptodate(bh);
08bafc03 337 if (!quiet_error(bh))
338 buffer_io_error(bh);
339 SetPageError(page);
340 }
341
342 /*
343 * Be _very_ careful from here on. Bad things can happen if
344 * two buffer heads end IO at almost the same time and both
345 * decide that the page is now completely done.
346 */
347 first = page_buffers(page);
348 local_irq_save(flags);
349 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
350 clear_buffer_async_read(bh);
351 unlock_buffer(bh);
352 tmp = bh;
353 do {
354 if (!buffer_uptodate(tmp))
355 page_uptodate = 0;
356 if (buffer_async_read(tmp)) {
357 BUG_ON(!buffer_locked(tmp));
358 goto still_busy;
359 }
360 tmp = tmp->b_this_page;
361 } while (tmp != bh);
362 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
363 local_irq_restore(flags);
364
365 /*
366 * If none of the buffers had errors and they are all
367 * uptodate then we can set the page uptodate.
368 */
369 if (page_uptodate && !PageError(page))
370 SetPageUptodate(page);
371 unlock_page(page);
372 return;
373
374still_busy:
375 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
376 local_irq_restore(flags);
377 return;
378}
379
380/*
381 * Completion handler for block_write_full_page() - pages which are unlocked
382 * during I/O, and which have PageWriteback cleared upon I/O completion.
383 */
35c80d5f 384void end_buffer_async_write(struct buffer_head *bh, int uptodate)
385{
386 char b[BDEVNAME_SIZE];
1da177e4 387 unsigned long flags;
a3972203 388 struct buffer_head *first;
389 struct buffer_head *tmp;
390 struct page *page;
391
392 BUG_ON(!buffer_async_write(bh));
393
394 page = bh->b_page;
395 if (uptodate) {
396 set_buffer_uptodate(bh);
397 } else {
08bafc03 398 if (!quiet_error(bh)) {
399 buffer_io_error(bh);
400 printk(KERN_WARNING "lost page write due to "
401 "I/O error on %s\n",
402 bdevname(bh->b_bdev, b));
403 }
404 set_bit(AS_EIO, &page->mapping->flags);
58ff407b 405 set_buffer_write_io_error(bh);
406 clear_buffer_uptodate(bh);
407 SetPageError(page);
408 }
409
410 first = page_buffers(page);
411 local_irq_save(flags);
412 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
413
414 clear_buffer_async_write(bh);
415 unlock_buffer(bh);
416 tmp = bh->b_this_page;
417 while (tmp != bh) {
418 if (buffer_async_write(tmp)) {
419 BUG_ON(!buffer_locked(tmp));
420 goto still_busy;
421 }
422 tmp = tmp->b_this_page;
423 }
424 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
425 local_irq_restore(flags);
426 end_page_writeback(page);
427 return;
428
429still_busy:
430 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
431 local_irq_restore(flags);
432 return;
433}
1fe72eaa 434EXPORT_SYMBOL(end_buffer_async_write);
435
436/*
437 * If a page's buffers are under async readin (end_buffer_async_read
438 * completion) then there is a possibility that another thread of
439 * control could lock one of the buffers after it has completed
440 * but while some of the other buffers have not completed. This
441 * locked buffer would confuse end_buffer_async_read() into not unlocking
442 * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
443 * that this buffer is not under async I/O.
444 *
445 * The page comes unlocked when it has no locked buffer_async buffers
446 * left.
447 *
448 * PageLocked prevents anyone starting new async I/O reads any of
449 * the buffers.
450 *
451 * PageWriteback is used to prevent simultaneous writeout of the same
452 * page.
453 *
454 * PageLocked prevents anyone from starting writeback of a page which is
455 * under read I/O (PageWriteback is only ever set against a locked page).
456 */
457static void mark_buffer_async_read(struct buffer_head *bh)
458{
459 bh->b_end_io = end_buffer_async_read;
460 set_buffer_async_read(bh);
461}
462
463static void mark_buffer_async_write_endio(struct buffer_head *bh,
464 bh_end_io_t *handler)
1da177e4 465{
35c80d5f 466 bh->b_end_io = handler;
467 set_buffer_async_write(bh);
468}
469
470void mark_buffer_async_write(struct buffer_head *bh)
471{
472 mark_buffer_async_write_endio(bh, end_buffer_async_write);
473}
474EXPORT_SYMBOL(mark_buffer_async_write);
475
476
477/*
478 * fs/buffer.c contains helper functions for buffer-backed address space's
479 * fsync functions. A common requirement for buffer-based filesystems is
480 * that certain data from the backing blockdev needs to be written out for
481 * a successful fsync(). For example, ext2 indirect blocks need to be
482 * written back and waited upon before fsync() returns.
483 *
 484 * The functions mark_buffer_dirty_inode(), fsync_inode_buffers(),
485 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
486 * management of a list of dependent buffers at ->i_mapping->private_list.
487 *
488 * Locking is a little subtle: try_to_free_buffers() will remove buffers
489 * from their controlling inode's queue when they are being freed. But
490 * try_to_free_buffers() will be operating against the *blockdev* mapping
491 * at the time, not against the S_ISREG file which depends on those buffers.
492 * So the locking for private_list is via the private_lock in the address_space
493 * which backs the buffers. Which is different from the address_space
494 * against which the buffers are listed. So for a particular address_space,
495 * mapping->private_lock does *not* protect mapping->private_list! In fact,
496 * mapping->private_list will always be protected by the backing blockdev's
497 * ->private_lock.
498 *
499 * Which introduces a requirement: all buffers on an address_space's
500 * ->private_list must be from the same address_space: the blockdev's.
501 *
502 * address_spaces which do not place buffers at ->private_list via these
503 * utility functions are free to use private_lock and private_list for
504 * whatever they want. The only requirement is that list_empty(private_list)
505 * be true at clear_inode() time.
506 *
507 * FIXME: clear_inode should not call invalidate_inode_buffers(). The
508 * filesystems should do that. invalidate_inode_buffers() should just go
509 * BUG_ON(!list_empty).
510 *
511 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
512 * take an address_space, not an inode. And it should be called
513 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
514 * queued up.
515 *
516 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
517 * list if it is already on a list. Because if the buffer is on a list,
518 * it *must* already be on the right one. If not, the filesystem is being
519 * silly. This will save a ton of locking. But first we have to ensure
520 * that buffers are taken *off* the old inode's list when they are freed
521 * (presumably in truncate). That requires careful auditing of all
522 * filesystems (do it inside bforget()). It could also be done by bringing
523 * b_inode back.
524 */
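/*
 * Illustrative sketch (not part of this file): how a filesystem might use
 * the helpers described above.  "myfs_read_indirect" is hypothetical;
 * mark_buffer_dirty_inode() and sync_mapping_buffers() are the real helpers
 * defined below.
 *
 *	When dirtying a metadata (e.g. indirect) block on behalf of an inode:
 *
 *		struct buffer_head *bh = myfs_read_indirect(inode, block);
 *		... modify bh->b_data ...
 *		mark_buffer_dirty_inode(bh, inode);
 *		brelse(bh);
 *
 *	Then the filesystem's ->fsync() can simply do:
 *
 *		err = sync_mapping_buffers(inode->i_mapping);
 *
 *	which writes out and waits upon everything that was queued on
 *	inode->i_mapping->private_list.
 */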
525
526/*
527 * The buffer's backing address_space's private_lock must be held
528 */
dbacefc9 529static void __remove_assoc_queue(struct buffer_head *bh)
530{
531 list_del_init(&bh->b_assoc_buffers);
532 WARN_ON(!bh->b_assoc_map);
533 if (buffer_write_io_error(bh))
534 set_bit(AS_EIO, &bh->b_assoc_map->flags);
535 bh->b_assoc_map = NULL;
536}
537
538int inode_has_buffers(struct inode *inode)
539{
540 return !list_empty(&inode->i_data.private_list);
541}
542
543/*
544 * osync is designed to support O_SYNC io. It waits synchronously for
545 * all already-submitted IO to complete, but does not queue any new
546 * writes to the disk.
547 *
548 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
549 * you dirty the buffers, and then use osync_inode_buffers to wait for
550 * completion. Any other dirty buffers which are not yet queued for
551 * write will not be flushed to disk by the osync.
552 */
553static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
554{
555 struct buffer_head *bh;
556 struct list_head *p;
557 int err = 0;
558
559 spin_lock(lock);
560repeat:
561 list_for_each_prev(p, list) {
562 bh = BH_ENTRY(p);
563 if (buffer_locked(bh)) {
564 get_bh(bh);
565 spin_unlock(lock);
566 wait_on_buffer(bh);
567 if (!buffer_uptodate(bh))
568 err = -EIO;
569 brelse(bh);
570 spin_lock(lock);
571 goto repeat;
572 }
573 }
574 spin_unlock(lock);
575 return err;
576}
577
1fe72eaa 578static void do_thaw_all(struct work_struct *work)
579{
580 struct super_block *sb;
581 char b[BDEVNAME_SIZE];
582
583 spin_lock(&sb_lock);
584restart:
585 list_for_each_entry(sb, &super_blocks, s_list) {
586 sb->s_count++;
587 spin_unlock(&sb_lock);
588 down_read(&sb->s_umount);
589 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
590 printk(KERN_WARNING "Emergency Thaw on %s\n",
591 bdevname(sb->s_bdev, b));
592 up_read(&sb->s_umount);
593 spin_lock(&sb_lock);
594 if (__put_super_and_need_restart(sb))
595 goto restart;
596 }
597 spin_unlock(&sb_lock);
053c525f 598 kfree(work);
599 printk(KERN_WARNING "Emergency Thaw complete\n");
600}
601
602/**
603 * emergency_thaw_all -- forcibly thaw every frozen filesystem
604 *
605 * Used for emergency unfreeze of all filesystems via SysRq
606 */
607void emergency_thaw_all(void)
608{
609 struct work_struct *work;
610
611 work = kmalloc(sizeof(*work), GFP_ATOMIC);
612 if (work) {
613 INIT_WORK(work, do_thaw_all);
614 schedule_work(work);
615 }
616}
617
1da177e4 618/**
78a4a50a 619 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
67be2dd1 620 * @mapping: the mapping which wants those buffers written
621 *
622 * Starts I/O against the buffers at mapping->private_list, and waits upon
623 * that I/O.
624 *
625 * Basically, this is a convenience function for fsync().
626 * @mapping is a file or directory which needs those buffers to be written for
627 * a successful fsync().
628 */
629int sync_mapping_buffers(struct address_space *mapping)
630{
631 struct address_space *buffer_mapping = mapping->assoc_mapping;
632
633 if (buffer_mapping == NULL || list_empty(&mapping->private_list))
634 return 0;
635
636 return fsync_buffers_list(&buffer_mapping->private_lock,
637 &mapping->private_list);
638}
639EXPORT_SYMBOL(sync_mapping_buffers);
640
641/*
642 * Called when we've recently written block `bblock', and it is known that
643 * `bblock' was for a buffer_boundary() buffer. This means that the block at
644 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
645 * dirty, schedule it for IO. So that indirects merge nicely with their data.
646 */
647void write_boundary_block(struct block_device *bdev,
648 sector_t bblock, unsigned blocksize)
649{
650 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
651 if (bh) {
652 if (buffer_dirty(bh))
653 ll_rw_block(WRITE, 1, &bh);
654 put_bh(bh);
655 }
656}
657
658void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
659{
660 struct address_space *mapping = inode->i_mapping;
661 struct address_space *buffer_mapping = bh->b_page->mapping;
662
663 mark_buffer_dirty(bh);
664 if (!mapping->assoc_mapping) {
665 mapping->assoc_mapping = buffer_mapping;
666 } else {
e827f923 667 BUG_ON(mapping->assoc_mapping != buffer_mapping);
1da177e4 668 }
535ee2fb 669 if (!bh->b_assoc_map) {
670 spin_lock(&buffer_mapping->private_lock);
671 list_move_tail(&bh->b_assoc_buffers,
672 &mapping->private_list);
58ff407b 673 bh->b_assoc_map = mapping;
674 spin_unlock(&buffer_mapping->private_lock);
675 }
676}
677EXPORT_SYMBOL(mark_buffer_dirty_inode);
678
679/*
680 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
681 * dirty.
682 *
683 * If warn is true, then emit a warning if the page is not uptodate and has
684 * not been truncated.
685 */
a8e7d49a 686static void __set_page_dirty(struct page *page,
687 struct address_space *mapping, int warn)
688{
19fd6231 689 spin_lock_irq(&mapping->tree_lock);
690 if (page->mapping) { /* Race with truncate? */
691 WARN_ON_ONCE(warn && !PageUptodate(page));
e3a7cca1 692 account_page_dirtied(page, mapping);
693 radix_tree_tag_set(&mapping->page_tree,
694 page_index(page), PAGECACHE_TAG_DIRTY);
695 }
19fd6231 696 spin_unlock_irq(&mapping->tree_lock);
787d2214 697 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
698}
699
700/*
701 * Add a page to the dirty page list.
702 *
703 * It is a sad fact of life that this function is called from several places
704 * deeply under spinlocking. It may not sleep.
705 *
706 * If the page has buffers, the uptodate buffers are set dirty, to preserve
 707 * dirty-state coherency between the page and the buffers. If the page does
708 * not have buffers then when they are later attached they will all be set
709 * dirty.
710 *
711 * The buffers are dirtied before the page is dirtied. There's a small race
712 * window in which a writepage caller may see the page cleanness but not the
713 * buffer dirtiness. That's fine. If this code were to set the page dirty
714 * before the buffers, a concurrent writepage caller could clear the page dirty
715 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
716 * page on the dirty page list.
717 *
718 * We use private_lock to lock against try_to_free_buffers while using the
719 * page's buffer list. Also use this to protect against clean buffers being
720 * added to the page after it was set dirty.
721 *
722 * FIXME: may need to call ->reservepage here as well. That's rather up to the
723 * address_space though.
724 */
725int __set_page_dirty_buffers(struct page *page)
726{
a8e7d49a 727 int newly_dirty;
787d2214 728 struct address_space *mapping = page_mapping(page);
729
730 if (unlikely(!mapping))
731 return !TestSetPageDirty(page);
732
733 spin_lock(&mapping->private_lock);
734 if (page_has_buffers(page)) {
735 struct buffer_head *head = page_buffers(page);
736 struct buffer_head *bh = head;
737
738 do {
739 set_buffer_dirty(bh);
740 bh = bh->b_this_page;
741 } while (bh != head);
742 }
a8e7d49a 743 newly_dirty = !TestSetPageDirty(page);
744 spin_unlock(&mapping->private_lock);
745
746 if (newly_dirty)
747 __set_page_dirty(page, mapping, 1);
748 return newly_dirty;
749}
750EXPORT_SYMBOL(__set_page_dirty_buffers);
751
752/*
753 * Write out and wait upon a list of buffers.
754 *
755 * We have conflicting pressures: we want to make sure that all
756 * initially dirty buffers get waited on, but that any subsequently
757 * dirtied buffers don't. After all, we don't want fsync to last
758 * forever if somebody is actively writing to the file.
759 *
760 * Do this in two main stages: first we copy dirty buffers to a
761 * temporary inode list, queueing the writes as we go. Then we clean
762 * up, waiting for those writes to complete.
763 *
764 * During this second stage, any subsequent updates to the file may end
765 * up refiling the buffer on the original inode's dirty list again, so
766 * there is a chance we will end up with a buffer queued for write but
767 * not yet completed on that list. So, as a final cleanup we go through
768 * the osync code to catch these locked, dirty buffers without requeuing
769 * any newly dirty buffers for write.
770 */
771static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
772{
773 struct buffer_head *bh;
774 struct list_head tmp;
9cf6b720 775 struct address_space *mapping, *prev_mapping = NULL;
776 int err = 0, err2;
777
778 INIT_LIST_HEAD(&tmp);
779
780 spin_lock(lock);
781 while (!list_empty(list)) {
782 bh = BH_ENTRY(list->next);
535ee2fb 783 mapping = bh->b_assoc_map;
58ff407b 784 __remove_assoc_queue(bh);
785 /* Avoid race with mark_buffer_dirty_inode() which does
786 * a lockless check and we rely on seeing the dirty bit */
787 smp_mb();
788 if (buffer_dirty(bh) || buffer_locked(bh)) {
789 list_add(&bh->b_assoc_buffers, &tmp);
535ee2fb 790 bh->b_assoc_map = mapping;
791 if (buffer_dirty(bh)) {
792 get_bh(bh);
793 spin_unlock(lock);
794 /*
795 * Ensure any pending I/O completes so that
796 * ll_rw_block() actually writes the current
797 * contents - it is a noop if I/O is still in
798 * flight on potentially older contents.
799 */
800 ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh);
801
802 /*
803 * Kick off IO for the previous mapping. Note
804 * that we will not run the very last mapping,
805 * wait_on_buffer() will do that for us
806 * through sync_buffer().
807 */
808 if (prev_mapping && prev_mapping != mapping)
809 blk_run_address_space(prev_mapping);
810 prev_mapping = mapping;
811
812 brelse(bh);
813 spin_lock(lock);
814 }
815 }
816 }
817
818 while (!list_empty(&tmp)) {
819 bh = BH_ENTRY(tmp.prev);
1da177e4 820 get_bh(bh);
821 mapping = bh->b_assoc_map;
822 __remove_assoc_queue(bh);
823 /* Avoid race with mark_buffer_dirty_inode() which does
824 * a lockless check and we rely on seeing the dirty bit */
825 smp_mb();
826 if (buffer_dirty(bh)) {
827 list_add(&bh->b_assoc_buffers,
e3892296 828 &mapping->private_list);
829 bh->b_assoc_map = mapping;
830 }
831 spin_unlock(lock);
832 wait_on_buffer(bh);
833 if (!buffer_uptodate(bh))
834 err = -EIO;
835 brelse(bh);
836 spin_lock(lock);
837 }
838
839 spin_unlock(lock);
840 err2 = osync_buffers_list(lock, list);
841 if (err)
842 return err;
843 else
844 return err2;
845}
846
847/*
848 * Invalidate any and all dirty buffers on a given inode. We are
849 * probably unmounting the fs, but that doesn't mean we have already
850 * done a sync(). Just drop the buffers from the inode list.
851 *
852 * NOTE: we take the inode's blockdev's mapping's private_lock. Which
853 * assumes that all the buffers are against the blockdev. Not true
854 * for reiserfs.
855 */
856void invalidate_inode_buffers(struct inode *inode)
857{
858 if (inode_has_buffers(inode)) {
859 struct address_space *mapping = &inode->i_data;
860 struct list_head *list = &mapping->private_list;
861 struct address_space *buffer_mapping = mapping->assoc_mapping;
862
863 spin_lock(&buffer_mapping->private_lock);
864 while (!list_empty(list))
865 __remove_assoc_queue(BH_ENTRY(list->next));
866 spin_unlock(&buffer_mapping->private_lock);
867 }
868}
52b19ac9 869EXPORT_SYMBOL(invalidate_inode_buffers);
870
871/*
872 * Remove any clean buffers from the inode's buffer list. This is called
873 * when we're trying to free the inode itself. Those buffers can pin it.
874 *
875 * Returns true if all buffers were removed.
876 */
877int remove_inode_buffers(struct inode *inode)
878{
879 int ret = 1;
880
881 if (inode_has_buffers(inode)) {
882 struct address_space *mapping = &inode->i_data;
883 struct list_head *list = &mapping->private_list;
884 struct address_space *buffer_mapping = mapping->assoc_mapping;
885
886 spin_lock(&buffer_mapping->private_lock);
887 while (!list_empty(list)) {
888 struct buffer_head *bh = BH_ENTRY(list->next);
889 if (buffer_dirty(bh)) {
890 ret = 0;
891 break;
892 }
893 __remove_assoc_queue(bh);
894 }
895 spin_unlock(&buffer_mapping->private_lock);
896 }
897 return ret;
898}
899
900/*
901 * Create the appropriate buffers when given a page for data area and
902 * the size of each buffer.. Use the bh->b_this_page linked list to
903 * follow the buffers created. Return NULL if unable to create more
904 * buffers.
905 *
906 * The retry flag is used to differentiate async IO (paging, swapping)
907 * which may not fail from ordinary buffer allocations.
908 */
909struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
910 int retry)
911{
912 struct buffer_head *bh, *head;
913 long offset;
914
915try_again:
916 head = NULL;
917 offset = PAGE_SIZE;
918 while ((offset -= size) >= 0) {
919 bh = alloc_buffer_head(GFP_NOFS);
920 if (!bh)
921 goto no_grow;
922
923 bh->b_bdev = NULL;
924 bh->b_this_page = head;
925 bh->b_blocknr = -1;
926 head = bh;
927
928 bh->b_state = 0;
929 atomic_set(&bh->b_count, 0);
fc5cd582 930 bh->b_private = NULL;
931 bh->b_size = size;
932
933 /* Link the buffer to its page */
934 set_bh_page(bh, page, offset);
935
01ffe339 936 init_buffer(bh, NULL, NULL);
937 }
938 return head;
939/*
940 * In case anything failed, we just free everything we got.
941 */
942no_grow:
943 if (head) {
944 do {
945 bh = head;
946 head = head->b_this_page;
947 free_buffer_head(bh);
948 } while (head);
949 }
950
951 /*
952 * Return failure for non-async IO requests. Async IO requests
953 * are not allowed to fail, so we have to wait until buffer heads
954 * become available. But we don't want tasks sleeping with
955 * partially complete buffers, so all were released above.
956 */
957 if (!retry)
958 return NULL;
959
960 /* We're _really_ low on memory. Now we just
961 * wait for old buffer heads to become free due to
962 * finishing IO. Since this is an async request and
963 * the reserve list is empty, we're sure there are
964 * async buffer heads in use.
965 */
966 free_more_memory();
967 goto try_again;
968}
969EXPORT_SYMBOL_GPL(alloc_page_buffers);
970
971static inline void
972link_dev_buffers(struct page *page, struct buffer_head *head)
973{
974 struct buffer_head *bh, *tail;
975
976 bh = head;
977 do {
978 tail = bh;
979 bh = bh->b_this_page;
980 } while (bh);
981 tail->b_this_page = head;
982 attach_page_buffers(page, head);
983}
984
985/*
986 * Initialise the state of a blockdev page's buffers.
987 */
988static void
989init_page_buffers(struct page *page, struct block_device *bdev,
990 sector_t block, int size)
991{
992 struct buffer_head *head = page_buffers(page);
993 struct buffer_head *bh = head;
994 int uptodate = PageUptodate(page);
995
996 do {
997 if (!buffer_mapped(bh)) {
998 init_buffer(bh, NULL, NULL);
999 bh->b_bdev = bdev;
1000 bh->b_blocknr = block;
1001 if (uptodate)
1002 set_buffer_uptodate(bh);
1003 set_buffer_mapped(bh);
1004 }
1005 block++;
1006 bh = bh->b_this_page;
1007 } while (bh != head);
1008}
1009
1010/*
1011 * Create the page-cache page that contains the requested block.
1012 *
 1013 * This is used purely for blockdev mappings.
1014 */
1015static struct page *
1016grow_dev_page(struct block_device *bdev, sector_t block,
1017 pgoff_t index, int size)
1018{
1019 struct inode *inode = bdev->bd_inode;
1020 struct page *page;
1021 struct buffer_head *bh;
1022
ea125892 1023 page = find_or_create_page(inode->i_mapping, index,
769848c0 1024 (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
1025 if (!page)
1026 return NULL;
1027
e827f923 1028 BUG_ON(!PageLocked(page));
1029
1030 if (page_has_buffers(page)) {
1031 bh = page_buffers(page);
1032 if (bh->b_size == size) {
1033 init_page_buffers(page, bdev, block, size);
1034 return page;
1035 }
1036 if (!try_to_free_buffers(page))
1037 goto failed;
1038 }
1039
1040 /*
1041 * Allocate some buffers for this page
1042 */
1043 bh = alloc_page_buffers(page, size, 0);
1044 if (!bh)
1045 goto failed;
1046
1047 /*
1048 * Link the page to the buffers and initialise them. Take the
1049 * lock to be atomic wrt __find_get_block(), which does not
1050 * run under the page lock.
1051 */
1052 spin_lock(&inode->i_mapping->private_lock);
1053 link_dev_buffers(page, bh);
1054 init_page_buffers(page, bdev, block, size);
1055 spin_unlock(&inode->i_mapping->private_lock);
1056 return page;
1057
1058failed:
1059 BUG();
1060 unlock_page(page);
1061 page_cache_release(page);
1062 return NULL;
1063}
1064
1065/*
1066 * Create buffers for the specified block device block's page. If
1067 * that page was dirty, the buffers are set dirty also.
1da177e4 1068 */
858119e1 1069static int
1da177e4
LT
1070grow_buffers(struct block_device *bdev, sector_t block, int size)
1071{
1072 struct page *page;
1073 pgoff_t index;
1074 int sizebits;
1075
1076 sizebits = -1;
1077 do {
1078 sizebits++;
1079 } while ((size << sizebits) < PAGE_SIZE);
1080
1081 index = block >> sizebits;
1da177e4 1082
e5657933
AM
1083 /*
1084 * Check for a block which wants to lie outside our maximum possible
1085 * pagecache index. (this comparison is done using sector_t types).
1086 */
1087 if (unlikely(index != block >> sizebits)) {
1088 char b[BDEVNAME_SIZE];
1089
1090 printk(KERN_ERR "%s: requested out-of-range block %llu for "
1091 "device %s\n",
8e24eea7 1092 __func__, (unsigned long long)block,
1093 bdevname(bdev, b));
1094 return -EIO;
1095 }
1096 block = index << sizebits;
1097 /* Create a page with the proper size buffers.. */
1098 page = grow_dev_page(bdev, block, index, size);
1099 if (!page)
1100 return 0;
1101 unlock_page(page);
1102 page_cache_release(page);
1103 return 1;
1104}
1105
75c96f85 1106static struct buffer_head *
1107__getblk_slow(struct block_device *bdev, sector_t block, int size)
1108{
1109 /* Size must be multiple of hard sectorsize */
e1defc4f 1110 if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1111 (size < 512 || size > PAGE_SIZE))) {
1112 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1113 size);
1114 printk(KERN_ERR "logical block size: %d\n",
1115 bdev_logical_block_size(bdev));
1116
1117 dump_stack();
1118 return NULL;
1119 }
1120
1121 for (;;) {
1122 struct buffer_head * bh;
e5657933 1123 int ret;
1124
1125 bh = __find_get_block(bdev, block, size);
1126 if (bh)
1127 return bh;
1128
1129 ret = grow_buffers(bdev, block, size);
1130 if (ret < 0)
1131 return NULL;
1132 if (ret == 0)
1133 free_more_memory();
1134 }
1135}
1136
1137/*
1138 * The relationship between dirty buffers and dirty pages:
1139 *
1140 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1141 * the page is tagged dirty in its radix tree.
1142 *
1143 * At all times, the dirtiness of the buffers represents the dirtiness of
1144 * subsections of the page. If the page has buffers, the page dirty bit is
1145 * merely a hint about the true dirty state.
1146 *
1147 * When a page is set dirty in its entirety, all its buffers are marked dirty
1148 * (if the page has buffers).
1149 *
1150 * When a buffer is marked dirty, its page is dirtied, but the page's other
1151 * buffers are not.
1152 *
1153 * Also. When blockdev buffers are explicitly read with bread(), they
1154 * individually become uptodate. But their backing page remains not
1155 * uptodate - even if all of its buffers are uptodate. A subsequent
1156 * block_read_full_page() against that page will discover all the uptodate
1157 * buffers, will set the page uptodate and will perform no I/O.
1158 */
1159
1160/**
1161 * mark_buffer_dirty - mark a buffer_head as needing writeout
67be2dd1 1162 * @bh: the buffer_head to mark dirty
1163 *
1164 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1165 * backing page dirty, then tag the page as dirty in its address_space's radix
1166 * tree and then attach the address_space's inode to its superblock's dirty
1167 * inode list.
1168 *
1169 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1170 * mapping->tree_lock and the global inode_lock.
1171 */
fc9b52cd 1172void mark_buffer_dirty(struct buffer_head *bh)
1da177e4 1173{
787d2214 1174 WARN_ON_ONCE(!buffer_uptodate(bh));
1175
1176 /*
1177 * Very *carefully* optimize the it-is-already-dirty case.
1178 *
1179 * Don't let the final "is it dirty" escape to before we
1180 * perhaps modified the buffer.
1181 */
1182 if (buffer_dirty(bh)) {
1183 smp_mb();
1184 if (buffer_dirty(bh))
1185 return;
1186 }
1187
1188 if (!test_set_buffer_dirty(bh)) {
1189 struct page *page = bh->b_page;
1190 if (!TestSetPageDirty(page)) {
1191 struct address_space *mapping = page_mapping(page);
1192 if (mapping)
1193 __set_page_dirty(page, mapping, 0);
1194 }
a8e7d49a 1195 }
1da177e4 1196}
1fe72eaa 1197EXPORT_SYMBOL(mark_buffer_dirty);
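/*
 * Illustrative sketch (not part of this file): the classic metadata-update
 * pattern built on mark_buffer_dirty().  sb_bread() is the usual way a
 * filesystem obtains such a buffer; the block number is filesystem specific.
 *
 *	struct buffer_head *bh = sb_bread(sb, blocknr);
 *	if (!bh)
 *		return -EIO;
 *	... modify bh->b_data ...
 *	mark_buffer_dirty(bh);
 *	brelse(bh);
 *
 * The buffer (and its page) are now queued for writeback; nothing is written
 * synchronously unless the caller also issues sync_dirty_buffer() or similar.
 */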
1198
1199/*
1200 * Decrement a buffer_head's reference count. If all buffers against a page
1201 * have zero reference count, are clean and unlocked, and if the page is clean
1202 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1203 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1204 * a page but it ends up not being freed, and buffers may later be reattached).
1205 */
1206void __brelse(struct buffer_head * buf)
1207{
1208 if (atomic_read(&buf->b_count)) {
1209 put_bh(buf);
1210 return;
1211 }
5c752ad9 1212 WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1da177e4 1213}
1fe72eaa 1214EXPORT_SYMBOL(__brelse);
1215
1216/*
1217 * bforget() is like brelse(), except it discards any
1218 * potentially dirty data.
1219 */
1220void __bforget(struct buffer_head *bh)
1221{
1222 clear_buffer_dirty(bh);
535ee2fb 1223 if (bh->b_assoc_map) {
1224 struct address_space *buffer_mapping = bh->b_page->mapping;
1225
1226 spin_lock(&buffer_mapping->private_lock);
1227 list_del_init(&bh->b_assoc_buffers);
58ff407b 1228 bh->b_assoc_map = NULL;
1229 spin_unlock(&buffer_mapping->private_lock);
1230 }
1231 __brelse(bh);
1232}
1fe72eaa 1233EXPORT_SYMBOL(__bforget);
1da177e4 1234
1235static struct buffer_head *__bread_slow(struct buffer_head *bh,
1236 struct wait_bit_queue *wait)
1da177e4 1237{
1238 if (lock_buffer_async(bh, wait))
1239 return ERR_PTR(-EIOCBRETRY);
1240 if (buffer_uptodate(bh)) {
1241 unlock_buffer(bh);
1242 return bh;
1243 } else {
1244 get_bh(bh);
1245 bh->b_end_io = end_buffer_read_sync;
1246 submit_bh(READ, bh);
1247 if (wait_on_buffer_async(bh, wait))
1248 return ERR_PTR(-EIOCBRETRY);
1249 if (buffer_uptodate(bh))
1250 return bh;
1251 }
1252 brelse(bh);
1253 return NULL;
1254}
1255
1256/*
1257 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1258 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1259 * refcount elevated by one when they're in an LRU. A buffer can only appear
1260 * once in a particular CPU's LRU. A single buffer can be present in multiple
1261 * CPU's LRUs at the same time.
1262 *
1263 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1264 * sb_find_get_block().
1265 *
1266 * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1267 * a local interrupt disable for that.
1268 */
1269
1270#define BH_LRU_SIZE 8
1271
1272struct bh_lru {
1273 struct buffer_head *bhs[BH_LRU_SIZE];
1274};
1275
1276static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1277
1278#ifdef CONFIG_SMP
1279#define bh_lru_lock() local_irq_disable()
1280#define bh_lru_unlock() local_irq_enable()
1281#else
1282#define bh_lru_lock() preempt_disable()
1283#define bh_lru_unlock() preempt_enable()
1284#endif
1285
1286static inline void check_irqs_on(void)
1287{
1288#ifdef irqs_disabled
1289 BUG_ON(irqs_disabled());
1290#endif
1291}
1292
1293/*
1294 * The LRU management algorithm is dopey-but-simple. Sorry.
1295 */
1296static void bh_lru_install(struct buffer_head *bh)
1297{
1298 struct buffer_head *evictee = NULL;
1299 struct bh_lru *lru;
1300
1301 check_irqs_on();
1302 bh_lru_lock();
1303 lru = &__get_cpu_var(bh_lrus);
1304 if (lru->bhs[0] != bh) {
1305 struct buffer_head *bhs[BH_LRU_SIZE];
1306 int in;
1307 int out = 0;
1308
1309 get_bh(bh);
1310 bhs[out++] = bh;
1311 for (in = 0; in < BH_LRU_SIZE; in++) {
1312 struct buffer_head *bh2 = lru->bhs[in];
1313
1314 if (bh2 == bh) {
1315 __brelse(bh2);
1316 } else {
1317 if (out >= BH_LRU_SIZE) {
1318 BUG_ON(evictee != NULL);
1319 evictee = bh2;
1320 } else {
1321 bhs[out++] = bh2;
1322 }
1323 }
1324 }
1325 while (out < BH_LRU_SIZE)
1326 bhs[out++] = NULL;
1327 memcpy(lru->bhs, bhs, sizeof(bhs));
1328 }
1329 bh_lru_unlock();
1330
1331 if (evictee)
1332 __brelse(evictee);
1333}
1334
1335/*
1336 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1337 */
858119e1 1338static struct buffer_head *
3991d3bd 1339lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1340{
1341 struct buffer_head *ret = NULL;
1342 struct bh_lru *lru;
3991d3bd 1343 unsigned int i;
1344
1345 check_irqs_on();
1346 bh_lru_lock();
1347 lru = &__get_cpu_var(bh_lrus);
1348 for (i = 0; i < BH_LRU_SIZE; i++) {
1349 struct buffer_head *bh = lru->bhs[i];
1350
1351 if (bh && bh->b_bdev == bdev &&
1352 bh->b_blocknr == block && bh->b_size == size) {
1353 if (i) {
1354 while (i) {
1355 lru->bhs[i] = lru->bhs[i - 1];
1356 i--;
1357 }
1358 lru->bhs[0] = bh;
1359 }
1360 get_bh(bh);
1361 ret = bh;
1362 break;
1363 }
1364 }
1365 bh_lru_unlock();
1366 return ret;
1367}
1368
1369/*
1370 * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1371 * it in the LRU and mark it as accessed. If it is not present then return
1372 * NULL
1373 */
1374struct buffer_head *
3991d3bd 1375__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1376{
1377 struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1378
1379 if (bh == NULL) {
385fd4c5 1380 bh = __find_get_block_slow(bdev, block);
1381 if (bh)
1382 bh_lru_install(bh);
1383 }
1384 if (bh)
1385 touch_buffer(bh);
1386 return bh;
1387}
1388EXPORT_SYMBOL(__find_get_block);
1389
1390/*
1391 * __getblk will locate (and, if necessary, create) the buffer_head
1392 * which corresponds to the passed block_device, block and size. The
1393 * returned buffer has its reference count incremented.
1394 *
1395 * __getblk() cannot fail - it just keeps trying. If you pass it an
1396 * illegal block number, __getblk() will happily return a buffer_head
1397 * which represents the non-existent block. Very weird.
1398 *
1399 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1400 * attempt is failing. FIXME, perhaps?
1401 */
1402struct buffer_head *
3991d3bd 1403__getblk(struct block_device *bdev, sector_t block, unsigned size)
1404{
1405 struct buffer_head *bh = __find_get_block(bdev, block, size);
1406
1407 might_sleep();
1408 if (bh == NULL)
1409 bh = __getblk_slow(bdev, block, size);
1410 return bh;
1411}
1412EXPORT_SYMBOL(__getblk);
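/*
 * Illustrative sketch (not part of this file): because __getblk() performs no
 * I/O, a caller that needs the block contents must bring the buffer uptodate
 * itself, roughly what __bread_slow()/__bread_async() below do:
 *
 *	struct buffer_head *bh = __getblk(bdev, block, size);
 *	if (!buffer_uptodate(bh)) {
 *		ll_rw_block(READ, 1, &bh);
 *		wait_on_buffer(bh);
 *		if (!buffer_uptodate(bh)) {
 *			brelse(bh);
 *			return NULL;
 *		}
 *	}
 */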
1413
1414/*
1415 * Do async read-ahead on a buffer..
1416 */
3991d3bd 1417void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1418{
1419 struct buffer_head *bh = __getblk(bdev, block, size);
1420 if (likely(bh)) {
1421 ll_rw_block(READA, 1, &bh);
1422 brelse(bh);
1423 }
1424}
1425EXPORT_SYMBOL(__breadahead);
1426
1427/**
1428 * __bread() - reads a specified block and returns the bh
67be2dd1 1429 * @bdev: the block_device to read from
1430 * @block: number of block
1431 * @size: size (in bytes) to read
1432 *
1433 * Reads a specified block, and returns buffer head that contains it.
1434 * It returns NULL if the block was unreadable.
1435 */
1436struct buffer_head *
1437__bread_async(struct block_device *bdev, sector_t block, unsigned size,
1438 struct wait_bit_queue *wait)
1439{
1440 struct buffer_head *bh = __getblk(bdev, block, size);
1441
a3e713b5 1442 if (likely(bh) && !buffer_uptodate(bh))
2557c31c 1443 bh = __bread_slow(bh, wait);
1444 return bh;
1445}
2557c31c 1446EXPORT_SYMBOL(__bread_async);
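/*
 * Illustrative sketch (not part of this file): the retry convention used by
 * __bread_async() and the other *_async helpers above.  A caller that can be
 * retried (for example an AIO path) passes its wait_bit_queue; if the buffer
 * is busy the helpers return ERR_PTR(-EIOCBRETRY) instead of sleeping, and
 * the caller backs out to be re-run once the wait queue is woken.  (That a
 * NULL wait argument gives the old synchronous behaviour is an assumption
 * based on the wait_on_bit*_async() interfaces used above, not something
 * this file spells out.)
 *
 *	bh = __bread_async(bdev, block, size, wait);
 *	if (bh == ERR_PTR(-EIOCBRETRY))
 *		return -EIOCBRETRY;	(will be retried later)
 *	if (!bh)
 *		return -EIO;
 */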
1447
1448/*
1449 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1450 * This doesn't race because it runs in each cpu either in irq
1451 * or with preempt disabled.
1452 */
1453static void invalidate_bh_lru(void *arg)
1454{
1455 struct bh_lru *b = &get_cpu_var(bh_lrus);
1456 int i;
1457
1458 for (i = 0; i < BH_LRU_SIZE; i++) {
1459 brelse(b->bhs[i]);
1460 b->bhs[i] = NULL;
1461 }
1462 put_cpu_var(bh_lrus);
1463}
1464
f9a14399 1465void invalidate_bh_lrus(void)
1da177e4 1466{
15c8b6c1 1467 on_each_cpu(invalidate_bh_lru, NULL, 1);
1da177e4 1468}
9db5579b 1469EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1470
1471void set_bh_page(struct buffer_head *bh,
1472 struct page *page, unsigned long offset)
1473{
1474 bh->b_page = page;
e827f923 1475 BUG_ON(offset >= PAGE_SIZE);
1476 if (PageHighMem(page))
1477 /*
1478 * This catches illegal uses and preserves the offset:
1479 */
1480 bh->b_data = (char *)(0 + offset);
1481 else
1482 bh->b_data = page_address(page) + offset;
1483}
1484EXPORT_SYMBOL(set_bh_page);
1485
1486/*
1487 * Called when truncating a buffer on a page completely.
1488 */
858119e1 1489static void discard_buffer(struct buffer_head * bh)
1490{
1491 lock_buffer(bh);
1492 clear_buffer_dirty(bh);
1493 bh->b_bdev = NULL;
1494 clear_buffer_mapped(bh);
1495 clear_buffer_req(bh);
1496 clear_buffer_new(bh);
1497 clear_buffer_delay(bh);
33a266dd 1498 clear_buffer_unwritten(bh);
1499 unlock_buffer(bh);
1500}
1501
1502/**
 1503 * block_invalidatepage - invalidate part or all of a buffer-backed page
1504 *
1505 * @page: the page which is affected
1506 * @offset: the index of the truncation point
1507 *
1508 * block_invalidatepage() is called when all or part of the page has become
 1509 * invalidated by a truncate operation.
1510 *
1511 * block_invalidatepage() does not have to release all buffers, but it must
1512 * ensure that no dirty buffer is left outside @offset and that no I/O
1513 * is underway against any of the blocks which are outside the truncation
1514 * point. Because the caller is about to free (and possibly reuse) those
1515 * blocks on-disk.
1516 */
2ff28e22 1517void block_invalidatepage(struct page *page, unsigned long offset)
1518{
1519 struct buffer_head *head, *bh, *next;
1520 unsigned int curr_off = 0;
1521
1522 BUG_ON(!PageLocked(page));
1523 if (!page_has_buffers(page))
1524 goto out;
1525
1526 head = page_buffers(page);
1527 bh = head;
1528 do {
1529 unsigned int next_off = curr_off + bh->b_size;
1530 next = bh->b_this_page;
1531
1532 /*
1533 * is this block fully invalidated?
1534 */
1535 if (offset <= curr_off)
1536 discard_buffer(bh);
1537 curr_off = next_off;
1538 bh = next;
1539 } while (bh != head);
1540
1541 /*
1542 * We release buffers only if the entire page is being invalidated.
1543 * The get_block cached value has been unconditionally invalidated,
1544 * so real IO is not possible anymore.
1545 */
1546 if (offset == 0)
2ff28e22 1547 try_to_release_page(page, 0);
1da177e4 1548out:
2ff28e22 1549 return;
1550}
1551EXPORT_SYMBOL(block_invalidatepage);
1552
1553/*
1554 * We attach and possibly dirty the buffers atomically wrt
1555 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
1556 * is already excluded via the page lock.
1557 */
1558void create_empty_buffers(struct page *page,
1559 unsigned long blocksize, unsigned long b_state)
1560{
1561 struct buffer_head *bh, *head, *tail;
1562
1563 head = alloc_page_buffers(page, blocksize, 1);
1564 bh = head;
1565 do {
1566 bh->b_state |= b_state;
1567 tail = bh;
1568 bh = bh->b_this_page;
1569 } while (bh);
1570 tail->b_this_page = head;
1571
1572 spin_lock(&page->mapping->private_lock);
1573 if (PageUptodate(page) || PageDirty(page)) {
1574 bh = head;
1575 do {
1576 if (PageDirty(page))
1577 set_buffer_dirty(bh);
1578 if (PageUptodate(page))
1579 set_buffer_uptodate(bh);
1580 bh = bh->b_this_page;
1581 } while (bh != head);
1582 }
1583 attach_page_buffers(page, head);
1584 spin_unlock(&page->mapping->private_lock);
1585}
1586EXPORT_SYMBOL(create_empty_buffers);
1587
1588/*
1589 * We are taking a block for data and we don't want any output from any
1590 * buffer-cache aliases starting from return from that function and
1591 * until the moment when something will explicitly mark the buffer
1592 * dirty (hopefully that will not happen until we will free that block ;-)
1593 * We don't even need to mark it not-uptodate - nobody can expect
 1594 * anything from a newly allocated buffer anyway. We used to use
1595 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1596 * don't want to mark the alias unmapped, for example - it would confuse
1597 * anyone who might pick it with bread() afterwards...
1598 *
1599 * Also.. Note that bforget() doesn't lock the buffer. So there can
1600 * be writeout I/O going on against recently-freed buffers. We don't
1601 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1602 * only if we really need to. That happens here.
1603 */
1604void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1605{
1606 struct buffer_head *old_bh;
1607
1608 might_sleep();
1609
385fd4c5 1610 old_bh = __find_get_block_slow(bdev, block);
1611 if (old_bh) {
1612 clear_buffer_dirty(old_bh);
1613 wait_on_buffer(old_bh);
1614 clear_buffer_req(old_bh);
1615 __brelse(old_bh);
1616 }
1617}
1618EXPORT_SYMBOL(unmap_underlying_metadata);
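/*
 * Illustrative note (not part of this file): the caller-side pattern for the
 * function above.  When a get_block implementation allocates a new block it
 * sets BH_New, and the generic paths below then invalidate any stale alias
 * of that block in the blockdev pagecache:
 *
 *	if (buffer_new(bh)) {
 *		clear_buffer_new(bh);
 *		unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
 *	}
 */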
1619
1620/*
1621 * NOTE! All mapped/uptodate combinations are valid:
1622 *
1623 * Mapped Uptodate Meaning
1624 *
1625 * No No "unknown" - must do get_block()
1626 * No Yes "hole" - zero-filled
1627 * Yes No "allocated" - allocated on disk, not read in
1628 * Yes Yes "valid" - allocated and up-to-date in memory.
1629 *
1630 * "Dirty" is valid only with the last case (mapped+uptodate).
1631 */
1632
1633/*
1634 * While block_write_full_page is writing back the dirty buffers under
1635 * the page lock, whoever dirtied the buffers may decide to clean them
1636 * again at any time. We handle that by only looking at the buffer
1637 * state inside lock_buffer().
1638 *
1639 * If block_write_full_page() is called for regular writeback
1640 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1641 * locked buffer. This only can happen if someone has written the buffer
1642 * directly, with submit_bh(). At the address_space level PageWriteback
1643 * prevents this contention from occurring.
1644 *
1645 * If block_write_full_page() is called with wbc->sync_mode ==
1646 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this
1647 * causes the writes to be flagged as synchronous writes, but the
1648 * block device queue will NOT be unplugged, since usually many pages
 1649 * will be pushed out before the higher-level caller actually
1650 * waits for the writes to be completed. The various wait functions,
1651 * such as wait_on_writeback_range() will ultimately call sync_page()
1652 * which will ultimately call blk_run_backing_dev(), which will end up
1653 * unplugging the device queue.
1654 */
1655static int __block_write_full_page(struct inode *inode, struct page *page,
1656 get_block_t *get_block, struct writeback_control *wbc,
1657 bh_end_io_t *handler)
1658{
1659 int err;
1660 sector_t block;
1661 sector_t last_block;
f0fbd5fc 1662 struct buffer_head *bh, *head;
b0cf2321 1663 const unsigned blocksize = 1 << inode->i_blkbits;
1da177e4 1664 int nr_underway = 0;
1665 int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
1666 WRITE_SYNC_PLUG : WRITE);
1667
1668 BUG_ON(!PageLocked(page));
1669
1670 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1671
1672 if (!page_has_buffers(page)) {
b0cf2321 1673 create_empty_buffers(page, blocksize,
1674 (1 << BH_Dirty)|(1 << BH_Uptodate));
1675 }
1676
1677 /*
1678 * Be very careful. We have no exclusion from __set_page_dirty_buffers
1679 * here, and the (potentially unmapped) buffers may become dirty at
1680 * any time. If a buffer becomes dirty here after we've inspected it
1681 * then we just miss that fact, and the page stays dirty.
1682 *
1683 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1684 * handle that here by just cleaning them.
1685 */
1686
54b21a79 1687 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1688 head = page_buffers(page);
1689 bh = head;
1690
1691 /*
1692 * Get all the dirty buffers mapped to disk addresses and
1693 * handle any aliases from the underlying blockdev's mapping.
1694 */
1695 do {
1696 if (block > last_block) {
1697 /*
1698 * mapped buffers outside i_size will occur, because
1699 * this page can be outside i_size when there is a
1700 * truncate in progress.
1701 */
1702 /*
1703 * The buffer was zeroed by block_write_full_page()
1704 */
1705 clear_buffer_dirty(bh);
1706 set_buffer_uptodate(bh);
1707 } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1708 buffer_dirty(bh)) {
b0cf2321 1709 WARN_ON(bh->b_size != blocksize);
1710 err = get_block(inode, block, bh, 1);
1711 if (err)
1712 goto recover;
29a814d2 1713 clear_buffer_delay(bh);
1714 if (buffer_new(bh)) {
1715 /* blockdev mappings never come here */
1716 clear_buffer_new(bh);
1717 unmap_underlying_metadata(bh->b_bdev,
1718 bh->b_blocknr);
1719 }
1720 }
1721 bh = bh->b_this_page;
1722 block++;
1723 } while (bh != head);
1724
1725 do {
1726 if (!buffer_mapped(bh))
1727 continue;
1728 /*
1729 * If it's a fully non-blocking write attempt and we cannot
1730 * lock the buffer then redirty the page. Note that this can
1731 * potentially cause a busy-wait loop from writeback threads
1732 * and kswapd activity, but those code paths have their own
1733 * higher-level throttling.
1734 */
1735 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1736 lock_buffer(bh);
ca5de404 1737 } else if (!trylock_buffer(bh)) {
1da177e4
LT
1738 redirty_page_for_writepage(wbc, page);
1739 continue;
1740 }
1741 if (test_clear_buffer_dirty(bh)) {
35c80d5f 1742 mark_buffer_async_write_endio(bh, handler);
1da177e4
LT
1743 } else {
1744 unlock_buffer(bh);
1745 }
1746 } while ((bh = bh->b_this_page) != head);
1747
1748 /*
1749 * The page and its buffers are protected by PageWriteback(), so we can
1750 * drop the bh refcounts early.
1751 */
1752 BUG_ON(PageWriteback(page));
1753 set_page_writeback(page);
1da177e4
LT
1754
1755 do {
1756 struct buffer_head *next = bh->b_this_page;
1757 if (buffer_async_write(bh)) {
a64c8610 1758 submit_bh(write_op, bh);
1da177e4
LT
1759 nr_underway++;
1760 }
1da177e4
LT
1761 bh = next;
1762 } while (bh != head);
05937baa 1763 unlock_page(page);
1da177e4
LT
1764
1765 err = 0;
1766done:
1767 if (nr_underway == 0) {
1768 /*
1769 * The page was marked dirty, but the buffers were
1770 * clean. Someone wrote them back by hand with
1771 * ll_rw_block/submit_bh. A rare case.
1772 */
1da177e4 1773 end_page_writeback(page);
3d67f2d7 1774
1da177e4
LT
1775 /*
1776 * The page and buffer_heads can be released at any time from
1777 * here on.
1778 */
1da177e4
LT
1779 }
1780 return err;
1781
1782recover:
1783 /*
1784 * ENOSPC, or some other error. We may already have added some
1785 * blocks to the file, so we need to write these out to avoid
1786 * exposing stale data.
1787 * The page is currently locked and not marked for writeback
1788 */
1789 bh = head;
1790 /* Recovery: lock and submit the mapped buffers */
1791 do {
29a814d2
AT
1792 if (buffer_mapped(bh) && buffer_dirty(bh) &&
1793 !buffer_delay(bh)) {
1da177e4 1794 lock_buffer(bh);
35c80d5f 1795 mark_buffer_async_write_endio(bh, handler);
1da177e4
LT
1796 } else {
1797 /*
1798 * The buffer may have been set dirty during
1799 * attachment to a dirty page.
1800 */
1801 clear_buffer_dirty(bh);
1802 }
1803 } while ((bh = bh->b_this_page) != head);
1804 SetPageError(page);
1805 BUG_ON(PageWriteback(page));
7e4c3690 1806 mapping_set_error(page->mapping, err);
1da177e4 1807 set_page_writeback(page);
1da177e4
LT
1808 do {
1809 struct buffer_head *next = bh->b_this_page;
1810 if (buffer_async_write(bh)) {
1811 clear_buffer_dirty(bh);
a64c8610 1812 submit_bh(write_op, bh);
1da177e4
LT
1813 nr_underway++;
1814 }
1da177e4
LT
1815 bh = next;
1816 } while (bh != head);
ffda9d30 1817 unlock_page(page);
1da177e4
LT
1818 goto done;
1819}
1820
afddba49
NP
1821/*
1822 * If a page has any new buffers, zero them out here, and mark them uptodate
1823 * and dirty so they'll be written out (in order to prevent uninitialised
1824 * block data from leaking). And clear the new bit.
1825 */
1826void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1827{
1828 unsigned int block_start, block_end;
1829 struct buffer_head *head, *bh;
1830
1831 BUG_ON(!PageLocked(page));
1832 if (!page_has_buffers(page))
1833 return;
1834
1835 bh = head = page_buffers(page);
1836 block_start = 0;
1837 do {
1838 block_end = block_start + bh->b_size;
1839
1840 if (buffer_new(bh)) {
1841 if (block_end > from && block_start < to) {
1842 if (!PageUptodate(page)) {
1843 unsigned start, size;
1844
1845 start = max(from, block_start);
1846 size = min(to, block_end) - start;
1847
eebd2aa3 1848 zero_user(page, start, size);
afddba49
NP
1849 set_buffer_uptodate(bh);
1850 }
1851
1852 clear_buffer_new(bh);
1853 mark_buffer_dirty(bh);
1854 }
1855 }
1856
1857 block_start = block_end;
1858 bh = bh->b_this_page;
1859 } while (bh != head);
1860}
1861EXPORT_SYMBOL(page_zero_new_buffers);
1862
1da177e4
LT
1863static int __block_prepare_write(struct inode *inode, struct page *page,
1864 unsigned from, unsigned to, get_block_t *get_block)
1865{
1866 unsigned block_start, block_end;
1867 sector_t block;
1868 int err = 0;
1869 unsigned blocksize, bbits;
1870 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1871
1872 BUG_ON(!PageLocked(page));
1873 BUG_ON(from > PAGE_CACHE_SIZE);
1874 BUG_ON(to > PAGE_CACHE_SIZE);
1875 BUG_ON(from > to);
1876
1877 blocksize = 1 << inode->i_blkbits;
1878 if (!page_has_buffers(page))
1879 create_empty_buffers(page, blocksize, 0);
1880 head = page_buffers(page);
1881
1882 bbits = inode->i_blkbits;
1883 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1884
1885 for(bh = head, block_start = 0; bh != head || !block_start;
1886 block++, block_start=block_end, bh = bh->b_this_page) {
1887 block_end = block_start + blocksize;
1888 if (block_end <= from || block_start >= to) {
1889 if (PageUptodate(page)) {
1890 if (!buffer_uptodate(bh))
1891 set_buffer_uptodate(bh);
1892 }
1893 continue;
1894 }
1895 if (buffer_new(bh))
1896 clear_buffer_new(bh);
1897 if (!buffer_mapped(bh)) {
b0cf2321 1898 WARN_ON(bh->b_size != blocksize);
1da177e4
LT
1899 err = get_block(inode, block, bh, 1);
1900 if (err)
f3ddbdc6 1901 break;
1da177e4 1902 if (buffer_new(bh)) {
1da177e4
LT
1903 unmap_underlying_metadata(bh->b_bdev,
1904 bh->b_blocknr);
1905 if (PageUptodate(page)) {
637aff46 1906 clear_buffer_new(bh);
1da177e4 1907 set_buffer_uptodate(bh);
637aff46 1908 mark_buffer_dirty(bh);
1da177e4
LT
1909 continue;
1910 }
eebd2aa3
CL
1911 if (block_end > to || block_start < from)
1912 zero_user_segments(page,
1913 to, block_end,
1914 block_start, from);
1da177e4
LT
1915 continue;
1916 }
1917 }
1918 if (PageUptodate(page)) {
1919 if (!buffer_uptodate(bh))
1920 set_buffer_uptodate(bh);
1921 continue;
1922 }
1923 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
33a266dd 1924 !buffer_unwritten(bh) &&
1da177e4
LT
1925 (block_start < from || block_end > to)) {
1926 ll_rw_block(READ, 1, &bh);
1927 *wait_bh++=bh;
1928 }
1929 }
1930 /*
1931 * If we issued read requests - let them complete.
1932 */
1933 while(wait_bh > wait) {
f80e69e7
JA
1934 int ret;
1935
1936 ret = wait_on_buffer_async(*--wait_bh, current->io_wait);
1be3d0ec
JA
1937 if (ret && !err) {
1938 WARN(1, "%s: ret=%d\n", __FUNCTION__, ret);
f80e69e7 1939 err = ret;
1be3d0ec 1940 }
1da177e4 1941 if (!buffer_uptodate(*wait_bh))
f3ddbdc6 1942 err = -EIO;
1da177e4 1943 }
afddba49
NP
1944 if (unlikely(err))
1945 page_zero_new_buffers(page, from, to);
1da177e4
LT
1946 return err;
1947}
1948
1949static int __block_commit_write(struct inode *inode, struct page *page,
1950 unsigned from, unsigned to)
1951{
1952 unsigned block_start, block_end;
1953 int partial = 0;
1954 unsigned blocksize;
1955 struct buffer_head *bh, *head;
1956
1957 blocksize = 1 << inode->i_blkbits;
1958
1959 for(bh = head = page_buffers(page), block_start = 0;
1960 bh != head || !block_start;
1961 block_start=block_end, bh = bh->b_this_page) {
1962 block_end = block_start + blocksize;
1963 if (block_end <= from || block_start >= to) {
1964 if (!buffer_uptodate(bh))
1965 partial = 1;
1966 } else {
1967 set_buffer_uptodate(bh);
1968 mark_buffer_dirty(bh);
1969 }
afddba49 1970 clear_buffer_new(bh);
1da177e4
LT
1971 }
1972
1973 /*
1974 * If this is a partial write which happened to make all buffers
1975 * uptodate then we can optimize away a bogus readpage() for
1976 * the next read(). Here we 'discover' whether the page went
1977 * uptodate as a result of this (potentially partial) write.
1978 */
1979 if (!partial)
1980 SetPageUptodate(page);
1981 return 0;
1982}
1983
afddba49
NP
1984/*
1985 * block_write_begin takes care of the basic task of block allocation and
1986 * bringing partial write blocks uptodate first.
1987 *
1988 * If *pagep is not NULL, then block_write_begin uses the locked page
1989 * at *pagep rather than allocating its own. In this case, the page will
1990 * not be unlocked or deallocated on failure.
1991 */
1992int block_write_begin(struct file *file, struct address_space *mapping,
1993 loff_t pos, unsigned len, unsigned flags,
1994 struct page **pagep, void **fsdata,
1995 get_block_t *get_block)
1996{
1997 struct inode *inode = mapping->host;
1998 int status = 0;
1999 struct page *page;
2000 pgoff_t index;
2001 unsigned start, end;
2002 int ownpage = 0;
2003
2004 index = pos >> PAGE_CACHE_SHIFT;
2005 start = pos & (PAGE_CACHE_SIZE - 1);
2006 end = start + len;
2007
2008 page = *pagep;
2009 if (page == NULL) {
2010 ownpage = 1;
54566b2c 2011 page = grab_cache_page_write_begin(mapping, index, flags);
afddba49
NP
2012 if (!page) {
2013 status = -ENOMEM;
2014 goto out;
2015 }
2016 *pagep = page;
2017 } else
2018 BUG_ON(!PageLocked(page));
2019
2020 status = __block_prepare_write(inode, page, start, end, get_block);
2021 if (unlikely(status)) {
2022 ClearPageUptodate(page);
2023
2024 if (ownpage) {
2025 unlock_page(page);
2026 page_cache_release(page);
2027 *pagep = NULL;
2028
2029 /*
2030 * prepare_write() may have instantiated a few blocks
2031 * outside i_size. Trim these off again. Don't need
2032 * i_size_read because we hold i_mutex.
2033 */
2034 if (pos + len > inode->i_size)
2035 vmtruncate(inode, inode->i_size);
2036 }
afddba49
NP
2037 }
2038
2039out:
2040 return status;
2041}
2042EXPORT_SYMBOL(block_write_begin);
2043
2044int block_write_end(struct file *file, struct address_space *mapping,
2045 loff_t pos, unsigned len, unsigned copied,
2046 struct page *page, void *fsdata)
2047{
2048 struct inode *inode = mapping->host;
2049 unsigned start;
2050
2051 start = pos & (PAGE_CACHE_SIZE - 1);
2052
2053 if (unlikely(copied < len)) {
2054 /*
2055 * The buffers that were written will now be uptodate, so we
2056 * don't have to worry about a readpage reading them and
2057 * overwriting a partial write. However if we have encountered
2058 * a short write and only partially written into a buffer, it
2059 * will not be marked uptodate, so a readpage might come in and
2060 * destroy our partial write.
2061 *
2062 * Do the simplest thing, and just treat any short write to a
2063 * non uptodate page as a zero-length write, and force the
2064 * caller to redo the whole thing.
2065 */
2066 if (!PageUptodate(page))
2067 copied = 0;
2068
2069 page_zero_new_buffers(page, start+copied, start+len);
2070 }
2071 flush_dcache_page(page);
2072
2073 /* This could be a short (even 0-length) commit */
2074 __block_commit_write(inode, page, start, start+copied);
2075
2076 return copied;
2077}
2078EXPORT_SYMBOL(block_write_end);
2079
2080int generic_write_end(struct file *file, struct address_space *mapping,
2081 loff_t pos, unsigned len, unsigned copied,
2082 struct page *page, void *fsdata)
2083{
2084 struct inode *inode = mapping->host;
c7d206b3 2085 int i_size_changed = 0;
afddba49
NP
2086
2087 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2088
2089 /*
2090 * No need to use i_size_read() here, the i_size
2091 * cannot change under us because we hold i_mutex.
2092 *
2093 * But it's important to update i_size while still holding page lock:
2094 * page writeout could otherwise come in and zero beyond i_size.
2095 */
2096 if (pos+copied > inode->i_size) {
2097 i_size_write(inode, pos+copied);
c7d206b3 2098 i_size_changed = 1;
afddba49
NP
2099 }
2100
2101 unlock_page(page);
2102 page_cache_release(page);
2103
c7d206b3
JK
2104 /*
2105 * Don't mark the inode dirty under page lock. First, it unnecessarily
2106 * makes the holding time of page lock longer. Second, it forces lock
2107 * ordering of page lock and transaction start for journaling
2108 * filesystems.
2109 */
2110 if (i_size_changed)
2111 mark_inode_dirty(inode);
2112
afddba49
NP
2113 return copied;
2114}
2115EXPORT_SYMBOL(generic_write_end);
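/*
 * Editor's sketch (not part of the original file): how a simple
 * block-based filesystem wires ->write_begin/->write_end on top of the
 * helpers above.  myfs_get_block and myfs_write_begin are hypothetical
 * names; ext2 follows essentially this pattern.
 */
extern int myfs_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create);

static int myfs_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned flags,
			    struct page **pagep, void **fsdata)
{
	*pagep = NULL;	/* let block_write_begin allocate and lock the page */
	return block_write_begin(file, mapping, pos, len, flags,
				 pagep, fsdata, myfs_get_block);
}
/* ->write_end can then simply be generic_write_end. */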
2116
8ab22b9a
HH
2117/*
2118 * block_is_partially_uptodate checks whether buffers within a page are
2119 * uptodate or not.
2120 *
2121 * Returns true if all buffers which correspond to a file portion
2122 * we want to read are uptodate.
2123 */
2124int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
2125 unsigned long from)
2126{
2127 struct inode *inode = page->mapping->host;
2128 unsigned block_start, block_end, blocksize;
2129 unsigned to;
2130 struct buffer_head *bh, *head;
2131 int ret = 1;
2132
2133 if (!page_has_buffers(page))
2134 return 0;
2135
2136 blocksize = 1 << inode->i_blkbits;
2137 to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
2138 to = from + to;
2139 if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2140 return 0;
2141
2142 head = page_buffers(page);
2143 bh = head;
2144 block_start = 0;
2145 do {
2146 block_end = block_start + blocksize;
2147 if (block_end > from && block_start < to) {
2148 if (!buffer_uptodate(bh)) {
2149 ret = 0;
2150 break;
2151 }
2152 if (block_end >= to)
2153 break;
2154 }
2155 block_start = block_end;
2156 bh = bh->b_this_page;
2157 } while (bh != head);
2158
2159 return ret;
2160}
2161EXPORT_SYMBOL(block_is_partially_uptodate);
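/*
 * Editor's sketch: block_is_partially_uptodate() is meant to be plugged
 * directly into an address_space_operations table, so that reads falling
 * entirely within uptodate buffers skip a needless ->readpage.  Only the
 * relevant field of the hypothetical myfs_aops is shown.
 */
static const struct address_space_operations myfs_aops = {
	.is_partially_uptodate	= block_is_partially_uptodate,
	/* ... .readpage, .writepage, .write_begin, .write_end, ... */
};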
2162
1da177e4
LT
2163/*
2164 * Generic "read page" function for block devices that have the normal
2165 * get_block functionality. This is most of the block device filesystems.
2166 * Reads the page asynchronously --- the unlock_buffer() and
2167 * set/clear_buffer_uptodate() functions propagate buffer state into the
2168 * page struct once IO has completed.
2169 */
2170int block_read_full_page(struct page *page, get_block_t *get_block)
2171{
2172 struct inode *inode = page->mapping->host;
2173 sector_t iblock, lblock;
2174 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2175 unsigned int blocksize;
2176 int nr, i;
2177 int fully_mapped = 1;
2178
cd7619d6 2179 BUG_ON(!PageLocked(page));
1da177e4
LT
2180 blocksize = 1 << inode->i_blkbits;
2181 if (!page_has_buffers(page))
2182 create_empty_buffers(page, blocksize, 0);
2183 head = page_buffers(page);
2184
2185 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2186 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2187 bh = head;
2188 nr = 0;
2189 i = 0;
2190
2191 do {
2192 if (buffer_uptodate(bh))
2193 continue;
2194
2195 if (!buffer_mapped(bh)) {
c64610ba
AM
2196 int err = 0;
2197
1da177e4
LT
2198 fully_mapped = 0;
2199 if (iblock < lblock) {
b0cf2321 2200 WARN_ON(bh->b_size != blocksize);
c64610ba
AM
2201 err = get_block(inode, iblock, bh, 0);
2202 if (err)
1da177e4
LT
2203 SetPageError(page);
2204 }
2205 if (!buffer_mapped(bh)) {
eebd2aa3 2206 zero_user(page, i * blocksize, blocksize);
c64610ba
AM
2207 if (!err)
2208 set_buffer_uptodate(bh);
1da177e4
LT
2209 continue;
2210 }
2211 /*
2212 * get_block() might have updated the buffer
2213 * synchronously
2214 */
2215 if (buffer_uptodate(bh))
2216 continue;
2217 }
2218 arr[nr++] = bh;
2219 } while (i++, iblock++, (bh = bh->b_this_page) != head);
2220
2221 if (fully_mapped)
2222 SetPageMappedToDisk(page);
2223
2224 if (!nr) {
2225 /*
2226 * All buffers are uptodate - we can set the page uptodate
2227 * as well. But not if get_block() returned an error.
2228 */
2229 if (!PageError(page))
2230 SetPageUptodate(page);
2231 unlock_page(page);
2232 return 0;
2233 }
2234
2235 /* Stage two: lock the buffers */
2236 for (i = 0; i < nr; i++) {
2237 bh = arr[i];
2238 lock_buffer(bh);
2239 mark_buffer_async_read(bh);
2240 }
2241
2242 /*
2243 * Stage 3: start the IO. Check for uptodateness
2244 * inside the buffer lock in case another process reading
2245 * the underlying blockdev brought it uptodate (the sct fix).
2246 */
2247 for (i = 0; i < nr; i++) {
2248 bh = arr[i];
2249 if (buffer_uptodate(bh))
2250 end_buffer_async_read(bh, 1);
2251 else
2252 submit_bh(READ, bh);
2253 }
2254 return 0;
2255}
1fe72eaa 2256EXPORT_SYMBOL(block_read_full_page);
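/*
 * Editor's sketch: a typical ->readpage built on the helper above, using
 * the hypothetical myfs_get_block declared in an earlier sketch.
 */
static int myfs_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, myfs_get_block);
}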
1da177e4
LT
2257
2258/* utility function for filesystems that need to do work on expanding
89e10787 2259 * truncates. Uses filesystem pagecache writes to allow the filesystem to
1da177e4
LT
2260 * deal with the hole.
2261 */
89e10787 2262int generic_cont_expand_simple(struct inode *inode, loff_t size)
1da177e4
LT
2263{
2264 struct address_space *mapping = inode->i_mapping;
2265 struct page *page;
89e10787 2266 void *fsdata;
1da177e4
LT
2267 int err;
2268
c08d3b0e 2269 err = inode_newsize_ok(inode, size);
2270 if (err)
1da177e4
LT
2271 goto out;
2272
89e10787
NP
2273 err = pagecache_write_begin(NULL, mapping, size, 0,
2274 AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2275 &page, &fsdata);
2276 if (err)
05eb0b51 2277 goto out;
05eb0b51 2278
89e10787
NP
2279 err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2280 BUG_ON(err > 0);
05eb0b51 2281
1da177e4
LT
2282out:
2283 return err;
2284}
1fe72eaa 2285EXPORT_SYMBOL(generic_cont_expand_simple);
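/*
 * Editor's sketch: a filesystem that cannot represent holes typically
 * calls generic_cont_expand_simple() from its ->setattr path when the
 * file size grows, so the new tail gets allocated and zeroed through the
 * page cache.  myfs_grow is a hypothetical name.
 */
static int myfs_grow(struct inode *inode, loff_t newsize)
{
	if (newsize > inode->i_size)
		return generic_cont_expand_simple(inode, newsize);
	return 0;
}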
1da177e4 2286
f1e3af72
AB
2287static int cont_expand_zero(struct file *file, struct address_space *mapping,
2288 loff_t pos, loff_t *bytes)
1da177e4 2289{
1da177e4 2290 struct inode *inode = mapping->host;
1da177e4 2291 unsigned blocksize = 1 << inode->i_blkbits;
89e10787
NP
2292 struct page *page;
2293 void *fsdata;
2294 pgoff_t index, curidx;
2295 loff_t curpos;
2296 unsigned zerofrom, offset, len;
2297 int err = 0;
1da177e4 2298
89e10787
NP
2299 index = pos >> PAGE_CACHE_SHIFT;
2300 offset = pos & ~PAGE_CACHE_MASK;
2301
2302 while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2303 zerofrom = curpos & ~PAGE_CACHE_MASK;
1da177e4
LT
2304 if (zerofrom & (blocksize-1)) {
2305 *bytes |= (blocksize-1);
2306 (*bytes)++;
2307 }
89e10787 2308 len = PAGE_CACHE_SIZE - zerofrom;
1da177e4 2309
89e10787
NP
2310 err = pagecache_write_begin(file, mapping, curpos, len,
2311 AOP_FLAG_UNINTERRUPTIBLE,
2312 &page, &fsdata);
2313 if (err)
2314 goto out;
eebd2aa3 2315 zero_user(page, zerofrom, len);
89e10787
NP
2316 err = pagecache_write_end(file, mapping, curpos, len, len,
2317 page, fsdata);
2318 if (err < 0)
2319 goto out;
2320 BUG_ON(err != len);
2321 err = 0;
061e9746
OH
2322
2323 balance_dirty_pages_ratelimited(mapping);
89e10787 2324 }
1da177e4 2325
89e10787
NP
2326 /* page covers the boundary, find the boundary offset */
2327 if (index == curidx) {
2328 zerofrom = curpos & ~PAGE_CACHE_MASK;
1da177e4 2329 /* if we are expanding the file, the last block will be filled */
89e10787
NP
2330 if (offset <= zerofrom) {
2331 goto out;
2332 }
2333 if (zerofrom & (blocksize-1)) {
1da177e4
LT
2334 *bytes |= (blocksize-1);
2335 (*bytes)++;
2336 }
89e10787 2337 len = offset - zerofrom;
1da177e4 2338
89e10787
NP
2339 err = pagecache_write_begin(file, mapping, curpos, len,
2340 AOP_FLAG_UNINTERRUPTIBLE,
2341 &page, &fsdata);
2342 if (err)
2343 goto out;
eebd2aa3 2344 zero_user(page, zerofrom, len);
89e10787
NP
2345 err = pagecache_write_end(file, mapping, curpos, len, len,
2346 page, fsdata);
2347 if (err < 0)
2348 goto out;
2349 BUG_ON(err != len);
2350 err = 0;
1da177e4 2351 }
89e10787
NP
2352out:
2353 return err;
2354}
2355
2356/*
2357 * For moronic filesystems that do not allow holes in files.
2358 * We may have to extend the file.
2359 */
2360int cont_write_begin(struct file *file, struct address_space *mapping,
2361 loff_t pos, unsigned len, unsigned flags,
2362 struct page **pagep, void **fsdata,
2363 get_block_t *get_block, loff_t *bytes)
2364{
2365 struct inode *inode = mapping->host;
2366 unsigned blocksize = 1 << inode->i_blkbits;
2367 unsigned zerofrom;
2368 int err;
2369
2370 err = cont_expand_zero(file, mapping, pos, bytes);
2371 if (err)
2372 goto out;
2373
2374 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2375 if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2376 *bytes |= (blocksize-1);
2377 (*bytes)++;
1da177e4 2378 }
1da177e4 2379
89e10787
NP
2380 *pagep = NULL;
2381 err = block_write_begin(file, mapping, pos, len,
2382 flags, pagep, fsdata, get_block);
1da177e4 2383out:
89e10787 2384 return err;
1da177e4 2385}
1fe72eaa 2386EXPORT_SYMBOL(cont_write_begin);
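/*
 * Editor's sketch: a ->write_begin for such a hole-less filesystem.  The
 * final argument is a pointer to the per-inode count of bytes allocated
 * so far (FAT, for instance, passes &MSDOS_I(inode)->mmu_private); the
 * myfs_allocated_bytes() accessor used here is hypothetical.
 */
extern loff_t *myfs_allocated_bytes(struct inode *inode);

static int myfs_cont_write_begin(struct file *file,
		struct address_space *mapping, loff_t pos, unsigned len,
		unsigned flags, struct page **pagep, void **fsdata)
{
	*pagep = NULL;
	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
				myfs_get_block,
				myfs_allocated_bytes(mapping->host));
}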
1da177e4
LT
2387
2388int block_prepare_write(struct page *page, unsigned from, unsigned to,
2389 get_block_t *get_block)
2390{
2391 struct inode *inode = page->mapping->host;
2392 int err = __block_prepare_write(inode, page, from, to, get_block);
2393 if (err)
2394 ClearPageUptodate(page);
2395 return err;
2396}
1fe72eaa 2397EXPORT_SYMBOL(block_prepare_write);
1da177e4
LT
2398
2399int block_commit_write(struct page *page, unsigned from, unsigned to)
2400{
2401 struct inode *inode = page->mapping->host;
2402 __block_commit_write(inode,page,from,to);
2403 return 0;
2404}
1fe72eaa 2405EXPORT_SYMBOL(block_commit_write);
1da177e4 2406
54171690
DC
2407/*
2408 * block_page_mkwrite() is not allowed to change the file size as it gets
2409 * called from a page fault handler when a page is first dirtied. Hence we must
2410 * be careful to check for EOF conditions here. We set the page up correctly
2411 * for a written page which means we get ENOSPC checking when writing into
2412 * holes and correct delalloc and unwritten extent mapping on filesystems that
2413 * support these features.
2414 *
2415 * We are not allowed to take the i_mutex here so we have to play games to
2416 * protect against truncate races as the page could now be beyond EOF. Because
2417 * vmtruncate() writes the inode size before removing pages, once we have the
2418 * page lock we can determine safely if the page is beyond EOF. If it is not
2419 * beyond EOF, then the page is guaranteed safe against truncation until we
2420 * unlock the page.
2421 */
2422int
c2ec175c 2423block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
54171690
DC
2424 get_block_t get_block)
2425{
c2ec175c 2426 struct page *page = vmf->page;
54171690
DC
2427 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2428 unsigned long end;
2429 loff_t size;
56a76f82 2430 int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
54171690
DC
2431
2432 lock_page(page);
2433 size = i_size_read(inode);
2434 if ((page->mapping != inode->i_mapping) ||
18336338 2435 (page_offset(page) > size)) {
54171690 2436 /* page got truncated out from underneath us */
b827e496
NP
2437 unlock_page(page);
2438 goto out;
54171690
DC
2439 }
2440
2441 /* page is wholly or partially inside EOF */
2442 if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2443 end = size & ~PAGE_CACHE_MASK;
2444 else
2445 end = PAGE_CACHE_SIZE;
2446
2447 ret = block_prepare_write(page, 0, end, get_block);
2448 if (!ret)
2449 ret = block_commit_write(page, 0, end);
2450
56a76f82 2451 if (unlikely(ret)) {
b827e496 2452 unlock_page(page);
56a76f82
NP
2453 if (ret == -ENOMEM)
2454 ret = VM_FAULT_OOM;
2455 else /* -ENOSPC, -EIO, etc */
2456 ret = VM_FAULT_SIGBUS;
b827e496
NP
2457 } else
2458 ret = VM_FAULT_LOCKED;
c2ec175c 2459
b827e496 2460out:
54171690
DC
2461 return ret;
2462}
1fe72eaa 2463EXPORT_SYMBOL(block_page_mkwrite);
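/*
 * Editor's sketch: a ->page_mkwrite handler normally just forwards to
 * block_page_mkwrite() with the filesystem's get_block, and is wired up
 * through the file's vm_operations_struct.  The myfs_* names are
 * hypothetical; filemap_fault() is the stock fault handler.
 */
static int myfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	return block_page_mkwrite(vma, vmf, myfs_get_block);
}

static const struct vm_operations_struct myfs_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= myfs_page_mkwrite,
};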
1da177e4
LT
2464
2465/*
03158cd7 2466 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
1da177e4
LT
2467 * immediately, while under the page lock. So it needs a special end_io
2468 * handler which does not touch the bh after unlocking it.
1da177e4
LT
2469 */
2470static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2471{
68671f35 2472 __end_buffer_read_notouch(bh, uptodate);
1da177e4
LT
2473}
2474
03158cd7
NP
2475/*
2476 * Attach the singly-linked list of buffers created by nobh_write_begin to
2477 * the page (converting it to a circular linked list and taking care of page
2478 * dirty races).
2479 */
2480static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2481{
2482 struct buffer_head *bh;
2483
2484 BUG_ON(!PageLocked(page));
2485
2486 spin_lock(&page->mapping->private_lock);
2487 bh = head;
2488 do {
2489 if (PageDirty(page))
2490 set_buffer_dirty(bh);
2491 if (!bh->b_this_page)
2492 bh->b_this_page = head;
2493 bh = bh->b_this_page;
2494 } while (bh != head);
2495 attach_page_buffers(page, head);
2496 spin_unlock(&page->mapping->private_lock);
2497}
2498
1da177e4
LT
2499/*
2500 * On entry, the page is fully not uptodate.
2501 * On exit the page is fully uptodate in the areas outside (from,to)
2502 */
03158cd7
NP
2503int nobh_write_begin(struct file *file, struct address_space *mapping,
2504 loff_t pos, unsigned len, unsigned flags,
2505 struct page **pagep, void **fsdata,
1da177e4
LT
2506 get_block_t *get_block)
2507{
03158cd7 2508 struct inode *inode = mapping->host;
1da177e4
LT
2509 const unsigned blkbits = inode->i_blkbits;
2510 const unsigned blocksize = 1 << blkbits;
a4b0672d 2511 struct buffer_head *head, *bh;
03158cd7
NP
2512 struct page *page;
2513 pgoff_t index;
2514 unsigned from, to;
1da177e4 2515 unsigned block_in_page;
a4b0672d 2516 unsigned block_start, block_end;
1da177e4 2517 sector_t block_in_file;
1da177e4 2518 int nr_reads = 0;
1da177e4
LT
2519 int ret = 0;
2520 int is_mapped_to_disk = 1;
1da177e4 2521
03158cd7
NP
2522 index = pos >> PAGE_CACHE_SHIFT;
2523 from = pos & (PAGE_CACHE_SIZE - 1);
2524 to = from + len;
2525
54566b2c 2526 page = grab_cache_page_write_begin(mapping, index, flags);
03158cd7
NP
2527 if (!page)
2528 return -ENOMEM;
2529 *pagep = page;
2530 *fsdata = NULL;
2531
2532 if (page_has_buffers(page)) {
2533 unlock_page(page);
2534 page_cache_release(page);
2535 *pagep = NULL;
2536 return block_write_begin(file, mapping, pos, len, flags, pagep,
2537 fsdata, get_block);
2538 }
a4b0672d 2539
1da177e4
LT
2540 if (PageMappedToDisk(page))
2541 return 0;
2542
a4b0672d
NP
2543 /*
2544 * Allocate buffers so that we can keep track of state, and potentially
2545 * attach them to the page if an error occurs. In the common case of
2546 * no error, they will just be freed again without ever being attached
2547 * to the page (which is all OK, because we're under the page lock).
2548 *
2549 * Be careful: the buffer linked list is a NULL terminated one, rather
2550 * than the circular one we're used to.
2551 */
2552 head = alloc_page_buffers(page, blocksize, 0);
03158cd7
NP
2553 if (!head) {
2554 ret = -ENOMEM;
2555 goto out_release;
2556 }
a4b0672d 2557
1da177e4 2558 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
1da177e4
LT
2559
2560 /*
2561 * We loop across all blocks in the page, whether or not they are
2562 * part of the affected region. This is so we can discover if the
2563 * page is fully mapped-to-disk.
2564 */
a4b0672d 2565 for (block_start = 0, block_in_page = 0, bh = head;
1da177e4 2566 block_start < PAGE_CACHE_SIZE;
a4b0672d 2567 block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
1da177e4
LT
2568 int create;
2569
a4b0672d
NP
2570 block_end = block_start + blocksize;
2571 bh->b_state = 0;
1da177e4
LT
2572 create = 1;
2573 if (block_start >= to)
2574 create = 0;
2575 ret = get_block(inode, block_in_file + block_in_page,
a4b0672d 2576 bh, create);
1da177e4
LT
2577 if (ret)
2578 goto failed;
a4b0672d 2579 if (!buffer_mapped(bh))
1da177e4 2580 is_mapped_to_disk = 0;
a4b0672d
NP
2581 if (buffer_new(bh))
2582 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2583 if (PageUptodate(page)) {
2584 set_buffer_uptodate(bh);
1da177e4 2585 continue;
a4b0672d
NP
2586 }
2587 if (buffer_new(bh) || !buffer_mapped(bh)) {
eebd2aa3
CL
2588 zero_user_segments(page, block_start, from,
2589 to, block_end);
1da177e4
LT
2590 continue;
2591 }
a4b0672d 2592 if (buffer_uptodate(bh))
1da177e4
LT
2593 continue; /* reiserfs does this */
2594 if (block_start < from || block_end > to) {
a4b0672d
NP
2595 lock_buffer(bh);
2596 bh->b_end_io = end_buffer_read_nobh;
2597 submit_bh(READ, bh);
2598 nr_reads++;
1da177e4
LT
2599 }
2600 }
2601
2602 if (nr_reads) {
1da177e4
LT
2603 /*
2604 * The page is locked, so these buffers are protected from
2605 * any VM or truncate activity. Hence we don't need to care
2606 * for the buffer_head refcounts.
2607 */
a4b0672d 2608 for (bh = head; bh; bh = bh->b_this_page) {
f80e69e7
JA
2609 int err;
2610
2611 err = wait_on_buffer_async(bh, current->io_wait);
1be3d0ec
JA
2612 if (err && !ret) {
2613 WARN(1, "%s: err=%d\n", __FUNCTION__, err);
f80e69e7 2614 ret = err;
1be3d0ec 2615 }
1da177e4
LT
2616 if (!buffer_uptodate(bh))
2617 ret = -EIO;
1da177e4
LT
2618 }
2619 if (ret)
2620 goto failed;
2621 }
2622
2623 if (is_mapped_to_disk)
2624 SetPageMappedToDisk(page);
1da177e4 2625
03158cd7 2626 *fsdata = head; /* to be released by nobh_write_end */
a4b0672d 2627
1da177e4
LT
2628 return 0;
2629
2630failed:
03158cd7 2631 BUG_ON(!ret);
1da177e4 2632 /*
a4b0672d
NP
2633 * Error recovery is a bit difficult. We need to zero out blocks that
2634 * were newly allocated, and dirty them to ensure they get written out.
2635 * Buffers need to be attached to the page at this point, otherwise
2636 * the handling of potential IO errors during writeout would be hard
2637 * (could try doing synchronous writeout, but what if that fails too?)
1da177e4 2638 */
03158cd7
NP
2639 attach_nobh_buffers(page, head);
2640 page_zero_new_buffers(page, from, to);
a4b0672d 2641
03158cd7
NP
2642out_release:
2643 unlock_page(page);
2644 page_cache_release(page);
2645 *pagep = NULL;
a4b0672d 2646
03158cd7
NP
2647 if (pos + len > inode->i_size)
2648 vmtruncate(inode, inode->i_size);
a4b0672d 2649
1da177e4
LT
2650 return ret;
2651}
03158cd7 2652EXPORT_SYMBOL(nobh_write_begin);
1da177e4 2653
03158cd7
NP
2654int nobh_write_end(struct file *file, struct address_space *mapping,
2655 loff_t pos, unsigned len, unsigned copied,
2656 struct page *page, void *fsdata)
1da177e4
LT
2657{
2658 struct inode *inode = page->mapping->host;
efdc3131 2659 struct buffer_head *head = fsdata;
03158cd7 2660 struct buffer_head *bh;
5b41e74a 2661 BUG_ON(fsdata != NULL && page_has_buffers(page));
1da177e4 2662
d4cf109f 2663 if (unlikely(copied < len) && head)
5b41e74a
DM
2664 attach_nobh_buffers(page, head);
2665 if (page_has_buffers(page))
2666 return generic_write_end(file, mapping, pos, len,
2667 copied, page, fsdata);
a4b0672d 2668
22c8ca78 2669 SetPageUptodate(page);
1da177e4 2670 set_page_dirty(page);
03158cd7
NP
2671 if (pos+copied > inode->i_size) {
2672 i_size_write(inode, pos+copied);
1da177e4
LT
2673 mark_inode_dirty(inode);
2674 }
03158cd7
NP
2675
2676 unlock_page(page);
2677 page_cache_release(page);
2678
03158cd7
NP
2679 while (head) {
2680 bh = head;
2681 head = head->b_this_page;
2682 free_buffer_head(bh);
2683 }
2684
2685 return copied;
1da177e4 2686}
03158cd7 2687EXPORT_SYMBOL(nobh_write_end);
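/*
 * Editor's sketch: the nobh_* variants drop in the same way for a
 * filesystem that wants to avoid attaching buffer_heads on the common
 * path (ext2 exposes this as its "nobh" mount option).  myfs_get_block
 * is the hypothetical callback from the earlier sketches.
 */
static int myfs_nobh_write_begin(struct file *file,
		struct address_space *mapping, loff_t pos, unsigned len,
		unsigned flags, struct page **pagep, void **fsdata)
{
	return nobh_write_begin(file, mapping, pos, len, flags,
				pagep, fsdata, myfs_get_block);
}

static int myfs_nobh_writepage(struct page *page,
			       struct writeback_control *wbc)
{
	return nobh_writepage(page, myfs_get_block, wbc);
}
/* ->write_end is then nobh_write_end(), which frees the buffers again. */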
1da177e4
LT
2688
2689/*
2690 * nobh_writepage() - based on block_write_full_page() except
2691 * that it tries to operate without attaching bufferheads to
2692 * the page.
2693 */
2694int nobh_writepage(struct page *page, get_block_t *get_block,
2695 struct writeback_control *wbc)
2696{
2697 struct inode * const inode = page->mapping->host;
2698 loff_t i_size = i_size_read(inode);
2699 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2700 unsigned offset;
1da177e4
LT
2701 int ret;
2702
2703 /* Is the page fully inside i_size? */
2704 if (page->index < end_index)
2705 goto out;
2706
2707 /* Is the page fully outside i_size? (truncate in progress) */
2708 offset = i_size & (PAGE_CACHE_SIZE-1);
2709 if (page->index >= end_index+1 || !offset) {
2710 /*
2711 * The page may have dirty, unmapped buffers. For example,
2712 * they may have been added in ext3_writepage(). Make them
2713 * freeable here, so the page does not leak.
2714 */
2715#if 0
2716 /* Not really sure about this - do we need this ? */
2717 if (page->mapping->a_ops->invalidatepage)
2718 page->mapping->a_ops->invalidatepage(page, offset);
2719#endif
2720 unlock_page(page);
2721 return 0; /* don't care */
2722 }
2723
2724 /*
2725 * The page straddles i_size. It must be zeroed out on each and every
2726 * writepage invocation because it may be mmapped. "A file is mapped
2727 * in multiples of the page size. For a file that is not a multiple of
2728 * the page size, the remaining memory is zeroed when mapped, and
2729 * writes to that region are not written out to the file."
2730 */
eebd2aa3 2731 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
1da177e4
LT
2732out:
2733 ret = mpage_writepage(page, get_block, wbc);
2734 if (ret == -EAGAIN)
35c80d5f
CM
2735 ret = __block_write_full_page(inode, page, get_block, wbc,
2736 end_buffer_async_write);
1da177e4
LT
2737 return ret;
2738}
2739EXPORT_SYMBOL(nobh_writepage);
2740
03158cd7
NP
2741int nobh_truncate_page(struct address_space *mapping,
2742 loff_t from, get_block_t *get_block)
1da177e4 2743{
1da177e4
LT
2744 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2745 unsigned offset = from & (PAGE_CACHE_SIZE-1);
03158cd7
NP
2746 unsigned blocksize;
2747 sector_t iblock;
2748 unsigned length, pos;
2749 struct inode *inode = mapping->host;
1da177e4 2750 struct page *page;
03158cd7
NP
2751 struct buffer_head map_bh;
2752 int err;
1da177e4 2753
03158cd7
NP
2754 blocksize = 1 << inode->i_blkbits;
2755 length = offset & (blocksize - 1);
2756
2757 /* Block boundary? Nothing to do */
2758 if (!length)
2759 return 0;
2760
2761 length = blocksize - length;
2762 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1da177e4 2763
1da177e4 2764 page = grab_cache_page(mapping, index);
03158cd7 2765 err = -ENOMEM;
1da177e4
LT
2766 if (!page)
2767 goto out;
2768
03158cd7
NP
2769 if (page_has_buffers(page)) {
2770has_buffers:
2771 unlock_page(page);
2772 page_cache_release(page);
2773 return block_truncate_page(mapping, from, get_block);
2774 }
2775
2776 /* Find the buffer that contains "offset" */
2777 pos = blocksize;
2778 while (offset >= pos) {
2779 iblock++;
2780 pos += blocksize;
2781 }
2782
460bcf57
TT
2783 map_bh.b_size = blocksize;
2784 map_bh.b_state = 0;
03158cd7
NP
2785 err = get_block(inode, iblock, &map_bh, 0);
2786 if (err)
2787 goto unlock;
2788 /* unmapped? It's a hole - nothing to do */
2789 if (!buffer_mapped(&map_bh))
2790 goto unlock;
2791
2792 /* Ok, it's mapped. Make sure it's up-to-date */
2793 if (!PageUptodate(page)) {
2794 err = mapping->a_ops->readpage(NULL, page);
2795 if (err) {
2796 page_cache_release(page);
2797 goto out;
2798 }
2799 lock_page(page);
2800 if (!PageUptodate(page)) {
2801 err = -EIO;
2802 goto unlock;
2803 }
2804 if (page_has_buffers(page))
2805 goto has_buffers;
1da177e4 2806 }
eebd2aa3 2807 zero_user(page, offset, length);
03158cd7
NP
2808 set_page_dirty(page);
2809 err = 0;
2810
2811unlock:
1da177e4
LT
2812 unlock_page(page);
2813 page_cache_release(page);
2814out:
03158cd7 2815 return err;
1da177e4
LT
2816}
2817EXPORT_SYMBOL(nobh_truncate_page);
2818
2819int block_truncate_page(struct address_space *mapping,
2820 loff_t from, get_block_t *get_block)
2821{
2822 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2823 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2824 unsigned blocksize;
54b21a79 2825 sector_t iblock;
1da177e4
LT
2826 unsigned length, pos;
2827 struct inode *inode = mapping->host;
2828 struct page *page;
2829 struct buffer_head *bh;
1da177e4
LT
2830 int err;
2831
2832 blocksize = 1 << inode->i_blkbits;
2833 length = offset & (blocksize - 1);
2834
2835 /* Block boundary? Nothing to do */
2836 if (!length)
2837 return 0;
2838
2839 length = blocksize - length;
54b21a79 2840 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1da177e4
LT
2841
2842 page = grab_cache_page(mapping, index);
2843 err = -ENOMEM;
2844 if (!page)
2845 goto out;
2846
2847 if (!page_has_buffers(page))
2848 create_empty_buffers(page, blocksize, 0);
2849
2850 /* Find the buffer that contains "offset" */
2851 bh = page_buffers(page);
2852 pos = blocksize;
2853 while (offset >= pos) {
2854 bh = bh->b_this_page;
2855 iblock++;
2856 pos += blocksize;
2857 }
2858
2859 err = 0;
2860 if (!buffer_mapped(bh)) {
b0cf2321 2861 WARN_ON(bh->b_size != blocksize);
1da177e4
LT
2862 err = get_block(inode, iblock, bh, 0);
2863 if (err)
2864 goto unlock;
2865 /* unmapped? It's a hole - nothing to do */
2866 if (!buffer_mapped(bh))
2867 goto unlock;
2868 }
2869
2870 /* Ok, it's mapped. Make sure it's up-to-date */
2871 if (PageUptodate(page))
2872 set_buffer_uptodate(bh);
2873
33a266dd 2874 if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
1da177e4 2875 ll_rw_block(READ, 1, &bh);
f80e69e7 2876 err = wait_on_buffer_async(bh, current->io_wait);
1be3d0ec
JA
2877 if (err) {
2878 WARN(1, "err=%d\n", err);
2879 goto out;
2880 }
1da177e4 2881 /* Uhhuh. Read error. Complain and punt. */
f80e69e7 2882 err = -EIO;
1da177e4
LT
2883 if (!buffer_uptodate(bh))
2884 goto unlock;
2885 }
2886
eebd2aa3 2887 zero_user(page, offset, length);
1da177e4
LT
2888 mark_buffer_dirty(bh);
2889 err = 0;
2890
2891unlock:
2892 unlock_page(page);
2893 page_cache_release(page);
2894out:
2895 return err;
2896}
1fe72eaa 2897EXPORT_SYMBOL(block_truncate_page);
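/*
 * Editor's sketch: in a truncate path, block_truncate_page() zeroes the
 * partial block at the new EOF before the blocks beyond it are released,
 * so stale data cannot reappear if the file is later extended.  The
 * block-freeing step is elided here.
 */
static int myfs_truncate(struct inode *inode, loff_t newsize)
{
	int err;

	err = block_truncate_page(inode->i_mapping, newsize, myfs_get_block);
	if (err)
		return err;
	/* ... now free the on-disk blocks past newsize ... */
	return 0;
}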
1da177e4
LT
2898
2899/*
2900 * The generic ->writepage function for buffer-backed address_spaces
35c80d5f 2901 * this form passes in the end_io handler used to finish the IO.
1da177e4 2902 */
35c80d5f
CM
2903int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2904 struct writeback_control *wbc, bh_end_io_t *handler)
1da177e4
LT
2905{
2906 struct inode * const inode = page->mapping->host;
2907 loff_t i_size = i_size_read(inode);
2908 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2909 unsigned offset;
1da177e4
LT
2910
2911 /* Is the page fully inside i_size? */
2912 if (page->index < end_index)
35c80d5f
CM
2913 return __block_write_full_page(inode, page, get_block, wbc,
2914 handler);
1da177e4
LT
2915
2916 /* Is the page fully outside i_size? (truncate in progress) */
2917 offset = i_size & (PAGE_CACHE_SIZE-1);
2918 if (page->index >= end_index+1 || !offset) {
2919 /*
2920 * The page may have dirty, unmapped buffers. For example,
2921 * they may have been added in ext3_writepage(). Make them
2922 * freeable here, so the page does not leak.
2923 */
aaa4059b 2924 do_invalidatepage(page, 0);
1da177e4
LT
2925 unlock_page(page);
2926 return 0; /* don't care */
2927 }
2928
2929 /*
2930 * The page straddles i_size. It must be zeroed out on each and every
2931 * writepage invocation because it may be mmapped. "A file is mapped
2932 * in multiples of the page size. For a file that is not a multiple of
2933 * the page size, the remaining memory is zeroed when mapped, and
2934 * writes to that region are not written out to the file."
2935 */
eebd2aa3 2936 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
35c80d5f 2937 return __block_write_full_page(inode, page, get_block, wbc, handler);
1da177e4 2938}
1fe72eaa 2939EXPORT_SYMBOL(block_write_full_page_endio);
1da177e4 2940
35c80d5f
CM
2941/*
2942 * The generic ->writepage function for buffer-backed address_spaces
2943 */
2944int block_write_full_page(struct page *page, get_block_t *get_block,
2945 struct writeback_control *wbc)
2946{
2947 return block_write_full_page_endio(page, get_block, wbc,
2948 end_buffer_async_write);
2949}
1fe72eaa 2950EXPORT_SYMBOL(block_write_full_page);
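/*
 * Editor's sketch: the usual ->writepage for a buffer-backed filesystem
 * is a one-line wrapper around block_write_full_page(), again using the
 * hypothetical myfs_get_block.
 */
static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, myfs_get_block, wbc);
}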
35c80d5f 2951
1da177e4
LT
2952sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2953 get_block_t *get_block)
2954{
2955 struct buffer_head tmp;
2956 struct inode *inode = mapping->host;
2957 tmp.b_state = 0;
2958 tmp.b_blocknr = 0;
b0cf2321 2959 tmp.b_size = 1 << inode->i_blkbits;
1da177e4
LT
2960 get_block(inode, block, &tmp, 0);
2961 return tmp.b_blocknr;
2962}
1fe72eaa 2963EXPORT_SYMBOL(generic_block_bmap);
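/*
 * Editor's sketch: ->bmap is normally just as thin a wrapper.
 */
static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
{
	return generic_block_bmap(mapping, block, myfs_get_block);
}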
1da177e4 2964
6712ecf8 2965static void end_bio_bh_io_sync(struct bio *bio, int err)
1da177e4
LT
2966{
2967 struct buffer_head *bh = bio->bi_private;
2968
1da177e4
LT
2969 if (err == -EOPNOTSUPP) {
2970 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2971 set_bit(BH_Eopnotsupp, &bh->b_state);
2972 }
2973
08bafc03
KM
2974 if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
2975 set_bit(BH_Quiet, &bh->b_state);
2976
1da177e4
LT
2977 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2978 bio_put(bio);
1da177e4
LT
2979}
2980
2981int submit_bh(int rw, struct buffer_head * bh)
2982{
2983 struct bio *bio;
2984 int ret = 0;
2985
2986 BUG_ON(!buffer_locked(bh));
2987 BUG_ON(!buffer_mapped(bh));
2988 BUG_ON(!bh->b_end_io);
8fb0e342
AK
2989 BUG_ON(buffer_delay(bh));
2990 BUG_ON(buffer_unwritten(bh));
1da177e4 2991
48fd4f93
JA
2992 /*
2993 * Mask in barrier bit for a write (could be either a WRITE or a
2994 * WRITE_SYNC).
2995 */
2996 if (buffer_ordered(bh) && (rw & WRITE))
2997 rw |= WRITE_BARRIER;
1da177e4
LT
2998
2999 /*
48fd4f93 3000 * Only clear out a write error when rewriting
1da177e4 3001 */
48fd4f93 3002 if (test_set_buffer_req(bh) && (rw & WRITE))
1da177e4
LT
3003 clear_buffer_write_io_error(bh);
3004
3005 /*
3006 * from here on down, it's all bio -- do the initial mapping,
3007 * submit_bio -> generic_make_request may further map this bio around
3008 */
3009 bio = bio_alloc(GFP_NOIO, 1);
3010
3011 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
3012 bio->bi_bdev = bh->b_bdev;
3013 bio->bi_io_vec[0].bv_page = bh->b_page;
3014 bio->bi_io_vec[0].bv_len = bh->b_size;
3015 bio->bi_io_vec[0].bv_offset = bh_offset(bh);
3016
3017 bio->bi_vcnt = 1;
3018 bio->bi_idx = 0;
3019 bio->bi_size = bh->b_size;
3020
3021 bio->bi_end_io = end_bio_bh_io_sync;
3022 bio->bi_private = bh;
3023
3024 bio_get(bio);
3025 submit_bio(rw, bio);
3026
3027 if (bio_flagged(bio, BIO_EOPNOTSUPP))
3028 ret = -EOPNOTSUPP;
3029
3030 bio_put(bio);
3031 return ret;
3032}
1fe72eaa 3033EXPORT_SYMBOL(submit_bh);
1da177e4
LT
3034
3035/**
3036 * ll_rw_block: low-level access to block devices (DEPRECATED)
a7662236 3037 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
1da177e4
LT
3038 * @nr: number of &struct buffer_heads in the array
3039 * @bhs: array of pointers to &struct buffer_head
3040 *
a7662236
JK
3041 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
3042 * requests an I/O operation on them, either a %READ or a %WRITE. The third
3043 * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
3044 * are sent to disk. The fourth %READA option is described in the documentation
3045 * for generic_make_request() which ll_rw_block() calls.
1da177e4
LT
3046 *
3047 * This function drops any buffer that it cannot get a lock on (with the
a7662236
JK
3048 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
3049 * clean when doing a write request, and any buffer that appears to be
3050 * up-to-date when doing a read request. Further it marks as clean buffers that
3051 * are processed for writing (the buffer cache won't assume that they are
3052 * actually clean until the buffer gets unlocked).
1da177e4
LT
3053 *
3054 * ll_rw_block sets b_end_io to simple completion handler that marks
3055 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
3056 * any waiters.
3057 *
3058 * All of the buffers must be for the same device, and must also be a
3059 * multiple of the current approved size for the device.
3060 */
3061void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
3062{
3063 int i;
3064
3065 for (i = 0; i < nr; i++) {
3066 struct buffer_head *bh = bhs[i];
3067
9cf6b720 3068 if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
a7662236 3069 lock_buffer(bh);
ca5de404 3070 else if (!trylock_buffer(bh))
1da177e4
LT
3071 continue;
3072
9cf6b720
JA
3073 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
3074 rw == SWRITE_SYNC_PLUG) {
1da177e4 3075 if (test_clear_buffer_dirty(bh)) {
76c3073a 3076 bh->b_end_io = end_buffer_write_sync;
e60e5c50 3077 get_bh(bh);
18ce3751
JA
3078 if (rw == SWRITE_SYNC)
3079 submit_bh(WRITE_SYNC, bh);
3080 else
3081 submit_bh(WRITE, bh);
1da177e4
LT
3082 continue;
3083 }
3084 } else {
1da177e4 3085 if (!buffer_uptodate(bh)) {
76c3073a 3086 bh->b_end_io = end_buffer_read_sync;
e60e5c50 3087 get_bh(bh);
1da177e4
LT
3088 submit_bh(rw, bh);
3089 continue;
3090 }
3091 }
3092 unlock_buffer(bh);
1da177e4
LT
3093 }
3094}
1fe72eaa 3095EXPORT_SYMBOL(ll_rw_block);
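/*
 * Editor's sketch: the common (if deprecated) ll_rw_block() pattern is to
 * fire off reads for a batch of metadata buffers and then wait for each
 * one, checking uptodate afterwards because buffers that were locked or
 * already uptodate are silently skipped.  bhs[] is assumed to hold
 * buffers obtained with sb_getblk() or similar.
 */
static int myfs_read_bhs(struct buffer_head *bhs[], int nr)
{
	int i;

	ll_rw_block(READ, nr, bhs);
	for (i = 0; i < nr; i++) {
		wait_on_buffer(bhs[i]);
		if (!buffer_uptodate(bhs[i]))
			return -EIO;
	}
	return 0;
}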
1da177e4
LT
3096
3097/*
3098 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3099 * and then start new I/O and then wait upon it. The caller must have a ref on
3100 * the buffer_head.
3101 */
3102int sync_dirty_buffer(struct buffer_head *bh)
3103{
3104 int ret = 0;
3105
3106 WARN_ON(atomic_read(&bh->b_count) < 1);
3107 lock_buffer(bh);
3108 if (test_clear_buffer_dirty(bh)) {
3109 get_bh(bh);
3110 bh->b_end_io = end_buffer_write_sync;
1aa2a7cc 3111 ret = submit_bh(WRITE_SYNC, bh);
1da177e4
LT
3112 wait_on_buffer(bh);
3113 if (buffer_eopnotsupp(bh)) {
3114 clear_buffer_eopnotsupp(bh);
3115 ret = -EOPNOTSUPP;
3116 }
3117 if (!ret && !buffer_uptodate(bh))
3118 ret = -EIO;
3119 } else {
3120 unlock_buffer(bh);
3121 }
3122 return ret;
3123}
1fe72eaa 3124EXPORT_SYMBOL(sync_dirty_buffer);
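/*
 * Editor's sketch: the classic caller of sync_dirty_buffer() is a
 * filesystem flushing its superblock or similar critical metadata.  sbh
 * is assumed to be a buffer_head the caller already holds a reference
 * to, e.g. from sb_bread().
 */
static int myfs_commit_super(struct buffer_head *sbh)
{
	mark_buffer_dirty(sbh);
	return sync_dirty_buffer(sbh);	/* submits and waits for the write */
}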
1da177e4
LT
3125
3126/*
3127 * try_to_free_buffers() checks if all the buffers on this particular page
3128 * are unused, and releases them if so.
3129 *
3130 * Exclusion against try_to_free_buffers may be obtained by either
3131 * locking the page or by holding its mapping's private_lock.
3132 *
3133 * If the page is dirty but all the buffers are clean then we need to
3134 * be sure to mark the page clean as well. This is because the page
3135 * may be against a block device, and a later reattachment of buffers
3136 * to a dirty page will set *all* buffers dirty. Which would corrupt
3137 * filesystem data on the same device.
3138 *
3139 * The same applies to regular filesystem pages: if all the buffers are
3140 * clean then we set the page clean and proceed. To do that, we require
3141 * total exclusion from __set_page_dirty_buffers(). That is obtained with
3142 * private_lock.
3143 *
3144 * try_to_free_buffers() is non-blocking.
3145 */
3146static inline int buffer_busy(struct buffer_head *bh)
3147{
3148 return atomic_read(&bh->b_count) |
3149 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3150}
3151
3152static int
3153drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3154{
3155 struct buffer_head *head = page_buffers(page);
3156 struct buffer_head *bh;
3157
3158 bh = head;
3159 do {
de7d5a3b 3160 if (buffer_write_io_error(bh) && page->mapping)
1da177e4
LT
3161 set_bit(AS_EIO, &page->mapping->flags);
3162 if (buffer_busy(bh))
3163 goto failed;
3164 bh = bh->b_this_page;
3165 } while (bh != head);
3166
3167 do {
3168 struct buffer_head *next = bh->b_this_page;
3169
535ee2fb 3170 if (bh->b_assoc_map)
1da177e4
LT
3171 __remove_assoc_queue(bh);
3172 bh = next;
3173 } while (bh != head);
3174 *buffers_to_free = head;
3175 __clear_page_buffers(page);
3176 return 1;
3177failed:
3178 return 0;
3179}
3180
3181int try_to_free_buffers(struct page *page)
3182{
3183 struct address_space * const mapping = page->mapping;
3184 struct buffer_head *buffers_to_free = NULL;
3185 int ret = 0;
3186
3187 BUG_ON(!PageLocked(page));
ecdfc978 3188 if (PageWriteback(page))
1da177e4
LT
3189 return 0;
3190
3191 if (mapping == NULL) { /* can this still happen? */
3192 ret = drop_buffers(page, &buffers_to_free);
3193 goto out;
3194 }
3195
3196 spin_lock(&mapping->private_lock);
3197 ret = drop_buffers(page, &buffers_to_free);
ecdfc978
LT
3198
3199 /*
3200 * If the filesystem writes its buffers by hand (eg ext3)
3201 * then we can have clean buffers against a dirty page. We
3202 * clean the page here; otherwise the VM will never notice
3203 * that the filesystem did any IO at all.
3204 *
3205 * Also, during truncate, discard_buffer will have marked all
3206 * the page's buffers clean. We discover that here and clean
3207 * the page also.
87df7241
NP
3208 *
3209 * private_lock must be held over this entire operation in order
3210 * to synchronise against __set_page_dirty_buffers and prevent the
3211 * dirty bit from being lost.
ecdfc978
LT
3212 */
3213 if (ret)
3214 cancel_dirty_page(page, PAGE_CACHE_SIZE);
87df7241 3215 spin_unlock(&mapping->private_lock);
1da177e4
LT
3216out:
3217 if (buffers_to_free) {
3218 struct buffer_head *bh = buffers_to_free;
3219
3220 do {
3221 struct buffer_head *next = bh->b_this_page;
3222 free_buffer_head(bh);
3223 bh = next;
3224 } while (bh != buffers_to_free);
3225 }
3226 return ret;
3227}
3228EXPORT_SYMBOL(try_to_free_buffers);
3229
3978d717 3230void block_sync_page(struct page *page)
1da177e4
LT
3231{
3232 struct address_space *mapping;
3233
3234 smp_mb();
3235 mapping = page_mapping(page);
3236 if (mapping)
3237 blk_run_backing_dev(mapping->backing_dev_info, page);
1da177e4 3238}
1fe72eaa 3239EXPORT_SYMBOL(block_sync_page);
1da177e4
LT
3240
3241/*
3242 * There are no bdflush tunables left. But distributions are
3243 * still running obsolete flush daemons, so we terminate them here.
3244 *
3245 * Use of bdflush() is deprecated and will be removed in a future kernel.
5b0830cb 3246 * The `flush-X' kernel threads fully replace bdflush daemons and this call.
1da177e4 3247 */
bdc480e3 3248SYSCALL_DEFINE2(bdflush, int, func, long, data)
1da177e4
LT
3249{
3250 static int msg_count;
3251
3252 if (!capable(CAP_SYS_ADMIN))
3253 return -EPERM;
3254
3255 if (msg_count < 5) {
3256 msg_count++;
3257 printk(KERN_INFO
3258 "warning: process `%s' used the obsolete bdflush"
3259 " system call\n", current->comm);
3260 printk(KERN_INFO "Fix your initscripts?\n");
3261 }
3262
3263 if (func == 1)
3264 do_exit(0);
3265 return 0;
3266}
3267
3268/*
3269 * Buffer-head allocation
3270 */
e18b890b 3271static struct kmem_cache *bh_cachep;
1da177e4
LT
3272
3273/*
3274 * Once the number of bh's in the machine exceeds this level, we start
3275 * stripping them in writeback.
3276 */
3277static int max_buffer_heads;
3278
3279int buffer_heads_over_limit;
3280
3281struct bh_accounting {
3282 int nr; /* Number of live bh's */
3283 int ratelimit; /* Limit cacheline bouncing */
3284};
3285
3286static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3287
3288static void recalc_bh_state(void)
3289{
3290 int i;
3291 int tot = 0;
3292
3293 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
3294 return;
3295 __get_cpu_var(bh_accounting).ratelimit = 0;
8a143426 3296 for_each_online_cpu(i)
1da177e4
LT
3297 tot += per_cpu(bh_accounting, i).nr;
3298 buffer_heads_over_limit = (tot > max_buffer_heads);
3299}
3300
dd0fc66f 3301struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
1da177e4 3302{
488514d1 3303 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
1da177e4 3304 if (ret) {
a35afb83 3305 INIT_LIST_HEAD(&ret->b_assoc_buffers);
736c7b80 3306 get_cpu_var(bh_accounting).nr++;
1da177e4 3307 recalc_bh_state();
736c7b80 3308 put_cpu_var(bh_accounting);
1da177e4
LT
3309 }
3310 return ret;
3311}
3312EXPORT_SYMBOL(alloc_buffer_head);
3313
3314void free_buffer_head(struct buffer_head *bh)
3315{
3316 BUG_ON(!list_empty(&bh->b_assoc_buffers));
3317 kmem_cache_free(bh_cachep, bh);
736c7b80 3318 get_cpu_var(bh_accounting).nr--;
1da177e4 3319 recalc_bh_state();
736c7b80 3320 put_cpu_var(bh_accounting);
1da177e4
LT
3321}
3322EXPORT_SYMBOL(free_buffer_head);
3323
1da177e4
LT
3324static void buffer_exit_cpu(int cpu)
3325{
3326 int i;
3327 struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3328
3329 for (i = 0; i < BH_LRU_SIZE; i++) {
3330 brelse(b->bhs[i]);
3331 b->bhs[i] = NULL;
3332 }
8a143426
ED
3333 get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
3334 per_cpu(bh_accounting, cpu).nr = 0;
3335 put_cpu_var(bh_accounting);
1da177e4
LT
3336}
3337
3338static int buffer_cpu_notify(struct notifier_block *self,
3339 unsigned long action, void *hcpu)
3340{
8bb78442 3341 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
1da177e4
LT
3342 buffer_exit_cpu((unsigned long)hcpu);
3343 return NOTIFY_OK;
3344}
1da177e4 3345
389d1b08 3346/**
a6b91919 3347 * bh_uptodate_or_lock - Test whether the buffer is uptodate
389d1b08
AK
3348 * @bh: struct buffer_head
3349 *
3350 * Return true if the buffer is up-to-date and false,
3351 * with the buffer locked, if not.
3352 */
3353int bh_uptodate_or_lock(struct buffer_head *bh)
3354{
3355 if (!buffer_uptodate(bh)) {
3356 lock_buffer(bh);
3357 if (!buffer_uptodate(bh))
3358 return 0;
3359 unlock_buffer(bh);
3360 }
3361 return 1;
3362}
3363EXPORT_SYMBOL(bh_uptodate_or_lock);
3364
3365/**
a6b91919 3366 * bh_submit_read - Submit a locked buffer for reading
389d1b08
AK
3367 * @bh: struct buffer_head
3368 *
3369 * Returns zero on success and -EIO on error.
3370 */
3371int bh_submit_read(struct buffer_head *bh)
3372{
3373 BUG_ON(!buffer_locked(bh));
3374
3375 if (buffer_uptodate(bh)) {
3376 unlock_buffer(bh);
3377 return 0;
3378 }
3379
3380 get_bh(bh);
3381 bh->b_end_io = end_buffer_read_sync;
3382 submit_bh(READ, bh);
3c72afb2 3383 wait_on_buffer(bh);
389d1b08
AK
3384 if (buffer_uptodate(bh))
3385 return 0;
3386 return -EIO;
3387}
3388EXPORT_SYMBOL(bh_submit_read);
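/*
 * Editor's sketch: bh_uptodate_or_lock() and bh_submit_read() are meant
 * to be used as a pair, reading a metadata buffer only when necessary
 * and without racing against a concurrent reader.
 */
static int myfs_read_bh(struct buffer_head *bh)
{
	if (bh_uptodate_or_lock(bh))
		return 0;		/* already uptodate, buffer not locked */
	return bh_submit_read(bh);	/* we hold the lock: submit and wait */
}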
3389
b98938c3 3390static void
51cc5068 3391init_buffer_head(void *data)
b98938c3
CL
3392{
3393 struct buffer_head *bh = data;
3394
3395 memset(bh, 0, sizeof(*bh));
3396 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3397}
3398
1da177e4
LT
3399void __init buffer_init(void)
3400{
3401 int nrpages;
3402
b98938c3
CL
3403 bh_cachep = kmem_cache_create("buffer_head",
3404 sizeof(struct buffer_head), 0,
3405 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3406 SLAB_MEM_SPREAD),
3407 init_buffer_head);
1da177e4
LT
3408
3409 /*
3410 * Limit the bh occupancy to 10% of ZONE_NORMAL
3411 */
3412 nrpages = (nr_free_buffer_pages() * 10) / 100;
3413 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3414 hotcpu_notifier(buffer_cpu_notify, 0);
3415}