fs/gfs2/aops.c
1 /*
2  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
3  * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
4  *
5  * This copyrighted material is made available to anyone wishing to use,
6  * modify, copy, or redistribute it subject to the terms and conditions
7  * of the GNU General Public License version 2.
8  */
9
10 #include <linux/sched.h>
11 #include <linux/slab.h>
12 #include <linux/spinlock.h>
13 #include <linux/completion.h>
14 #include <linux/buffer_head.h>
15 #include <linux/pagemap.h>
16 #include <linux/pagevec.h>
17 #include <linux/mpage.h>
18 #include <linux/fs.h>
19 #include <linux/writeback.h>
20 #include <linux/swap.h>
21 #include <linux/gfs2_ondisk.h>
22 #include <linux/backing-dev.h>
23 #include <linux/uio.h>
24 #include <trace/events/writeback.h>
25
26 #include "gfs2.h"
27 #include "incore.h"
28 #include "bmap.h"
29 #include "glock.h"
30 #include "inode.h"
31 #include "log.h"
32 #include "meta_io.h"
33 #include "quota.h"
34 #include "trans.h"
35 #include "rgrp.h"
36 #include "super.h"
37 #include "util.h"
38 #include "glops.h"
39
40
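/**
 * gfs2_page_add_databufs - Add a page's buffers to the current transaction
 * @ip: The inode
 * @page: The page
 * @from: Offset of the first byte of interest within the page
 * @len: Length of the byte range of interest
 *
 * Walks the buffer heads attached to @page and adds each buffer that
 * overlaps the range [@from, @from + @len) to the current transaction as a
 * data buffer, marking it uptodate first for jdata inodes.
 */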
41 static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
42                                    unsigned int from, unsigned int len)
43 {
44         struct buffer_head *head = page_buffers(page);
45         unsigned int bsize = head->b_size;
46         struct buffer_head *bh;
47         unsigned int to = from + len;
48         unsigned int start, end;
49
50         for (bh = head, start = 0; bh != head || !start;
51              bh = bh->b_this_page, start = end) {
52                 end = start + bsize;
53                 if (end <= from)
54                         continue;
55                 if (start >= to)
56                         break;
57                 if (gfs2_is_jdata(ip))
58                         set_buffer_uptodate(bh);
59                 gfs2_trans_add_data(ip->i_gl, bh);
60         }
61 }
62
63 /**
64  * gfs2_get_block_noalloc - Fills in a buffer head with details about a block
65  * @inode: The inode
66  * @lblock: The block number to look up
67  * @bh_result: The buffer head to return the result in
68  * @create: Non-zero if we may add block to the file
69  *
70  * Returns: errno
71  */
72
73 static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
74                                   struct buffer_head *bh_result, int create)
75 {
76         int error;
77
78         error = gfs2_block_map(inode, lblock, bh_result, 0);
79         if (error)
80                 return error;
81         if (!buffer_mapped(bh_result))
82                 return -EIO;
83         return 0;
84 }
85
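/*
 * gfs2_get_block_direct - Block lookup for the direct I/O path
 *
 * Like gfs2_get_block_noalloc() above, this ignores the create argument and
 * calls gfs2_block_map() with create == 0, so no allocation ever happens on
 * this path; unmapped blocks are simply left unmapped in the result.
 */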
86 static int gfs2_get_block_direct(struct inode *inode, sector_t lblock,
87                                  struct buffer_head *bh_result, int create)
88 {
89         return gfs2_block_map(inode, lblock, bh_result, 0);
90 }
91
92 /**
93  * gfs2_writepage_common - Common bits of writepage
94  * @page: The page to be written
95  * @wbc: The writeback control
96  *
97  * Returns: 1 if writepage can go ahead, zero if the page has been dealt
98  *          with here (redirtied or invalidated), or a negative error code.
98  */
99
100 static int gfs2_writepage_common(struct page *page,
101                                  struct writeback_control *wbc)
102 {
103         struct inode *inode = page->mapping->host;
104         struct gfs2_inode *ip = GFS2_I(inode);
105         struct gfs2_sbd *sdp = GFS2_SB(inode);
106         loff_t i_size = i_size_read(inode);
107         pgoff_t end_index = i_size >> PAGE_SHIFT;
108         unsigned offset;
109
110         if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl)))
111                 goto out;
112         if (current->journal_info)
113                 goto redirty;
114         /* Is the page fully outside i_size? (truncate in progress) */
115         offset = i_size & (PAGE_SIZE-1);
116         if (page->index > end_index || (page->index == end_index && !offset)) {
117                 page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
118                 goto out;
119         }
120         return 1;
121 redirty:
122         redirty_page_for_writepage(wbc, page);
123 out:
124         unlock_page(page);
125         return 0;
126 }
127
128 /**
129  * gfs2_writepage - Write page for writeback mappings
130  * @page: The page
131  * @wbc: The writeback control
132  *
133  */
134
135 static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
136 {
137         int ret;
138
139         ret = gfs2_writepage_common(page, wbc);
140         if (ret <= 0)
141                 return ret;
142
143         return nobh_writepage(page, gfs2_get_block_noalloc, wbc);
144 }
145
146 /* This is the same as calling block_write_full_page, but it also
147  * writes pages outside of i_size
148  */
149 static int gfs2_write_full_page(struct page *page, get_block_t *get_block,
150                                 struct writeback_control *wbc)
151 {
152         struct inode * const inode = page->mapping->host;
153         loff_t i_size = i_size_read(inode);
154         const pgoff_t end_index = i_size >> PAGE_SHIFT;
155         unsigned offset;
156
157         /*
158          * The page straddles i_size.  It must be zeroed out on each and every
159          * writepage invocation because it may be mmapped.  "A file is mapped
160          * in multiples of the page size.  For a file that is not a multiple of
161          * the  page size, the remaining memory is zeroed when mapped, and
162          * writes to that region are not written out to the file."
163          */
164         offset = i_size & (PAGE_SIZE-1);
165         if (page->index == end_index && offset)
166                 zero_user_segment(page, offset, PAGE_SIZE);
167
168         return __block_write_full_page(inode, page, get_block, wbc,
169                                        end_buffer_async_write);
170 }
171
172 /**
173  * __gfs2_jdata_writepage - The core of jdata writepage
174  * @page: The page to write
175  * @wbc: The writeback control
176  *
177  * This is shared between writepage and writepages and implements the
178  * core of the writepage operation. If a transaction is required then
179  * PageChecked will have been set and the transaction will have
180  * already been started before this is called.
181  */
182
183 static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc)
184 {
185         struct inode *inode = page->mapping->host;
186         struct gfs2_inode *ip = GFS2_I(inode);
187         struct gfs2_sbd *sdp = GFS2_SB(inode);
188
189         if (PageChecked(page)) {
190                 ClearPageChecked(page);
191                 if (!page_has_buffers(page)) {
192                         create_empty_buffers(page, inode->i_sb->s_blocksize,
193                                              BIT(BH_Dirty)|BIT(BH_Uptodate));
194                 }
195                 gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize);
196         }
197         return gfs2_write_full_page(page, gfs2_get_block_noalloc, wbc);
198 }
199
200 /**
201  * gfs2_jdata_writepage - Write complete page
202  * @page: Page to write
203  * @wbc: The writeback control
204  *
205  * Returns: errno
206  *
207  */
208
209 static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc)
210 {
211         struct inode *inode = page->mapping->host;
212         struct gfs2_inode *ip = GFS2_I(inode);
213         struct gfs2_sbd *sdp = GFS2_SB(inode);
214         int ret;
215
216         if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl)))
217                 goto out;
218         if (PageChecked(page) || current->journal_info)
219                 goto out_ignore;
220         ret = __gfs2_jdata_writepage(page, wbc);
221         return ret;
222
223 out_ignore:
224         redirty_page_for_writepage(wbc, page);
225 out:
226         unlock_page(page);
227         return 0;
228 }
229
230 /**
231  * gfs2_writepages - Write a bunch of dirty pages back to disk
232  * @mapping: The mapping to write
233  * @wbc: Write-back control
234  *
235  * Used for both ordered and writeback modes.
236  */
237 static int gfs2_writepages(struct address_space *mapping,
238                            struct writeback_control *wbc)
239 {
240         struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
241         int ret = mpage_writepages(mapping, wbc, gfs2_get_block_noalloc);
242
243         /*
244          * Even if we didn't write any pages here, we might still be holding
245          * dirty pages in the ail. We forcibly flush the ail because we don't
246          * want balance_dirty_pages() to loop indefinitely trying to write out
247          * pages held in the ail that it can't find.
248          */
249         if (ret == 0)
250                 set_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags);
251
252         return ret;
253 }
254
255 /**
256  * gfs2_write_jdata_pagevec - Write back a pagevec's worth of pages
257  * @mapping: The mapping
258  * @wbc: The writeback control
259  * @pvec: The vector of pages
260  * @nr_pages: The number of pages to write
261  * @done_index: Page index
262  *
263  * Returns: non-zero if loop should terminate, zero otherwise
264  */
265
266 static int gfs2_write_jdata_pagevec(struct address_space *mapping,
267                                     struct writeback_control *wbc,
268                                     struct pagevec *pvec,
269                                     int nr_pages,
270                                     pgoff_t *done_index)
271 {
272         struct inode *inode = mapping->host;
273         struct gfs2_sbd *sdp = GFS2_SB(inode);
274         unsigned nrblocks = nr_pages * (PAGE_SIZE/inode->i_sb->s_blocksize);
275         int i;
276         int ret;
277
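        /*
         * Reserve journal space for every data buffer the pagevec might
         * contain: nr_pages pages of PAGE_SIZE/blocksize buffers each,
         * hence nrblocks above.  The transaction is started once per
         * pagevec rather than once per page.
         */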
278         ret = gfs2_trans_begin(sdp, nrblocks, nrblocks);
279         if (ret < 0)
280                 return ret;
281
282         for(i = 0; i < nr_pages; i++) {
283                 struct page *page = pvec->pages[i];
284
285                 *done_index = page->index;
286
287                 lock_page(page);
288
289                 if (unlikely(page->mapping != mapping)) {
290 continue_unlock:
291                         unlock_page(page);
292                         continue;
293                 }
294
295                 if (!PageDirty(page)) {
296                         /* someone wrote it for us */
297                         goto continue_unlock;
298                 }
299
300                 if (PageWriteback(page)) {
301                         if (wbc->sync_mode != WB_SYNC_NONE)
302                                 wait_on_page_writeback(page);
303                         else
304                                 goto continue_unlock;
305                 }
306
307                 BUG_ON(PageWriteback(page));
308                 if (!clear_page_dirty_for_io(page))
309                         goto continue_unlock;
310
311                 trace_wbc_writepage(wbc, inode_to_bdi(inode));
312
313                 ret = __gfs2_jdata_writepage(page, wbc);
314                 if (unlikely(ret)) {
315                         if (ret == AOP_WRITEPAGE_ACTIVATE) {
316                                 unlock_page(page);
317                                 ret = 0;
318                         } else {
319
320                                 /*
321                                  * done_index is set past this page,
322                                  * so media errors will not choke
323                                  * background writeout for the entire
324                                  * file. This has consequences for
325                                  * range_cyclic semantics (ie. it may
326                                  * not be suitable for data integrity
327                                  * writeout).
328                                  */
329                                 *done_index = page->index + 1;
330                                 ret = 1;
331                                 break;
332                         }
333                 }
334
335                 /*
336                  * We stop writing back only if we are not doing
337                  * integrity sync. In case of integrity sync we have to
338                  * keep going until we have written all the pages
339                  * we tagged for writeback prior to entering this loop.
340                  */
341                 if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) {
342                         ret = 1;
343                         break;
344                 }
345
346         }
347         gfs2_trans_end(sdp);
348         return ret;
349 }
350
351 /**
352  * gfs2_write_cache_jdata - Like write_cache_pages but different
353  * @mapping: The mapping to write
354  * @wbc: The writeback control
355  *
356  * The reason that we use our own function here is that we need to
357  * start transactions before we grab page locks. This allows us
358  * to get the ordering right.
359  */
360
361 static int gfs2_write_cache_jdata(struct address_space *mapping,
362                                   struct writeback_control *wbc)
363 {
364         int ret = 0;
365         int done = 0;
366         struct pagevec pvec;
367         int nr_pages;
368         pgoff_t uninitialized_var(writeback_index);
369         pgoff_t index;
370         pgoff_t end;
371         pgoff_t done_index;
372         int cycled;
373         int range_whole = 0;
374         int tag;
375
376         pagevec_init(&pvec);
377         if (wbc->range_cyclic) {
378                 writeback_index = mapping->writeback_index; /* prev offset */
379                 index = writeback_index;
380                 if (index == 0)
381                         cycled = 1;
382                 else
383                         cycled = 0;
384                 end = -1;
385         } else {
386                 index = wbc->range_start >> PAGE_SHIFT;
387                 end = wbc->range_end >> PAGE_SHIFT;
388                 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
389                         range_whole = 1;
390                 cycled = 1; /* ignore range_cyclic tests */
391         }
392         if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
393                 tag = PAGECACHE_TAG_TOWRITE;
394         else
395                 tag = PAGECACHE_TAG_DIRTY;
396
397 retry:
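        /*
         * As in write_cache_pages(), an integrity sync tags the dirty pages
         * with PAGECACHE_TAG_TOWRITE up front, so that pages dirtied while
         * we are working cannot keep this loop going forever.
         */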
398         if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
399                 tag_pages_for_writeback(mapping, index, end);
400         done_index = index;
401         while (!done && (index <= end)) {
402                 nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
403                                 tag);
404                 if (nr_pages == 0)
405                         break;
406
407                 ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, &done_index);
408                 if (ret)
409                         done = 1;
410                 if (ret > 0)
411                         ret = 0;
412                 pagevec_release(&pvec);
413                 cond_resched();
414         }
415
416         if (!cycled && !done) {
417                 /*
418                  * range_cyclic:
419                  * We hit the last page and there is more work to be done: wrap
420                  * back to the start of the file
421                  */
422                 cycled = 1;
423                 index = 0;
424                 end = writeback_index - 1;
425                 goto retry;
426         }
427
428         if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
429                 mapping->writeback_index = done_index;
430
431         return ret;
432 }
433
434
435 /**
436  * gfs2_jdata_writepages - Write a bunch of dirty pages back to disk
437  * @mapping: The mapping to write
438  * @wbc: The writeback control
439  * 
440  */
441
442 static int gfs2_jdata_writepages(struct address_space *mapping,
443                                  struct writeback_control *wbc)
444 {
445         struct gfs2_inode *ip = GFS2_I(mapping->host);
446         struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
447         int ret;
448
449         ret = gfs2_write_cache_jdata(mapping, wbc);
450         if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) {
451                 gfs2_log_flush(sdp, ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL |
452                                GFS2_LFC_JDATA_WPAGES);
453                 ret = gfs2_write_cache_jdata(mapping, wbc);
454         }
455         return ret;
456 }
457
458 /**
459  * stuffed_readpage - Fill in a Linux page with stuffed file data
460  * @ip: the inode
461  * @page: the page
462  *
463  * Returns: errno
464  */
465
466 static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
467 {
468         struct buffer_head *dibh;
469         u64 dsize = i_size_read(&ip->i_inode);
470         void *kaddr;
471         int error;
472
473         /*
474          * Due to the order of unstuffing files and ->fault(), we can be
475          * asked for a zero page in the case of a stuffed file being extended,
476          * so we need to supply one here. It doesn't happen often.
477          */
478         if (unlikely(page->index)) {
479                 zero_user(page, 0, PAGE_SIZE);
480                 SetPageUptodate(page);
481                 return 0;
482         }
483
484         error = gfs2_meta_inode_buffer(ip, &dibh);
485         if (error)
486                 return error;
487
488         kaddr = kmap_atomic(page);
489         if (dsize > gfs2_max_stuffed_size(ip))
490                 dsize = gfs2_max_stuffed_size(ip);
491         memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
492         memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
493         kunmap_atomic(kaddr);
494         flush_dcache_page(page);
495         brelse(dibh);
496         SetPageUptodate(page);
497
498         return 0;
499 }
500
501
502 /**
503  * __gfs2_readpage - readpage
504  * @file: The file to read a page for
505  * @page: The page to read
506  *
507  * This is the core of gfs2's readpage. It's used by the internal file
508  * reading code as in that case we already hold the glock. Also it's
509  * called by gfs2_readpage() once the required lock has been granted.
510  */
511
512 static int __gfs2_readpage(void *file, struct page *page)
513 {
514         struct gfs2_inode *ip = GFS2_I(page->mapping->host);
515         struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
516         int error;
517
518         if (gfs2_is_stuffed(ip)) {
519                 error = stuffed_readpage(ip, page);
520                 unlock_page(page);
521         } else {
522                 error = mpage_readpage(page, gfs2_block_map);
523         }
524
525         if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
526                 return -EIO;
527
528         return error;
529 }
530
531 /**
532  * gfs2_readpage - read a page of a file
533  * @file: The file to read
534  * @page: The page of the file
535  *
536  * This deals with the locking required. We have to unlock and
537  * relock the page in order to get the locking in the right
538  * order.
539  */
540
541 static int gfs2_readpage(struct file *file, struct page *page)
542 {
543         struct address_space *mapping = page->mapping;
544         struct gfs2_inode *ip = GFS2_I(mapping->host);
545         struct gfs2_holder gh;
546         int error;
547
548         unlock_page(page);
549         gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
550         error = gfs2_glock_nq(&gh);
551         if (unlikely(error))
552                 goto out;
553         error = AOP_TRUNCATED_PAGE;
554         lock_page(page);
555         if (page->mapping == mapping && !PageUptodate(page))
556                 error = __gfs2_readpage(file, page);
557         else
558                 unlock_page(page);
559         gfs2_glock_dq(&gh);
560 out:
561         gfs2_holder_uninit(&gh);
562         if (error && error != AOP_TRUNCATED_PAGE)
563                 lock_page(page);
564         return error;
565 }
566
567 /**
568  * gfs2_internal_read - read an internal file
569  * @ip: The gfs2 inode
570  * @buf: The buffer to fill
571  * @pos: The file position
572  * @size: The amount to read
573  *
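 * Returns: The amount of data read, or a negative error code on failure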
574  */
575
576 int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
577                        unsigned size)
578 {
579         struct address_space *mapping = ip->i_inode.i_mapping;
580         unsigned long index = *pos / PAGE_SIZE;
581         unsigned offset = *pos & (PAGE_SIZE - 1);
582         unsigned copied = 0;
583         unsigned amt;
584         struct page *page;
585         void *p;
586
587         do {
588                 amt = size - copied;
589                 if (offset + size > PAGE_SIZE)
590                         amt = PAGE_SIZE - offset;
591                 page = read_cache_page(mapping, index, __gfs2_readpage, NULL);
592                 if (IS_ERR(page))
593                         return PTR_ERR(page);
594                 p = kmap_atomic(page);
595                 memcpy(buf + copied, p + offset, amt);
596                 kunmap_atomic(p);
597                 put_page(page);
598                 copied += amt;
599                 index++;
600                 offset = 0;
601         } while(copied < size);
602         (*pos) += size;
603         return size;
604 }
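
/*
 * A minimal usage sketch (hypothetical caller, much like the rindex
 * readers): copy a fixed-size structure out of an internal inode and let
 * *pos advance by the amount read.
 *
 *	struct gfs2_rindex buf;
 *	loff_t pos = 0;
 *	int error;
 *
 *	error = gfs2_internal_read(ip, (char *)&buf, &pos, sizeof(buf));
 *	if (error < 0)
 *		return error;
 */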
605
606 /**
607  * gfs2_readpages - Read a bunch of pages at once
608  * @file: The file to read from
609  * @mapping: Address space info
610  * @pages: List of pages to read
611  * @nr_pages: Number of pages to read
612  *
613  * Some notes:
614  * 1. This is only for readahead, so we can simply ignore anything
615  *    which is slightly inconvenient (such as locking conflicts between
616  *    the page lock and the glock) and return having done no I/O. It's
617  *    obviously not something we'd want to do on too regular a basis.
618  *    Any I/O we ignore at this time will be done via readpage later.
619  * 2. We don't handle stuffed files here; we let readpage do the honours.
620  * 3. mpage_readpages() does most of the heavy lifting in the common case.
621  * 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places.
622  */
623
624 static int gfs2_readpages(struct file *file, struct address_space *mapping,
625                           struct list_head *pages, unsigned nr_pages)
626 {
627         struct inode *inode = mapping->host;
628         struct gfs2_inode *ip = GFS2_I(inode);
629         struct gfs2_sbd *sdp = GFS2_SB(inode);
630         struct gfs2_holder gh;
631         int ret;
632
633         gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
634         ret = gfs2_glock_nq(&gh);
635         if (unlikely(ret))
636                 goto out_uninit;
637         if (!gfs2_is_stuffed(ip))
638                 ret = mpage_readpages(mapping, pages, nr_pages, gfs2_block_map);
639         gfs2_glock_dq(&gh);
640 out_uninit:
641         gfs2_holder_uninit(&gh);
642         if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
643                 ret = -EIO;
644         return ret;
645 }
646
647 /**
648  * gfs2_write_begin - Begin to write to a file
649  * @file: The file to write to
650  * @mapping: The mapping in which to write
651  * @pos: The file offset at which to start writing
652  * @len: Length of the write
653  * @flags: Various flags
654  * @pagep: Pointer to return the page
655  * @fsdata: Pointer to return fs data (unused by GFS2)
656  *
657  * Returns: errno
658  */
659
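/*
 * Note on lock ordering: gfs2_write_begin() takes the inode glock first,
 * then (when an allocation may be needed) the quota and resource group
 * reservations, then starts the transaction, and only then locks the page.
 * gfs2_write_end() undoes all of this in the reverse order.
 */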
660 static int gfs2_write_begin(struct file *file, struct address_space *mapping,
661                             loff_t pos, unsigned len, unsigned flags,
662                             struct page **pagep, void **fsdata)
663 {
664         struct gfs2_inode *ip = GFS2_I(mapping->host);
665         struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
666         struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
667         unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
668         unsigned requested = 0;
669         int alloc_required;
670         int error = 0;
671         pgoff_t index = pos >> PAGE_SHIFT;
672         unsigned from = pos & (PAGE_SIZE - 1);
673         struct page *page;
674
675         gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
676         error = gfs2_glock_nq(&ip->i_gh);
677         if (unlikely(error))
678                 goto out_uninit;
679         if (&ip->i_inode == sdp->sd_rindex) {
680                 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
681                                            GL_NOCACHE, &m_ip->i_gh);
682                 if (unlikely(error)) {
683                         gfs2_glock_dq(&ip->i_gh);
684                         goto out_uninit;
685                 }
686         }
687
688         alloc_required = gfs2_write_alloc_required(ip, pos, len);
689
690         if (alloc_required || gfs2_is_jdata(ip))
691                 gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
692
693         if (alloc_required) {
694                 struct gfs2_alloc_parms ap = { .aflags = 0, };
695                 requested = data_blocks + ind_blocks;
696                 ap.target = requested;
697                 error = gfs2_quota_lock_check(ip, &ap);
698                 if (error)
699                         goto out_unlock;
700
701                 error = gfs2_inplace_reserve(ip, &ap);
702                 if (error)
703                         goto out_qunlock;
704         }
705
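        /*
         * Size the journal reservation: the dinode itself, any indirect
         * blocks, the data blocks when journaling data, the statfs and
         * quota changes whenever data or indirect blocks are involved, and
         * the resource group bitmaps covering a new allocation.  For
         * example, a small jdata write needing no allocation typically
         * reserves RES_DINODE + 1 + RES_STATFS + RES_QUOTA blocks.
         */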
706         rblocks = RES_DINODE + ind_blocks;
707         if (gfs2_is_jdata(ip))
708                 rblocks += data_blocks ? data_blocks : 1;
709         if (ind_blocks || data_blocks)
710                 rblocks += RES_STATFS + RES_QUOTA;
711         if (&ip->i_inode == sdp->sd_rindex)
712                 rblocks += 2 * RES_STATFS;
713         if (alloc_required)
714                 rblocks += gfs2_rg_blocks(ip, requested);
715
716         error = gfs2_trans_begin(sdp, rblocks,
717                                  PAGE_SIZE/sdp->sd_sb.sb_bsize);
718         if (error)
719                 goto out_trans_fail;
720
721         error = -ENOMEM;
722         flags |= AOP_FLAG_NOFS;
723         page = grab_cache_page_write_begin(mapping, index, flags);
724         *pagep = page;
725         if (unlikely(!page))
726                 goto out_endtrans;
727
728         if (gfs2_is_stuffed(ip)) {
729                 error = 0;
730                 if (pos + len > gfs2_max_stuffed_size(ip)) {
731                         error = gfs2_unstuff_dinode(ip, page);
732                         if (error == 0)
733                                 goto prepare_write;
734                 } else if (!PageUptodate(page)) {
735                         error = stuffed_readpage(ip, page);
736                 }
737                 goto out;
738         }
739
740 prepare_write:
741         error = __block_write_begin(page, from, len, gfs2_block_map);
742 out:
743         if (error == 0)
744                 return 0;
745
746         unlock_page(page);
747         put_page(page);
748
749         gfs2_trans_end(sdp);
750         if (pos + len > ip->i_inode.i_size)
751                 gfs2_trim_blocks(&ip->i_inode);
752         goto out_trans_fail;
753
754 out_endtrans:
755         gfs2_trans_end(sdp);
756 out_trans_fail:
757         if (alloc_required) {
758                 gfs2_inplace_release(ip);
759 out_qunlock:
760                 gfs2_quota_unlock(ip);
761         }
762 out_unlock:
763         if (&ip->i_inode == sdp->sd_rindex) {
764                 gfs2_glock_dq(&m_ip->i_gh);
765                 gfs2_holder_uninit(&m_ip->i_gh);
766         }
767         gfs2_glock_dq(&ip->i_gh);
768 out_uninit:
769         gfs2_holder_uninit(&ip->i_gh);
770         return error;
771 }
772
773 /**
774  * adjust_fs_space - Adjusts the free space available due to gfs2_grow
775  * @inode: the rindex inode
776  */
777 static void adjust_fs_space(struct inode *inode)
778 {
779         struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
780         struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
781         struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
782         struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
783         struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
784         struct buffer_head *m_bh, *l_bh;
785         u64 fs_total, new_free;
786
787         /* Total up the file system space, according to the latest rindex. */
788         fs_total = gfs2_ri_total(sdp);
789         if (gfs2_meta_inode_buffer(m_ip, &m_bh) != 0)
790                 return;
791
792         spin_lock(&sdp->sd_statfs_spin);
793         gfs2_statfs_change_in(m_sc, m_bh->b_data +
794                               sizeof(struct gfs2_dinode));
795         if (fs_total > (m_sc->sc_total + l_sc->sc_total))
796                 new_free = fs_total - (m_sc->sc_total + l_sc->sc_total);
797         else
798                 new_free = 0;
799         spin_unlock(&sdp->sd_statfs_spin);
800         fs_warn(sdp, "File system extended by %llu blocks.\n",
801                 (unsigned long long)new_free);
802         gfs2_statfs_change(sdp, new_free, new_free, 0);
803
804         if (gfs2_meta_inode_buffer(l_ip, &l_bh) != 0)
805                 goto out;
806         update_statfs(sdp, m_bh, l_bh);
807         brelse(l_bh);
808 out:
809         brelse(m_bh);
810 }
811
812 /**
813  * gfs2_stuffed_write_end - Write end for stuffed files
814  * @inode: The inode
815  * @dibh: The buffer_head containing the on-disk inode
816  * @pos: The file position
817  * @len: The length of the write
818  * @copied: How much was actually copied by the VFS
819  * @page: The page
820  *
821  * This copies the data from the page into the inode block after
822  * the inode data structure itself.
823  *
824  * Returns: errno
825  */
826 static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
827                                   loff_t pos, unsigned len, unsigned copied,
828                                   struct page *page)
829 {
830         struct gfs2_inode *ip = GFS2_I(inode);
831         struct gfs2_sbd *sdp = GFS2_SB(inode);
832         struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
833         u64 to = pos + copied;
834         void *kaddr;
835         unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode);
836
837         BUG_ON(pos + len > gfs2_max_stuffed_size(ip));
838
839         kaddr = kmap_atomic(page);
840         memcpy(buf + pos, kaddr + pos, copied);
841         flush_dcache_page(page);
842         kunmap_atomic(kaddr);
843
844         WARN_ON(!PageUptodate(page));
845         unlock_page(page);
846         put_page(page);
847
848         if (copied) {
849                 if (inode->i_size < to)
850                         i_size_write(inode, to);
851                 mark_inode_dirty(inode);
852         }
853
854         if (inode == sdp->sd_rindex) {
855                 adjust_fs_space(inode);
856                 sdp->sd_rindex_uptodate = 0;
857         }
858
859         brelse(dibh);
860         gfs2_trans_end(sdp);
861         if (inode == sdp->sd_rindex) {
862                 gfs2_glock_dq(&m_ip->i_gh);
863                 gfs2_holder_uninit(&m_ip->i_gh);
864         }
865         gfs2_glock_dq(&ip->i_gh);
866         gfs2_holder_uninit(&ip->i_gh);
867         return copied;
868 }
869
870 /**
871  * gfs2_write_end
872  * @file: The file to write to
873  * @mapping: The address space to write to
874  * @pos: The file position
875  * @len: The length of the data
876  * @copied: How much was actually copied by the VFS
877  * @page: The page that has been written
878  * @fsdata: The fsdata (unused in GFS2)
879  *
880  * The main write_end function for GFS2. We have a separate one for
881  * stuffed files as they are slightly different, otherwise we just
882  * put our locking around the VFS provided functions.
883  *
884  * Returns: errno
885  */
886
887 static int gfs2_write_end(struct file *file, struct address_space *mapping,
888                           loff_t pos, unsigned len, unsigned copied,
889                           struct page *page, void *fsdata)
890 {
891         struct inode *inode = page->mapping->host;
892         struct gfs2_inode *ip = GFS2_I(inode);
893         struct gfs2_sbd *sdp = GFS2_SB(inode);
894         struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
895         struct buffer_head *dibh;
896         int ret;
897         struct gfs2_trans *tr = current->journal_info;
898         BUG_ON(!tr);
899
900         BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL);
901
902         ret = gfs2_meta_inode_buffer(ip, &dibh);
903         if (unlikely(ret)) {
904                 unlock_page(page);
905                 put_page(page);
906                 goto failed;
907         }
908
909         if (gfs2_is_stuffed(ip))
910                 return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page);
911
912         if (!gfs2_is_writeback(ip))
913                 gfs2_page_add_databufs(ip, page, pos & ~PAGE_MASK, len);
914
915         ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
916         if (tr->tr_num_buf_new)
917                 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
918         else
919                 gfs2_trans_add_meta(ip->i_gl, dibh);
920
921
922         if (inode == sdp->sd_rindex) {
923                 adjust_fs_space(inode);
924                 sdp->sd_rindex_uptodate = 0;
925         }
926
927         brelse(dibh);
928 failed:
929         gfs2_trans_end(sdp);
930         gfs2_inplace_release(ip);
931         if (ip->i_qadata && ip->i_qadata->qa_qd_num)
932                 gfs2_quota_unlock(ip);
933         if (inode == sdp->sd_rindex) {
934                 gfs2_glock_dq(&m_ip->i_gh);
935                 gfs2_holder_uninit(&m_ip->i_gh);
936         }
937         gfs2_glock_dq(&ip->i_gh);
938         gfs2_holder_uninit(&ip->i_gh);
939         return ret;
940 }
941
942 /**
943  * jdata_set_page_dirty - Page dirtying function
944  * @page: The page to dirty
945  *
946  * Returns: 1 if it dirtied the page, or 0 otherwise
947  */
948  
949 static int jdata_set_page_dirty(struct page *page)
950 {
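        /*
         * PageChecked is what later tells __gfs2_jdata_writepage() that
         * this page must be written back under a transaction.
         */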
951         SetPageChecked(page);
952         return __set_page_dirty_buffers(page);
953 }
954
955 /**
956  * gfs2_bmap - Block map function
957  * @mapping: Address space info
958  * @lblock: The block to map
959  *
960  * Returns: The disk address for the block or 0 on hole or error
961  */
962
963 static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock)
964 {
965         struct gfs2_inode *ip = GFS2_I(mapping->host);
966         struct gfs2_holder i_gh;
967         sector_t dblock = 0;
968         int error;
969
970         error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
971         if (error)
972                 return 0;
973
974         if (!gfs2_is_stuffed(ip))
975                 dblock = generic_block_bmap(mapping, lblock, gfs2_block_map);
976
977         gfs2_glock_dq_uninit(&i_gh);
978
979         return dblock;
980 }
981
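/*
 * gfs2_discard - Forget a buffer's journal state
 * @sdp: The superblock
 * @bh: The buffer being invalidated
 *
 * Clears the buffer's dirty and mapped state and, if it has journal
 * bookkeeping attached, either unlinks it from its journal list or removes
 * it via gfs2_remove_from_journal().  Called from gfs2_invalidatepage()
 * for each buffer lying wholly within the invalidated range.
 */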
982 static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh)
983 {
984         struct gfs2_bufdata *bd;
985
986         lock_buffer(bh);
987         gfs2_log_lock(sdp);
988         clear_buffer_dirty(bh);
989         bd = bh->b_private;
990         if (bd) {
991                 if (!list_empty(&bd->bd_list) && !buffer_pinned(bh))
992                         list_del_init(&bd->bd_list);
993                 else
994                         gfs2_remove_from_journal(bh, REMOVE_JDATA);
995         }
996         bh->b_bdev = NULL;
997         clear_buffer_mapped(bh);
998         clear_buffer_req(bh);
999         clear_buffer_new(bh);
1000         gfs2_log_unlock(sdp);
1001         unlock_buffer(bh);
1002 }
1003
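/*
 * gfs2_invalidatepage - Invalidate part or all of a page
 * @page: The (locked) page
 * @offset: Start of the invalidated byte range
 * @length: Length of the invalidated byte range
 *
 * Buffers lying wholly inside [offset, offset + length) are discarded.
 * Only a full-page invalidation clears PageChecked and attempts to release
 * the page's buffers.
 */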
1004 static void gfs2_invalidatepage(struct page *page, unsigned int offset,
1005                                 unsigned int length)
1006 {
1007         struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
1008         unsigned int stop = offset + length;
1009         int partial_page = (offset || length < PAGE_SIZE);
1010         struct buffer_head *bh, *head;
1011         unsigned long pos = 0;
1012
1013         BUG_ON(!PageLocked(page));
1014         if (!partial_page)
1015                 ClearPageChecked(page);
1016         if (!page_has_buffers(page))
1017                 goto out;
1018
1019         bh = head = page_buffers(page);
1020         do {
1021                 if (pos + bh->b_size > stop)
1022                         return;
1023
1024                 if (offset <= pos)
1025                         gfs2_discard(sdp, bh);
1026                 pos += bh->b_size;
1027                 bh = bh->b_this_page;
1028         } while (bh != head);
1029 out:
1030         if (!partial_page)
1031                 try_to_release_page(page, 0);
1032 }
1033
1034 /**
1035  * gfs2_ok_for_dio - check that dio is valid on this file
1036  * @ip: The inode
1037  * @offset: The offset at which we are reading or writing
1038  *
1039  * Returns: 0 (to ignore the i/o request and thus fall back to buffered i/o)
1040  *          1 (to accept the i/o request)
1041  */
1042 static int gfs2_ok_for_dio(struct gfs2_inode *ip, loff_t offset)
1043 {
1044         /*
1045          * Should we return an error here? I can't see that O_DIRECT for
1046          * a stuffed file makes any sense. For now we'll silently fall
1047          * back to buffered I/O
1048          */
1049         if (gfs2_is_stuffed(ip))
1050                 return 0;
1051
1052         if (offset >= i_size_read(&ip->i_inode))
1053                 return 0;
1054         return 1;
1055 }
1056
1057
1058
1059 static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
1060 {
1061         struct file *file = iocb->ki_filp;
1062         struct inode *inode = file->f_mapping->host;
1063         struct address_space *mapping = inode->i_mapping;
1064         struct gfs2_inode *ip = GFS2_I(inode);
1065         loff_t offset = iocb->ki_pos;
1066         struct gfs2_holder gh;
1067         int rv;
1068
1069         /*
1070          * Deferred lock, even if it's a write, since we do no allocation
1071          * on this path. All we need to change is atime, and this lock mode
1072          * ensures that other nodes have flushed their buffered read caches
1073          * (i.e. their page cache entries for this inode). We do not,
1074          * unfortunately, have the option of only flushing a range like
1075          * the VFS does.
1076          */
1077         gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh);
1078         rv = gfs2_glock_nq(&gh);
1079         if (rv)
1080                 goto out_uninit;
1081         rv = gfs2_ok_for_dio(ip, offset);
1082         if (rv != 1)
1083                 goto out; /* dio not valid, fall back to buffered i/o */
1084
1085         /*
1086          * Now since we are holding a deferred (CW) lock at this point, you
1087          * might be wondering why this is ever needed. There is a case however
1088          * where we've granted a deferred local lock against a cached exclusive
1089          * glock. That is ok provided all granted local locks are deferred, but
1090          * it also means that it is possible to encounter pages which are
1091          * cached and possibly also mapped. So here we check for that and sort
1092          * them out ahead of the dio. The glock state machine will take care of
1093          * everything else.
1094          *
1095          * If in fact the cached glock state (gl->gl_state) is deferred (CW) in
1096          * the first place, mapping->nrpages will always be zero.
1097          */
1098         if (mapping->nrpages) {
1099                 loff_t lstart = offset & ~(PAGE_SIZE - 1);
1100                 loff_t len = iov_iter_count(iter);
1101                 loff_t end = PAGE_ALIGN(offset + len) - 1;
1102
1103                 rv = 0;
1104                 if (len == 0)
1105                         goto out;
1106                 if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
1107                         unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len);
1108                 rv = filemap_write_and_wait_range(mapping, lstart, end);
1109                 if (rv)
1110                         goto out;
1111                 if (iov_iter_rw(iter) == WRITE)
1112                         truncate_inode_pages_range(mapping, lstart, end);
1113         }
1114
1115         rv = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
1116                                   gfs2_get_block_direct, NULL, NULL, 0);
1117 out:
1118         gfs2_glock_dq(&gh);
1119 out_uninit:
1120         gfs2_holder_uninit(&gh);
1121         return rv;
1122 }
1123
1124 /**
1125  * gfs2_releasepage - free the metadata associated with a page
1126  * @page: the page that's being released
1127  * @gfp_mask: passed from Linux VFS, ignored by us
1128  *
1129  * Call try_to_free_buffers() if the buffers in this page can be
1130  * released.
1131  *
1132  * Returns: 0
1133  */
1134
1135 int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
1136 {
1137         struct address_space *mapping = page->mapping;
1138         struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
1139         struct buffer_head *bh, *head;
1140         struct gfs2_bufdata *bd;
1141
1142         if (!page_has_buffers(page))
1143                 return 0;
1144
1145         /*
1146          * From xfs_vm_releasepage: mm accommodates an old ext3 case where
1147          * clean pages might not have had the dirty bit cleared.  Thus, it can
1148          * send actual dirty pages to ->releasepage() via shrink_active_list().
1149          *
1150          * As a workaround, we skip pages that contain dirty buffers below.
1151          * Once ->releasepage isn't called on dirty pages anymore, we can warn
1152          * on dirty buffers like we used to here again.
1153          */
1154
1155         gfs2_log_lock(sdp);
1156         spin_lock(&sdp->sd_ail_lock);
1157         head = bh = page_buffers(page);
1158         do {
1159                 if (atomic_read(&bh->b_count))
1160                         goto cannot_release;
1161                 bd = bh->b_private;
1162                 if (bd && bd->bd_tr)
1163                         goto cannot_release;
1164                 if (buffer_dirty(bh) || WARN_ON(buffer_pinned(bh)))
1165                         goto cannot_release;
1166                 bh = bh->b_this_page;
1167         } while(bh != head);
1168         spin_unlock(&sdp->sd_ail_lock);
1169
1170         head = bh = page_buffers(page);
1171         do {
1172                 bd = bh->b_private;
1173                 if (bd) {
1174                         gfs2_assert_warn(sdp, bd->bd_bh == bh);
1175                         if (!list_empty(&bd->bd_list))
1176                                 list_del_init(&bd->bd_list);
1177                         bd->bd_bh = NULL;
1178                         bh->b_private = NULL;
1179                         kmem_cache_free(gfs2_bufdata_cachep, bd);
1180                 }
1181
1182                 bh = bh->b_this_page;
1183         } while (bh != head);
1184         gfs2_log_unlock(sdp);
1185
1186         return try_to_free_buffers(page);
1187
1188 cannot_release:
1189         spin_unlock(&sdp->sd_ail_lock);
1190         gfs2_log_unlock(sdp);
1191         return 0;
1192 }
1193
1194 static const struct address_space_operations gfs2_writeback_aops = {
1195         .writepage = gfs2_writepage,
1196         .writepages = gfs2_writepages,
1197         .readpage = gfs2_readpage,
1198         .readpages = gfs2_readpages,
1199         .write_begin = gfs2_write_begin,
1200         .write_end = gfs2_write_end,
1201         .bmap = gfs2_bmap,
1202         .invalidatepage = gfs2_invalidatepage,
1203         .releasepage = gfs2_releasepage,
1204         .direct_IO = gfs2_direct_IO,
1205         .migratepage = buffer_migrate_page,
1206         .is_partially_uptodate = block_is_partially_uptodate,
1207         .error_remove_page = generic_error_remove_page,
1208 };
1209
1210 static const struct address_space_operations gfs2_ordered_aops = {
1211         .writepage = gfs2_writepage,
1212         .writepages = gfs2_writepages,
1213         .readpage = gfs2_readpage,
1214         .readpages = gfs2_readpages,
1215         .write_begin = gfs2_write_begin,
1216         .write_end = gfs2_write_end,
1217         .set_page_dirty = __set_page_dirty_buffers,
1218         .bmap = gfs2_bmap,
1219         .invalidatepage = gfs2_invalidatepage,
1220         .releasepage = gfs2_releasepage,
1221         .direct_IO = gfs2_direct_IO,
1222         .migratepage = buffer_migrate_page,
1223         .is_partially_uptodate = block_is_partially_uptodate,
1224         .error_remove_page = generic_error_remove_page,
1225 };
1226
1227 static const struct address_space_operations gfs2_jdata_aops = {
1228         .writepage = gfs2_jdata_writepage,
1229         .writepages = gfs2_jdata_writepages,
1230         .readpage = gfs2_readpage,
1231         .readpages = gfs2_readpages,
1232         .write_begin = gfs2_write_begin,
1233         .write_end = gfs2_write_end,
1234         .set_page_dirty = jdata_set_page_dirty,
1235         .bmap = gfs2_bmap,
1236         .invalidatepage = gfs2_invalidatepage,
1237         .releasepage = gfs2_releasepage,
1238         .is_partially_uptodate = block_is_partially_uptodate,
1239         .error_remove_page = generic_error_remove_page,
1240 };
1241
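/**
 * gfs2_set_aops - Select the address space operations for an inode
 * @inode: The inode
 *
 * Picks one of the three sets of operations above according to the inode's
 * data journaling mode: writeback, ordered or jdata.
 */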
1242 void gfs2_set_aops(struct inode *inode)
1243 {
1244         struct gfs2_inode *ip = GFS2_I(inode);
1245
1246         if (gfs2_is_writeback(ip))
1247                 inode->i_mapping->a_ops = &gfs2_writeback_aops;
1248         else if (gfs2_is_ordered(ip))
1249                 inode->i_mapping->a_ops = &gfs2_ordered_aops;
1250         else if (gfs2_is_jdata(ip))
1251                 inode->i_mapping->a_ops = &gfs2_jdata_aops;
1252         else
1253                 BUG();
1254 }
1255