fs/gfs2/bmap.c

   1 /*
   2  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
   3  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
   4  *
   5  * This copyrighted material is made available to anyone wishing to use,
   6  * modify, copy, or redistribute it subject to the terms and conditions
   7  * of the GNU General Public License version 2.
   8  */
   9
  10 #include <linux/spinlock.h>
  11 #include <linux/completion.h>
  12 #include <linux/buffer_head.h>
  13 #include <linux/blkdev.h>
  14 #include <linux/gfs2_ondisk.h>
  15 #include <linux/crc32.h>
  16 #include <linux/iomap.h>
  17 #include <linux/ktime.h>
  18
  19 #include "gfs2.h"
  20 #include "incore.h"
  21 #include "bmap.h"
  22 #include "glock.h"
  23 #include "inode.h"
  24 #include "meta_io.h"
  25 #include "quota.h"
  26 #include "rgrp.h"
  27 #include "log.h"
  28 #include "super.h"
  29 #include "trans.h"
  30 #include "dir.h"
  31 #include "util.h"
  32 #include "aops.h"
  33 #include "trace_gfs2.h"
  34
  35 /* This doesn't need to be that large as max 64 bit pointers in a 4k
  36  * block is 512, so __u16 is fine for that. It saves stack space to
  37  * keep it small.
      *
      * A metapath describes one path through an inode's metadata tree:
      * mp_bh[h] is the buffer at height h (0 = dinode) and mp_list[h] is
      * the index of the pointer to follow within that buffer.
  38  */
  39 struct metapath {
  40         struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
  41         __u16 mp_list[GFS2_MAX_META_HEIGHT];
  42         int mp_fheight; /* find_metapath height */
  43         int mp_aheight; /* actual height (lookup height) */
  44 };
  45
     /* Forward declaration; the definition is not part of this excerpt. */
  46 static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length);
  47
  48 /**
  49  * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
  50  * @ip: the inode
  51  * @dibh: the dinode buffer
  52  * @block: the block number that was allocated
  53  * @page: The (optional) page. This is looked up if @page is NULL
  54  *
  55  * Returns: errno
  56  */
  57
  58 static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
  59                                u64 block, struct page *page)
  60 {
  61         struct inode *inode = &ip->i_inode;
  62         struct buffer_head *bh;
  63         int release = 0;
  64
  65         if (!page || page->index) {
  66                 page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
  67                 if (!page)
  68                         return -ENOMEM;
  69                 release = 1;
  70         }
  71
  72         if (!PageUptodate(page)) {
  73                 void *kaddr = kmap(page);
  74                 u64 dsize = i_size_read(inode);
  75
  76                 if (dsize > gfs2_max_stuffed_size(ip))
  77                         dsize = gfs2_max_stuffed_size(ip);
  78
  79                 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
  80                 memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
  81                 kunmap(page);
  82
  83                 SetPageUptodate(page);
  84         }
  85
  86         if (!page_has_buffers(page))
  87                 create_empty_buffers(page, BIT(inode->i_blkbits),
  88                                      BIT(BH_Uptodate));
  89
  90         bh = page_buffers(page);
  91
  92         if (!buffer_mapped(bh))
  93                 map_bh(bh, inode->i_sb, block);
  94
  95         set_buffer_uptodate(bh);
  96         if (gfs2_is_jdata(ip))
  97                 gfs2_trans_add_data(ip->i_gl, bh);
  98         else {
  99                 mark_buffer_dirty(bh);
 100                 gfs2_ordered_add_inode(ip);
 101         }
 102
 103         if (release) {
 104                 unlock_page(page);
 105                 put_page(page);
 106         }
 107
 108         return 0;
 109 }
 110
 111 /**
 112  * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
 113  * @ip: The GFS2 inode to unstuff
 114  * @page: The (optional) page. This is looked up if the @page is NULL
 115  *
 116  * This routine unstuffs a dinode and returns it to a "normal" state such
 117  * that the height can be grown in the traditional way.
 118  *
 119  * Returns: errno
 120  */
 121
 122 int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
 123 {
 124         struct buffer_head *bh, *dibh;
 125         struct gfs2_dinode *di;
 126         u64 block = 0;
 127         int isdir = gfs2_is_dir(ip);
 128         int error;
 129
 130         down_write(&ip->i_rw_mutex);
 131
 132         error = gfs2_meta_inode_buffer(ip, &dibh);
 133         if (error)
 134                 goto out;
 135
 136         if (i_size_read(&ip->i_inode)) {
 137                 /* Get a free block, fill it with the stuffed data,
 138                    and write it out to disk */
 139
 140                 unsigned int n = 1;
 141                 error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
 142                 if (error)
 143                         goto out_brelse;
 144                 if (isdir) {
 145                         gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
 146                         error = gfs2_dir_get_new_buffer(ip, block, &bh);
 147                         if (error)
 148                                 goto out_brelse;
 149                         gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
 150                                               dibh, sizeof(struct gfs2_dinode));
 151                         brelse(bh);
 152                 } else {
 153                         error = gfs2_unstuffer_page(ip, dibh, block, page);
 154                         if (error)
 155                                 goto out_brelse;
 156                 }
 157         }
 158
 159         /*  Set up the pointer to the new block  */
 160
 161         gfs2_trans_add_meta(ip->i_gl, dibh);
 162         di = (struct gfs2_dinode *)dibh->b_data;
 163         gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
 164
 165         if (i_size_read(&ip->i_inode)) {
 166                 *(__be64 *)(di + 1) = cpu_to_be64(block);
 167                 gfs2_add_inode_blocks(&ip->i_inode, 1);
 168                 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
 169         }
 170
 171         ip->i_height = 1;
 172         di->di_height = cpu_to_be16(1);
 173
 174 out_brelse:
 175         brelse(dibh);
 176 out:
 177         up_write(&ip->i_rw_mutex);
 178         return error;
 179 }
 180
 181
 182 /**
 183  * find_metapath - Find path through the metadata tree
 184  * @sdp: The superblock
 185  * @block: The disk block to look up
 186  * @mp: The metapath to return the result in
 187  * @height: The pre-calculated height of the metadata tree
 188  *
 189  *   This routine returns a struct metapath structure that defines a path
 190  *   through the metadata of inode "ip" to get to block "block".
 191  *
 192  *   Example:
 193  *   Given: a height 3 file, byte offset 101342453 (i.e. @block 24741), a
 194  *   blocksize of 4096, and (for illustration) 512 pointers per indirect block.
 195  *
 196  *   find_metapath() would return a struct metapath structure set to:
 197  *   mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
 198  *
 199  *   That means that in order to get to the block containing the byte at
 200  *   offset 101342453, we would load the indirect block pointed to by pointer
 201  *   0 in the dinode.  We would then load the indirect block pointed to by
 202  *   pointer 48 in that indirect block.  We would then load the data block
 203  *   pointed to by pointer 165 in that indirect block.
 204  *
 205  *             ----------------------------------------
 206  *             | Dinode |                             |
 207  *             |        |                            4|
 208  *             |        |0 1 2 3 4 5                 9|
 209  *             |        |                            6|
 210  *             ----------------------------------------
 211  *                       |
 212  *                       |
 213  *                       V
 214  *             ----------------------------------------
 215  *             | Indirect Block                       |
 216  *             |                                     5|
 217  *             |            4 4 4 4 4 5 5            1|
 218  *             |0           5 6 7 8 9 0 1            2|
 219  *             ----------------------------------------
 220  *                                |
 221  *                                |
 222  *                                V
 223  *             ----------------------------------------
 224  *             | Indirect Block                       |
 225  *             |                         1 1 1 1 1   5|
 226  *             |                         6 6 6 6 6   1|
 227  *             |0                        3 4 5 6 7   2|
 228  *             ----------------------------------------
 229  *                                           |
 230  *                                           |
 231  *                                           V
 232  *             ----------------------------------------
 233  *             | Data block containing offset         |
 234  *             |            101342453                 |
 235  *             |                                      |
 236  *             |                                      |
 237  *             ----------------------------------------
 238  *
 239  */
 240
 241 static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
 242                           struct metapath *mp, unsigned int height)
 243 {
 244         unsigned int i;
 245
 246         mp->mp_fheight = height;
 247         for (i = height; i--;)
 248                 mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
 249 }
 250
 251 static inline unsigned int metapath_branch_start(const struct metapath *mp)
 252 {
 253         if (mp->mp_list[0] == 0)
 254                 return 2;
 255         return 1;
 256 }
 257
 258 /**
 259  * metaptr1 - Return the first possible metadata pointer in a metapath buffer
 260  * @height: The metadata height (0 = dinode)
 261  * @mp: The metapath
 262  */
 263 static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
 264 {
 265         struct buffer_head *bh = mp->mp_bh[height];
 266         if (height == 0)
 267                 return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
 268         return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
 269 }
 270
 271 /**
 272  * metapointer - Return pointer to start of metadata in a buffer
 273  * @height: The metadata height (0 = dinode)
 274  * @mp: The metapath
 275  *
 276  * Return a pointer to the block number of the next height of the metadata
 277  * tree given a buffer containing the pointer to the current height of the
 278  * metadata tree.
 279  */
 280
 281 static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
 282 {
 283         __be64 *p = metaptr1(height, mp);
 284         return p + mp->mp_list[height];
 285 }
 286
 287 static inline const __be64 *metaend(unsigned int height, const struct metapath *mp)
 288 {
 289         const struct buffer_head *bh = mp->mp_bh[height];
 290         return (const __be64 *)(bh->b_data + bh->b_size);
 291 }
 292
 293 static void clone_metapath(struct metapath *clone, struct metapath *mp)
 294 {
 295         unsigned int hgt;
 296
 297         *clone = *mp;
 298         for (hgt = 0; hgt < mp->mp_aheight; hgt++)
 299                 get_bh(clone->mp_bh[hgt]);
 300 }
 301
 302 static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
 303 {
 304         const __be64 *t;
 305
 306         for (t = start; t < end; t++) {
 307                 struct buffer_head *rabh;
 308
 309                 if (!*t)
 310                         continue;
 311
 312                 rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
 313                 if (trylock_buffer(rabh)) {
 314                         if (!buffer_uptodate(rabh)) {
 315                                 rabh->b_end_io = end_buffer_read_sync;
 316                                 submit_bh(REQ_OP_READ,
 317                                           REQ_RAHEAD | REQ_META | REQ_PRIO,
 318                                           rabh);
 319                                 continue;
 320                         }
 321                         unlock_buffer(rabh);
 322                 }
 323                 brelse(rabh);
 324         }
 325 }
 326
 327 static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp,
 328                              unsigned int x, unsigned int h)
 329 {
 330         for (; x < h; x++) {
 331                 __be64 *ptr = metapointer(x, mp);
 332                 u64 dblock = be64_to_cpu(*ptr);
 333                 int ret;
 334
 335                 if (!dblock)
 336                         break;
 337                 ret = gfs2_meta_indirect_buffer(ip, x + 1, dblock, &mp->mp_bh[x + 1]);
 338                 if (ret)
 339                         return ret;
 340         }
 341         mp->mp_aheight = x + 1;
 342         return 0;
 343 }
 344
 345 /**
 346  * lookup_metapath - Walk the metadata tree to a specific point
 347  * @ip: The inode
 348  * @mp: The metapath
 349  *
 350  * Assumes that the inode's buffer has already been looked up and
 351  * hooked onto mp->mp_bh[0] and that the metapath has been initialised
 352  * by find_metapath().
 353  *
 354  * If this function encounters part of the tree which has not been
 355  * allocated, it returns the current height of the tree at the point
 356  * at which it found the unallocated block. Blocks which are found are
 357  * added to the mp->mp_bh[] list.
 358  *
 359  * Returns: error
 360  */
 361
 362 static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
 363 {
 364         return __fillup_metapath(ip, mp, 0, ip->i_height - 1);
 365 }
 366
 367 /**
 368  * fillup_metapath - fill up buffers for the metadata path to a specific height
 369  * @ip: The inode
 370  * @mp: The metapath
 371  * @h: The height to which it should be mapped
 372  *
 373  * Similar to lookup_metapath, but does lookups for a range of heights
 374  *
 375  * Returns: error or the number of buffers filled
 376  */
 377
 378 static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
 379 {
 380         unsigned int x = 0;
 381         int ret;
 382
 383         if (h) {
 384                 /* find the first buffer we need to look up. */
 385                 for (x = h - 1; x > 0; x--) {
 386                         if (mp->mp_bh[x])
 387                                 break;
 388                 }
 389         }
 390         ret = __fillup_metapath(ip, mp, x, h);
 391         if (ret)
 392                 return ret;
 393         return mp->mp_aheight - x - 1;
 394 }
 395
 396 static void release_metapath(struct metapath *mp)
 397 {
 398         int i;
 399
 400         for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
 401                 if (mp->mp_bh[i] == NULL)
 402                         break;
 403                 brelse(mp->mp_bh[i]);
 404                 mp->mp_bh[i] = NULL;
 405         }
 406 }
 407
 408 /**
 409  * gfs2_extent_length - Returns length of an extent of blocks
 410  * @bh: The metadata block
 411  * @ptr: Current position in @bh
 412  * @limit: Max extent length to return
 413  * @eob: Set to 1 if we hit "end of block"
 414  *
 415  * Returns: The length of the extent (minimum of one block)
 416  */
 417
 418 static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, size_t limit, int *eob)
 419 {
 420         const __be64 *end = (__be64 *)(bh->b_data + bh->b_size);
 421         const __be64 *first = ptr;
 422         u64 d = be64_to_cpu(*ptr);
 423
 424         *eob = 0;
 425         do {
 426                 ptr++;
 427                 if (ptr >= end)
 428                         break;
 429                 d++;
 430         } while(be64_to_cpu(*ptr) == d);
 431         if (ptr >= end)
 432                 *eob = 1;
 433         return ptr - first;
 434 }
 435
 436 typedef const __be64 *(*gfs2_metadata_walker)(
 437                 struct metapath *mp,
 438                 const __be64 *start, const __be64 *end,
 439                 u64 factor, void *data);
 440
     /*
      * Special return values of a gfs2_metadata_walker, as interpreted by
      * gfs2_walk_metadata():
      *   WALK_STOP - end the walk.
      *   WALK_NEXT - nothing of interest in [start, end); move on to the
      *               following pointers.
      * Any other return value is taken as a pointer within [start, end)
      * at which the walk should descend one level.
      */
 441 #define WALK_STOP ((__be64 *)0)
 442 #define WALK_NEXT ((__be64 *)1)
 443
     /*
      * gfs2_walk_metadata - walk the metadata tree, calling @walker on runs
      * of pointers
      * @inode: The inode
      * @lblock: Logical start block (not referenced in the function body)
      * @len: How far to walk; compared against pointer counts multiplied
      *       by @factor
      * @mp: The metapath to start from
      * @walker: Called for each run of pointers [start, end) visited
      * @data: Opaque argument handed through to @walker
      *
      * Before buffers are released or looked up, @mp is cloned and the walk
      * continues on the clone, which is released at the end.  Note that
      * mp_list[] of the caller's metapath can still be updated (when the
      * walker returns a pointer on the first iteration).
      *
      * Returns: 0, or a negative value from fillup_metapath()
      */
 444 static int gfs2_walk_metadata(struct inode *inode, sector_t lblock,
 445                 u64 len, struct metapath *mp, gfs2_metadata_walker walker,
 446                 void *data)
 447 {
 448         struct metapath clone;
 449         struct gfs2_inode *ip = GFS2_I(inode);
 450         struct gfs2_sbd *sdp = GFS2_SB(inode);
 451         const __be64 *start, *end, *ptr;
 452         u64 factor = 1;
 453         unsigned int hgt;
 454         int ret = 0;
 455
             /*
              * Start at height mp_aheight - 1; factor is multiplied by
              * sd_inptrs once for every height between there and the
              * bottom of the tree.
              */
 456         for (hgt = ip->i_height - 1; hgt >= mp->mp_aheight; hgt--)
 457                 factor *= sdp->sd_inptrs;
 458
 459         for (;;) {
 460                 u64 step;
 461
 462                 /* Walk indirect block. */
 463                 start = metapointer(hgt, mp);
 464                 end = metaend(hgt, mp);
 465
                     /* Clamp the run so that it covers no more than len. */
 466                 step = (end - start) * factor;
 467                 if (step > len)
 468                         end = start + DIV_ROUND_UP_ULL(len, factor);
 469
 470                 ptr = walker(mp, start, end, factor, data);
 471                 if (ptr == WALK_STOP)
 472                         break;
 473                 if (step >= len)
 474                         break;
 475                 len -= step;
 476                 if (ptr != WALK_NEXT) {
                             /* Descend at the pointer the walker picked. */
 477                         BUG_ON(!*ptr);
 478                         mp->mp_list[hgt] += ptr - start;
 479                         goto fill_up_metapath;
 480                 }
 481
 482 lower_metapath:
 483                 /* Decrease height of metapath. */
 484                 if (mp != &clone) {
 485                         clone_metapath(&clone, mp);
 486                         mp = &clone;
 487                 }
 488                 brelse(mp->mp_bh[hgt]);
 489                 mp->mp_bh[hgt] = NULL;
 490                 if (!hgt)
 491                         break;
 492                 hgt--;
 493                 factor *= sdp->sd_inptrs;
 494
 495                 /* Advance in metadata tree. */
 496                 (mp->mp_list[hgt])++;
 497                 start = metapointer(hgt, mp);
 498                 end = metaend(hgt, mp);
 499                 if (start >= end) {
                             /* Ran off the end of this block: go up again. */
 500                         mp->mp_list[hgt] = 0;
 501                         if (!hgt)
 502                                 break;
 503                         goto lower_metapath;
 504                 }
 505
 506 fill_up_metapath:
 507                 /* Increase height of metapath. */
 508                 if (mp != &clone) {
 509                         clone_metapath(&clone, mp);
 510                         mp = &clone;
 511                 }
 512                 ret = fillup_metapath(ip, mp, ip->i_height - 1);
 513                 if (ret < 0)
 514                         break;
                     /* ret buffers were added: undo that many factor steps. */
 515                 hgt += ret;
 516                 for (; ret; ret--)
 517                         do_div(factor, sdp->sd_inptrs);
 518                 mp->mp_aheight = hgt + 1;
 519         }
 520         if (mp == &clone)
 521                 release_metapath(mp);
 522         return ret;
 523 }
 524
 525 struct gfs2_hole_walker_args {
 526         u64 blocks;
 527 };
 528
 529 static const __be64 *gfs2_hole_walker(struct metapath *mp,
 530                 const __be64 *start, const __be64 *end,
 531                 u64 factor, void *data)
 532 {
 533         struct gfs2_hole_walker_args *args = data;
 534         const __be64 *ptr;
 535
 536         for (ptr = start; ptr < end; ptr++) {
 537                 if (*ptr) {
 538                         args->blocks += (ptr - start) * factor;
 539                         if (mp->mp_aheight == mp->mp_fheight)
 540                                 return WALK_STOP;
 541                         return ptr;  /* increase height */
 542                 }
 543         }
 544         args->blocks += (end - start) * factor;
 545         return WALK_NEXT;
 546 }
 547
 548 /**
 549  * gfs2_hole_size - figure out the size of a hole
 550  * @inode: The inode
 551  * @lblock: The logical starting block number
 552  * @len: How far to look (in blocks)
 553  * @mp: The metapath at lblock
 554  * @iomap: The iomap to store the hole size in
 555  *
 556  * This function modifies @mp.
 557  *
 558  * Returns: errno on error
 559  */
 560 static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len,
 561                           struct metapath *mp, struct iomap *iomap)
 562 {
 563         struct gfs2_hole_walker_args args = { };
 564         int ret = 0;
 565
 566         ret = gfs2_walk_metadata(inode, lblock, len, mp, gfs2_hole_walker, &args);
 567         if (!ret)
 568                 iomap->length = args.blocks << inode->i_blkbits;
 569         return ret;
 570 }
 571
 572 static inline __be64 *gfs2_indirect_init(struct metapath *mp,
 573                                          struct gfs2_glock *gl, unsigned int i,
 574                                          unsigned offset, u64 bn)
 575 {
 576         __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
 577                        ((i > 1) ? sizeof(struct gfs2_meta_header) :
 578                                  sizeof(struct gfs2_dinode)));
 579         BUG_ON(i < 1);
 580         BUG_ON(mp->mp_bh[i] != NULL);
 581         mp->mp_bh[i] = gfs2_meta_new(gl, bn);
 582         gfs2_trans_add_meta(gl, mp->mp_bh[i]);
 583         gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
 584         gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
 585         ptr += offset;
 586         *ptr = cpu_to_be64(bn);
 587         return ptr;
 588 }
 589
 590 enum alloc_state {
 591         ALLOC_DATA = 0,         /* tree complete, adding data blocks */
 592         ALLOC_GROW_DEPTH = 1,   /* branching from the existing tree */
 593         ALLOC_GROW_HEIGHT = 2,  /* growing the height of the tree */
 594         /* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
 595 };
 596
 597 /**
 598  * gfs2_iomap_alloc - Build a metadata tree of the requested height
 599  * @inode: The GFS2 inode
 600  * @iomap: The iomap structure
 601  * @flags: iomap flags (not referenced in the function body)
 602  * @mp: The metapath, with proper height information calculated
 603  *
 604  * In this routine we may have to alloc:
 605  *   i) Indirect blocks to grow the metadata tree height
 606  *  ii) Indirect blocks to fill in lower part of the metadata tree
 607  * iii) Data blocks
 608  *
 609  * This function is called after gfs2_iomap_get, which works out the
 610  * total number of blocks which we need via gfs2_alloc_size.
 611  *
 612  * We then do the actual allocation asking for an extent at a time (if
 613  * enough contiguous free blocks are available, there will only be one
 614  * allocation request per call) and uses the state machine to initialise
 615  * the blocks in order.
 616  *
 617  * Right now, this function will allocate at most one indirect block
 618  * worth of data -- with a default block size of 4K, that's slightly
 619  * less than 2M.  If this limitation is ever removed to allow huge
 620  * allocations, we would probably still want to limit the iomap size we
 621  * return to avoid stalling other tasks during huge writes; the next
 622  * iomap iteration would then find the blocks already allocated.
 623  *
      * The allocation runs with ip->i_rw_mutex held for writing.  On
      * success @iomap's addr, flags, type and length are filled in and the
      * dinode is written back to mp->mp_bh[0].
      *
 624  * Returns: errno on error
 625  */
 626
 627 static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
 628                             unsigned flags, struct metapath *mp)
 629 {
 630         struct gfs2_inode *ip = GFS2_I(inode);
 631         struct gfs2_sbd *sdp = GFS2_SB(inode);
 632         struct buffer_head *dibh = mp->mp_bh[0];
 633         u64 bn;
 634         unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
 635         size_t dblks = iomap->length >> inode->i_blkbits;
 636         const unsigned end_of_metadata = mp->mp_fheight - 1;
 637         int ret;
 638         enum alloc_state state;
 639         __be64 *ptr;
 640         __be64 zero_bn = 0;
 641
 642         BUG_ON(mp->mp_aheight < 1);
 643         BUG_ON(dibh == NULL);
 644         BUG_ON(dblks < 1);
 645
 646         gfs2_trans_add_meta(ip->i_gl, dibh);
 647
 648         down_write(&ip->i_rw_mutex);
 649
             /* Pick the initial state and count the indirect blocks needed. */
 650         if (mp->mp_fheight == mp->mp_aheight) {
 651                 /* Bottom indirect block exists */
 652                 state = ALLOC_DATA;
 653         } else {
 654                 /* Need to allocate indirect blocks */
 655                 if (mp->mp_fheight == ip->i_height) {
 656                         /* Writing into existing tree, extend tree down */
 657                         iblks = mp->mp_fheight - mp->mp_aheight;
 658                         state = ALLOC_GROW_DEPTH;
 659                 } else {
 660                         /* Building up tree height */
 661                         state = ALLOC_GROW_HEIGHT;
 662                         iblks = mp->mp_fheight - ip->i_height;
 663                         branch_start = metapath_branch_start(mp);
 664                         iblks += (mp->mp_fheight - branch_start);
 665                 }
 666         }
 667
 668         /* start of the second part of the function (state machine) */
 669
 670         blks = dblks + iblks;
             /* i is the height of the next indirect block to initialise. */
 671         i = mp->mp_aheight;
 672         do {
                     /* Ask for everything still missing; n returns the extent got. */
 673                 n = blks - alloced;
 674                 ret = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
 675                 if (ret)
 676                         goto out;
 677                 alloced += n;
 678                 if (state != ALLOC_DATA || gfs2_is_jdata(ip))
 679                         gfs2_trans_add_unrevoke(sdp, bn, n);
 680                 switch (state) {
 681                 /* Growing height of tree */
 682                 case ALLOC_GROW_HEIGHT:
 683                         if (i == 1) {
                                     /*
                                      * Remember the dinode's first pointer; it
                                      * is written into the new indirect block
                                      * further down.
                                      */
 684                                 ptr = (__be64 *)(dibh->b_data +
 685                                                  sizeof(struct gfs2_dinode));
 686                                 zero_bn = *ptr;
 687                         }
 688                         for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0;
 689                              i++, n--)
 690                                 gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
 691                         if (i - 1 == mp->mp_fheight - ip->i_height) {
                                     /*
                                      * All new top levels exist: move the old
                                      * dinode pointers into the lowest new
                                      * block and leave only the first pointer
                                      * in the dinode.
                                      */
 692                                 i--;
 693                                 gfs2_buffer_copy_tail(mp->mp_bh[i],
 694                                                 sizeof(struct gfs2_meta_header),
 695                                                 dibh, sizeof(struct gfs2_dinode));
 696                                 gfs2_buffer_clear_tail(dibh,
 697                                                 sizeof(struct gfs2_dinode) +
 698                                                 sizeof(__be64));
 699                                 ptr = (__be64 *)(mp->mp_bh[i]->b_data +
 700                                         sizeof(struct gfs2_meta_header));
 701                                 *ptr = zero_bn;
 702                                 state = ALLOC_GROW_DEPTH;
                                     /* Drop buffers from branch_start on. */
 703                                 for(i = branch_start; i < mp->mp_fheight; i++) {
 704                                         if (mp->mp_bh[i] == NULL)
 705                                                 break;
 706                                         brelse(mp->mp_bh[i]);
 707                                         mp->mp_bh[i] = NULL;
 708                                 }
 709                                 i = branch_start;
 710                         }
 711                         if (n == 0)
 712                                 break;
 713                 /* fall through - To branching from existing tree */
 714                 case ALLOC_GROW_DEPTH:
 715                         if (i > 1 && i < mp->mp_fheight)
 716                                 gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
 717                         for (; i < mp->mp_fheight && n > 0; i++, n--)
 718                                 gfs2_indirect_init(mp, ip->i_gl, i,
 719                                                    mp->mp_list[i-1], bn++);
 720                         if (i == mp->mp_fheight)
 721                                 state = ALLOC_DATA;
 722                         if (n == 0)
 723                                 break;
 724                 /* fall through - To tree complete, adding data blocks */
 725                 case ALLOC_DATA:
 726                         BUG_ON(n > dblks);
 727                         BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
 728                         gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
                             /* The remaining n blocks become the data extent. */
 729                         dblks = n;
 730                         ptr = metapointer(end_of_metadata, mp);
 731                         iomap->addr = bn << inode->i_blkbits;
 732                         iomap->flags |= IOMAP_F_MERGED | IOMAP_F_NEW;
 733                         while (n-- > 0)
 734                                 *ptr++ = cpu_to_be64(bn++);
 735                         break;
 736                 }
                     /*
                      * iomap->addr is only assigned in the ALLOC_DATA case, so
                      * this repeats until data blocks have been handed out
                      * (assumes the caller passed addr == IOMAP_NULL_ADDR -
                      * TODO confirm).
                      */
 737         } while (iomap->addr == IOMAP_NULL_ADDR);
 738
 739         iomap->type = IOMAP_MAPPED;
 740         iomap->length = (u64)dblks << inode->i_blkbits;
 741         ip->i_height = mp->mp_fheight;
 742         gfs2_add_inode_blocks(&ip->i_inode, alloced);
 743         gfs2_dinode_out(ip, dibh->b_data);
 744 out:
 745         up_write(&ip->i_rw_mutex);
 746         return ret;
 747 }
 748
 749 #define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE
 750
 751 /**
 752  * gfs2_alloc_size - Compute the maximum allocation size
 753  * @inode: The inode
 754  * @mp: The metapath
 755  * @size: Requested size in blocks
 756  *
 757  * Compute the maximum size of the next allocation at @mp.
 758  *
 759  * Returns: size in blocks
 760  */
 761 static u64 gfs2_alloc_size(struct inode *inode, struct metapath *mp, u64 size)
 762 {
 763         struct gfs2_inode *ip = GFS2_I(inode);
 764         struct gfs2_sbd *sdp = GFS2_SB(inode);
 765         const __be64 *first, *ptr, *end;
 766
 767         /*
 768          * For writes to stuffed files, this function is called twice via
 769          * gfs2_iomap_get, before and after unstuffing. The size we return the
 770          * first time needs to be large enough to get the reservation and
 771          * allocation sizes right.  The size we return the second time must
 772          * be exact or else gfs2_iomap_alloc won't do the right thing.
 773          */
 774
 775         if (gfs2_is_stuffed(ip) || mp->mp_fheight != mp->mp_aheight) {
 776                 unsigned int maxsize = mp->mp_fheight > 1 ?
 777                         sdp->sd_inptrs : sdp->sd_diptrs;
 778                 maxsize -= mp->mp_list[mp->mp_fheight - 1];
 779                 if (size > maxsize)
 780                         size = maxsize;
 781                 return size;
 782         }
 783
 784         first = metapointer(ip->i_height - 1, mp);
 785         end = metaend(ip->i_height - 1, mp);
 786         if (end - first > size)
 787                 end = first + size;
 788         for (ptr = first; ptr < end; ptr++) {
 789                 if (*ptr)
 790                         break;
 791         }
 792         return ptr - first;
 793 }
 794
 795 /**
 796  * gfs2_iomap_get - Map blocks from an inode to disk blocks
 797  * @inode: The inode
 798  * @pos: Starting position in bytes
 799  * @length: Length to map, in bytes
 800  * @flags: iomap flags
 801  * @iomap: The iomap structure
 802  * @mp: The metapath
 803  *
      * Looks up the extent at @pos while holding ip->i_rw_mutex for reading and
      * describes it in @iomap as IOMAP_INLINE (stuffed inode), IOMAP_MAPPED or
      * IOMAP_HOLE.  Nothing is allocated here, in spite of the do_alloc label.
      * The dinode buffer obtained from gfs2_meta_inode_buffer() is stored in
      * @mp->mp_bh[0]; this function does not release it.
      *
      * For IOMAP_INLINE the reported length is i_size for reads and
      * gfs2_max_stuffed_size() for writes.
      *
 804  * Returns: errno
      *          (-EINVAL if @length is zero, -ENOENT for an IOMAP_REPORT lookup
      *          at or beyond i_size, otherwise whatever the helpers return)
 805  */
 806 static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
 807                           unsigned flags, struct iomap *iomap,
 808                           struct metapath *mp)
 809 {
 810         struct gfs2_inode *ip = GFS2_I(inode);
 811         struct gfs2_sbd *sdp = GFS2_SB(inode);
 812         loff_t size = i_size_read(inode);
 813         __be64 *ptr;
 814         sector_t lblock;
 815         sector_t lblock_stop;
 816         int ret;
 817         int eob;
 818         u64 len;
 819         struct buffer_head *dibh = NULL, *bh;
 820         u8 height;
 821
 822         if (!length)
 823                 return -EINVAL;
 824
 825         down_read(&ip->i_rw_mutex);
 826
 827         ret = gfs2_meta_inode_buffer(ip, &dibh);
 828         if (ret)
 829                 goto unlock;
             /* Park the dinode buffer in the metapath; it stays there on return. */
 830         mp->mp_bh[0] = dibh;
 831
 832         if (gfs2_is_stuffed(ip)) {
 833                 if (flags & IOMAP_WRITE) {
 834                         loff_t max_size = gfs2_max_stuffed_size(ip);
 835
                             /*
                              * The write does not fit into the dinode.  Carry
                              * on at the unstuff label; as the inode is still
                              * stuffed, that path ends up reporting a hole.
                              */
 836                         if (pos + length > max_size)
 837                                 goto unstuff;
 838                         iomap->length = max_size;
 839                 } else {
 840                         if (pos >= size) {
 841                                 if (flags & IOMAP_REPORT) {
 842                                         ret = -ENOENT;
 843                                         goto unlock;
 844                                 } else {
 845                                         /* report a hole */
                                             /*
                                              * NOTE: height, lblock and len are
                                              * not assigned yet on this path.
                                              * do_alloc only reads them when
                                              * IOMAP_REPORT or IOMAP_WRITE is
                                              * set or when pos < size, and none
                                              * of those holds here.
                                              */
 846                                         iomap->offset = pos;
 847                                         iomap->length = length;
 848                                         goto do_alloc;
 849                                 }
 850                         }
 851                         iomap->length = size;
 852                 }
                     /*
                      * Inline data: the dinode address converted to bytes, plus
                      * the size of the on-disk dinode header.
                      */
 853                 iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
 854                               sizeof(struct gfs2_dinode);
 855                 iomap->type = IOMAP_INLINE;
 856                 iomap->inline_data = dibh->b_data + sizeof(struct gfs2_dinode);
 857                 goto out;
 858         }
 859
 860 unstuff:
             /* Widen the byte range to whole blocks: lblock .. lblock_stop. */
 861         lblock = pos >> inode->i_blkbits;
 862         iomap->offset = lblock << inode->i_blkbits;
 863         lblock_stop = (pos + length - 1) >> inode->i_blkbits;
 864         len = lblock_stop - lblock + 1;
 865         iomap->length = len << inode->i_blkbits;
 866
             /*
              * Starting from the current height, find the first height whose
              * sd_heightsize[] entry covers lblock.  The result may exceed
              * ip->i_height.
              */
 867         height = ip->i_height;
 868         while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height])
 869                 height++;
 870         find_metapath(sdp, lblock, mp, height);
             /* Taller tree needed, or still stuffed: nothing can be mapped. */
 871         if (height > ip->i_height || gfs2_is_stuffed(ip))
 872                 goto do_alloc;
 873
 874         ret = lookup_metapath(ip, mp);
 875         if (ret)
 876                 goto unlock;
 877
             /* The lookup height differs from the tree height: treat as a hole. */
 878         if (mp->mp_aheight != ip->i_height)
 879                 goto do_alloc;
 880
 881         ptr = metapointer(ip->i_height - 1, mp);
 882         if (*ptr == 0)
 883                 goto do_alloc;
 884
             /*
              * Mapped.  gfs2_extent_length() presumably counts the contiguous
              * blocks starting at *ptr (at most len) and sets eob when the
              * extent reaches the end of bh -- TODO confirm against the helper.
              */
 885         bh = mp->mp_bh[ip->i_height - 1];
 886         len = gfs2_extent_length(bh, ptr, len, &eob);
 887
 888         iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
 889         iomap->length = len << inode->i_blkbits;
 890         iomap->type = IOMAP_MAPPED;
 891         iomap->flags |= IOMAP_F_MERGED;
 892         if (eob)
 893                 iomap->flags |= IOMAP_F_GFS2_BOUNDARY;
 894
 895 out:
 896         iomap->bdev = inode->i_sb->s_bdev;
 897 unlock:
 898         up_read(&ip->i_rw_mutex);
 899         return ret;
 900
 901 do_alloc:
             /* Report a hole; the code below only adjusts iomap->length or ret. */
 902         iomap->addr = IOMAP_NULL_ADDR;
 903         iomap->type = IOMAP_HOLE;
 904         if (flags & IOMAP_REPORT) {
 905                 if (pos >= size)
 906                         ret = -ENOENT;
 907                 else if (height == ip->i_height)
 908                         ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
 909                 else
 910                         iomap->length = size - pos;
 911         } else if (flags & IOMAP_WRITE) {
 912                 u64 alloc_size;
 913
 914                 if (flags & IOMAP_DIRECT)
 915                         goto out;  /* (see gfs2_file_direct_write) */
 916
                     /* Buffered write: trim the hole to what would be allocated. */
 917                 len = gfs2_alloc_size(inode, mp, len);
 918                 alloc_size = len << inode->i_blkbits;
 919                 if (alloc_size < iomap->length)
 920                         iomap->length = alloc_size;
 921         } else {
 922                 if (pos < size && height == ip->i_height)
 923                         ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
 924         }
 925         goto out;
 926 }
 927
 928 /**
 929  * gfs2_lblk_to_dblk - convert logical block to disk block
 930  * @inode: the inode of the file we're mapping
 931  * @lblock: the block relative to the start of the file
 932  * @dblock: the returned dblock, if no error
 933  *
 934  * This function maps a single block from a file logical block (relative to
 935  * the start of the file) to a file system absolute block using iomap.
 936  *
 937  * Returns: the absolute file system block, or an error
 938  */
 939 int gfs2_lblk_to_dblk(struct inode *inode, u32 lblock, u64 *dblock)
 940 {
 941         struct iomap iomap = { };
 942         struct metapath mp = { .mp_aheight = 1, };
 943         loff_t pos = (loff_t)lblock << inode->i_blkbits;
 944         int ret;
 945
 946         ret = gfs2_iomap_get(inode, pos, i_blocksize(inode), 0, &iomap, &mp);
 947         release_metapath(&mp);
 948         if (ret == 0)
 949                 *dblock = iomap.addr >> inode->i_blkbits;
 950
 951         return ret;
 952 }
 953
 954 static int gfs2_write_lock(struct inode *inode)
 955 {
 956         struct gfs2_inode *ip = GFS2_I(inode);
 957         struct gfs2_sbd *sdp = GFS2_SB(inode);
 958         int error;
 959
 960         gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
 961         error = gfs2_glock_nq(&ip->i_gh);
 962         if (error)
 963                 goto out_uninit;
 964         if (&ip->i_inode == sdp->sd_rindex) {
 965                 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
 966
 967                 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
 968                                            GL_NOCACHE, &m_ip->i_gh);
 969                 if (error)
 970                         goto out_unlock;
 971         }
 972         return 0;
 973
 974 out_unlock:
 975         gfs2_glock_dq(&ip->i_gh);
 976 out_uninit:
 977         gfs2_holder_uninit(&ip->i_gh);
 978         return error;
 979 }
 980
 981 static void gfs2_write_unlock(struct inode *inode)
 982 {
 983         struct gfs2_inode *ip = GFS2_I(inode);
 984         struct gfs2_sbd *sdp = GFS2_SB(inode);
 985
 986         if (&ip->i_inode == sdp->sd_rindex) {
 987                 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
 988
 989                 gfs2_glock_dq_uninit(&m_ip->i_gh);
 990         }
 991         gfs2_glock_dq_uninit(&ip->i_gh);
 992 }
 993
 994 static void gfs2_iomap_page_done(struct inode *inode, loff_t pos,
 995                                  unsigned copied, struct page *page,
 996                                  struct iomap *iomap)
 997 {
 998         struct gfs2_inode *ip = GFS2_I(inode);
 999
1000         if (page)
1001                 gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied);
1002 }
1003
/*
 * Page operations that gfs2_iomap_begin_write() installs in iomap->page_ops
 * for inodes that are jdata and not stuffed.
 */
1004 static const struct iomap_page_ops gfs2_iomap_page_ops = {
1005         .page_done = gfs2_iomap_page_done,
1006 };
1007
/**
 * gfs2_iomap_begin_write - Set up a buffered write for iomap
 * @inode: The inode being written to
 * @pos: Starting position in bytes
 * @length: Length of the write, in bytes
 * @flags: iomap flags
 * @iomap: The iomap structure to fill in
 * @mp: The metapath (filled in by gfs2_iomap_get)
 *
 * On success the following are left held/open: the glock(s) taken by
 * gfs2_write_lock(), the transaction and, when an allocation was required,
 * the quota lock and the in-place reservation.  gfs2_iomap_end() releases
 * them.  On failure the labels at the bottom unwind what was acquired, in
 * reverse order of acquisition.
 *
 * Returns: errno
 */
1008 static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
1009                                   loff_t length, unsigned flags,
1010                                   struct iomap *iomap,
1011                                   struct metapath *mp)
1012 {
1013         struct gfs2_inode *ip = GFS2_I(inode);
1014         struct gfs2_sbd *sdp = GFS2_SB(inode);
1015         unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
1016         bool unstuff, alloc_required;
1017         int ret;
1018
1019         ret = gfs2_write_lock(inode);
1020         if (ret)
1021                 return ret;
1022
             /* A stuffed inode must be unstuffed if the write won't fit in it. */
1023         unstuff = gfs2_is_stuffed(ip) &&
1024                   pos + length > gfs2_max_stuffed_size(ip);
1025
1026         ret = gfs2_iomap_get(inode, pos, length, flags, iomap, mp);
1027         if (ret)
1028                 goto out_unlock;
1029
1030         alloc_required = unstuff || iomap->type == IOMAP_HOLE;
1031
1032         if (alloc_required || gfs2_is_jdata(ip))
1033                 gfs2_write_calc_reserv(ip, iomap->length, &data_blocks,
1034                                        &ind_blocks);
1035
1036         if (alloc_required) {
1037                 struct gfs2_alloc_parms ap = {
1038                         .target = data_blocks + ind_blocks
1039                 };
1040
1041                 ret = gfs2_quota_lock_check(ip, &ap);
1042                 if (ret)
1043                         goto out_unlock;
1044
1045                 ret = gfs2_inplace_reserve(ip, &ap);
1046                 if (ret)
1047                         goto out_qunlock;
1048         }
1049
             /*
              * Size the transaction: the dinode and the indirect blocks always,
              * the data blocks too for jdata, statfs/quota if any blocks are
              * involved, extra statfs room for the rindex, and the resource
              * group blocks when allocating.
              */
1050         rblocks = RES_DINODE + ind_blocks;
1051         if (gfs2_is_jdata(ip))
1052                 rblocks += data_blocks;
1053         if (ind_blocks || data_blocks)
1054                 rblocks += RES_STATFS + RES_QUOTA;
1055         if (inode == sdp->sd_rindex)
1056                 rblocks += 2 * RES_STATFS;
1057         if (alloc_required)
1058                 rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);
1059
1060         ret = gfs2_trans_begin(sdp, rblocks, iomap->length >> inode->i_blkbits);
1061         if (ret)
1062                 goto out_trans_fail;
1063
1064         if (unstuff) {
1065                 ret = gfs2_unstuff_dinode(ip, NULL);
1066                 if (ret)
1067                         goto out_trans_end;
                     /*
                      * The first lookup was done on the stuffed inode; drop that
                      * metapath and look the range up again now that the inode
                      * has been unstuffed.
                      */
1068                 release_metapath(mp);
1069                 ret = gfs2_iomap_get(inode, iomap->offset, iomap->length,
1070                                      flags, iomap, mp);
1071                 if (ret)
1072                         goto out_trans_end;
1073         }
1074
1075         if (iomap->type == IOMAP_HOLE) {
1076                 ret = gfs2_iomap_alloc(inode, iomap, flags, mp);
1077                 if (ret) {
                             /*
                              * End the transaction and drop the reservation
                              * before calling punch_hole() on the range --
                              * presumably to free whatever did get allocated
                              * (TODO confirm).  The remaining unwinding is
                              * shared with the other error paths.
                              */
1078                         gfs2_trans_end(sdp);
1079                         gfs2_inplace_release(ip);
1080                         punch_hole(ip, iomap->offset, iomap->length);
1081                         goto out_qunlock;
1082                 }
1083         }
1084         if (!gfs2_is_stuffed(ip) && gfs2_is_jdata(ip))
1085                 iomap->page_ops = &gfs2_iomap_page_ops;
1086         return 0;
1087
1088 out_trans_end:
1089         gfs2_trans_end(sdp);
1090 out_trans_fail:
1091         if (alloc_required)
1092                 gfs2_inplace_release(ip);
1093 out_qunlock:
1094         if (alloc_required)
1095                 gfs2_quota_unlock(ip);
1096 out_unlock:
1097         gfs2_write_unlock(inode);
1098         return ret;
1099 }
1100
1101 static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
1102                             unsigned flags, struct iomap *iomap)
1103 {
1104         struct gfs2_inode *ip = GFS2_I(inode);
1105         struct metapath mp = { .mp_aheight = 1, };
1106         int ret;
1107
1108         iomap->flags |= IOMAP_F_BUFFER_HEAD;
1109
1110         trace_gfs2_iomap_start(ip, pos, length, flags);
1111         if ((flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT)) {
1112                 ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);
1113         } else {
1114                 ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
1115
1116                 /*
1117                  * Silently fall back to buffered I/O for stuffed files or if
1118                  * we've hot a hole (see gfs2_file_direct_write).
1119                  */
1120                 if ((flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT) &&
1121                     iomap->type != IOMAP_MAPPED)
1122                         ret = -ENOTBLK;
1123         }
1124         if (!ret) {
1125                 get_bh(mp.mp_bh[0]);
1126                 iomap->private = mp.mp_bh[0];
1127         }
1128         release_metapath(&mp);
1129         trace_gfs2_iomap_end(ip, iomap, ret);
1130         return ret;
1131 }
1132
1133 static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
1134                           ssize_t written, unsigned flags, struct iomap *iomap)
1135 {
1136         struct gfs2_inode *ip = GFS2_I(inode);
1137         struct gfs2_sbd *sdp = GFS2_SB(inode);
1138         struct gfs2_trans *tr = current->journal_info;
1139         struct buffer_head *dibh = iomap->private;
1140
1141         if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) != IOMAP_WRITE)
1142                 goto out;
1143
1144         if (iomap->type != IOMAP_INLINE) {
1145                 gfs2_ordered_add_inode(ip);
1146
1147                 if (tr->tr_num_buf_new)
1148                         __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1149                 else
1150                         gfs2_trans_add_meta(ip->i_gl, dibh);
1151         }
1152
1153         if (inode == sdp->sd_rindex) {
1154                 adjust_fs_space(inode);
1155                 sdp->sd_rindex_uptodate = 0;
1156         }
1157
1158         gfs2_trans_end(sdp);
1159         gfs2_inplace_release(ip);
1160
1161         if (length != written && (iomap->flags & IOMAP_F_NEW)) {
1162                 /* Deallocate blocks that were just allocated. */
1163                 loff_t blockmask = i_blocksize(inode) - 1;
1164                 loff_t end = (pos + length) & ~blockmask;
1165
1166                 pos = (pos + written + blockmask) & ~blockmask;
1167                 if (pos < end) {
1168                         truncate_pagecache_range(inode, pos, end - 1);
1169                         punch_hole(ip, pos, end - pos);
1170                 }
1171         }
1172
1173         if (ip->i_qadata && ip->i_qadata->qa_qd_num)
1174                 gfs2_quota_unlock(ip);
1175         gfs2_write_unlock(inode);
1176
1177 out:
1178         if (dibh)
1179                 brelse(dibh);
1180         return 0;
1181 }
1182
/*
 * iomap operations for gfs2: gfs2_iomap_begin() sets a mapping up and
 * gfs2_iomap_end() tears it down again.
 */
1183 const struct iomap_ops gfs2_iomap_ops = {
1184         .iomap_begin = gfs2_iomap_begin,
1185         .iomap_end = gfs2_iomap_end,
1186 };
1187
1188 /**
1189  * gfs2_block_map - Map one or more blocks of an inode to a disk block
1190  * @inode: The inode
1191  * @lblock: The logical block number
1192  * @bh_map: The bh to be mapped
1193  * @create: True if its ok to alloc blocks to satify the request
1194  *
1195  * The size of the requested mapping is defined in bh_map->b_size.
1196  *
1197  * Clears buffer_mapped(bh_map) and leaves bh_map->b_size unchanged
1198  * when @lblock is not mapped.  Sets buffer_mapped(bh_map) and
1199  * bh_map->b_size to indicate the size of the mapping when @lblock and
1200  * successive blocks are mapped, up to the requested size.
1201  *
1202  * Sets buffer_boundary() if a read of metadata will be required
1203  * before the next block can be mapped. Sets buffer_new() if new
1204  * blocks were allocated.
1205  *
1206  * Returns: errno
1207  */
1208
1209 int gfs2_block_map(struct inode *inode, sector_t lblock,
1210                    struct buffer_head *bh_map, int create)
1211 {
1212         struct gfs2_inode *ip = GFS2_I(inode);
1213         loff_t pos = (loff_t)lblock << inode->i_blkbits;
1214         loff_t length = bh_map->b_size;
1215         struct metapath mp = { .mp_aheight = 1, };
1216         struct iomap iomap = { };
1217         int ret;
1218
1219         clear_buffer_mapped(bh_map);
1220         clear_buffer_new(bh_map);
1221         clear_buffer_boundary(bh_map);
1222         trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
1223
1224         if (create) {
1225                 ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, &iomap, &mp);
1226                 if (!ret && iomap.type == IOMAP_HOLE)
1227                         ret = gfs2_iomap_alloc(inode, &iomap, IOMAP_WRITE, &mp);
1228                 release_metapath(&mp);
1229         } else {
1230                 ret = gfs2_iomap_get(inode, pos, length, 0, &iomap, &mp);
1231                 release_metapath(&mp);
1232         }
1233         if (ret)
1234                 goto out;
1235
1236         if (iomap.length > bh_map->b_size) {
1237                 iomap.length = bh_map->b_size;
1238                 iomap.flags &= ~IOMAP_F_GFS2_BOUNDARY;
1239         }
1240         if (iomap.addr != IOMAP_NULL_ADDR)
1241                 map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits);
1242         bh_map->b_size = iomap.length;
1243         if (iomap.flags & IOMAP_F_GFS2_BOUNDARY)
1244                 set_buffer_boundary(bh_map);
1245         if (iomap.flags & IOMAP_F_NEW)
1246                 set_buffer_new(bh_map);
1247
1248 out:
1249         trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
1250         return ret;
1251 }
1252
1253 /*
1254  * Deprecated: do not use in new code
1255  */
1256 int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
1257 {
1258         struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
1259         int ret;
1260         int create = *new;
1261
1262         BUG_ON(!extlen);
1263         BUG_ON(!dblock);
1264         BUG_ON(!new);
1265
1266         bh.b_size = BIT(inode->i_blkbits + (create ? 0 : 5));
1267         ret = gfs2_block_map(inode, lblock, &bh, create);
1268         *extlen = bh.b_size >> inode->i_blkbits;
1269         *dblock = bh.b_blocknr;
1270         if (buffer_new(&bh))
1271                 *new = 1;
1272         else
1273                 *new = 0;
1274         return ret;
1275 }
1276
1277 /**
1278  * gfs2_block_zero_range - Deal with zeroing out data
1279  *
1280  * This is partly borrowed from ext3.
1281  */
1282 static int gfs2_block_zero_range(struct inode *inode, loff_t from,
1283                                  unsigned int length)
1284 {
1285         struct address_space *mapping = inode->i_mapping;
1286         struct gfs2_inode *ip = GFS2_I(inode);
1287         unsigned long index = from >> PAGE_SHIFT;
1288         unsigned offset = from & (PAGE_SIZE-1);
1289         unsigned blocksize, iblock, pos;
1290         struct buffer_head *bh;
1291         struct page *page;
1292         int err;
1293
1294         page = find_or_create_page(mapping, index, GFP_NOFS);
1295         if (!page)
1296                 return 0;
1297
1298         blocksize = inode->i_sb->s_blocksize;
1299         iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
1300
1301         if (!page_has_buffers(page))
1302                 create_empty_buffers(page, blocksize, 0);
1303
1304         /* Find the buffer that contains "offset" */
1305         bh = page_buffers(page);
1306         pos = blocksize;
1307         while (offset >= pos) {
1308                 bh = bh->b_this_page;
1309                 iblock++;
1310                 pos += blocksize;
1311         }
1312
1313         err = 0;
1314
1315         if (!buffer_mapped(bh)) {
1316                 gfs2_block_map(inode, iblock, bh, 0);
1317                 /* unmapped? It's a hole - nothing to do */
1318                 if (!buffer_mapped(bh))
1319                         goto unlock;
1320         }
1321
1322         /* Ok, it's mapped. Make sure it's up-to-date */
1323         if (PageUptodate(page))
1324                 set_buffer_uptodate(bh);
1325
1326         if (!buffer_uptodate(bh)) {
1327                 err = -EIO;
1328                 ll_rw_block(REQ_OP_READ, 0, 1, &bh);
1329                 wait_on_buffer(bh);
1330                 /* Uhhuh. Read error. Complain and punt. */
1331                 if (!buffer_uptodate(bh))
1332                         goto unlock;
1333                 err = 0;
1334         }
1335
1336         if (gfs2_is_jdata(ip))
1337                 gfs2_trans_add_data(ip->i_gl, bh);
1338         else
1339                 gfs2_ordered_add_inode(ip);
1340
1341         zero_user(page, offset, length);
1342         mark_buffer_dirty(bh);
1343 unlock:
1344         unlock_page(page);
1345         put_page(page);
1346         return err;
1347 }
1348
1349 #define GFS2_JTRUNC_REVOKES 8192
1350
1351 /**
1352  * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
1353  * @inode: The inode being truncated
1354  * @oldsize: The original (larger) size
1355  * @newsize: The new smaller size
1356  *
1357  * With jdata files, we have to journal a revoke for each block which is
1358  * truncated. As a result, we need to split this into separate transactions
1359  * if the number of pages being truncated gets too large.
1360  */
1361
1362 static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
1363 {
1364         struct gfs2_sbd *sdp = GFS2_SB(inode);
1365         u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
1366         u64 chunk;
1367         int error;
1368
1369         while (oldsize != newsize) {
1370                 struct gfs2_trans *tr;
1371                 unsigned int offs;
1372
1373                 chunk = oldsize - newsize;
1374                 if (chunk > max_chunk)
1375                         chunk = max_chunk;
1376
1377                 offs = oldsize & ~PAGE_MASK;
1378                 if (offs && chunk > PAGE_SIZE)
1379                         chunk = offs + ((chunk - offs) & PAGE_MASK);
1380
1381                 truncate_pagecache(inode, oldsize - chunk);
1382                 oldsize -= chunk;
1383
1384                 tr = current->journal_info;
1385                 if (!test_bit(TR_TOUCHED, &tr->tr_flags))
1386                         continue;
1387
1388                 gfs2_trans_end(sdp);
1389                 error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
1390                 if (error)
1391                         return error;
1392         }
1393
1394         return 0;
1395 }
1396
1397 static int trunc_start(struct inode *inode, u64 newsize)
1398 {
1399         struct gfs2_inode *ip = GFS2_I(inode);
1400         struct gfs2_sbd *sdp = GFS2_SB(inode);
1401         struct buffer_head *dibh = NULL;
1402         int journaled = gfs2_is_jdata(ip);
1403         u64 oldsize = inode->i_size;
1404         int error;
1405
1406         if (journaled)
1407                 error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
1408         else
1409                 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1410         if (error)
1411                 return error;
1412
1413         error = gfs2_meta_inode_buffer(ip, &dibh);
1414         if (error)
1415                 goto out;
1416
1417         gfs2_trans_add_meta(ip->i_gl, dibh);
1418
1419         if (gfs2_is_stuffed(ip)) {
1420                 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
1421         } else {
1422                 unsigned int blocksize = i_blocksize(inode);
1423                 unsigned int offs = newsize & (blocksize - 1);
1424                 if (offs) {
1425                         error = gfs2_block_zero_range(inode, newsize,
1426                                                       blocksize - offs);
1427                         if (error)
1428                                 goto out;
1429                 }
1430                 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1431         }
1432
1433         i_size_write(inode, newsize);
1434         ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1435         gfs2_dinode_out(ip, dibh->b_data);
1436
1437         if (journaled)
1438                 error = gfs2_journaled_truncate(inode, oldsize, newsize);
1439         else
1440                 truncate_pagecache(inode, newsize);
1441
1442 out:
1443         brelse(dibh);
1444         if (current->journal_info)
1445                 gfs2_trans_end(sdp);
1446         return error;
1447 }
1448
1449 int gfs2_iomap_get_alloc(struct inode *inode, loff_t pos, loff_t length,
1450                          struct iomap *iomap)
1451 {
1452         struct metapath mp = { .mp_aheight = 1, };
1453         int ret;
1454
1455         ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &mp);
1456         if (!ret && iomap->type == IOMAP_HOLE)
1457                 ret = gfs2_iomap_alloc(inode, iomap, IOMAP_WRITE, &mp);
1458         release_metapath(&mp);
1459         return ret;
1460 }
1461
1462 /**
1463  * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
1464  * @ip: inode
1465  * @rd_gh: holder of resource group glock
1466  * @bh: buffer head to sweep
1467  * @start: starting point in bh
1468  * @end: end point in bh
1469  * @meta: true if bh points to metadata (rather than data)
1470  * @btotal: place to keep count of total blocks freed
1471  *
1472  * We sweep a metadata buffer (provided by the metapath) for blocks we need to
1473  * free, and free them all. However, we do it one rgrp at a time. If this
1474  * block has references to multiple rgrps, we break it into individual
1475  * transactions. This allows other processes to use the rgrps while we're
1476  * focused on a single one, for better concurrency / performance.
1477  * At every transaction boundary, we rewrite the inode into the journal.
1478  * That way the bitmaps are kept consistent with the inode and we can recover
1479  * if we're interrupted by power-outages.
1480  *
      * If @rd_gh is not initialized on entry, the rgrp of the first non-zero
      * pointer is looked up and locked here.  The glock is dropped only when
      * the sweep is repeated for another rgrp; on the final pass and on error
      * it is left as is.  Likewise, a transaction begun here (together with
      * ip->i_rw_mutex taken for writing) is only ended when the sweep is
      * repeated, so this function may return with both still held -- the
      * caller is presumably expected to clean them up.
      *
1481  * Returns: 0, or return code if an error occurred.
1482  *          *btotal has the total number of blocks freed
1483  */
1484 static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
1485                               struct buffer_head *bh, __be64 *start, __be64 *end,
1486                               bool meta, u32 *btotal)
1487 {
1488         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1489         struct gfs2_rgrpd *rgd;
1490         struct gfs2_trans *tr;
1491         __be64 *p;
1492         int blks_outside_rgrp;
1493         u64 bn, bstart, isize_blks;
1494         s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
1495         int ret = 0;
1496         bool buf_in_tr = false; /* buffer was added to transaction */
1497
1498 more_rgrps:
1499         rgd = NULL;
1500         if (gfs2_holder_initialized(rd_gh)) {
1501                 rgd = gfs2_glock2rgrp(rd_gh->gh_gl);
1502                 gfs2_assert_withdraw(sdp,
1503                              gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
1504         }
1505         blks_outside_rgrp = 0;
             /* bstart/blen track the current run of contiguous blocks to free. */
1506         bstart = 0;
1507         blen = 0;
1508
1509         for (p = start; p < end; p++) {
1510                 if (!*p)
1511                         continue;
1512                 bn = be64_to_cpu(*p);
1513
1514                 if (rgd) {
                             /* Blocks of other rgrps are left for a later pass. */
1515                         if (!rgrp_contains_block(rgd, bn)) {
1516                                 blks_outside_rgrp++;
1517                                 continue;
1518                         }
1519                 } else {
                             /* No rgrp chosen yet: lock the one that owns bn. */
1520                         rgd = gfs2_blk2rgrpd(sdp, bn, true);
1521                         if (unlikely(!rgd)) {
1522                                 ret = -EIO;
1523                                 goto out;
1524                         }
1525                         ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
1526                                                  0, rd_gh);
1527                         if (ret)
1528                                 goto out;
1529
1530                         /* Must be done with the rgrp glock held: */
1531                         if (gfs2_rs_active(&ip->i_res) &&
1532                             rgd == ip->i_res.rs_rbm.rgd)
1533                                 gfs2_rs_deltree(&ip->i_res);
1534                 }
1535
1536                 /* The size of our transactions will be unknown until we
1537                    actually process all the metadata blocks that relate to
1538                    the rgrp. So we estimate. We know it can't be more than
1539                    the dinode's i_blocks and we don't want to exceed the
1540                    journal flush threshold, sd_log_thresh2. */
1541                 if (current->journal_info == NULL) {
1542                         unsigned int jblocks_rqsted, revokes;
1543
1544                         jblocks_rqsted = rgd->rd_length + RES_DINODE +
1545                                 RES_INDIRECT;
1546                         isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
1547                         if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
1548                                 jblocks_rqsted +=
1549                                         atomic_read(&sdp->sd_log_thresh2);
1550                         else
1551                                 jblocks_rqsted += isize_blks;
1552                         revokes = jblocks_rqsted;
1553                         if (meta)
1554                                 revokes += end - start;
1555                         else if (ip->i_depth)
1556                                 revokes += sdp->sd_inptrs;
1557                         ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
1558                         if (ret)
1559                                 goto out_unlock;
                             /* Taken with the transaction, released with it. */
1560                         down_write(&ip->i_rw_mutex);
1561                 }
1562                 /* check if we will exceed the transaction blocks requested */
1563                 tr = current->journal_info;
1564                 if (tr->tr_num_buf_new + RES_STATFS +
1565                     RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
1566                         /* We set blks_outside_rgrp to ensure the loop will
1567                            be repeated for the same rgrp, but with a new
1568                            transaction. */
1569                         blks_outside_rgrp++;
1570                         /* This next part is tricky. If the buffer was added
1571                            to the transaction, we've already set some block
1572                            pointers to 0, so we better follow through and free
1573                            them, or we will introduce corruption (so break).
1574                            This may be impossible, or at least rare, but I
1575                            decided to cover the case regardless.
1576
1577                            If the buffer was not added to the transaction
1578                            (this call), doing so would exceed our transaction
1579                            size, so we need to end the transaction and start a
1580                            new one (so goto). */
1581
1582                         if (buf_in_tr)
1583                                 break;
1584                         goto out_unlock;
1585                 }
1586
1587                 gfs2_trans_add_meta(ip->i_gl, bh);
1588                 buf_in_tr = true;
1589                 *p = 0;
                     /* bn directly follows the current run: just extend it. */
1590                 if (bstart + blen == bn) {
1591                         blen++;
1592                         continue;
1593                 }
                     /* Run broken: free what was collected, then restart at bn. */
1594                 if (bstart) {
1595                         __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
1596                         (*btotal) += blen;
1597                         gfs2_add_inode_blocks(&ip->i_inode, -blen);
1598                 }
1599                 bstart = bn;
1600                 blen = 1;
1601         }
             /* Free the last run collected by the loop, if any. */
1602         if (bstart) {
1603                 __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
1604                 (*btotal) += blen;
1605                 gfs2_add_inode_blocks(&ip->i_inode, -blen);
1606         }
1607 out_unlock:
1608         if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
1609                                             outside the rgrp we just processed,
1610                                             do it all over again. */
1611                 if (current->journal_info) {
1612                         struct buffer_head *dibh;
1613
1614                         ret = gfs2_meta_inode_buffer(ip, &dibh);
1615                         if (ret)
1616                                 goto out;
1617
1618                         /* Every transaction boundary, we rewrite the dinode
1619                            to keep its di_blocks current in case of failure. */
1620                         ip->i_inode.i_mtime = ip->i_inode.i_ctime =
1621                                 current_time(&ip->i_inode);
1622                         gfs2_trans_add_meta(ip->i_gl, dibh);
1623                         gfs2_dinode_out(ip, dibh->b_data);
1624                         brelse(dibh);
1625                         up_write(&ip->i_rw_mutex);
1626                         gfs2_trans_end(sdp);
1627                 }
                     /* Let go of this rgrp so that the next pass picks another. */
1628                 gfs2_glock_dq_uninit(rd_gh);
1629                 cond_resched();
1630                 goto more_rgrps;
1631         }
1632 out:
1633         return ret;
1634 }
1635
1636 static bool mp_eq_to_hgt(struct metapath *mp, __u16 *list, unsigned int h)
1637 {
1638         if (memcmp(mp->mp_list, list, h * sizeof(mp->mp_list[0])))
1639                 return false;
1640         return true;
1641 }
1642
1643 /**
1644  * find_nonnull_ptr - find a non-null pointer given a metapath and height
1645  * @mp: starting metapath
1646  * @h: desired height to search
1647  *
1648  * Assumes the metapath is valid (with buffers) out to height h.
1649  * Returns: true if a non-null pointer was found in the metapath buffer
1650  *          false if all remaining pointers are NULL in the buffer
1651  */
1652 static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
1653                              unsigned int h,
1654                              __u16 *end_list, unsigned int end_aligned)
1655 {
1656         struct buffer_head *bh = mp->mp_bh[h];
1657         __be64 *first, *ptr, *end;
1658
1659         first = metaptr1(h, mp);
1660         ptr = first + mp->mp_list[h];
1661         end = (__be64 *)(bh->b_data + bh->b_size);
1662         if (end_list && mp_eq_to_hgt(mp, end_list, h)) {
1663                 bool keep_end = h < end_aligned;
1664                 end = first + end_list[h] + keep_end;
1665         }
1666
1667         while (ptr < end) {
1668                 if (*ptr) { /* if we have a non-null pointer */
1669                         mp->mp_list[h] = ptr - first;
1670                         h++;
1671                         if (h < GFS2_MAX_META_HEIGHT)
1672                                 mp->mp_list[h] = 0;
1673                         return true;
1674                 }
1675                 ptr++;
1676         }
1677         return false;
1678 }
1679
/* States of the deallocation state machine run by punch_hole(). */
enum dealloc_states {
	/* Strip a metapath with all buffers read in */
	DEALLOC_MP_FULL = 0,
	/* lower the metapath strip height */
	DEALLOC_MP_LOWER = 1,
	/* Fill in the metapath to the given height. */
	DEALLOC_FILL_MP = 2,
	/* process complete */
	DEALLOC_DONE = 3,
};
1686
1687 static inline void
1688 metapointer_range(struct metapath *mp, int height,
1689                   __u16 *start_list, unsigned int start_aligned,
1690                   __u16 *end_list, unsigned int end_aligned,
1691                   __be64 **start, __be64 **end)
1692 {
1693         struct buffer_head *bh = mp->mp_bh[height];
1694         __be64 *first;
1695
1696         first = metaptr1(height, mp);
1697         *start = first;
1698         if (mp_eq_to_hgt(mp, start_list, height)) {
1699                 bool keep_start = height < start_aligned;
1700                 *start = first + start_list[height] + keep_start;
1701         }
1702         *end = (__be64 *)(bh->b_data + bh->b_size);
1703         if (end_list && mp_eq_to_hgt(mp, end_list, height)) {
1704                 bool keep_end = height < end_aligned;
1705                 *end = first + end_list[height] + keep_end;
1706         }
1707 }
1708
1709 static inline bool walk_done(struct gfs2_sbd *sdp,
1710                              struct metapath *mp, int height,
1711                              __u16 *end_list, unsigned int end_aligned)
1712 {
1713         __u16 end;
1714
1715         if (end_list) {
1716                 bool keep_end = height < end_aligned;
1717                 if (!mp_eq_to_hgt(mp, end_list, height))
1718                         return false;
1719                 end = end_list[height] + keep_end;
1720         } else
1721                 end = (height > 0) ? sdp->sd_inptrs : sdp->sd_diptrs;
1722         return mp->mp_list[height] >= end;
1723 }
1724
1725 /**
1726  * punch_hole - deallocate blocks in a file
1727  * @ip: inode to truncate
1728  * @offset: the start of the hole
1729  * @length: the size of the hole (or 0 for truncate)
1730  *
1731  * Punch a hole into a file or truncate a file at a given position.  This
1732  * function operates in whole blocks (@offset and @length are rounded
1733  * accordingly); partially filled blocks must be cleared otherwise.
1734  *
1735  * This function works from the bottom up, and from the right to the left. In
1736  * other words, it strips off the highest layer (data) before stripping any of
1737  * the metadata. Doing it this way is best in case the operation is interrupted
1738  * by power failure, etc.  The dinode is rewritten in every transaction to
1739  * guarantee integrity.
1740  */
1741 static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
1742 {
1743         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1744         u64 maxsize = sdp->sd_heightsize[ip->i_height];
1745         struct metapath mp = {};
1746         struct buffer_head *dibh, *bh;
1747         struct gfs2_holder rd_gh;
1748         unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
1749         u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift;
1750         __u16 start_list[GFS2_MAX_META_HEIGHT];
1751         __u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL;
1752         unsigned int start_aligned, uninitialized_var(end_aligned);
1753         unsigned int strip_h = ip->i_height - 1;
1754         u32 btotal = 0;
1755         int ret, state;
1756         int mp_h; /* metapath buffers are read in to this height */
1757         u64 prev_bnr = 0;
1758         __be64 *start, *end;
1759
1760         if (offset >= maxsize) {
1761                 /*
1762                  * The starting point lies beyond the allocated meta-data;
1763                  * there are no blocks do deallocate.
1764                  */
1765                 return 0;
1766         }
1767
1768         /*
1769          * The start position of the hole is defined by lblock, start_list, and
1770          * start_aligned.  The end position of the hole is defined by lend,
1771          * end_list, and end_aligned.
1772          *
1773          * start_aligned and end_aligned define down to which height the start
1774          * and end positions are aligned to the metadata tree (i.e., the
1775          * position is a multiple of the metadata granularity at the height
1776          * above).  This determines at which heights additional meta pointers
1777          * needs to be preserved for the remaining data.
1778          */
1779
1780         if (length) {
1781                 u64 end_offset = offset + length;
1782                 u64 lend;
1783
1784                 /*
1785                  * Clip the end at the maximum file size for the given height:
1786                  * that's how far the metadata goes; files bigger than that
1787                  * will have additional layers of indirection.
1788                  */
1789                 if (end_offset > maxsize)
1790                         end_offset = maxsize;
1791                 lend = end_offset >> bsize_shift;
1792
1793                 if (lblock >= lend)
1794                         return 0;
1795
1796                 find_metapath(sdp, lend, &mp, ip->i_height);
1797                 end_list = __end_list;
1798                 memcpy(end_list, mp.mp_list, sizeof(mp.mp_list));
1799
1800                 for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1801                         if (end_list[mp_h])
1802                                 break;
1803                 }
1804                 end_aligned = mp_h;
1805         }
1806
1807         find_metapath(sdp, lblock, &mp, ip->i_height);
1808         memcpy(start_list, mp.mp_list, sizeof(start_list));
1809
1810         for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1811                 if (start_list[mp_h])
1812                         break;
1813         }
1814         start_aligned = mp_h;
1815
1816         ret = gfs2_meta_inode_buffer(ip, &dibh);
1817         if (ret)
1818                 return ret;
1819
1820         mp.mp_bh[0] = dibh;
1821         ret = lookup_metapath(ip, &mp);
1822         if (ret)
1823                 goto out_metapath;
1824
1825         /* issue read-ahead on metadata */
1826         for (mp_h = 0; mp_h < mp.mp_aheight - 1; mp_h++) {
1827                 metapointer_range(&mp, mp_h, start_list, start_aligned,
1828                                   end_list, end_aligned, &start, &end);
1829                 gfs2_metapath_ra(ip->i_gl, start, end);
1830         }
1831
1832         if (mp.mp_aheight == ip->i_height)
1833                 state = DEALLOC_MP_FULL; /* We have a complete metapath */
1834         else
1835                 state = DEALLOC_FILL_MP; /* deal with partial metapath */
1836
1837         ret = gfs2_rindex_update(sdp);
1838         if (ret)
1839                 goto out_metapath;
1840
1841         ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
1842         if (ret)
1843                 goto out_metapath;
1844         gfs2_holder_mark_uninitialized(&rd_gh);
1845
1846         mp_h = strip_h;
1847
1848         while (state != DEALLOC_DONE) {
1849                 switch (state) {
1850                 /* Truncate a full metapath at the given strip height.
1851                  * Note that strip_h == mp_h in order to be in this state. */
1852                 case DEALLOC_MP_FULL:
1853                         bh = mp.mp_bh[mp_h];
1854                         gfs2_assert_withdraw(sdp, bh);
1855                         if (gfs2_assert_withdraw(sdp,
1856                                                  prev_bnr != bh->b_blocknr)) {
1857                                 printk(KERN_EMERG "GFS2: fsid=%s:inode %llu, "
1858                                        "block:%llu, i_h:%u, s_h:%u, mp_h:%u\n",
1859                                        sdp->sd_fsname,
1860                                        (unsigned long long)ip->i_no_addr,
1861                                        prev_bnr, ip->i_height, strip_h, mp_h);
1862                         }
1863                         prev_bnr = bh->b_blocknr;
1864
1865                         if (gfs2_metatype_check(sdp, bh,
1866                                                 (mp_h ? GFS2_METATYPE_IN :
1867                                                         GFS2_METATYPE_DI))) {
1868                                 ret = -EIO;
1869                                 goto out;
1870                         }
1871
1872                         /*
1873                          * Below, passing end_aligned as 0 gives us the
1874                          * metapointer range excluding the end point: the end
1875                          * point is the first metapath we must not deallocate!
1876                          */
1877
1878                         metapointer_range(&mp, mp_h, start_list, start_aligned,
1879                                           end_list, 0 /* end_aligned */,
1880                                           &start, &end);
1881                         ret = sweep_bh_for_rgrps(ip, &rd_gh, mp.mp_bh[mp_h],
1882                                                  start, end,
1883                                                  mp_h != ip->i_height - 1,
1884                                                  &btotal);
1885
1886                         /* If we hit an error or just swept dinode buffer,
1887                            just exit. */
1888                         if (ret || !mp_h) {
1889                                 state = DEALLOC_DONE;
1890                                 break;
1891                         }
1892                         state = DEALLOC_MP_LOWER;
1893                         break;
1894
1895                 /* lower the metapath strip height */
1896                 case DEALLOC_MP_LOWER:
1897                         /* We're done with the current buffer, so release it,
1898                            unless it's the dinode buffer. Then back up to the
1899                            previous pointer. */
1900                         if (mp_h) {
1901                                 brelse(mp.mp_bh[mp_h]);
1902                                 mp.mp_bh[mp_h] = NULL;
1903                         }
1904                         /* If we can't get any lower in height, we've stripped
1905                            off all we can. Next step is to back up and start
1906                            stripping the previous level of metadata. */
1907                         if (mp_h == 0) {
1908                                 strip_h--;
1909                                 memcpy(mp.mp_list, start_list, sizeof(start_list));
1910                                 mp_h = strip_h;
1911                                 state = DEALLOC_FILL_MP;
1912                                 break;
1913                         }
1914                         mp.mp_list[mp_h] = 0;
1915                         mp_h--; /* search one metadata height down */
1916                         mp.mp_list[mp_h]++;
1917                         if (walk_done(sdp, &mp, mp_h, end_list, end_aligned))
1918                                 break;
1919                         /* Here we've found a part of the metapath that is not
1920                          * allocated. We need to search at that height for the
1921                          * next non-null pointer. */
1922                         if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned)) {
1923                                 state = DEALLOC_FILL_MP;
1924                                 mp_h++;
1925                         }
1926                         /* No more non-null pointers at this height. Back up
1927                            to the previous height and try again. */
1928                         break; /* loop around in the same state */
1929
1930                 /* Fill the metapath with buffers to the given height. */
1931                 case DEALLOC_FILL_MP:
1932                         /* Fill the buffers out to the current height. */
1933                         ret = fillup_metapath(ip, &mp, mp_h);
1934                         if (ret < 0)
1935                                 goto out;
1936
1937                         /* On the first pass, issue read-ahead on metadata. */
1938                         if (mp.mp_aheight > 1 && strip_h == ip->i_height - 1) {
1939                                 unsigned int height = mp.mp_aheight - 1;
1940
1941                                 /* No read-ahead for data blocks. */
1942                                 if (mp.mp_aheight - 1 == strip_h)
1943                                         height--;
1944
1945                                 for (; height >= mp.mp_aheight - ret; height--) {
1946                                         metapointer_range(&mp, height,
1947                                                           start_list, start_aligned,
1948                                                           end_list, end_aligned,
1949                                                           &start, &end);
1950                                         gfs2_metapath_ra(ip->i_gl, start, end);
1951                                 }
1952                         }
1953
1954                         /* If buffers found for the entire strip height */
1955                         if (mp.mp_aheight - 1 == strip_h) {
1956                                 state = DEALLOC_MP_FULL;
1957                                 break;
1958                         }
1959                         if (mp.mp_aheight < ip->i_height) /* We have a partial height */
1960                                 mp_h = mp.mp_aheight - 1;
1961
1962                         /* If we find a non-null block pointer, crawl a bit
1963                            higher up in the metapath and try again, otherwise
1964                            we need to look lower for a new starting point. */
1965                         if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned))
1966                                 mp_h++;
1967                         else
1968                                 state = DEALLOC_MP_LOWER;
1969                         break;
1970                 }
1971         }
1972
1973         if (btotal) {
1974                 if (current->journal_info == NULL) {
1975                         ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
1976                                                RES_QUOTA, 0);
1977                         if (ret)
1978                                 goto out;
1979                         down_write(&ip->i_rw_mutex);
1980                 }
1981                 gfs2_statfs_change(sdp, 0, +btotal, 0);
1982                 gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
1983                                   ip->i_inode.i_gid);
1984                 ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1985                 gfs2_trans_add_meta(ip->i_gl, dibh);
1986                 gfs2_dinode_out(ip, dibh->b_data);
1987                 up_write(&ip->i_rw_mutex);
1988                 gfs2_trans_end(sdp);
1989         }
1990
1991 out:
1992         if (gfs2_holder_initialized(&rd_gh))
1993                 gfs2_glock_dq_uninit(&rd_gh);
1994         if (current->journal_info) {
1995                 up_write(&ip->i_rw_mutex);
1996                 gfs2_trans_end(sdp);
1997                 cond_resched();
1998         }
1999         gfs2_quota_unhold(ip);
2000 out_metapath:
2001         release_metapath(&mp);
2002         return ret;
2003 }
2004
2005 static int trunc_end(struct gfs2_inode *ip)
2006 {
2007         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2008         struct buffer_head *dibh;
2009         int error;
2010
2011         error = gfs2_trans_begin(sdp, RES_DINODE, 0);
2012         if (error)
2013                 return error;
2014
2015         down_write(&ip->i_rw_mutex);
2016
2017         error = gfs2_meta_inode_buffer(ip, &dibh);
2018         if (error)
2019                 goto out;
2020
2021         if (!i_size_read(&ip->i_inode)) {
2022                 ip->i_height = 0;
2023                 ip->i_goal = ip->i_no_addr;
2024                 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
2025                 gfs2_ordered_del_inode(ip);
2026         }
2027         ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
2028         ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
2029
2030         gfs2_trans_add_meta(ip->i_gl, dibh);
2031         gfs2_dinode_out(ip, dibh->b_data);
2032         brelse(dibh);
2033
2034 out:
2035         up_write(&ip->i_rw_mutex);
2036         gfs2_trans_end(sdp);
2037         return error;
2038 }
2039
2040 /**
2041  * do_shrink - make a file smaller
2042  * @inode: the inode
2043  * @newsize: the size to make the file
2044  *
2045  * Called with an exclusive lock on @inode. The @size must
2046  * be equal to or smaller than the current inode size.
2047  *
2048  * Returns: errno
2049  */
2050
2051 static int do_shrink(struct inode *inode, u64 newsize)
2052 {
2053         struct gfs2_inode *ip = GFS2_I(inode);
2054         int error;
2055
2056         error = trunc_start(inode, newsize);
2057         if (error < 0)
2058                 return error;
2059         if (gfs2_is_stuffed(ip))
2060                 return 0;
2061
2062         error = punch_hole(ip, newsize, 0);
2063         if (error == 0)
2064                 error = trunc_end(ip);
2065
2066         return error;
2067 }
2068
2069 void gfs2_trim_blocks(struct inode *inode)
2070 {
2071         int ret;
2072
2073         ret = do_shrink(inode, inode->i_size);
2074         WARN_ON(ret != 0);
2075 }
2076
2077 /**
2078  * do_grow - Touch and update inode size
2079  * @inode: The inode
2080  * @size: The new size
2081  *
2082  * This function updates the timestamps on the inode and
2083  * may also increase the size of the inode. This function
2084  * must not be called with @size any smaller than the current
2085  * inode size.
2086  *
2087  * Although it is not strictly required to unstuff files here,
2088  * earlier versions of GFS2 have a bug in the stuffed file reading
2089  * code which will result in a buffer overrun if the size is larger
2090  * than the max stuffed file size. In order to prevent this from
2091  * occurring, such files are unstuffed, but in other cases we can
2092  * just update the inode size directly.
2093  *
2094  * Returns: 0 on success, or -ve on error
2095  */
2096
2097 static int do_grow(struct inode *inode, u64 size)
2098 {
2099         struct gfs2_inode *ip = GFS2_I(inode);
2100         struct gfs2_sbd *sdp = GFS2_SB(inode);
2101         struct gfs2_alloc_parms ap = { .target = 1, };
2102         struct buffer_head *dibh;
2103         int error;
2104         int unstuff = 0;
2105
2106         if (gfs2_is_stuffed(ip) && size > gfs2_max_stuffed_size(ip)) {
2107                 error = gfs2_quota_lock_check(ip, &ap);
2108                 if (error)
2109                         return error;
2110
2111                 error = gfs2_inplace_reserve(ip, &ap);
2112                 if (error)
2113                         goto do_grow_qunlock;
2114                 unstuff = 1;
2115         }
2116
2117         error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
2118                                  (unstuff &&
2119                                   gfs2_is_jdata(ip) ? RES_JDATA : 0) +
2120                                  (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
2121                                   0 : RES_QUOTA), 0);
2122         if (error)
2123                 goto do_grow_release;
2124
2125         if (unstuff) {
2126                 error = gfs2_unstuff_dinode(ip, NULL);
2127                 if (error)
2128                         goto do_end_trans;
2129         }
2130
2131         error = gfs2_meta_inode_buffer(ip, &dibh);
2132         if (error)
2133                 goto do_end_trans;
2134
2135         i_size_write(inode, size);
2136         ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
2137         gfs2_trans_add_meta(ip->i_gl, dibh);
2138         gfs2_dinode_out(ip, dibh->b_data);
2139         brelse(dibh);
2140
2141 do_end_trans:
2142         gfs2_trans_end(sdp);
2143 do_grow_release:
2144         if (unstuff) {
2145                 gfs2_inplace_release(ip);
2146 do_grow_qunlock:
2147                 gfs2_quota_unlock(ip);
2148         }
2149         return error;
2150 }
2151
2152 /**
2153  * gfs2_setattr_size - make a file a given size
2154  * @inode: the inode
2155  * @newsize: the size to make the file
2156  *
2157  * The file size can grow, shrink, or stay the same size. This
2158  * is called holding i_rwsem and an exclusive glock on the inode
2159  * in question.
2160  *
2161  * Returns: errno
2162  */
2163
2164 int gfs2_setattr_size(struct inode *inode, u64 newsize)
2165 {
2166         struct gfs2_inode *ip = GFS2_I(inode);
2167         int ret;
2168
2169         BUG_ON(!S_ISREG(inode->i_mode));
2170
2171         ret = inode_newsize_ok(inode, newsize);
2172         if (ret)
2173                 return ret;
2174
2175         inode_dio_wait(inode);
2176
2177         ret = gfs2_rsqa_alloc(ip);
2178         if (ret)
2179                 goto out;
2180
2181         if (newsize >= inode->i_size) {
2182                 ret = do_grow(inode, newsize);
2183                 goto out;
2184         }
2185
2186         ret = do_shrink(inode, newsize);
2187 out:
2188         gfs2_rsqa_delete(ip, NULL);
2189         return ret;
2190 }
2191
2192 int gfs2_truncatei_resume(struct gfs2_inode *ip)
2193 {
2194         int error;
2195         error = punch_hole(ip, i_size_read(&ip->i_inode), 0);
2196         if (!error)
2197                 error = trunc_end(ip);
2198         return error;
2199 }
2200
2201 int gfs2_file_dealloc(struct gfs2_inode *ip)
2202 {
2203         return punch_hole(ip, 0, 0);
2204 }
2205
2206 /**
2207  * gfs2_free_journal_extents - Free cached journal bmap info
2208  * @jd: The journal
2209  *
2210  */
2211
2212 void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
2213 {
2214         struct gfs2_journal_extent *jext;
2215
2216         while(!list_empty(&jd->extent_list)) {
2217                 jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list);
2218                 list_del(&jext->list);
2219                 kfree(jext);
2220         }
2221 }
2222
2223 /**
2224  * gfs2_add_jextent - Add or merge a new extent to extent cache
2225  * @jd: The journal descriptor
2226  * @lblock: The logical block at start of new extent
2227  * @dblock: The physical block at start of new extent
2228  * @blocks: Size of extent in fs blocks
2229  *
2230  * Returns: 0 on success or -ENOMEM
2231  */
2232
2233 static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
2234 {
2235         struct gfs2_journal_extent *jext;
2236
2237         if (!list_empty(&jd->extent_list)) {
2238                 jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list);
2239                 if ((jext->dblock + jext->blocks) == dblock) {
2240                         jext->blocks += blocks;
2241                         return 0;
2242                 }
2243         }
2244
2245         jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
2246         if (jext == NULL)
2247                 return -ENOMEM;
2248         jext->dblock = dblock;
2249         jext->lblock = lblock;
2250         jext->blocks = blocks;
2251         list_add_tail(&jext->list, &jd->extent_list);
2252         jd->nr_extents++;
2253         return 0;
2254 }
2255
2256 /**
2257  * gfs2_map_journal_extents - Cache journal bmap info
2258  * @sdp: The super block
2259  * @jd: The journal to map
2260  *
2261  * Create a reusable "extent" mapping from all logical
2262  * blocks to all physical blocks for the given journal.  This will save
2263  * us time when writing journal blocks.  Most journals will have only one
2264  * extent that maps all their logical blocks.  That's because gfs2.mkfs
2265  * arranges the journal blocks sequentially to maximize performance.
2266  * So the extent would map the first block for the entire file length.
2267  * However, gfs2_jadd can happen while file activity is happening, so
2268  * those journals may not be sequential.  Less likely is the case where
2269  * the users created their own journals by mounting the metafs and
2270  * laying it out.  But it's still possible.  These journals might have
2271  * several extents.
2272  *
2273  * Returns: 0 on success, or error on failure
2274  */
2275
2276 int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
2277 {
2278         u64 lblock = 0;
2279         u64 lblock_stop;
2280         struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
2281         struct buffer_head bh;
2282         unsigned int shift = sdp->sd_sb.sb_bsize_shift;
2283         u64 size;
2284         int rc;
2285         ktime_t start, end;
2286
2287         start = ktime_get();
2288         lblock_stop = i_size_read(jd->jd_inode) >> shift;
2289         size = (lblock_stop - lblock) << shift;
2290         jd->nr_extents = 0;
2291         WARN_ON(!list_empty(&jd->extent_list));
2292
2293         do {
2294                 bh.b_state = 0;
2295                 bh.b_blocknr = 0;
2296                 bh.b_size = size;
2297                 rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
2298                 if (rc || !buffer_mapped(&bh))
2299                         goto fail;
2300                 rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
2301                 if (rc)
2302                         goto fail;
2303                 size -= bh.b_size;
2304                 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2305         } while(size > 0);
2306
2307         end = ktime_get();
2308         fs_info(sdp, "journal %d mapped with %u extents in %lldms\n", jd->jd_jid,
2309                 jd->nr_extents, ktime_ms_delta(end, start));
2310         return 0;
2311
2312 fail:
2313         fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
2314                 rc, jd->jd_jid,
2315                 (unsigned long long)(i_size_read(jd->jd_inode) - size),
2316                 jd->nr_extents);
2317         fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
2318                 rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
2319                 bh.b_state, (unsigned long long)bh.b_size);
2320         gfs2_free_journal_extents(jd);
2321         return rc;
2322 }
2323
2324 /**
2325  * gfs2_write_alloc_required - figure out if a write will require an allocation
2326  * @ip: the file being written to
2327  * @offset: the offset to write to
2328  * @len: the number of bytes being written
2329  *
2330  * Returns: 1 if an alloc is required, 0 otherwise
2331  */
2332
2333 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
2334                               unsigned int len)
2335 {
2336         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2337         struct buffer_head bh;
2338         unsigned int shift;
2339         u64 lblock, lblock_stop, size;
2340         u64 end_of_file;
2341
2342         if (!len)
2343                 return 0;
2344
2345         if (gfs2_is_stuffed(ip)) {
2346                 if (offset + len > gfs2_max_stuffed_size(ip))
2347                         return 1;
2348                 return 0;
2349         }
2350
2351         shift = sdp->sd_sb.sb_bsize_shift;
2352         BUG_ON(gfs2_is_dir(ip));
2353         end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
2354         lblock = offset >> shift;
2355         lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
2356         if (lblock_stop > end_of_file && ip != GFS2_I(sdp->sd_rindex))
2357                 return 1;
2358
2359         size = (lblock_stop - lblock) << shift;
2360         do {
2361                 bh.b_state = 0;
2362                 bh.b_size = size;
2363                 gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
2364                 if (!buffer_mapped(&bh))
2365                         return 1;
2366                 size -= bh.b_size;
2367                 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2368         } while(size > 0);
2369
2370         return 0;
2371 }
2372
2373 static int stuffed_zero_range(struct inode *inode, loff_t offset, loff_t length)
2374 {
2375         struct gfs2_inode *ip = GFS2_I(inode);
2376         struct buffer_head *dibh;
2377         int error;
2378
2379         if (offset >= inode->i_size)
2380                 return 0;
2381         if (offset + length > inode->i_size)
2382                 length = inode->i_size - offset;
2383
2384         error = gfs2_meta_inode_buffer(ip, &dibh);
2385         if (error)
2386                 return error;
2387         gfs2_trans_add_meta(ip->i_gl, dibh);
2388         memset(dibh->b_data + sizeof(struct gfs2_dinode) + offset, 0,
2389                length);
2390         brelse(dibh);
2391         return 0;
2392 }
2393
2394 static int gfs2_journaled_truncate_range(struct inode *inode, loff_t offset,
2395                                          loff_t length)
2396 {
2397         struct gfs2_sbd *sdp = GFS2_SB(inode);
2398         loff_t max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
2399         int error;
2400
2401         while (length) {
2402                 struct gfs2_trans *tr;
2403                 loff_t chunk;
2404                 unsigned int offs;
2405
2406                 chunk = length;
2407                 if (chunk > max_chunk)
2408                         chunk = max_chunk;
2409
2410                 offs = offset & ~PAGE_MASK;
2411                 if (offs && chunk > PAGE_SIZE)
2412                         chunk = offs + ((chunk - offs) & PAGE_MASK);
2413
2414                 truncate_pagecache_range(inode, offset, chunk);
2415                 offset += chunk;
2416                 length -= chunk;
2417
2418                 tr = current->journal_info;
2419                 if (!test_bit(TR_TOUCHED, &tr->tr_flags))
2420                         continue;
2421
2422                 gfs2_trans_end(sdp);
2423                 error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
2424                 if (error)
2425                         return error;
2426         }
2427         return 0;
2428 }
2429
2430 int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length)
2431 {
2432         struct inode *inode = file_inode(file);
2433         struct gfs2_inode *ip = GFS2_I(inode);
2434         struct gfs2_sbd *sdp = GFS2_SB(inode);
2435         int error;
2436
2437         if (gfs2_is_jdata(ip))
2438                 error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA,
2439                                          GFS2_JTRUNC_REVOKES);
2440         else
2441                 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
2442         if (error)
2443                 return error;
2444
2445         if (gfs2_is_stuffed(ip)) {
2446                 error = stuffed_zero_range(inode, offset, length);
2447                 if (error)
2448                         goto out;
2449         } else {
2450                 unsigned int start_off, end_len, blocksize;
2451
2452                 blocksize = i_blocksize(inode);
2453                 start_off = offset & (blocksize - 1);
2454                 end_len = (offset + length) & (blocksize - 1);
2455                 if (start_off) {
2456                         unsigned int len = length;
2457                         if (length > blocksize - start_off)
2458                                 len = blocksize - start_off;
2459                         error = gfs2_block_zero_range(inode, offset, len);
2460                         if (error)
2461                                 goto out;
2462                         if (start_off + length < blocksize)
2463                                 end_len = 0;
2464                 }
2465                 if (end_len) {
2466                         error = gfs2_block_zero_range(inode,
2467                                 offset + length - end_len, end_len);
2468                         if (error)
2469                                 goto out;
2470                 }
2471         }
2472
2473         if (gfs2_is_jdata(ip)) {
2474                 BUG_ON(!current->journal_info);
2475                 gfs2_journaled_truncate_range(inode, offset, length);
2476         } else
2477                 truncate_pagecache_range(inode, offset, offset + length - 1);
2478
2479         file_update_time(file);
2480         mark_inode_dirty(inode);
2481
2482         if (current->journal_info)
2483                 gfs2_trans_end(sdp);
2484
2485         if (!gfs2_is_stuffed(ip))
2486                 error = punch_hole(ip, offset, length);
2487
2488 out:
2489         if (current->journal_info)
2490                 gfs2_trans_end(sdp);
2491         return error;
2492 }