fs/gfs2/bmap.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
   4  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
   5  */
   6
   7 #include <linux/spinlock.h>
   8 #include <linux/completion.h>
   9 #include <linux/buffer_head.h>
  10 #include <linux/blkdev.h>
  11 #include <linux/gfs2_ondisk.h>
  12 #include <linux/crc32.h>
  13 #include <linux/iomap.h>
  14 #include <linux/ktime.h>
  15
  16 #include "gfs2.h"
  17 #include "incore.h"
  18 #include "bmap.h"
  19 #include "glock.h"
  20 #include "inode.h"
  21 #include "meta_io.h"
  22 #include "quota.h"
  23 #include "rgrp.h"
  24 #include "log.h"
  25 #include "super.h"
  26 #include "trans.h"
  27 #include "dir.h"
  28 #include "util.h"
  29 #include "aops.h"
  30 #include "trace_gfs2.h"
  31
  32 /* This doesn't need to be that large as max 64 bit pointers in a 4k
  33  * block is 512, so __u16 is fine for that. It saves stack space to
  34  * keep it small.
  35  */
  36 struct metapath {
  37         struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
  38         __u16 mp_list[GFS2_MAX_META_HEIGHT];
  39         int mp_fheight; /* find_metapath height */
  40         int mp_aheight; /* actual height (lookup height) */
  41 };
  42
     /* Forward declaration of a static helper defined elsewhere in this file. */
  43 static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length);
  44
  45 /**
  46  * gfs2_unstuffer_folio - unstuff a stuffed inode into a block cached by a folio
  47  * @ip: the inode
  48  * @dibh: the dinode buffer
  49  * @block: the block number that was allocated
  50  * @folio: The folio.
  51  *
  52  * Returns: errno
  53  */
  54 static int gfs2_unstuffer_folio(struct gfs2_inode *ip, struct buffer_head *dibh,
  55                                u64 block, struct folio *folio)
  56 {
  57         struct inode *inode = &ip->i_inode;
  58
  59         if (!folio_test_uptodate(folio)) {
  60                 void *kaddr = kmap_local_folio(folio, 0);
  61                 u64 dsize = i_size_read(inode);
  62
  63                 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
  64                 memset(kaddr + dsize, 0, folio_size(folio) - dsize);
  65                 kunmap_local(kaddr);
  66
  67                 folio_mark_uptodate(folio);
  68         }
  69
  70         if (gfs2_is_jdata(ip)) {
  71                 struct buffer_head *bh = folio_buffers(folio);
  72
  73                 if (!bh)
  74                         bh = create_empty_buffers(folio,
  75                                 BIT(inode->i_blkbits), BIT(BH_Uptodate));
  76
  77                 if (!buffer_mapped(bh))
  78                         map_bh(bh, inode->i_sb, block);
  79
  80                 set_buffer_uptodate(bh);
  81                 gfs2_trans_add_data(ip->i_gl, bh);
  82         } else {
  83                 folio_mark_dirty(folio);
  84                 gfs2_ordered_add_inode(ip);
  85         }
  86
  87         return 0;
  88 }
  89
  90 static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct folio *folio)
  91 {
  92         struct buffer_head *bh, *dibh;
  93         struct gfs2_dinode *di;
  94         u64 block = 0;
  95         int isdir = gfs2_is_dir(ip);
  96         int error;
  97
  98         error = gfs2_meta_inode_buffer(ip, &dibh);
  99         if (error)
 100                 return error;
 101
 102         if (i_size_read(&ip->i_inode)) {
 103                 /* Get a free block, fill it with the stuffed data,
 104                    and write it out to disk */
 105
 106                 unsigned int n = 1;
 107                 error = gfs2_alloc_blocks(ip, &block, &n, 0);
 108                 if (error)
 109                         goto out_brelse;
 110                 if (isdir) {
 111                         gfs2_trans_remove_revoke(GFS2_SB(&ip->i_inode), block, 1);
 112                         error = gfs2_dir_get_new_buffer(ip, block, &bh);
 113                         if (error)
 114                                 goto out_brelse;
 115                         gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
 116                                               dibh, sizeof(struct gfs2_dinode));
 117                         brelse(bh);
 118                 } else {
 119                         error = gfs2_unstuffer_folio(ip, dibh, block, folio);
 120                         if (error)
 121                                 goto out_brelse;
 122                 }
 123         }
 124
 125         /*  Set up the pointer to the new block  */
 126
 127         gfs2_trans_add_meta(ip->i_gl, dibh);
 128         di = (struct gfs2_dinode *)dibh->b_data;
 129         gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
 130
 131         if (i_size_read(&ip->i_inode)) {
 132                 *(__be64 *)(di + 1) = cpu_to_be64(block);
 133                 gfs2_add_inode_blocks(&ip->i_inode, 1);
 134                 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
 135         }
 136
 137         ip->i_height = 1;
 138         di->di_height = cpu_to_be16(1);
 139
 140 out_brelse:
 141         brelse(dibh);
 142         return error;
 143 }
 144
 145 /**
 146  * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
 147  * @ip: The GFS2 inode to unstuff
 148  *
 149  * This routine unstuffs a dinode and returns it to a "normal" state such
 150  * that the height can be grown in the traditional way.
 151  *
 152  * Returns: errno
 153  */
 154
 155 int gfs2_unstuff_dinode(struct gfs2_inode *ip)
 156 {
 157         struct inode *inode = &ip->i_inode;
 158         struct folio *folio;
 159         int error;
 160
 161         down_write(&ip->i_rw_mutex);
 162         folio = filemap_grab_folio(inode->i_mapping, 0);
 163         error = PTR_ERR(folio);
 164         if (IS_ERR(folio))
 165                 goto out;
 166         error = __gfs2_unstuff_inode(ip, folio);
 167         folio_unlock(folio);
 168         folio_put(folio);
 169 out:
 170         up_write(&ip->i_rw_mutex);
 171         return error;
 172 }
 173
 174 /**
 175  * find_metapath - Find path through the metadata tree
 176  * @sdp: The superblock
 177  * @block: The disk block to look up
 178  * @mp: The metapath to return the result in
 179  * @height: The pre-calculated height of the metadata tree
 180  *
 181  *   This routine returns a struct metapath structure that defines a path
 182  *   through the metadata of inode "ip" to get to block "block".
 183  *
 184  *   Example:
 185  *   Given:  "ip" is a height 3 file, "offset" is 101342453, and this is a
 186  *   filesystem with a blocksize of 4096.
 187  *
 188  *   find_metapath() would return a struct metapath structure set to:
 189  *   mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
 190  *
 191  *   That means that in order to get to the block containing the byte at
 192  *   offset 101342453, we would load the indirect block pointed to by pointer
 193  *   0 in the dinode.  We would then load the indirect block pointed to by
 194  *   pointer 48 in that indirect block.  We would then load the data block
 195  *   pointed to by pointer 165 in that indirect block.
 196  *
 197  *             ----------------------------------------
 198  *             | Dinode |                             |
 199  *             |        |                            4|
 200  *             |        |0 1 2 3 4 5                 9|
 201  *             |        |                            6|
 202  *             ----------------------------------------
 203  *                       |
 204  *                       |
 205  *                       V
 206  *             ----------------------------------------
 207  *             | Indirect Block                       |
 208  *             |                                     5|
 209  *             |            4 4 4 4 4 5 5            1|
 210  *             |0           5 6 7 8 9 0 1            2|
 211  *             ----------------------------------------
 212  *                                |
 213  *                                |
 214  *                                V
 215  *             ----------------------------------------
 216  *             | Indirect Block                       |
 217  *             |                         1 1 1 1 1   5|
 218  *             |                         6 6 6 6 6   1|
 219  *             |0                        3 4 5 6 7   2|
 220  *             ----------------------------------------
 221  *                                           |
 222  *                                           |
 223  *                                           V
 224  *             ----------------------------------------
 225  *             | Data block containing offset         |
 226  *             |            101342453                 |
 227  *             |                                      |
 228  *             |                                      |
 229  *             ----------------------------------------
 230  *
 231  */
 232
 233 static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
 234                           struct metapath *mp, unsigned int height)
 235 {
 236         unsigned int i;
 237
 238         mp->mp_fheight = height;
 239         for (i = height; i--;)
 240                 mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
 241 }
 242
 243 static inline unsigned int metapath_branch_start(const struct metapath *mp)
 244 {
 245         if (mp->mp_list[0] == 0)
 246                 return 2;
 247         return 1;
 248 }
 249
 250 /**
 251  * metaptr1 - Return the first possible metadata pointer in a metapath buffer
 252  * @height: The metadata height (0 = dinode)
 253  * @mp: The metapath
 254  */
 255 static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
 256 {
 257         struct buffer_head *bh = mp->mp_bh[height];
 258         if (height == 0)
 259                 return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
 260         return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
 261 }
 262
 263 /**
 264  * metapointer - Return pointer to start of metadata in a buffer
 265  * @height: The metadata height (0 = dinode)
 266  * @mp: The metapath
 267  *
 268  * Return a pointer to the block number of the next height of the metadata
 269  * tree given a buffer containing the pointer to the current height of the
 270  * metadata tree.
 271  */
 272
 273 static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
 274 {
 275         __be64 *p = metaptr1(height, mp);
 276         return p + mp->mp_list[height];
 277 }
 278
 279 static inline const __be64 *metaend(unsigned int height, const struct metapath *mp)
 280 {
 281         const struct buffer_head *bh = mp->mp_bh[height];
 282         return (const __be64 *)(bh->b_data + bh->b_size);
 283 }
 284
 285 static void clone_metapath(struct metapath *clone, struct metapath *mp)
 286 {
 287         unsigned int hgt;
 288
 289         *clone = *mp;
 290         for (hgt = 0; hgt < mp->mp_aheight; hgt++)
 291                 get_bh(clone->mp_bh[hgt]);
 292 }
 293
 294 static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
 295 {
 296         const __be64 *t;
 297
 298         for (t = start; t < end; t++) {
 299                 struct buffer_head *rabh;
 300
 301                 if (!*t)
 302                         continue;
 303
 304                 rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
 305                 if (trylock_buffer(rabh)) {
 306                         if (!buffer_uptodate(rabh)) {
 307                                 rabh->b_end_io = end_buffer_read_sync;
 308                                 submit_bh(REQ_OP_READ | REQ_RAHEAD | REQ_META |
 309                                           REQ_PRIO, rabh);
 310                                 continue;
 311                         }
 312                         unlock_buffer(rabh);
 313                 }
 314                 brelse(rabh);
 315         }
 316 }
 317
 318 static inline struct buffer_head *
 319 metapath_dibh(struct metapath *mp)
 320 {
 321         return mp->mp_bh[0];
 322 }
 323
 324 static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp,
 325                              unsigned int x, unsigned int h)
 326 {
 327         for (; x < h; x++) {
 328                 __be64 *ptr = metapointer(x, mp);
 329                 u64 dblock = be64_to_cpu(*ptr);
 330                 int ret;
 331
 332                 if (!dblock)
 333                         break;
 334                 ret = gfs2_meta_buffer(ip, GFS2_METATYPE_IN, dblock, &mp->mp_bh[x + 1]);
 335                 if (ret)
 336                         return ret;
 337         }
 338         mp->mp_aheight = x + 1;
 339         return 0;
 340 }
 341
 342 /**
 343  * lookup_metapath - Walk the metadata tree to a specific point
 344  * @ip: The inode
 345  * @mp: The metapath
 346  *
 347  * Assumes that the inode's buffer has already been looked up and
 348  * hooked onto mp->mp_bh[0] and that the metapath has been initialised
 349  * by find_metapath().
 350  *
 351  * If this function encounters part of the tree which has not been
 352  * allocated, it returns the current height of the tree at the point
 353  * at which it found the unallocated block. Blocks which are found are
 354  * added to the mp->mp_bh[] list.
 355  *
 356  * Returns: error
 357  */
 358
 359 static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
 360 {
 361         return __fillup_metapath(ip, mp, 0, ip->i_height - 1);
 362 }
 363
 364 /**
 365  * fillup_metapath - fill up buffers for the metadata path to a specific height
 366  * @ip: The inode
 367  * @mp: The metapath
 368  * @h: The height to which it should be mapped
 369  *
 370  * Similar to lookup_metapath, but does lookups for a range of heights
 371  *
 372  * Returns: error or the number of buffers filled
 373  */
 374
 375 static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
 376 {
 377         unsigned int x = 0;
 378         int ret;
 379
 380         if (h) {
 381                 /* find the first buffer we need to look up. */
 382                 for (x = h - 1; x > 0; x--) {
 383                         if (mp->mp_bh[x])
 384                                 break;
 385                 }
 386         }
 387         ret = __fillup_metapath(ip, mp, x, h);
 388         if (ret)
 389                 return ret;
 390         return mp->mp_aheight - x - 1;
 391 }
 392
 393 static sector_t metapath_to_block(struct gfs2_sbd *sdp, struct metapath *mp)
 394 {
 395         sector_t factor = 1, block = 0;
 396         int hgt;
 397
 398         for (hgt = mp->mp_fheight - 1; hgt >= 0; hgt--) {
 399                 if (hgt < mp->mp_aheight)
 400                         block += mp->mp_list[hgt] * factor;
 401                 factor *= sdp->sd_inptrs;
 402         }
 403         return block;
 404 }
 405
 406 static void release_metapath(struct metapath *mp)
 407 {
 408         int i;
 409
 410         for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
 411                 if (mp->mp_bh[i] == NULL)
 412                         break;
 413                 brelse(mp->mp_bh[i]);
 414                 mp->mp_bh[i] = NULL;
 415         }
 416 }
 417
 418 /**
 419  * gfs2_extent_length - Returns length of an extent of blocks
 420  * @bh: The metadata block
 421  * @ptr: Current position in @bh
 422  * @eob: Set to 1 if we hit "end of block"
 423  *
 424  * Returns: The length of the extent (minimum of one block)
 425  */
 426
 427 static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, int *eob)
 428 {
 429         const __be64 *end = (__be64 *)(bh->b_data + bh->b_size);
 430         const __be64 *first = ptr;
 431         u64 d = be64_to_cpu(*ptr);
 432
 433         *eob = 0;
 434         do {
 435                 ptr++;
 436                 if (ptr >= end)
 437                         break;
 438                 d++;
 439         } while(be64_to_cpu(*ptr) == d);
 440         if (ptr >= end)
 441                 *eob = 1;
 442         return ptr - first;
 443 }
 444
 445 enum walker_status { WALK_STOP, WALK_FOLLOW, WALK_CONTINUE };
 446
 447 /*
 448  * gfs2_metadata_walker - walk an indirect block
 449  * @mp: Metapath to indirect block
 450  * @ptrs: Number of pointers to look at
 451  *
 452  * When returning WALK_FOLLOW, the walker must update @mp to point at the right
 453  * indirect block to follow.
 454  */
 455 typedef enum walker_status (*gfs2_metadata_walker)(struct metapath *mp,
 456                                                    unsigned int ptrs);
 457
 458 /*
 459  * gfs2_walk_metadata - walk a tree of indirect blocks
 460  * @inode: The inode
 461  * @mp: Starting point of walk
 462  * @max_len: Maximum number of blocks to walk
 463  * @walker: Called during the walk
 464  *
      * The walk visits one metadata block at a time, starting at the lowest
      * height for which @mp holds a buffer (mp_aheight - 1).  For each block,
      * @walker is handed the metapath and the number of pointers it may look
      * at.  Depending on its verdict the walk stops, descends below the
      * pointer the walker selected, or moves on to the next block at the same
      * or a higher level of the tree.
      *
      * @mp is modified throughout: indices are advanced, and buffers are
      * released and looked up as the walk progresses.
      *
 465  * Returns 1 if the walk was stopped by @walker, 0 if we went past @max_len or
 466  * past the end of metadata, and a negative error code otherwise.
 467  */
 468
 469 static int gfs2_walk_metadata(struct inode *inode, struct metapath *mp,
 470                 u64 max_len, gfs2_metadata_walker walker)
 471 {
 472         struct gfs2_inode *ip = GFS2_I(inode);
 473         struct gfs2_sbd *sdp = GFS2_SB(inode);
             /* Number of blocks one pointer at height hgt stands for. */
 474         u64 factor = 1;
 475         unsigned int hgt;
 476         int ret;
 477
 478         /*
 479          * The walk starts in the lowest allocated indirect block, which may be
 480          * before the position indicated by @mp.  Adjust @max_len accordingly
 481          * to avoid a short walk.
 482          */
 483         for (hgt = mp->mp_fheight - 1; hgt >= mp->mp_aheight; hgt--) {
 484                 max_len += mp->mp_list[hgt] * factor;
 485                 mp->mp_list[hgt] = 0;
 486                 factor *= sdp->sd_inptrs;
 487         }
             /* hgt is now mp_aheight - 1, the lowest height with a buffer. */
 488
 489         for (;;) {
 490                 u16 start = mp->mp_list[hgt];
 491                 enum walker_status status;
 492                 unsigned int ptrs;
 493                 u64 len;
 494
 495                 /* Walk indirect block. */
                     /*
                      * ptrs: pointers left in this block from index start on (the
                      * dinode at height 0 has sd_diptrs pointers, indirect blocks
                      * have sd_inptrs); len: the number of blocks they cover.
                      * The walker is never shown more than max_len allows.
                      */
 496                 ptrs = (hgt >= 1 ? sdp->sd_inptrs : sdp->sd_diptrs) - start;
 497                 len = ptrs * factor;
 498                 if (len > max_len)
 499                         ptrs = DIV_ROUND_UP_ULL(max_len, factor);
 500                 status = walker(mp, ptrs);
 501                 switch (status) {
 502                 case WALK_STOP:
 503                         return 1;
 504                 case WALK_FOLLOW:
                             /*
                              * The walker has advanced mp_list[hgt]; only the
                              * pointers it skipped count as walked.
                              */
 505                         BUG_ON(mp->mp_aheight == mp->mp_fheight);
 506                         ptrs = mp->mp_list[hgt] - start;
 507                         len = ptrs * factor;
 508                         break;
 509                 case WALK_CONTINUE:
 510                         break;
 511                 }
                     /* Stop once the range walked so far uses up max_len. */
 512                 if (len >= max_len)
 513                         break;
 514                 max_len -= len;
 515                 if (status == WALK_FOLLOW)
 516                         goto fill_up_metapath;
 517
 518 lower_metapath:
 519                 /* Decrease height of metapath. */
                     /*
                      * Done with the block at hgt: drop it and step up to its
                      * parent, where one pointer covers sd_inptrs times as many
                      * blocks.  Stepping up from the dinode ends the walk.
                      */
 520                 brelse(mp->mp_bh[hgt]);
 521                 mp->mp_bh[hgt] = NULL;
 522                 mp->mp_list[hgt] = 0;
 523                 if (!hgt)
 524                         break;
 525                 hgt--;
 526                 factor *= sdp->sd_inptrs;
 527
 528                 /* Advance in metadata tree. */
                     /*
                      * Running off the end of an indirect block means stepping up
                      * once more; running off the end of the dinode ends the walk.
                      */
 529                 (mp->mp_list[hgt])++;
 530                 if (hgt) {
 531                         if (mp->mp_list[hgt] >= sdp->sd_inptrs)
 532                                 goto lower_metapath;
 533                 } else {
 534                         if (mp->mp_list[hgt] >= sdp->sd_diptrs)
 535                                 break;
 536                 }
 537
 538 fill_up_metapath:
 539                 /* Increase height of metapath. */
                     /*
                      * ret is the number of buffers fillup_metapath() added below
                      * hgt; factor shrinks by sd_inptrs for each of those levels.
                      */
 540                 ret = fillup_metapath(ip, mp, ip->i_height - 1);
 541                 if (ret < 0)
 542                         return ret;
 543                 hgt += ret;
 544                 for (; ret; ret--)
 545                         do_div(factor, sdp->sd_inptrs);
 546                 mp->mp_aheight = hgt + 1;
 547         }
 548         return 0;
 549 }
 550
 551 static enum walker_status gfs2_hole_walker(struct metapath *mp,
 552                                            unsigned int ptrs)
 553 {
 554         const __be64 *start, *ptr, *end;
 555         unsigned int hgt;
 556
 557         hgt = mp->mp_aheight - 1;
 558         start = metapointer(hgt, mp);
 559         end = start + ptrs;
 560
 561         for (ptr = start; ptr < end; ptr++) {
 562                 if (*ptr) {
 563                         mp->mp_list[hgt] += ptr - start;
 564                         if (mp->mp_aheight == mp->mp_fheight)
 565                                 return WALK_STOP;
 566                         return WALK_FOLLOW;
 567                 }
 568         }
 569         return WALK_CONTINUE;
 570 }
 571
 572 /**
 573  * gfs2_hole_size - figure out the size of a hole
 574  * @inode: The inode
 575  * @lblock: The logical starting block number
 576  * @len: How far to look (in blocks)
 577  * @mp: The metapath at lblock
 578  * @iomap: The iomap to store the hole size in
 579  *
 580  * This function modifies @mp.
 581  *
 582  * Returns: errno on error
 583  */
 584 static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len,
 585                           struct metapath *mp, struct iomap *iomap)
 586 {
 587         struct metapath clone;
 588         u64 hole_size;
 589         int ret;
 590
 591         clone_metapath(&clone, mp);
 592         ret = gfs2_walk_metadata(inode, &clone, len, gfs2_hole_walker);
 593         if (ret < 0)
 594                 goto out;
 595
 596         if (ret == 1)
 597                 hole_size = metapath_to_block(GFS2_SB(inode), &clone) - lblock;
 598         else
 599                 hole_size = len;
 600         iomap->length = hole_size << inode->i_blkbits;
 601         ret = 0;
 602
 603 out:
 604         release_metapath(&clone);
 605         return ret;
 606 }
 607
 608 static inline void gfs2_indirect_init(struct metapath *mp,
 609                                       struct gfs2_glock *gl, unsigned int i,
 610                                       unsigned offset, u64 bn)
 611 {
 612         __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
 613                        ((i > 1) ? sizeof(struct gfs2_meta_header) :
 614                                  sizeof(struct gfs2_dinode)));
 615         BUG_ON(i < 1);
 616         BUG_ON(mp->mp_bh[i] != NULL);
 617         mp->mp_bh[i] = gfs2_meta_new(gl, bn);
 618         gfs2_trans_add_meta(gl, mp->mp_bh[i]);
 619         gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
 620         gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
 621         ptr += offset;
 622         *ptr = cpu_to_be64(bn);
 623 }
 624
 625 enum alloc_state {
 626         ALLOC_DATA = 0,
 627         ALLOC_GROW_DEPTH = 1,
 628         ALLOC_GROW_HEIGHT = 2,
 629         /* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
 630 };
 631
 632 /**
 633  * __gfs2_iomap_alloc - Build a metadata tree of the requested height
 634  * @inode: The GFS2 inode
 635  * @iomap: The iomap structure
 636  * @mp: The metapath, with proper height information calculated
 637  *
 638  * In this routine we may have to alloc:
 639  *   i) Indirect blocks to grow the metadata tree height
 640  *  ii) Indirect blocks to fill in lower part of the metadata tree
 641  * iii) Data blocks
 642  *
 643  * This function is called after __gfs2_iomap_get, which works out the
 644  * total number of blocks which we need via gfs2_alloc_size.
 645  *
 646  * We then do the actual allocation asking for an extent at a time (if
 647  * enough contiguous free blocks are available, there will only be one
 648  * allocation request per call) and uses the state machine to initialise
 649  * the blocks in order.
 650  *
 651  * Right now, this function will allocate at most one indirect block
 652  * worth of data -- with a default block size of 4K, that's slightly
 653  * less than 2M.  If this limitation is ever removed to allow huge
 654  * allocations, we would probably still want to limit the iomap size we
 655  * return to avoid stalling other tasks during huge writes; the next
 656  * iomap iteration would then find the blocks already allocated.
 657  *
      * The tree is modified with ip->i_rw_mutex held for writing.  On
      * success, @iomap describes the data extent that was allocated (type
      * IOMAP_MAPPED, flags IOMAP_F_MERGED and IOMAP_F_NEW added, length set
      * to the size of that extent), and the inode's height and block count
      * are updated and written into the dinode buffer.  When an allocation
      * fails, the function only drops the mutex and returns the error.
      *
 658  * Returns: errno on error
 659  */
 660
 661 static int __gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
 662                               struct metapath *mp)
 663 {
 664         struct gfs2_inode *ip = GFS2_I(inode);
 665         struct gfs2_sbd *sdp = GFS2_SB(inode);
 666         struct buffer_head *dibh = metapath_dibh(mp);
 667         u64 bn;
 668         unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
             /* Data blocks to allocate, taken from the length found in @iomap. */
 669         size_t dblks = iomap->length >> inode->i_blkbits;
             /* Index of the lowest metadata level in @mp. */
 670         const unsigned end_of_metadata = mp->mp_fheight - 1;
 671         int ret;
 672         enum alloc_state state;
 673         __be64 *ptr;
 674         __be64 zero_bn = 0;
 675
 676         BUG_ON(mp->mp_aheight < 1);
 677         BUG_ON(dibh == NULL);
 678         BUG_ON(dblks < 1);
 679
 680         gfs2_trans_add_meta(ip->i_gl, dibh);
 681
 682         down_write(&ip->i_rw_mutex);
 683
             /*
              * Pick the initial state and work out how many indirect blocks
              * (iblks) have to be allocated on top of the data blocks.
              */
 684         if (mp->mp_fheight == mp->mp_aheight) {
 685                 /* Bottom indirect block exists */
 686                 state = ALLOC_DATA;
 687         } else {
 688                 /* Need to allocate indirect blocks */
 689                 if (mp->mp_fheight == ip->i_height) {
 690                         /* Writing into existing tree, extend tree down */
 691                         iblks = mp->mp_fheight - mp->mp_aheight;
 692                         state = ALLOC_GROW_DEPTH;
 693                 } else {
 694                         /* Building up tree height */
 695                         state = ALLOC_GROW_HEIGHT;
 696                         iblks = mp->mp_fheight - ip->i_height;
 697                         branch_start = metapath_branch_start(mp);
 698                         iblks += (mp->mp_fheight - branch_start);
 699                 }
 700         }
 701
 702         /* start of the second part of the function (state machine) */
 703
 704         blks = dblks + iblks;
 705         i = mp->mp_aheight;
 706         do {
                     /*
                      * Ask for everything that is still missing.  n is passed by
                      * reference; the code below treats its new value as the
                      * number of blocks obtained, starting at block bn.
                      */
 707                 n = blks - alloced;
 708                 ret = gfs2_alloc_blocks(ip, &bn, &n, 0);
 709                 if (ret)
 710                         goto out;
 711                 alloced += n;
 712                 if (state != ALLOC_DATA || gfs2_is_jdata(ip))
 713                         gfs2_trans_remove_revoke(sdp, bn, n);
 714                 switch (state) {
 715                 /* Growing height of tree */
 716                 case ALLOC_GROW_HEIGHT:
                             /*
                              * gfs2_indirect_init() at i == 1 overwrites the first
                              * pointer in the dinode, so remember it; it is put
                              * into the new top-level block further down.
                              */
 717                         if (i == 1) {
 718                                 ptr = (__be64 *)(dibh->b_data +
 719                                                  sizeof(struct gfs2_dinode));
 720                                 zero_bn = *ptr;
 721                         }
 722                         for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0;
 723                              i++, n--)
 724                                 gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
                             /*
                              * All new upper levels exist: move the dinode's old
                              * pointers into the last block created, leave only
                              * the first pointer in the dinode, and release the
                              * buffers from branch_start on so that the grow-depth
                              * step can set up that part of the path itself.
                              */
 725                         if (i - 1 == mp->mp_fheight - ip->i_height) {
 726                                 i--;
 727                                 gfs2_buffer_copy_tail(mp->mp_bh[i],
 728                                                 sizeof(struct gfs2_meta_header),
 729                                                 dibh, sizeof(struct gfs2_dinode));
 730                                 gfs2_buffer_clear_tail(dibh,
 731                                                 sizeof(struct gfs2_dinode) +
 732                                                 sizeof(__be64));
 733                                 ptr = (__be64 *)(mp->mp_bh[i]->b_data +
 734                                         sizeof(struct gfs2_meta_header));
 735                                 *ptr = zero_bn;
 736                                 state = ALLOC_GROW_DEPTH;
 737                                 for(i = branch_start; i < mp->mp_fheight; i++) {
 738                                         if (mp->mp_bh[i] == NULL)
 739                                                 break;
 740                                         brelse(mp->mp_bh[i]);
 741                                         mp->mp_bh[i] = NULL;
 742                                 }
 743                                 i = branch_start;
 744                         }
 745                         if (n == 0)
 746                                 break;
 747                         fallthrough;    /* To branching from existing tree */
 748                 case ALLOC_GROW_DEPTH:
                             /*
                              * gfs2_indirect_init() writes into the parent buffer
                              * without adding it to the transaction, hence the
                              * explicit gfs2_trans_add_meta() for the first parent.
                              */
 749                         if (i > 1 && i < mp->mp_fheight)
 750                                 gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
 751                         for (; i < mp->mp_fheight && n > 0; i++, n--)
 752                                 gfs2_indirect_init(mp, ip->i_gl, i,
 753                                                    mp->mp_list[i-1], bn++);
 754                         if (i == mp->mp_fheight)
 755                                 state = ALLOC_DATA;
 756                         if (n == 0)
 757                                 break;
 758                         fallthrough;    /* To tree complete, adding data blocks */
 759                 case ALLOC_DATA:
 760                         BUG_ON(n > dblks);
 761                         BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
 762                         gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
                             /* Only the n blocks of this extent get mapped. */
 763                         dblks = n;
 764                         ptr = metapointer(end_of_metadata, mp);
 765                         iomap->addr = bn << inode->i_blkbits;
 766                         iomap->flags |= IOMAP_F_MERGED | IOMAP_F_NEW;
 767                         while (n-- > 0)
 768                                 *ptr++ = cpu_to_be64(bn++);
 769                         break;
 770                 }
                     /*
                      * Keep going until the data step has set iomap->addr.
                      * NOTE(review): this relies on the caller passing in
                      * iomap->addr == IOMAP_NULL_ADDR - confirm.
                      */
 771         } while (iomap->addr == IOMAP_NULL_ADDR);
 772
 773         iomap->type = IOMAP_MAPPED;
 774         iomap->length = (u64)dblks << inode->i_blkbits;
 775         ip->i_height = mp->mp_fheight;
 776         gfs2_add_inode_blocks(&ip->i_inode, alloced);
 777         gfs2_dinode_out(ip, dibh->b_data);
 778 out:
 779         up_write(&ip->i_rw_mutex);
 780         return ret;
 781 }
 782
     /*
      * GFS2's own iomap flag, carried in the bit iomap sets aside as
      * IOMAP_F_PRIVATE.
      * NOTE(review): presumably marks a mapping that ends at a metadata
      * block boundary - confirm against the users of this flag.
      */
 783 #define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE
 784
 785 /**
 786  * gfs2_alloc_size - Compute the maximum allocation size
 787  * @inode: The inode
 788  * @mp: The metapath
 789  * @size: Requested size in blocks
 790  *
 791  * Compute the maximum size of the next allocation at @mp.
 792  *
 793  * Returns: size in blocks
 794  */
 795 static u64 gfs2_alloc_size(struct inode *inode, struct metapath *mp, u64 size)
 796 {
 797         struct gfs2_inode *ip = GFS2_I(inode);
 798         struct gfs2_sbd *sdp = GFS2_SB(inode);
 799         const __be64 *first, *ptr, *end;
 800
 801         /*
 802          * For writes to stuffed files, this function is called twice via
 803          * __gfs2_iomap_get, before and after unstuffing. The size we return the
 804          * first time needs to be large enough to get the reservation and
 805          * allocation sizes right.  The size we return the second time must
 806          * be exact or else __gfs2_iomap_alloc won't do the right thing.
 807          */
 808
 809         if (gfs2_is_stuffed(ip) || mp->mp_fheight != mp->mp_aheight) {
 810                 unsigned int maxsize = mp->mp_fheight > 1 ?
 811                         sdp->sd_inptrs : sdp->sd_diptrs;
 812                 maxsize -= mp->mp_list[mp->mp_fheight - 1];
 813                 if (size > maxsize)
 814                         size = maxsize;
 815                 return size;
 816         }
 817
 818         first = metapointer(ip->i_height - 1, mp);
 819         end = metaend(ip->i_height - 1, mp);
 820         if (end - first > size)
 821                 end = first + size;
 822         for (ptr = first; ptr < end; ptr++) {
 823                 if (*ptr)
 824                         break;
 825         }
 826         return ptr - first;
 827 }
 828
/**
 * __gfs2_iomap_get - Map blocks from an inode to disk blocks
 * @inode: The inode
 * @pos: Starting position in bytes
 * @length: Length to map, in bytes
 * @flags: iomap flags
 * @iomap: The iomap structure
 * @mp: The metapath
 *
 * Looks up the mapping at @pos without allocating anything.  The result in
 * @iomap is an inline mapping (stuffed inode), a mapped extent, or a hole.
 * The dinode buffer is stored in @mp->mp_bh[0] and is not released here;
 * the callers in this file follow up with release_metapath().
 *
 * ip->i_rw_mutex is held for reading for the duration of the lookup.
 *
 * Returns: errno (-EINVAL for a zero @length, -ENOENT for an IOMAP_REPORT
 * request at or beyond i_size)
 */
static int __gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
                            unsigned flags, struct iomap *iomap,
                            struct metapath *mp)
{
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
        loff_t size = i_size_read(inode);
        __be64 *ptr;
        sector_t lblock;
        sector_t lblock_stop;
        int ret;
        int eob;
        u64 len;
        struct buffer_head *dibh = NULL, *bh;
        u8 height;

        if (!length)
                return -EINVAL;

        down_read(&ip->i_rw_mutex);

        ret = gfs2_meta_inode_buffer(ip, &dibh);
        if (ret)
                goto unlock;
        mp->mp_bh[0] = dibh;

        if (gfs2_is_stuffed(ip)) {
                if (flags & IOMAP_WRITE) {
                        loff_t max_size = gfs2_max_stuffed_size(ip);

                        /*
                         * The write does not fit in the dinode: compute the
                         * block mapping the inode will need once unstuffed.
                         */
                        if (pos + length > max_size)
                                goto unstuff;
                        iomap->length = max_size;
                } else {
                        if (pos >= size) {
                                if (flags & IOMAP_REPORT) {
                                        ret = -ENOENT;
                                        goto unlock;
                                } else {
                                        /* Reading past EOF: report a hole. */
                                        iomap->offset = pos;
                                        iomap->length = length;
                                        goto hole_found;
                                }
                        }
                        iomap->length = size;
                }
                /* Data lives in the dinode block, right after the header. */
                iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
                              sizeof(struct gfs2_dinode);
                iomap->type = IOMAP_INLINE;
                iomap->inline_data = dibh->b_data + sizeof(struct gfs2_dinode);
                goto out;
        }

unstuff:
        /* Round the request out to whole blocks. */
        lblock = pos >> inode->i_blkbits;
        iomap->offset = lblock << inode->i_blkbits;
        lblock_stop = (pos + length - 1) >> inode->i_blkbits;
        len = lblock_stop - lblock + 1;
        iomap->length = len << inode->i_blkbits;

        /* Find the metadata tree height needed to address lblock. */
        height = ip->i_height;
        while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height])
                height++;
        find_metapath(sdp, lblock, mp, height);
        /* Taller than the current tree, or still stuffed: nothing to look up. */
        if (height > ip->i_height || gfs2_is_stuffed(ip))
                goto do_alloc;

        ret = lookup_metapath(ip, mp);
        if (ret)
                goto unlock;

        /* The lookup stopped short of the full height: no mapping there. */
        if (mp->mp_aheight != ip->i_height)
                goto do_alloc;

        ptr = metapointer(ip->i_height - 1, mp);
        if (*ptr == 0)
                goto do_alloc;

        bh = mp->mp_bh[ip->i_height - 1];
        len = gfs2_extent_length(bh, ptr, &eob);

        iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
        iomap->length = len << inode->i_blkbits;
        iomap->type = IOMAP_MAPPED;
        iomap->flags |= IOMAP_F_MERGED;
        if (eob)
                iomap->flags |= IOMAP_F_GFS2_BOUNDARY;

out:
        iomap->bdev = inode->i_sb->s_bdev;
unlock:
        up_read(&ip->i_rw_mutex);
        return ret;

do_alloc:
        /*
         * Despite the label name nothing is allocated here: this path only
         * sizes the unmapped range according to the kind of request.
         */
        if (flags & IOMAP_REPORT) {
                if (pos >= size)
                        ret = -ENOENT;
                else if (height == ip->i_height)
                        ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
                else
                        iomap->length = size - iomap->offset;
        } else if (flags & IOMAP_WRITE) {
                u64 alloc_size;

                /* Jumps past hole_found, so addr and type are not set here. */
                if (flags & IOMAP_DIRECT)
                        goto out;  /* (see gfs2_file_direct_write) */

                /* Trim the range to what a single allocation can cover. */
                len = gfs2_alloc_size(inode, mp, len);
                alloc_size = len << inode->i_blkbits;
                if (alloc_size < iomap->length)
                        iomap->length = alloc_size;
        } else {
                if (pos < size && height == ip->i_height)
                        ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
        }
hole_found:
        iomap->addr = IOMAP_NULL_ADDR;
        iomap->type = IOMAP_HOLE;
        goto out;
}
 961
 962 static struct folio *
 963 gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len)
 964 {
 965         struct inode *inode = iter->inode;
 966         unsigned int blockmask = i_blocksize(inode) - 1;
 967         struct gfs2_sbd *sdp = GFS2_SB(inode);
 968         unsigned int blocks;
 969         struct folio *folio;
 970         int status;
 971
 972         blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits;
 973         status = gfs2_trans_begin(sdp, RES_DINODE + blocks, 0);
 974         if (status)
 975                 return ERR_PTR(status);
 976
 977         folio = iomap_get_folio(iter, pos, len);
 978         if (IS_ERR(folio))
 979                 gfs2_trans_end(sdp);
 980         return folio;
 981 }
 982
 983 static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos,
 984                                  unsigned copied, struct folio *folio)
 985 {
 986         struct gfs2_trans *tr = current->journal_info;
 987         struct gfs2_inode *ip = GFS2_I(inode);
 988         struct gfs2_sbd *sdp = GFS2_SB(inode);
 989
 990         if (!gfs2_is_stuffed(ip))
 991                 gfs2_trans_add_databufs(ip->i_gl, folio,
 992                                         offset_in_folio(folio, pos),
 993                                         copied);
 994
 995         folio_unlock(folio);
 996         folio_put(folio);
 997
 998         if (tr->tr_num_buf_new)
 999                 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1000
1001         gfs2_trans_end(sdp);
1002 }
1003
/*
 * Folio callbacks that gfs2_iomap_begin_write() installs for stuffed and
 * jdata inodes.  get_folio begins a transaction and put_folio ends it.
 */
static const struct iomap_folio_ops gfs2_iomap_folio_ops = {
        .get_folio = gfs2_iomap_get_folio,
        .put_folio = gfs2_iomap_put_folio,
};
1008
/*
 * gfs2_iomap_begin_write - prepare @iomap for a buffered write or zeroing
 *
 * Called by gfs2_iomap_begin() after __gfs2_iomap_get() has filled in @iomap
 * and @mp.  If the inode must be unstuffed or the range is a hole, this
 * takes the quota lock, reserves blocks, and within one transaction unstuffs
 * the dinode (if needed) and allocates the blocks.
 *
 * On success the quota lock and the block reservation are left in place;
 * gfs2_iomap_end() calls gfs2_inplace_release() and gfs2_quota_unlock().
 * On failure everything acquired here is released again.
 *
 * Returns: errno
 */
static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
                                  loff_t length, unsigned flags,
                                  struct iomap *iomap,
                                  struct metapath *mp)
{
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
        bool unstuff;
        int ret;

        /* A stuffed inode must be unstuffed if the write outgrows the dinode. */
        unstuff = gfs2_is_stuffed(ip) &&
                  pos + length > gfs2_max_stuffed_size(ip);

        if (unstuff || iomap->type == IOMAP_HOLE) {
                unsigned int data_blocks, ind_blocks;
                struct gfs2_alloc_parms ap = {};
                unsigned int rblocks;
                struct gfs2_trans *tr;

                gfs2_write_calc_reserv(ip, iomap->length, &data_blocks,
                                       &ind_blocks);
                ap.target = data_blocks + ind_blocks;
                ret = gfs2_quota_lock_check(ip, &ap);
                if (ret)
                        return ret;

                ret = gfs2_inplace_reserve(ip, &ap);
                if (ret)
                        goto out_qunlock;

                /* Work out how many journal blocks the transaction needs. */
                rblocks = RES_DINODE + ind_blocks;
                if (gfs2_is_jdata(ip))
                        rblocks += data_blocks;
                if (ind_blocks || data_blocks)
                        rblocks += RES_STATFS + RES_QUOTA;
                if (inode == sdp->sd_rindex)
                        rblocks += 2 * RES_STATFS;
                rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);

                ret = gfs2_trans_begin(sdp, rblocks,
                                       iomap->length >> inode->i_blkbits);
                if (ret)
                        goto out_trans_fail;

                if (unstuff) {
                        ret = gfs2_unstuff_dinode(ip);
                        if (ret)
                                goto out_trans_end;
                        /*
                         * The earlier lookup was done on the stuffed inode;
                         * drop it and map the range again.
                         */
                        release_metapath(mp);
                        ret = __gfs2_iomap_get(inode, iomap->offset,
                                               iomap->length, flags, iomap, mp);
                        if (ret)
                                goto out_trans_end;
                }

                if (iomap->type == IOMAP_HOLE) {
                        ret = __gfs2_iomap_alloc(inode, iomap, mp);
                        if (ret) {
                                /*
                                 * punch_hole() is called only after the
                                 * transaction has been ended, so this path
                                 * unwinds by hand instead of via the labels.
                                 */
                                gfs2_trans_end(sdp);
                                gfs2_inplace_release(ip);
                                punch_hole(ip, iomap->offset, iomap->length);
                                goto out_qunlock;
                        }
                }

                tr = current->journal_info;
                if (tr->tr_num_buf_new)
                        __mark_inode_dirty(inode, I_DIRTY_DATASYNC);

                gfs2_trans_end(sdp);
        }

        /* Stuffed and jdata inodes get the transaction-wrapping folio ops. */
        if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip))
                iomap->folio_ops = &gfs2_iomap_folio_ops;
        return 0;

out_trans_end:
        gfs2_trans_end(sdp);
out_trans_fail:
        gfs2_inplace_release(ip);
out_qunlock:
        gfs2_quota_unlock(ip);
        return ret;
}
1093
1094 static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
1095                             unsigned flags, struct iomap *iomap,
1096                             struct iomap *srcmap)
1097 {
1098         struct gfs2_inode *ip = GFS2_I(inode);
1099         struct metapath mp = { .mp_aheight = 1, };
1100         int ret;
1101
1102         if (gfs2_is_jdata(ip))
1103                 iomap->flags |= IOMAP_F_BUFFER_HEAD;
1104
1105         trace_gfs2_iomap_start(ip, pos, length, flags);
1106         ret = __gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
1107         if (ret)
1108                 goto out_unlock;
1109
1110         switch(flags & (IOMAP_WRITE | IOMAP_ZERO)) {
1111         case IOMAP_WRITE:
1112                 if (flags & IOMAP_DIRECT) {
1113                         /*
1114                          * Silently fall back to buffered I/O for stuffed files
1115                          * or if we've got a hole (see gfs2_file_direct_write).
1116                          */
1117                         if (iomap->type != IOMAP_MAPPED)
1118                                 ret = -ENOTBLK;
1119                         goto out_unlock;
1120                 }
1121                 break;
1122         case IOMAP_ZERO:
1123                 if (iomap->type == IOMAP_HOLE)
1124                         goto out_unlock;
1125                 break;
1126         default:
1127                 goto out_unlock;
1128         }
1129
1130         ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);
1131
1132 out_unlock:
1133         release_metapath(&mp);
1134         trace_gfs2_iomap_end(ip, iomap, ret);
1135         return ret;
1136 }
1137
1138 static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
1139                           ssize_t written, unsigned flags, struct iomap *iomap)
1140 {
1141         struct gfs2_inode *ip = GFS2_I(inode);
1142         struct gfs2_sbd *sdp = GFS2_SB(inode);
1143
1144         switch (flags & (IOMAP_WRITE | IOMAP_ZERO)) {
1145         case IOMAP_WRITE:
1146                 if (flags & IOMAP_DIRECT)
1147                         return 0;
1148                 break;
1149         case IOMAP_ZERO:
1150                  if (iomap->type == IOMAP_HOLE)
1151                          return 0;
1152                  break;
1153         default:
1154                  return 0;
1155         }
1156
1157         if (!gfs2_is_stuffed(ip))
1158                 gfs2_ordered_add_inode(ip);
1159
1160         if (inode == sdp->sd_rindex)
1161                 adjust_fs_space(inode);
1162
1163         gfs2_inplace_release(ip);
1164
1165         if (ip->i_qadata && ip->i_qadata->qa_qd_num)
1166                 gfs2_quota_unlock(ip);
1167
1168         if (length != written && (iomap->flags & IOMAP_F_NEW)) {
1169                 /* Deallocate blocks that were just allocated. */
1170                 loff_t hstart = round_up(pos + written, i_blocksize(inode));
1171                 loff_t hend = iomap->offset + iomap->length;
1172
1173                 if (hstart < hend) {
1174                         truncate_pagecache_range(inode, hstart, hend - 1);
1175                         punch_hole(ip, hstart, hend - hstart);
1176                 }
1177         }
1178
1179         if (unlikely(!written))
1180                 return 0;
1181
1182         if (iomap->flags & IOMAP_F_SIZE_CHANGED)
1183                 mark_inode_dirty(inode);
1184         set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
1185         return 0;
1186 }
1187
/*
 * GFS2's iomap operations: gfs2_iomap_begin() maps (and for writes prepares)
 * a range, gfs2_iomap_end() cleans up after it.  Also used by
 * gfs2_block_zero_range() through iomap_zero_range().
 */
const struct iomap_ops gfs2_iomap_ops = {
        .iomap_begin = gfs2_iomap_begin,
        .iomap_end = gfs2_iomap_end,
};
1192
1193 /**
1194  * gfs2_block_map - Map one or more blocks of an inode to a disk block
1195  * @inode: The inode
1196  * @lblock: The logical block number
1197  * @bh_map: The bh to be mapped
1198  * @create: True if its ok to alloc blocks to satify the request
1199  *
1200  * The size of the requested mapping is defined in bh_map->b_size.
1201  *
1202  * Clears buffer_mapped(bh_map) and leaves bh_map->b_size unchanged
1203  * when @lblock is not mapped.  Sets buffer_mapped(bh_map) and
1204  * bh_map->b_size to indicate the size of the mapping when @lblock and
1205  * successive blocks are mapped, up to the requested size.
1206  *
1207  * Sets buffer_boundary() if a read of metadata will be required
1208  * before the next block can be mapped. Sets buffer_new() if new
1209  * blocks were allocated.
1210  *
1211  * Returns: errno
1212  */
1213
1214 int gfs2_block_map(struct inode *inode, sector_t lblock,
1215                    struct buffer_head *bh_map, int create)
1216 {
1217         struct gfs2_inode *ip = GFS2_I(inode);
1218         loff_t pos = (loff_t)lblock << inode->i_blkbits;
1219         loff_t length = bh_map->b_size;
1220         struct iomap iomap = { };
1221         int ret;
1222
1223         clear_buffer_mapped(bh_map);
1224         clear_buffer_new(bh_map);
1225         clear_buffer_boundary(bh_map);
1226         trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
1227
1228         if (!create)
1229                 ret = gfs2_iomap_get(inode, pos, length, &iomap);
1230         else
1231                 ret = gfs2_iomap_alloc(inode, pos, length, &iomap);
1232         if (ret)
1233                 goto out;
1234
1235         if (iomap.length > bh_map->b_size) {
1236                 iomap.length = bh_map->b_size;
1237                 iomap.flags &= ~IOMAP_F_GFS2_BOUNDARY;
1238         }
1239         if (iomap.addr != IOMAP_NULL_ADDR)
1240                 map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits);
1241         bh_map->b_size = iomap.length;
1242         if (iomap.flags & IOMAP_F_GFS2_BOUNDARY)
1243                 set_buffer_boundary(bh_map);
1244         if (iomap.flags & IOMAP_F_NEW)
1245                 set_buffer_new(bh_map);
1246
1247 out:
1248         trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
1249         return ret;
1250 }
1251
1252 int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock,
1253                     unsigned int *extlen)
1254 {
1255         unsigned int blkbits = inode->i_blkbits;
1256         struct iomap iomap = { };
1257         unsigned int len;
1258         int ret;
1259
1260         ret = gfs2_iomap_get(inode, lblock << blkbits, *extlen << blkbits,
1261                              &iomap);
1262         if (ret)
1263                 return ret;
1264         if (iomap.type != IOMAP_MAPPED)
1265                 return -EIO;
1266         *dblock = iomap.addr >> blkbits;
1267         len = iomap.length >> blkbits;
1268         if (len < *extlen)
1269                 *extlen = len;
1270         return 0;
1271 }
1272
1273 int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock,
1274                       unsigned int *extlen, bool *new)
1275 {
1276         unsigned int blkbits = inode->i_blkbits;
1277         struct iomap iomap = { };
1278         unsigned int len;
1279         int ret;
1280
1281         ret = gfs2_iomap_alloc(inode, lblock << blkbits, *extlen << blkbits,
1282                                &iomap);
1283         if (ret)
1284                 return ret;
1285         if (iomap.type != IOMAP_MAPPED)
1286                 return -EIO;
1287         *dblock = iomap.addr >> blkbits;
1288         len = iomap.length >> blkbits;
1289         if (len < *extlen)
1290                 *extlen = len;
1291         *new = iomap.flags & IOMAP_F_NEW;
1292         return 0;
1293 }
1294
1295 /*
1296  * NOTE: Never call gfs2_block_zero_range with an open transaction because it
1297  * uses iomap write to perform its actions, which begin their own transactions
1298  * (iomap_begin, get_folio, etc.)
1299  */
1300 static int gfs2_block_zero_range(struct inode *inode, loff_t from, loff_t length)
1301 {
1302         BUG_ON(current->journal_info);
1303         if (from >= inode->i_size)
1304                 return 0;
1305         length = min(length, inode->i_size - from);
1306         return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops,
1307                         NULL);
1308 }
1309
1310 #define GFS2_JTRUNC_REVOKES 8192
1311
1312 /**
1313  * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
1314  * @inode: The inode being truncated
1315  * @oldsize: The original (larger) size
1316  * @newsize: The new smaller size
1317  *
1318  * With jdata files, we have to journal a revoke for each block which is
1319  * truncated. As a result, we need to split this into separate transactions
1320  * if the number of pages being truncated gets too large.
1321  */
1322
1323 static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
1324 {
1325         struct gfs2_sbd *sdp = GFS2_SB(inode);
1326         u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
1327         u64 chunk;
1328         int error;
1329
1330         while (oldsize != newsize) {
1331                 struct gfs2_trans *tr;
1332                 unsigned int offs;
1333
1334                 chunk = oldsize - newsize;
1335                 if (chunk > max_chunk)
1336                         chunk = max_chunk;
1337
1338                 offs = oldsize & ~PAGE_MASK;
1339                 if (offs && chunk > PAGE_SIZE)
1340                         chunk = offs + ((chunk - offs) & PAGE_MASK);
1341
1342                 truncate_pagecache(inode, oldsize - chunk);
1343                 oldsize -= chunk;
1344
1345                 tr = current->journal_info;
1346                 if (!test_bit(TR_TOUCHED, &tr->tr_flags))
1347                         continue;
1348
1349                 gfs2_trans_end(sdp);
1350                 error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
1351                 if (error)
1352                         return error;
1353         }
1354
1355         return 0;
1356 }
1357
1358 static int trunc_start(struct inode *inode, u64 newsize)
1359 {
1360         struct gfs2_inode *ip = GFS2_I(inode);
1361         struct gfs2_sbd *sdp = GFS2_SB(inode);
1362         struct buffer_head *dibh = NULL;
1363         int journaled = gfs2_is_jdata(ip);
1364         u64 oldsize = inode->i_size;
1365         int error;
1366
1367         if (!gfs2_is_stuffed(ip)) {
1368                 unsigned int blocksize = i_blocksize(inode);
1369                 unsigned int offs = newsize & (blocksize - 1);
1370                 if (offs) {
1371                         error = gfs2_block_zero_range(inode, newsize,
1372                                                       blocksize - offs);
1373                         if (error)
1374                                 return error;
1375                 }
1376         }
1377         if (journaled)
1378                 error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
1379         else
1380                 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1381         if (error)
1382                 return error;
1383
1384         error = gfs2_meta_inode_buffer(ip, &dibh);
1385         if (error)
1386                 goto out;
1387
1388         gfs2_trans_add_meta(ip->i_gl, dibh);
1389
1390         if (gfs2_is_stuffed(ip))
1391                 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
1392         else
1393                 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1394
1395         i_size_write(inode, newsize);
1396         inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
1397         gfs2_dinode_out(ip, dibh->b_data);
1398
1399         if (journaled)
1400                 error = gfs2_journaled_truncate(inode, oldsize, newsize);
1401         else
1402                 truncate_pagecache(inode, newsize);
1403
1404 out:
1405         brelse(dibh);
1406         if (current->journal_info)
1407                 gfs2_trans_end(sdp);
1408         return error;
1409 }
1410
1411 int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
1412                    struct iomap *iomap)
1413 {
1414         struct metapath mp = { .mp_aheight = 1, };
1415         int ret;
1416
1417         ret = __gfs2_iomap_get(inode, pos, length, 0, iomap, &mp);
1418         release_metapath(&mp);
1419         return ret;
1420 }
1421
1422 int gfs2_iomap_alloc(struct inode *inode, loff_t pos, loff_t length,
1423                      struct iomap *iomap)
1424 {
1425         struct metapath mp = { .mp_aheight = 1, };
1426         int ret;
1427
1428         ret = __gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &mp);
1429         if (!ret && iomap->type == IOMAP_HOLE)
1430                 ret = __gfs2_iomap_alloc(inode, iomap, &mp);
1431         release_metapath(&mp);
1432         return ret;
1433 }
1434
/**
 * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
 * @ip: inode
 * @rd_gh: holder of resource group glock
 * @bh: buffer head to sweep
 * @start: starting point in bh
 * @end: end point in bh
 * @meta: true if bh points to metadata (rather than data)
 * @btotal: place to keep count of total blocks freed
 *
 * We sweep a metadata buffer (provided by the metapath) for blocks we need to
 * free, and free them all. However, we do it one rgrp at a time. If this
 * block has references to multiple rgrps, we break it into individual
 * transactions. This allows other processes to use the rgrps while we're
 * focused on a single one, for better concurrency / performance.
 * At every transaction boundary, we rewrite the inode into the journal.
 * That way the bitmaps are kept consistent with the inode and we can recover
 * if we're interrupted by power-outages.
 *
 * If @rd_gh is already initialized on entry, the first pass is restricted to
 * that rgrp. When the function returns after its final pass, a transaction
 * begun here and the rgrp glock in @rd_gh are not released by this function.
 *
 * Returns: 0, or return code if an error occurred.
 *          *btotal has the total number of blocks freed
 */
static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
                              struct buffer_head *bh, __be64 *start, __be64 *end,
                              bool meta, u32 *btotal)
{
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct gfs2_rgrpd *rgd;
        struct gfs2_trans *tr;
        __be64 *p;
        int blks_outside_rgrp;
        u64 bn, bstart, isize_blks;
        s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
        int ret = 0;
        bool buf_in_tr = false; /* buffer was added to transaction */

        /* Each pass over [start, end) frees the pointers of a single rgrp. */
more_rgrps:
        rgd = NULL;
        if (gfs2_holder_initialized(rd_gh)) {
                rgd = gfs2_glock2rgrp(rd_gh->gh_gl);
                gfs2_assert_withdraw(sdp,
                             gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
        }
        blks_outside_rgrp = 0;
        /* [bstart, bstart + blen) is the run of contiguous blocks gathered. */
        bstart = 0;
        blen = 0;

        for (p = start; p < end; p++) {
                if (!*p)
                        continue;
                bn = be64_to_cpu(*p);

                if (rgd) {
                        /* Belongs to another rgrp: leave it for a later pass. */
                        if (!rgrp_contains_block(rgd, bn)) {
                                blks_outside_rgrp++;
                                continue;
                        }
                } else {
                        /* No rgrp chosen yet: lock the one that holds bn. */
                        rgd = gfs2_blk2rgrpd(sdp, bn, true);
                        if (unlikely(!rgd)) {
                                ret = -EIO;
                                goto out;
                        }
                        ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
                                                 LM_FLAG_NODE_SCOPE, rd_gh);
                        if (ret)
                                goto out;

                        /* Must be done with the rgrp glock held: */
                        if (gfs2_rs_active(&ip->i_res) &&
                            rgd == ip->i_res.rs_rgd)
                                gfs2_rs_deltree(&ip->i_res);
                }

                /* The size of our transactions will be unknown until we
                   actually process all the metadata blocks that relate to
                   the rgrp. So we estimate. We know it can't be more than
                   the dinode's i_blocks and we don't want to exceed the
                   journal flush threshold, sd_log_thresh2. */
                if (current->journal_info == NULL) {
                        unsigned int jblocks_rqsted, revokes;

                        jblocks_rqsted = rgd->rd_length + RES_DINODE +
                                RES_INDIRECT;
                        isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
                        if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
                                jblocks_rqsted +=
                                        atomic_read(&sdp->sd_log_thresh2);
                        else
                                jblocks_rqsted += isize_blks;
                        revokes = jblocks_rqsted;
                        if (meta)
                                revokes += end - start;
                        else if (ip->i_depth)
                                revokes += sdp->sd_inptrs;
                        ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
                        if (ret)
                                goto out_unlock;
                        /* Taken together with the transaction begun above. */
                        down_write(&ip->i_rw_mutex);
                }
                /* check if we will exceed the transaction blocks requested */
                tr = current->journal_info;
                if (tr->tr_num_buf_new + RES_STATFS +
                    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
                        /* We set blks_outside_rgrp to ensure the loop will
                           be repeated for the same rgrp, but with a new
                           transaction. */
                        blks_outside_rgrp++;
                        /* This next part is tricky. If the buffer was added
                           to the transaction, we've already set some block
                           pointers to 0, so we better follow through and free
                           them, or we will introduce corruption (so break).
                           This may be impossible, or at least rare, but I
                           decided to cover the case regardless.

                           If the buffer was not added to the transaction
                           (this call), doing so would exceed our transaction
                           size, so we need to end the transaction and start a
                           new one (so goto). */

                        if (buf_in_tr)
                                break;
                        goto out_unlock;
                }

                gfs2_trans_add_meta(ip->i_gl, bh);
                buf_in_tr = true;
                *p = 0;
                /* bn continues the current run: just extend it. */
                if (bstart + blen == bn) {
                        blen++;
                        continue;
                }
                /* The run is broken: free what was gathered, restart at bn. */
                if (bstart) {
                        __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
                        (*btotal) += blen;
                        gfs2_add_inode_blocks(&ip->i_inode, -blen);
                }
                bstart = bn;
                blen = 1;
        }
        /* Free the final run gathered by the loop, if any. */
        if (bstart) {
                __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
                (*btotal) += blen;
                gfs2_add_inode_blocks(&ip->i_inode, -blen);
        }
out_unlock:
        if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
                                            outside the rgrp we just processed,
                                            do it all over again. */
                if (current->journal_info) {
                        struct buffer_head *dibh;

                        ret = gfs2_meta_inode_buffer(ip, &dibh);
                        if (ret)
                                goto out;

                        /* Every transaction boundary, we rewrite the dinode
                           to keep its di_blocks current in case of failure. */
                        inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
                        gfs2_trans_add_meta(ip->i_gl, dibh);
                        gfs2_dinode_out(ip, dibh->b_data);
                        brelse(dibh);
                        up_write(&ip->i_rw_mutex);
                        gfs2_trans_end(sdp);
                        buf_in_tr = false;
                }
                /* Drop the rgrp glock so the next pass can pick another. */
                gfs2_glock_dq_uninit(rd_gh);
                cond_resched();
                goto more_rgrps;
        }
out:
        return ret;
}
1608
1609 static bool mp_eq_to_hgt(struct metapath *mp, __u16 *list, unsigned int h)
1610 {
1611         if (memcmp(mp->mp_list, list, h * sizeof(mp->mp_list[0])))
1612                 return false;
1613         return true;
1614 }
1615
1616 /**
1617  * find_nonnull_ptr - find a non-null pointer given a metapath and height
1618  * @sdp: The superblock
1619  * @mp: starting metapath
1620  * @h: desired height to search
1621  * @end_list: See punch_hole().
1622  * @end_aligned: See punch_hole().
1623  *
1624  * Assumes the metapath is valid (with buffers) out to height h.
1625  * Returns: true if a non-null pointer was found in the metapath buffer
1626  *          false if all remaining pointers are NULL in the buffer
1627  */
1628 static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
1629                              unsigned int h,
1630                              __u16 *end_list, unsigned int end_aligned)
1631 {
1632         struct buffer_head *bh = mp->mp_bh[h];
1633         __be64 *first, *ptr, *end;
1634
1635         first = metaptr1(h, mp);
1636         ptr = first + mp->mp_list[h];
1637         end = (__be64 *)(bh->b_data + bh->b_size);
1638         if (end_list && mp_eq_to_hgt(mp, end_list, h)) {
1639                 bool keep_end = h < end_aligned;
1640                 end = first + end_list[h] + keep_end;
1641         }
1642
1643         while (ptr < end) {
1644                 if (*ptr) { /* if we have a non-null pointer */
1645                         mp->mp_list[h] = ptr - first;
1646                         h++;
1647                         if (h < GFS2_MAX_META_HEIGHT)
1648                                 mp->mp_list[h] = 0;
1649                         return true;
1650                 }
1651                 ptr++;
1652         }
1653         return false;
1654 }
1655
/* States of the deallocation state machine in punch_hole(). */
enum dealloc_states {
	/* Strip a metapath with all buffers read in */
	DEALLOC_MP_FULL = 0,
	/* lower the metapath strip height */
	DEALLOC_MP_LOWER,
	/* Fill in the metapath to the given height. */
	DEALLOC_FILL_MP,
	/* process complete */
	DEALLOC_DONE,
};
1662
1663 static inline void
1664 metapointer_range(struct metapath *mp, int height,
1665                   __u16 *start_list, unsigned int start_aligned,
1666                   __u16 *end_list, unsigned int end_aligned,
1667                   __be64 **start, __be64 **end)
1668 {
1669         struct buffer_head *bh = mp->mp_bh[height];
1670         __be64 *first;
1671
1672         first = metaptr1(height, mp);
1673         *start = first;
1674         if (mp_eq_to_hgt(mp, start_list, height)) {
1675                 bool keep_start = height < start_aligned;
1676                 *start = first + start_list[height] + keep_start;
1677         }
1678         *end = (__be64 *)(bh->b_data + bh->b_size);
1679         if (end_list && mp_eq_to_hgt(mp, end_list, height)) {
1680                 bool keep_end = height < end_aligned;
1681                 *end = first + end_list[height] + keep_end;
1682         }
1683 }
1684
1685 static inline bool walk_done(struct gfs2_sbd *sdp,
1686                              struct metapath *mp, int height,
1687                              __u16 *end_list, unsigned int end_aligned)
1688 {
1689         __u16 end;
1690
1691         if (end_list) {
1692                 bool keep_end = height < end_aligned;
1693                 if (!mp_eq_to_hgt(mp, end_list, height))
1694                         return false;
1695                 end = end_list[height] + keep_end;
1696         } else
1697                 end = (height > 0) ? sdp->sd_inptrs : sdp->sd_diptrs;
1698         return mp->mp_list[height] >= end;
1699 }
1700
1701 /**
1702  * punch_hole - deallocate blocks in a file
1703  * @ip: inode to truncate
1704  * @offset: the start of the hole
1705  * @length: the size of the hole (or 0 for truncate)
1706  *
1707  * Punch a hole into a file or truncate a file at a given position.  This
1708  * function operates in whole blocks (@offset and @length are rounded
1709  * accordingly); partially filled blocks must be cleared otherwise.
1710  *
1711  * This function works from the bottom up, and from the right to the left. In
1712  * other words, it strips off the highest layer (data) before stripping any of
1713  * the metadata. Doing it this way is best in case the operation is interrupted
1714  * by power failure, etc.  The dinode is rewritten in every transaction to
1715  * guarantee integrity.
1716  */
1717 static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
1718 {
1719         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1720         u64 maxsize = sdp->sd_heightsize[ip->i_height];
1721         struct metapath mp = {};
1722         struct buffer_head *dibh, *bh;
1723         struct gfs2_holder rd_gh;
1724         unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
1725         unsigned int bsize = 1 << bsize_shift;
1726         u64 lblock = (offset + bsize - 1) >> bsize_shift;
1727         __u16 start_list[GFS2_MAX_META_HEIGHT];
1728         __u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL;
1729         unsigned int start_aligned, end_aligned;
1730         unsigned int strip_h = ip->i_height - 1;
1731         u32 btotal = 0;
1732         int ret, state;
1733         int mp_h; /* metapath buffers are read in to this height */
1734         u64 prev_bnr = 0;
1735         __be64 *start, *end;
1736
1737         if (offset + bsize - 1 >= maxsize) {
1738                 /*
1739                  * The starting point lies beyond the allocated metadata;
1740                  * there are no blocks to deallocate.
1741                  */
1742                 return 0;
1743         }
1744
1745         /*
1746          * The start position of the hole is defined by lblock, start_list, and
1747          * start_aligned.  The end position of the hole is defined by lend,
1748          * end_list, and end_aligned.
1749          *
1750          * start_aligned and end_aligned define down to which height the start
1751          * and end positions are aligned to the metadata tree (i.e., the
1752          * position is a multiple of the metadata granularity at the height
1753          * above).  This determines at which heights additional meta pointers
1754          * needs to be preserved for the remaining data.
1755          */
1756
1757         if (length) {
1758                 u64 end_offset = offset + length;
1759                 u64 lend;
1760
1761                 /*
1762                  * Clip the end at the maximum file size for the given height:
1763                  * that's how far the metadata goes; files bigger than that
1764                  * will have additional layers of indirection.
1765                  */
1766                 if (end_offset > maxsize)
1767                         end_offset = maxsize;
1768                 lend = end_offset >> bsize_shift;
1769
1770                 if (lblock >= lend)
1771                         return 0;
1772
1773                 find_metapath(sdp, lend, &mp, ip->i_height);
1774                 end_list = __end_list;
1775                 memcpy(end_list, mp.mp_list, sizeof(mp.mp_list));
1776
1777                 for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1778                         if (end_list[mp_h])
1779                                 break;
1780                 }
1781                 end_aligned = mp_h;
1782         }
1783
1784         find_metapath(sdp, lblock, &mp, ip->i_height);
1785         memcpy(start_list, mp.mp_list, sizeof(start_list));
1786
1787         for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1788                 if (start_list[mp_h])
1789                         break;
1790         }
1791         start_aligned = mp_h;
1792
1793         ret = gfs2_meta_inode_buffer(ip, &dibh);
1794         if (ret)
1795                 return ret;
1796
1797         mp.mp_bh[0] = dibh;
1798         ret = lookup_metapath(ip, &mp);
1799         if (ret)
1800                 goto out_metapath;
1801
1802         /* issue read-ahead on metadata */
1803         for (mp_h = 0; mp_h < mp.mp_aheight - 1; mp_h++) {
1804                 metapointer_range(&mp, mp_h, start_list, start_aligned,
1805                                   end_list, end_aligned, &start, &end);
1806                 gfs2_metapath_ra(ip->i_gl, start, end);
1807         }
1808
1809         if (mp.mp_aheight == ip->i_height)
1810                 state = DEALLOC_MP_FULL; /* We have a complete metapath */
1811         else
1812                 state = DEALLOC_FILL_MP; /* deal with partial metapath */
1813
1814         ret = gfs2_rindex_update(sdp);
1815         if (ret)
1816                 goto out_metapath;
1817
1818         ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
1819         if (ret)
1820                 goto out_metapath;
1821         gfs2_holder_mark_uninitialized(&rd_gh);
1822
1823         mp_h = strip_h;
1824
1825         while (state != DEALLOC_DONE) {
1826                 switch (state) {
1827                 /* Truncate a full metapath at the given strip height.
1828                  * Note that strip_h == mp_h in order to be in this state. */
1829                 case DEALLOC_MP_FULL:
1830                         bh = mp.mp_bh[mp_h];
1831                         gfs2_assert_withdraw(sdp, bh);
1832                         if (gfs2_assert_withdraw(sdp,
1833                                                  prev_bnr != bh->b_blocknr)) {
1834                                 fs_emerg(sdp, "inode %llu, block:%llu, i_h:%u, "
1835                                          "s_h:%u, mp_h:%u\n",
1836                                        (unsigned long long)ip->i_no_addr,
1837                                        prev_bnr, ip->i_height, strip_h, mp_h);
1838                         }
1839                         prev_bnr = bh->b_blocknr;
1840
1841                         if (gfs2_metatype_check(sdp, bh,
1842                                                 (mp_h ? GFS2_METATYPE_IN :
1843                                                         GFS2_METATYPE_DI))) {
1844                                 ret = -EIO;
1845                                 goto out;
1846                         }
1847
1848                         /*
1849                          * Below, passing end_aligned as 0 gives us the
1850                          * metapointer range excluding the end point: the end
1851                          * point is the first metapath we must not deallocate!
1852                          */
1853
1854                         metapointer_range(&mp, mp_h, start_list, start_aligned,
1855                                           end_list, 0 /* end_aligned */,
1856                                           &start, &end);
1857                         ret = sweep_bh_for_rgrps(ip, &rd_gh, mp.mp_bh[mp_h],
1858                                                  start, end,
1859                                                  mp_h != ip->i_height - 1,
1860                                                  &btotal);
1861
1862                         /* If we hit an error or just swept dinode buffer,
1863                            just exit. */
1864                         if (ret || !mp_h) {
1865                                 state = DEALLOC_DONE;
1866                                 break;
1867                         }
1868                         state = DEALLOC_MP_LOWER;
1869                         break;
1870
1871                 /* lower the metapath strip height */
1872                 case DEALLOC_MP_LOWER:
1873                         /* We're done with the current buffer, so release it,
1874                            unless it's the dinode buffer. Then back up to the
1875                            previous pointer. */
1876                         if (mp_h) {
1877                                 brelse(mp.mp_bh[mp_h]);
1878                                 mp.mp_bh[mp_h] = NULL;
1879                         }
1880                         /* If we can't get any lower in height, we've stripped
1881                            off all we can. Next step is to back up and start
1882                            stripping the previous level of metadata. */
1883                         if (mp_h == 0) {
1884                                 strip_h--;
1885                                 memcpy(mp.mp_list, start_list, sizeof(start_list));
1886                                 mp_h = strip_h;
1887                                 state = DEALLOC_FILL_MP;
1888                                 break;
1889                         }
1890                         mp.mp_list[mp_h] = 0;
1891                         mp_h--; /* search one metadata height down */
1892                         mp.mp_list[mp_h]++;
1893                         if (walk_done(sdp, &mp, mp_h, end_list, end_aligned))
1894                                 break;
1895                         /* Here we've found a part of the metapath that is not
1896                          * allocated. We need to search at that height for the
1897                          * next non-null pointer. */
1898                         if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned)) {
1899                                 state = DEALLOC_FILL_MP;
1900                                 mp_h++;
1901                         }
1902                         /* No more non-null pointers at this height. Back up
1903                            to the previous height and try again. */
1904                         break; /* loop around in the same state */
1905
1906                 /* Fill the metapath with buffers to the given height. */
1907                 case DEALLOC_FILL_MP:
1908                         /* Fill the buffers out to the current height. */
1909                         ret = fillup_metapath(ip, &mp, mp_h);
1910                         if (ret < 0)
1911                                 goto out;
1912
1913                         /* On the first pass, issue read-ahead on metadata. */
1914                         if (mp.mp_aheight > 1 && strip_h == ip->i_height - 1) {
1915                                 unsigned int height = mp.mp_aheight - 1;
1916
1917                                 /* No read-ahead for data blocks. */
1918                                 if (mp.mp_aheight - 1 == strip_h)
1919                                         height--;
1920
1921                                 for (; height >= mp.mp_aheight - ret; height--) {
1922                                         metapointer_range(&mp, height,
1923                                                           start_list, start_aligned,
1924                                                           end_list, end_aligned,
1925                                                           &start, &end);
1926                                         gfs2_metapath_ra(ip->i_gl, start, end);
1927                                 }
1928                         }
1929
1930                         /* If buffers found for the entire strip height */
1931                         if (mp.mp_aheight - 1 == strip_h) {
1932                                 state = DEALLOC_MP_FULL;
1933                                 break;
1934                         }
1935                         if (mp.mp_aheight < ip->i_height) /* We have a partial height */
1936                                 mp_h = mp.mp_aheight - 1;
1937
1938                         /* If we find a non-null block pointer, crawl a bit
1939                            higher up in the metapath and try again, otherwise
1940                            we need to look lower for a new starting point. */
1941                         if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned))
1942                                 mp_h++;
1943                         else
1944                                 state = DEALLOC_MP_LOWER;
1945                         break;
1946                 }
1947         }
1948
1949         if (btotal) {
1950                 if (current->journal_info == NULL) {
1951                         ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
1952                                                RES_QUOTA, 0);
1953                         if (ret)
1954                                 goto out;
1955                         down_write(&ip->i_rw_mutex);
1956                 }
1957                 gfs2_statfs_change(sdp, 0, +btotal, 0);
1958                 gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
1959                                   ip->i_inode.i_gid);
1960                 inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
1961                 gfs2_trans_add_meta(ip->i_gl, dibh);
1962                 gfs2_dinode_out(ip, dibh->b_data);
1963                 up_write(&ip->i_rw_mutex);
1964                 gfs2_trans_end(sdp);
1965         }
1966
1967 out:
1968         if (gfs2_holder_initialized(&rd_gh))
1969                 gfs2_glock_dq_uninit(&rd_gh);
1970         if (current->journal_info) {
1971                 up_write(&ip->i_rw_mutex);
1972                 gfs2_trans_end(sdp);
1973                 cond_resched();
1974         }
1975         gfs2_quota_unhold(ip);
1976 out_metapath:
1977         release_metapath(&mp);
1978         return ret;
1979 }
1980
1981 static int trunc_end(struct gfs2_inode *ip)
1982 {
1983         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1984         struct buffer_head *dibh;
1985         int error;
1986
1987         error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1988         if (error)
1989                 return error;
1990
1991         down_write(&ip->i_rw_mutex);
1992
1993         error = gfs2_meta_inode_buffer(ip, &dibh);
1994         if (error)
1995                 goto out;
1996
1997         if (!i_size_read(&ip->i_inode)) {
1998                 ip->i_height = 0;
1999                 ip->i_goal = ip->i_no_addr;
2000                 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
2001                 gfs2_ordered_del_inode(ip);
2002         }
2003         inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
2004         ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
2005
2006         gfs2_trans_add_meta(ip->i_gl, dibh);
2007         gfs2_dinode_out(ip, dibh->b_data);
2008         brelse(dibh);
2009
2010 out:
2011         up_write(&ip->i_rw_mutex);
2012         gfs2_trans_end(sdp);
2013         return error;
2014 }
2015
2016 /**
2017  * do_shrink - make a file smaller
2018  * @inode: the inode
2019  * @newsize: the size to make the file
2020  *
2021  * Called with an exclusive lock on @inode. The @size must
2022  * be equal to or smaller than the current inode size.
2023  *
2024  * Returns: errno
2025  */
2026
2027 static int do_shrink(struct inode *inode, u64 newsize)
2028 {
2029         struct gfs2_inode *ip = GFS2_I(inode);
2030         int error;
2031
2032         error = trunc_start(inode, newsize);
2033         if (error < 0)
2034                 return error;
2035         if (gfs2_is_stuffed(ip))
2036                 return 0;
2037
2038         error = punch_hole(ip, newsize, 0);
2039         if (error == 0)
2040                 error = trunc_end(ip);
2041
2042         return error;
2043 }
2044
2045 /**
2046  * do_grow - Touch and update inode size
2047  * @inode: The inode
2048  * @size: The new size
2049  *
2050  * This function updates the timestamps on the inode and
2051  * may also increase the size of the inode. This function
2052  * must not be called with @size any smaller than the current
2053  * inode size.
2054  *
2055  * Although it is not strictly required to unstuff files here,
2056  * earlier versions of GFS2 have a bug in the stuffed file reading
2057  * code which will result in a buffer overrun if the size is larger
2058  * than the max stuffed file size. In order to prevent this from
2059  * occurring, such files are unstuffed, but in other cases we can
2060  * just update the inode size directly.
2061  *
2062  * Returns: 0 on success, or -ve on error
2063  */
2064
2065 static int do_grow(struct inode *inode, u64 size)
2066 {
2067         struct gfs2_inode *ip = GFS2_I(inode);
2068         struct gfs2_sbd *sdp = GFS2_SB(inode);
2069         struct gfs2_alloc_parms ap = { .target = 1, };
2070         struct buffer_head *dibh;
2071         int error;
2072         int unstuff = 0;
2073
2074         if (gfs2_is_stuffed(ip) && size > gfs2_max_stuffed_size(ip)) {
2075                 error = gfs2_quota_lock_check(ip, &ap);
2076                 if (error)
2077                         return error;
2078
2079                 error = gfs2_inplace_reserve(ip, &ap);
2080                 if (error)
2081                         goto do_grow_qunlock;
2082                 unstuff = 1;
2083         }
2084
2085         error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
2086                                  (unstuff &&
2087                                   gfs2_is_jdata(ip) ? RES_JDATA : 0) +
2088                                  (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
2089                                   0 : RES_QUOTA), 0);
2090         if (error)
2091                 goto do_grow_release;
2092
2093         if (unstuff) {
2094                 error = gfs2_unstuff_dinode(ip);
2095                 if (error)
2096                         goto do_end_trans;
2097         }
2098
2099         error = gfs2_meta_inode_buffer(ip, &dibh);
2100         if (error)
2101                 goto do_end_trans;
2102
2103         truncate_setsize(inode, size);
2104         inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
2105         gfs2_trans_add_meta(ip->i_gl, dibh);
2106         gfs2_dinode_out(ip, dibh->b_data);
2107         brelse(dibh);
2108
2109 do_end_trans:
2110         gfs2_trans_end(sdp);
2111 do_grow_release:
2112         if (unstuff) {
2113                 gfs2_inplace_release(ip);
2114 do_grow_qunlock:
2115                 gfs2_quota_unlock(ip);
2116         }
2117         return error;
2118 }
2119
2120 /**
2121  * gfs2_setattr_size - make a file a given size
2122  * @inode: the inode
2123  * @newsize: the size to make the file
2124  *
2125  * The file size can grow, shrink, or stay the same size. This
2126  * is called holding i_rwsem and an exclusive glock on the inode
2127  * in question.
2128  *
2129  * Returns: errno
2130  */
2131
2132 int gfs2_setattr_size(struct inode *inode, u64 newsize)
2133 {
2134         struct gfs2_inode *ip = GFS2_I(inode);
2135         int ret;
2136
2137         BUG_ON(!S_ISREG(inode->i_mode));
2138
2139         ret = inode_newsize_ok(inode, newsize);
2140         if (ret)
2141                 return ret;
2142
2143         inode_dio_wait(inode);
2144
2145         ret = gfs2_qa_get(ip);
2146         if (ret)
2147                 goto out;
2148
2149         if (newsize >= inode->i_size) {
2150                 ret = do_grow(inode, newsize);
2151                 goto out;
2152         }
2153
2154         ret = do_shrink(inode, newsize);
2155 out:
2156         gfs2_rs_delete(ip);
2157         gfs2_qa_put(ip);
2158         return ret;
2159 }
2160
2161 int gfs2_truncatei_resume(struct gfs2_inode *ip)
2162 {
2163         int error;
2164         error = punch_hole(ip, i_size_read(&ip->i_inode), 0);
2165         if (!error)
2166                 error = trunc_end(ip);
2167         return error;
2168 }
2169
2170 int gfs2_file_dealloc(struct gfs2_inode *ip)
2171 {
2172         return punch_hole(ip, 0, 0);
2173 }
2174
2175 /**
2176  * gfs2_free_journal_extents - Free cached journal bmap info
2177  * @jd: The journal
2178  *
2179  */
2180
2181 void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
2182 {
2183         struct gfs2_journal_extent *jext;
2184
2185         while(!list_empty(&jd->extent_list)) {
2186                 jext = list_first_entry(&jd->extent_list, struct gfs2_journal_extent, list);
2187                 list_del(&jext->list);
2188                 kfree(jext);
2189         }
2190 }
2191
2192 /**
2193  * gfs2_add_jextent - Add or merge a new extent to extent cache
2194  * @jd: The journal descriptor
2195  * @lblock: The logical block at start of new extent
2196  * @dblock: The physical block at start of new extent
2197  * @blocks: Size of extent in fs blocks
2198  *
2199  * Returns: 0 on success or -ENOMEM
2200  */
2201
2202 static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
2203 {
2204         struct gfs2_journal_extent *jext;
2205
2206         if (!list_empty(&jd->extent_list)) {
2207                 jext = list_last_entry(&jd->extent_list, struct gfs2_journal_extent, list);
2208                 if ((jext->dblock + jext->blocks) == dblock) {
2209                         jext->blocks += blocks;
2210                         return 0;
2211                 }
2212         }
2213
2214         jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
2215         if (jext == NULL)
2216                 return -ENOMEM;
2217         jext->dblock = dblock;
2218         jext->lblock = lblock;
2219         jext->blocks = blocks;
2220         list_add_tail(&jext->list, &jd->extent_list);
2221         jd->nr_extents++;
2222         return 0;
2223 }
2224
2225 /**
2226  * gfs2_map_journal_extents - Cache journal bmap info
2227  * @sdp: The super block
2228  * @jd: The journal to map
2229  *
2230  * Create a reusable "extent" mapping from all logical
2231  * blocks to all physical blocks for the given journal.  This will save
2232  * us time when writing journal blocks.  Most journals will have only one
2233  * extent that maps all their logical blocks.  That's because gfs2.mkfs
2234  * arranges the journal blocks sequentially to maximize performance.
2235  * So the extent would map the first block for the entire file length.
2236  * However, gfs2_jadd can happen while file activity is happening, so
2237  * those journals may not be sequential.  Less likely is the case where
2238  * the users created their own journals by mounting the metafs and
2239  * laying it out.  But it's still possible.  These journals might have
2240  * several extents.
2241  *
2242  * Returns: 0 on success, or error on failure
2243  */
2244
2245 int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
2246 {
2247         u64 lblock = 0;
2248         u64 lblock_stop;
2249         struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
2250         struct buffer_head bh;
2251         unsigned int shift = sdp->sd_sb.sb_bsize_shift;
2252         u64 size;
2253         int rc;
2254         ktime_t start, end;
2255
2256         start = ktime_get();
2257         lblock_stop = i_size_read(jd->jd_inode) >> shift;
2258         size = (lblock_stop - lblock) << shift;
2259         jd->nr_extents = 0;
2260         WARN_ON(!list_empty(&jd->extent_list));
2261
2262         do {
2263                 bh.b_state = 0;
2264                 bh.b_blocknr = 0;
2265                 bh.b_size = size;
2266                 rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
2267                 if (rc || !buffer_mapped(&bh))
2268                         goto fail;
2269                 rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
2270                 if (rc)
2271                         goto fail;
2272                 size -= bh.b_size;
2273                 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2274         } while(size > 0);
2275
2276         end = ktime_get();
2277         fs_info(sdp, "journal %d mapped with %u extents in %lldms\n", jd->jd_jid,
2278                 jd->nr_extents, ktime_ms_delta(end, start));
2279         return 0;
2280
2281 fail:
2282         fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
2283                 rc, jd->jd_jid,
2284                 (unsigned long long)(i_size_read(jd->jd_inode) - size),
2285                 jd->nr_extents);
2286         fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
2287                 rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
2288                 bh.b_state, (unsigned long long)bh.b_size);
2289         gfs2_free_journal_extents(jd);
2290         return rc;
2291 }
2292
2293 /**
2294  * gfs2_write_alloc_required - figure out if a write will require an allocation
2295  * @ip: the file being written to
2296  * @offset: the offset to write to
2297  * @len: the number of bytes being written
2298  *
2299  * Returns: 1 if an alloc is required, 0 otherwise
2300  */
2301
2302 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
2303                               unsigned int len)
2304 {
2305         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2306         struct buffer_head bh;
2307         unsigned int shift;
2308         u64 lblock, lblock_stop, size;
2309         u64 end_of_file;
2310
2311         if (!len)
2312                 return 0;
2313
2314         if (gfs2_is_stuffed(ip)) {
2315                 if (offset + len > gfs2_max_stuffed_size(ip))
2316                         return 1;
2317                 return 0;
2318         }
2319
2320         shift = sdp->sd_sb.sb_bsize_shift;
2321         BUG_ON(gfs2_is_dir(ip));
2322         end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
2323         lblock = offset >> shift;
2324         lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
2325         if (lblock_stop > end_of_file && ip != GFS2_I(sdp->sd_rindex))
2326                 return 1;
2327
2328         size = (lblock_stop - lblock) << shift;
2329         do {
2330                 bh.b_state = 0;
2331                 bh.b_size = size;
2332                 gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
2333                 if (!buffer_mapped(&bh))
2334                         return 1;
2335                 size -= bh.b_size;
2336                 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2337         } while(size > 0);
2338
2339         return 0;
2340 }
2341
2342 static int stuffed_zero_range(struct inode *inode, loff_t offset, loff_t length)
2343 {
2344         struct gfs2_inode *ip = GFS2_I(inode);
2345         struct buffer_head *dibh;
2346         int error;
2347
2348         if (offset >= inode->i_size)
2349                 return 0;
2350         if (offset + length > inode->i_size)
2351                 length = inode->i_size - offset;
2352
2353         error = gfs2_meta_inode_buffer(ip, &dibh);
2354         if (error)
2355                 return error;
2356         gfs2_trans_add_meta(ip->i_gl, dibh);
2357         memset(dibh->b_data + sizeof(struct gfs2_dinode) + offset, 0,
2358                length);
2359         brelse(dibh);
2360         return 0;
2361 }
2362
2363 static int gfs2_journaled_truncate_range(struct inode *inode, loff_t offset,
2364                                          loff_t length)
2365 {
2366         struct gfs2_sbd *sdp = GFS2_SB(inode);
2367         loff_t max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
2368         int error;
2369
2370         while (length) {
2371                 struct gfs2_trans *tr;
2372                 loff_t chunk;
2373                 unsigned int offs;
2374
2375                 chunk = length;
2376                 if (chunk > max_chunk)
2377                         chunk = max_chunk;
2378
2379                 offs = offset & ~PAGE_MASK;
2380                 if (offs && chunk > PAGE_SIZE)
2381                         chunk = offs + ((chunk - offs) & PAGE_MASK);
2382
2383                 truncate_pagecache_range(inode, offset, chunk);
2384                 offset += chunk;
2385                 length -= chunk;
2386
2387                 tr = current->journal_info;
2388                 if (!test_bit(TR_TOUCHED, &tr->tr_flags))
2389                         continue;
2390
2391                 gfs2_trans_end(sdp);
2392                 error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
2393                 if (error)
2394                         return error;
2395         }
2396         return 0;
2397 }
2398
2399 int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length)
2400 {
2401         struct inode *inode = file_inode(file);
2402         struct gfs2_inode *ip = GFS2_I(inode);
2403         struct gfs2_sbd *sdp = GFS2_SB(inode);
2404         unsigned int blocksize = i_blocksize(inode);
2405         loff_t start, end;
2406         int error;
2407
2408         if (!gfs2_is_stuffed(ip)) {
2409                 unsigned int start_off, end_len;
2410
2411                 start_off = offset & (blocksize - 1);
2412                 end_len = (offset + length) & (blocksize - 1);
2413                 if (start_off) {
2414                         unsigned int len = length;
2415                         if (length > blocksize - start_off)
2416                                 len = blocksize - start_off;
2417                         error = gfs2_block_zero_range(inode, offset, len);
2418                         if (error)
2419                                 goto out;
2420                         if (start_off + length < blocksize)
2421                                 end_len = 0;
2422                 }
2423                 if (end_len) {
2424                         error = gfs2_block_zero_range(inode,
2425                                 offset + length - end_len, end_len);
2426                         if (error)
2427                                 goto out;
2428                 }
2429         }
2430
2431         start = round_down(offset, blocksize);
2432         end = round_up(offset + length, blocksize) - 1;
2433         error = filemap_write_and_wait_range(inode->i_mapping, start, end);
2434         if (error)
2435                 return error;
2436
2437         if (gfs2_is_jdata(ip))
2438                 error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA,
2439                                          GFS2_JTRUNC_REVOKES);
2440         else
2441                 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
2442         if (error)
2443                 return error;
2444
2445         if (gfs2_is_stuffed(ip)) {
2446                 error = stuffed_zero_range(inode, offset, length);
2447                 if (error)
2448                         goto out;
2449         }
2450
2451         if (gfs2_is_jdata(ip)) {
2452                 BUG_ON(!current->journal_info);
2453                 gfs2_journaled_truncate_range(inode, offset, length);
2454         } else
2455                 truncate_pagecache_range(inode, offset, offset + length - 1);
2456
2457         file_update_time(file);
2458         mark_inode_dirty(inode);
2459
2460         if (current->journal_info)
2461                 gfs2_trans_end(sdp);
2462
2463         if (!gfs2_is_stuffed(ip))
2464                 error = punch_hole(ip, offset, length);
2465
2466 out:
2467         if (current->journal_info)
2468                 gfs2_trans_end(sdp);
2469         return error;
2470 }
2471
2472 static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode,
2473                 loff_t offset, unsigned int len)
2474 {
2475         int ret;
2476
2477         if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(inode))))
2478                 return -EIO;
2479
2480         if (offset >= wpc->iomap.offset &&
2481             offset < wpc->iomap.offset + wpc->iomap.length)
2482                 return 0;
2483
2484         memset(&wpc->iomap, 0, sizeof(wpc->iomap));
2485         ret = gfs2_iomap_get(inode, offset, INT_MAX, &wpc->iomap);
2486         return ret;
2487 }
2488
2489 const struct iomap_writeback_ops gfs2_writeback_ops = {
2490         .map_blocks             = gfs2_map_blocks,
2491 };