ext4: allow ext4_get_group_info() to fail
fs/ext4/mballoc.c [linux-block.git]
f5166768 1// SPDX-License-Identifier: GPL-2.0
c9de560d
AT
2/*
3 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
4 * Written by Alex Tomas <alex@clusterfs.com>
c9de560d
AT
5 */
6
7
8/*
 9 * mballoc.c contains the multiblock allocation routines
10 */
11
18aadd47 12#include "ext4_jbd2.h"
8f6e39a7 13#include "mballoc.h"
28623c2f 14#include <linux/log2.h>
a0b30c12 15#include <linux/module.h>
5a0e3ad6 16#include <linux/slab.h>
1a5d5e5d 17#include <linux/nospec.h>
66114cad 18#include <linux/backing-dev.h>
9bffad1e
TT
19#include <trace/events/ext4.h>
20
c9de560d
AT
21/*
22 * MUSTDO:
23 * - test ext4_ext_search_left() and ext4_ext_search_right()
24 * - search for metadata in few groups
25 *
26 * TODO v4:
27 * - normalization should take into account whether file is still open
28 * - discard preallocations if no free space left (policy?)
29 * - don't normalize tails
30 * - quota
31 * - reservation for superuser
32 *
33 * TODO v3:
34 * - bitmap read-ahead (proposed by Oleg Drokin aka green)
35 * - track min/max extents in each group for better group selection
36 * - mb_mark_used() may allocate chunk right after splitting buddy
37 * - tree of groups sorted by number of free blocks
38 * - error handling
39 */
40
41/*
 42 * The allocation request involves a request for multiple blocks
 43 * near the specified goal (block) value.
44 *
b713a5ec
TT
 45 * During the initialization phase of the allocator we decide whether to use
 46 * group preallocation or inode preallocation depending on the size of
 47 * the file. The size of the file could be the resulting file size we
 48 * would have after allocation, or the current file size, whichever
 49 * is larger. If the size is less than sbi->s_mb_stream_request we
 50 * select group preallocation. The default value of
 51 * s_mb_stream_request is 16 blocks. This can also be tuned via
 52 * /sys/fs/ext4/<partition>/mb_stream_req. The value is expressed in
 53 * number of blocks.
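A rough sketch of that decision (approximately what ext4_mb_group_or_file() does; treat it as an illustration rather than the exact code):

        /* size = max(current file size, file size after this allocation) */
        if (size < sbi->s_mb_stream_request)
                /* "small" file: steer it to the per-CPU locality group PA */
                ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
        /* otherwise the allocation keeps using the per-inode prealloc list */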
c9de560d
AT
54 *
 55 * The main motivation for having small files use group preallocation is to
 56 * ensure that small files end up close together on the disk.
c9de560d 57 *
b713a5ec
TT
 58 * In the first stage the allocator looks at the inode prealloc list,
 59 * ext4_inode_info->i_prealloc_list, which contains the list of prealloc
 60 * spaces for this particular inode. The inode prealloc space is
61 * represented as:
c9de560d
AT
62 *
63 * pa_lstart -> the logical start block for this prealloc space
64 * pa_pstart -> the physical start block for this prealloc space
53accfa9
TT
65 * pa_len -> length for this prealloc space (in clusters)
66 * pa_free -> free space available in this prealloc space (in clusters)
c9de560d
AT
67 *
 68 * The inode preallocation space is selected based on the _logical_ start
 69 * block. Only if the logical file block falls within the range of a prealloc
 70 * space do we consume that particular prealloc space. This makes sure that
 71 * we have contiguous physical blocks representing the file blocks.
c9de560d
AT
72 *
 73 * The important thing to note about the inode prealloc space is that
 74 * we don't modify the values associated with it except
75 * pa_free.
76 *
77 * If we are not able to find blocks in the inode prealloc space and if we
78 * have the group allocation flag set then we look at the locality group
caaf7a29 79 * prealloc space. These are per CPU prealloc list represented as
c9de560d
AT
80 *
81 * ext4_sb_info.s_locality_groups[smp_processor_id()]
82 *
 83 * The reason for having a per-CPU locality group is to reduce contention
 84 * between CPUs. It is possible to get scheduled at this point.
 85 *
 86 * The locality group prealloc space is used based on whether we have
 87 * enough free space (pa_free) within the prealloc space.
c9de560d
AT
88 *
 89 * If we can't allocate blocks via inode prealloc and/or locality group
 90 * prealloc then we look at the buddy cache. The buddy cache is represented
 91 * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
 92 * mapped to the buddy and bitmap information regarding different
 93 * groups. The buddy information is attached to the buddy cache inode so that
 94 * we can access it through the page cache. The information regarding
 95 * each group is loaded via ext4_mb_load_buddy and consists of the
 96 * block bitmap and buddy information. The information is stored in the
 97 * inode as:
98 *
99 * { page }
c3a326a6 100 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
c9de560d
AT
101 *
102 *
103 * one block each for bitmap and buddy information. So for each group we
ea1754a0 104 * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE /
c9de560d
AT
 105 * blocksize) blocks. So a page can hold information for groups_per_page
 106 * groups, which is blocks_per_page/2
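A small sketch of the index arithmetic this layout implies; it mirrors the computation done further down in ext4_mb_get_buddy_page_lock() and ext4_mb_load_buddy_gfp():

        int blocks_per_page = PAGE_SIZE / sb->s_blocksize;
        int block = group * 2;                  /* bitmap block for this group   */
        int pnum  = block / blocks_per_page;    /* page in the buddy cache inode */
        int poff  = block % blocks_per_page;    /* block offset within that page */
        /* the buddy block is simply block + 1, possibly landing on the next page */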
107 *
108 * The buddy cache inode is not stored on disk. The inode is thrown
109 * away when the filesystem is unmounted.
110 *
 111 * We look for the requested number of blocks in the buddy cache. If we are
 112 * able to locate that many free blocks we return with additional information
 113 * regarding the rest of the contiguous physical blocks available.
 114 *
 115 * Before allocating blocks via the buddy cache we normalize the request.
 116 * This ensures we ask for more blocks than we actually need. The extra
 117 * blocks that we get after allocation are added to the respective prealloc
 118 * list. In case of inode preallocation we follow a set of heuristics
 119 * based on file size. This can be found in ext4_mb_normalize_request. If
 120 * we are doing a group prealloc we try to normalize the request to
27baebb8
TT
 121 * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc
 122 * depends on the cluster size; for non-bigalloc file systems, it is
 123 * 512 blocks. This can be tuned via
 124 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is expressed in
 125 * number of blocks. If we have mounted the file system with the -o
 126 * stripe=<value> option, the group prealloc request is normalized to the
 127 * smallest multiple of the stripe value (sbi->s_stripe) which is
 128 * greater than the default mb_group_prealloc.
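A sketch of that stripe-aware rounding, assuming the kernel's roundup() helper (the real normalization code may differ in detail):

        if (sbi->s_stripe)
                ac->ac_g_ex.fe_len = roundup(sbi->s_mb_group_prealloc,
                                             sbi->s_stripe);
        else
                ac->ac_g_ex.fe_len = sbi->s_mb_group_prealloc;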
c9de560d 129 *
196e402a
HS
130 * If "mb_optimize_scan" mount option is set, we maintain in memory group info
131 * structures in two data structures:
132 *
133 * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders)
134 *
135 * Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks)
136 *
137 * This is an array of lists where the index in the array represents the
138 * largest free order in the buddy bitmap of the participating group infos of
139 * that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total
140 * number of buddy bitmap orders possible) number of lists. Group-infos are
141 * placed in appropriate lists.
142 *
83e80a6e 143 * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size)
196e402a 144 *
83e80a6e 145 * Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks)
196e402a 146 *
83e80a6e
JK
147 * This is an array of lists where in the i-th list there are groups with
148 * average fragment size >= 2^i and < 2^(i+1). The average fragment size
149 * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments.
150 * Note that we don't bother with a special list for completely empty groups
151 * so we only have MB_NUM_ORDERS(sb) lists.
196e402a
HS
152 *
153 * When "mb_optimize_scan" mount option is set, mballoc consults the above data
154 * structures to decide the order in which groups are to be traversed for
155 * fulfilling an allocation request.
156 *
157 * At CR = 0, we look for groups which have the largest_free_order >= the order
158 * of the request. We directly look at the largest free order list in the data
159 * structure (1) above where largest_free_order = order of the request. If that
 160 * list is empty, we look at the remaining lists in increasing order of
161 * largest_free_order. This allows us to perform CR = 0 lookup in O(1) time.
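For example, a request for 128 clusters (order 7) first consults sbi->s_mb_largest_free_orders[7]; if that list is empty it moves on to index 8, 9, ... up to MB_NUM_ORDERS(sb) - 1, exactly as ext4_mb_choose_next_group_cr0() below does starting from ac->ac_2order.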
162 *
163 * At CR = 1, we only consider groups where average fragment size > request
 164 * size. So, we look up a group which has average fragment size just above or
83e80a6e
JK
165 * equal to request size using our average fragment size group lists (data
166 * structure 2) in O(1) time.
196e402a
HS
167 *
168 * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in
169 * linear order which requires O(N) search time for each CR 0 and CR 1 phase.
170 *
d7a1fee1 171 * The regular allocator (using the buddy cache) supports a few tunables.
c9de560d 172 *
b713a5ec
TT
173 * /sys/fs/ext4/<partition>/mb_min_to_scan
174 * /sys/fs/ext4/<partition>/mb_max_to_scan
175 * /sys/fs/ext4/<partition>/mb_order2_req
196e402a 176 * /sys/fs/ext4/<partition>/mb_linear_limit
c9de560d 177 *
 178 * The regular allocator uses buddy scan only if the request len is a power of
 179 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
 180 * value of s_mb_order2_reqs can be tuned via
 181 * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to
 182 * the stripe size (sbi->s_stripe), we try to search for contiguous blocks in
 183 * stripe-size units. This should result in better allocation on RAID setups. If
 184 * not, we search in the specific group using the bitmap for the best extents. The
 185 * tunables min_to_scan and max_to_scan control the behaviour here.
 186 * min_to_scan indicates how long mballoc __must__ look for a best
 187 * extent and max_to_scan indicates how long mballoc __can__ look for a
c9de560d
AT
188 * best extent in the found extents. Searching for the blocks starts with
189 * the group specified as the goal value in allocation context via
 190 * ac_g_ex. Each group is first checked to determine whether it
 191 * can be used for allocation. ext4_mb_good_group explains how the groups are
c9de560d
AT
192 * checked.
193 *
196e402a
HS
194 * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not
195 * get traversed linearly. That may result in subsequent allocations being not
196 * close to each other. And so, the underlying device may get filled up in a
197 * non-linear fashion. While that may not matter on non-rotational devices, for
198 * rotational devices that may result in higher seek times. "mb_linear_limit"
199 * tells mballoc how many groups mballoc should search linearly before
200 * performing consulting above data structures for more efficient lookups. For
201 * non rotational devices, this value defaults to 0 and for rotational devices
202 * this is set to MB_DEFAULT_LINEAR_LIMIT.
203 *
c9de560d
AT
 204 * Both types of prealloc space are populated as described above. So for the
 205 * first request we will hit the buddy cache, which will result in this prealloc
 206 * space getting filled. The prealloc space is then later used for
 207 * subsequent requests.
208 */
209
210/*
211 * mballoc operates on the following data:
212 * - on-disk bitmap
213 * - in-core buddy (actually includes buddy and bitmap)
214 * - preallocation descriptors (PAs)
215 *
216 * there are two types of preallocations:
217 * - inode
 218 * assigned to a specific inode and can be used for this inode only.
 219 * it describes part of the inode's space preallocated to specific
 220 * physical blocks. any block from that preallocation can be used
 221 * independently. the descriptor just tracks the number of blocks left
 222 * unused. so, before taking some block from the descriptor, one must
 223 * make sure the corresponding logical block isn't allocated yet. this
 224 * also means that freeing any block within the descriptor's range
 225 * must discard all preallocated blocks.
 226 * - locality group
 227 * assigned to a specific locality group which does not translate to
 228 * a permanent set of inodes: an inode can join and leave the group. space
 229 * from this type of preallocation can be used for any inode. thus
 230 * it's consumed from the beginning to the end.
231 *
232 * relation between them can be expressed as:
233 * in-core buddy = on-disk bitmap + preallocation descriptors
234 *
 235 * this means the blocks mballoc considers used are:
236 * - allocated blocks (persistent)
237 * - preallocated blocks (non-persistent)
238 *
239 * consistency in mballoc world means that at any time a block is either
240 * free or used in ALL structures. notice: "any time" should not be read
241 * literally -- time is discrete and delimited by locks.
242 *
243 * to keep it simple, we don't use block numbers, instead we count number of
244 * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
245 *
246 * all operations can be expressed as:
247 * - init buddy: buddy = on-disk + PAs
248 * - new PA: buddy += N; PA = N
249 * - use inode PA: on-disk += N; PA -= N
 250 * - discard inode PA: buddy -= on-disk - PA; PA = 0
 251 * - use locality group PA: on-disk += N; PA -= N
 252 * - discard locality group PA: buddy -= PA; PA = 0
253 * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
254 * is used in real operation because we can't know actual used
255 * bits from PA, only from on-disk bitmap
256 *
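Concretely, using the notation above: creating an 8-cluster inode PA marks those 8 clusters used in the buddy (buddy += 8, PA = 8); writing 3 of them sets 3 bits in the on-disk bitmap (on-disk += 3, PA = 5); discarding the PA then returns the 5 never-written clusters to the buddy (PA = 0), and since the PA itself does not record which bits were used, the on-disk bitmap has to be consulted to find them.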
257 * if we follow this strict logic, then all operations above should be atomic.
258 * given some of them can block, we'd have to use something like semaphores
259 * killing performance on high-end SMP hardware. let's try to relax it using
260 * the following knowledge:
261 * 1) if buddy is referenced, it's already initialized
262 * 2) while block is used in buddy and the buddy is referenced,
263 * nobody can re-allocate that block
 264 * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has a
 265 * bit set and a PA claims the same block, it's OK. IOW, one can set a bit in
 266 * the on-disk bitmap if the buddy has the same bit set and/or a PA covers the
 267 * corresponding block
268 *
269 * so, now we're building a concurrency table:
270 * - init buddy vs.
271 * - new PA
272 * blocks for PA are allocated in the buddy, buddy must be referenced
273 * until PA is linked to allocation group to avoid concurrent buddy init
274 * - use inode PA
275 * we need to make sure that either on-disk bitmap or PA has uptodate data
276 * given (3) we care that PA-=N operation doesn't interfere with init
277 * - discard inode PA
278 * the simplest way would be to have buddy initialized by the discard
279 * - use locality group PA
280 * again PA-=N must be serialized with init
281 * - discard locality group PA
282 * the simplest way would be to have buddy initialized by the discard
283 * - new PA vs.
284 * - use inode PA
285 * i_data_sem serializes them
286 * - discard inode PA
287 * discard process must wait until PA isn't used by another process
288 * - use locality group PA
289 * some mutex should serialize them
290 * - discard locality group PA
291 * discard process must wait until PA isn't used by another process
292 * - use inode PA
293 * - use inode PA
 294 * i_data_sem or another mutex should serialize them
295 * - discard inode PA
296 * discard process must wait until PA isn't used by another process
297 * - use locality group PA
298 * nothing wrong here -- they're different PAs covering different blocks
299 * - discard locality group PA
300 * discard process must wait until PA isn't used by another process
301 *
 302 * now we're ready to draw a few conclusions:
 303 * - while a PA is referenced, it cannot be discarded
 304 * - a PA stays referenced until its block is marked in the on-disk bitmap
 305 * - a PA changes only after the on-disk bitmap does
306 * - discard must not compete with init. either init is done before
307 * any discard or they're serialized somehow
308 * - buddy init as sum of on-disk bitmap and PAs is done atomically
309 *
 310 * a special case is when we've used a PA down to emptiness. no need to modify
 311 * the buddy in this case, but we should care about concurrent init
312 *
313 */
314
315 /*
316 * Logic in few words:
317 *
318 * - allocation:
319 * load group
320 * find blocks
321 * mark bits in on-disk bitmap
322 * release group
323 *
324 * - use preallocation:
325 * find proper PA (per-inode or group)
326 * load group
327 * mark bits in on-disk bitmap
328 * release group
329 * release PA
330 *
331 * - free:
332 * load group
333 * mark bits in on-disk bitmap
334 * release group
335 *
336 * - discard preallocations in group:
337 * mark PAs deleted
338 * move them onto local list
339 * load on-disk bitmap
340 * load group
341 * remove PA from object (inode or locality group)
342 * mark free blocks in-core
343 *
344 * - discard inode's preallocations:
345 */
346
347/*
348 * Locking rules
349 *
350 * Locks:
351 * - bitlock on a group (group)
352 * - object (inode/locality) (object)
353 * - per-pa lock (pa)
196e402a
HS
354 * - cr0 lists lock (cr0)
355 * - cr1 tree lock (cr1)
c9de560d
AT
356 *
357 * Paths:
358 * - new pa
359 * object
360 * group
361 *
362 * - find and use pa:
363 * pa
364 *
365 * - release consumed pa:
366 * pa
367 * group
368 * object
369 *
370 * - generate in-core bitmap:
371 * group
372 * pa
373 *
374 * - discard all for given object (inode, locality group):
375 * object
376 * pa
377 * group
378 *
379 * - discard all for given group:
380 * group
381 * pa
382 * group
383 * object
384 *
196e402a
HS
385 * - allocation path (ext4_mb_regular_allocator)
386 * group
387 * cr0/cr1
c9de560d 388 */
c3a326a6
AK
389static struct kmem_cache *ext4_pspace_cachep;
390static struct kmem_cache *ext4_ac_cachep;
18aadd47 391static struct kmem_cache *ext4_free_data_cachep;
fb1813f4
CW
392
393/* We create slab caches for groupinfo data structures based on the
394 * superblock block size. There will be one per mounted filesystem for
395 * each unique s_blocksize_bits */
2892c15d 396#define NR_GRPINFO_CACHES 8
fb1813f4
CW
397static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
398
d6006186 399static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
2892c15d
ES
400 "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
401 "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
402 "ext4_groupinfo_64k", "ext4_groupinfo_128k"
403};
404
c3a326a6
AK
405static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
406 ext4_group_t group);
7a2fcbf7
AK
407static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
408 ext4_group_t group);
53f86b17 409static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
c3a326a6 410
196e402a
HS
411static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
412 ext4_group_t group, int cr);
413
55cdd0af
WJ
414static int ext4_try_to_trim_range(struct super_block *sb,
415 struct ext4_buddy *e4b, ext4_grpblk_t start,
416 ext4_grpblk_t max, ext4_grpblk_t minblocks);
417
07b5b8e1
RH
418/*
 419 * The algorithm using this percpu seq counter is as follows:
420 * 1. We sample the percpu discard_pa_seq counter before trying for block
421 * allocation in ext4_mb_new_blocks().
422 * 2. We increment this percpu discard_pa_seq counter when we either allocate
423 * or free these blocks i.e. while marking those blocks as used/free in
424 * mb_mark_used()/mb_free_blocks().
425 * 3. We also increment this percpu seq counter when we successfully identify
426 * that the bb_prealloc_list is not empty and hence proceed for discarding
427 * of those PAs inside ext4_mb_discard_group_preallocations().
428 *
 429 * Now to make sure that the regular fast path of block allocation is not
 430 * affected, as a small optimization we only sample the percpu seq counter
 431 * on that cpu. Only when the block allocation fails and the number of freed
 432 * blocks found was 0 do we sample the percpu seq counter for all cpus, using
 433 * the function ext4_get_discard_pa_seq_sum() below. This happens after making
 434 * sure that all the PAs on grp->bb_prealloc_list got freed or the list is empty.
435 */
436static DEFINE_PER_CPU(u64, discard_pa_seq);
437static inline u64 ext4_get_discard_pa_seq_sum(void)
438{
439 int __cpu;
440 u64 __seq = 0;
441
442 for_each_possible_cpu(__cpu)
443 __seq += per_cpu(discard_pa_seq, __cpu);
444 return __seq;
445}
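A condensed, simplified sketch of the retry protocol described above (the names allocation_failed, freed and repeat are placeholders; the real checks live in ext4_mb_new_blocks() and its discard/retry helper):

        u64 seq = this_cpu_read(discard_pa_seq);        /* step 1: cheap local sample */

        /* ... try the allocation; mb_mark_used()/mb_free_blocks() bump the counter ... */

        if (allocation_failed && freed == 0 &&
            ext4_get_discard_pa_seq_sum() != seq)       /* PAs were freed or allocated */
                goto repeat;                            /* worth retrying the request  */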
446
ffad0a44
AK
447static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
448{
c9de560d 449#if BITS_PER_LONG == 64
ffad0a44
AK
450 *bit += ((unsigned long) addr & 7UL) << 3;
451 addr = (void *) ((unsigned long) addr & ~7UL);
c9de560d 452#elif BITS_PER_LONG == 32
ffad0a44
AK
453 *bit += ((unsigned long) addr & 3UL) << 3;
454 addr = (void *) ((unsigned long) addr & ~3UL);
c9de560d
AT
455#else
456#error "how many bits you are?!"
457#endif
ffad0a44
AK
458 return addr;
459}
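For example, on a 64-bit machine a call with addr = base + 5 (base being 8-byte aligned) and *bit = 3 becomes *bit = 3 + 5 * 8 = 43 with base returned as the aligned address: bit 43 of the aligned word is exactly bit 3 of the byte at base + 5.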
c9de560d
AT
460
461static inline int mb_test_bit(int bit, void *addr)
462{
463 /*
 464 * ext4_test_bit on architectures like powerpc
 465 * needs an unsigned long aligned address
466 */
ffad0a44 467 addr = mb_correct_addr_and_bit(&bit, addr);
c9de560d
AT
468 return ext4_test_bit(bit, addr);
469}
470
471static inline void mb_set_bit(int bit, void *addr)
472{
ffad0a44 473 addr = mb_correct_addr_and_bit(&bit, addr);
c9de560d
AT
474 ext4_set_bit(bit, addr);
475}
476
c9de560d
AT
477static inline void mb_clear_bit(int bit, void *addr)
478{
ffad0a44 479 addr = mb_correct_addr_and_bit(&bit, addr);
c9de560d
AT
480 ext4_clear_bit(bit, addr);
481}
482
eabe0444
AS
483static inline int mb_test_and_clear_bit(int bit, void *addr)
484{
485 addr = mb_correct_addr_and_bit(&bit, addr);
486 return ext4_test_and_clear_bit(bit, addr);
487}
488
ffad0a44
AK
489static inline int mb_find_next_zero_bit(void *addr, int max, int start)
490{
e7dfb246 491 int fix = 0, ret, tmpmax;
ffad0a44 492 addr = mb_correct_addr_and_bit(&fix, addr);
e7dfb246 493 tmpmax = max + fix;
ffad0a44
AK
494 start += fix;
495
e7dfb246
AK
496 ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
497 if (ret > max)
498 return max;
499 return ret;
ffad0a44
AK
500}
501
502static inline int mb_find_next_bit(void *addr, int max, int start)
503{
e7dfb246 504 int fix = 0, ret, tmpmax;
ffad0a44 505 addr = mb_correct_addr_and_bit(&fix, addr);
e7dfb246 506 tmpmax = max + fix;
ffad0a44
AK
507 start += fix;
508
e7dfb246
AK
509 ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
510 if (ret > max)
511 return max;
512 return ret;
ffad0a44
AK
513}
514
c9de560d
AT
515static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
516{
517 char *bb;
518
c5e8f3f3 519 BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
c9de560d
AT
520 BUG_ON(max == NULL);
521
522 if (order > e4b->bd_blkbits + 1) {
523 *max = 0;
524 return NULL;
525 }
526
527 /* at order 0 we see each particular block */
84b775a3
CL
528 if (order == 0) {
529 *max = 1 << (e4b->bd_blkbits + 3);
c5e8f3f3 530 return e4b->bd_bitmap;
84b775a3 531 }
c9de560d 532
c5e8f3f3 533 bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
c9de560d
AT
534 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
535
536 return bb;
537}
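As a concrete illustration: with 4KiB blocks (bd_blkbits = 12), order 0 simply returns the block bitmap with *max = 1 << 15 = 32768 bits, one per cluster in the group, while higher orders return a slice of the buddy block at s_mb_offsets[order] whose size s_mb_maxs[order] halves with each step (16384 bits at order 1, 8192 at order 2, and so on).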
538
539#ifdef DOUBLE_CHECK
540static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
541 int first, int count)
542{
543 int i;
544 struct super_block *sb = e4b->bd_sb;
545
546 if (unlikely(e4b->bd_info->bb_bitmap == NULL))
547 return;
bc8e6740 548 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
c9de560d
AT
549 for (i = 0; i < count; i++) {
550 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
551 ext4_fsblk_t blocknr;
5661bd68
AM
552
553 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
53accfa9 554 blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
5d1b1b3f 555 ext4_grp_locked_error(sb, e4b->bd_group,
e29136f8
TT
556 inode ? inode->i_ino : 0,
557 blocknr,
558 "freeing block already freed "
559 "(bit %u)",
560 first + i);
736dedbb
WS
561 ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
562 EXT4_GROUP_INFO_BBITMAP_CORRUPT);
c9de560d
AT
563 }
564 mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
565 }
566}
567
568static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
569{
570 int i;
571
572 if (unlikely(e4b->bd_info->bb_bitmap == NULL))
573 return;
bc8e6740 574 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
c9de560d
AT
575 for (i = 0; i < count; i++) {
576 BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
577 mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
578 }
579}
580
581static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
582{
eb2b8ebb
RH
583 if (unlikely(e4b->bd_info->bb_bitmap == NULL))
584 return;
c9de560d
AT
585 if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
586 unsigned char *b1, *b2;
587 int i;
588 b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
589 b2 = (unsigned char *) bitmap;
590 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
591 if (b1[i] != b2[i]) {
9d8b9ec4
TT
592 ext4_msg(e4b->bd_sb, KERN_ERR,
593 "corruption in group %u "
594 "at byte %u(%u): %x in copy != %x "
595 "on disk/prealloc",
596 e4b->bd_group, i, i * 8, b1[i], b2[i]);
c9de560d
AT
597 BUG();
598 }
599 }
600 }
601}
602
a3450215
RH
603static void mb_group_bb_bitmap_alloc(struct super_block *sb,
604 struct ext4_group_info *grp, ext4_group_t group)
605{
606 struct buffer_head *bh;
607
608 grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
eb2b8ebb
RH
609 if (!grp->bb_bitmap)
610 return;
a3450215
RH
611
612 bh = ext4_read_block_bitmap(sb, group);
eb2b8ebb
RH
613 if (IS_ERR_OR_NULL(bh)) {
614 kfree(grp->bb_bitmap);
615 grp->bb_bitmap = NULL;
616 return;
617 }
a3450215
RH
618
619 memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize);
620 put_bh(bh);
621}
622
623static void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
624{
625 kfree(grp->bb_bitmap);
626}
627
c9de560d
AT
628#else
629static inline void mb_free_blocks_double(struct inode *inode,
630 struct ext4_buddy *e4b, int first, int count)
631{
632 return;
633}
634static inline void mb_mark_used_double(struct ext4_buddy *e4b,
635 int first, int count)
636{
637 return;
638}
639static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
640{
641 return;
642}
a3450215
RH
643
644static inline void mb_group_bb_bitmap_alloc(struct super_block *sb,
645 struct ext4_group_info *grp, ext4_group_t group)
646{
647 return;
648}
649
650static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
651{
652 return;
653}
c9de560d
AT
654#endif
655
656#ifdef AGGRESSIVE_CHECK
657
658#define MB_CHECK_ASSERT(assert) \
659do { \
660 if (!(assert)) { \
661 printk(KERN_EMERG \
662 "Assertion failure in %s() at %s:%d: \"%s\"\n", \
663 function, file, line, # assert); \
664 BUG(); \
665 } \
666} while (0)
667
668static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
669 const char *function, int line)
670{
671 struct super_block *sb = e4b->bd_sb;
672 int order = e4b->bd_blkbits + 1;
673 int max;
674 int max2;
675 int i;
676 int j;
677 int k;
678 int count;
679 struct ext4_group_info *grp;
680 int fragments = 0;
681 int fstart;
682 struct list_head *cur;
683 void *buddy;
684 void *buddy2;
685
addd752c
CX
686 if (e4b->bd_info->bb_check_counter++ % 10)
687 return 0;
c9de560d
AT
688
689 while (order > 1) {
690 buddy = mb_find_buddy(e4b, order, &max);
691 MB_CHECK_ASSERT(buddy);
692 buddy2 = mb_find_buddy(e4b, order - 1, &max2);
693 MB_CHECK_ASSERT(buddy2);
694 MB_CHECK_ASSERT(buddy != buddy2);
695 MB_CHECK_ASSERT(max * 2 == max2);
696
697 count = 0;
698 for (i = 0; i < max; i++) {
699
700 if (mb_test_bit(i, buddy)) {
af2b3275 701 /* only single bit in buddy2 may be 0 */
c9de560d
AT
702 if (!mb_test_bit(i << 1, buddy2)) {
703 MB_CHECK_ASSERT(
704 mb_test_bit((i<<1)+1, buddy2));
c9de560d
AT
705 }
706 continue;
707 }
708
0a10da73 709 /* both bits in buddy2 must be 1 */
c9de560d
AT
710 MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
711 MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
712
713 for (j = 0; j < (1 << order); j++) {
714 k = (i * (1 << order)) + j;
715 MB_CHECK_ASSERT(
c5e8f3f3 716 !mb_test_bit(k, e4b->bd_bitmap));
c9de560d
AT
717 }
718 count++;
719 }
720 MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
721 order--;
722 }
723
724 fstart = -1;
725 buddy = mb_find_buddy(e4b, 0, &max);
726 for (i = 0; i < max; i++) {
727 if (!mb_test_bit(i, buddy)) {
728 MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
729 if (fstart == -1) {
730 fragments++;
731 fstart = i;
732 }
733 continue;
734 }
735 fstart = -1;
736 /* check used bits only */
737 for (j = 0; j < e4b->bd_blkbits + 1; j++) {
738 buddy2 = mb_find_buddy(e4b, j, &max2);
739 k = i >> j;
740 MB_CHECK_ASSERT(k < max2);
741 MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
742 }
743 }
744 MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
745 MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
746
747 grp = ext4_get_group_info(sb, e4b->bd_group);
5354b2af
TT
748 if (!grp)
 749 return 0;
c9de560d
AT
750 list_for_each(cur, &grp->bb_prealloc_list) {
751 ext4_group_t groupnr;
752 struct ext4_prealloc_space *pa;
60bd63d1
SR
753 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
754 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
c9de560d 755 MB_CHECK_ASSERT(groupnr == e4b->bd_group);
60bd63d1 756 for (i = 0; i < pa->pa_len; i++)
c9de560d
AT
757 MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
758 }
759 return 0;
760}
761#undef MB_CHECK_ASSERT
762#define mb_check_buddy(e4b) __mb_check_buddy(e4b, \
46e665e9 763 __FILE__, __func__, __LINE__)
c9de560d
AT
764#else
765#define mb_check_buddy(e4b)
766#endif
767
7c786059
CL
768/*
769 * Divide blocks started from @first with length @len into
770 * smaller chunks with power of 2 blocks.
771 * Clear the bits in bitmap which the blocks of the chunk(s) covered,
772 * then increase bb_counters[] for corresponded chunk size.
773 */
c9de560d 774static void ext4_mb_mark_free_simple(struct super_block *sb,
a36b4498 775 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
c9de560d
AT
776 struct ext4_group_info *grp)
777{
778 struct ext4_sb_info *sbi = EXT4_SB(sb);
a36b4498
ES
779 ext4_grpblk_t min;
780 ext4_grpblk_t max;
781 ext4_grpblk_t chunk;
69e43e8c 782 unsigned int border;
c9de560d 783
7137d7a4 784 BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
c9de560d
AT
785
786 border = 2 << sb->s_blocksize_bits;
787
788 while (len > 0) {
789 /* find how many blocks can be covered since this position */
790 max = ffs(first | border) - 1;
791
792 /* find how many blocks of power 2 we need to mark */
793 min = fls(len) - 1;
794
795 if (max < min)
796 min = max;
797 chunk = 1 << min;
798
799 /* mark multiblock chunks only */
800 grp->bb_counters[min]++;
801 if (min > 0)
802 mb_clear_bit(first >> min,
803 buddy + sbi->s_mb_offsets[min]);
804
805 len -= chunk;
806 first += chunk;
807 }
808}
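A worked example of the splitting above (the border clamp only matters when first has no low-order bits set, e.g. first == 0): for first = 5 and len = 11 the loop produces a 1-cluster chunk at 5 (order 0), a 2-cluster chunk at 6 (order 1, clearing bit 3 in the order-1 buddy) and an 8-cluster chunk at 8 (order 3, clearing bit 1 in the order-3 buddy), bumping bb_counters[0], bb_counters[1] and bb_counters[3] respectively.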
809
83e80a6e 810static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
196e402a 811{
83e80a6e 812 int order;
196e402a 813
83e80a6e
JK
814 /*
 815 * We don't bother with special lists for groups with only 1-block free
 816 * extents or for completely empty groups.
817 */
818 order = fls(len) - 2;
819 if (order < 0)
820 return 0;
821 if (order == MB_NUM_ORDERS(sb))
822 order--;
823 return order;
196e402a
HS
824}
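For example, an average fragment size anywhere in 32..63 clusters gives fls() = 6 and so lands in list 4; a size of 1 (fls() = 1, order -1) is clamped to list 0; and the order == MB_NUM_ORDERS(sb) check keeps the largest possible sizes from indexing past the end of the array.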
825
83e80a6e 826/* Move group to appropriate avg_fragment_size list */
196e402a
HS
827static void
828mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
829{
830 struct ext4_sb_info *sbi = EXT4_SB(sb);
83e80a6e 831 int new_order;
196e402a
HS
832
833 if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0)
834 return;
835
83e80a6e
JK
836 new_order = mb_avg_fragment_size_order(sb,
837 grp->bb_free / grp->bb_fragments);
838 if (new_order == grp->bb_avg_fragment_size_order)
839 return;
196e402a 840
83e80a6e
JK
841 if (grp->bb_avg_fragment_size_order != -1) {
842 write_lock(&sbi->s_mb_avg_fragment_size_locks[
843 grp->bb_avg_fragment_size_order]);
844 list_del(&grp->bb_avg_fragment_size_node);
845 write_unlock(&sbi->s_mb_avg_fragment_size_locks[
846 grp->bb_avg_fragment_size_order]);
847 }
848 grp->bb_avg_fragment_size_order = new_order;
849 write_lock(&sbi->s_mb_avg_fragment_size_locks[
850 grp->bb_avg_fragment_size_order]);
851 list_add_tail(&grp->bb_avg_fragment_size_node,
852 &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]);
853 write_unlock(&sbi->s_mb_avg_fragment_size_locks[
854 grp->bb_avg_fragment_size_order]);
196e402a
HS
855}
856
857/*
858 * Choose next group by traversing largest_free_order lists. Updates *new_cr if
859 * cr level needs an update.
860 */
861static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
862 int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
863{
864 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
865 struct ext4_group_info *iter, *grp;
866 int i;
867
868 if (ac->ac_status == AC_STATUS_FOUND)
869 return;
870
871 if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED))
872 atomic_inc(&sbi->s_bal_cr0_bad_suggestions);
873
874 grp = NULL;
875 for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
876 if (list_empty(&sbi->s_mb_largest_free_orders[i]))
877 continue;
878 read_lock(&sbi->s_mb_largest_free_orders_locks[i]);
879 if (list_empty(&sbi->s_mb_largest_free_orders[i])) {
880 read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
881 continue;
882 }
883 grp = NULL;
884 list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i],
885 bb_largest_free_order_node) {
886 if (sbi->s_mb_stats)
887 atomic64_inc(&sbi->s_bal_cX_groups_considered[0]);
888 if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) {
889 grp = iter;
890 break;
891 }
892 }
893 read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
894 if (grp)
895 break;
896 }
897
898 if (!grp) {
899 /* Increment cr and search again */
900 *new_cr = 1;
901 } else {
902 *group = grp->bb_group;
196e402a
HS
903 ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED;
904 }
905}
906
907/*
83e80a6e
JK
908 * Choose next group by traversing average fragment size list of suitable
909 * order. Updates *new_cr if cr level needs an update.
196e402a
HS
910 */
911static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac,
912 int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
913{
914 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
a078dff8 915 struct ext4_group_info *grp = NULL, *iter;
83e80a6e 916 int i;
196e402a
HS
917
918 if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) {
919 if (sbi->s_mb_stats)
920 atomic_inc(&sbi->s_bal_cr1_bad_suggestions);
83e80a6e
JK
921 }
922
923 for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
924 i < MB_NUM_ORDERS(ac->ac_sb); i++) {
925 if (list_empty(&sbi->s_mb_avg_fragment_size[i]))
926 continue;
927 read_lock(&sbi->s_mb_avg_fragment_size_locks[i]);
928 if (list_empty(&sbi->s_mb_avg_fragment_size[i])) {
929 read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]);
930 continue;
931 }
83e80a6e
JK
932 list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i],
933 bb_avg_fragment_size_node) {
196e402a
HS
934 if (sbi->s_mb_stats)
935 atomic64_inc(&sbi->s_bal_cX_groups_considered[1]);
83e80a6e
JK
936 if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) {
937 grp = iter;
196e402a 938 break;
196e402a
HS
939 }
940 }
83e80a6e
JK
941 read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]);
942 if (grp)
943 break;
196e402a
HS
944 }
945
83e80a6e 946 if (grp) {
196e402a
HS
947 *group = grp->bb_group;
948 ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED;
949 } else {
950 *new_cr = 2;
951 }
196e402a
HS
952}
953
954static inline int should_optimize_scan(struct ext4_allocation_context *ac)
955{
956 if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN)))
957 return 0;
958 if (ac->ac_criteria >= 2)
959 return 0;
077d0c2c 960 if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))
196e402a
HS
961 return 0;
962 return 1;
963}
964
965/*
966 * Return next linear group for allocation. If linear traversal should not be
967 * performed, this function just returns the same group
968 */
969static int
970next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups)
971{
972 if (!should_optimize_scan(ac))
973 goto inc_and_return;
974
975 if (ac->ac_groups_linear_remaining) {
976 ac->ac_groups_linear_remaining--;
977 goto inc_and_return;
978 }
979
196e402a
HS
980 return group;
981inc_and_return:
982 /*
983 * Artificially restricted ngroups for non-extent
984 * files makes group > ngroups possible on first loop.
985 */
986 return group + 1 >= ngroups ? 0 : group + 1;
987}
988
989/*
990 * ext4_mb_choose_next_group: choose next group for allocation.
991 *
992 * @ac Allocation Context
 993 * @new_cr This is an output parameter. If there is no good group
994 * available at current CR level, this field is updated to indicate
995 * the new cr level that should be used.
996 * @group This is an input / output parameter. As an input it indicates the
997 * next group that the allocator intends to use for allocation. As
998 * output, this field indicates the next group that should be used as
999 * determined by the optimization functions.
1000 * @ngroups Total number of groups
1001 */
1002static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
1003 int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
1004{
1005 *new_cr = ac->ac_criteria;
1006
4fca50d4
JK
1007 if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) {
1008 *group = next_linear_group(ac, *group, ngroups);
196e402a 1009 return;
4fca50d4 1010 }
196e402a
HS
1011
1012 if (*new_cr == 0) {
1013 ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups);
1014 } else if (*new_cr == 1) {
1015 ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups);
1016 } else {
1017 /*
1018 * TODO: For CR=2, we can arrange groups in an rb tree sorted by
1019 * bb_free. But until that happens, we should never come here.
1020 */
1021 WARN_ON(1);
1022 }
1023}
1024
8a57d9d6
CW
1025/*
1026 * Cache the order of the largest free extent we have available in this block
1027 * group.
1028 */
1029static void
1030mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
1031{
196e402a 1032 struct ext4_sb_info *sbi = EXT4_SB(sb);
8a57d9d6 1033 int i;
8a57d9d6 1034
1940265e
JK
1035 for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--)
1036 if (grp->bb_counters[i] > 0)
1037 break;
1038 /* No need to move between order lists? */
1039 if (!test_opt2(sb, MB_OPTIMIZE_SCAN) ||
1040 i == grp->bb_largest_free_order) {
1041 grp->bb_largest_free_order = i;
1042 return;
1043 }
1044
1045 if (grp->bb_largest_free_order >= 0) {
196e402a
HS
1046 write_lock(&sbi->s_mb_largest_free_orders_locks[
1047 grp->bb_largest_free_order]);
1048 list_del_init(&grp->bb_largest_free_order_node);
1049 write_unlock(&sbi->s_mb_largest_free_orders_locks[
1050 grp->bb_largest_free_order]);
1051 }
1940265e
JK
1052 grp->bb_largest_free_order = i;
1053 if (grp->bb_largest_free_order >= 0 && grp->bb_free) {
196e402a
HS
1054 write_lock(&sbi->s_mb_largest_free_orders_locks[
1055 grp->bb_largest_free_order]);
1056 list_add_tail(&grp->bb_largest_free_order_node,
1057 &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]);
1058 write_unlock(&sbi->s_mb_largest_free_orders_locks[
1059 grp->bb_largest_free_order]);
1060 }
8a57d9d6
CW
1061}
1062
089ceecc
ES
1063static noinline_for_stack
1064void ext4_mb_generate_buddy(struct super_block *sb,
5354b2af
TT
1065 void *buddy, void *bitmap, ext4_group_t group,
1066 struct ext4_group_info *grp)
c9de560d 1067{
e43bb4e6 1068 struct ext4_sb_info *sbi = EXT4_SB(sb);
7137d7a4 1069 ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
a36b4498
ES
1070 ext4_grpblk_t i = 0;
1071 ext4_grpblk_t first;
1072 ext4_grpblk_t len;
c9de560d
AT
1073 unsigned free = 0;
1074 unsigned fragments = 0;
1075 unsigned long long period = get_cycles();
1076
1077 /* initialize buddy from bitmap which is aggregation
1078 * of on-disk bitmap and preallocations */
ffad0a44 1079 i = mb_find_next_zero_bit(bitmap, max, 0);
c9de560d
AT
1080 grp->bb_first_free = i;
1081 while (i < max) {
1082 fragments++;
1083 first = i;
ffad0a44 1084 i = mb_find_next_bit(bitmap, max, i);
c9de560d
AT
1085 len = i - first;
1086 free += len;
1087 if (len > 1)
1088 ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
1089 else
1090 grp->bb_counters[0]++;
1091 if (i < max)
ffad0a44 1092 i = mb_find_next_zero_bit(bitmap, max, i);
c9de560d
AT
1093 }
1094 grp->bb_fragments = fragments;
1095
1096 if (free != grp->bb_free) {
e29136f8 1097 ext4_grp_locked_error(sb, group, 0, 0,
94d4c066
TT
1098 "block bitmap and bg descriptor "
1099 "inconsistent: %u vs %u free clusters",
e29136f8 1100 free, grp->bb_free);
e56eb659 1101 /*
163a203d 1102 * If we intend to continue, we consider group descriptor
e56eb659
AK
1103 * corrupt and update bb_free using bitmap value
1104 */
c9de560d 1105 grp->bb_free = free;
db79e6d1
WS
1106 ext4_mark_group_bitmap_corrupted(sb, group,
1107 EXT4_GROUP_INFO_BBITMAP_CORRUPT);
c9de560d 1108 }
8a57d9d6 1109 mb_set_largest_free_order(sb, grp);
83e80a6e 1110 mb_update_avg_fragment_size(sb, grp);
c9de560d
AT
1111
1112 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
1113
1114 period = get_cycles() - period;
67d25186
HS
1115 atomic_inc(&sbi->s_mb_buddies_generated);
1116 atomic64_add(period, &sbi->s_mb_generation_time);
c9de560d
AT
1117}
1118
 1119/* The buddy information is attached to the buddy cache inode
 1120 * for convenience. The information regarding each group
 1121 * is loaded via ext4_mb_load_buddy. The information consists of the
 1122 * block bitmap and buddy information, which are
 1123 * stored in the inode as
1124 *
1125 * { page }
c3a326a6 1126 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
c9de560d
AT
1127 *
1128 *
1129 * one block each for bitmap and buddy information.
1130 * So for each group we take up 2 blocks. A page can
ea1754a0 1131 * contain blocks_per_page (PAGE_SIZE / blocksize) blocks.
c9de560d
AT
 1132 * So a page can hold information for groups_per_page groups, which
 1133 * is blocks_per_page/2
8a57d9d6
CW
1134 *
1135 * Locking note: This routine takes the block group lock of all groups
1136 * for this page; do not hold this lock when calling this routine!
c9de560d
AT
1137 */
1138
adb7ef60 1139static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
c9de560d 1140{
8df9675f 1141 ext4_group_t ngroups;
c9de560d
AT
1142 int blocksize;
1143 int blocks_per_page;
1144 int groups_per_page;
1145 int err = 0;
1146 int i;
813e5727 1147 ext4_group_t first_group, group;
c9de560d
AT
1148 int first_block;
1149 struct super_block *sb;
1150 struct buffer_head *bhs;
fa77dcfa 1151 struct buffer_head **bh = NULL;
c9de560d
AT
1152 struct inode *inode;
1153 char *data;
1154 char *bitmap;
9b8b7d35 1155 struct ext4_group_info *grinfo;
c9de560d 1156
c9de560d
AT
1157 inode = page->mapping->host;
1158 sb = inode->i_sb;
8df9675f 1159 ngroups = ext4_get_groups_count(sb);
93407472 1160 blocksize = i_blocksize(inode);
09cbfeaf 1161 blocks_per_page = PAGE_SIZE / blocksize;
c9de560d 1162
d3df1453
RH
1163 mb_debug(sb, "init page %lu\n", page->index);
1164
c9de560d
AT
1165 groups_per_page = blocks_per_page >> 1;
1166 if (groups_per_page == 0)
1167 groups_per_page = 1;
1168
1169 /* allocate buffer_heads to read bitmaps */
1170 if (groups_per_page > 1) {
c9de560d 1171 i = sizeof(struct buffer_head *) * groups_per_page;
adb7ef60 1172 bh = kzalloc(i, gfp);
139f46d3
KS
1173 if (bh == NULL)
1174 return -ENOMEM;
c9de560d
AT
1175 } else
1176 bh = &bhs;
1177
1178 first_group = page->index * blocks_per_page / 2;
1179
1180 /* read all groups the page covers into the cache */
813e5727
TT
1181 for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
1182 if (group >= ngroups)
c9de560d
AT
1183 break;
1184
813e5727 1185 grinfo = ext4_get_group_info(sb, group);
5354b2af
TT
1186 if (!grinfo)
1187 continue;
9b8b7d35
AG
1188 /*
1189 * If page is uptodate then we came here after online resize
1190 * which added some new uninitialized group info structs, so
1191 * we must skip all initialized uptodate buddies on the page,
1192 * which may be currently in use by an allocating task.
1193 */
1194 if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
1195 bh[i] = NULL;
1196 continue;
1197 }
cfd73237 1198 bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
9008a58e
DW
1199 if (IS_ERR(bh[i])) {
1200 err = PTR_ERR(bh[i]);
1201 bh[i] = NULL;
c9de560d 1202 goto out;
2ccb5fb9 1203 }
d3df1453 1204 mb_debug(sb, "read bitmap for group %u\n", group);
c9de560d
AT
1205 }
1206
1207 /* wait for I/O completion */
813e5727 1208 for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
9008a58e
DW
1209 int err2;
1210
1211 if (!bh[i])
1212 continue;
1213 err2 = ext4_wait_block_bitmap(sb, group, bh[i]);
1214 if (!err)
1215 err = err2;
813e5727 1216 }
c9de560d
AT
1217
1218 first_block = page->index * blocks_per_page;
1219 for (i = 0; i < blocks_per_page; i++) {
c9de560d 1220 group = (first_block + i) >> 1;
8df9675f 1221 if (group >= ngroups)
c9de560d
AT
1222 break;
1223
9b8b7d35
AG
1224 if (!bh[group - first_group])
1225 /* skip initialized uptodate buddy */
1226 continue;
1227
bbdc322f
LC
1228 if (!buffer_verified(bh[group - first_group]))
1229 /* Skip faulty bitmaps */
1230 continue;
1231 err = 0;
1232
c9de560d
AT
1233 /*
 1234 * data carries information regarding this
1235 * particular group in the format specified
1236 * above
1237 *
1238 */
1239 data = page_address(page) + (i * blocksize);
1240 bitmap = bh[group - first_group]->b_data;
1241
1242 /*
1243 * We place the buddy block and bitmap block
1244 * close together
1245 */
1246 if ((first_block + i) & 1) {
1247 /* this is block of buddy */
1248 BUG_ON(incore == NULL);
d3df1453 1249 mb_debug(sb, "put buddy for group %u in page %lu/%x\n",
c9de560d 1250 group, page->index, i * blocksize);
f307333e 1251 trace_ext4_mb_buddy_bitmap_load(sb, group);
c9de560d 1252 grinfo = ext4_get_group_info(sb, group);
5354b2af
TT
1253 if (!grinfo) {
1254 err = -EFSCORRUPTED;
1255 goto out;
1256 }
c9de560d
AT
1257 grinfo->bb_fragments = 0;
1258 memset(grinfo->bb_counters, 0,
1927805e 1259 sizeof(*grinfo->bb_counters) *
4b68f6df 1260 (MB_NUM_ORDERS(sb)));
c9de560d
AT
1261 /*
1262 * incore got set to the group block bitmap below
1263 */
7a2fcbf7 1264 ext4_lock_group(sb, group);
9b8b7d35
AG
1265 /* init the buddy */
1266 memset(data, 0xff, blocksize);
5354b2af 1267 ext4_mb_generate_buddy(sb, data, incore, group, grinfo);
7a2fcbf7 1268 ext4_unlock_group(sb, group);
c9de560d
AT
1269 incore = NULL;
1270 } else {
1271 /* this is block of bitmap */
1272 BUG_ON(incore != NULL);
d3df1453 1273 mb_debug(sb, "put bitmap for group %u in page %lu/%x\n",
c9de560d 1274 group, page->index, i * blocksize);
f307333e 1275 trace_ext4_mb_bitmap_load(sb, group);
c9de560d
AT
1276
1277 /* see comments in ext4_mb_put_pa() */
1278 ext4_lock_group(sb, group);
1279 memcpy(data, bitmap, blocksize);
1280
1281 /* mark all preallocated blks used in in-core bitmap */
1282 ext4_mb_generate_from_pa(sb, data, group);
7a2fcbf7 1283 ext4_mb_generate_from_freelist(sb, data, group);
c9de560d
AT
1284 ext4_unlock_group(sb, group);
1285
1286 /* set incore so that the buddy information can be
1287 * generated using this
1288 */
1289 incore = data;
1290 }
1291 }
1292 SetPageUptodate(page);
1293
1294out:
1295 if (bh) {
9b8b7d35 1296 for (i = 0; i < groups_per_page; i++)
c9de560d
AT
1297 brelse(bh[i]);
1298 if (bh != &bhs)
1299 kfree(bh);
1300 }
1301 return err;
1302}
1303
eee4adc7 1304/*
2de8807b
AG
 1305 * Lock the buddy and bitmap pages. This makes sure other parallel init_group
 1306 * on the same buddy page doesn't happen while holding the buddy page lock.
1307 * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
1308 * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
eee4adc7 1309 */
2de8807b 1310static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
adb7ef60 1311 ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
eee4adc7 1312{
2de8807b
AG
1313 struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
1314 int block, pnum, poff;
eee4adc7 1315 int blocks_per_page;
2de8807b
AG
1316 struct page *page;
1317
1318 e4b->bd_buddy_page = NULL;
1319 e4b->bd_bitmap_page = NULL;
eee4adc7 1320
09cbfeaf 1321 blocks_per_page = PAGE_SIZE / sb->s_blocksize;
eee4adc7
ES
1322 /*
1323 * the buddy cache inode stores the block bitmap
1324 * and buddy information in consecutive blocks.
1325 * So for each group we need two blocks.
1326 */
1327 block = group * 2;
1328 pnum = block / blocks_per_page;
2de8807b 1329 poff = block % blocks_per_page;
adb7ef60 1330 page = find_or_create_page(inode->i_mapping, pnum, gfp);
2de8807b 1331 if (!page)
c57ab39b 1332 return -ENOMEM;
2de8807b
AG
1333 BUG_ON(page->mapping != inode->i_mapping);
1334 e4b->bd_bitmap_page = page;
1335 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1336
1337 if (blocks_per_page >= 2) {
1338 /* buddy and bitmap are on the same page */
1339 return 0;
eee4adc7 1340 }
2de8807b
AG
1341
1342 block++;
1343 pnum = block / blocks_per_page;
adb7ef60 1344 page = find_or_create_page(inode->i_mapping, pnum, gfp);
2de8807b 1345 if (!page)
c57ab39b 1346 return -ENOMEM;
2de8807b
AG
1347 BUG_ON(page->mapping != inode->i_mapping);
1348 e4b->bd_buddy_page = page;
1349 return 0;
eee4adc7
ES
1350}
1351
2de8807b 1352static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
eee4adc7 1353{
2de8807b
AG
1354 if (e4b->bd_bitmap_page) {
1355 unlock_page(e4b->bd_bitmap_page);
09cbfeaf 1356 put_page(e4b->bd_bitmap_page);
2de8807b
AG
1357 }
1358 if (e4b->bd_buddy_page) {
1359 unlock_page(e4b->bd_buddy_page);
09cbfeaf 1360 put_page(e4b->bd_buddy_page);
eee4adc7 1361 }
eee4adc7
ES
1362}
1363
8a57d9d6
CW
1364/*
1365 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
1366 * block group lock of all groups for this page; do not hold the BG lock when
1367 * calling this routine!
1368 */
b6a758ec 1369static noinline_for_stack
adb7ef60 1370int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
b6a758ec
AK
1371{
1372
b6a758ec 1373 struct ext4_group_info *this_grp;
2de8807b
AG
1374 struct ext4_buddy e4b;
1375 struct page *page;
1376 int ret = 0;
b6a758ec 1377
b10a44c3 1378 might_sleep();
d3df1453 1379 mb_debug(sb, "init group %u\n", group);
b6a758ec 1380 this_grp = ext4_get_group_info(sb, group);
5354b2af
TT
1381 if (!this_grp)
1382 return -EFSCORRUPTED;
1383
b6a758ec 1384 /*
08c3a813
AK
1385 * This ensures that we don't reinit the buddy cache
 1386 * page which maps to the group from which we are already
 1387 * allocating. If we are looking at the buddy cache we would
 1388 * have taken a reference using ext4_mb_load_buddy and that
 1389 * would have pinned the buddy page in the page cache.
2457aec6
MG
1390 * The call to ext4_mb_get_buddy_page_lock will mark the
1391 * page accessed.
b6a758ec 1392 */
adb7ef60 1393 ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
2de8807b 1394 if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
b6a758ec
AK
1395 /*
1396 * somebody initialized the group
1397 * return without doing anything
1398 */
b6a758ec
AK
1399 goto err;
1400 }
2de8807b
AG
1401
1402 page = e4b.bd_bitmap_page;
adb7ef60 1403 ret = ext4_mb_init_cache(page, NULL, gfp);
2de8807b
AG
1404 if (ret)
1405 goto err;
1406 if (!PageUptodate(page)) {
b6a758ec
AK
1407 ret = -EIO;
1408 goto err;
1409 }
b6a758ec 1410
2de8807b 1411 if (e4b.bd_buddy_page == NULL) {
b6a758ec
AK
1412 /*
1413 * If both the bitmap and buddy are in
1414 * the same page we don't need to force
1415 * init the buddy
1416 */
2de8807b
AG
1417 ret = 0;
1418 goto err;
b6a758ec 1419 }
2de8807b
AG
1420 /* init buddy cache */
1421 page = e4b.bd_buddy_page;
adb7ef60 1422 ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp);
2de8807b
AG
1423 if (ret)
1424 goto err;
1425 if (!PageUptodate(page)) {
b6a758ec
AK
1426 ret = -EIO;
1427 goto err;
1428 }
b6a758ec 1429err:
2de8807b 1430 ext4_mb_put_buddy_page_lock(&e4b);
b6a758ec
AK
1431 return ret;
1432}
1433
8a57d9d6
CW
1434/*
1435 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
1436 * block group lock of all groups for this page; do not hold the BG lock when
1437 * calling this routine!
1438 */
4ddfef7b 1439static noinline_for_stack int
adb7ef60
KK
1440ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
1441 struct ext4_buddy *e4b, gfp_t gfp)
c9de560d 1442{
c9de560d
AT
1443 int blocks_per_page;
1444 int block;
1445 int pnum;
1446 int poff;
1447 struct page *page;
fdf6c7a7 1448 int ret;
920313a7
AK
1449 struct ext4_group_info *grp;
1450 struct ext4_sb_info *sbi = EXT4_SB(sb);
1451 struct inode *inode = sbi->s_buddy_cache;
c9de560d 1452
b10a44c3 1453 might_sleep();
d3df1453 1454 mb_debug(sb, "load group %u\n", group);
c9de560d 1455
09cbfeaf 1456 blocks_per_page = PAGE_SIZE / sb->s_blocksize;
920313a7 1457 grp = ext4_get_group_info(sb, group);
5354b2af
TT
1458 if (!grp)
1459 return -EFSCORRUPTED;
c9de560d
AT
1460
1461 e4b->bd_blkbits = sb->s_blocksize_bits;
529da704 1462 e4b->bd_info = grp;
c9de560d
AT
1463 e4b->bd_sb = sb;
1464 e4b->bd_group = group;
1465 e4b->bd_buddy_page = NULL;
1466 e4b->bd_bitmap_page = NULL;
1467
f41c0750 1468 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
f41c0750
AK
1469 /*
1470 * we need full data about the group
1471 * to make a good selection
1472 */
adb7ef60 1473 ret = ext4_mb_init_group(sb, group, gfp);
f41c0750
AK
1474 if (ret)
1475 return ret;
f41c0750
AK
1476 }
1477
c9de560d
AT
1478 /*
1479 * the buddy cache inode stores the block bitmap
1480 * and buddy information in consecutive blocks.
1481 * So for each group we need two blocks.
1482 */
1483 block = group * 2;
1484 pnum = block / blocks_per_page;
1485 poff = block % blocks_per_page;
1486
 1487 /* we could use find_or_create_page(), but it locks the page,
 1488 * which we'd like to avoid in the fast path ... */
2457aec6 1489 page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
c9de560d
AT
1490 if (page == NULL || !PageUptodate(page)) {
1491 if (page)
920313a7
AK
1492 /*
1493 * drop the page reference and try
1494 * to get the page with lock. If we
1495 * are not uptodate that implies
1496 * somebody just created the page but
1497 * is yet to initialize the same. So
1498 * wait for it to initialize.
1499 */
09cbfeaf 1500 put_page(page);
adb7ef60 1501 page = find_or_create_page(inode->i_mapping, pnum, gfp);
c9de560d 1502 if (page) {
19b8b035
TT
1503 if (WARN_RATELIMIT(page->mapping != inode->i_mapping,
1504 "ext4: bitmap's paging->mapping != inode->i_mapping\n")) {
1505 /* should never happen */
1506 unlock_page(page);
1507 ret = -EINVAL;
1508 goto err;
1509 }
c9de560d 1510 if (!PageUptodate(page)) {
adb7ef60 1511 ret = ext4_mb_init_cache(page, NULL, gfp);
fdf6c7a7
SF
1512 if (ret) {
1513 unlock_page(page);
1514 goto err;
1515 }
c9de560d
AT
1516 mb_cmp_bitmaps(e4b, page_address(page) +
1517 (poff * sb->s_blocksize));
1518 }
1519 unlock_page(page);
1520 }
1521 }
c57ab39b
YL
1522 if (page == NULL) {
1523 ret = -ENOMEM;
1524 goto err;
1525 }
1526 if (!PageUptodate(page)) {
fdf6c7a7 1527 ret = -EIO;
c9de560d 1528 goto err;
fdf6c7a7 1529 }
2457aec6
MG
1530
1531 /* Pages marked accessed already */
c9de560d
AT
1532 e4b->bd_bitmap_page = page;
1533 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
c9de560d
AT
1534
1535 block++;
1536 pnum = block / blocks_per_page;
1537 poff = block % blocks_per_page;
1538
2457aec6 1539 page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
c9de560d
AT
1540 if (page == NULL || !PageUptodate(page)) {
1541 if (page)
09cbfeaf 1542 put_page(page);
adb7ef60 1543 page = find_or_create_page(inode->i_mapping, pnum, gfp);
c9de560d 1544 if (page) {
19b8b035
TT
1545 if (WARN_RATELIMIT(page->mapping != inode->i_mapping,
1546 "ext4: buddy bitmap's page->mapping != inode->i_mapping\n")) {
1547 /* should never happen */
1548 unlock_page(page);
1549 ret = -EINVAL;
1550 goto err;
1551 }
fdf6c7a7 1552 if (!PageUptodate(page)) {
adb7ef60
KK
1553 ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
1554 gfp);
fdf6c7a7
SF
1555 if (ret) {
1556 unlock_page(page);
1557 goto err;
1558 }
1559 }
c9de560d
AT
1560 unlock_page(page);
1561 }
1562 }
c57ab39b
YL
1563 if (page == NULL) {
1564 ret = -ENOMEM;
1565 goto err;
1566 }
1567 if (!PageUptodate(page)) {
fdf6c7a7 1568 ret = -EIO;
c9de560d 1569 goto err;
fdf6c7a7 1570 }
2457aec6
MG
1571
1572 /* Pages marked accessed already */
c9de560d
AT
1573 e4b->bd_buddy_page = page;
1574 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
c9de560d 1575
c9de560d
AT
1576 return 0;
1577
1578err:
26626f11 1579 if (page)
09cbfeaf 1580 put_page(page);
c9de560d 1581 if (e4b->bd_bitmap_page)
09cbfeaf 1582 put_page(e4b->bd_bitmap_page);
285164b8 1583
c9de560d
AT
1584 e4b->bd_buddy = NULL;
1585 e4b->bd_bitmap = NULL;
fdf6c7a7 1586 return ret;
c9de560d
AT
1587}
1588
adb7ef60
KK
1589static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1590 struct ext4_buddy *e4b)
1591{
1592 return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS);
1593}
1594
e39e07fd 1595static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
c9de560d
AT
1596{
1597 if (e4b->bd_bitmap_page)
09cbfeaf 1598 put_page(e4b->bd_bitmap_page);
c9de560d 1599 if (e4b->bd_buddy_page)
09cbfeaf 1600 put_page(e4b->bd_buddy_page);
c9de560d
AT
1601}
1602
1603
1604static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1605{
ce3cca33 1606 int order = 1, max;
c9de560d
AT
1607 void *bb;
1608
c5e8f3f3 1609 BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
c9de560d
AT
1610 BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
1611
c9de560d 1612 while (order <= e4b->bd_blkbits + 1) {
ce3cca33
CX
1613 bb = mb_find_buddy(e4b, order, &max);
1614 if (!mb_test_bit(block >> order, bb)) {
c9de560d
AT
1615 /* this block is part of buddy of order 'order' */
1616 return order;
1617 }
c9de560d
AT
1618 order++;
1619 }
1620 return 0;
1621}
1622
955ce5f5 1623static void mb_clear_bits(void *bm, int cur, int len)
c9de560d
AT
1624{
1625 __u32 *addr;
1626
1627 len = cur + len;
1628 while (cur < len) {
1629 if ((cur & 31) == 0 && (len - cur) >= 32) {
1630 /* fast path: clear whole word at once */
1631 addr = bm + (cur >> 3);
1632 *addr = 0;
1633 cur += 32;
1634 continue;
1635 }
955ce5f5 1636 mb_clear_bit(cur, bm);
c9de560d
AT
1637 cur++;
1638 }
1639}
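/*
 * Illustration (editorial, not from the source): mb_clear_bits(bm, 5, 64)
 * clears bits 5..31 one at a time, then takes the fast path to zero the
 * whole word covering bits 32..63, and finally clears bits 64..68
 * individually.
 */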
1640
eabe0444
AS
1641/* clear bits in the given range;
1642 * will return the first bit found to be already zero, if any, -1 otherwise
1643 */
1644static int mb_test_and_clear_bits(void *bm, int cur, int len)
1645{
1646 __u32 *addr;
1647 int zero_bit = -1;
1648
1649 len = cur + len;
1650 while (cur < len) {
1651 if ((cur & 31) == 0 && (len - cur) >= 32) {
1652 /* fast path: clear whole word at once */
1653 addr = bm + (cur >> 3);
1654 if (*addr != (__u32)(-1) && zero_bit == -1)
1655 zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
1656 *addr = 0;
1657 cur += 32;
1658 continue;
1659 }
1660 if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
1661 zero_bit = cur;
1662 cur++;
1663 }
1664
1665 return zero_bit;
1666}
1667
123e3016 1668void mb_set_bits(void *bm, int cur, int len)
c9de560d
AT
1669{
1670 __u32 *addr;
1671
1672 len = cur + len;
1673 while (cur < len) {
1674 if ((cur & 31) == 0 && (len - cur) >= 32) {
1675 /* fast path: set whole word at once */
1676 addr = bm + (cur >> 3);
1677 *addr = 0xffffffff;
1678 cur += 32;
1679 continue;
1680 }
955ce5f5 1681 mb_set_bit(cur, bm);
c9de560d
AT
1682 cur++;
1683 }
1684}
1685
eabe0444
AS
1686static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
1687{
1688 if (mb_test_bit(*bit + side, bitmap)) {
1689 mb_clear_bit(*bit, bitmap);
1690 (*bit) -= side;
1691 return 1;
1692 }
1693 else {
1694 (*bit) += side;
1695 mb_set_bit(*bit, bitmap);
1696 return -1;
1697 }
1698}
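/*
 * Note (editorial summary): the return value of mb_buddy_adjust_border()
 * is the delta the caller applies to bb_counters[order]: +1 when the
 * neighbour at 'side' is busy, so the border block stays a free buddy at
 * this order, and -1 when the neighbour is free, so the border block is
 * absorbed into a larger buddy one order up.
 */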
1699
1700static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
1701{
1702 int max;
1703 int order = 1;
1704 void *buddy = mb_find_buddy(e4b, order, &max);
1705
1706 while (buddy) {
1707 void *buddy2;
1708
1709 /* Bits in range [first; last] are known to be set since
1710 * corresponding blocks were allocated. Bits in range
1711 * (first; last) will stay set because they form buddies on
1712 * the upper layer. We just deal with the borders if they don't
1713 * align with the upper layer and then go up.
1714 * Releasing the entire group is all about clearing a
1715 * single bit of the highest-order buddy.
1716 */
1717
1718 /* Example:
1719 * ---------------------------------
1720 * | 1 | 1 | 1 | 1 |
1721 * ---------------------------------
1722 * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
1723 * ---------------------------------
1724 * 0 1 2 3 4 5 6 7
1725 * \_____________________/
1726 *
1727 * Neither [1] nor [6] is aligned to above layer.
1728 * Left neighbour [0] is free, so mark it busy,
1729 * decrease bb_counters and extend range to
1730 * [0; 6]
1731 * Right neighbour [7] is busy. It can't be coalesced with [6], so
1732 * mark [6] free, increase bb_counters and shrink range to
1733 * [0; 5].
1734 * Then shift range to [0; 2], go up and do the same.
1735 */
1736
1737
1738 if (first & 1)
1739 e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
1740 if (!(last & 1))
1741 e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
1742 if (first > last)
1743 break;
1744 order++;
1745
976620bd
KS
1746 buddy2 = mb_find_buddy(e4b, order, &max);
1747 if (!buddy2) {
eabe0444
AS
1748 mb_clear_bits(buddy, first, last - first + 1);
1749 e4b->bd_info->bb_counters[order - 1] += last - first + 1;
1750 break;
1751 }
1752 first >>= 1;
1753 last >>= 1;
1754 buddy = buddy2;
1755 }
1756}
1757
7e5a8cdd 1758static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
eabe0444 1759 int first, int count)
c9de560d 1760{
eabe0444
AS
1761 int left_is_free = 0;
1762 int right_is_free = 0;
1763 int block;
1764 int last = first + count - 1;
c9de560d
AT
1765 struct super_block *sb = e4b->bd_sb;
1766
c99d1e6e
TT
1767 if (WARN_ON(count == 0))
1768 return;
eabe0444 1769 BUG_ON(last >= (sb->s_blocksize << 3));
bc8e6740 1770 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
163a203d
DW
1771 /* Don't bother if the block group is corrupt. */
1772 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
1773 return;
1774
c9de560d
AT
1775 mb_check_buddy(e4b);
1776 mb_free_blocks_double(inode, e4b, first, count);
1777
07b5b8e1 1778 this_cpu_inc(discard_pa_seq);
c9de560d
AT
1779 e4b->bd_info->bb_free += count;
1780 if (first < e4b->bd_info->bb_first_free)
1781 e4b->bd_info->bb_first_free = first;
1782
eabe0444
AS
1783 /* access memory sequentially: check left neighbour,
1784 * clear range and then check right neighbour
1785 */
c9de560d 1786 if (first != 0)
eabe0444
AS
1787 left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
1788 block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
1789 if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
1790 right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
1791
1792 if (unlikely(block != -1)) {
e43bb4e6 1793 struct ext4_sb_info *sbi = EXT4_SB(sb);
eabe0444
AS
1794 ext4_fsblk_t blocknr;
1795
1796 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
49598e04 1797 blocknr += EXT4_C2B(sbi, block);
8016e29f
HS
1798 if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
1799 ext4_grp_locked_error(sb, e4b->bd_group,
1800 inode ? inode->i_ino : 0,
1801 blocknr,
1802 "freeing already freed block (bit %u); block bitmap corrupt.",
1803 block);
1804 ext4_mark_group_bitmap_corrupted(
1805 sb, e4b->bd_group,
db79e6d1 1806 EXT4_GROUP_INFO_BBITMAP_CORRUPT);
8016e29f 1807 }
eabe0444
AS
1808 goto done;
1809 }
1810
1811 /* let's maintain fragments counter */
1812 if (left_is_free && right_is_free)
c9de560d 1813 e4b->bd_info->bb_fragments--;
eabe0444 1814 else if (!left_is_free && !right_is_free)
c9de560d
AT
1815 e4b->bd_info->bb_fragments++;
1816
eabe0444
AS
1817 /* buddy[0] == bd_bitmap is a special case, so handle
1818 * it right away and let mb_buddy_mark_free stay free of
1819 * zero order checks.
1820 * Check if neighbours are to be coalesced,
1821 * adjust bitmap bb_counters and borders appropriately.
1822 */
1823 if (first & 1) {
1824 first += !left_is_free;
1825 e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
1826 }
1827 if (!(last & 1)) {
1828 last -= !right_is_free;
1829 e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
1830 }
c9de560d 1831
eabe0444
AS
1832 if (first <= last)
1833 mb_buddy_mark_free(e4b, first >> 1, last >> 1);
c9de560d 1834
eabe0444 1835done:
8a57d9d6 1836 mb_set_largest_free_order(sb, e4b->bd_info);
196e402a 1837 mb_update_avg_fragment_size(sb, e4b->bd_info);
c9de560d 1838 mb_check_buddy(e4b);
c9de560d
AT
1839}
1840
15c006a2 1841static int mb_find_extent(struct ext4_buddy *e4b, int block,
c9de560d
AT
1842 int needed, struct ext4_free_extent *ex)
1843{
1844 int next = block;
15c006a2 1845 int max, order;
c9de560d
AT
1846 void *buddy;
1847
bc8e6740 1848 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
c9de560d
AT
1849 BUG_ON(ex == NULL);
1850
15c006a2 1851 buddy = mb_find_buddy(e4b, 0, &max);
c9de560d
AT
1852 BUG_ON(buddy == NULL);
1853 BUG_ON(block >= max);
1854 if (mb_test_bit(block, buddy)) {
1855 ex->fe_len = 0;
1856 ex->fe_start = 0;
1857 ex->fe_group = 0;
1858 return 0;
1859 }
1860
15c006a2
RD
1861 /* find actual order */
1862 order = mb_find_order_for_block(e4b, block);
1863 block = block >> order;
c9de560d
AT
1864
1865 ex->fe_len = 1 << order;
1866 ex->fe_start = block << order;
1867 ex->fe_group = e4b->bd_group;
1868
1869 /* calc difference from given start */
1870 next = next - ex->fe_start;
1871 ex->fe_len -= next;
1872 ex->fe_start += next;
1873
1874 while (needed > ex->fe_len &&
d8ec0c39 1875 mb_find_buddy(e4b, order, &max)) {
c9de560d
AT
1876
1877 if (block + 1 >= max)
1878 break;
1879
1880 next = (block + 1) * (1 << order);
c5e8f3f3 1881 if (mb_test_bit(next, e4b->bd_bitmap))
c9de560d
AT
1882 break;
1883
b051d8dc 1884 order = mb_find_order_for_block(e4b, next);
c9de560d 1885
c9de560d
AT
1886 block = next >> order;
1887 ex->fe_len += 1 << order;
1888 }
1889
31562b95 1890 if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) {
43c73221
TT
1891 /* Should never happen! (but apparently sometimes does?!?) */
1892 WARN_ON(1);
cd84bbba
SB
1893 ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0,
1894 "corruption or bug in mb_find_extent "
1895 "block=%d, order=%d needed=%d ex=%u/%d/%d@%u",
1896 block, order, needed, ex->fe_group, ex->fe_start,
1897 ex->fe_len, ex->fe_logical);
43c73221
TT
1898 ex->fe_len = 0;
1899 ex->fe_start = 0;
1900 ex->fe_group = 0;
1901 }
c9de560d
AT
1902 return ex->fe_len;
1903}
1904
1905static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1906{
1907 int ord;
1908 int mlen = 0;
1909 int max = 0;
1910 int cur;
1911 int start = ex->fe_start;
1912 int len = ex->fe_len;
1913 unsigned ret = 0;
1914 int len0 = len;
1915 void *buddy;
218a6944 1916 bool split = false;
c9de560d
AT
1917
1918 BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
1919 BUG_ON(e4b->bd_group != ex->fe_group);
bc8e6740 1920 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
c9de560d
AT
1921 mb_check_buddy(e4b);
1922 mb_mark_used_double(e4b, start, len);
1923
07b5b8e1 1924 this_cpu_inc(discard_pa_seq);
c9de560d
AT
1925 e4b->bd_info->bb_free -= len;
1926 if (e4b->bd_info->bb_first_free == start)
1927 e4b->bd_info->bb_first_free += len;
1928
1929 /* let's maintain fragments counter */
1930 if (start != 0)
c5e8f3f3 1931 mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
c9de560d 1932 if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
c5e8f3f3 1933 max = !mb_test_bit(start + len, e4b->bd_bitmap);
c9de560d
AT
1934 if (mlen && max)
1935 e4b->bd_info->bb_fragments++;
1936 else if (!mlen && !max)
1937 e4b->bd_info->bb_fragments--;
1938
1939 /* let's maintain buddy itself */
1940 while (len) {
218a6944 1941 if (!split)
1942 ord = mb_find_order_for_block(e4b, start);
c9de560d
AT
1943
1944 if (((start >> ord) << ord) == start && len >= (1 << ord)) {
1945 /* the whole chunk may be allocated at once! */
1946 mlen = 1 << ord;
218a6944 1947 if (!split)
1948 buddy = mb_find_buddy(e4b, ord, &max);
1949 else
1950 split = false;
c9de560d
AT
1951 BUG_ON((start >> ord) >= max);
1952 mb_set_bit(start >> ord, buddy);
1953 e4b->bd_info->bb_counters[ord]--;
1954 start += mlen;
1955 len -= mlen;
1956 BUG_ON(len < 0);
1957 continue;
1958 }
1959
1960 /* store for history */
1961 if (ret == 0)
1962 ret = len | (ord << 16);
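		/*
		 * Editorial note: the low 16 bits of 'ret' carry the length
		 * remaining at the first split and the upper bits the buddy
		 * order; ext4_mb_use_best_found() unpacks them into ac_tail
		 * and ac_buddy.
		 */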
1963
1964 /* we have to split large buddy */
1965 BUG_ON(ord <= 0);
1966 buddy = mb_find_buddy(e4b, ord, &max);
1967 mb_set_bit(start >> ord, buddy);
1968 e4b->bd_info->bb_counters[ord]--;
1969
1970 ord--;
1971 cur = (start >> ord) & ~1U;
1972 buddy = mb_find_buddy(e4b, ord, &max);
1973 mb_clear_bit(cur, buddy);
1974 mb_clear_bit(cur + 1, buddy);
1975 e4b->bd_info->bb_counters[ord]++;
1976 e4b->bd_info->bb_counters[ord]++;
218a6944 1977 split = true;
c9de560d 1978 }
8a57d9d6 1979 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
c9de560d 1980
196e402a 1981 mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info);
123e3016 1982 mb_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
c9de560d
AT
1983 mb_check_buddy(e4b);
1984
1985 return ret;
1986}
1987
1988/*
1989 * Must be called under group lock!
1990 */
1991static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1992 struct ext4_buddy *e4b)
1993{
1994 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1995 int ret;
1996
1997 BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
1998 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1999
2000 ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
2001 ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
2002 ret = mb_mark_used(e4b, &ac->ac_b_ex);
2003
2004 /* preallocation can change ac_b_ex, thus we store actually
2005 * allocated blocks for history */
2006 ac->ac_f_ex = ac->ac_b_ex;
2007
2008 ac->ac_status = AC_STATUS_FOUND;
2009 ac->ac_tail = ret & 0xffff;
2010 ac->ac_buddy = ret >> 16;
2011
c3a326a6
AK
2012 /*
2013 * take the page reference. We want the page to be pinned
2014 	 * so that we don't get an ext4_mb_init_cache() call for this
2015 * group until we update the bitmap. That would mean we
2016 * double allocate blocks. The reference is dropped
2017 * in ext4_mb_release_context
2018 */
c9de560d
AT
2019 ac->ac_bitmap_page = e4b->bd_bitmap_page;
2020 get_page(ac->ac_bitmap_page);
2021 ac->ac_buddy_page = e4b->bd_buddy_page;
2022 get_page(ac->ac_buddy_page);
c9de560d 2023 /* store last allocated for subsequent stream allocation */
4ba74d00 2024 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
c9de560d
AT
2025 spin_lock(&sbi->s_md_lock);
2026 sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
2027 sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
2028 spin_unlock(&sbi->s_md_lock);
2029 }
53f86b17
RH
2030 /*
2031 * As we've just preallocated more space than
2032 	 * the user originally requested, we store the allocated
2033 * space in a special descriptor.
2034 */
2035 if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
2036 ext4_mb_new_preallocation(ac);
2037
c9de560d
AT
2038}
2039
c9de560d
AT
2040static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
2041 struct ext4_buddy *e4b,
2042 int finish_group)
2043{
2044 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2045 struct ext4_free_extent *bex = &ac->ac_b_ex;
2046 struct ext4_free_extent *gex = &ac->ac_g_ex;
c9de560d 2047
032115fc
AK
2048 if (ac->ac_status == AC_STATUS_FOUND)
2049 return;
c9de560d
AT
2050 /*
2051 * We don't want to scan for a whole year
2052 */
2053 if (ac->ac_found > sbi->s_mb_max_to_scan &&
2054 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2055 ac->ac_status = AC_STATUS_BREAK;
2056 return;
2057 }
2058
2059 /*
2060 * Haven't found good chunk so far, let's continue
2061 */
2062 if (bex->fe_len < gex->fe_len)
2063 return;
2064
78dc9f84
KS
2065 if (finish_group)
2066 ext4_mb_use_best_found(ac, e4b);
c9de560d
AT
2067}
2068
2069/*
2070 * The routine checks whether the found extent is good enough. If it is,
2071 * then the extent gets marked used and a flag is set in the context
2072 * to stop scanning. Otherwise, the extent is compared with the
2073 * previously found extent and, if the new one is better, it is stored
2074 * in the context. Later, the best found extent will be used if
2075 * mballoc can't find a good enough extent.
2076 *
2077 * FIXME: real allocation policy is to be designed yet!
2078 */
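/*
 * Illustrative walk-through of the policy above (values are hypothetical):
 * with a goal of 8 clusters, a found 8-cluster extent is used immediately;
 * if only a 5-cluster extent has been seen so far, a later 7-cluster extent
 * replaces it (request not yet satisfied, so bigger is better); once a
 * 10-cluster extent is stored, a later 9-cluster extent replaces it
 * (request already satisfied, so smaller but still sufficient is better).
 */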
2079static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
2080 struct ext4_free_extent *ex,
2081 struct ext4_buddy *e4b)
2082{
2083 struct ext4_free_extent *bex = &ac->ac_b_ex;
2084 struct ext4_free_extent *gex = &ac->ac_g_ex;
2085
2086 BUG_ON(ex->fe_len <= 0);
7137d7a4
TT
2087 BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
2088 BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
c9de560d
AT
2089 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
2090
2091 ac->ac_found++;
2092
2093 /*
2094 * The special case - take what you catch first
2095 */
2096 if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2097 *bex = *ex;
2098 ext4_mb_use_best_found(ac, e4b);
2099 return;
2100 }
2101
2102 /*
2103 	 * Let's check whether the chunk is good enough
2104 */
2105 if (ex->fe_len == gex->fe_len) {
2106 *bex = *ex;
2107 ext4_mb_use_best_found(ac, e4b);
2108 return;
2109 }
2110
2111 /*
2112 * If this is first found extent, just store it in the context
2113 */
2114 if (bex->fe_len == 0) {
2115 *bex = *ex;
2116 return;
2117 }
2118
2119 /*
2120 * If new found extent is better, store it in the context
2121 */
2122 if (bex->fe_len < gex->fe_len) {
2123 /* if the request isn't satisfied, any found extent
2124 		 * larger than the previous best one is better */
2125 if (ex->fe_len > bex->fe_len)
2126 *bex = *ex;
2127 } else if (ex->fe_len > gex->fe_len) {
2128 /* if the request is satisfied, then we try to find
2129 		 * an extent that still satisfies the request, but is
2130 		 * smaller than the previous one */
2131 if (ex->fe_len < bex->fe_len)
2132 *bex = *ex;
2133 }
2134
2135 ext4_mb_check_limits(ac, e4b, 0);
2136}
2137
089ceecc 2138static noinline_for_stack
85b67ffb 2139void ext4_mb_try_best_found(struct ext4_allocation_context *ac,
c9de560d
AT
2140 struct ext4_buddy *e4b)
2141{
2142 struct ext4_free_extent ex = ac->ac_b_ex;
2143 ext4_group_t group = ex.fe_group;
2144 int max;
2145 int err;
2146
2147 BUG_ON(ex.fe_len <= 0);
2148 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
2149 if (err)
85b67ffb 2150 return;
c9de560d
AT
2151
2152 ext4_lock_group(ac->ac_sb, group);
15c006a2 2153 max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
c9de560d
AT
2154
2155 if (max > 0) {
2156 ac->ac_b_ex = ex;
2157 ext4_mb_use_best_found(ac, e4b);
2158 }
2159
2160 ext4_unlock_group(ac->ac_sb, group);
e39e07fd 2161 ext4_mb_unload_buddy(e4b);
c9de560d
AT
2162}
2163
089ceecc
ES
2164static noinline_for_stack
2165int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
c9de560d
AT
2166 struct ext4_buddy *e4b)
2167{
2168 ext4_group_t group = ac->ac_g_ex.fe_group;
2169 int max;
2170 int err;
2171 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
838cd0cf 2172 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
c9de560d
AT
2173 struct ext4_free_extent ex;
2174
5354b2af
TT
2175 if (!grp)
2176 return -EFSCORRUPTED;
01e4ca29 2177 if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY)))
c9de560d 2178 return 0;
838cd0cf
YY
2179 if (grp->bb_free == 0)
2180 return 0;
c9de560d
AT
2181
2182 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
2183 if (err)
2184 return err;
2185
163a203d
DW
2186 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
2187 ext4_mb_unload_buddy(e4b);
2188 return 0;
2189 }
2190
c9de560d 2191 ext4_lock_group(ac->ac_sb, group);
15c006a2 2192 max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
c9de560d 2193 ac->ac_g_ex.fe_len, &ex);
ab0c00fc 2194 ex.fe_logical = 0xDEADFA11; /* debug value */
c9de560d
AT
2195
2196 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
2197 ext4_fsblk_t start;
2198
5661bd68
AM
2199 start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
2200 ex.fe_start;
c9de560d
AT
2201 /* use do_div to get remainder (would be 64-bit modulo) */
2202 if (do_div(start, sbi->s_stripe) == 0) {
2203 ac->ac_found++;
2204 ac->ac_b_ex = ex;
2205 ext4_mb_use_best_found(ac, e4b);
2206 }
2207 } else if (max >= ac->ac_g_ex.fe_len) {
2208 BUG_ON(ex.fe_len <= 0);
2209 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
2210 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
2211 ac->ac_found++;
2212 ac->ac_b_ex = ex;
2213 ext4_mb_use_best_found(ac, e4b);
2214 } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
2215 		/* Sometimes, the caller may want to merge even a small
2216 		 * number of blocks into an existing extent */
2217 BUG_ON(ex.fe_len <= 0);
2218 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
2219 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
2220 ac->ac_found++;
2221 ac->ac_b_ex = ex;
2222 ext4_mb_use_best_found(ac, e4b);
2223 }
2224 ext4_unlock_group(ac->ac_sb, group);
e39e07fd 2225 ext4_mb_unload_buddy(e4b);
c9de560d
AT
2226
2227 return 0;
2228}
2229
2230/*
2231 * The routine scans buddy structures (not the bitmap!) from the given
2232 * order up to the max order and tries to find a chunk big enough to satisfy the request
2233 */
089ceecc
ES
2234static noinline_for_stack
2235void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
c9de560d
AT
2236 struct ext4_buddy *e4b)
2237{
2238 struct super_block *sb = ac->ac_sb;
2239 struct ext4_group_info *grp = e4b->bd_info;
2240 void *buddy;
2241 int i;
2242 int k;
2243 int max;
2244
2245 BUG_ON(ac->ac_2order <= 0);
4b68f6df 2246 for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) {
c9de560d
AT
2247 if (grp->bb_counters[i] == 0)
2248 continue;
2249
2250 buddy = mb_find_buddy(e4b, i, &max);
19b8b035
TT
2251 if (WARN_RATELIMIT(buddy == NULL,
2252 "ext4: mb_simple_scan_group: mb_find_buddy failed, (%d)\n", i))
2253 continue;
c9de560d 2254
ffad0a44 2255 k = mb_find_next_zero_bit(buddy, max, 0);
eb576086
DM
2256 if (k >= max) {
2257 ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0,
2258 "%d free clusters of order %d. But found 0",
2259 grp->bb_counters[i], i);
2260 ext4_mark_group_bitmap_corrupted(ac->ac_sb,
2261 e4b->bd_group,
2262 EXT4_GROUP_INFO_BBITMAP_CORRUPT);
2263 break;
2264 }
c9de560d
AT
2265 ac->ac_found++;
2266
2267 ac->ac_b_ex.fe_len = 1 << i;
2268 ac->ac_b_ex.fe_start = k << i;
2269 ac->ac_b_ex.fe_group = e4b->bd_group;
2270
2271 ext4_mb_use_best_found(ac, e4b);
2272
53f86b17 2273 BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len);
c9de560d
AT
2274
2275 if (EXT4_SB(sb)->s_mb_stats)
2276 atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
2277
2278 break;
2279 }
2280}
2281
2282/*
2283 * The routine scans the group and measures all found extents.
2284 * In order to optimize scanning, the caller must pass the number of
2285 * free blocks in the group, so the routine knows the upper limit.
2286 */
089ceecc
ES
2287static noinline_for_stack
2288void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
c9de560d
AT
2289 struct ext4_buddy *e4b)
2290{
2291 struct super_block *sb = ac->ac_sb;
c5e8f3f3 2292 void *bitmap = e4b->bd_bitmap;
c9de560d
AT
2293 struct ext4_free_extent ex;
2294 int i;
2295 int free;
2296
2297 free = e4b->bd_info->bb_free;
907ea529
TT
2298 if (WARN_ON(free <= 0))
2299 return;
c9de560d
AT
2300
2301 i = e4b->bd_info->bb_first_free;
2302
2303 while (free && ac->ac_status == AC_STATUS_CONTINUE) {
ffad0a44 2304 i = mb_find_next_zero_bit(bitmap,
7137d7a4
TT
2305 EXT4_CLUSTERS_PER_GROUP(sb), i);
2306 if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
26346ff6 2307 /*
e56eb659 2308 			 * If we have a corrupt bitmap, we won't find any
26346ff6 2309 			 * free blocks even though the group info says we
b483bb77 2310 			 * have free blocks
26346ff6 2311 */
e29136f8 2312 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
53accfa9 2313 "%d free clusters as per "
fde4d95a 2314 "group info. But bitmap says 0",
26346ff6 2315 free);
736dedbb
WS
2316 ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
2317 EXT4_GROUP_INFO_BBITMAP_CORRUPT);
c9de560d
AT
2318 break;
2319 }
2320
15c006a2 2321 mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
907ea529
TT
2322 if (WARN_ON(ex.fe_len <= 0))
2323 break;
26346ff6 2324 if (free < ex.fe_len) {
e29136f8 2325 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
53accfa9 2326 "%d free clusters as per "
fde4d95a 2327 "group info. But got %d blocks",
26346ff6 2328 free, ex.fe_len);
736dedbb
WS
2329 ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
2330 EXT4_GROUP_INFO_BBITMAP_CORRUPT);
e56eb659
AK
2331 /*
2332 * The number of free blocks differs. This mostly
2333 			 * indicates that the bitmap is corrupt. So exit
2334 * without claiming the space.
2335 */
2336 break;
26346ff6 2337 }
ab0c00fc 2338 ex.fe_logical = 0xDEADC0DE; /* debug value */
c9de560d
AT
2339 ext4_mb_measure_extent(ac, &ex, e4b);
2340
2341 i += ex.fe_len;
2342 free -= ex.fe_len;
2343 }
2344
2345 ext4_mb_check_limits(ac, e4b, 1);
2346}
2347
2348/*
2349 * This is a special case for storage like raid5;
506bf2d8 2350 * we try to find stripe-aligned chunks for stripe-size-multiple requests
c9de560d 2351 */
089ceecc
ES
2352static noinline_for_stack
2353void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
c9de560d
AT
2354 struct ext4_buddy *e4b)
2355{
2356 struct super_block *sb = ac->ac_sb;
2357 struct ext4_sb_info *sbi = EXT4_SB(sb);
c5e8f3f3 2358 void *bitmap = e4b->bd_bitmap;
c9de560d
AT
2359 struct ext4_free_extent ex;
2360 ext4_fsblk_t first_group_block;
2361 ext4_fsblk_t a;
2362 ext4_grpblk_t i;
2363 int max;
2364
2365 BUG_ON(sbi->s_stripe == 0);
2366
2367 /* find first stripe-aligned block in group */
5661bd68
AM
2368 first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
2369
c9de560d
AT
2370 a = first_group_block + sbi->s_stripe - 1;
2371 do_div(a, sbi->s_stripe);
2372 i = (a * sbi->s_stripe) - first_group_block;
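	/*
	 * Worked example (hypothetical numbers): with s_stripe == 16 and
	 * first_group_block == 32770, a = ceil(32770 / 16) = 2049, so
	 * i = 2049 * 16 - 32770 = 14, i.e. the first stripe-aligned
	 * cluster sits at offset 14 within this group.
	 */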
2373
7137d7a4 2374 while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
c9de560d 2375 if (!mb_test_bit(i, bitmap)) {
15c006a2 2376 max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
c9de560d
AT
2377 if (max >= sbi->s_stripe) {
2378 ac->ac_found++;
ab0c00fc 2379 ex.fe_logical = 0xDEADF00D; /* debug value */
c9de560d
AT
2380 ac->ac_b_ex = ex;
2381 ext4_mb_use_best_found(ac, e4b);
2382 break;
2383 }
2384 }
2385 i += sbi->s_stripe;
2386 }
2387}
2388
42ac1848 2389/*
8ef123fe 2390 * This is also called BEFORE we load the buddy bitmap.
42ac1848 2391 * Returns either true or false, indicating whether the group
8ef123fe 2392 * is suitable for the allocation or not.
42ac1848 2393 */
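/*
 * Editorial summary of the criteria handled below: cr=0 demands a free
 * buddy chunk of at least order ac_2order (power-of-two allocation) plus
 * enough free clusters, cr=1 demands an average fragment size
 * (free/fragments) of at least the goal length, cr=2 only demands enough
 * free clusters in total, and cr=3 accepts any group with free space.
 */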
8ef123fe 2394static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
c9de560d
AT
2395 ext4_group_t group, int cr)
2396{
8ef123fe 2397 ext4_grpblk_t free, fragments;
a4912123 2398 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
c9de560d
AT
2399 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
2400
2401 BUG_ON(cr < 0 || cr >= 4);
8a57d9d6 2402
5354b2af 2403 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp) || !grp))
8ef123fe 2404 return false;
01fc48e8 2405
dddcd2f9 2406 free = grp->bb_free;
2407 if (free == 0)
8ef123fe 2408 return false;
c9de560d 2409
c9de560d 2410 fragments = grp->bb_fragments;
c9de560d 2411 if (fragments == 0)
8ef123fe 2412 return false;
c9de560d
AT
2413
2414 switch (cr) {
2415 case 0:
2416 BUG_ON(ac->ac_2order == 0);
c9de560d 2417
a4912123
TT
2418 /* Avoid using the first bg of a flexgroup for data files */
2419 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
2420 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
2421 ((group % flex_size) == 0))
8ef123fe 2422 return false;
a4912123 2423
dddcd2f9 2424 if (free < ac->ac_g_ex.fe_len)
2425 return false;
2426
4b68f6df 2427 if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb))
8ef123fe 2428 return true;
40ae3487
TT
2429
2430 if (grp->bb_largest_free_order < ac->ac_2order)
8ef123fe 2431 return false;
40ae3487 2432
8ef123fe 2433 return true;
c9de560d
AT
2434 case 1:
2435 if ((free / fragments) >= ac->ac_g_ex.fe_len)
8ef123fe 2436 return true;
c9de560d
AT
2437 break;
2438 case 2:
2439 if (free >= ac->ac_g_ex.fe_len)
8ef123fe 2440 return true;
c9de560d
AT
2441 break;
2442 case 3:
8ef123fe 2443 return true;
c9de560d
AT
2444 default:
2445 BUG();
2446 }
2447
8ef123fe
RH
2448 return false;
2449}
2450
2451/*
2452 * This could return negative error code if something goes wrong
2453 * during ext4_mb_init_group(). This should not be called with
2454 * ext4_lock_group() held.
a5fda113
TT
2455 *
2456 * Note: because we are conditionally operating with the group lock in
2457 * the EXT4_MB_STRICT_CHECK case, we need to fake out sparse in this
2458 * function using __acquire and __release. This means we need to be
2459 * super careful before messing with the error path handling via "goto
2460 * out"!
8ef123fe
RH
2461 */
2462static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
2463 ext4_group_t group, int cr)
2464{
2465 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
99377830 2466 struct super_block *sb = ac->ac_sb;
c1d2c7d4 2467 struct ext4_sb_info *sbi = EXT4_SB(sb);
99377830 2468 bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
8ef123fe
RH
2469 ext4_grpblk_t free;
2470 int ret = 0;
2471
5354b2af
TT
2472 if (!grp)
2473 return -EFSCORRUPTED;
a6c75eaf
HS
2474 if (sbi->s_mb_stats)
2475 atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);
a5fda113 2476 if (should_lock) {
99377830 2477 ext4_lock_group(sb, group);
a5fda113
TT
2478 __release(ext4_group_lock_ptr(sb, group));
2479 }
8ef123fe
RH
2480 free = grp->bb_free;
2481 if (free == 0)
2482 goto out;
2483 if (cr <= 2 && free < ac->ac_g_ex.fe_len)
2484 goto out;
2485 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
2486 goto out;
a5fda113
TT
2487 if (should_lock) {
2488 __acquire(ext4_group_lock_ptr(sb, group));
99377830 2489 ext4_unlock_group(sb, group);
a5fda113 2490 }
8ef123fe
RH
2491
2492 /* We only do this if the grp has never been initialized */
2493 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
c1d2c7d4
AZ
2494 struct ext4_group_desc *gdp =
2495 ext4_get_group_desc(sb, group, NULL);
2496 int ret;
2497
2498 /* cr=0/1 is a very optimistic search to find large
2499 * good chunks almost for free. If buddy data is not
2500 * ready, then this optimization makes no sense. But
2501 * we never skip the first block group in a flex_bg,
2502 * since this gets used for metadata block allocation,
2503 * and we want to make sure we locate metadata blocks
2504 * in the first block group in the flex_bg if possible.
2505 */
2506 if (cr < 2 &&
2507 (!sbi->s_log_groups_per_flex ||
2508 ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
2509 !(ext4_has_group_desc_csum(sb) &&
2510 (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))))
2511 return 0;
2512 ret = ext4_mb_init_group(sb, group, GFP_NOFS);
8ef123fe
RH
2513 if (ret)
2514 return ret;
2515 }
2516
a5fda113 2517 if (should_lock) {
99377830 2518 ext4_lock_group(sb, group);
a5fda113
TT
2519 __release(ext4_group_lock_ptr(sb, group));
2520 }
8ef123fe
RH
2521 ret = ext4_mb_good_group(ac, group, cr);
2522out:
a5fda113
TT
2523 if (should_lock) {
2524 __acquire(ext4_group_lock_ptr(sb, group));
99377830 2525 ext4_unlock_group(sb, group);
a5fda113 2526 }
8ef123fe 2527 return ret;
c9de560d
AT
2528}
2529
cfd73237
AZ
2530/*
2531 * Start prefetching @nr block bitmaps starting at @group.
2532 * Return the next group which needs to be prefetched.
2533 */
3d392b26
TT
2534ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
2535 unsigned int nr, int *cnt)
cfd73237
AZ
2536{
2537 ext4_group_t ngroups = ext4_get_groups_count(sb);
2538 struct buffer_head *bh;
2539 struct blk_plug plug;
2540
2541 blk_start_plug(&plug);
2542 while (nr-- > 0) {
2543 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
2544 NULL);
2545 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
2546
2547 /*
2548 * Prefetch block groups with free blocks; but don't
2549 * bother if it is marked uninitialized on disk, since
2550 * it won't require I/O to read. Also only try to
2551 * prefetch once, so we avoid getblk() call, which can
2552 * be expensive.
2553 */
5354b2af 2554 if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
cfd73237
AZ
2555 EXT4_MB_GRP_NEED_INIT(grp) &&
2556 ext4_free_group_clusters(sb, gdp) > 0 &&
2557 !(ext4_has_group_desc_csum(sb) &&
2558 (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
2559 bh = ext4_read_block_bitmap_nowait(sb, group, true);
2560 if (bh && !IS_ERR(bh)) {
2561 if (!buffer_uptodate(bh) && cnt)
2562 (*cnt)++;
2563 brelse(bh);
2564 }
2565 }
2566 if (++group >= ngroups)
2567 group = 0;
2568 }
2569 blk_finish_plug(&plug);
2570 return group;
2571}
2572
2573/*
2574 * Prefetching reads the block bitmap into the buffer cache; but we
2575 * need to make sure that the buddy bitmap in the page cache has been
2576 * initialized. Note that ext4_mb_init_group() will block if the I/O
2577 * is not yet completed, or indeed if the I/O was never initiated
2578 * because ext4_mb_prefetch did not start it.
2579 *
2580 * TODO: We should actually kick off the buddy bitmap setup in a work
2581 * queue when the buffer I/O is completed, so that we don't block
2582 * waiting for the block allocation bitmap read to finish when
2583 * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
2584 */
3d392b26
TT
2585void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
2586 unsigned int nr)
cfd73237 2587{
22fab984
KS
2588 struct ext4_group_desc *gdp;
2589 struct ext4_group_info *grp;
cfd73237 2590
22fab984 2591 while (nr-- > 0) {
cfd73237
AZ
2592 if (!group)
2593 group = ext4_get_groups_count(sb);
2594 group--;
22fab984 2595 gdp = ext4_get_group_desc(sb, group, NULL);
cfd73237
AZ
2596 grp = ext4_get_group_info(sb, group);
2597
5354b2af 2598 if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) &&
cfd73237
AZ
2599 ext4_free_group_clusters(sb, gdp) > 0 &&
2600 !(ext4_has_group_desc_csum(sb) &&
2601 (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
2602 if (ext4_mb_init_group(sb, group, GFP_NOFS))
2603 break;
2604 }
2605 }
2606}
2607
4ddfef7b
ES
2608static noinline_for_stack int
2609ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
c9de560d 2610{
cfd73237 2611 ext4_group_t prefetch_grp = 0, ngroups, group, i;
4fca50d4 2612 int cr = -1, new_cr;
42ac1848 2613 int err = 0, first_err = 0;
cfd73237 2614 unsigned int nr = 0, prefetch_ios = 0;
c9de560d
AT
2615 struct ext4_sb_info *sbi;
2616 struct super_block *sb;
2617 struct ext4_buddy e4b;
66d5e027 2618 int lost;
c9de560d
AT
2619
2620 sb = ac->ac_sb;
2621 sbi = EXT4_SB(sb);
8df9675f 2622 ngroups = ext4_get_groups_count(sb);
fb0a387d 2623 /* non-extent files are limited to low blocks/groups */
12e9b892 2624 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
fb0a387d
ES
2625 ngroups = sbi->s_blockfile_groups;
2626
c9de560d
AT
2627 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
2628
2629 /* first, try the goal */
2630 err = ext4_mb_find_by_goal(ac, &e4b);
2631 if (err || ac->ac_status == AC_STATUS_FOUND)
2632 goto out;
2633
2634 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
2635 goto out;
2636
2637 /*
e9a3cd48 2638 * ac->ac_2order is set only if the fe_len is a power of 2
2639 * if ac->ac_2order is set we also set criteria to 0 so that we
c9de560d
AT
2640 * try exact allocation using buddy.
2641 */
2642 i = fls(ac->ac_g_ex.fe_len);
2643 ac->ac_2order = 0;
2644 /*
2645 * We search using buddy data only if the order of the request
2646 	 * is greater than or equal to sbi->s_mb_order2_reqs.
b713a5ec 2647 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
d9b22cf9
JK
2648 * We also support searching for power-of-two requests only for
2649 	 * requests up to the maximum buddy size we have constructed.
c9de560d 2650 */
4b68f6df 2651 if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
c9de560d
AT
2652 /*
2653 * This should tell if fe_len is exactly power of 2
2654 */
2655 if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
1a5d5e5d 2656 ac->ac_2order = array_index_nospec(i - 1,
4b68f6df 2657 MB_NUM_ORDERS(sb));
c9de560d
AT
2658 }
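	/*
	 * Example (illustrative): fe_len == 64 gives i = fls(64) = 7 and
	 * 64 & ~(1 << 6) == 0, so ac_2order becomes 6; fe_len == 24 gives
	 * i = 5 but 24 & ~(1 << 4) != 0, so ac_2order stays 0 and the scan
	 * starts at cr=1 instead of cr=0.
	 */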
2659
4ba74d00
TT
2660 /* if stream allocation is enabled, use global goal */
2661 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
c9de560d
AT
2662 /* TBD: may be hot point */
2663 spin_lock(&sbi->s_md_lock);
2664 ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
2665 ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
2666 spin_unlock(&sbi->s_md_lock);
2667 }
4ba74d00 2668
c9de560d
AT
2669 	/* Let's just scan groups to find more or less suitable blocks */
2670 cr = ac->ac_2order ? 0 : 1;
2671 /*
2672 * cr == 0 try to get exact allocation,
2673 * cr == 3 try to get anything
2674 */
2675repeat:
2676 for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
2677 ac->ac_criteria = cr;
ed8f9c75
AK
2678 /*
2679 * searching for the right group start
2680 * from the goal value specified
2681 */
2682 group = ac->ac_g_ex.fe_group;
196e402a 2683 ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups;
cfd73237 2684 prefetch_grp = group;
ed8f9c75 2685
4fca50d4
JK
2686 for (i = 0, new_cr = cr; i < ngroups; i++,
2687 ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) {
2688 int ret = 0;
196e402a 2689
2ed5724d 2690 cond_resched();
196e402a
HS
2691 if (new_cr != cr) {
2692 cr = new_cr;
2693 goto repeat;
2694 }
c9de560d 2695
cfd73237
AZ
2696 /*
2697 * Batch reads of the block allocation bitmaps
2698 * to get multiple READs in flight; limit
2699 * prefetching at cr=0/1, otherwise mballoc can
2700 * spend a lot of time loading imperfect groups
2701 */
2702 if ((prefetch_grp == group) &&
2703 (cr > 1 ||
2704 prefetch_ios < sbi->s_mb_prefetch_limit)) {
2705 unsigned int curr_ios = prefetch_ios;
2706
2707 nr = sbi->s_mb_prefetch;
2708 if (ext4_has_feature_flex_bg(sb)) {
82ef1370
CX
2709 nr = 1 << sbi->s_log_groups_per_flex;
2710 nr -= group & (nr - 1);
2711 nr = min(nr, sbi->s_mb_prefetch);
cfd73237
AZ
2712 }
2713 prefetch_grp = ext4_mb_prefetch(sb, group,
2714 nr, &prefetch_ios);
2715 if (prefetch_ios == curr_ios)
2716 nr = 0;
2717 }
2718
8a57d9d6 2719 /* This now checks without needing the buddy page */
8ef123fe 2720 ret = ext4_mb_good_group_nolock(ac, group, cr);
42ac1848
LC
2721 if (ret <= 0) {
2722 if (!first_err)
2723 first_err = ret;
c9de560d 2724 continue;
42ac1848 2725 }
c9de560d 2726
c9de560d
AT
2727 err = ext4_mb_load_buddy(sb, group, &e4b);
2728 if (err)
2729 goto out;
2730
2731 ext4_lock_group(sb, group);
8a57d9d6
CW
2732
2733 /*
2734 * We need to check again after locking the
2735 * block group
2736 */
42ac1848 2737 ret = ext4_mb_good_group(ac, group, cr);
8ef123fe 2738 if (ret == 0) {
c9de560d 2739 ext4_unlock_group(sb, group);
e39e07fd 2740 ext4_mb_unload_buddy(&e4b);
c9de560d
AT
2741 continue;
2742 }
2743
2744 ac->ac_groups_scanned++;
d9b22cf9 2745 if (cr == 0)
c9de560d 2746 ext4_mb_simple_scan_group(ac, &e4b);
506bf2d8
ES
2747 else if (cr == 1 && sbi->s_stripe &&
2748 !(ac->ac_g_ex.fe_len % sbi->s_stripe))
c9de560d
AT
2749 ext4_mb_scan_aligned(ac, &e4b);
2750 else
2751 ext4_mb_complex_scan_group(ac, &e4b);
2752
2753 ext4_unlock_group(sb, group);
e39e07fd 2754 ext4_mb_unload_buddy(&e4b);
c9de560d
AT
2755
2756 if (ac->ac_status != AC_STATUS_CONTINUE)
2757 break;
2758 }
a6c75eaf
HS
2759 /* Processed all groups and haven't found blocks */
2760 if (sbi->s_mb_stats && i == ngroups)
2761 atomic64_inc(&sbi->s_bal_cX_failed[cr]);
c9de560d
AT
2762 }
2763
2764 if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
2765 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2766 /*
2767 * We've been searching too long. Let's try to allocate
2768 * the best chunk we've found so far
2769 */
c9de560d
AT
2770 ext4_mb_try_best_found(ac, &e4b);
2771 if (ac->ac_status != AC_STATUS_FOUND) {
2772 /*
2773 			 * Someone luckier has already allocated it.
2774 			 * The only thing we can do is just take the first
2775 			 * found block(s)
c9de560d 2776 */
66d5e027 2777 lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
2778 mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
c55ee7d2 2779 ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
2780 ac->ac_b_ex.fe_len, lost);
2781
c9de560d
AT
2782 ac->ac_b_ex.fe_group = 0;
2783 ac->ac_b_ex.fe_start = 0;
2784 ac->ac_b_ex.fe_len = 0;
2785 ac->ac_status = AC_STATUS_CONTINUE;
2786 ac->ac_flags |= EXT4_MB_HINT_FIRST;
2787 cr = 3;
c9de560d
AT
2788 goto repeat;
2789 }
2790 }
a6c75eaf
HS
2791
2792 if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND)
2793 atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
c9de560d 2794out:
42ac1848
LC
2795 if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
2796 err = first_err;
bbc4ec77 2797
d3df1453 2798 mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
bbc4ec77
RH
2799 ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
2800 ac->ac_flags, cr, err);
cfd73237
AZ
2801
2802 if (nr)
2803 ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
2804
c9de560d
AT
2805 return err;
2806}
2807
c9de560d
AT
2808static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2809{
359745d7 2810 struct super_block *sb = pde_data(file_inode(seq->file));
c9de560d
AT
2811 ext4_group_t group;
2812
8df9675f 2813 if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
c9de560d 2814 return NULL;
c9de560d 2815 group = *pos + 1;
a9df9a49 2816 return (void *) ((unsigned long) group);
c9de560d
AT
2817}
2818
2819static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2820{
359745d7 2821 struct super_block *sb = pde_data(file_inode(seq->file));
c9de560d
AT
2822 ext4_group_t group;
2823
2824 ++*pos;
8df9675f 2825 if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
c9de560d
AT
2826 return NULL;
2827 group = *pos + 1;
a9df9a49 2828 return (void *) ((unsigned long) group);
c9de560d
AT
2829}
2830
2831static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2832{
359745d7 2833 struct super_block *sb = pde_data(file_inode(seq->file));
a9df9a49 2834 ext4_group_t group = (ext4_group_t) ((unsigned long) v);
c9de560d 2835 int i;
1c8457ca 2836 int err, buddy_loaded = 0;
c9de560d 2837 struct ext4_buddy e4b;
1c8457ca 2838 struct ext4_group_info *grinfo;
2df2c340
AB
2839 unsigned char blocksize_bits = min_t(unsigned char,
2840 sb->s_blocksize_bits,
2841 EXT4_MAX_BLOCK_LOG_SIZE);
c9de560d
AT
2842 struct sg {
2843 struct ext4_group_info info;
b80b32b6 2844 ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
c9de560d
AT
2845 } sg;
2846
2847 group--;
2848 if (group == 0)
97b4af2f
RV
2849 seq_puts(seq, "#group: free frags first ["
2850 " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 "
802cf1f9 2851 " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n");
c9de560d 2852
b80b32b6
TT
2853 i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
2854 sizeof(struct ext4_group_info);
2855
1c8457ca 2856 grinfo = ext4_get_group_info(sb, group);
5354b2af
TT
2857 if (!grinfo)
2858 return 0;
1c8457ca
AK
2859 /* Load the group info in memory only if not already loaded. */
2860 if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
2861 err = ext4_mb_load_buddy(sb, group, &e4b);
2862 if (err) {
2863 seq_printf(seq, "#%-5u: I/O error\n", group);
2864 return 0;
2865 }
2866 buddy_loaded = 1;
c9de560d 2867 }
1c8457ca 2868
5354b2af 2869 memcpy(&sg, grinfo, i);
1c8457ca
AK
2870
2871 if (buddy_loaded)
2872 ext4_mb_unload_buddy(&e4b);
c9de560d 2873
a9df9a49 2874 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
c9de560d
AT
2875 sg.info.bb_fragments, sg.info.bb_first_free);
2876 for (i = 0; i <= 13; i++)
2df2c340 2877 seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
c9de560d 2878 sg.info.bb_counters[i] : 0);
e0d438c7 2879 seq_puts(seq, " ]\n");
c9de560d
AT
2880
2881 return 0;
2882}
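/*
 * Editorial note: each group is printed on one line of the form
 * "#<group>: <free> <frags> <first_free> [ <2^0> ... <2^13> ]", e.g.
 * (hypothetical values) "#12   : 28672 2     512   [ 0 0 1 0 ... ]".
 */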
2883
2884static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
2885{
2886}
2887
247dbed8 2888const struct seq_operations ext4_mb_seq_groups_ops = {
c9de560d
AT
2889 .start = ext4_mb_seq_groups_start,
2890 .next = ext4_mb_seq_groups_next,
2891 .stop = ext4_mb_seq_groups_stop,
2892 .show = ext4_mb_seq_groups_show,
2893};
2894
a6c75eaf
HS
2895int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
2896{
c30365b9 2897 struct super_block *sb = seq->private;
a6c75eaf
HS
2898 struct ext4_sb_info *sbi = EXT4_SB(sb);
2899
2900 seq_puts(seq, "mballoc:\n");
2901 if (!sbi->s_mb_stats) {
2902 seq_puts(seq, "\tmb stats collection turned off.\n");
2903 seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
2904 return 0;
2905 }
2906 seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
2907 seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
2908
2909 seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned));
2910
2911 seq_puts(seq, "\tcr0_stats:\n");
2912 seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0]));
2913 seq_printf(seq, "\t\tgroups_considered: %llu\n",
2914 atomic64_read(&sbi->s_bal_cX_groups_considered[0]));
2915 seq_printf(seq, "\t\tuseless_loops: %llu\n",
2916 atomic64_read(&sbi->s_bal_cX_failed[0]));
196e402a
HS
2917 seq_printf(seq, "\t\tbad_suggestions: %u\n",
2918 atomic_read(&sbi->s_bal_cr0_bad_suggestions));
a6c75eaf
HS
2919
2920 seq_puts(seq, "\tcr1_stats:\n");
2921 seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1]));
2922 seq_printf(seq, "\t\tgroups_considered: %llu\n",
2923 atomic64_read(&sbi->s_bal_cX_groups_considered[1]));
2924 seq_printf(seq, "\t\tuseless_loops: %llu\n",
2925 atomic64_read(&sbi->s_bal_cX_failed[1]));
196e402a
HS
2926 seq_printf(seq, "\t\tbad_suggestions: %u\n",
2927 atomic_read(&sbi->s_bal_cr1_bad_suggestions));
a6c75eaf
HS
2928
2929 seq_puts(seq, "\tcr2_stats:\n");
2930 seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2]));
2931 seq_printf(seq, "\t\tgroups_considered: %llu\n",
2932 atomic64_read(&sbi->s_bal_cX_groups_considered[2]));
2933 seq_printf(seq, "\t\tuseless_loops: %llu\n",
2934 atomic64_read(&sbi->s_bal_cX_failed[2]));
2935
2936 seq_puts(seq, "\tcr3_stats:\n");
2937 seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3]));
2938 seq_printf(seq, "\t\tgroups_considered: %llu\n",
2939 atomic64_read(&sbi->s_bal_cX_groups_considered[3]));
2940 seq_printf(seq, "\t\tuseless_loops: %llu\n",
2941 atomic64_read(&sbi->s_bal_cX_failed[3]));
2942 seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned));
2943 seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
2944 seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
2945 seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
2946 seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
2947
2948 seq_printf(seq, "\tbuddies_generated: %u/%u\n",
2949 atomic_read(&sbi->s_mb_buddies_generated),
2950 ext4_get_groups_count(sb));
2951 seq_printf(seq, "\tbuddies_time_used: %llu\n",
2952 atomic64_read(&sbi->s_mb_generation_time));
2953 seq_printf(seq, "\tpreallocated: %u\n",
2954 atomic_read(&sbi->s_mb_preallocated));
2955 seq_printf(seq, "\tdiscarded: %u\n",
2956 atomic_read(&sbi->s_mb_discarded));
2957 return 0;
2958}
2959
f68f4063 2960static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos)
a5fda113 2961__acquires(&EXT4_SB(sb)->s_mb_rb_lock)
f68f4063 2962{
359745d7 2963 struct super_block *sb = pde_data(file_inode(seq->file));
f68f4063
HS
2964 unsigned long position;
2965
83e80a6e 2966 if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb))
f68f4063
HS
2967 return NULL;
2968 position = *pos + 1;
2969 return (void *) ((unsigned long) position);
2970}
2971
2972static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos)
2973{
359745d7 2974 struct super_block *sb = pde_data(file_inode(seq->file));
f68f4063
HS
2975 unsigned long position;
2976
2977 ++*pos;
83e80a6e 2978 if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb))
f68f4063
HS
2979 return NULL;
2980 position = *pos + 1;
2981 return (void *) ((unsigned long) position);
2982}
2983
2984static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
2985{
359745d7 2986 struct super_block *sb = pde_data(file_inode(seq->file));
f68f4063
HS
2987 struct ext4_sb_info *sbi = EXT4_SB(sb);
2988 unsigned long position = ((unsigned long) v);
2989 struct ext4_group_info *grp;
83e80a6e 2990 unsigned int count;
f68f4063
HS
2991
2992 position--;
2993 if (position >= MB_NUM_ORDERS(sb)) {
83e80a6e
JK
2994 position -= MB_NUM_ORDERS(sb);
2995 if (position == 0)
2996 seq_puts(seq, "avg_fragment_size_lists:\n");
f68f4063 2997
83e80a6e
JK
2998 count = 0;
2999 read_lock(&sbi->s_mb_avg_fragment_size_locks[position]);
3000 list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position],
3001 bb_avg_fragment_size_node)
3002 count++;
3003 read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]);
3004 seq_printf(seq, "\tlist_order_%u_groups: %u\n",
3005 (unsigned int)position, count);
f68f4063
HS
3006 return 0;
3007 }
3008
3009 if (position == 0) {
3010 seq_printf(seq, "optimize_scan: %d\n",
3011 test_opt2(sb, MB_OPTIMIZE_SCAN) ? 1 : 0);
3012 seq_puts(seq, "max_free_order_lists:\n");
3013 }
3014 count = 0;
83e80a6e 3015 read_lock(&sbi->s_mb_largest_free_orders_locks[position]);
f68f4063
HS
3016 list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position],
3017 bb_largest_free_order_node)
3018 count++;
83e80a6e 3019 read_unlock(&sbi->s_mb_largest_free_orders_locks[position]);
f68f4063
HS
3020 seq_printf(seq, "\tlist_order_%u_groups: %u\n",
3021 (unsigned int)position, count);
3022
3023 return 0;
3024}
3025
3026static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v)
3027{
f68f4063
HS
3028}
3029
3030const struct seq_operations ext4_mb_seq_structs_summary_ops = {
3031 .start = ext4_mb_seq_structs_summary_start,
3032 .next = ext4_mb_seq_structs_summary_next,
3033 .stop = ext4_mb_seq_structs_summary_stop,
3034 .show = ext4_mb_seq_structs_summary_show,
3035};
3036
fb1813f4
CW
3037static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
3038{
3039 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
3040 struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
3041
3042 BUG_ON(!cachep);
3043 return cachep;
3044}
5f21b0e6 3045
28623c2f
TT
3046/*
3047 * Allocate the top-level s_group_info array for the specified number
3048 * of groups
3049 */
3050int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
3051{
3052 struct ext4_sb_info *sbi = EXT4_SB(sb);
3053 unsigned size;
df3da4ea 3054 struct ext4_group_info ***old_groupinfo, ***new_groupinfo;
28623c2f
TT
3055
3056 size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
3057 EXT4_DESC_PER_BLOCK_BITS(sb);
3058 if (size <= sbi->s_group_info_size)
3059 return 0;
3060
3061 size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
a7c3e901 3062 new_groupinfo = kvzalloc(size, GFP_KERNEL);
28623c2f
TT
3063 if (!new_groupinfo) {
3064 ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
3065 return -ENOMEM;
3066 }
df3da4ea
SJS
3067 rcu_read_lock();
3068 old_groupinfo = rcu_dereference(sbi->s_group_info);
3069 if (old_groupinfo)
3070 memcpy(new_groupinfo, old_groupinfo,
28623c2f 3071 sbi->s_group_info_size * sizeof(*sbi->s_group_info));
df3da4ea
SJS
3072 rcu_read_unlock();
3073 rcu_assign_pointer(sbi->s_group_info, new_groupinfo);
28623c2f 3074 sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
df3da4ea
SJS
3075 if (old_groupinfo)
3076 ext4_kvfree_array_rcu(old_groupinfo);
666245d9 3077 ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
28623c2f
TT
3078 sbi->s_group_info_size);
3079 return 0;
3080}
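/*
 * Editorial note: s_group_info is a two-level table; each top-level slot
 * covers EXT4_DESC_PER_BLOCK(sb) groups (for example 64 groups with 4K
 * blocks and 64-byte descriptors, or 128 with 32-byte descriptors), which
 * is why the array only needs to grow when ngroups crosses such a boundary.
 */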
3081
5f21b0e6 3082/* Create and initialize ext4_group_info data for the given group. */
920313a7 3083int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
5f21b0e6
FB
3084 struct ext4_group_desc *desc)
3085{
fb1813f4 3086 int i;
5f21b0e6 3087 int metalen = 0;
df3da4ea 3088 int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb);
5f21b0e6
FB
3089 struct ext4_sb_info *sbi = EXT4_SB(sb);
3090 struct ext4_group_info **meta_group_info;
fb1813f4 3091 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
5f21b0e6
FB
3092
3093 /*
3094 * First check if this group is the first of a reserved block.
3095 * If it's true, we have to allocate a new table of pointers
3096 * to ext4_group_info structures
3097 */
3098 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
3099 metalen = sizeof(*meta_group_info) <<
3100 EXT4_DESC_PER_BLOCK_BITS(sb);
4fdb5543 3101 meta_group_info = kmalloc(metalen, GFP_NOFS);
5f21b0e6 3102 if (meta_group_info == NULL) {
7f6a11e7 3103 ext4_msg(sb, KERN_ERR, "can't allocate mem "
9d8b9ec4 3104 "for a buddy group");
df119095 3105 return -ENOMEM;
5f21b0e6 3106 }
df3da4ea
SJS
3107 rcu_read_lock();
3108 rcu_dereference(sbi->s_group_info)[idx] = meta_group_info;
3109 rcu_read_unlock();
5f21b0e6
FB
3110 }
3111
df3da4ea 3112 meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx);
5f21b0e6
FB
3113 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
3114
4fdb5543 3115 meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
5f21b0e6 3116 if (meta_group_info[i] == NULL) {
7f6a11e7 3117 ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
5f21b0e6
FB
3118 goto exit_group_info;
3119 }
3120 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
3121 &(meta_group_info[i]->bb_state));
3122
3123 /*
3124 * initialize bb_free to be able to skip
3125 * empty groups without initialization
3126 */
8844618d
TT
3127 if (ext4_has_group_desc_csum(sb) &&
3128 (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
5f21b0e6 3129 meta_group_info[i]->bb_free =
cff1dfd7 3130 ext4_free_clusters_after_init(sb, group, desc);
5f21b0e6
FB
3131 } else {
3132 meta_group_info[i]->bb_free =
021b65bb 3133 ext4_free_group_clusters(sb, desc);
5f21b0e6
FB
3134 }
3135
3136 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
920313a7 3137 init_rwsem(&meta_group_info[i]->alloc_sem);
64e290ec 3138 meta_group_info[i]->bb_free_root = RB_ROOT;
196e402a 3139 INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node);
83e80a6e 3140 INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node);
8a57d9d6 3141 meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
83e80a6e 3142 meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */
196e402a 3143 meta_group_info[i]->bb_group = group;
5f21b0e6 3144
a3450215 3145 mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
5f21b0e6
FB
3146 return 0;
3147
3148exit_group_info:
3149 /* If a meta_group_info table has been allocated, release it now */
caaf7a29 3150 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
df3da4ea
SJS
3151 struct ext4_group_info ***group_info;
3152
3153 rcu_read_lock();
3154 group_info = rcu_dereference(sbi->s_group_info);
3155 kfree(group_info[idx]);
3156 group_info[idx] = NULL;
3157 rcu_read_unlock();
caaf7a29 3158 }
5f21b0e6
FB
3159 return -ENOMEM;
3160} /* ext4_mb_add_groupinfo */
3161
c9de560d
AT
3162static int ext4_mb_init_backend(struct super_block *sb)
3163{
8df9675f 3164 ext4_group_t ngroups = ext4_get_groups_count(sb);
c9de560d 3165 ext4_group_t i;
c9de560d 3166 struct ext4_sb_info *sbi = EXT4_SB(sb);
28623c2f 3167 int err;
5f21b0e6 3168 struct ext4_group_desc *desc;
df3da4ea 3169 struct ext4_group_info ***group_info;
fb1813f4 3170 struct kmem_cache *cachep;
5f21b0e6 3171
28623c2f
TT
3172 err = ext4_mb_alloc_groupinfo(sb, ngroups);
3173 if (err)
3174 return err;
c9de560d 3175
c9de560d
AT
3176 sbi->s_buddy_cache = new_inode(sb);
3177 if (sbi->s_buddy_cache == NULL) {
9d8b9ec4 3178 ext4_msg(sb, KERN_ERR, "can't get new inode");
c9de560d
AT
3179 goto err_freesgi;
3180 }
48e6061b
YJ
3181 	/* To avoid potentially colliding with a valid on-disk inode number,
3182 * use EXT4_BAD_INO for the buddy cache inode number. This inode is
3183 * not in the inode hash, so it should never be found by iget(), but
3184 * this will avoid confusion if it ever shows up during debugging. */
3185 sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
c9de560d 3186 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
8df9675f 3187 for (i = 0; i < ngroups; i++) {
4b99faa2 3188 cond_resched();
c9de560d
AT
3189 desc = ext4_get_group_desc(sb, i, NULL);
3190 if (desc == NULL) {
9d8b9ec4 3191 ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
c9de560d
AT
3192 goto err_freebuddy;
3193 }
5f21b0e6
FB
3194 if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
3195 goto err_freebuddy;
c9de560d
AT
3196 }
3197
cfd73237 3198 if (ext4_has_feature_flex_bg(sb)) {
f91436d5
ST
3199 /* a single flex group is supposed to be read by a single IO.
3200 * 2 ^ s_log_groups_per_flex != UINT_MAX as s_mb_prefetch is
3201 		 * an unsigned integer, so the maximum shift is 32.
3202 */
3203 if (sbi->s_es->s_log_groups_per_flex >= 32) {
3204 ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group");
a8867f4e 3205 goto err_freebuddy;
f91436d5
ST
3206 }
3207 sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex,
82ef1370 3208 BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
cfd73237
AZ
3209 sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
3210 } else {
3211 sbi->s_mb_prefetch = 32;
3212 }
3213 if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
3214 sbi->s_mb_prefetch = ext4_get_groups_count(sb);
3215 	/* how many real IOs to prefetch within a single allocation at cr=0.
3216 	 * Given that cr=0 is a CPU-related optimization we shouldn't try to
3217 	 * load too many groups; at some point we should start to use what
3218 	 * we've got in memory.
3219 	 * With an average random access time of 5ms, it'd take a second to get
3220 	 * 200 groups (* N with flex_bg), so let's make this limit 4
3221 */
3222 sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
3223 if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
3224 sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
3225
c9de560d
AT
3226 return 0;
3227
3228err_freebuddy:
fb1813f4 3229 cachep = get_groupinfo_cache(sb->s_blocksize_bits);
5354b2af
TT
3230 while (i-- > 0) {
3231 struct ext4_group_info *grp = ext4_get_group_info(sb, i);
3232
3233 if (grp)
3234 kmem_cache_free(cachep, grp);
3235 }
28623c2f 3236 i = sbi->s_group_info_size;
df3da4ea
SJS
3237 rcu_read_lock();
3238 group_info = rcu_dereference(sbi->s_group_info);
f1fa3342 3239 while (i-- > 0)
df3da4ea
SJS
3240 kfree(group_info[i]);
3241 rcu_read_unlock();
c9de560d
AT
3242 iput(sbi->s_buddy_cache);
3243err_freesgi:
df3da4ea
SJS
3244 rcu_read_lock();
3245 kvfree(rcu_dereference(sbi->s_group_info));
3246 rcu_read_unlock();
c9de560d
AT
3247 return -ENOMEM;
3248}
3249
2892c15d
ES
3250static void ext4_groupinfo_destroy_slabs(void)
3251{
3252 int i;
3253
3254 for (i = 0; i < NR_GRPINFO_CACHES; i++) {
21c580d8 3255 kmem_cache_destroy(ext4_groupinfo_caches[i]);
2892c15d
ES
3256 ext4_groupinfo_caches[i] = NULL;
3257 }
3258}
3259
3260static int ext4_groupinfo_create_slab(size_t size)
3261{
3262 static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
3263 int slab_size;
3264 int blocksize_bits = order_base_2(size);
3265 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
3266 struct kmem_cache *cachep;
3267
3268 if (cache_index >= NR_GRPINFO_CACHES)
3269 return -EINVAL;
3270
3271 if (unlikely(cache_index < 0))
3272 cache_index = 0;
3273
3274 mutex_lock(&ext4_grpinfo_slab_create_mutex);
3275 if (ext4_groupinfo_caches[cache_index]) {
3276 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
3277 return 0; /* Already created */
3278 }
3279
3280 slab_size = offsetof(struct ext4_group_info,
3281 bb_counters[blocksize_bits + 2]);
3282
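	/*
	 * For example, with a 4k block size (blocksize_bits = 12) the slab
	 * object holds a struct ext4_group_info followed by
	 * bb_counters[0..13], i.e. one free-extent counter per buddy order.
	 */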
3283 cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
3284 slab_size, 0, SLAB_RECLAIM_ACCOUNT,
3285 NULL);
3286
823ba01f
TM
3287 ext4_groupinfo_caches[cache_index] = cachep;
3288
2892c15d
ES
3289 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
3290 if (!cachep) {
9d8b9ec4
TT
3291 printk(KERN_EMERG
3292 "EXT4-fs: no memory for groupinfo slab cache\n");
2892c15d
ES
3293 return -ENOMEM;
3294 }
3295
2892c15d
ES
3296 return 0;
3297}
3298
55cdd0af
WJ
3299static void ext4_discard_work(struct work_struct *work)
3300{
3301 struct ext4_sb_info *sbi = container_of(work,
3302 struct ext4_sb_info, s_discard_work);
3303 struct super_block *sb = sbi->s_sb;
3304 struct ext4_free_data *fd, *nfd;
3305 struct ext4_buddy e4b;
3306 struct list_head discard_list;
3307 ext4_group_t grp, load_grp;
3308 int err = 0;
3309
3310 INIT_LIST_HEAD(&discard_list);
3311 spin_lock(&sbi->s_md_lock);
3312 list_splice_init(&sbi->s_discard_list, &discard_list);
3313 spin_unlock(&sbi->s_md_lock);
3314
3315 load_grp = UINT_MAX;
3316 list_for_each_entry_safe(fd, nfd, &discard_list, efd_list) {
3317 /*
5036ab8d
WJ
3318		 * If the filesystem is unmounting, we are out of memory, or
3319		 * suffering from no space, give up the discard
55cdd0af 3320 */
5036ab8d
WJ
3321 if ((sb->s_flags & SB_ACTIVE) && !err &&
3322 !atomic_read(&sbi->s_retry_alloc_pending)) {
55cdd0af
WJ
3323 grp = fd->efd_group;
3324 if (grp != load_grp) {
3325 if (load_grp != UINT_MAX)
3326 ext4_mb_unload_buddy(&e4b);
3327
3328 err = ext4_mb_load_buddy(sb, grp, &e4b);
3329 if (err) {
3330 kmem_cache_free(ext4_free_data_cachep, fd);
3331 load_grp = UINT_MAX;
3332 continue;
3333 } else {
3334 load_grp = grp;
3335 }
3336 }
3337
3338 ext4_lock_group(sb, grp);
3339 ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster,
3340 fd->efd_start_cluster + fd->efd_count - 1, 1);
3341 ext4_unlock_group(sb, grp);
3342 }
3343 kmem_cache_free(ext4_free_data_cachep, fd);
3344 }
3345
3346 if (load_grp != UINT_MAX)
3347 ext4_mb_unload_buddy(&e4b);
3348}
3349
9d99012f 3350int ext4_mb_init(struct super_block *sb)
c9de560d
AT
3351{
3352 struct ext4_sb_info *sbi = EXT4_SB(sb);
6be2ded1 3353 unsigned i, j;
935244cd 3354 unsigned offset, offset_incr;
c9de560d 3355 unsigned max;
74767c5a 3356 int ret;
c9de560d 3357
4b68f6df 3358 i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets);
c9de560d
AT
3359
3360 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
3361 if (sbi->s_mb_offsets == NULL) {
fb1813f4
CW
3362 ret = -ENOMEM;
3363 goto out;
c9de560d 3364 }
ff7ef329 3365
4b68f6df 3366 i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs);
c9de560d
AT
3367 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
3368 if (sbi->s_mb_maxs == NULL) {
fb1813f4
CW
3369 ret = -ENOMEM;
3370 goto out;
3371 }
3372
2892c15d
ES
3373 ret = ext4_groupinfo_create_slab(sb->s_blocksize);
3374 if (ret < 0)
3375 goto out;
c9de560d
AT
3376
3377 /* order 0 is regular bitmap */
3378 sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
3379 sbi->s_mb_offsets[0] = 0;
3380
3381 i = 1;
3382 offset = 0;
935244cd 3383 offset_incr = 1 << (sb->s_blocksize_bits - 1);
c9de560d
AT
3384 max = sb->s_blocksize << 2;
3385 do {
3386 sbi->s_mb_offsets[i] = offset;
3387 sbi->s_mb_maxs[i] = max;
935244cd
NS
3388 offset += offset_incr;
3389 offset_incr = offset_incr >> 1;
c9de560d
AT
3390 max = max >> 1;
3391 i++;
4b68f6df
HS
3392 } while (i < MB_NUM_ORDERS(sb));
3393
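	/*
	 * Illustrative layout (assuming a 4k block size): s_mb_maxs[0] is
	 * 32768 bits for the on-disk block bitmap; within the buddy block,
	 * order 1 starts at byte offset 0 with 16384 bits, order 2 at offset
	 * 2048 with 8192 bits, order 3 at offset 3072 with 4096 bits, and so
	 * on, halving at each order up to MB_NUM_ORDERS(sb) - 1.
	 */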
83e80a6e
JK
3394 sbi->s_mb_avg_fragment_size =
3395 kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
3396 GFP_KERNEL);
3397 if (!sbi->s_mb_avg_fragment_size) {
3398 ret = -ENOMEM;
3399 goto out;
3400 }
3401 sbi->s_mb_avg_fragment_size_locks =
3402 kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
3403 GFP_KERNEL);
3404 if (!sbi->s_mb_avg_fragment_size_locks) {
3405 ret = -ENOMEM;
3406 goto out;
3407 }
3408 for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
3409 INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]);
3410 rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]);
3411 }
196e402a
HS
3412 sbi->s_mb_largest_free_orders =
3413 kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
3414 GFP_KERNEL);
3415 if (!sbi->s_mb_largest_free_orders) {
3416 ret = -ENOMEM;
3417 goto out;
3418 }
3419 sbi->s_mb_largest_free_orders_locks =
3420 kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
3421 GFP_KERNEL);
3422 if (!sbi->s_mb_largest_free_orders_locks) {
3423 ret = -ENOMEM;
3424 goto out;
3425 }
3426 for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
3427 INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]);
3428 rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]);
3429 }
c9de560d 3430
c9de560d 3431 spin_lock_init(&sbi->s_md_lock);
d08854f5 3432 sbi->s_mb_free_pending = 0;
a0154344 3433 INIT_LIST_HEAD(&sbi->s_freed_data_list);
55cdd0af
WJ
3434 INIT_LIST_HEAD(&sbi->s_discard_list);
3435 INIT_WORK(&sbi->s_discard_work, ext4_discard_work);
5036ab8d 3436 atomic_set(&sbi->s_retry_alloc_pending, 0);
c9de560d
AT
3437
3438 sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
3439 sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
3440 sbi->s_mb_stats = MB_DEFAULT_STATS;
3441 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
3442 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
27baebb8
TT
3443 /*
3444 * The default group preallocation is 512, which for 4k block
3445 * sizes translates to 2 megabytes. However for bigalloc file
3446	 * systems, this is probably too big (i.e., if the cluster size
3447 * is 1 megabyte, then group preallocation size becomes half a
3448 * gigabyte!). As a default, we will keep a two megabyte
3449	 * group prealloc size for cluster sizes up to 64k, and after
3450 * that, we will force a minimum group preallocation size of
3451 * 32 clusters. This translates to 8 megs when the cluster
3452 * size is 256k, and 32 megs when the cluster size is 1 meg,
3453 * which seems reasonable as a default.
3454 */
3455 sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
3456 sbi->s_cluster_bits, 32);
d7a1fee1
DE
3457 /*
3458 * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
3459 * to the lowest multiple of s_stripe which is bigger than
3460 * the s_mb_group_prealloc as determined above. We want
3461 * the preallocation size to be an exact multiple of the
3462 * RAID stripe size so that preallocations don't fragment
3463 * the stripes.
3464 */
3465 if (sbi->s_stripe > 1) {
3466 sbi->s_mb_group_prealloc = roundup(
3467 sbi->s_mb_group_prealloc, sbi->s_stripe);
3468 }
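	/*
	 * Example (illustrative numbers): with the default of 512 clusters
	 * and s_stripe = 48, roundup() raises s_mb_group_prealloc to 528,
	 * the smallest multiple of the stripe that is >= 512.
	 */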
c9de560d 3469
730c213c 3470 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
c9de560d 3471 if (sbi->s_locality_groups == NULL) {
fb1813f4 3472 ret = -ENOMEM;
029b10c5 3473 goto out;
c9de560d 3474 }
730c213c 3475 for_each_possible_cpu(i) {
c9de560d 3476 struct ext4_locality_group *lg;
730c213c 3477 lg = per_cpu_ptr(sbi->s_locality_groups, i);
c9de560d 3478 mutex_init(&lg->lg_mutex);
6be2ded1
AK
3479 for (j = 0; j < PREALLOC_TB_SIZE; j++)
3480 INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
c9de560d
AT
3481 spin_lock_init(&lg->lg_prealloc_lock);
3482 }
3483
10f0d2a5 3484 if (bdev_nonrot(sb->s_bdev))
196e402a
HS
3485 sbi->s_mb_max_linear_groups = 0;
3486 else
3487 sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT;
79a77c5a
YJ
3488 /* init file for buddy data */
3489 ret = ext4_mb_init_backend(sb);
7aa0baea
TM
3490 if (ret != 0)
3491 goto out_free_locality_groups;
79a77c5a 3492
7aa0baea
TM
3493 return 0;
3494
3495out_free_locality_groups:
3496 free_percpu(sbi->s_locality_groups);
3497 sbi->s_locality_groups = NULL;
fb1813f4 3498out:
83e80a6e
JK
3499 kfree(sbi->s_mb_avg_fragment_size);
3500 kfree(sbi->s_mb_avg_fragment_size_locks);
196e402a
HS
3501 kfree(sbi->s_mb_largest_free_orders);
3502 kfree(sbi->s_mb_largest_free_orders_locks);
7aa0baea
TM
3503 kfree(sbi->s_mb_offsets);
3504 sbi->s_mb_offsets = NULL;
3505 kfree(sbi->s_mb_maxs);
3506 sbi->s_mb_maxs = NULL;
fb1813f4 3507 return ret;
c9de560d
AT
3508}
3509
955ce5f5 3510/* needs to be called with the ext4 group lock held */
d3df1453 3511static int ext4_mb_cleanup_pa(struct ext4_group_info *grp)
c9de560d
AT
3512{
3513 struct ext4_prealloc_space *pa;
3514 struct list_head *cur, *tmp;
3515 int count = 0;
3516
3517 list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
3518 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
3519 list_del(&pa->pa_group_list);
3520 count++;
688f05a0 3521 kmem_cache_free(ext4_pspace_cachep, pa);
c9de560d 3522 }
d3df1453 3523 return count;
c9de560d
AT
3524}
3525
3526int ext4_mb_release(struct super_block *sb)
3527{
8df9675f 3528 ext4_group_t ngroups = ext4_get_groups_count(sb);
c9de560d
AT
3529 ext4_group_t i;
3530 int num_meta_group_infos;
df3da4ea 3531 struct ext4_group_info *grinfo, ***group_info;
c9de560d 3532 struct ext4_sb_info *sbi = EXT4_SB(sb);
fb1813f4 3533 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
d3df1453 3534 int count;
c9de560d 3535
55cdd0af
WJ
3536 if (test_opt(sb, DISCARD)) {
3537 /*
3538		 * wait for the discard work to drain all of the ext4_free_data entries
3539 */
3540 flush_work(&sbi->s_discard_work);
3541 WARN_ON_ONCE(!list_empty(&sbi->s_discard_list));
3542 }
3543
c9de560d 3544 if (sbi->s_group_info) {
8df9675f 3545 for (i = 0; i < ngroups; i++) {
4b99faa2 3546 cond_resched();
c9de560d 3547 grinfo = ext4_get_group_info(sb, i);
5354b2af
TT
3548 if (!grinfo)
3549 continue;
a3450215 3550 mb_group_bb_bitmap_free(grinfo);
c9de560d 3551 ext4_lock_group(sb, i);
d3df1453
RH
3552 count = ext4_mb_cleanup_pa(grinfo);
3553 if (count)
3554 mb_debug(sb, "mballoc: %d PAs left\n",
3555 count);
c9de560d 3556 ext4_unlock_group(sb, i);
fb1813f4 3557 kmem_cache_free(cachep, grinfo);
c9de560d 3558 }
8df9675f 3559 num_meta_group_infos = (ngroups +
c9de560d
AT
3560 EXT4_DESC_PER_BLOCK(sb) - 1) >>
3561 EXT4_DESC_PER_BLOCK_BITS(sb);
df3da4ea
SJS
3562 rcu_read_lock();
3563 group_info = rcu_dereference(sbi->s_group_info);
c9de560d 3564 for (i = 0; i < num_meta_group_infos; i++)
df3da4ea
SJS
3565 kfree(group_info[i]);
3566 kvfree(group_info);
3567 rcu_read_unlock();
c9de560d 3568 }
83e80a6e
JK
3569 kfree(sbi->s_mb_avg_fragment_size);
3570 kfree(sbi->s_mb_avg_fragment_size_locks);
196e402a
HS
3571 kfree(sbi->s_mb_largest_free_orders);
3572 kfree(sbi->s_mb_largest_free_orders_locks);
c9de560d
AT
3573 kfree(sbi->s_mb_offsets);
3574 kfree(sbi->s_mb_maxs);
bfcba2d0 3575 iput(sbi->s_buddy_cache);
c9de560d 3576 if (sbi->s_mb_stats) {
9d8b9ec4
TT
3577 ext4_msg(sb, KERN_INFO,
3578 "mballoc: %u blocks %u reqs (%u success)",
c9de560d
AT
3579 atomic_read(&sbi->s_bal_allocated),
3580 atomic_read(&sbi->s_bal_reqs),
3581 atomic_read(&sbi->s_bal_success));
9d8b9ec4 3582 ext4_msg(sb, KERN_INFO,
a6c75eaf 3583 "mballoc: %u extents scanned, %u groups scanned, %u goal hits, "
9d8b9ec4 3584 "%u 2^N hits, %u breaks, %u lost",
c9de560d 3585 atomic_read(&sbi->s_bal_ex_scanned),
a6c75eaf 3586 atomic_read(&sbi->s_bal_groups_scanned),
c9de560d
AT
3587 atomic_read(&sbi->s_bal_goals),
3588 atomic_read(&sbi->s_bal_2orders),
3589 atomic_read(&sbi->s_bal_breaks),
3590 atomic_read(&sbi->s_mb_lost_chunks));
9d8b9ec4 3591 ext4_msg(sb, KERN_INFO,
67d25186
HS
3592 "mballoc: %u generated and it took %llu",
3593 atomic_read(&sbi->s_mb_buddies_generated),
3594 atomic64_read(&sbi->s_mb_generation_time));
9d8b9ec4
TT
3595 ext4_msg(sb, KERN_INFO,
3596 "mballoc: %u preallocated, %u discarded",
c9de560d
AT
3597 atomic_read(&sbi->s_mb_preallocated),
3598 atomic_read(&sbi->s_mb_discarded));
3599 }
3600
730c213c 3601 free_percpu(sbi->s_locality_groups);
c9de560d
AT
3602
3603 return 0;
3604}
3605
77ca6cdf 3606static inline int ext4_issue_discard(struct super_block *sb,
a0154344
DJ
3607 ext4_group_t block_group, ext4_grpblk_t cluster, int count,
3608 struct bio **biop)
5c521830 3609{
5c521830
JZ
3610 ext4_fsblk_t discard_block;
3611
84130193
TT
3612 discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
3613 ext4_group_first_block_no(sb, block_group));
3614 count = EXT4_C2B(EXT4_SB(sb), count);
5c521830
JZ
3615 trace_ext4_discard_blocks(sb,
3616 (unsigned long long) discard_block, count);
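	/*
	 * Example (assuming a 4k block size): s_blocksize_bits - 9 == 3, so
	 * the shifts below multiply the block number and count by 8 to turn
	 * filesystem blocks into the 512-byte sectors the block layer expects.
	 */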
a0154344
DJ
3617 if (biop) {
3618 return __blkdev_issue_discard(sb->s_bdev,
3619 (sector_t)discard_block << (sb->s_blocksize_bits - 9),
3620 (sector_t)count << (sb->s_blocksize_bits - 9),
44abff2c 3621 GFP_NOFS, biop);
a0154344
DJ
3622 } else
3623 return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
5c521830
JZ
3624}
3625
a0154344
DJ
3626static void ext4_free_data_in_buddy(struct super_block *sb,
3627 struct ext4_free_data *entry)
c9de560d 3628{
c9de560d 3629 struct ext4_buddy e4b;
c894058d 3630 struct ext4_group_info *db;
c7f2bafa 3631 int err, count = 0;
c9de560d 3632
d3df1453 3633 mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
18aadd47 3634 entry->efd_count, entry->efd_group, entry);
c9de560d 3635
18aadd47
BJ
3636 err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
3637 /* we expect to find existing buddy because it's pinned */
3638 BUG_ON(err != 0);
b90f6870 3639
d08854f5
TT
3640 spin_lock(&EXT4_SB(sb)->s_md_lock);
3641 EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count;
3642 spin_unlock(&EXT4_SB(sb)->s_md_lock);
c9de560d 3643
18aadd47
BJ
3644 db = e4b.bd_info;
3645 /* there are blocks to put in buddy to make them really free */
3646 count += entry->efd_count;
18aadd47
BJ
3647 ext4_lock_group(sb, entry->efd_group);
3648 /* Take it out of per group rb tree */
3649 rb_erase(&entry->efd_node, &(db->bb_free_root));
3650 mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
c894058d 3651
18aadd47
BJ
3652 /*
3653 * Clear the trimmed flag for the group so that the next
3654 * ext4_trim_fs can trim it.
3655 * If the volume is mounted with -o discard, online discard
3656 * is supported and the free blocks will be trimmed online.
3657 */
3658 if (!test_opt(sb, DISCARD))
3659 EXT4_MB_GRP_CLEAR_TRIMMED(db);
3d56b8d2 3660
18aadd47
BJ
3661 if (!db->bb_free_root.rb_node) {
3662 /* No more items in the per group rb tree
3663 * balance refcounts from ext4_mb_free_metadata()
3664 */
09cbfeaf
KS
3665 put_page(e4b.bd_buddy_page);
3666 put_page(e4b.bd_bitmap_page);
3e624fc7 3667 }
18aadd47 3668 ext4_unlock_group(sb, entry->efd_group);
18aadd47 3669 ext4_mb_unload_buddy(&e4b);
c9de560d 3670
c7f2bafa 3671 mb_debug(sb, "freed %d blocks in 1 structures\n", count);
c9de560d
AT
3672}
3673
a0154344
DJ
3674/*
3675 * This function is called by the jbd2 layer once the commit has finished,
3676 * so we know we can free the blocks that were released with that commit.
3677 */
3678void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
3679{
3680 struct ext4_sb_info *sbi = EXT4_SB(sb);
3681 struct ext4_free_data *entry, *tmp;
a0154344
DJ
3682 struct list_head freed_data_list;
3683 struct list_head *cut_pos = NULL;
55cdd0af 3684 bool wake;
a0154344
DJ
3685
3686 INIT_LIST_HEAD(&freed_data_list);
3687
3688 spin_lock(&sbi->s_md_lock);
3689 list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) {
3690 if (entry->efd_tid != commit_tid)
3691 break;
3692 cut_pos = &entry->efd_list;
3693 }
3694 if (cut_pos)
3695 list_cut_position(&freed_data_list, &sbi->s_freed_data_list,
3696 cut_pos);
3697 spin_unlock(&sbi->s_md_lock);
3698
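	/*
	 * The entries are processed in commit order: the loop above stops at
	 * the first entry belonging to a later transaction, so only the data
	 * freed by transactions up to and including commit_tid has been
	 * spliced onto the local freed_data_list.
	 */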
55cdd0af
WJ
3699 list_for_each_entry(entry, &freed_data_list, efd_list)
3700 ext4_free_data_in_buddy(sb, entry);
a0154344 3701
55cdd0af
WJ
3702 if (test_opt(sb, DISCARD)) {
3703 spin_lock(&sbi->s_md_lock);
3704 wake = list_empty(&sbi->s_discard_list);
3705 list_splice_tail(&freed_data_list, &sbi->s_discard_list);
3706 spin_unlock(&sbi->s_md_lock);
3707 if (wake)
3708 queue_work(system_unbound_wq, &sbi->s_discard_work);
3709 } else {
3710 list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
3711 kmem_cache_free(ext4_free_data_cachep, entry);
a0154344 3712 }
a0154344
DJ
3713}
3714
5dabfc78 3715int __init ext4_init_mballoc(void)
c9de560d 3716{
16828088
TT
3717 ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
3718 SLAB_RECLAIM_ACCOUNT);
c9de560d 3719 if (ext4_pspace_cachep == NULL)
f283529a 3720 goto out;
c9de560d 3721
16828088
TT
3722 ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
3723 SLAB_RECLAIM_ACCOUNT);
f283529a
RH
3724 if (ext4_ac_cachep == NULL)
3725 goto out_pa_free;
c894058d 3726
18aadd47
BJ
3727 ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
3728 SLAB_RECLAIM_ACCOUNT);
f283529a
RH
3729 if (ext4_free_data_cachep == NULL)
3730 goto out_ac_free;
3731
c9de560d 3732 return 0;
f283529a
RH
3733
3734out_ac_free:
3735 kmem_cache_destroy(ext4_ac_cachep);
3736out_pa_free:
3737 kmem_cache_destroy(ext4_pspace_cachep);
3738out:
3739 return -ENOMEM;
c9de560d
AT
3740}
3741
5dabfc78 3742void ext4_exit_mballoc(void)
c9de560d 3743{
60e6679e 3744 /*
3e03f9ca
JDB
3745 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
3746 * before destroying the slab cache.
3747 */
3748 rcu_barrier();
c9de560d 3749 kmem_cache_destroy(ext4_pspace_cachep);
256bdb49 3750 kmem_cache_destroy(ext4_ac_cachep);
18aadd47 3751 kmem_cache_destroy(ext4_free_data_cachep);
2892c15d 3752 ext4_groupinfo_destroy_slabs();
c9de560d
AT
3753}
3754
3755
3756/*
73b2c716 3757 * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
c9de560d
AT
3758 * Returns 0 if success or error code
3759 */
4ddfef7b
ES
3760static noinline_for_stack int
3761ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
53accfa9 3762 handle_t *handle, unsigned int reserv_clstrs)
c9de560d
AT
3763{
3764 struct buffer_head *bitmap_bh = NULL;
c9de560d
AT
3765 struct ext4_group_desc *gdp;
3766 struct buffer_head *gdp_bh;
3767 struct ext4_sb_info *sbi;
3768 struct super_block *sb;
3769 ext4_fsblk_t block;
519deca0 3770 int err, len;
c9de560d
AT
3771
3772 BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3773 BUG_ON(ac->ac_b_ex.fe_len <= 0);
3774
3775 sb = ac->ac_sb;
3776 sbi = EXT4_SB(sb);
c9de560d 3777
574ca174 3778 bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
9008a58e 3779 if (IS_ERR(bitmap_bh)) {
fb28f9ce 3780 return PTR_ERR(bitmap_bh);
9008a58e 3781 }
c9de560d 3782
5d601255 3783 BUFFER_TRACE(bitmap_bh, "getting write access");
188c299e
JK
3784 err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
3785 EXT4_JTR_NONE);
c9de560d
AT
3786 if (err)
3787 goto out_err;
3788
3789 err = -EIO;
3790 gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
3791 if (!gdp)
3792 goto out_err;
3793
a9df9a49 3794 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
021b65bb 3795 ext4_free_group_clusters(sb, gdp));
03cddb80 3796
5d601255 3797 BUFFER_TRACE(gdp_bh, "get_write_access");
188c299e 3798 err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE);
c9de560d
AT
3799 if (err)
3800 goto out_err;
3801
bda00de7 3802 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
c9de560d 3803
53accfa9 3804 len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
ce9f24cc 3805 if (!ext4_inode_block_valid(ac->ac_inode, block, len)) {
12062ddd 3806 ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
1084f252 3807 "fs metadata", block, block+len);
519deca0 3808		/* The file system is mounted not to panic on error,
554a5ccc 3809		 * so fix the bitmap and return EFSCORRUPTED.
519deca0
AK
3810 * We leak some of the blocks here.
3811 */
955ce5f5 3812 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
123e3016 3813 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
c3e94d1d 3814 ac->ac_b_ex.fe_len);
955ce5f5 3815 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
0390131b 3816 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
519deca0 3817 if (!err)
554a5ccc 3818 err = -EFSCORRUPTED;
519deca0 3819 goto out_err;
c9de560d 3820 }
955ce5f5
AK
3821
3822 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
c9de560d
AT
3823#ifdef AGGRESSIVE_CHECK
3824 {
3825 int i;
3826 for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
3827 BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
3828 bitmap_bh->b_data));
3829 }
3830 }
3831#endif
123e3016 3832 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
c3e94d1d 3833 ac->ac_b_ex.fe_len);
8844618d
TT
3834 if (ext4_has_group_desc_csum(sb) &&
3835 (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
c9de560d 3836 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
021b65bb 3837 ext4_free_group_clusters_set(sb, gdp,
cff1dfd7 3838 ext4_free_clusters_after_init(sb,
021b65bb 3839 ac->ac_b_ex.fe_group, gdp));
c9de560d 3840 }
021b65bb
TT
3841 len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
3842 ext4_free_group_clusters_set(sb, gdp, len);
1df9bde4 3843 ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
feb0ab32 3844 ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
955ce5f5
AK
3845
3846 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
57042651 3847 percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
d2a17637 3848 /*
6bc6e63f 3849 * Now reduce the dirty block count also. Should not go negative
d2a17637 3850 */
6bc6e63f
AK
3851 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
3852 /* release all the reserved blocks if non delalloc */
57042651
TT
3853 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
3854 reserv_clstrs);
c9de560d 3855
772cb7c8
JS
3856 if (sbi->s_log_groups_per_flex) {
3857 ext4_group_t flex_group = ext4_flex_group(sbi,
3858 ac->ac_b_ex.fe_group);
90ba983f 3859 atomic64_sub(ac->ac_b_ex.fe_len,
7c990728
SJS
3860 &sbi_array_rcu_deref(sbi, s_flex_groups,
3861 flex_group)->free_clusters);
772cb7c8
JS
3862 }
3863
0390131b 3864 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
c9de560d
AT
3865 if (err)
3866 goto out_err;
0390131b 3867 err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
c9de560d
AT
3868
3869out_err:
42a10add 3870 brelse(bitmap_bh);
c9de560d
AT
3871 return err;
3872}
3873
8016e29f
HS
3874/*
3875 * Idempotent helper for Ext4 fast commit replay path to set the state of
3876 * blocks in bitmaps and update counters.
3877 */
3878void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
3879 int len, int state)
3880{
3881 struct buffer_head *bitmap_bh = NULL;
3882 struct ext4_group_desc *gdp;
3883 struct buffer_head *gdp_bh;
3884 struct ext4_sb_info *sbi = EXT4_SB(sb);
3885 ext4_group_t group;
3886 ext4_grpblk_t blkoff;
a5c0e2fd 3887 int i, err;
8016e29f 3888 int already;
bfdc502a 3889 unsigned int clen, clen_changed, thisgrp_len;
8016e29f 3890
bfdc502a
RH
3891 while (len > 0) {
3892 ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
8016e29f 3893
bfdc502a
RH
3894 /*
3895		 * Check to see if the range being marked crosses a group
3896		 * boundary.
3897		 * With flex_bg, (block, len) may span more than one group. In
3898		 * that case we need the corresponding group metadata for each
3899		 * group, which is why the range is processed one group at a
3900		 * time by this loop.
3901 */
3902 thisgrp_len = min_t(unsigned int, (unsigned int)len,
3903 EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff));
3904 clen = EXT4_NUM_B2C(sbi, thisgrp_len);
3905
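		/*
		 * Example (illustrative numbers, cluster size == block size):
		 * with 32768 blocks per group, blkoff = 32760 and len = 20,
		 * thisgrp_len is min(20, 8) = 8, so 8 blocks are handled here
		 * and the remaining 12 are picked up on the next iteration.
		 */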
8c91c579
RH
3906 if (!ext4_sb_block_valid(sb, NULL, block, thisgrp_len)) {
3907 ext4_error(sb, "Marking blocks in system zone - "
3908 "Block = %llu, len = %u",
3909 block, thisgrp_len);
3910 bitmap_bh = NULL;
3911 break;
3912 }
3913
bfdc502a
RH
3914 bitmap_bh = ext4_read_block_bitmap(sb, group);
3915 if (IS_ERR(bitmap_bh)) {
3916 err = PTR_ERR(bitmap_bh);
3917 bitmap_bh = NULL;
3918 break;
3919 }
8016e29f 3920
bfdc502a
RH
3921 err = -EIO;
3922 gdp = ext4_get_group_desc(sb, group, &gdp_bh);
3923 if (!gdp)
3924 break;
8016e29f 3925
bfdc502a
RH
3926 ext4_lock_group(sb, group);
3927 already = 0;
3928 for (i = 0; i < clen; i++)
3929 if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
3930 !state)
3931 already++;
8016e29f 3932
bfdc502a
RH
3933 clen_changed = clen - already;
3934 if (state)
123e3016 3935 mb_set_bits(bitmap_bh->b_data, blkoff, clen);
bfdc502a 3936 else
bd8247ee 3937 mb_clear_bits(bitmap_bh->b_data, blkoff, clen);
bfdc502a
RH
3938 if (ext4_has_group_desc_csum(sb) &&
3939 (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
3940 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
3941 ext4_free_group_clusters_set(sb, gdp,
3942 ext4_free_clusters_after_init(sb, group, gdp));
3943 }
3944 if (state)
3945 clen = ext4_free_group_clusters(sb, gdp) - clen_changed;
3946 else
3947 clen = ext4_free_group_clusters(sb, gdp) + clen_changed;
8016e29f 3948
bfdc502a 3949 ext4_free_group_clusters_set(sb, gdp, clen);
1df9bde4 3950 ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
bfdc502a 3951 ext4_group_desc_csum_set(sb, group, gdp);
8016e29f 3952
bfdc502a 3953 ext4_unlock_group(sb, group);
8016e29f 3954
bfdc502a
RH
3955 if (sbi->s_log_groups_per_flex) {
3956 ext4_group_t flex_group = ext4_flex_group(sbi, group);
3957 struct flex_groups *fg = sbi_array_rcu_deref(sbi,
3958 s_flex_groups, flex_group);
8016e29f 3959
bfdc502a
RH
3960 if (state)
3961 atomic64_sub(clen_changed, &fg->free_clusters);
3962 else
3963 atomic64_add(clen_changed, &fg->free_clusters);
3964
3965 }
3966
3967 err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
3968 if (err)
3969 break;
3970 sync_dirty_buffer(bitmap_bh);
3971 err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
3972 sync_dirty_buffer(gdp_bh);
3973 if (err)
3974 break;
3975
3976 block += thisgrp_len;
3977 len -= thisgrp_len;
3978 brelse(bitmap_bh);
3979 BUG_ON(len < 0);
8016e29f
HS
3980 }
3981
8016e29f 3982 if (err)
bfdc502a 3983 brelse(bitmap_bh);
8016e29f
HS
3984}
3985
c9de560d
AT
3986/*
3987 * here we normalize request for locality group
d7a1fee1
DE
3988 * Group requests are normalized to s_mb_group_prealloc, which is rounded
3989 * up to a multiple of s_stripe when a stripe size is set via mount option.
3990 * s_mb_group_prealloc can be configured via
b713a5ec 3991 * /sys/fs/ext4/<partition>/mb_group_prealloc
c9de560d
AT
3992 *
3993 * XXX: should we try to preallocate more than the group has now?
3994 */
3995static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
3996{
3997 struct super_block *sb = ac->ac_sb;
3998 struct ext4_locality_group *lg = ac->ac_lg;
3999
4000 BUG_ON(lg == NULL);
d7a1fee1 4001 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
d3df1453 4002 mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len);
c9de560d
AT
4003}
4004
38727786
OM
4005/*
4006 * This function returns the next element to look at during inode
4007 * PA rbtree walk. We assume that the caller holds the inode PA rbtree lock
4008 * (ei->i_prealloc_lock)
4009 *
4010 * new_start The start of the range we want to compare
4011 * cur_start The existing start that we are comparing against
4012 * node The node of the rb_tree
4013 */
4014static inline struct rb_node*
4015ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_node *node)
4016{
4017 if (new_start < cur_start)
4018 return node->rb_left;
4019 else
4020 return node->rb_right;
4021}
4022
7692094a
OM
4023static inline void
4024ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac,
4025 ext4_lblk_t start, ext4_lblk_t end)
4026{
4027 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4028 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
4029 struct ext4_prealloc_space *tmp_pa;
4030 ext4_lblk_t tmp_pa_start, tmp_pa_end;
38727786 4031 struct rb_node *iter;
7692094a 4032
38727786
OM
4033 read_lock(&ei->i_prealloc_lock);
4034 for (iter = ei->i_prealloc_node.rb_node; iter;
4035 iter = ext4_mb_pa_rb_next_iter(start, tmp_pa_start, iter)) {
4036 tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
4037 pa_node.inode_node);
4038 tmp_pa_start = tmp_pa->pa_lstart;
4039 tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
7692094a 4040
38727786
OM
4041 spin_lock(&tmp_pa->pa_lock);
4042 if (tmp_pa->pa_deleted == 0)
7692094a 4043 BUG_ON(!(start >= tmp_pa_end || end <= tmp_pa_start));
7692094a
OM
4044 spin_unlock(&tmp_pa->pa_lock);
4045 }
38727786 4046 read_unlock(&ei->i_prealloc_lock);
7692094a
OM
4047}
4048
0830344c
OM
4049/*
4050 * Given an allocation context "ac" and a range "start", "end", check
4051 * and adjust boundaries if the range overlaps with any of the existing
4052 * preallocations stored in the corresponding inode of the allocation context.
4053 *
38727786 4054 * Parameters:
0830344c
OM
4055 * ac allocation context
4056 * start start of the new range
4057 * end end of the new range
4058 */
4059static inline void
4060ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac,
38727786 4061 ext4_lblk_t *start, ext4_lblk_t *end)
0830344c
OM
4062{
4063 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
4064 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
38727786
OM
4065 struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL;
4066 struct rb_node *iter;
0830344c 4067 ext4_lblk_t new_start, new_end;
38727786 4068 ext4_lblk_t tmp_pa_start, tmp_pa_end, left_pa_end = -1, right_pa_start = -1;
0830344c
OM
4069
4070 new_start = *start;
4071 new_end = *end;
4072
38727786
OM
4073 /*
4074 * Adjust the normalized range so that it doesn't overlap with any
4075 * existing preallocated blocks(PAs). Make sure to hold the rbtree lock
4076 * so it doesn't change underneath us.
4077 */
4078 read_lock(&ei->i_prealloc_lock);
4079
4080 /* Step 1: find any one immediate neighboring PA of the normalized range */
4081 for (iter = ei->i_prealloc_node.rb_node; iter;
4082 iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical,
4083 tmp_pa_start, iter)) {
4084 tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
4085 pa_node.inode_node);
0830344c
OM
4086 tmp_pa_start = tmp_pa->pa_lstart;
4087 tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
4088
4089 /* PA must not overlap original request */
38727786
OM
4090 spin_lock(&tmp_pa->pa_lock);
4091 if (tmp_pa->pa_deleted == 0)
4092 BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end ||
4093 ac->ac_o_ex.fe_logical < tmp_pa_start));
4094 spin_unlock(&tmp_pa->pa_lock);
4095 }
4096
4097 /*
4098 * Step 2: check if the found PA is left or right neighbor and
4099 * get the other neighbor
4100 */
4101 if (tmp_pa) {
4102 if (tmp_pa->pa_lstart < ac->ac_o_ex.fe_logical) {
4103 struct rb_node *tmp;
4104
4105 left_pa = tmp_pa;
4106 tmp = rb_next(&left_pa->pa_node.inode_node);
4107 if (tmp) {
4108 right_pa = rb_entry(tmp,
4109 struct ext4_prealloc_space,
4110 pa_node.inode_node);
4111 }
4112 } else {
4113 struct rb_node *tmp;
4114
4115 right_pa = tmp_pa;
4116 tmp = rb_prev(&right_pa->pa_node.inode_node);
4117 if (tmp) {
4118 left_pa = rb_entry(tmp,
4119 struct ext4_prealloc_space,
4120 pa_node.inode_node);
4121 }
4122 }
4123 }
4124
4125 /* Step 3: get the non deleted neighbors */
4126 if (left_pa) {
4127 for (iter = &left_pa->pa_node.inode_node;;
4128 iter = rb_prev(iter)) {
4129 if (!iter) {
4130 left_pa = NULL;
4131 break;
4132 }
0830344c 4133
38727786
OM
4134 tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
4135 pa_node.inode_node);
4136 left_pa = tmp_pa;
4137 spin_lock(&tmp_pa->pa_lock);
4138 if (tmp_pa->pa_deleted == 0) {
4139 spin_unlock(&tmp_pa->pa_lock);
4140 break;
4141 }
0830344c 4142 spin_unlock(&tmp_pa->pa_lock);
0830344c 4143 }
38727786
OM
4144 }
4145
4146 if (right_pa) {
4147 for (iter = &right_pa->pa_node.inode_node;;
4148 iter = rb_next(iter)) {
4149 if (!iter) {
4150 right_pa = NULL;
4151 break;
4152 }
4153
4154 tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
4155 pa_node.inode_node);
4156 right_pa = tmp_pa;
4157 spin_lock(&tmp_pa->pa_lock);
4158 if (tmp_pa->pa_deleted == 0) {
4159 spin_unlock(&tmp_pa->pa_lock);
4160 break;
4161 }
4162 spin_unlock(&tmp_pa->pa_lock);
0830344c 4163 }
0830344c 4164 }
38727786
OM
4165
4166 if (left_pa) {
4167 left_pa_end =
4168 left_pa->pa_lstart + EXT4_C2B(sbi, left_pa->pa_len);
4169 BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical);
4170 }
4171
4172 if (right_pa) {
4173 right_pa_start = right_pa->pa_lstart;
4174 BUG_ON(right_pa_start <= ac->ac_o_ex.fe_logical);
4175 }
4176
4177 /* Step 4: trim our normalized range to not overlap with the neighbors */
4178 if (left_pa) {
4179 if (left_pa_end > new_start)
4180 new_start = left_pa_end;
4181 }
4182
4183 if (right_pa) {
4184 if (right_pa_start < new_end)
4185 new_end = right_pa_start;
4186 }
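	/*
	 * Example (illustrative numbers): if the normalized range was
	 * [0, 256) and the nearest live neighbours are a left PA ending at 64
	 * and a right PA starting at 192, the range is trimmed to [64, 192)
	 * so it cannot overlap either preallocation.
	 */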
4187 read_unlock(&ei->i_prealloc_lock);
0830344c
OM
4188
4189 /* XXX: extra loop to check we really don't overlap preallocations */
4190 ext4_mb_pa_assert_overlap(ac, new_start, new_end);
4191
4192 *start = new_start;
4193 *end = new_end;
4194}
4195
c9de560d
AT
4196/*
4197 * Normalization means making request better in terms of
4198 * size and alignment
4199 */
4ddfef7b
ES
4200static noinline_for_stack void
4201ext4_mb_normalize_request(struct ext4_allocation_context *ac,
c9de560d
AT
4202 struct ext4_allocation_request *ar)
4203{
53accfa9 4204 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
b07ffe69 4205 struct ext4_super_block *es = sbi->s_es;
c9de560d
AT
4206 int bsbits, max;
4207 ext4_lblk_t end;
1592d2c5
CW
4208 loff_t size, start_off;
4209 loff_t orig_size __maybe_unused;
5a0790c2 4210 ext4_lblk_t start;
c9de560d
AT
4211
4212	/* only normalize data requests; metadata requests
4213	   do not need preallocation */
4214 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
4215 return;
4216
4217	/* sometimes the caller may want exact blocks */
4218 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
4219 return;
4220
4221 /* caller may indicate that preallocation isn't
4222 * required (it's a tail, for example) */
4223 if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
4224 return;
4225
4226 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
4227 ext4_mb_normalize_group_request(ac);
4228 return ;
4229 }
4230
4231 bsbits = ac->ac_sb->s_blocksize_bits;
4232
4233	/* first, let's learn the actual file size
4234	 * assuming the current request is allocated */
53accfa9 4235 size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
c9de560d
AT
4236 size = size << bsbits;
4237 if (size < i_size_read(ac->ac_inode))
4238 size = i_size_read(ac->ac_inode);
5a0790c2 4239 orig_size = size;
c9de560d 4240
1930479c
VC
4241 /* max size of free chunks */
4242 max = 2 << bsbits;
c9de560d 4243
1930479c
VC
4244#define NRL_CHECK_SIZE(req, size, max, chunk_size) \
4245 (req <= (size) || max <= (chunk_size))
c9de560d
AT
4246
4247 /* first, try to predict filesize */
4248 /* XXX: should this table be tunable? */
4249 start_off = 0;
4250 if (size <= 16 * 1024) {
4251 size = 16 * 1024;
4252 } else if (size <= 32 * 1024) {
4253 size = 32 * 1024;
4254 } else if (size <= 64 * 1024) {
4255 size = 64 * 1024;
4256 } else if (size <= 128 * 1024) {
4257 size = 128 * 1024;
4258 } else if (size <= 256 * 1024) {
4259 size = 256 * 1024;
4260 } else if (size <= 512 * 1024) {
4261 size = 512 * 1024;
4262 } else if (size <= 1024 * 1024) {
4263 size = 1024 * 1024;
1930479c 4264 } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
c9de560d 4265 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
1930479c
VC
4266 (21 - bsbits)) << 21;
4267 size = 2 * 1024 * 1024;
4268 } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
c9de560d
AT
4269 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
4270 (22 - bsbits)) << 22;
4271 size = 4 * 1024 * 1024;
4272 } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
1930479c 4273 (8<<20)>>bsbits, max, 8 * 1024)) {
c9de560d
AT
4274 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
4275 (23 - bsbits)) << 23;
4276 size = 8 * 1024 * 1024;
4277 } else {
b27b1535 4278 start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
91a48aaf 4279 size = (loff_t) EXT4_C2B(sbi,
b27b1535 4280 ac->ac_o_ex.fe_len) << bsbits;
c9de560d 4281 }
5a0790c2
AK
4282 size = size >> bsbits;
4283 start = start_off >> bsbits;
c9de560d 4284
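	/*
	 * Example (assuming 4k blocks): a file whose size after this
	 * allocation would be 40KB falls into the "<= 64 * 1024" bucket
	 * above, so the request is normalized to a 64KB (16 block) window
	 * starting at logical block 0, before the overlap and group-boundary
	 * adjustments below.
	 */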
a08f789d
BL
4285 /*
4286 * For tiny groups (smaller than 8MB) the chosen allocation
4287 * alignment may be larger than group size. Make sure the
4288 * alignment does not move allocation to a different group which
4289 * makes mballoc fail assertions later.
4290 */
4291 start = max(start, rounddown(ac->ac_o_ex.fe_logical,
4292 (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb)));
4293
c9de560d
AT
4294 /* don't cover already allocated blocks in selected range */
4295 if (ar->pleft && start <= ar->lleft) {
4296 size -= ar->lleft + 1 - start;
4297 start = ar->lleft + 1;
4298 }
4299 if (ar->pright && start + size - 1 >= ar->lright)
4300 size -= start + size - ar->lright;
4301
cd648b8a
JK
4302 /*
4303 * Trim allocation request for filesystems with artificially small
4304 * groups.
4305 */
4306 if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb))
4307 size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb);
4308
c9de560d
AT
4309 end = start + size;
4310
0830344c 4311 ext4_mb_pa_adjust_overlap(ac, &start, &end);
c9de560d 4312
c9de560d
AT
4313 size = end - start;
4314
cf4ff938
BL
4315 /*
4316 * In this function "start" and "size" are normalized for better
4317 * alignment and length such that we could preallocate more blocks.
4318 * This normalization is done such that original request of
4319 * ac->ac_o_ex.fe_logical & fe_len should always lie within "start" and
4320 * "size" boundaries.
4321	 * (Note fe_len can be relaxed since the FS block allocation API does not
4322	 * provide a guarantee on the number of contiguous blocks allocated, since
4323	 * that depends upon free space left, etc).
4324 * In case of inode pa, later we use the allocated blocks
1221b235 4325 * [pa_pstart + fe_logical - pa_lstart, fe_len/size] from the preallocated
cf4ff938
BL
4326 * range of goal/best blocks [start, size] to put it at the
4327 * ac_o_ex.fe_logical extent of this inode.
4328 * (See ext4_mb_use_inode_pa() for more details)
4329 */
4330 if (start + size <= ac->ac_o_ex.fe_logical ||
c9de560d 4331 start > ac->ac_o_ex.fe_logical) {
9d8b9ec4
TT
4332 ext4_msg(ac->ac_sb, KERN_ERR,
4333 "start %lu, size %lu, fe_logical %lu",
4334 (unsigned long) start, (unsigned long) size,
4335 (unsigned long) ac->ac_o_ex.fe_logical);
dfe076c1 4336 BUG();
c9de560d 4337 }
b5b60778 4338 BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
c9de560d
AT
4339
4340 /* now prepare goal request */
4341
4342 /* XXX: is it better to align blocks WRT to logical
4343 * placement or satisfy big request as is */
4344 ac->ac_g_ex.fe_logical = start;
53accfa9 4345 ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
c9de560d
AT
4346
4347 /* define goal start in order to merge */
b07ffe69
KS
4348 if (ar->pright && (ar->lright == (start + size)) &&
4349 ar->pright >= size &&
4350 ar->pright - size >= le32_to_cpu(es->s_first_data_block)) {
c9de560d
AT
4351 /* merge to the right */
4352 ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
b07ffe69
KS
4353 &ac->ac_g_ex.fe_group,
4354 &ac->ac_g_ex.fe_start);
c9de560d
AT
4355 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
4356 }
b07ffe69
KS
4357 if (ar->pleft && (ar->lleft + 1 == start) &&
4358 ar->pleft + 1 < ext4_blocks_count(es)) {
c9de560d
AT
4359 /* merge to the left */
4360 ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
b07ffe69
KS
4361 &ac->ac_g_ex.fe_group,
4362 &ac->ac_g_ex.fe_start);
c9de560d
AT
4363 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
4364 }
4365
d3df1453
RH
4366 mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size,
4367 orig_size, start);
c9de560d
AT
4368}
4369
4370static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
4371{
4372 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4373
a6c75eaf 4374 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) {
c9de560d
AT
4375 atomic_inc(&sbi->s_bal_reqs);
4376 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
291dae47 4377 if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
c9de560d
AT
4378 atomic_inc(&sbi->s_bal_success);
4379 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
a6c75eaf 4380 atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned);
c9de560d
AT
4381 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
4382 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
4383 atomic_inc(&sbi->s_bal_goals);
4384 if (ac->ac_found > sbi->s_mb_max_to_scan)
4385 atomic_inc(&sbi->s_bal_breaks);
4386 }
4387
296c355c
TT
4388 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
4389 trace_ext4_mballoc_alloc(ac);
4390 else
4391 trace_ext4_mballoc_prealloc(ac);
c9de560d
AT
4392}
4393
b844167e
CW
4394/*
4395 * Called on failure; free up any blocks from the inode PA for this
4396 * context. We don't need this for MB_GROUP_PA because we only change
4397 * pa_free in ext4_mb_release_context(), but on failure, we've already
4398 * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
4399 */
4400static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
4401{
4402 struct ext4_prealloc_space *pa = ac->ac_pa;
86f0afd4
TT
4403 struct ext4_buddy e4b;
4404 int err;
b844167e 4405
86f0afd4 4406 if (pa == NULL) {
c99d1e6e
TT
4407 if (ac->ac_f_ex.fe_len == 0)
4408 return;
86f0afd4 4409 err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
19b8b035
TT
4410 if (WARN_RATELIMIT(err,
4411 "ext4: mb_load_buddy failed (%d)", err))
86f0afd4
TT
4412 /*
4413 * This should never happen since we pin the
4414 * pages in the ext4_allocation_context so
4415 * ext4_mb_load_buddy() should never fail.
4416 */
86f0afd4 4417 return;
86f0afd4
TT
4418 ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
4419 mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
4420 ac->ac_f_ex.fe_len);
4421 ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
c99d1e6e 4422 ext4_mb_unload_buddy(&e4b);
86f0afd4
TT
4423 return;
4424 }
36cb0f52
KS
4425 if (pa->pa_type == MB_INODE_PA) {
4426 spin_lock(&pa->pa_lock);
400db9d3 4427 pa->pa_free += ac->ac_b_ex.fe_len;
36cb0f52
KS
4428 spin_unlock(&pa->pa_lock);
4429 }
b844167e
CW
4430}
4431
c9de560d
AT
4432/*
4433 * use blocks preallocated to inode
4434 */
4435static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
4436 struct ext4_prealloc_space *pa)
4437{
53accfa9 4438 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
c9de560d
AT
4439 ext4_fsblk_t start;
4440 ext4_fsblk_t end;
4441 int len;
4442
4443 /* found preallocated blocks, use them */
4444 start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
53accfa9
TT
4445 end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
4446 start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
4447 len = EXT4_NUM_B2C(sbi, end - start);
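	/*
	 * Example (illustrative numbers, cluster size == block size): a PA
	 * with pa_lstart = 100, pa_pstart = 5000 and pa_len = 16 serving a
	 * request for 8 blocks at logical block 104 gives start = 5004 and
	 * end = min(5016, 5012) = 5012, so all 8 blocks come from the PA.
	 */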
c9de560d
AT
4448 ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
4449 &ac->ac_b_ex.fe_start);
4450 ac->ac_b_ex.fe_len = len;
4451 ac->ac_status = AC_STATUS_FOUND;
4452 ac->ac_pa = pa;
4453
4454 BUG_ON(start < pa->pa_pstart);
53accfa9 4455 BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
c9de560d 4456 BUG_ON(pa->pa_free < len);
93cdf49f 4457 BUG_ON(ac->ac_b_ex.fe_len <= 0);
c9de560d
AT
4458 pa->pa_free -= len;
4459
d3df1453 4460 mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa);
c9de560d
AT
4461}
4462
4463/*
4464 * use blocks preallocated to locality group
4465 */
4466static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
4467 struct ext4_prealloc_space *pa)
4468{
03cddb80 4469 unsigned int len = ac->ac_o_ex.fe_len;
6be2ded1 4470
c9de560d
AT
4471 ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
4472 &ac->ac_b_ex.fe_group,
4473 &ac->ac_b_ex.fe_start);
4474 ac->ac_b_ex.fe_len = len;
4475 ac->ac_status = AC_STATUS_FOUND;
4476 ac->ac_pa = pa;
4477
1221b235 4478	/* we don't correct pa_pstart or pa_len here to avoid a
26346ff6 4479	 * possible race when the group is being loaded concurrently;
c9de560d 4480	 * instead we correct pa later, after blocks are marked
26346ff6
AK
4481 * in on-disk bitmap -- see ext4_mb_release_context()
4482 * Other CPUs are prevented from allocating from this pa by lg_mutex
c9de560d 4483 */
d3df1453 4484 mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n",
1afdc588 4485 pa->pa_lstart, len, pa);
c9de560d
AT
4486}
4487
5e745b04
AK
4488/*
4489 * Return the prealloc space that have minimal distance
4490 * from the goal block. @cpa is the prealloc
4491 * space that is having currently known minimal distance
4492 * from the goal block.
4493 */
4494static struct ext4_prealloc_space *
4495ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
4496 struct ext4_prealloc_space *pa,
4497 struct ext4_prealloc_space *cpa)
4498{
4499 ext4_fsblk_t cur_distance, new_distance;
4500
4501 if (cpa == NULL) {
4502 atomic_inc(&pa->pa_count);
4503 return pa;
4504 }
79211c8e
AM
4505 cur_distance = abs(goal_block - cpa->pa_pstart);
4506 new_distance = abs(goal_block - pa->pa_pstart);
5e745b04 4507
5a54b2f1 4508 if (cur_distance <= new_distance)
5e745b04
AK
4509 return cpa;
4510
4511 /* drop the previous reference */
4512 atomic_dec(&cpa->pa_count);
4513 atomic_inc(&pa->pa_count);
4514 return pa;
4515}
4516
c9de560d
AT
4517/*
4518 * search goal blocks in preallocated space
4519 */
4fca8f07 4520static noinline_for_stack bool
4ddfef7b 4521ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
c9de560d 4522{
53accfa9 4523 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
6be2ded1 4524 int order, i;
c9de560d
AT
4525 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
4526 struct ext4_locality_group *lg;
bcf43499
OM
4527 struct ext4_prealloc_space *tmp_pa, *cpa = NULL;
4528 ext4_lblk_t tmp_pa_start, tmp_pa_end;
38727786 4529 struct rb_node *iter;
5e745b04 4530 ext4_fsblk_t goal_block;
c9de560d
AT
4531
4532 /* only data can be preallocated */
4533 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
4fca8f07 4534 return false;
c9de560d
AT
4535
4536 /* first, try per-file preallocation */
38727786
OM
4537 read_lock(&ei->i_prealloc_lock);
4538 for (iter = ei->i_prealloc_node.rb_node; iter;
4539 iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical,
4540 tmp_pa_start, iter)) {
4541 tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
4542 pa_node.inode_node);
c9de560d
AT
4543
4544		/* none of the fields in this condition change,
4545		 * so we can skip locking for them */
bcf43499
OM
4546 tmp_pa_start = tmp_pa->pa_lstart;
4547 tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
4548
38727786 4549 /* original request start doesn't lie in this PA */
bcf43499
OM
4550 if (ac->ac_o_ex.fe_logical < tmp_pa_start ||
4551 ac->ac_o_ex.fe_logical >= tmp_pa_end)
c9de560d
AT
4552 continue;
4553
fb0a387d 4554 /* non-extent files can't have physical blocks past 2^32 */
12e9b892 4555 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
bcf43499 4556 (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) >
e86a7182
OM
4557 EXT4_MAX_BLOCK_FILE_PHYS)) {
4558 /*
4559 * Since PAs don't overlap, we won't find any
4560 * other PA to satisfy this.
4561 */
4562 break;
4563 }
fb0a387d 4564
c9de560d 4565 /* found preallocated blocks, use them */
bcf43499
OM
4566 spin_lock(&tmp_pa->pa_lock);
4567 if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free) {
4568 atomic_inc(&tmp_pa->pa_count);
4569 ext4_mb_use_inode_pa(ac, tmp_pa);
4570 spin_unlock(&tmp_pa->pa_lock);
c9de560d 4571 ac->ac_criteria = 10;
38727786 4572 read_unlock(&ei->i_prealloc_lock);
4fca8f07 4573 return true;
c9de560d 4574 }
bcf43499 4575 spin_unlock(&tmp_pa->pa_lock);
c9de560d 4576 }
38727786 4577 read_unlock(&ei->i_prealloc_lock);
c9de560d
AT
4578
4579 /* can we use group allocation? */
4580 if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
4fca8f07 4581 return false;
c9de560d
AT
4582
4583 /* inode may have no locality group for some reason */
4584 lg = ac->ac_lg;
4585 if (lg == NULL)
4fca8f07 4586 return false;
6be2ded1
AK
4587 order = fls(ac->ac_o_ex.fe_len) - 1;
4588 if (order > PREALLOC_TB_SIZE - 1)
4589 /* The max size of hash table is PREALLOC_TB_SIZE */
4590 order = PREALLOC_TB_SIZE - 1;
4591
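	/*
	 * Example: a request for 13 clusters has fls(13) - 1 = 3, so the
	 * search starts at lg_prealloc_list[3] and continues through the
	 * larger-order lists, keeping the candidate PA closest to the goal.
	 */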
bda00de7 4592 goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
5e745b04
AK
4593 /*
4594 * search for the prealloc space that is having
4595 * minimal distance from the goal block.
4596 */
6be2ded1
AK
4597 for (i = order; i < PREALLOC_TB_SIZE; i++) {
4598 rcu_read_lock();
bcf43499 4599 list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[i],
a8e38fd3 4600 pa_node.lg_list) {
bcf43499
OM
4601 spin_lock(&tmp_pa->pa_lock);
4602 if (tmp_pa->pa_deleted == 0 &&
4603 tmp_pa->pa_free >= ac->ac_o_ex.fe_len) {
5e745b04
AK
4604
4605 cpa = ext4_mb_check_group_pa(goal_block,
bcf43499 4606 tmp_pa, cpa);
6be2ded1 4607 }
bcf43499 4608 spin_unlock(&tmp_pa->pa_lock);
c9de560d 4609 }
6be2ded1 4610 rcu_read_unlock();
c9de560d 4611 }
5e745b04
AK
4612 if (cpa) {
4613 ext4_mb_use_group_pa(ac, cpa);
4614 ac->ac_criteria = 20;
4fca8f07 4615 return true;
5e745b04 4616 }
4fca8f07 4617 return false;
c9de560d
AT
4618}
4619
7a2fcbf7
AK
4620/*
4621 * the function goes through all blocks freed in the group
4622 * but not yet committed and marks them used in the in-core bitmap.
4623 * buddy must be generated from this bitmap
955ce5f5 4624 * Needs to be called with the ext4 group lock held
7a2fcbf7
AK
4625 */
4626static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
4627 ext4_group_t group)
4628{
4629 struct rb_node *n;
4630 struct ext4_group_info *grp;
4631 struct ext4_free_data *entry;
4632
4633 grp = ext4_get_group_info(sb, group);
5354b2af
TT
4634 if (!grp)
4635 return;
7a2fcbf7
AK
4636 n = rb_first(&(grp->bb_free_root));
4637
4638 while (n) {
18aadd47 4639 entry = rb_entry(n, struct ext4_free_data, efd_node);
123e3016 4640 mb_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
7a2fcbf7
AK
4641 n = rb_next(n);
4642 }
4643 return;
4644}
4645
c9de560d
AT
4646/*
4647 * the function goes through all preallocations in this group and marks them
4648 * used in the in-core bitmap. buddy must be generated from this bitmap
955ce5f5 4649 * Needs to be called with the ext4 group lock held
c9de560d 4650 */
089ceecc
ES
4651static noinline_for_stack
4652void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
c9de560d
AT
4653 ext4_group_t group)
4654{
4655 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
4656 struct ext4_prealloc_space *pa;
4657 struct list_head *cur;
4658 ext4_group_t groupnr;
4659 ext4_grpblk_t start;
4660 int preallocated = 0;
c9de560d
AT
4661 int len;
4662
5354b2af
TT
4663 if (!grp)
4664 return;
4665
c9de560d
AT
4666	/* all forms of preallocation discard first load the group,
4667	 * so the only competing code is preallocation use.
4668	 * we don't need any locking here.
4669 * notice we do NOT ignore preallocations with pa_deleted
4670 * otherwise we could leave used blocks available for
4671 * allocation in buddy when concurrent ext4_mb_put_pa()
4672 * is dropping preallocation
4673 */
4674 list_for_each(cur, &grp->bb_prealloc_list) {
4675 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
4676 spin_lock(&pa->pa_lock);
4677 ext4_get_group_no_and_offset(sb, pa->pa_pstart,
4678 &groupnr, &start);
4679 len = pa->pa_len;
4680 spin_unlock(&pa->pa_lock);
4681 if (unlikely(len == 0))
4682 continue;
4683 BUG_ON(groupnr != group);
123e3016 4684 mb_set_bits(bitmap, start, len);
c9de560d 4685 preallocated += len;
c9de560d 4686 }
d3df1453 4687 mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
c9de560d
AT
4688}
4689
27bc446e 4690static void ext4_mb_mark_pa_deleted(struct super_block *sb,
4691 struct ext4_prealloc_space *pa)
4692{
4693 struct ext4_inode_info *ei;
4694
4695 if (pa->pa_deleted) {
4696 ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
4697 pa->pa_type, pa->pa_pstart, pa->pa_lstart,
4698 pa->pa_len);
4699 return;
4700 }
4701
4702 pa->pa_deleted = 1;
4703
4704 if (pa->pa_type == MB_INODE_PA) {
4705 ei = EXT4_I(pa->pa_inode);
4706 atomic_dec(&ei->i_prealloc_active);
4707 }
4708}
4709
82089725 4710static inline void ext4_mb_pa_free(struct ext4_prealloc_space *pa)
c9de560d 4711{
82089725 4712 BUG_ON(!pa);
4e8d2139
JR
4713 BUG_ON(atomic_read(&pa->pa_count));
4714 BUG_ON(pa->pa_deleted == 0);
c9de560d
AT
4715 kmem_cache_free(ext4_pspace_cachep, pa);
4716}
4717
82089725
OM
4718static void ext4_mb_pa_callback(struct rcu_head *head)
4719{
4720 struct ext4_prealloc_space *pa;
4721
4722 pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
4723 ext4_mb_pa_free(pa);
4724}
4725
c9de560d
AT
4726/*
4727 * drops a reference to preallocated space descriptor
4728 * if this was the last reference and the space is consumed
4729 */
4730static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
4731 struct super_block *sb, struct ext4_prealloc_space *pa)
4732{
a9df9a49 4733 ext4_group_t grp;
d33a1976 4734 ext4_fsblk_t grp_blk;
38727786 4735 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
c9de560d 4736
c9de560d
AT
4737 /* in this short window concurrent discard can set pa_deleted */
4738 spin_lock(&pa->pa_lock);
4e8d2139
JR
4739 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
4740 spin_unlock(&pa->pa_lock);
4741 return;
4742 }
4743
c9de560d
AT
4744 if (pa->pa_deleted == 1) {
4745 spin_unlock(&pa->pa_lock);
4746 return;
4747 }
4748
27bc446e 4749 ext4_mb_mark_pa_deleted(sb, pa);
c9de560d
AT
4750 spin_unlock(&pa->pa_lock);
4751
d33a1976 4752 grp_blk = pa->pa_pstart;
60e6679e 4753 /*
cc0fb9ad
AK
4754 * If doing group-based preallocation, pa_pstart may be in the
4755 * next group when pa is used up
4756 */
4757 if (pa->pa_type == MB_GROUP_PA)
d33a1976
ES
4758 grp_blk--;
4759
bd86298e 4760 grp = ext4_get_group_number(sb, grp_blk);
c9de560d
AT
4761
4762 /*
4763 * possible race:
4764 *
4765 * P1 (buddy init) P2 (regular allocation)
4766 * find block B in PA
4767 * copy on-disk bitmap to buddy
4768 * mark B in on-disk bitmap
4769 * drop PA from group
4770 * mark all PAs in buddy
4771 *
4772 * thus, P1 initializes buddy with B available. to prevent this
4773 * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
4774 * against that pair
4775 */
4776 ext4_lock_group(sb, grp);
4777 list_del(&pa->pa_group_list);
4778 ext4_unlock_group(sb, grp);
4779
a8e38fd3 4780 if (pa->pa_type == MB_INODE_PA) {
38727786
OM
4781 write_lock(pa->pa_node_lock.inode_lock);
4782 rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
4783 write_unlock(pa->pa_node_lock.inode_lock);
4784 ext4_mb_pa_free(pa);
a8e38fd3
OM
4785 } else {
4786 spin_lock(pa->pa_node_lock.lg_lock);
4787 list_del_rcu(&pa->pa_node.lg_list);
4788 spin_unlock(pa->pa_node_lock.lg_lock);
38727786
OM
4789 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4790 }
4791}
4792
4793static void ext4_mb_pa_rb_insert(struct rb_root *root, struct rb_node *new)
4794{
4795 struct rb_node **iter = &root->rb_node, *parent = NULL;
4796 struct ext4_prealloc_space *iter_pa, *new_pa;
4797 ext4_lblk_t iter_start, new_start;
4798
4799 while (*iter) {
4800 iter_pa = rb_entry(*iter, struct ext4_prealloc_space,
4801 pa_node.inode_node);
4802 new_pa = rb_entry(new, struct ext4_prealloc_space,
4803 pa_node.inode_node);
4804 iter_start = iter_pa->pa_lstart;
4805 new_start = new_pa->pa_lstart;
4806
4807 parent = *iter;
4808 if (new_start < iter_start)
4809 iter = &((*iter)->rb_left);
4810 else
4811 iter = &((*iter)->rb_right);
a8e38fd3 4812 }
c9de560d 4813
38727786
OM
4814 rb_link_node(new, parent, iter);
4815 rb_insert_color(new, root);
c9de560d
AT
4816}
4817
4818/*
4819 * creates new preallocated space for given inode
4820 */
53f86b17 4821static noinline_for_stack void
4ddfef7b 4822ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
c9de560d
AT
4823{
4824 struct super_block *sb = ac->ac_sb;
53accfa9 4825 struct ext4_sb_info *sbi = EXT4_SB(sb);
c9de560d
AT
4826 struct ext4_prealloc_space *pa;
4827 struct ext4_group_info *grp;
4828 struct ext4_inode_info *ei;
4829
4830	/* preallocate only when the found space is larger than requested */
4831 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
4832 BUG_ON(ac->ac_status != AC_STATUS_FOUND);
4833 BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
53f86b17 4834 BUG_ON(ac->ac_pa == NULL);
c9de560d 4835
53f86b17 4836 pa = ac->ac_pa;
c9de560d
AT
4837
4838 if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
93cdf49f
OM
4839 int new_bex_start;
4840 int new_bex_end;
c9de560d
AT
4841
4842		/* we can't allocate as much as the normalizer wants, so the
4843		 * found space must get a proper lstart to cover the original
4844		 * request */
4845 BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
4846 BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
4847
93cdf49f
OM
4848 /*
4849 * Use the below logic for adjusting best extent as it keeps
4850 * fragmentation in check while ensuring logical range of best
4851 * extent doesn't overflow out of goal extent:
4852 *
4853 * 1. Check if best ex can be kept at end of goal and still
4854 * cover original start
4855 * 2. Else, check if best ex can be kept at start of goal and
4856 * still cover original start
4857 * 3. Else, keep the best ex at start of original request.
4858 */
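/*
 * Illustrative example (hypothetical numbers): with a goal extent
 * covering logical blocks 96..159, an original request starting at
 * logical block 100, and a best extent of 32 blocks, case 1 would
 * place the best extent at 128..159, which misses block 100, so
 * case 2 places it at 96..127 instead, which covers it.
 */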
4859 new_bex_end = ac->ac_g_ex.fe_logical +
4860 EXT4_C2B(sbi, ac->ac_g_ex.fe_len);
4861 new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4862 if (ac->ac_o_ex.fe_logical >= new_bex_start)
4863 goto adjust_bex;
c9de560d 4864
93cdf49f
OM
4865 new_bex_start = ac->ac_g_ex.fe_logical;
4866 new_bex_end =
4867 new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4868 if (ac->ac_o_ex.fe_logical < new_bex_end)
4869 goto adjust_bex;
c9de560d 4870
93cdf49f
OM
4871 new_bex_start = ac->ac_o_ex.fe_logical;
4872 new_bex_end =
4873 new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
c9de560d 4874
93cdf49f
OM
4875adjust_bex:
4876 ac->ac_b_ex.fe_logical = new_bex_start;
c9de560d 4877
c9de560d
AT
4878 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
4879 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
93cdf49f
OM
4880 BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical +
4881 EXT4_C2B(sbi, ac->ac_g_ex.fe_len)));
c9de560d
AT
4882 }
4883
c9de560d
AT
4884 pa->pa_lstart = ac->ac_b_ex.fe_logical;
4885 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
4886 pa->pa_len = ac->ac_b_ex.fe_len;
4887 pa->pa_free = pa->pa_len;
c9de560d 4888 spin_lock_init(&pa->pa_lock);
d794bf8e 4889 INIT_LIST_HEAD(&pa->pa_group_list);
c9de560d 4890 pa->pa_deleted = 0;
cc0fb9ad 4891 pa->pa_type = MB_INODE_PA;
c9de560d 4892
d3df1453
RH
4893 mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
4894 pa->pa_len, pa->pa_lstart);
9bffad1e 4895 trace_ext4_mb_new_inode_pa(ac, pa);
c9de560d 4896
53accfa9 4897 atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
abc075d4 4898 ext4_mb_use_inode_pa(ac, pa);
c9de560d
AT
4899
4900 ei = EXT4_I(ac->ac_inode);
4901 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
5354b2af
TT
4902 if (!grp)
4903 return;
c9de560d 4904
a8e38fd3 4905 pa->pa_node_lock.inode_lock = &ei->i_prealloc_lock;
c9de560d
AT
4906 pa->pa_inode = ac->ac_inode;
4907
c9de560d 4908 list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
c9de560d 4909
38727786
OM
4910 write_lock(pa->pa_node_lock.inode_lock);
4911 ext4_mb_pa_rb_insert(&ei->i_prealloc_node, &pa->pa_node.inode_node);
4912 write_unlock(pa->pa_node_lock.inode_lock);
27bc446e 4913 atomic_inc(&ei->i_prealloc_active);
c9de560d
AT
4914}
4915
4916/*
4917 * creates new preallocated space for the locality group the inode belongs to
4918 */
53f86b17 4919static noinline_for_stack void
4ddfef7b 4920ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
c9de560d
AT
4921{
4922 struct super_block *sb = ac->ac_sb;
4923 struct ext4_locality_group *lg;
4924 struct ext4_prealloc_space *pa;
4925 struct ext4_group_info *grp;
4926
4927 /* preallocate only when found space is larger than requested */
4928 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
4929 BUG_ON(ac->ac_status != AC_STATUS_FOUND);
4930 BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
53f86b17 4931 BUG_ON(ac->ac_pa == NULL);
c9de560d 4932
53f86b17 4933 pa = ac->ac_pa;
c9de560d 4934
c9de560d
AT
4935 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
4936 pa->pa_lstart = pa->pa_pstart;
4937 pa->pa_len = ac->ac_b_ex.fe_len;
4938 pa->pa_free = pa->pa_len;
c9de560d 4939 spin_lock_init(&pa->pa_lock);
a8e38fd3 4940 INIT_LIST_HEAD(&pa->pa_node.lg_list);
d794bf8e 4941 INIT_LIST_HEAD(&pa->pa_group_list);
c9de560d 4942 pa->pa_deleted = 0;
cc0fb9ad 4943 pa->pa_type = MB_GROUP_PA;
c9de560d 4944
d3df1453
RH
4945 mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
4946 pa->pa_len, pa->pa_lstart);
9bffad1e 4947 trace_ext4_mb_new_group_pa(ac, pa);
c9de560d
AT
4948
4949 ext4_mb_use_group_pa(ac, pa);
4950 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
4951
4952 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
5354b2af
TT
4953 if (!grp)
4954 return;
c9de560d
AT
4955 lg = ac->ac_lg;
4956 BUG_ON(lg == NULL);
4957
a8e38fd3 4958 pa->pa_node_lock.lg_lock = &lg->lg_prealloc_lock;
c9de560d
AT
4959 pa->pa_inode = NULL;
4960
c9de560d 4961 list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
c9de560d 4962
6be2ded1
AK
4963 /*
4964 * We will later add the new pa to the right bucket
4965 * after updating the pa_free in ext4_mb_release_context
4966 */
c9de560d
AT
4967}
4968
53f86b17 4969static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
c9de560d 4970{
c9de560d 4971 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
53f86b17 4972 ext4_mb_new_group_pa(ac);
c9de560d 4973 else
53f86b17 4974 ext4_mb_new_inode_pa(ac);
c9de560d
AT
4975}
4976
4977/*
4978 * finds all unused blocks in the on-disk bitmap and frees them in
4979 * the in-core bitmap and buddy.
4980 * @pa must be unlinked from inode and group lists, so that
4981 * nobody else can find/use it.
4982 * the caller MUST hold group/inode locks.
4983 * TODO: optimize the case when there are no in-core structures yet
4984 */
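/*
 * If the on-disk bitmap disagrees with pa_free we report it (see the
 * ext4_grp_locked_error() call below) but keep going with the value
 * derived from the bitmap, since the pa is already deleted here.
 */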
4ddfef7b
ES
4985static noinline_for_stack int
4986ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3e1e5f50 4987 struct ext4_prealloc_space *pa)
c9de560d 4988{
c9de560d
AT
4989 struct super_block *sb = e4b->bd_sb;
4990 struct ext4_sb_info *sbi = EXT4_SB(sb);
498e5f24
TT
4991 unsigned int end;
4992 unsigned int next;
c9de560d
AT
4993 ext4_group_t group;
4994 ext4_grpblk_t bit;
ba80b101 4995 unsigned long long grp_blk_start;
c9de560d
AT
4996 int free = 0;
4997
4998 BUG_ON(pa->pa_deleted == 0);
4999 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
53accfa9 5000 grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
c9de560d
AT
5001 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
5002 end = bit + pa->pa_len;
5003
c9de560d 5004 while (bit < end) {
ffad0a44 5005 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
c9de560d
AT
5006 if (bit >= end)
5007 break;
ffad0a44 5008 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
d3df1453 5009 mb_debug(sb, "free preallocated %u/%u in group %u\n",
5a0790c2
AK
5010 (unsigned) ext4_group_first_block_no(sb, group) + bit,
5011 (unsigned) next - bit, (unsigned) group);
c9de560d
AT
5012 free += next - bit;
5013
3e1e5f50 5014 trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
53accfa9
TT
5015 trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
5016 EXT4_C2B(sbi, bit)),
a9c667f8 5017 next - bit);
c9de560d
AT
5018 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
5019 bit = next + 1;
5020 }
5021 if (free != pa->pa_free) {
9d8b9ec4 5022 ext4_msg(e4b->bd_sb, KERN_CRIT,
36bad423 5023 "pa %p: logic %lu, phys. %lu, len %d",
9d8b9ec4
TT
5024 pa, (unsigned long) pa->pa_lstart,
5025 (unsigned long) pa->pa_pstart,
36bad423 5026 pa->pa_len);
e29136f8 5027 ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
5d1b1b3f 5028 free, pa->pa_free);
e56eb659
AK
5029 /*
5030 * pa is already deleted so we use the value obtained
5031 * from the bitmap and continue.
5032 */
c9de560d 5033 }
c9de560d
AT
5034 atomic_add(free, &sbi->s_mb_discarded);
5035
863c37fc 5036 return 0;
c9de560d
AT
5037}
5038
4ddfef7b
ES
5039static noinline_for_stack int
5040ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3e1e5f50 5041 struct ext4_prealloc_space *pa)
c9de560d 5042{
c9de560d
AT
5043 struct super_block *sb = e4b->bd_sb;
5044 ext4_group_t group;
5045 ext4_grpblk_t bit;
5046
60e07cf5 5047 trace_ext4_mb_release_group_pa(sb, pa);
c9de560d
AT
5048 BUG_ON(pa->pa_deleted == 0);
5049 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
5050 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
5051 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
5052 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
3e1e5f50 5053 trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
c9de560d
AT
5054
5055 return 0;
5056}
5057
5058/*
5059 * releases all preallocations in given group
5060 *
5061 * first, we need to decide discard policy:
5062 * - when do we discard
5063 * 1) ENOSPC
5064 * - how many do we discard
5065 * 1) how many requested
5066 */
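/*
 * PAs that are still referenced (pa_count != 0) are skipped and
 * reported back through *busy so that ext4_mb_discard_preallocations()
 * can retry this group a few times before giving up.
 */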
4ddfef7b
ES
5067static noinline_for_stack int
5068ext4_mb_discard_group_preallocations(struct super_block *sb,
8c80fb31 5069 ext4_group_t group, int *busy)
c9de560d
AT
5070{
5071 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
5072 struct buffer_head *bitmap_bh = NULL;
5073 struct ext4_prealloc_space *pa, *tmp;
5074 struct list_head list;
5075 struct ext4_buddy e4b;
38727786 5076 struct ext4_inode_info *ei;
c9de560d 5077 int err;
8c80fb31 5078 int free = 0;
c9de560d 5079
5354b2af
TT
5080 if (!grp)
5081 return 0;
d3df1453 5082 mb_debug(sb, "discard preallocation for group %u\n", group);
c9de560d 5083 if (list_empty(&grp->bb_prealloc_list))
bbc4ec77 5084 goto out_dbg;
c9de560d 5085
574ca174 5086 bitmap_bh = ext4_read_block_bitmap(sb, group);
9008a58e
DW
5087 if (IS_ERR(bitmap_bh)) {
5088 err = PTR_ERR(bitmap_bh);
54d3adbc
TT
5089 ext4_error_err(sb, -err,
5090 "Error %d reading block bitmap for %u",
5091 err, group);
bbc4ec77 5092 goto out_dbg;
c9de560d
AT
5093 }
5094
5095 err = ext4_mb_load_buddy(sb, group, &e4b);
ce89f46c 5096 if (err) {
9651e6b2
KK
5097 ext4_warning(sb, "Error %d loading buddy information for %u",
5098 err, group);
ce89f46c 5099 put_bh(bitmap_bh);
bbc4ec77 5100 goto out_dbg;
ce89f46c 5101 }
c9de560d 5102
c9de560d 5103 INIT_LIST_HEAD(&list);
c9de560d
AT
5104 ext4_lock_group(sb, group);
5105 list_for_each_entry_safe(pa, tmp,
5106 &grp->bb_prealloc_list, pa_group_list) {
5107 spin_lock(&pa->pa_lock);
5108 if (atomic_read(&pa->pa_count)) {
5109 spin_unlock(&pa->pa_lock);
8c80fb31 5110 *busy = 1;
c9de560d
AT
5111 continue;
5112 }
5113 if (pa->pa_deleted) {
5114 spin_unlock(&pa->pa_lock);
5115 continue;
5116 }
5117
5118 /* seems this one can be freed ... */
27bc446e 5119 ext4_mb_mark_pa_deleted(sb, pa);
c9de560d 5120
70022da8
YB
5121 if (!free)
5122 this_cpu_inc(discard_pa_seq);
5123
c9de560d
AT
5124 /* we can trust pa_free ... */
5125 free += pa->pa_free;
5126
5127 spin_unlock(&pa->pa_lock);
5128
5129 list_del(&pa->pa_group_list);
5130 list_add(&pa->u.pa_tmp_list, &list);
5131 }
5132
c9de560d
AT
5133 /* now free all selected PAs */
5134 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
5135
5136 /* remove from object (inode or locality group) */
a8e38fd3
OM
5137 if (pa->pa_type == MB_GROUP_PA) {
5138 spin_lock(pa->pa_node_lock.lg_lock);
5139 list_del_rcu(&pa->pa_node.lg_list);
5140 spin_unlock(pa->pa_node_lock.lg_lock);
5141 } else {
38727786
OM
5142 write_lock(pa->pa_node_lock.inode_lock);
5143 ei = EXT4_I(pa->pa_inode);
5144 rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
5145 write_unlock(pa->pa_node_lock.inode_lock);
a8e38fd3 5146 }
c9de560d 5147
38727786
OM
5148 list_del(&pa->u.pa_tmp_list);
5149
5150 if (pa->pa_type == MB_GROUP_PA) {
3e1e5f50 5151 ext4_mb_release_group_pa(&e4b, pa);
38727786
OM
5152 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
5153 } else {
3e1e5f50 5154 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
38727786
OM
5155 ext4_mb_pa_free(pa);
5156 }
c9de560d
AT
5157 }
5158
c9de560d 5159 ext4_unlock_group(sb, group);
e39e07fd 5160 ext4_mb_unload_buddy(&e4b);
c9de560d 5161 put_bh(bitmap_bh);
bbc4ec77 5162out_dbg:
d3df1453 5163 mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n",
8c80fb31
CX
5164 free, group, grp->bb_free);
5165 return free;
c9de560d
AT
5166}
5167
5168/*
5169 * releases all non-used preallocated blocks for given inode
5170 *
5171 * It's important to discard preallocations under i_data_sem
5172 * We don't want another block to be served from the prealloc
5173 * space when we are discarding the inode prealloc space.
5174 *
5175 * FIXME!! Make sure it is valid at all the call sites
5176 */
27bc446e 5177void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
c9de560d
AT
5178{
5179 struct ext4_inode_info *ei = EXT4_I(inode);
5180 struct super_block *sb = inode->i_sb;
5181 struct buffer_head *bitmap_bh = NULL;
5182 struct ext4_prealloc_space *pa, *tmp;
5183 ext4_group_t group = 0;
5184 struct list_head list;
5185 struct ext4_buddy e4b;
38727786 5186 struct rb_node *iter;
c9de560d
AT
5187 int err;
5188
c2ea3fde 5189 if (!S_ISREG(inode->i_mode)) {
c9de560d
AT
5190 return;
5191 }
5192
8016e29f
HS
5193 if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
5194 return;
5195
d3df1453
RH
5196 mb_debug(sb, "discard preallocation for inode %lu\n",
5197 inode->i_ino);
27bc446e 5198 trace_ext4_discard_preallocations(inode,
5199 atomic_read(&ei->i_prealloc_active), needed);
c9de560d
AT
5200
5201 INIT_LIST_HEAD(&list);
5202
27bc446e 5203 if (needed == 0)
5204 needed = UINT_MAX;
5205
c9de560d
AT
5206repeat:
5207 /* first, collect all pa's in the inode */
38727786
OM
5208 write_lock(&ei->i_prealloc_lock);
5209 for (iter = rb_first(&ei->i_prealloc_node); iter && needed;
5210 iter = rb_next(iter)) {
5211 pa = rb_entry(iter, struct ext4_prealloc_space,
5212 pa_node.inode_node);
a8e38fd3 5213 BUG_ON(pa->pa_node_lock.inode_lock != &ei->i_prealloc_lock);
38727786 5214
c9de560d
AT
5215 spin_lock(&pa->pa_lock);
5216 if (atomic_read(&pa->pa_count)) {
5217 /* this shouldn't happen often - nobody should
5218 * use preallocation while we're discarding it */
5219 spin_unlock(&pa->pa_lock);
38727786 5220 write_unlock(&ei->i_prealloc_lock);
9d8b9ec4
TT
5221 ext4_msg(sb, KERN_ERR,
5222 "uh-oh! used pa while discarding");
c9de560d
AT
5223 WARN_ON(1);
5224 schedule_timeout_uninterruptible(HZ);
5225 goto repeat;
5226
5227 }
5228 if (pa->pa_deleted == 0) {
27bc446e 5229 ext4_mb_mark_pa_deleted(sb, pa);
c9de560d 5230 spin_unlock(&pa->pa_lock);
38727786 5231 rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
c9de560d 5232 list_add(&pa->u.pa_tmp_list, &list);
27bc446e 5233 needed--;
c9de560d
AT
5234 continue;
5235 }
5236
5237 /* someone is deleting pa right now */
5238 spin_unlock(&pa->pa_lock);
38727786 5239 write_unlock(&ei->i_prealloc_lock);
c9de560d
AT
5240
5241 /* we have to wait here because pa_deleted
5242 * doesn't mean pa is already unlinked from
5243 * the list. As we might be called from
5244 * ->clear_inode(), the inode will get freed
5245 * and a concurrent thread which is unlinking
5246 * pa from the inode's list may access already
5247 * freed memory, bad-bad-bad */
5248
5249 /* XXX: if this happens too often, we can
5250 * add a flag to force wait only in case
5251 * of ->clear_inode(), but not in case of
5252 * regular truncate */
5253 schedule_timeout_uninterruptible(HZ);
5254 goto repeat;
5255 }
38727786 5256 write_unlock(&ei->i_prealloc_lock);
c9de560d
AT
5257
5258 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
cc0fb9ad 5259 BUG_ON(pa->pa_type != MB_INODE_PA);
bd86298e 5260 group = ext4_get_group_number(sb, pa->pa_pstart);
c9de560d 5261
9651e6b2
KK
5262 err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
5263 GFP_NOFS|__GFP_NOFAIL);
ce89f46c 5264 if (err) {
54d3adbc
TT
5265 ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
5266 err, group);
ce89f46c
AK
5267 continue;
5268 }
c9de560d 5269
574ca174 5270 bitmap_bh = ext4_read_block_bitmap(sb, group);
9008a58e
DW
5271 if (IS_ERR(bitmap_bh)) {
5272 err = PTR_ERR(bitmap_bh);
54d3adbc
TT
5273 ext4_error_err(sb, -err, "Error %d reading block bitmap for %u",
5274 err, group);
e39e07fd 5275 ext4_mb_unload_buddy(&e4b);
ce89f46c 5276 continue;
c9de560d
AT
5277 }
5278
5279 ext4_lock_group(sb, group);
5280 list_del(&pa->pa_group_list);
3e1e5f50 5281 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
c9de560d
AT
5282 ext4_unlock_group(sb, group);
5283
e39e07fd 5284 ext4_mb_unload_buddy(&e4b);
c9de560d
AT
5285 put_bh(bitmap_bh);
5286
5287 list_del(&pa->u.pa_tmp_list);
38727786 5288 ext4_mb_pa_free(pa);
c9de560d
AT
5289 }
5290}
5291
53f86b17
RH
5292static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac)
5293{
5294 struct ext4_prealloc_space *pa;
5295
5296 BUG_ON(ext4_pspace_cachep == NULL);
5297 pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS);
5298 if (!pa)
5299 return -ENOMEM;
5300 atomic_set(&pa->pa_count, 1);
5301 ac->ac_pa = pa;
5302 return 0;
5303}
5304
82089725 5305static void ext4_mb_pa_put_free(struct ext4_allocation_context *ac)
53f86b17
RH
5306{
5307 struct ext4_prealloc_space *pa = ac->ac_pa;
5308
5309 BUG_ON(!pa);
5310 ac->ac_pa = NULL;
5311 WARN_ON(!atomic_dec_and_test(&pa->pa_count));
82089725
OM
5312 /*
5313 * This function is only called due to an error or because the
5314 * len of found blocks < len of requested blocks, hence the PA has not
5315 * been added to grp->bb_prealloc_list. So we don't need to lock it
5316 */
5317 pa->pa_deleted = 1;
5318 ext4_mb_pa_free(pa);
53f86b17
RH
5319}
5320
6ba495e9 5321#ifdef CONFIG_EXT4_DEBUG
e68cf40c 5322static inline void ext4_mb_show_pa(struct super_block *sb)
c9de560d 5323{
e68cf40c 5324 ext4_group_t i, ngroups;
c9de560d 5325
9b5f6c9b 5326 if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
e3570639
ES
5327 return;
5328
8df9675f 5329 ngroups = ext4_get_groups_count(sb);
d3df1453 5330 mb_debug(sb, "groups: ");
8df9675f 5331 for (i = 0; i < ngroups; i++) {
c9de560d
AT
5332 struct ext4_group_info *grp = ext4_get_group_info(sb, i);
5333 struct ext4_prealloc_space *pa;
5334 ext4_grpblk_t start;
5335 struct list_head *cur;
5354b2af
TT
5336
5337 if (!grp)
5338 continue;
c9de560d
AT
5339 ext4_lock_group(sb, i);
5340 list_for_each(cur, &grp->bb_prealloc_list) {
5341 pa = list_entry(cur, struct ext4_prealloc_space,
5342 pa_group_list);
5343 spin_lock(&pa->pa_lock);
5344 ext4_get_group_no_and_offset(sb, pa->pa_pstart,
5345 NULL, &start);
5346 spin_unlock(&pa->pa_lock);
d3df1453
RH
5347 mb_debug(sb, "PA:%u:%d:%d\n", i, start,
5348 pa->pa_len);
c9de560d 5349 }
60bd63d1 5350 ext4_unlock_group(sb, i);
d3df1453
RH
5351 mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free,
5352 grp->bb_fragments);
c9de560d 5353 }
c9de560d 5354}
e68cf40c
RH
5355
5356static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
5357{
5358 struct super_block *sb = ac->ac_sb;
5359
9b5f6c9b 5360 if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
e68cf40c
RH
5361 return;
5362
d3df1453 5363 mb_debug(sb, "Can't allocate:"
e68cf40c 5364 " Allocation context details:");
d3df1453 5365 mb_debug(sb, "status %u flags 0x%x",
e68cf40c 5366 ac->ac_status, ac->ac_flags);
d3df1453 5367 mb_debug(sb, "orig %lu/%lu/%lu@%lu, "
e68cf40c
RH
5368 "goal %lu/%lu/%lu@%lu, "
5369 "best %lu/%lu/%lu@%lu cr %d",
5370 (unsigned long)ac->ac_o_ex.fe_group,
5371 (unsigned long)ac->ac_o_ex.fe_start,
5372 (unsigned long)ac->ac_o_ex.fe_len,
5373 (unsigned long)ac->ac_o_ex.fe_logical,
5374 (unsigned long)ac->ac_g_ex.fe_group,
5375 (unsigned long)ac->ac_g_ex.fe_start,
5376 (unsigned long)ac->ac_g_ex.fe_len,
5377 (unsigned long)ac->ac_g_ex.fe_logical,
5378 (unsigned long)ac->ac_b_ex.fe_group,
5379 (unsigned long)ac->ac_b_ex.fe_start,
5380 (unsigned long)ac->ac_b_ex.fe_len,
5381 (unsigned long)ac->ac_b_ex.fe_logical,
5382 (int)ac->ac_criteria);
d3df1453 5383 mb_debug(sb, "%u found", ac->ac_found);
e68cf40c
RH
5384 ext4_mb_show_pa(sb);
5385}
c9de560d 5386#else
e68cf40c
RH
5387static inline void ext4_mb_show_pa(struct super_block *sb)
5388{
5389 return;
5390}
c9de560d
AT
5391static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
5392{
e68cf40c 5393 ext4_mb_show_pa(ac->ac_sb);
c9de560d
AT
5394 return;
5395}
5396#endif
5397
5398/*
5399 * We use locality group preallocation for small files. The size of the
5400 * file is determined by the current size or the resulting size after
5401 * allocation, whichever is larger
5402 *
b713a5ec 5403 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
c9de560d
AT
5404 */
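/*
 * For example (illustrative values only):
 *	echo 32 > /sys/fs/ext4/sda1/mb_stream_req
 * raises the threshold so that files of up to 32 blocks keep using the
 * per-CPU locality group preallocation on that filesystem.
 */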
5405static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
5406{
5407 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
5408 int bsbits = ac->ac_sb->s_blocksize_bits;
5409 loff_t size, isize;
a9f2a293 5410 bool inode_pa_eligible, group_pa_eligible;
c9de560d
AT
5411
5412 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
5413 return;
5414
4ba74d00
TT
5415 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
5416 return;
5417
a9f2a293
JK
5418 group_pa_eligible = sbi->s_mb_group_prealloc > 0;
5419 inode_pa_eligible = true;
53accfa9 5420 size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
50797481
TT
5421 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
5422 >> bsbits;
c9de560d 5423
a9f2a293 5424 /* No point in using inode preallocation for closed files */
82dd124c 5425 if ((size == isize) && !ext4_fs_is_busy(sbi) &&
a9f2a293
JK
5426 !inode_is_open_for_write(ac->ac_inode))
5427 inode_pa_eligible = false;
50797481 5428
71780577 5429 size = max(size, isize);
a9f2a293
JK
5430 /* Don't use group allocation for large files */
5431 if (size > sbi->s_mb_stream_request)
5432 group_pa_eligible = false;
5433
5434 if (!group_pa_eligible) {
5435 if (inode_pa_eligible)
5436 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
5437 else
5438 ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
c9de560d 5439 return;
4ba74d00 5440 }
c9de560d
AT
5441
5442 BUG_ON(ac->ac_lg != NULL);
5443 /*
5444 * locality group prealloc space are per cpu. The reason for having
5445 * per cpu locality group is to reduce the contention between block
5446 * request from multiple CPUs.
5447 */
a0b6bc63 5448 ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups);
c9de560d
AT
5449
5450 /* we're going to use group allocation */
5451 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
5452
5453 /* serialize all allocations in the group */
5454 mutex_lock(&ac->ac_lg->lg_mutex);
5455}
5456
d73eff68 5457static noinline_for_stack void
4ddfef7b 5458ext4_mb_initialize_context(struct ext4_allocation_context *ac,
c9de560d
AT
5459 struct ext4_allocation_request *ar)
5460{
5461 struct super_block *sb = ar->inode->i_sb;
5462 struct ext4_sb_info *sbi = EXT4_SB(sb);
5463 struct ext4_super_block *es = sbi->s_es;
5464 ext4_group_t group;
498e5f24
TT
5465 unsigned int len;
5466 ext4_fsblk_t goal;
c9de560d
AT
5467 ext4_grpblk_t block;
5468
5469 /* we can't allocate > group size */
5470 len = ar->len;
5471
5472 /* just a dirty hack to filter too big requests */
40ae3487
TT
5473 if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
5474 len = EXT4_CLUSTERS_PER_GROUP(sb);
c9de560d
AT
5475
5476 /* start searching from the goal */
5477 goal = ar->goal;
5478 if (goal < le32_to_cpu(es->s_first_data_block) ||
5479 goal >= ext4_blocks_count(es))
5480 goal = le32_to_cpu(es->s_first_data_block);
5481 ext4_get_group_no_and_offset(sb, goal, &group, &block);
5482
5483 /* set up allocation goals */
f5a44db5 5484 ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
c9de560d 5485 ac->ac_status = AC_STATUS_CONTINUE;
c9de560d
AT
5486 ac->ac_sb = sb;
5487 ac->ac_inode = ar->inode;
53accfa9 5488 ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
c9de560d
AT
5489 ac->ac_o_ex.fe_group = group;
5490 ac->ac_o_ex.fe_start = block;
5491 ac->ac_o_ex.fe_len = len;
53accfa9 5492 ac->ac_g_ex = ac->ac_o_ex;
c9de560d 5493 ac->ac_flags = ar->flags;
c9de560d 5494
3cb77bd2 5495 /* we have to define context: we'll work with a file or
c9de560d
AT
5496 * locality group. This is a policy, actually */
5497 ext4_mb_group_or_file(ac);
5498
d3df1453 5499 mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, "
c9de560d
AT
5500 "left: %u/%u, right %u/%u to %swritable\n",
5501 (unsigned) ar->len, (unsigned) ar->logical,
5502 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
5503 (unsigned) ar->lleft, (unsigned) ar->pleft,
5504 (unsigned) ar->lright, (unsigned) ar->pright,
82dd124c 5505 inode_is_open_for_write(ar->inode) ? "" : "non-");
c9de560d
AT
5506}
5507
6be2ded1
AK
5508static noinline_for_stack void
5509ext4_mb_discard_lg_preallocations(struct super_block *sb,
5510 struct ext4_locality_group *lg,
5511 int order, int total_entries)
5512{
5513 ext4_group_t group = 0;
5514 struct ext4_buddy e4b;
5515 struct list_head discard_list;
5516 struct ext4_prealloc_space *pa, *tmp;
6be2ded1 5517
d3df1453 5518 mb_debug(sb, "discard locality group preallocation\n");
6be2ded1
AK
5519
5520 INIT_LIST_HEAD(&discard_list);
6be2ded1
AK
5521
5522 spin_lock(&lg->lg_prealloc_lock);
5523 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
a8e38fd3 5524 pa_node.lg_list,
92e9c58c 5525 lockdep_is_held(&lg->lg_prealloc_lock)) {
6be2ded1
AK
5526 spin_lock(&pa->pa_lock);
5527 if (atomic_read(&pa->pa_count)) {
5528 /*
5529 * This is the pa that we just used
5530 * for block allocation. So don't
5531 * free that
5532 */
5533 spin_unlock(&pa->pa_lock);
5534 continue;
5535 }
5536 if (pa->pa_deleted) {
5537 spin_unlock(&pa->pa_lock);
5538 continue;
5539 }
5540 /* only lg prealloc space */
cc0fb9ad 5541 BUG_ON(pa->pa_type != MB_GROUP_PA);
6be2ded1
AK
5542
5543 /* seems this one can be freed ... */
27bc446e 5544 ext4_mb_mark_pa_deleted(sb, pa);
6be2ded1
AK
5545 spin_unlock(&pa->pa_lock);
5546
a8e38fd3 5547 list_del_rcu(&pa->pa_node.lg_list);
6be2ded1
AK
5548 list_add(&pa->u.pa_tmp_list, &discard_list);
5549
5550 total_entries--;
5551 if (total_entries <= 5) {
5552 /*
5553 * we want to keep only 5 entries
5554 * allowing it to grow to 8. This
5555 * makes sure we don't call discard
5556 * soon for this list.
5557 */
5558 break;
5559 }
5560 }
5561 spin_unlock(&lg->lg_prealloc_lock);
5562
5563 list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
9651e6b2 5564 int err;
6be2ded1 5565
bd86298e 5566 group = ext4_get_group_number(sb, pa->pa_pstart);
9651e6b2
KK
5567 err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
5568 GFP_NOFS|__GFP_NOFAIL);
5569 if (err) {
54d3adbc
TT
5570 ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
5571 err, group);
6be2ded1
AK
5572 continue;
5573 }
5574 ext4_lock_group(sb, group);
5575 list_del(&pa->pa_group_list);
3e1e5f50 5576 ext4_mb_release_group_pa(&e4b, pa);
6be2ded1
AK
5577 ext4_unlock_group(sb, group);
5578
e39e07fd 5579 ext4_mb_unload_buddy(&e4b);
6be2ded1
AK
5580 list_del(&pa->u.pa_tmp_list);
5581 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
5582 }
6be2ded1
AK
5583}
5584
5585/*
5586 * We have incremented pa_count. So it cannot be freed at this
5587 * point. Also we hold lg_mutex. So no parallel allocation is
5588 * possible from this lg. That means pa_free cannot be updated.
5589 *
5590 * A parallel ext4_mb_discard_group_preallocations is possible,
5591 * which can cause the lg_prealloc_list to be updated.
5592 */
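/*
 * lg_prealloc_list is an array of PREALLOC_TB_SIZE RCU-protected lists,
 * indexed by order = fls(pa_free) - 1 (capped at PREALLOC_TB_SIZE - 1);
 * each bucket may grow to 8 entries before
 * ext4_mb_discard_lg_preallocations() above trims it back.
 */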
5593
5594static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
5595{
5596 int order, added = 0, lg_prealloc_count = 1;
5597 struct super_block *sb = ac->ac_sb;
5598 struct ext4_locality_group *lg = ac->ac_lg;
5599 struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
5600
5601 order = fls(pa->pa_free) - 1;
5602 if (order > PREALLOC_TB_SIZE - 1)
5603 /* The max size of hash table is PREALLOC_TB_SIZE */
5604 order = PREALLOC_TB_SIZE - 1;
5605 /* Add the prealloc space to lg */
f1167009 5606 spin_lock(&lg->lg_prealloc_lock);
6be2ded1 5607 list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
a8e38fd3 5608 pa_node.lg_list,
92e9c58c 5609 lockdep_is_held(&lg->lg_prealloc_lock)) {
6be2ded1
AK
5610 spin_lock(&tmp_pa->pa_lock);
5611 if (tmp_pa->pa_deleted) {
e7c9e3e9 5612 spin_unlock(&tmp_pa->pa_lock);
6be2ded1
AK
5613 continue;
5614 }
5615 if (!added && pa->pa_free < tmp_pa->pa_free) {
5616 /* Add to the tail of the previous entry */
a8e38fd3
OM
5617 list_add_tail_rcu(&pa->pa_node.lg_list,
5618 &tmp_pa->pa_node.lg_list);
6be2ded1
AK
5619 added = 1;
5620 /*
5621 * we want to count the total
5622 * number of entries in the list
5623 */
5624 }
5625 spin_unlock(&tmp_pa->pa_lock);
5626 lg_prealloc_count++;
5627 }
5628 if (!added)
a8e38fd3 5629 list_add_tail_rcu(&pa->pa_node.lg_list,
6be2ded1 5630 &lg->lg_prealloc_list[order]);
f1167009 5631 spin_unlock(&lg->lg_prealloc_lock);
6be2ded1
AK
5632
5633 /* Now trim the list to be not more than 8 elements */
5634 if (lg_prealloc_count > 8) {
5635 ext4_mb_discard_lg_preallocations(sb, lg,
f1167009 5636 order, lg_prealloc_count);
6be2ded1
AK
5637 return;
5638 }
5639 return ;
5640}
5641
c9de560d
AT
5642/*
5643 * release all resource we used in allocation
5644 */
5645static int ext4_mb_release_context(struct ext4_allocation_context *ac)
5646{
53accfa9 5647 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
6be2ded1
AK
5648 struct ext4_prealloc_space *pa = ac->ac_pa;
5649 if (pa) {
cc0fb9ad 5650 if (pa->pa_type == MB_GROUP_PA) {
c9de560d 5651 /* see comment in ext4_mb_use_group_pa() */
6be2ded1 5652 spin_lock(&pa->pa_lock);
53accfa9
TT
5653 pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
5654 pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
6be2ded1
AK
5655 pa->pa_free -= ac->ac_b_ex.fe_len;
5656 pa->pa_len -= ac->ac_b_ex.fe_len;
5657 spin_unlock(&pa->pa_lock);
66d5e027 5658
5659 /*
5660 * We want to add the pa to the right bucket.
5661 * Remove it from the list and while adding
5662 * make sure the list to which we are adding
5663 * doesn't grow big.
5664 */
5665 if (likely(pa->pa_free)) {
a8e38fd3
OM
5666 spin_lock(pa->pa_node_lock.lg_lock);
5667 list_del_rcu(&pa->pa_node.lg_list);
5668 spin_unlock(pa->pa_node_lock.lg_lock);
66d5e027 5669 ext4_mb_add_n_trim(ac);
5670 }
ba443916 5671 }
27bc446e 5672
ba443916
AK
5673 ext4_mb_put_pa(ac, ac->ac_sb, pa);
5674 }
c9de560d 5675 if (ac->ac_bitmap_page)
09cbfeaf 5676 put_page(ac->ac_bitmap_page);
c9de560d 5677 if (ac->ac_buddy_page)
09cbfeaf 5678 put_page(ac->ac_buddy_page);
c9de560d
AT
5679 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
5680 mutex_unlock(&ac->ac_lg->lg_mutex);
5681 ext4_mb_collect_stats(ac);
5682 return 0;
5683}
5684
5685static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
5686{
8df9675f 5687 ext4_group_t i, ngroups = ext4_get_groups_count(sb);
c9de560d 5688 int ret;
8c80fb31
CX
5689 int freed = 0, busy = 0;
5690 int retry = 0;
c9de560d 5691
9bffad1e 5692 trace_ext4_mb_discard_preallocations(sb, needed);
8c80fb31
CX
5693
5694 if (needed == 0)
5695 needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
5696 repeat:
8df9675f 5697 for (i = 0; i < ngroups && needed > 0; i++) {
8c80fb31 5698 ret = ext4_mb_discard_group_preallocations(sb, i, &busy);
c9de560d
AT
5699 freed += ret;
5700 needed -= ret;
8c80fb31
CX
5701 cond_resched();
5702 }
5703
5704 if (needed > 0 && busy && ++retry < 3) {
5705 busy = 0;
5706 goto repeat;
c9de560d
AT
5707 }
5708
5709 return freed;
5710}
5711
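/*
 * Helper for ext4_mb_new_blocks(): decide whether a failed allocation is
 * worth retrying, either because we managed to free some preallocated
 * space here or because the discard_pa_seq counters indicate concurrent
 * discard activity since the allocation attempt started.
 */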
cf5e2ca6 5712static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb,
07b5b8e1 5713 struct ext4_allocation_context *ac, u64 *seq)
cf5e2ca6
RH
5714{
5715 int freed;
07b5b8e1
RH
5716 u64 seq_retry = 0;
5717 bool ret = false;
cf5e2ca6
RH
5718
5719 freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
07b5b8e1
RH
5720 if (freed) {
5721 ret = true;
5722 goto out_dbg;
5723 }
5724 seq_retry = ext4_get_discard_pa_seq_sum();
99377830
RH
5725 if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) {
5726 ac->ac_flags |= EXT4_MB_STRICT_CHECK;
07b5b8e1
RH
5727 *seq = seq_retry;
5728 ret = true;
5729 }
5730
5731out_dbg:
5732 mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no");
5733 return ret;
cf5e2ca6
RH
5734}
5735
8016e29f
HS
5736static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
5737 struct ext4_allocation_request *ar, int *errp);
5738
c9de560d
AT
5739/*
5740 * Main entry point into mballoc to allocate blocks:
5741 * it tries to use preallocation first, then falls back
5742 * to usual allocation
5743 */
5744ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
6c7a120a 5745 struct ext4_allocation_request *ar, int *errp)
c9de560d 5746{
256bdb49 5747 struct ext4_allocation_context *ac = NULL;
c9de560d
AT
5748 struct ext4_sb_info *sbi;
5749 struct super_block *sb;
5750 ext4_fsblk_t block = 0;
60e58e0f 5751 unsigned int inquota = 0;
53accfa9 5752 unsigned int reserv_clstrs = 0;
80fa46d6 5753 int retries = 0;
07b5b8e1 5754 u64 seq;
c9de560d 5755
b10a44c3 5756 might_sleep();
c9de560d
AT
5757 sb = ar->inode->i_sb;
5758 sbi = EXT4_SB(sb);
5759
9bffad1e 5760 trace_ext4_request_blocks(ar);
8016e29f
HS
5761 if (sbi->s_mount_state & EXT4_FC_REPLAY)
5762 return ext4_mb_new_blocks_simple(handle, ar, errp);
ba80b101 5763
45dc63e7 5764 /* Allow to use superuser reservation for quota file */
02749a4c 5765 if (ext4_is_quota_file(ar->inode))
45dc63e7
DM
5766 ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
5767
e3cf5d5d 5768 if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) {
60e58e0f
MC
5769 /* Without delayed allocation we need to verify
5770 * there is enough free blocks to do block allocation
5771 * and verify allocation doesn't exceed the quota limits.
d2a17637 5772 */
55f020db 5773 while (ar->len &&
e7d5f315 5774 ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
55f020db 5775
030ba6bc 5776 /* let others free the space */
bb8b20ed 5777 cond_resched();
030ba6bc
AK
5778 ar->len = ar->len >> 1;
5779 }
5780 if (!ar->len) {
bbc4ec77 5781 ext4_mb_show_pa(sb);
a30d542a
AK
5782 *errp = -ENOSPC;
5783 return 0;
5784 }
53accfa9 5785 reserv_clstrs = ar->len;
55f020db 5786 if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
53accfa9
TT
5787 dquot_alloc_block_nofail(ar->inode,
5788 EXT4_C2B(sbi, ar->len));
55f020db
AH
5789 } else {
5790 while (ar->len &&
53accfa9
TT
5791 dquot_alloc_block(ar->inode,
5792 EXT4_C2B(sbi, ar->len))) {
55f020db
AH
5793
5794 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
5795 ar->len--;
5796 }
60e58e0f
MC
5797 }
5798 inquota = ar->len;
5799 if (ar->len == 0) {
5800 *errp = -EDQUOT;
6c7a120a 5801 goto out;
60e58e0f 5802 }
07031431 5803 }
d2a17637 5804
85556c9a 5805 ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
833576b3 5806 if (!ac) {
363d4251 5807 ar->len = 0;
256bdb49 5808 *errp = -ENOMEM;
6c7a120a 5809 goto out;
256bdb49
ES
5810 }
5811
d73eff68 5812 ext4_mb_initialize_context(ac, ar);
c9de560d 5813
256bdb49 5814 ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
81198536 5815 seq = this_cpu_read(discard_pa_seq);
256bdb49 5816 if (!ext4_mb_use_preallocated(ac)) {
256bdb49
ES
5817 ac->ac_op = EXT4_MB_HISTORY_ALLOC;
5818 ext4_mb_normalize_request(ac, ar);
53f86b17
RH
5819
5820 *errp = ext4_mb_pa_alloc(ac);
5821 if (*errp)
5822 goto errout;
c9de560d
AT
5823repeat:
5824 /* allocate space in core */
6c7a120a 5825 *errp = ext4_mb_regular_allocator(ac);
53f86b17
RH
5826 /*
5827 * pa allocated above is added to grp->bb_prealloc_list only
5828 * when we were able to allocate some block i.e. when
5829 * ac->ac_status == AC_STATUS_FOUND.
5830 * An error from above means ac->ac_status != AC_STATUS_FOUND,
5831 * so we have to free this pa here itself.
5832 */
2c00ef3e 5833 if (*errp) {
82089725 5834 ext4_mb_pa_put_free(ac);
2c00ef3e
AK
5835 ext4_discard_allocated_blocks(ac);
5836 goto errout;
5837 }
53f86b17
RH
5838 if (ac->ac_status == AC_STATUS_FOUND &&
5839 ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
82089725 5840 ext4_mb_pa_put_free(ac);
c9de560d 5841 }
256bdb49 5842 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
53accfa9 5843 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
554a5ccc 5844 if (*errp) {
b844167e 5845 ext4_discard_allocated_blocks(ac);
6d138ced
ES
5846 goto errout;
5847 } else {
519deca0
AK
5848 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
5849 ar->len = ac->ac_b_ex.fe_len;
5850 }
c9de560d 5851 } else {
80fa46d6
TT
5852 if (++retries < 3 &&
5853 ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
c9de560d 5854 goto repeat;
53f86b17
RH
5855 /*
5856 * If block allocation fails then the pa allocated above
5857 * needs to be freed here itself.
5858 */
82089725 5859 ext4_mb_pa_put_free(ac);
c9de560d 5860 *errp = -ENOSPC;
6c7a120a
AK
5861 }
5862
5863 if (*errp) {
aaae558d 5864errout:
256bdb49 5865 ac->ac_b_ex.fe_len = 0;
c9de560d 5866 ar->len = 0;
256bdb49 5867 ext4_mb_show_ac(ac);
c9de560d 5868 }
256bdb49 5869 ext4_mb_release_context(ac);
aaae558d 5870 kmem_cache_free(ext4_ac_cachep, ac);
6c7a120a 5871out:
60e58e0f 5872 if (inquota && ar->len < inquota)
53accfa9 5873 dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
0087d9fb 5874 if (!ar->len) {
e3cf5d5d 5875 if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
0087d9fb 5876 /* release all the reserved blocks if non delalloc */
57042651 5877 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
53accfa9 5878 reserv_clstrs);
0087d9fb 5879 }
c9de560d 5880
9bffad1e 5881 trace_ext4_allocate_blocks(ar, (unsigned long long)block);
ba80b101 5882
c9de560d
AT
5883 return block;
5884}
c9de560d 5885
c894058d
AK
5886/*
5887 * We can merge two free data extents only if the physical blocks
5888 * are contiguous, AND the extents were freed by the same transaction,
5889 * AND the blocks are associated with the same group.
5890 */
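/*
 * For example (illustrative numbers): an existing entry for clusters
 * 100..107 and a new entry for clusters 108..115, freed by the same
 * transaction in the same group, collapse into a single entry covering
 * clusters 100..115.
 */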
a0154344
DJ
5891static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
5892 struct ext4_free_data *entry,
5893 struct ext4_free_data *new_entry,
5894 struct rb_root *entry_rb_root)
c894058d 5895{
a0154344
DJ
5896 if ((entry->efd_tid != new_entry->efd_tid) ||
5897 (entry->efd_group != new_entry->efd_group))
5898 return;
5899 if (entry->efd_start_cluster + entry->efd_count ==
5900 new_entry->efd_start_cluster) {
5901 new_entry->efd_start_cluster = entry->efd_start_cluster;
5902 new_entry->efd_count += entry->efd_count;
5903 } else if (new_entry->efd_start_cluster + new_entry->efd_count ==
5904 entry->efd_start_cluster) {
5905 new_entry->efd_count += entry->efd_count;
5906 } else
5907 return;
5908 spin_lock(&sbi->s_md_lock);
5909 list_del(&entry->efd_list);
5910 spin_unlock(&sbi->s_md_lock);
5911 rb_erase(&entry->efd_node, entry_rb_root);
5912 kmem_cache_free(ext4_free_data_cachep, entry);
c894058d
AK
5913}
5914
85b67ffb 5915static noinline_for_stack void
4ddfef7b 5916ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
7a2fcbf7 5917 struct ext4_free_data *new_entry)
c9de560d 5918{
e29136f8 5919 ext4_group_t group = e4b->bd_group;
84130193 5920 ext4_grpblk_t cluster;
d08854f5 5921 ext4_grpblk_t clusters = new_entry->efd_count;
7a2fcbf7 5922 struct ext4_free_data *entry;
c9de560d
AT
5923 struct ext4_group_info *db = e4b->bd_info;
5924 struct super_block *sb = e4b->bd_sb;
5925 struct ext4_sb_info *sbi = EXT4_SB(sb);
c894058d
AK
5926 struct rb_node **n = &db->bb_free_root.rb_node, *node;
5927 struct rb_node *parent = NULL, *new_node;
5928
0390131b 5929 BUG_ON(!ext4_handle_valid(handle));
c9de560d
AT
5930 BUG_ON(e4b->bd_bitmap_page == NULL);
5931 BUG_ON(e4b->bd_buddy_page == NULL);
5932
18aadd47
BJ
5933 new_node = &new_entry->efd_node;
5934 cluster = new_entry->efd_start_cluster;
c894058d 5935
c894058d
AK
5936 if (!*n) {
5937 /* first free block extent. We need to
5938 * protect buddy cache from being freed,
5939 * otherwise we'll refresh it from
5940 * on-disk bitmap and lose not-yet-available
5941 * blocks */
09cbfeaf
KS
5942 get_page(e4b->bd_buddy_page);
5943 get_page(e4b->bd_bitmap_page);
c894058d
AK
5944 }
5945 while (*n) {
5946 parent = *n;
18aadd47
BJ
5947 entry = rb_entry(parent, struct ext4_free_data, efd_node);
5948 if (cluster < entry->efd_start_cluster)
c894058d 5949 n = &(*n)->rb_left;
18aadd47 5950 else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
c894058d
AK
5951 n = &(*n)->rb_right;
5952 else {
e29136f8 5953 ext4_grp_locked_error(sb, group, 0,
84130193
TT
5954 ext4_group_first_block_no(sb, group) +
5955 EXT4_C2B(sbi, cluster),
e29136f8 5956 "Block already on to-be-freed list");
cca41553 5957 kmem_cache_free(ext4_free_data_cachep, new_entry);
85b67ffb 5958 return;
c9de560d 5959 }
c894058d 5960 }
c9de560d 5961
c894058d
AK
5962 rb_link_node(new_node, parent, n);
5963 rb_insert_color(new_node, &db->bb_free_root);
5964
5965 /* Now try to see the extent can be merged to left and right */
5966 node = rb_prev(new_node);
5967 if (node) {
18aadd47 5968 entry = rb_entry(node, struct ext4_free_data, efd_node);
a0154344
DJ
5969 ext4_try_merge_freed_extent(sbi, entry, new_entry,
5970 &(db->bb_free_root));
c894058d 5971 }
c9de560d 5972
c894058d
AK
5973 node = rb_next(new_node);
5974 if (node) {
18aadd47 5975 entry = rb_entry(node, struct ext4_free_data, efd_node);
a0154344
DJ
5976 ext4_try_merge_freed_extent(sbi, entry, new_entry,
5977 &(db->bb_free_root));
c9de560d 5978 }
a0154344 5979
d08854f5 5980 spin_lock(&sbi->s_md_lock);
a0154344 5981 list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list);
d08854f5
TT
5982 sbi->s_mb_free_pending += clusters;
5983 spin_unlock(&sbi->s_md_lock);
c9de560d
AT
5984}
5985
8016e29f
HS
5986/*
5987 * Simple allocator for Ext4 fast commit replay path. It searches for blocks
5988 * linearly starting at the goal block and also excludes the blocks which
5989 * are going to be in use after fast commit replay.
5990 */
5991static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
5992 struct ext4_allocation_request *ar, int *errp)
5993{
5994 struct buffer_head *bitmap_bh;
5995 struct super_block *sb = ar->inode->i_sb;
5996 ext4_group_t group;
5997 ext4_grpblk_t blkoff;
31a074a0
XY
5998 ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
5999 ext4_grpblk_t i = 0;
8016e29f
HS
6000 ext4_fsblk_t goal, block;
6001 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
6002
6003 goal = ar->goal;
6004 if (goal < le32_to_cpu(es->s_first_data_block) ||
6005 goal >= ext4_blocks_count(es))
6006 goal = le32_to_cpu(es->s_first_data_block);
6007
6008 ar->len = 0;
6009 ext4_get_group_no_and_offset(sb, goal, &group, &blkoff);
6010 for (; group < ext4_get_groups_count(sb); group++) {
6011 bitmap_bh = ext4_read_block_bitmap(sb, group);
6012 if (IS_ERR(bitmap_bh)) {
6013 *errp = PTR_ERR(bitmap_bh);
6014 pr_warn("Failed to read block bitmap\n");
6015 return 0;
6016 }
6017
31a074a0
XY
6018 while (1) {
6019 i = mb_find_next_zero_bit(bitmap_bh->b_data, max,
8016e29f 6020 blkoff);
31a074a0
XY
6021 if (i >= max)
6022 break;
6023 if (ext4_fc_replay_check_excluded(sb,
6024 ext4_group_first_block_no(sb, group) + i)) {
6025 blkoff = i + 1;
6026 } else
6027 break;
6028 }
8016e29f 6029 brelse(bitmap_bh);
31a074a0
XY
6030 if (i < max)
6031 break;
253cacb0
KS
6032
6033 blkoff = 0;
8016e29f
HS
6034 }
6035
31a074a0
XY
6036 if (group >= ext4_get_groups_count(sb) || i >= max) {
6037 *errp = -ENOSPC;
8016e29f 6038 return 0;
31a074a0 6039 }
8016e29f
HS
6040
6041 block = ext4_group_first_block_no(sb, group) + i;
6042 ext4_mb_mark_bb(sb, block, 1, 1);
6043 ar->len = 1;
6044
6045 return block;
6046}
6047
6048static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
6049 unsigned long count)
6050{
6051 struct buffer_head *bitmap_bh;
6052 struct super_block *sb = inode->i_sb;
6053 struct ext4_group_desc *gdp;
6054 struct buffer_head *gdp_bh;
6055 ext4_group_t group;
6056 ext4_grpblk_t blkoff;
6057 int already_freed = 0, err, i;
6058
6059 ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
6060 bitmap_bh = ext4_read_block_bitmap(sb, group);
6061 if (IS_ERR(bitmap_bh)) {
8016e29f
HS
6062 pr_warn("Failed to read block bitmap\n");
6063 return;
6064 }
6065 gdp = ext4_get_group_desc(sb, group, &gdp_bh);
6066 if (!gdp)
1b5c9d34 6067 goto err_out;
8016e29f
HS
6068
6069 for (i = 0; i < count; i++) {
6070 if (!mb_test_bit(blkoff + i, bitmap_bh->b_data))
6071 already_freed++;
6072 }
6073 mb_clear_bits(bitmap_bh->b_data, blkoff, count);
6074 err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
6075 if (err)
1b5c9d34 6076 goto err_out;
8016e29f
HS
6077 ext4_free_group_clusters_set(
6078 sb, gdp, ext4_free_group_clusters(sb, gdp) +
6079 count - already_freed);
1df9bde4 6080 ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
8016e29f
HS
6081 ext4_group_desc_csum_set(sb, group, gdp);
6082 ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
6083 sync_dirty_buffer(bitmap_bh);
6084 sync_dirty_buffer(gdp_bh);
1b5c9d34
KS
6085
6086err_out:
8016e29f
HS
6087 brelse(bitmap_bh);
6088}
6089
44338711 6090/**
8ac3939d
RH
6091 * ext4_mb_clear_bb() -- helper function for freeing blocks.
6092 * Used by ext4_free_blocks()
44338711
TT
6093 * @handle: handle for this transaction
6094 * @inode: inode
c60990b3
TT
6095 * @block: starting physical block to be freed
6096 * @count: number of blocks to be freed
5def1360 6097 * @flags: flags used by ext4_free_blocks
c9de560d 6098 */
8ac3939d
RH
6099static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
6100 ext4_fsblk_t block, unsigned long count,
6101 int flags)
c9de560d 6102{
26346ff6 6103 struct buffer_head *bitmap_bh = NULL;
c9de560d 6104 struct super_block *sb = inode->i_sb;
c9de560d 6105 struct ext4_group_desc *gdp;
5354b2af 6106 struct ext4_group_info *grp;
498e5f24 6107 unsigned int overflow;
c9de560d
AT
6108 ext4_grpblk_t bit;
6109 struct buffer_head *gd_bh;
6110 ext4_group_t block_group;
6111 struct ext4_sb_info *sbi;
6112 struct ext4_buddy e4b;
84130193 6113 unsigned int count_clusters;
c9de560d
AT
6114 int err = 0;
6115 int ret;
6116
8016e29f
HS
6117 sbi = EXT4_SB(sb);
6118
1e1c2b86
LC
6119 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
6120 !ext4_inode_block_valid(inode, block, count)) {
6121 ext4_error(sb, "Freeing blocks in system zone - "
6122 "Block = %llu, count = %lu", block, count);
6123 /* err = 0. ext4_std_error should be a no op */
6124 goto error_return;
6125 }
6126 flags |= EXT4_FREE_BLOCKS_VALIDATED;
6127
c9de560d
AT
6128do_more:
6129 overflow = 0;
6130 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
6131
5354b2af
TT
6132 grp = ext4_get_group_info(sb, block_group);
6133 if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
163a203d
DW
6134 return;
6135
c9de560d
AT
6136 /*
6137 * Check to see if we are freeing blocks across a group
6138 * boundary.
6139 */
84130193
TT
6140 if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
6141 overflow = EXT4_C2B(sbi, bit) + count -
6142 EXT4_BLOCKS_PER_GROUP(sb);
c9de560d 6143 count -= overflow;
1e1c2b86
LC
6144 /* The range changed so it's no longer validated */
6145 flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
c9de560d 6146 }
810da240 6147 count_clusters = EXT4_NUM_B2C(sbi, count);
574ca174 6148 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
9008a58e
DW
6149 if (IS_ERR(bitmap_bh)) {
6150 err = PTR_ERR(bitmap_bh);
6151 bitmap_bh = NULL;
c9de560d 6152 goto error_return;
ce89f46c 6153 }
c9de560d 6154 gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
ce89f46c
AK
6155 if (!gdp) {
6156 err = -EIO;
c9de560d 6157 goto error_return;
ce89f46c 6158 }
c9de560d 6159
1e1c2b86
LC
6160 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
6161 !ext4_inode_block_valid(inode, block, count)) {
12062ddd 6162 ext4_error(sb, "Freeing blocks in system zone - "
0610b6e9 6163 "Block = %llu, count = %lu", block, count);
519deca0
AK
6164 /* err = 0. ext4_std_error should be a no op */
6165 goto error_return;
c9de560d
AT
6166 }
6167
6168 BUFFER_TRACE(bitmap_bh, "getting write access");
188c299e
JK
6169 err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
6170 EXT4_JTR_NONE);
c9de560d
AT
6171 if (err)
6172 goto error_return;
6173
6174 /*
6175 * We are about to modify some metadata. Call the journal APIs
6176 * to unshare ->b_data if a currently-committing transaction is
6177 * using it
6178 */
6179 BUFFER_TRACE(gd_bh, "get_write_access");
188c299e 6180 err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE);
c9de560d
AT
6181 if (err)
6182 goto error_return;
c9de560d
AT
6183#ifdef AGGRESSIVE_CHECK
6184 {
6185 int i;
84130193 6186 for (i = 0; i < count_clusters; i++)
c9de560d
AT
6187 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
6188 }
6189#endif
84130193 6190 trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
c9de560d 6191
adb7ef60
KK
6192 /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
6193 err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
6194 GFP_NOFS|__GFP_NOFAIL);
920313a7
AK
6195 if (err)
6196 goto error_return;
e6362609 6197
f96c450d
DJ
6198 /*
6199 * We need to make sure we don't reuse the freed block until after the
6200 * transaction is committed. We make an exception if the inode is to be
6201 * written in writeback mode since writeback mode has weak data
6202 * consistency guarantees.
6203 */
6204 if (ext4_handle_valid(handle) &&
6205 ((flags & EXT4_FREE_BLOCKS_METADATA) ||
6206 !ext4_should_writeback_data(inode))) {
7a2fcbf7
AK
6207 struct ext4_free_data *new_entry;
6208 /*
7444a072
MH
6209 * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
6210 * to fail.
7a2fcbf7 6211 */
7444a072
MH
6212 new_entry = kmem_cache_alloc(ext4_free_data_cachep,
6213 GFP_NOFS|__GFP_NOFAIL);
18aadd47
BJ
6214 new_entry->efd_start_cluster = bit;
6215 new_entry->efd_group = block_group;
6216 new_entry->efd_count = count_clusters;
6217 new_entry->efd_tid = handle->h_transaction->t_tid;
955ce5f5 6218
7a2fcbf7 6219 ext4_lock_group(sb, block_group);
84130193 6220 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
7a2fcbf7 6221 ext4_mb_free_metadata(handle, &e4b, new_entry);
c9de560d 6222 } else {
7a2fcbf7
AK
6223 /* need to update group_info->bb_free and bitmap
6224 * with group lock held. generate_buddy look at
6225 * them with group lock_held
6226 */
d71c1ae2 6227 if (test_opt(sb, DISCARD)) {
a0154344
DJ
6228 err = ext4_issue_discard(sb, block_group, bit, count,
6229 NULL);
d71c1ae2
LC
6230 if (err && err != -EOPNOTSUPP)
6231 ext4_msg(sb, KERN_WARNING, "discard request in"
a00b482b 6232 " group:%u block:%d count:%lu failed"
d71c1ae2
LC
6233 " with %d", block_group, bit, count,
6234 err);
8f9ff189
LC
6235 } else
6236 EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
d71c1ae2 6237
955ce5f5 6238 ext4_lock_group(sb, block_group);
84130193
TT
6239 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
6240 mb_free_blocks(inode, &e4b, bit, count_clusters);
c9de560d
AT
6241 }
6242
021b65bb
TT
6243 ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
6244 ext4_free_group_clusters_set(sb, gdp, ret);
1df9bde4 6245 ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
feb0ab32 6246 ext4_group_desc_csum_set(sb, block_group, gdp);
955ce5f5 6247 ext4_unlock_group(sb, block_group);
c9de560d 6248
772cb7c8
JS
6249 if (sbi->s_log_groups_per_flex) {
6250 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
90ba983f 6251 atomic64_add(count_clusters,
7c990728
SJS
6252 &sbi_array_rcu_deref(sbi, s_flex_groups,
6253 flex_group)->free_clusters);
772cb7c8
JS
6254 }
6255
9fe67149
EW
6256 /*
6257 * on a bigalloc file system, defer the s_freeclusters_counter
6258 * update to the caller (ext4_remove_space and friends) so they
6259 * can determine if a cluster freed here should be rereserved
6260 */
6261 if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) {
6262 if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
6263 dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
6264 percpu_counter_add(&sbi->s_freeclusters_counter,
6265 count_clusters);
6266 }
7d734532
JK
6267
6268 ext4_mb_unload_buddy(&e4b);
7b415bf6 6269
7a2fcbf7
AK
6270 /* We dirtied the bitmap block */
6271 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
6272 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
6273
c9de560d
AT
6274 /* And the group descriptor block */
6275 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
0390131b 6276 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
c9de560d
AT
6277 if (!err)
6278 err = ret;
6279
6280 if (overflow && !err) {
6281 block += count;
6282 count = overflow;
6283 put_bh(bitmap_bh);
1e1c2b86
LC
6284 /* The range changed so it's no longer validated */
6285 flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
c9de560d
AT
6286 goto do_more;
6287 }
c9de560d
AT
6288error_return:
6289 brelse(bitmap_bh);
6290 ext4_std_error(sb, err);
6291 return;
6292}
7360d173 6293
8ac3939d
RH
6294/**
6295 * ext4_free_blocks() -- Free given blocks and update quota
6296 * @handle: handle for this transaction
6297 * @inode: inode
6298 * @bh: optional buffer of the block to be freed
6299 * @block: starting physical block to be freed
6300 * @count: number of blocks to be freed
6301 * @flags: flags used by ext4_free_blocks
6302 */
6303void ext4_free_blocks(handle_t *handle, struct inode *inode,
6304 struct buffer_head *bh, ext4_fsblk_t block,
6305 unsigned long count, int flags)
6306{
6307 struct super_block *sb = inode->i_sb;
6308 unsigned int overflow;
6309 struct ext4_sb_info *sbi;
6310
6311 sbi = EXT4_SB(sb);
6312
6313 if (sbi->s_mount_state & EXT4_FC_REPLAY) {
6314 ext4_free_blocks_simple(inode, block, count);
6315 return;
6316 }
6317
6318 might_sleep();
6319 if (bh) {
6320 if (block)
6321 BUG_ON(block != bh->b_blocknr);
6322 else
6323 block = bh->b_blocknr;
6324 }
6325
6326 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
6327 !ext4_inode_block_valid(inode, block, count)) {
6328 ext4_error(sb, "Freeing blocks not in datazone - "
6329 "block = %llu, count = %lu", block, count);
6330 return;
6331 }
1e1c2b86 6332 flags |= EXT4_FREE_BLOCKS_VALIDATED;
8ac3939d
RH
6333
6334 ext4_debug("freeing block %llu\n", block);
6335 trace_ext4_free_blocks(inode, block, count, flags);
6336
6337 if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
6338 BUG_ON(count > 1);
6339
6340 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
6341 inode, bh, block);
6342 }
6343
6344 /*
6345 * If the extent to be freed does not begin on a cluster
6346 * boundary, we need to deal with partial clusters at the
6347 * beginning and end of the extent. Normally we will free
6348 * blocks at the beginning or the end unless we are explicitly
6349 * requested to avoid doing so.
6350 */
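/*
 * Illustrative example (hypothetical bigalloc geometry): with a cluster
 * ratio of 16, freeing blocks 20..51 is expanded to blocks 16..63 so
 * that only whole clusters are released; with the
 * EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER/_LAST_CLUSTER flags the partial
 * clusters at either end are trimmed off instead.
 */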
6351 overflow = EXT4_PBLK_COFF(sbi, block);
6352 if (overflow) {
6353 if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
6354 overflow = sbi->s_cluster_ratio - overflow;
6355 block += overflow;
6356 if (count > overflow)
6357 count -= overflow;
6358 else
6359 return;
6360 } else {
6361 block -= overflow;
6362 count += overflow;
6363 }
1e1c2b86
LC
6364 /* The range changed so it's no longer validated */
6365 flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
8ac3939d
RH
6366 }
6367 overflow = EXT4_LBLK_COFF(sbi, count);
6368 if (overflow) {
6369 if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
6370 if (count > overflow)
6371 count -= overflow;
6372 else
6373 return;
6374 } else
6375 count += sbi->s_cluster_ratio - overflow;
1e1c2b86
LC
6376 /* The range changed so it's no longer validated */
6377 flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
8ac3939d
RH
6378 }
6379
6380 if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
6381 int i;
6382 int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
6383
6384 for (i = 0; i < count; i++) {
6385 cond_resched();
6386 if (is_metadata)
6387 bh = sb_find_get_block(inode->i_sb, block + i);
6388 ext4_forget(handle, is_metadata, inode, bh, block + i);
6389 }
6390 }
6391
6392 ext4_mb_clear_bb(handle, inode, block, count, flags);
6393 return;
6394}
6395
2846e820 6396/**
0529155e 6397 * ext4_group_add_blocks() -- Add given blocks to an existing group
2846e820
AG
6398 * @handle: handle to this transaction
6399 * @sb: super block
4907cb7b 6400 * @block: start physical block to add to the block group
2846e820
AG
6401 * @count: number of blocks to add
6402 *
e73a347b 6403 * This marks the blocks as free in the bitmap and buddy.
2846e820 6404 */
cc7365df 6405int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
2846e820
AG
6406 ext4_fsblk_t block, unsigned long count)
6407{
6408 struct buffer_head *bitmap_bh = NULL;
6409 struct buffer_head *gd_bh;
6410 ext4_group_t block_group;
6411 ext4_grpblk_t bit;
6412 unsigned int i;
6413 struct ext4_group_desc *desc;
6414 struct ext4_sb_info *sbi = EXT4_SB(sb);
e73a347b 6415 struct ext4_buddy e4b;
d77147ff 6416 int err = 0, ret, free_clusters_count;
6417 ext4_grpblk_t clusters_freed;
6418 ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block);
6419 ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1);
6420 unsigned long cluster_count = last_cluster - first_cluster + 1;
2846e820
AG
6421
6422 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
6423
4740b830
YY
6424 if (count == 0)
6425 return 0;
6426
2846e820 6427 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
2846e820
AG
6428 /*
6429 * Check to see if we are freeing blocks across a group
6430 * boundary.
6431 */
d77147ff 6432 if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) {
6433 ext4_warning(sb, "too many blocks added to group %u",
cc7365df
YY
6434 block_group);
6435 err = -EINVAL;
2846e820 6436 goto error_return;
cc7365df 6437 }
2cd05cc3 6438
2846e820 6439 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
9008a58e
DW
6440 if (IS_ERR(bitmap_bh)) {
6441 err = PTR_ERR(bitmap_bh);
6442 bitmap_bh = NULL;
2846e820 6443 goto error_return;
cc7365df
YY
6444 }
6445
2846e820 6446 desc = ext4_get_group_desc(sb, block_group, &gd_bh);
cc7365df
YY
6447 if (!desc) {
6448 err = -EIO;
2846e820 6449 goto error_return;
cc7365df 6450 }
2846e820 6451
a00b482b 6452 if (!ext4_sb_block_valid(sb, NULL, block, count)) {
2846e820
AG
6453 ext4_error(sb, "Adding blocks in system zones - "
6454 "Block = %llu, count = %lu",
6455 block, count);
cc7365df 6456 err = -EINVAL;
2846e820
AG
6457 goto error_return;
6458 }
6459
2cd05cc3 6460 BUFFER_TRACE(bitmap_bh, "getting write access");
188c299e
JK
6461 err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
6462 EXT4_JTR_NONE);
2846e820
AG
6463 if (err)
6464 goto error_return;
6465
6466 /*
6467 * We are about to modify some metadata. Call the journal APIs
6468 * to unshare ->b_data if a currently-committing transaction is
6469 * using it
6470 */
6471 BUFFER_TRACE(gd_bh, "get_write_access");
188c299e 6472 err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE);
6473 if (err)
6474 goto error_return;
e73a347b 6475
d77147ff 6476 for (i = 0, clusters_freed = 0; i < cluster_count; i++) {
2846e820 6477 BUFFER_TRACE(bitmap_bh, "clear bit");
e73a347b 6478 if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
6479 ext4_error(sb, "bit already cleared for block %llu",
6480 (ext4_fsblk_t)(block + i));
6481 BUFFER_TRACE(bitmap_bh, "bit already cleared");
6482 } else {
d77147ff 6483 clusters_freed++;
6484 }
6485 }
6486
6487 err = ext4_mb_load_buddy(sb, block_group, &e4b);
6488 if (err)
6489 goto error_return;
6490
6491 /*
 6492	 * We need to update group_info->bb_free and the bitmap
 6493	 * with the group lock held; ext4_mb_generate_buddy() looks at
 6494	 * them with the group lock held.
6495 */
2846e820 6496 ext4_lock_group(sb, block_group);
d77147ff 6497 mb_clear_bits(bitmap_bh->b_data, bit, cluster_count);
6498 mb_free_blocks(NULL, &e4b, bit, cluster_count);
6499 free_clusters_count = clusters_freed +
6500 ext4_free_group_clusters(sb, desc);
6501 ext4_free_group_clusters_set(sb, desc, free_clusters_count);
1df9bde4 6502 ext4_block_bitmap_csum_set(sb, desc, bitmap_bh);
feb0ab32 6503 ext4_group_desc_csum_set(sb, block_group, desc);
2846e820 6504 ext4_unlock_group(sb, block_group);
57042651 6505 percpu_counter_add(&sbi->s_freeclusters_counter,
d77147ff 6506 clusters_freed);
6507
6508 if (sbi->s_log_groups_per_flex) {
6509 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
d77147ff 6510 atomic64_add(clusters_freed,
6511 &sbi_array_rcu_deref(sbi, s_flex_groups,
6512 flex_group)->free_clusters);
2846e820 6513 }
6514
6515 ext4_mb_unload_buddy(&e4b);
6516
6517 /* We dirtied the bitmap block */
6518 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
6519 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
6520
6521 /* And the group descriptor block */
6522 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
6523 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
6524 if (!err)
6525 err = ret;
6526
6527error_return:
6528 brelse(bitmap_bh);
6529 ext4_std_error(sb, err);
cc7365df 6530 return err;
6531}
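
/*
 * A minimal caller sketch (not part of this file): how a resize-style path
 * might hand a newly available physical range back to mballoc inside a
 * running transaction.  The handle type and credit count below are
 * illustrative assumptions, not requirements taken from this code.
 */
static int example_return_blocks_to_group(struct super_block *sb,
					   ext4_fsblk_t first_block,
					   unsigned long nr_blocks)
{
	handle_t *handle;
	int err;

	/* One transaction covering the block bitmap and group descriptor. */
	handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, 4);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* Mark the range free in the on-disk bitmap and the buddy cache. */
	err = ext4_group_add_blocks(handle, sb, first_block, nr_blocks);

	ext4_journal_stop(handle);
	return err;
}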
6532
6533/**
6534 * ext4_trim_extent -- function to TRIM one single free extent in the group
6535 * @sb: super block for the file system
6536 * @start: starting block of the free extent in the alloc. group
6537 * @count: number of blocks to TRIM
6538 * @e4b: ext4 buddy for the group
6539 *
 6540 * Trim "count" blocks starting at "start" in the "group". To ensure that no
 6541 * one can allocate those blocks while they are being trimmed, they are first
 6542 * marked as used in the buddy bitmap. This must be called under the group lock.
6543 */
6544static int ext4_trim_extent(struct super_block *sb,
6545 int start, int count, struct ext4_buddy *e4b)
e2cbd587 6546__releases(bitlock)
6547__acquires(bitlock)
6548{
6549 struct ext4_free_extent ex;
bd2eea8d 6550 ext4_group_t group = e4b->bd_group;
d71c1ae2 6551 int ret = 0;
7360d173 6552
6553 trace_ext4_trim_extent(sb, group, start, count);
6554
6555 assert_spin_locked(ext4_group_lock_ptr(sb, group));
6556
6557 ex.fe_start = start;
6558 ex.fe_group = group;
6559 ex.fe_len = count;
6560
6561 /*
6562 * Mark blocks used, so no one can reuse them while
6563 * being trimmed.
6564 */
6565 mb_mark_used(e4b, &ex);
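	/*
	 * The extent was marked used above, so the group lock can be
	 * dropped around the (potentially blocking) discard below without
	 * anyone being able to allocate from this range in the meantime.
	 */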
6566 ext4_unlock_group(sb, group);
a0154344 6567 ret = ext4_issue_discard(sb, group, start, count, NULL);
6568 ext4_lock_group(sb, group);
6569 mb_free_blocks(NULL, e4b, start, ex.fe_len);
d71c1ae2 6570 return ret;
6571}
6572
6573static int ext4_try_to_trim_range(struct super_block *sb,
6574 struct ext4_buddy *e4b, ext4_grpblk_t start,
6575 ext4_grpblk_t max, ext4_grpblk_t minblocks)
6576__acquires(ext4_group_lock_ptr(sb, e4b->bd_group))
6577__releases(ext4_group_lock_ptr(sb, e4b->bd_group))
6578{
6579 ext4_grpblk_t next, count, free_count;
6580 void *bitmap;
6581
6582 bitmap = e4b->bd_bitmap;
6583 start = (e4b->bd_info->bb_first_free > start) ?
6584 e4b->bd_info->bb_first_free : start;
6585 count = 0;
6586 free_count = 0;
6587
6588 while (start <= max) {
6589 start = mb_find_next_zero_bit(bitmap, max + 1, start);
6590 if (start > max)
6591 break;
6592 next = mb_find_next_bit(bitmap, max + 1, start);
6593
6594 if ((next - start) >= minblocks) {
6595 int ret = ext4_trim_extent(sb, start, next - start, e4b);
6596
6597 if (ret && ret != -EOPNOTSUPP)
6598 break;
6599 count += next - start;
6600 }
6601 free_count += next - start;
6602 start = next + 1;
6603
6604 if (fatal_signal_pending(current)) {
6605 count = -ERESTARTSYS;
6606 break;
6607 }
6608
6609 if (need_resched()) {
6610 ext4_unlock_group(sb, e4b->bd_group);
6611 cond_resched();
6612 ext4_lock_group(sb, e4b->bd_group);
6613 }
6614
6615 if ((e4b->bd_info->bb_free - free_count) < minblocks)
6616 break;
6617 }
6618
6619 return count;
6620}
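
/*
 * Illustrative, self-contained userspace sketch (an assumption, not kernel
 * code) of the scan pattern used by ext4_try_to_trim_range() above: walk a
 * bitmap, find each run of zero bits of at least min_len, and report it.
 * The helper names here are made up for the example; in the kernel the
 * equivalent lookups are mb_find_next_zero_bit()/mb_find_next_bit().
 */
#include <stdio.h>

/* Return the first position >= start whose bit equals want, or size. */
static unsigned int find_bit_from(const unsigned char *map, unsigned int size,
				  unsigned int start, int want)
{
	while (start < size && ((map[start / 8] >> (start % 8)) & 1) != want)
		start++;
	return start;
}

static void report_free_runs(const unsigned char *map, unsigned int size,
			     unsigned int min_len)
{
	unsigned int start = 0, next;

	while (start < size) {
		start = find_bit_from(map, size, start, 0);
		if (start >= size)
			break;
		next = find_bit_from(map, size, start, 1);
		if (next - start >= min_len)
			printf("free run: [%u, %u)\n", start, next);
		start = next + 1;
	}
}

int main(void)
{
	/* 0 = free, 1 = in use; bit 0 of byte 0 is block 0. */
	unsigned char bitmap[2] = { 0x0f, 0xf0 };	/* blocks 4..11 free */

	report_free_runs(bitmap, 16, 4);
	return 0;
}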
6621
6622/**
6623 * ext4_trim_all_free -- function to trim all free space in alloc. group
6624 * @sb: super block for file system
22612283 6625 * @group: group to be trimmed
6626 * @start: first group block to examine
6627 * @max: last group block to examine
6628 * @minblocks: minimum extent block count
d63c00ea 6629 * @set_trimmed: set the trimmed flag if at least one block is trimmed
7360d173 6630 *
 6631 * ext4_trim_all_free walks through the group's block bitmap searching for free
 6632 * extents. When a free extent is found, it is marked as used in the group buddy
 6633 * bitmap, a TRIM command is issued on the extent, and the extent is then freed
b6f5558c 6634 * again in the group buddy bitmap.
7360d173 6635 */
0b75a840 6636static ext4_grpblk_t
6637ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
6638 ext4_grpblk_t start, ext4_grpblk_t max,
d63c00ea 6639 ext4_grpblk_t minblocks, bool set_trimmed)
7360d173 6640{
78944086 6641 struct ext4_buddy e4b;
6920b391 6642 int ret;
7360d173 6643
6644 trace_ext4_trim_all_free(sb, group, start, max);
6645
6646 ret = ext4_mb_load_buddy(sb, group, &e4b);
6647 if (ret) {
6648 ext4_warning(sb, "Error %d loading buddy information for %u",
6649 ret, group);
6650 return ret;
6651 }
6652
6653 ext4_lock_group(sb, group);
7360d173 6654
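	/*
	 * Skip the scan if this group has already been trimmed (its TRIMMED
	 * flag is set) with a minimum extent size no larger than the one
	 * requested now; repeating the trim with an equal or larger minlen
	 * cannot discard anything new.
	 */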
6920b391 6655 if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) ||
2327fb2e 6656 minblocks < EXT4_SB(sb)->s_last_trim_minblks) {
6920b391 6657 ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
d63c00ea 6658 if (ret >= 0 && set_trimmed)
6659 EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
6660 } else {
6661 ret = 0;
7360d173 6662 }
3d56b8d2 6663
7360d173 6664 ext4_unlock_group(sb, group);
78944086 6665 ext4_mb_unload_buddy(&e4b);
6666
6667 ext4_debug("trimmed %d blocks in the group %d\n",
6920b391 6668 ret, group);
7360d173 6669
d71c1ae2 6670 return ret;
6671}
6672
6673/**
 6674 * ext4_trim_fs() -- trim ioctl handler function
6675 * @sb: superblock for filesystem
6676 * @range: fstrim_range structure
6677 *
 6678 * start: first byte to trim
 6679 * len: number of bytes to trim from start
 6680 * minlen: minimum extent length in bytes
 6681 * ext4_trim_fs goes through all the allocation groups containing bytes from
 6682 * start to start+len. For each such group, ext4_trim_all_free() is invoked
 6683 * to trim all free space.
6684 */
6685int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
6686{
7b47ef52 6687 unsigned int discard_granularity = bdev_discard_granularity(sb->s_bdev);
78944086 6688 struct ext4_group_info *grp;
913eed83 6689 ext4_group_t group, first_group, last_group;
7137d7a4 6690 ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
913eed83 6691 uint64_t start, end, minlen, trimmed = 0;
6692 ext4_fsblk_t first_data_blk =
6693 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
913eed83 6694 ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
d63c00ea 6695 bool whole_group, eof = false;
6696 int ret = 0;
6697
6698 start = range->start >> sb->s_blocksize_bits;
913eed83 6699 end = start + (range->len >> sb->s_blocksize_bits) - 1;
6700 minlen = EXT4_NUM_B2C(EXT4_SB(sb),
6701 range->minlen >> sb->s_blocksize_bits);
7360d173 6702
6703 if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
6704 start >= max_blks ||
6705 range->len < sb->s_blocksize)
7360d173 6706 return -EINVAL;
173b6e38 6707	/* No point in trying to trim less than the discard granularity */
7b47ef52 6708 if (range->minlen < discard_granularity) {
173b6e38 6709 minlen = EXT4_NUM_B2C(EXT4_SB(sb),
7b47ef52 6710 discard_granularity >> sb->s_blocksize_bits);
6711 if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
6712 goto out;
6713 }
d63c00ea 6714 if (end >= max_blks - 1) {
913eed83 6715 end = max_blks - 1;
6716 eof = true;
6717 }
913eed83 6718 if (end <= first_data_blk)
22f10457 6719 goto out;
913eed83 6720 if (start < first_data_blk)
0f0a25bf 6721 start = first_data_blk;
7360d173 6722
913eed83 6723 /* Determine first and last group to examine based on start and end */
7360d173 6724 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
7137d7a4 6725 &first_group, &first_cluster);
913eed83 6726 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
7137d7a4 6727 &last_group, &last_cluster);
7360d173 6728
6729 /* end now represents the last cluster to discard in this group */
6730 end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
d63c00ea 6731 whole_group = true;
6732
6733 for (group = first_group; group <= last_group; group++) {
78944086 6734 grp = ext4_get_group_info(sb, group);
6735 if (!grp)
6736 continue;
6737 /* We only do this if the grp has never been initialized */
6738 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
adb7ef60 6739 ret = ext4_mb_init_group(sb, group, GFP_NOFS);
6740 if (ret)
6741 break;
6742 }
6743
0ba08517 6744 /*
 6745	 * For all the groups except the last one, the last cluster will
 6746	 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
 6747	 * change it for the last group; note that last_cluster was
 6748	 * already computed earlier by ext4_get_group_no_and_offset().
0ba08517 6749 */
d63c00ea 6750 if (group == last_group) {
913eed83 6751 end = last_cluster;
6752 whole_group = eof ? true : end == EXT4_CLUSTERS_PER_GROUP(sb) - 1;
6753 }
78944086 6754 if (grp->bb_free >= minlen) {
7137d7a4 6755 cnt = ext4_trim_all_free(sb, group, first_cluster,
d63c00ea 6756 end, minlen, whole_group);
6757 if (cnt < 0) {
6758 ret = cnt;
6759 break;
6760 }
21e7fd22 6761 trimmed += cnt;
7360d173 6762 }
6763
6764 /*
6765 * For every group except the first one, we are sure
6766 * that the first cluster to discard will be cluster #0.
6767 */
7137d7a4 6768 first_cluster = 0;
7360d173 6769 }
7360d173 6770
3d56b8d2 6771 if (!ret)
2327fb2e 6772 EXT4_SB(sb)->s_last_trim_minblks = minlen;
3d56b8d2 6773
22f10457 6774out:
aaf7d73e 6775 range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
6776 return ret;
6777}
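
/*
 * Hedged userspace sketch: ext4_trim_fs() is reached through the FITRIM
 * ioctl, so a caller fills in a byte-based struct fstrim_range and issues
 * the ioctl on any file descriptor within the mounted filesystem.  The
 * mount point path below is an assumption for the example.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

int main(void)
{
	struct fstrim_range range;
	int fd = open("/mnt/ext4", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&range, 0, sizeof(range));
	range.start = 0;
	range.len = (__u64)-1;	/* trim the whole filesystem */
	range.minlen = 0;	/* raised to the discard granularity above */

	if (ioctl(fd, FITRIM, &range) < 0)
		perror("FITRIM");
	else			/* on return, len holds the bytes trimmed */
		printf("trimmed %llu bytes\n", (unsigned long long)range.len);

	close(fd);
	return 0;
}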
6778
6779/* Iterate all the free extents in the group. */
6780int
6781ext4_mballoc_query_range(
6782 struct super_block *sb,
6783 ext4_group_t group,
6784 ext4_grpblk_t start,
6785 ext4_grpblk_t end,
6786 ext4_mballoc_query_range_fn formatter,
6787 void *priv)
6788{
6789 void *bitmap;
6790 ext4_grpblk_t next;
6791 struct ext4_buddy e4b;
6792 int error;
6793
6794 error = ext4_mb_load_buddy(sb, group, &e4b);
6795 if (error)
6796 return error;
6797 bitmap = e4b.bd_bitmap;
6798
6799 ext4_lock_group(sb, group);
6800
6801 start = (e4b.bd_info->bb_first_free > start) ?
6802 e4b.bd_info->bb_first_free : start;
6803 if (end >= EXT4_CLUSTERS_PER_GROUP(sb))
6804 end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
6805
6806 while (start <= end) {
6807 start = mb_find_next_zero_bit(bitmap, end + 1, start);
6808 if (start > end)
6809 break;
6810 next = mb_find_next_bit(bitmap, end + 1, start);
6811
6812 ext4_unlock_group(sb, group);
6813 error = formatter(sb, group, start, next - start, priv);
6814 if (error)
6815 goto out_unload;
6816 ext4_lock_group(sb, group);
6817
6818 start = next + 1;
6819 }
6820
6821 ext4_unlock_group(sb, group);
6822out_unload:
6823 ext4_mb_unload_buddy(&e4b);
6824
6825 return error;
6826}
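
/*
 * Hedged in-kernel sketch of a formatter callback for
 * ext4_mballoc_query_range(): it is called once per free extent with the
 * group-relative start and length, and a non-zero return stops the walk.
 * The callback, its caller, and the message wording are illustrative
 * assumptions, not code from this file.
 */
static int example_count_free_extent(struct super_block *sb,
				     ext4_group_t group,
				     ext4_grpblk_t start,
				     ext4_grpblk_t len,
				     void *priv)
{
	ext4_grpblk_t *total = priv;

	*total += len;
	ext4_msg(sb, KERN_INFO, "group %u: free extent at %d, %d clusters",
		 group, start, len);
	return 0;
}

static void example_dump_group_free_space(struct super_block *sb,
					  ext4_group_t group)
{
	ext4_grpblk_t total = 0;

	ext4_mballoc_query_range(sb, group, 0,
				 EXT4_CLUSTERS_PER_GROUP(sb) - 1,
				 example_count_free_extent, &total);
	ext4_msg(sb, KERN_INFO, "group %u: %d free clusters found",
		 group, total);
}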