1/*
2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3 * Written by Alex Tomas <alex@clusterfs.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public Licens
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-
17 */
18
19
20/*
21 * mballoc.c contains the multiblocks allocation routines
22 */
23
24#include "ext4_jbd2.h"
25#include "mballoc.h"
26#include <linux/log2.h>
27#include <linux/module.h>
28#include <linux/slab.h>
29#include <trace/events/ext4.h>
30
31#ifdef CONFIG_EXT4_DEBUG
32ushort ext4_mballoc_debug __read_mostly;
33
34module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644);
35MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
36#endif
37
38/*
39 * MUSTDO:
40 *   - test ext4_ext_search_left() and ext4_ext_search_right()
41 *   - search for metadata in few groups
42 *
43 * TODO v4:
44 *   - normalization should take into account whether file is still open
45 *   - discard preallocations if no free space left (policy?)
46 *   - don't normalize tails
47 *   - quota
48 *   - reservation for superuser
49 *
50 * TODO v3:
51 *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
52 *   - track min/max extents in each group for better group selection
53 *   - mb_mark_used() may allocate chunk right after splitting buddy
54 *   - tree of groups sorted by number of free blocks
55 *   - error handling
56 */
57
58/*
59 * The allocation request involve request for multiple number of blocks
60 * near to the goal(block) value specified.
61 *
62 * During initialization phase of the allocator we decide to use the
63 * group preallocation or inode preallocation depending on the size of
64 * the file. The size of the file could be the resulting file size we
65 * would have after allocation, or the current file size, which ever
66 * is larger. If the size is less than sbi->s_mb_stream_request we
67 * select to use the group preallocation. The default value of
68 * s_mb_stream_request is 16 blocks. This can also be tuned via
69 * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
70 * terms of number of blocks.
71 *
72 * The main motivation for having small file use group preallocation is to
73 * ensure that we have small files closer together on the disk.
74 *
75 * First stage the allocator looks at the inode prealloc list,
76 * ext4_inode_info->i_prealloc_list, which contains list of prealloc
77 * spaces for this particular inode. The inode prealloc space is
78 * represented as:
79 *
80 * pa_lstart -> the logical start block for this prealloc space
81 * pa_pstart -> the physical start block for this prealloc space
82 * pa_len    -> length for this prealloc space (in clusters)
83 * pa_free   ->  free space available in this prealloc space (in clusters)
84 *
85 * The inode preallocation space is used looking at the _logical_ start
86 * block. If only the logical file block falls within the range of prealloc
87 * space we will consume the particular prealloc space. This makes sure that
88 * we have contiguous physical blocks representing the file blocks
89 *
90 * The important thing to be noted in case of inode prealloc space is that
91 * we don't modify the values associated to inode prealloc space except
92 * pa_free.
93 *
94 * If we are not able to find blocks in the inode prealloc space and if we
95 * have the group allocation flag set then we look at the locality group
96 * prealloc space. These are per CPU prealloc list represented as
97 *
98 * ext4_sb_info.s_locality_groups[smp_processor_id()]
99 *
100 * The reason for having a per cpu locality group is to reduce the contention
101 * between CPUs. It is possible to get scheduled at this point.
102 *
103 * The locality group prealloc space is used looking at whether we have
104 * enough free space (pa_free) within the prealloc space.
105 *
106 * If we can't allocate blocks via inode prealloc or/and locality group
107 * prealloc then we look at the buddy cache. The buddy cache is represented
108 * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
109 * mapped to the buddy and bitmap information regarding different
110 * groups. The buddy information is attached to buddy cache inode so that
111 * we can access them through the page cache. The information regarding
112 * each group is loaded via ext4_mb_load_buddy.  The information involve
113 * block bitmap and buddy information. The information are stored in the
114 * inode as:
115 *
116 *  {                        page                        }
117 *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
118 *
119 *
120 * one block each for bitmap and buddy information.  So for each group we
121 * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE /
122 * blocksize) blocks.  So it can have information regarding groups_per_page
123 * which is blocks_per_page/2
124 *
125 * The buddy cache inode is not stored on disk. The inode is thrown
126 * away when the filesystem is unmounted.
127 *
128 * We look for count number of blocks in the buddy cache. If we were able
129 * to locate that many free blocks we return with additional information
130 * regarding rest of the contiguous physical block available
131 *
132 * Before allocating blocks via buddy cache we normalize the request
133 * blocks. This ensure we ask for more blocks that we needed. The extra
134 * blocks that we get after allocation is added to the respective prealloc
135 * list. In case of inode preallocation we follow a list of heuristics
136 * based on file size. This can be found in ext4_mb_normalize_request. If
137 * we are doing a group prealloc we try to normalize the request to
138 * sbi->s_mb_group_prealloc.  The default value of s_mb_group_prealloc is
139 * dependent on the cluster size; for non-bigalloc file systems, it is
140 * 512 blocks. This can be tuned via
141 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
142 * terms of number of blocks. If we have mounted the file system with -O
143 * stripe=<value> option the group prealloc request is normalized to the
144 * the smallest multiple of the stripe value (sbi->s_stripe) which is
145 * greater than the default mb_group_prealloc.
146 *
147 * The regular allocator (using the buddy cache) supports a few tunables.
148 *
149 * /sys/fs/ext4/<partition>/mb_min_to_scan
150 * /sys/fs/ext4/<partition>/mb_max_to_scan
151 * /sys/fs/ext4/<partition>/mb_order2_req
152 *
153 * The regular allocator uses buddy scan only if the request len is power of
154 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
155 * value of s_mb_order2_reqs can be tuned via
156 * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
157 * stripe size (sbi->s_stripe), we try to search for contiguous block in
158 * stripe size. This should result in better allocation on RAID setups. If
159 * not, we search in the specific group using bitmap for best extents. The
160 * tunable min_to_scan and max_to_scan control the behaviour here.
161 * min_to_scan indicate how long the mballoc __must__ look for a best
162 * extent and max_to_scan indicates how long the mballoc __can__ look for a
163 * best extent in the found extents. Searching for the blocks starts with
164 * the group specified as the goal value in allocation context via
165 * ac_g_ex. Each group is first checked based on the criteria whether it
166 * can be used for allocation. ext4_mb_good_group explains how the groups are
167 * checked.
168 *
169 * Both the prealloc space are getting populated as above. So for the first
170 * request we will hit the buddy cache which will result in this prealloc
171 * space getting filled. The prealloc space is then later used for the
172 * subsequent request.
173 */
174
175/*
176 * mballoc operates on the following data:
177 *  - on-disk bitmap
178 *  - in-core buddy (actually includes buddy and bitmap)
179 *  - preallocation descriptors (PAs)
180 *
181 * there are two types of preallocations:
182 *  - inode
183 *    assiged to specific inode and can be used for this inode only.
184 *    it describes part of inode's space preallocated to specific
185 *    physical blocks. any block from that preallocated can be used
186 *    independent. the descriptor just tracks number of blocks left
187 *    unused. so, before taking some block from descriptor, one must
188 *    make sure corresponded logical block isn't allocated yet. this
189 *    also means that freeing any block within descriptor's range
190 *    must discard all preallocated blocks.
191 *  - locality group
192 *    assigned to specific locality group which does not translate to
193 *    permanent set of inodes: inode can join and leave group. space
194 *    from this type of preallocation can be used for any inode. thus
195 *    it's consumed from the beginning to the end.
196 *
197 * relation between them can be expressed as:
198 *    in-core buddy = on-disk bitmap + preallocation descriptors
199 *
200 * this mean blocks mballoc considers used are:
201 *  - allocated blocks (persistent)
202 *  - preallocated blocks (non-persistent)
203 *
204 * consistency in mballoc world means that at any time a block is either
205 * free or used in ALL structures. notice: "any time" should not be read
206 * literally -- time is discrete and delimited by locks.
207 *
208 *  to keep it simple, we don't use block numbers, instead we count number of
209 *  blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
210 *
211 * all operations can be expressed as:
212 *  - init buddy:			buddy = on-disk + PAs
213 *  - new PA:				buddy += N; PA = N
214 *  - use inode PA:			on-disk += N; PA -= N
215 *  - discard inode PA			buddy -= on-disk - PA; PA = 0
216 *  - use locality group PA		on-disk += N; PA -= N
217 *  - discard locality group PA		buddy -= PA; PA = 0
218 *  note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
219 *        is used in real operation because we can't know actual used
220 *        bits from PA, only from on-disk bitmap
221 *
222 * if we follow this strict logic, then all operations above should be atomic.
223 * given some of them can block, we'd have to use something like semaphores
224 * killing performance on high-end SMP hardware. let's try to relax it using
225 * the following knowledge:
226 *  1) if buddy is referenced, it's already initialized
227 *  2) while block is used in buddy and the buddy is referenced,
228 *     nobody can re-allocate that block
229 *  3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
230 *     bit set and PA claims same block, it's OK. IOW, one can set bit in
231 *     on-disk bitmap if buddy has same bit set or/and PA covers corresponded
232 *     block
233 *
234 * so, now we're building a concurrency table:
235 *  - init buddy vs.
236 *    - new PA
237 *      blocks for PA are allocated in the buddy, buddy must be referenced
238 *      until PA is linked to allocation group to avoid concurrent buddy init
239 *    - use inode PA
240 *      we need to make sure that either on-disk bitmap or PA has uptodate data
241 *      given (3) we care that PA-=N operation doesn't interfere with init
242 *    - discard inode PA
243 *      the simplest way would be to have buddy initialized by the discard
244 *    - use locality group PA
245 *      again PA-=N must be serialized with init
246 *    - discard locality group PA
247 *      the simplest way would be to have buddy initialized by the discard
248 *  - new PA vs.
249 *    - use inode PA
250 *      i_data_sem serializes them
251 *    - discard inode PA
252 *      discard process must wait until PA isn't used by another process
253 *    - use locality group PA
254 *      some mutex should serialize them
255 *    - discard locality group PA
256 *      discard process must wait until PA isn't used by another process
257 *  - use inode PA
258 *    - use inode PA
259 *      i_data_sem or another mutex should serializes them
260 *    - discard inode PA
261 *      discard process must wait until PA isn't used by another process
262 *    - use locality group PA
263 *      nothing wrong here -- they're different PAs covering different blocks
264 *    - discard locality group PA
265 *      discard process must wait until PA isn't used by another process
266 *
267 * now we're ready to make few consequences:
268 *  - PA is referenced and while it is no discard is possible
269 *  - PA is referenced until block isn't marked in on-disk bitmap
270 *  - PA changes only after on-disk bitmap
271 *  - discard must not compete with init. either init is done before
272 *    any discard or they're serialized somehow
273 *  - buddy init as sum of on-disk bitmap and PAs is done atomically
274 *
275 * a special case when we've used PA to emptiness. no need to modify buddy
276 * in this case, but we should care about concurrent init
277 *
278 */
279
280 /*
281 * Logic in few words:
282 *
283 *  - allocation:
284 *    load group
285 *    find blocks
286 *    mark bits in on-disk bitmap
287 *    release group
288 *
289 *  - use preallocation:
290 *    find proper PA (per-inode or group)
291 *    load group
292 *    mark bits in on-disk bitmap
293 *    release group
294 *    release PA
295 *
296 *  - free:
297 *    load group
298 *    mark bits in on-disk bitmap
299 *    release group
300 *
301 *  - discard preallocations in group:
302 *    mark PAs deleted
303 *    move them onto local list
304 *    load on-disk bitmap
305 *    load group
306 *    remove PA from object (inode or locality group)
307 *    mark free blocks in-core
308 *
309 *  - discard inode's preallocations:
310 */
311
312/*
313 * Locking rules
314 *
315 * Locks:
316 *  - bitlock on a group	(group)
317 *  - object (inode/locality)	(object)
318 *  - per-pa lock		(pa)
319 *
320 * Paths:
321 *  - new pa
322 *    object
323 *    group
324 *
325 *  - find and use pa:
326 *    pa
327 *
328 *  - release consumed pa:
329 *    pa
330 *    group
331 *    object
332 *
333 *  - generate in-core bitmap:
334 *    group
335 *        pa
336 *
337 *  - discard all for given object (inode, locality group):
338 *    object
339 *        pa
340 *    group
341 *
342 *  - discard all for given group:
343 *    group
344 *        pa
345 *    group
346 *        object
347 *
348 */
349static struct kmem_cache *ext4_pspace_cachep;
350static struct kmem_cache *ext4_ac_cachep;
351static struct kmem_cache *ext4_free_data_cachep;
352
353/* We create slab caches for groupinfo data structures based on the
354 * superblock block size.  There will be one per mounted filesystem for
355 * each unique s_blocksize_bits */
356#define NR_GRPINFO_CACHES 8
357static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
358
359static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
360	"ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
361	"ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
362	"ext4_groupinfo_64k", "ext4_groupinfo_128k"
363};
364
365static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
366					ext4_group_t group);
367static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
368						ext4_group_t group);
369static void ext4_free_data_callback(struct super_block *sb,
370				struct ext4_journal_cb_entry *jce, int rc);
371
372static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
373{
374#if BITS_PER_LONG == 64
375	*bit += ((unsigned long) addr & 7UL) << 3;
376	addr = (void *) ((unsigned long) addr & ~7UL);
377#elif BITS_PER_LONG == 32
378	*bit += ((unsigned long) addr & 3UL) << 3;
379	addr = (void *) ((unsigned long) addr & ~3UL);
380#else
381#error "how many bits you are?!"
382#endif
383	return addr;
384}
385
386static inline int mb_test_bit(int bit, void *addr)
387{
388	/*
389	 * ext4_test_bit on architecture like powerpc
390	 * needs unsigned long aligned address
391	 */
392	addr = mb_correct_addr_and_bit(&bit, addr);
393	return ext4_test_bit(bit, addr);
394}
395
396static inline void mb_set_bit(int bit, void *addr)
397{
398	addr = mb_correct_addr_and_bit(&bit, addr);
399	ext4_set_bit(bit, addr);
400}
401
402static inline void mb_clear_bit(int bit, void *addr)
403{
404	addr = mb_correct_addr_and_bit(&bit, addr);
405	ext4_clear_bit(bit, addr);
406}
407
408static inline int mb_test_and_clear_bit(int bit, void *addr)
409{
410	addr = mb_correct_addr_and_bit(&bit, addr);
411	return ext4_test_and_clear_bit(bit, addr);
412}
413
414static inline int mb_find_next_zero_bit(void *addr, int max, int start)
415{
416	int fix = 0, ret, tmpmax;
417	addr = mb_correct_addr_and_bit(&fix, addr);
418	tmpmax = max + fix;
419	start += fix;
420
421	ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
422	if (ret > max)
423		return max;
424	return ret;
425}
426
427static inline int mb_find_next_bit(void *addr, int max, int start)
428{
429	int fix = 0, ret, tmpmax;
430	addr = mb_correct_addr_and_bit(&fix, addr);
431	tmpmax = max + fix;
432	start += fix;
433
434	ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
435	if (ret > max)
436		return max;
437	return ret;
438}
439
440static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
441{
442	char *bb;
443
444	BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
445	BUG_ON(max == NULL);
446
447	if (order > e4b->bd_blkbits + 1) {
448		*max = 0;
449		return NULL;
450	}
451
452	/* at order 0 we see each particular block */
453	if (order == 0) {
454		*max = 1 << (e4b->bd_blkbits + 3);
455		return e4b->bd_bitmap;
456	}
457
458	bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
459	*max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
460
461	return bb;
462}
463
464#ifdef DOUBLE_CHECK
465static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
466			   int first, int count)
467{
468	int i;
469	struct super_block *sb = e4b->bd_sb;
470
471	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
472		return;
473	assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
474	for (i = 0; i < count; i++) {
475		if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
476			ext4_fsblk_t blocknr;
477
478			blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
479			blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
480			ext4_grp_locked_error(sb, e4b->bd_group,
481					      inode ? inode->i_ino : 0,
482					      blocknr,
483					      "freeing block already freed "
484					      "(bit %u)",
485					      first + i);
486		}
487		mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
488	}
489}
490
491static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
492{
493	int i;
494
495	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
496		return;
497	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
498	for (i = 0; i < count; i++) {
499		BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
500		mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
501	}
502}
503
504static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
505{
506	if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
507		unsigned char *b1, *b2;
508		int i;
509		b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
510		b2 = (unsigned char *) bitmap;
511		for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
512			if (b1[i] != b2[i]) {
513				ext4_msg(e4b->bd_sb, KERN_ERR,
514					 "corruption in group %u "
515					 "at byte %u(%u): %x in copy != %x "
516					 "on disk/prealloc",
517					 e4b->bd_group, i, i * 8, b1[i], b2[i]);
518				BUG();
519			}
520		}
521	}
522}
523
524#else
525static inline void mb_free_blocks_double(struct inode *inode,
526				struct ext4_buddy *e4b, int first, int count)
527{
528	return;
529}
530static inline void mb_mark_used_double(struct ext4_buddy *e4b,
531						int first, int count)
532{
533	return;
534}
535static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
536{
537	return;
538}
539#endif
540
541#ifdef AGGRESSIVE_CHECK
542
543#define MB_CHECK_ASSERT(assert)						\
544do {									\
545	if (!(assert)) {						\
546		printk(KERN_EMERG					\
547			"Assertion failure in %s() at %s:%d: \"%s\"\n",	\
548			function, file, line, # assert);		\
549		BUG();							\
550	}								\
551} while (0)
552
553static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
554				const char *function, int line)
555{
556	struct super_block *sb = e4b->bd_sb;
557	int order = e4b->bd_blkbits + 1;
558	int max;
559	int max2;
560	int i;
561	int j;
562	int k;
563	int count;
564	struct ext4_group_info *grp;
565	int fragments = 0;
566	int fstart;
567	struct list_head *cur;
568	void *buddy;
569	void *buddy2;
570
571	{
572		static int mb_check_counter;
573		if (mb_check_counter++ % 100 != 0)
574			return 0;
575	}
576
577	while (order > 1) {
578		buddy = mb_find_buddy(e4b, order, &max);
579		MB_CHECK_ASSERT(buddy);
580		buddy2 = mb_find_buddy(e4b, order - 1, &max2);
581		MB_CHECK_ASSERT(buddy2);
582		MB_CHECK_ASSERT(buddy != buddy2);
583		MB_CHECK_ASSERT(max * 2 == max2);
584
585		count = 0;
586		for (i = 0; i < max; i++) {
587
588			if (mb_test_bit(i, buddy)) {
589				/* only single bit in buddy2 may be 1 */
590				if (!mb_test_bit(i << 1, buddy2)) {
591					MB_CHECK_ASSERT(
592						mb_test_bit((i<<1)+1, buddy2));
593				} else if (!mb_test_bit((i << 1) + 1, buddy2)) {
594					MB_CHECK_ASSERT(
595						mb_test_bit(i << 1, buddy2));
596				}
597				continue;
598			}
599
600			/* both bits in buddy2 must be 1 */
601			MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
602			MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
603
604			for (j = 0; j < (1 << order); j++) {
605				k = (i * (1 << order)) + j;
606				MB_CHECK_ASSERT(
607					!mb_test_bit(k, e4b->bd_bitmap));
608			}
609			count++;
610		}
611		MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
612		order--;
613	}
614
615	fstart = -1;
616	buddy = mb_find_buddy(e4b, 0, &max);
617	for (i = 0; i < max; i++) {
618		if (!mb_test_bit(i, buddy)) {
619			MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
620			if (fstart == -1) {
621				fragments++;
622				fstart = i;
623			}
624			continue;
625		}
626		fstart = -1;
627		/* check used bits only */
628		for (j = 0; j < e4b->bd_blkbits + 1; j++) {
629			buddy2 = mb_find_buddy(e4b, j, &max2);
630			k = i >> j;
631			MB_CHECK_ASSERT(k < max2);
632			MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
633		}
634	}
635	MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
636	MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
637
638	grp = ext4_get_group_info(sb, e4b->bd_group);
639	list_for_each(cur, &grp->bb_prealloc_list) {
640		ext4_group_t groupnr;
641		struct ext4_prealloc_space *pa;
642		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
643		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
644		MB_CHECK_ASSERT(groupnr == e4b->bd_group);
645		for (i = 0; i < pa->pa_len; i++)
646			MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
647	}
648	return 0;
649}
650#undef MB_CHECK_ASSERT
651#define mb_check_buddy(e4b) __mb_check_buddy(e4b,	\
652					__FILE__, __func__, __LINE__)
653#else
654#define mb_check_buddy(e4b)
655#endif
656
657/*
658 * Divide blocks started from @first with length @len into
659 * smaller chunks with power of 2 blocks.
660 * Clear the bits in bitmap which the blocks of the chunk(s) covered,
661 * then increase bb_counters[] for corresponded chunk size.
662 */
663static void ext4_mb_mark_free_simple(struct super_block *sb,
664				void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
665					struct ext4_group_info *grp)
666{
667	struct ext4_sb_info *sbi = EXT4_SB(sb);
668	ext4_grpblk_t min;
669	ext4_grpblk_t max;
670	ext4_grpblk_t chunk;
671	unsigned short border;
672
673	BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
674
675	border = 2 << sb->s_blocksize_bits;
676
677	while (len > 0) {
678		/* find how many blocks can be covered since this position */
679		max = ffs(first | border) - 1;
680
681		/* find how many blocks of power 2 we need to mark */
682		min = fls(len) - 1;
683
684		if (max < min)
685			min = max;
686		chunk = 1 << min;
687
688		/* mark multiblock chunks only */
689		grp->bb_counters[min]++;
690		if (min > 0)
691			mb_clear_bit(first >> min,
692				     buddy + sbi->s_mb_offsets[min]);
693
694		len -= chunk;
695		first += chunk;
696	}
697}
698
699/*
700 * Cache the order of the largest free extent we have available in this block
701 * group.
702 */
703static void
704mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
705{
706	int i;
707	int bits;
708
709	grp->bb_largest_free_order = -1; /* uninit */
710
711	bits = sb->s_blocksize_bits + 1;
712	for (i = bits; i >= 0; i--) {
713		if (grp->bb_counters[i] > 0) {
714			grp->bb_largest_free_order = i;
715			break;
716		}
717	}
718}
719
720static noinline_for_stack
721void ext4_mb_generate_buddy(struct super_block *sb,
722				void *buddy, void *bitmap, ext4_group_t group)
723{
724	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
725	struct ext4_sb_info *sbi = EXT4_SB(sb);
726	ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
727	ext4_grpblk_t i = 0;
728	ext4_grpblk_t first;
729	ext4_grpblk_t len;
730	unsigned free = 0;
731	unsigned fragments = 0;
732	unsigned long long period = get_cycles();
733
734	/* initialize buddy from bitmap which is aggregation
735	 * of on-disk bitmap and preallocations */
736	i = mb_find_next_zero_bit(bitmap, max, 0);
737	grp->bb_first_free = i;
738	while (i < max) {
739		fragments++;
740		first = i;
741		i = mb_find_next_bit(bitmap, max, i);
742		len = i - first;
743		free += len;
744		if (len > 1)
745			ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
746		else
747			grp->bb_counters[0]++;
748		if (i < max)
749			i = mb_find_next_zero_bit(bitmap, max, i);
750	}
751	grp->bb_fragments = fragments;
752
753	if (free != grp->bb_free) {
754		ext4_grp_locked_error(sb, group, 0, 0,
755				      "block bitmap and bg descriptor "
756				      "inconsistent: %u vs %u free clusters",
757				      free, grp->bb_free);
758		/*
759		 * If we intend to continue, we consider group descriptor
760		 * corrupt and update bb_free using bitmap value
761		 */
762		grp->bb_free = free;
763		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
764			percpu_counter_sub(&sbi->s_freeclusters_counter,
765					   grp->bb_free);
766		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
767	}
768	mb_set_largest_free_order(sb, grp);
769
770	clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
771
772	period = get_cycles() - period;
773	spin_lock(&EXT4_SB(sb)->s_bal_lock);
774	EXT4_SB(sb)->s_mb_buddies_generated++;
775	EXT4_SB(sb)->s_mb_generation_time += period;
776	spin_unlock(&EXT4_SB(sb)->s_bal_lock);
777}
778
779static void mb_regenerate_buddy(struct ext4_buddy *e4b)
780{
781	int count;
782	int order = 1;
783	void *buddy;
784
785	while ((buddy = mb_find_buddy(e4b, order++, &count))) {
786		ext4_set_bits(buddy, 0, count);
787	}
788	e4b->bd_info->bb_fragments = 0;
789	memset(e4b->bd_info->bb_counters, 0,
790		sizeof(*e4b->bd_info->bb_counters) *
791		(e4b->bd_sb->s_blocksize_bits + 2));
792
793	ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
794		e4b->bd_bitmap, e4b->bd_group);
795}
796
797/* The buddy information is attached the buddy cache inode
798 * for convenience. The information regarding each group
799 * is loaded via ext4_mb_load_buddy. The information involve
800 * block bitmap and buddy information. The information are
801 * stored in the inode as
802 *
803 * {                        page                        }
804 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
805 *
806 *
807 * one block each for bitmap and buddy information.
808 * So for each group we take up 2 blocks. A page can
809 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize)  blocks.
810 * So it can have information regarding groups_per_page which
811 * is blocks_per_page/2
812 *
813 * Locking note:  This routine takes the block group lock of all groups
814 * for this page; do not hold this lock when calling this routine!
815 */
816
817static int ext4_mb_init_cache(struct page *page, char *incore)
818{
819	ext4_group_t ngroups;
820	int blocksize;
821	int blocks_per_page;
822	int groups_per_page;
823	int err = 0;
824	int i;
825	ext4_group_t first_group, group;
826	int first_block;
827	struct super_block *sb;
828	struct buffer_head *bhs;
829	struct buffer_head **bh = NULL;
830	struct inode *inode;
831	char *data;
832	char *bitmap;
833	struct ext4_group_info *grinfo;
834
835	mb_debug(1, "init page %lu\n", page->index);
836
837	inode = page->mapping->host;
838	sb = inode->i_sb;
839	ngroups = ext4_get_groups_count(sb);
840	blocksize = 1 << inode->i_blkbits;
841	blocks_per_page = PAGE_CACHE_SIZE / blocksize;
842
843	groups_per_page = blocks_per_page >> 1;
844	if (groups_per_page == 0)
845		groups_per_page = 1;
846
847	/* allocate buffer_heads to read bitmaps */
848	if (groups_per_page > 1) {
849		i = sizeof(struct buffer_head *) * groups_per_page;
850		bh = kzalloc(i, GFP_NOFS);
851		if (bh == NULL) {
852			err = -ENOMEM;
853			goto out;
854		}
855	} else
856		bh = &bhs;
857
858	first_group = page->index * blocks_per_page / 2;
859
860	/* read all groups the page covers into the cache */
861	for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
862		if (group >= ngroups)
863			break;
864
865		grinfo = ext4_get_group_info(sb, group);
866		/*
867		 * If page is uptodate then we came here after online resize
868		 * which added some new uninitialized group info structs, so
869		 * we must skip all initialized uptodate buddies on the page,
870		 * which may be currently in use by an allocating task.
871		 */
872		if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
873			bh[i] = NULL;
874			continue;
875		}
876		if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) {
877			err = -ENOMEM;
878			goto out;
879		}
880		mb_debug(1, "read bitmap for group %u\n", group);
881	}
882
883	/* wait for I/O completion */
884	for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
885		if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) {
886			err = -EIO;
887			goto out;
888		}
889	}
890
891	first_block = page->index * blocks_per_page;
892	for (i = 0; i < blocks_per_page; i++) {
893		group = (first_block + i) >> 1;
894		if (group >= ngroups)
895			break;
896
897		if (!bh[group - first_group])
898			/* skip initialized uptodate buddy */
899			continue;
900
901		/*
902		 * data carry information regarding this
903		 * particular group in the format specified
904		 * above
905		 *
906		 */
907		data = page_address(page) + (i * blocksize);
908		bitmap = bh[group - first_group]->b_data;
909
910		/*
911		 * We place the buddy block and bitmap block
912		 * close together
913		 */
914		if ((first_block + i) & 1) {
915			/* this is block of buddy */
916			BUG_ON(incore == NULL);
917			mb_debug(1, "put buddy for group %u in page %lu/%x\n",
918				group, page->index, i * blocksize);
919			trace_ext4_mb_buddy_bitmap_load(sb, group);
920			grinfo = ext4_get_group_info(sb, group);
921			grinfo->bb_fragments = 0;
922			memset(grinfo->bb_counters, 0,
923			       sizeof(*grinfo->bb_counters) *
924				(sb->s_blocksize_bits+2));
925			/*
926			 * incore got set to the group block bitmap below
927			 */
928			ext4_lock_group(sb, group);
929			/* init the buddy */
930			memset(data, 0xff, blocksize);
931			ext4_mb_generate_buddy(sb, data, incore, group);
932			ext4_unlock_group(sb, group);
933			incore = NULL;
934		} else {
935			/* this is block of bitmap */
936			BUG_ON(incore != NULL);
937			mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
938				group, page->index, i * blocksize);
939			trace_ext4_mb_bitmap_load(sb, group);
940
941			/* see comments in ext4_mb_put_pa() */
942			ext4_lock_group(sb, group);
943			memcpy(data, bitmap, blocksize);
944
945			/* mark all preallocated blks used in in-core bitmap */
946			ext4_mb_generate_from_pa(sb, data, group);
947			ext4_mb_generate_from_freelist(sb, data, group);
948			ext4_unlock_group(sb, group);
949
950			/* set incore so that the buddy information can be
951			 * generated using this
952			 */
953			incore = data;
954		}
955	}
956	SetPageUptodate(page);
957
958out:
959	if (bh) {
960		for (i = 0; i < groups_per_page; i++)
961			brelse(bh[i]);
962		if (bh != &bhs)
963			kfree(bh);
964	}
965	return err;
966}
967
968/*
969 * Lock the buddy and bitmap pages. This make sure other parallel init_group
970 * on the same buddy page doesn't happen whild holding the buddy page lock.
971 * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
972 * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
973 */
974static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
975		ext4_group_t group, struct ext4_buddy *e4b)
976{
977	struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
978	int block, pnum, poff;
979	int blocks_per_page;
980	struct page *page;
981
982	e4b->bd_buddy_page = NULL;
983	e4b->bd_bitmap_page = NULL;
984
985	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
986	/*
987	 * the buddy cache inode stores the block bitmap
988	 * and buddy information in consecutive blocks.
989	 * So for each group we need two blocks.
990	 */
991	block = group * 2;
992	pnum = block / blocks_per_page;
993	poff = block % blocks_per_page;
994	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
995	if (!page)
996		return -ENOMEM;
997	BUG_ON(page->mapping != inode->i_mapping);
998	e4b->bd_bitmap_page = page;
999	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1000
1001	if (blocks_per_page >= 2) {
1002		/* buddy and bitmap are on the same page */
1003		return 0;
1004	}
1005
1006	block++;
1007	pnum = block / blocks_per_page;
1008	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1009	if (!page)
1010		return -ENOMEM;
1011	BUG_ON(page->mapping != inode->i_mapping);
1012	e4b->bd_buddy_page = page;
1013	return 0;
1014}
1015
1016static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
1017{
1018	if (e4b->bd_bitmap_page) {
1019		unlock_page(e4b->bd_bitmap_page);
1020		page_cache_release(e4b->bd_bitmap_page);
1021	}
1022	if (e4b->bd_buddy_page) {
1023		unlock_page(e4b->bd_buddy_page);
1024		page_cache_release(e4b->bd_buddy_page);
1025	}
1026}
1027
1028/*
1029 * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
1030 * block group lock of all groups for this page; do not hold the BG lock when
1031 * calling this routine!
1032 */
1033static noinline_for_stack
1034int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1035{
1036
1037	struct ext4_group_info *this_grp;
1038	struct ext4_buddy e4b;
1039	struct page *page;
1040	int ret = 0;
1041
1042	might_sleep();
1043	mb_debug(1, "init group %u\n", group);
1044	this_grp = ext4_get_group_info(sb, group);
1045	/*
1046	 * This ensures that we don't reinit the buddy cache
1047	 * page which map to the group from which we are already
1048	 * allocating. If we are looking at the buddy cache we would
1049	 * have taken a reference using ext4_mb_load_buddy and that
1050	 * would have pinned buddy page to page cache.
1051	 * The call to ext4_mb_get_buddy_page_lock will mark the
1052	 * page accessed.
1053	 */
1054	ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
1055	if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
1056		/*
1057		 * somebody initialized the group
1058		 * return without doing anything
1059		 */
1060		goto err;
1061	}
1062
1063	page = e4b.bd_bitmap_page;
1064	ret = ext4_mb_init_cache(page, NULL);
1065	if (ret)
1066		goto err;
1067	if (!PageUptodate(page)) {
1068		ret = -EIO;
1069		goto err;
1070	}
1071
1072	if (e4b.bd_buddy_page == NULL) {
1073		/*
1074		 * If both the bitmap and buddy are in
1075		 * the same page we don't need to force
1076		 * init the buddy
1077		 */
1078		ret = 0;
1079		goto err;
1080	}
1081	/* init buddy cache */
1082	page = e4b.bd_buddy_page;
1083	ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
1084	if (ret)
1085		goto err;
1086	if (!PageUptodate(page)) {
1087		ret = -EIO;
1088		goto err;
1089	}
1090err:
1091	ext4_mb_put_buddy_page_lock(&e4b);
1092	return ret;
1093}
1094
1095/*
1096 * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
1097 * block group lock of all groups for this page; do not hold the BG lock when
1098 * calling this routine!
1099 */
1100static noinline_for_stack int
1101ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1102					struct ext4_buddy *e4b)
1103{
1104	int blocks_per_page;
1105	int block;
1106	int pnum;
1107	int poff;
1108	struct page *page;
1109	int ret;
1110	struct ext4_group_info *grp;
1111	struct ext4_sb_info *sbi = EXT4_SB(sb);
1112	struct inode *inode = sbi->s_buddy_cache;
1113
1114	might_sleep();
1115	mb_debug(1, "load group %u\n", group);
1116
1117	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1118	grp = ext4_get_group_info(sb, group);
1119
1120	e4b->bd_blkbits = sb->s_blocksize_bits;
1121	e4b->bd_info = grp;
1122	e4b->bd_sb = sb;
1123	e4b->bd_group = group;
1124	e4b->bd_buddy_page = NULL;
1125	e4b->bd_bitmap_page = NULL;
1126
1127	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1128		/*
1129		 * we need full data about the group
1130		 * to make a good selection
1131		 */
1132		ret = ext4_mb_init_group(sb, group);
1133		if (ret)
1134			return ret;
1135	}
1136
1137	/*
1138	 * the buddy cache inode stores the block bitmap
1139	 * and buddy information in consecutive blocks.
1140	 * So for each group we need two blocks.
1141	 */
1142	block = group * 2;
1143	pnum = block / blocks_per_page;
1144	poff = block % blocks_per_page;
1145
1146	/* we could use find_or_create_page(), but it locks page
1147	 * what we'd like to avoid in fast path ... */
1148	page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
1149	if (page == NULL || !PageUptodate(page)) {
1150		if (page)
1151			/*
1152			 * drop the page reference and try
1153			 * to get the page with lock. If we
1154			 * are not uptodate that implies
1155			 * somebody just created the page but
1156			 * is yet to initialize the same. So
1157			 * wait for it to initialize.
1158			 */
1159			page_cache_release(page);
1160		page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1161		if (page) {
1162			BUG_ON(page->mapping != inode->i_mapping);
1163			if (!PageUptodate(page)) {
1164				ret = ext4_mb_init_cache(page, NULL);
1165				if (ret) {
1166					unlock_page(page);
1167					goto err;
1168				}
1169				mb_cmp_bitmaps(e4b, page_address(page) +
1170					       (poff * sb->s_blocksize));
1171			}
1172			unlock_page(page);
1173		}
1174	}
1175	if (page == NULL) {
1176		ret = -ENOMEM;
1177		goto err;
1178	}
1179	if (!PageUptodate(page)) {
1180		ret = -EIO;
1181		goto err;
1182	}
1183
1184	/* Pages marked accessed already */
1185	e4b->bd_bitmap_page = page;
1186	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1187
1188	block++;
1189	pnum = block / blocks_per_page;
1190	poff = block % blocks_per_page;
1191
1192	page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
1193	if (page == NULL || !PageUptodate(page)) {
1194		if (page)
1195			page_cache_release(page);
1196		page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1197		if (page) {
1198			BUG_ON(page->mapping != inode->i_mapping);
1199			if (!PageUptodate(page)) {
1200				ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
1201				if (ret) {
1202					unlock_page(page);
1203					goto err;
1204				}
1205			}
1206			unlock_page(page);
1207		}
1208	}
1209	if (page == NULL) {
1210		ret = -ENOMEM;
1211		goto err;
1212	}
1213	if (!PageUptodate(page)) {
1214		ret = -EIO;
1215		goto err;
1216	}
1217
1218	/* Pages marked accessed already */
1219	e4b->bd_buddy_page = page;
1220	e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
1221
1222	BUG_ON(e4b->bd_bitmap_page == NULL);
1223	BUG_ON(e4b->bd_buddy_page == NULL);
1224
1225	return 0;
1226
1227err:
1228	if (page)
1229		page_cache_release(page);
1230	if (e4b->bd_bitmap_page)
1231		page_cache_release(e4b->bd_bitmap_page);
1232	if (e4b->bd_buddy_page)
1233		page_cache_release(e4b->bd_buddy_page);
1234	e4b->bd_buddy = NULL;
1235	e4b->bd_bitmap = NULL;
1236	return ret;
1237}
1238
1239static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1240{
1241	if (e4b->bd_bitmap_page)
1242		page_cache_release(e4b->bd_bitmap_page);
1243	if (e4b->bd_buddy_page)
1244		page_cache_release(e4b->bd_buddy_page);
1245}
1246
1247
1248static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1249{
1250	int order = 1;
1251	int bb_incr = 1 << (e4b->bd_blkbits - 1);
1252	void *bb;
1253
1254	BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
1255	BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
1256
1257	bb = e4b->bd_buddy;
1258	while (order <= e4b->bd_blkbits + 1) {
1259		block = block >> 1;
1260		if (!mb_test_bit(block, bb)) {
1261			/* this block is part of buddy of order 'order' */
1262			return order;
1263		}
1264		bb += bb_incr;
1265		bb_incr >>= 1;
1266		order++;
1267	}
1268	return 0;
1269}
1270
1271static void mb_clear_bits(void *bm, int cur, int len)
1272{
1273	__u32 *addr;
1274
1275	len = cur + len;
1276	while (cur < len) {
1277		if ((cur & 31) == 0 && (len - cur) >= 32) {
1278			/* fast path: clear whole word at once */
1279			addr = bm + (cur >> 3);
1280			*addr = 0;
1281			cur += 32;
1282			continue;
1283		}
1284		mb_clear_bit(cur, bm);
1285		cur++;
1286	}
1287}
1288
1289/* clear bits in given range
1290 * will return first found zero bit if any, -1 otherwise
1291 */
1292static int mb_test_and_clear_bits(void *bm, int cur, int len)
1293{
1294	__u32 *addr;
1295	int zero_bit = -1;
1296
1297	len = cur + len;
1298	while (cur < len) {
1299		if ((cur & 31) == 0 && (len - cur) >= 32) {
1300			/* fast path: clear whole word at once */
1301			addr = bm + (cur >> 3);
1302			if (*addr != (__u32)(-1) && zero_bit == -1)
1303				zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
1304			*addr = 0;
1305			cur += 32;
1306			continue;
1307		}
1308		if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
1309			zero_bit = cur;
1310		cur++;
1311	}
1312
1313	return zero_bit;
1314}
1315
1316void ext4_set_bits(void *bm, int cur, int len)
1317{
1318	__u32 *addr;
1319
1320	len = cur + len;
1321	while (cur < len) {
1322		if ((cur & 31) == 0 && (len - cur) >= 32) {
1323			/* fast path: set whole word at once */
1324			addr = bm + (cur >> 3);
1325			*addr = 0xffffffff;
1326			cur += 32;
1327			continue;
1328		}
1329		mb_set_bit(cur, bm);
1330		cur++;
1331	}
1332}
1333
1334/*
1335 * _________________________________________________________________ */
1336
1337static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
1338{
1339	if (mb_test_bit(*bit + side, bitmap)) {
1340		mb_clear_bit(*bit, bitmap);
1341		(*bit) -= side;
1342		return 1;
1343	}
1344	else {
1345		(*bit) += side;
1346		mb_set_bit(*bit, bitmap);
1347		return -1;
1348	}
1349}
1350
1351static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
1352{
1353	int max;
1354	int order = 1;
1355	void *buddy = mb_find_buddy(e4b, order, &max);
1356
1357	while (buddy) {
1358		void *buddy2;
1359
1360		/* Bits in range [first; last] are known to be set since
1361		 * corresponding blocks were allocated. Bits in range
1362		 * (first; last) will stay set because they form buddies on
1363		 * upper layer. We just deal with borders if they don't
1364		 * align with upper layer and then go up.
1365		 * Releasing entire group is all about clearing
1366		 * single bit of highest order buddy.
1367		 */
1368
1369		/* Example:
1370		 * ---------------------------------
1371		 * |   1   |   1   |   1   |   1   |
1372		 * ---------------------------------
1373		 * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
1374		 * ---------------------------------
1375		 *   0   1   2   3   4   5   6   7
1376		 *      \_____________________/
1377		 *
1378		 * Neither [1] nor [6] is aligned to above layer.
1379		 * Left neighbour [0] is free, so mark it busy,
1380		 * decrease bb_counters and extend range to
1381		 * [0; 6]
1382		 * Right neighbour [7] is busy. It can't be coaleasced with [6], so
1383		 * mark [6] free, increase bb_counters and shrink range to
1384		 * [0; 5].
1385		 * Then shift range to [0; 2], go up and do the same.
1386		 */
1387
1388
1389		if (first & 1)
1390			e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
1391		if (!(last & 1))
1392			e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
1393		if (first > last)
1394			break;
1395		order++;
1396
1397		if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
1398			mb_clear_bits(buddy, first, last - first + 1);
1399			e4b->bd_info->bb_counters[order - 1] += last - first + 1;
1400			break;
1401		}
1402		first >>= 1;
1403		last >>= 1;
1404		buddy = buddy2;
1405	}
1406}
1407
1408static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1409			   int first, int count)
1410{
1411	int left_is_free = 0;
1412	int right_is_free = 0;
1413	int block;
1414	int last = first + count - 1;
1415	struct super_block *sb = e4b->bd_sb;
1416
1417	if (WARN_ON(count == 0))
1418		return;
1419	BUG_ON(last >= (sb->s_blocksize << 3));
1420	assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
1421	/* Don't bother if the block group is corrupt. */
1422	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
1423		return;
1424
1425	mb_check_buddy(e4b);
1426	mb_free_blocks_double(inode, e4b, first, count);
1427
1428	e4b->bd_info->bb_free += count;
1429	if (first < e4b->bd_info->bb_first_free)
1430		e4b->bd_info->bb_first_free = first;
1431
1432	/* access memory sequentially: check left neighbour,
1433	 * clear range and then check right neighbour
1434	 */
1435	if (first != 0)
1436		left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
1437	block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
1438	if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
1439		right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
1440
1441	if (unlikely(block != -1)) {
1442		struct ext4_sb_info *sbi = EXT4_SB(sb);
1443		ext4_fsblk_t blocknr;
1444
1445		blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1446		blocknr += EXT4_C2B(EXT4_SB(sb), block);
1447		ext4_grp_locked_error(sb, e4b->bd_group,
1448				      inode ? inode->i_ino : 0,
1449				      blocknr,
1450				      "freeing already freed block "
1451				      "(bit %u); block bitmap corrupt.",
1452				      block);
1453		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))
1454			percpu_counter_sub(&sbi->s_freeclusters_counter,
1455					   e4b->bd_info->bb_free);
1456		/* Mark the block group as corrupt. */
1457		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
1458			&e4b->bd_info->bb_state);
1459		mb_regenerate_buddy(e4b);
1460		goto done;
1461	}
1462
1463	/* let's maintain fragments counter */
1464	if (left_is_free && right_is_free)
1465		e4b->bd_info->bb_fragments--;
1466	else if (!left_is_free && !right_is_free)
1467		e4b->bd_info->bb_fragments++;
1468
1469	/* buddy[0] == bd_bitmap is a special case, so handle
1470	 * it right away and let mb_buddy_mark_free stay free of
1471	 * zero order checks.
1472	 * Check if neighbours are to be coaleasced,
1473	 * adjust bitmap bb_counters and borders appropriately.
1474	 */
1475	if (first & 1) {
1476		first += !left_is_free;
1477		e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
1478	}
1479	if (!(last & 1)) {
1480		last -= !right_is_free;
1481		e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
1482	}
1483
1484	if (first <= last)
1485		mb_buddy_mark_free(e4b, first >> 1, last >> 1);
1486
1487done:
1488	mb_set_largest_free_order(sb, e4b->bd_info);
1489	mb_check_buddy(e4b);
1490}
1491
1492static int mb_find_extent(struct ext4_buddy *e4b, int block,
1493				int needed, struct ext4_free_extent *ex)
1494{
1495	int next = block;
1496	int max, order;
1497	void *buddy;
1498
1499	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1500	BUG_ON(ex == NULL);
1501
1502	buddy = mb_find_buddy(e4b, 0, &max);
1503	BUG_ON(buddy == NULL);
1504	BUG_ON(block >= max);
1505	if (mb_test_bit(block, buddy)) {
1506		ex->fe_len = 0;
1507		ex->fe_start = 0;
1508		ex->fe_group = 0;
1509		return 0;
1510	}
1511
1512	/* find actual order */
1513	order = mb_find_order_for_block(e4b, block);
1514	block = block >> order;
1515
1516	ex->fe_len = 1 << order;
1517	ex->fe_start = block << order;
1518	ex->fe_group = e4b->bd_group;
1519
1520	/* calc difference from given start */
1521	next = next - ex->fe_start;
1522	ex->fe_len -= next;
1523	ex->fe_start += next;
1524
1525	while (needed > ex->fe_len &&
1526	       mb_find_buddy(e4b, order, &max)) {
1527
1528		if (block + 1 >= max)
1529			break;
1530
1531		next = (block + 1) * (1 << order);
1532		if (mb_test_bit(next, e4b->bd_bitmap))
1533			break;
1534
1535		order = mb_find_order_for_block(e4b, next);
1536
1537		block = next >> order;
1538		ex->fe_len += 1 << order;
1539	}
1540
1541	BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3)));
1542	return ex->fe_len;
1543}
1544
1545static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1546{
1547	int ord;
1548	int mlen = 0;
1549	int max = 0;
1550	int cur;
1551	int start = ex->fe_start;
1552	int len = ex->fe_len;
1553	unsigned ret = 0;
1554	int len0 = len;
1555	void *buddy;
1556
1557	BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
1558	BUG_ON(e4b->bd_group != ex->fe_group);
1559	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1560	mb_check_buddy(e4b);
1561	mb_mark_used_double(e4b, start, len);
1562
1563	e4b->bd_info->bb_free -= len;
1564	if (e4b->bd_info->bb_first_free == start)
1565		e4b->bd_info->bb_first_free += len;
1566
1567	/* let's maintain fragments counter */
1568	if (start != 0)
1569		mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
1570	if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
1571		max = !mb_test_bit(start + len, e4b->bd_bitmap);
1572	if (mlen && max)
1573		e4b->bd_info->bb_fragments++;
1574	else if (!mlen && !max)
1575		e4b->bd_info->bb_fragments--;
1576
1577	/* let's maintain buddy itself */
1578	while (len) {
1579		ord = mb_find_order_for_block(e4b, start);
1580
1581		if (((start >> ord) << ord) == start && len >= (1 << ord)) {
1582			/* the whole chunk may be allocated at once! */
1583			mlen = 1 << ord;
1584			buddy = mb_find_buddy(e4b, ord, &max);
1585			BUG_ON((start >> ord) >= max);
1586			mb_set_bit(start >> ord, buddy);
1587			e4b->bd_info->bb_counters[ord]--;
1588			start += mlen;
1589			len -= mlen;
1590			BUG_ON(len < 0);
1591			continue;
1592		}
1593
1594		/* store for history */
1595		if (ret == 0)
1596			ret = len | (ord << 16);
1597
1598		/* we have to split large buddy */
1599		BUG_ON(ord <= 0);
1600		buddy = mb_find_buddy(e4b, ord, &max);
1601		mb_set_bit(start >> ord, buddy);
1602		e4b->bd_info->bb_counters[ord]--;
1603
1604		ord--;
1605		cur = (start >> ord) & ~1U;
1606		buddy = mb_find_buddy(e4b, ord, &max);
1607		mb_clear_bit(cur, buddy);
1608		mb_clear_bit(cur + 1, buddy);
1609		e4b->bd_info->bb_counters[ord]++;
1610		e4b->bd_info->bb_counters[ord]++;
1611	}
1612	mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
1613
1614	ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
1615	mb_check_buddy(e4b);
1616
1617	return ret;
1618}
1619
1620/*
1621 * Must be called under group lock!
1622 */
1623static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1624					struct ext4_buddy *e4b)
1625{
1626	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1627	int ret;
1628
1629	BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
1630	BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1631
1632	ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
1633	ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
1634	ret = mb_mark_used(e4b, &ac->ac_b_ex);
1635
1636	/* preallocation can change ac_b_ex, thus we store actually
1637	 * allocated blocks for history */
1638	ac->ac_f_ex = ac->ac_b_ex;
1639
1640	ac->ac_status = AC_STATUS_FOUND;
1641	ac->ac_tail = ret & 0xffff;
1642	ac->ac_buddy = ret >> 16;
1643
1644	/*
1645	 * take the page reference. We want the page to be pinned
1646	 * so that we don't get a ext4_mb_init_cache_call for this
1647	 * group until we update the bitmap. That would mean we
1648	 * double allocate blocks. The reference is dropped
1649	 * in ext4_mb_release_context
1650	 */
1651	ac->ac_bitmap_page = e4b->bd_bitmap_page;
1652	get_page(ac->ac_bitmap_page);
1653	ac->ac_buddy_page = e4b->bd_buddy_page;
1654	get_page(ac->ac_buddy_page);
1655	/* store last allocated for subsequent stream allocation */
1656	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1657		spin_lock(&sbi->s_md_lock);
1658		sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
1659		sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
1660		spin_unlock(&sbi->s_md_lock);
1661	}
1662}
1663
1664/*
1665 * regular allocator, for general purposes allocation
1666 */
1667
1668static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
1669					struct ext4_buddy *e4b,
1670					int finish_group)
1671{
1672	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1673	struct ext4_free_extent *bex = &ac->ac_b_ex;
1674	struct ext4_free_extent *gex = &ac->ac_g_ex;
1675	struct ext4_free_extent ex;
1676	int max;
1677
1678	if (ac->ac_status == AC_STATUS_FOUND)
1679		return;
1680	/*
1681	 * We don't want to scan for a whole year
1682	 */
1683	if (ac->ac_found > sbi->s_mb_max_to_scan &&
1684			!(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1685		ac->ac_status = AC_STATUS_BREAK;
1686		return;
1687	}
1688
1689	/*
1690	 * Haven't found good chunk so far, let's continue
1691	 */
1692	if (bex->fe_len < gex->fe_len)
1693		return;
1694
1695	if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
1696			&& bex->fe_group == e4b->bd_group) {
1697		/* recheck chunk's availability - we don't know
1698		 * when it was found (within this lock-unlock
1699		 * period or not) */
1700		max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
1701		if (max >= gex->fe_len) {
1702			ext4_mb_use_best_found(ac, e4b);
1703			return;
1704		}
1705	}
1706}
1707
1708/*
1709 * The routine checks whether found extent is good enough. If it is,
1710 * then the extent gets marked used and flag is set to the context
1711 * to stop scanning. Otherwise, the extent is compared with the
1712 * previous found extent and if new one is better, then it's stored
1713 * in the context. Later, the best found extent will be used, if
1714 * mballoc can't find good enough extent.
1715 *
1716 * FIXME: real allocation policy is to be designed yet!
1717 */
1718static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
1719					struct ext4_free_extent *ex,
1720					struct ext4_buddy *e4b)
1721{
1722	struct ext4_free_extent *bex = &ac->ac_b_ex;
1723	struct ext4_free_extent *gex = &ac->ac_g_ex;
1724
1725	BUG_ON(ex->fe_len <= 0);
1726	BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
1727	BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
1728	BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
1729
1730	ac->ac_found++;
1731
1732	/*
1733	 * The special case - take what you catch first
1734	 */
1735	if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1736		*bex = *ex;
1737		ext4_mb_use_best_found(ac, e4b);
1738		return;
1739	}
1740
1741	/*
1742	 * Let's check whether the chuck is good enough
1743	 */
1744	if (ex->fe_len == gex->fe_len) {
1745		*bex = *ex;
1746		ext4_mb_use_best_found(ac, e4b);
1747		return;
1748	}
1749
1750	/*
1751	 * If this is first found extent, just store it in the context
1752	 */
1753	if (bex->fe_len == 0) {
1754		*bex = *ex;
1755		return;
1756	}
1757
1758	/*
1759	 * If new found extent is better, store it in the context
1760	 */
1761	if (bex->fe_len < gex->fe_len) {
1762		/* if the request isn't satisfied, any found extent
1763		 * larger than previous best one is better */
1764		if (ex->fe_len > bex->fe_len)
1765			*bex = *ex;
1766	} else if (ex->fe_len > gex->fe_len) {
1767		/* if the request is satisfied, then we try to find
1768		 * an extent that still satisfy the request, but is
1769		 * smaller than previous one */
1770		if (ex->fe_len < bex->fe_len)
1771			*bex = *ex;
1772	}
1773
1774	ext4_mb_check_limits(ac, e4b, 0);
1775}
1776
1777static noinline_for_stack
1778int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1779					struct ext4_buddy *e4b)
1780{
1781	struct ext4_free_extent ex = ac->ac_b_ex;
1782	ext4_group_t group = ex.fe_group;
1783	int max;
1784	int err;
1785
1786	BUG_ON(ex.fe_len <= 0);
1787	err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1788	if (err)
1789		return err;
1790
1791	ext4_lock_group(ac->ac_sb, group);
1792	max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
1793
1794	if (max > 0) {
1795		ac->ac_b_ex = ex;
1796		ext4_mb_use_best_found(ac, e4b);
1797	}
1798
1799	ext4_unlock_group(ac->ac_sb, group);
1800	ext4_mb_unload_buddy(e4b);
1801
1802	return 0;
1803}
1804
1805static noinline_for_stack
1806int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1807				struct ext4_buddy *e4b)
1808{
1809	ext4_group_t group = ac->ac_g_ex.fe_group;
1810	int max;
1811	int err;
1812	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1813	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1814	struct ext4_free_extent ex;
1815
1816	if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
1817		return 0;
1818	if (grp->bb_free == 0)
1819		return 0;
1820
1821	err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1822	if (err)
1823		return err;
1824
1825	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
1826		ext4_mb_unload_buddy(e4b);
1827		return 0;
1828	}
1829
1830	ext4_lock_group(ac->ac_sb, group);
1831	max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
1832			     ac->ac_g_ex.fe_len, &ex);
1833	ex.fe_logical = 0xDEADFA11; /* debug value */
1834
1835	if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
1836		ext4_fsblk_t start;
1837
1838		start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
1839			ex.fe_start;
1840		/* use do_div to get remainder (would be 64-bit modulo) */
1841		if (do_div(start, sbi->s_stripe) == 0) {
1842			ac->ac_found++;
1843			ac->ac_b_ex = ex;
1844			ext4_mb_use_best_found(ac, e4b);
1845		}
1846	} else if (max >= ac->ac_g_ex.fe_len) {
1847		BUG_ON(ex.fe_len <= 0);
1848		BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1849		BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1850		ac->ac_found++;
1851		ac->ac_b_ex = ex;
1852		ext4_mb_use_best_found(ac, e4b);
1853	} else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
1854		/* Sometimes, caller may want to merge even small
1855		 * number of blocks to an existing extent */
1856		BUG_ON(ex.fe_len <= 0);
1857		BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1858		BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1859		ac->ac_found++;
1860		ac->ac_b_ex = ex;
1861		ext4_mb_use_best_found(ac, e4b);
1862	}
1863	ext4_unlock_group(ac->ac_sb, group);
1864	ext4_mb_unload_buddy(e4b);
1865
1866	return 0;
1867}
1868
1869/*
1870 * The routine scans buddy structures (not bitmap!) from given order
1871 * to max order and tries to find big enough chunk to satisfy the req
1872 */
1873static noinline_for_stack
1874void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
1875					struct ext4_buddy *e4b)
1876{
1877	struct super_block *sb = ac->ac_sb;
1878	struct ext4_group_info *grp = e4b->bd_info;
1879	void *buddy;
1880	int i;
1881	int k;
1882	int max;
1883
1884	BUG_ON(ac->ac_2order <= 0);
1885	for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
1886		if (grp->bb_counters[i] == 0)
1887			continue;
1888
1889		buddy = mb_find_buddy(e4b, i, &max);
1890		BUG_ON(buddy == NULL);
1891
1892		k = mb_find_next_zero_bit(buddy, max, 0);
1893		BUG_ON(k >= max);
1894
1895		ac->ac_found++;
1896
1897		ac->ac_b_ex.fe_len = 1 << i;
1898		ac->ac_b_ex.fe_start = k << i;
1899		ac->ac_b_ex.fe_group = e4b->bd_group;
1900
1901		ext4_mb_use_best_found(ac, e4b);
1902
1903		BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
1904
1905		if (EXT4_SB(sb)->s_mb_stats)
1906			atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
1907
1908		break;
1909	}
1910}
1911
1912/*
1913 * The routine scans the group and measures all found extents.
1914 * In order to optimize scanning, caller must pass number of
1915 * free blocks in the group, so the routine can know upper limit.
1916 */
1917static noinline_for_stack
1918void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1919					struct ext4_buddy *e4b)
1920{
1921	struct super_block *sb = ac->ac_sb;
1922	void *bitmap = e4b->bd_bitmap;
1923	struct ext4_free_extent ex;
1924	int i;
1925	int free;
1926
1927	free = e4b->bd_info->bb_free;
1928	BUG_ON(free <= 0);
1929
1930	i = e4b->bd_info->bb_first_free;
1931
1932	while (free && ac->ac_status == AC_STATUS_CONTINUE) {
1933		i = mb_find_next_zero_bit(bitmap,
1934						EXT4_CLUSTERS_PER_GROUP(sb), i);
1935		if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
1936			/*
1937			 * IF we have corrupt bitmap, we won't find any
1938			 * free blocks even though group info says we
1939			 * we have free blocks
1940			 */
1941			ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1942					"%d free clusters as per "
1943					"group info. But bitmap says 0",
1944					free);
1945			break;
1946		}
1947
1948		mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
1949		BUG_ON(ex.fe_len <= 0);
1950		if (free < ex.fe_len) {
1951			ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1952					"%d free clusters as per "
1953					"group info. But got %d blocks",
1954					free, ex.fe_len);
1955			/*
1956			 * The number of free blocks differs. This mostly
1957			 * indicate that the bitmap is corrupt. So exit
1958			 * without claiming the space.
1959			 */
1960			break;
1961		}
1962		ex.fe_logical = 0xDEADC0DE; /* debug value */
1963		ext4_mb_measure_extent(ac, &ex, e4b);
1964
1965		i += ex.fe_len;
1966		free -= ex.fe_len;
1967	}
1968
1969	ext4_mb_check_limits(ac, e4b, 1);
1970}
1971
1972/*
1973 * This is a special case for storages like raid5
1974 * we try to find stripe-aligned chunks for stripe-size-multiple requests
1975 */
1976static noinline_for_stack
1977void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1978				 struct ext4_buddy *e4b)
1979{
1980	struct super_block *sb = ac->ac_sb;
1981	struct ext4_sb_info *sbi = EXT4_SB(sb);
1982	void *bitmap = e4b->bd_bitmap;
1983	struct ext4_free_extent ex;
1984	ext4_fsblk_t first_group_block;
1985	ext4_fsblk_t a;
1986	ext4_grpblk_t i;
1987	int max;
1988
1989	BUG_ON(sbi->s_stripe == 0);
1990
1991	/* find first stripe-aligned block in group */
1992	first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
1993
1994	a = first_group_block + sbi->s_stripe - 1;
1995	do_div(a, sbi->s_stripe);
1996	i = (a * sbi->s_stripe) - first_group_block;
1997
1998	while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
1999		if (!mb_test_bit(i, bitmap)) {
2000			max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
2001			if (max >= sbi->s_stripe) {
2002				ac->ac_found++;
2003				ex.fe_logical = 0xDEADF00D; /* debug value */
2004				ac->ac_b_ex = ex;
2005				ext4_mb_use_best_found(ac, e4b);
2006				break;
2007			}
2008		}
2009		i += sbi->s_stripe;
2010	}
2011}
2012
2013/* This is now called BEFORE we load the buddy bitmap. */
2014static int ext4_mb_good_group(struct ext4_allocation_context *ac,
2015				ext4_group_t group, int cr)
2016{
2017	unsigned free, fragments;
2018	int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
2019	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
2020
2021	BUG_ON(cr < 0 || cr >= 4);
2022
2023	free = grp->bb_free;
2024	if (free == 0)
2025		return 0;
2026	if (cr <= 2 && free < ac->ac_g_ex.fe_len)
2027		return 0;
2028
2029	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
2030		return 0;
2031
2032	/* We only do this if the grp has never been initialized */
2033	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
2034		int ret = ext4_mb_init_group(ac->ac_sb, group);
2035		if (ret)
2036			return 0;
2037	}
2038
2039	fragments = grp->bb_fragments;
2040	if (fragments == 0)
2041		return 0;
2042
2043	switch (cr) {
2044	case 0:
2045		BUG_ON(ac->ac_2order == 0);
2046
2047		/* Avoid using the first bg of a flexgroup for data files */
2048		if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
2049		    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
2050		    ((group % flex_size) == 0))
2051			return 0;
2052
2053		if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
2054		    (free / fragments) >= ac->ac_g_ex.fe_len)
2055			return 1;
2056
2057		if (grp->bb_largest_free_order < ac->ac_2order)
2058			return 0;
2059
2060		return 1;
2061	case 1:
2062		if ((free / fragments) >= ac->ac_g_ex.fe_len)
2063			return 1;
2064		break;
2065	case 2:
2066		if (free >= ac->ac_g_ex.fe_len)
2067			return 1;
2068		break;
2069	case 3:
2070		return 1;
2071	default:
2072		BUG();
2073	}
2074
2075	return 0;
2076}
2077
2078static noinline_for_stack int
2079ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
2080{
2081	ext4_group_t ngroups, group, i;
2082	int cr;
2083	int err = 0;
2084	struct ext4_sb_info *sbi;
2085	struct super_block *sb;
2086	struct ext4_buddy e4b;
2087
2088	sb = ac->ac_sb;
2089	sbi = EXT4_SB(sb);
2090	ngroups = ext4_get_groups_count(sb);
2091	/* non-extent files are limited to low blocks/groups */
2092	if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
2093		ngroups = sbi->s_blockfile_groups;
2094
2095	BUG_ON(ac->ac_status == AC_STATUS_FOUND);
2096
2097	/* first, try the goal */
2098	err = ext4_mb_find_by_goal(ac, &e4b);
2099	if (err || ac->ac_status == AC_STATUS_FOUND)
2100		goto out;
2101
2102	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
2103		goto out;
2104
2105	/*
2106	 * ac->ac2_order is set only if the fe_len is a power of 2
2107	 * if ac2_order is set we also set criteria to 0 so that we
2108	 * try exact allocation using buddy.
2109	 */
2110	i = fls(ac->ac_g_ex.fe_len);
2111	ac->ac_2order = 0;
2112	/*
2113	 * We search using buddy data only if the order of the request
2114	 * is greater than equal to the sbi_s_mb_order2_reqs
2115	 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
2116	 */
2117	if (i >= sbi->s_mb_order2_reqs) {
2118		/*
2119		 * This should tell if fe_len is exactly power of 2
2120		 */
2121		if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
2122			ac->ac_2order = i - 1;
2123	}
2124
2125	/* if stream allocation is enabled, use global goal */
2126	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
2127		/* TBD: may be hot point */
2128		spin_lock(&sbi->s_md_lock);
2129		ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
2130		ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
2131		spin_unlock(&sbi->s_md_lock);
2132	}
2133
2134	/* Let's just scan groups to find more-less suitable blocks */
2135	cr = ac->ac_2order ? 0 : 1;
2136	/*
2137	 * cr == 0 try to get exact allocation,
2138	 * cr == 3  try to get anything
2139	 */
2140repeat:
2141	for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
2142		ac->ac_criteria = cr;
2143		/*
2144		 * searching for the right group start
2145		 * from the goal value specified
2146		 */
2147		group = ac->ac_g_ex.fe_group;
2148
2149		for (i = 0; i < ngroups; group++, i++) {
2150			cond_resched();
2151			/*
2152			 * Artificially restricted ngroups for non-extent
2153			 * files makes group > ngroups possible on first loop.
2154			 */
2155			if (group >= ngroups)
2156				group = 0;
2157
2158			/* This now checks without needing the buddy page */
2159			if (!ext4_mb_good_group(ac, group, cr))
2160				continue;
2161
2162			err = ext4_mb_load_buddy(sb, group, &e4b);
2163			if (err)
2164				goto out;
2165
2166			ext4_lock_group(sb, group);
2167
2168			/*
2169			 * We need to check again after locking the
2170			 * block group
2171			 */
2172			if (!ext4_mb_good_group(ac, group, cr)) {
2173				ext4_unlock_group(sb, group);
2174				ext4_mb_unload_buddy(&e4b);
2175				continue;
2176			}
2177
2178			ac->ac_groups_scanned++;
2179			if (cr == 0 && ac->ac_2order < sb->s_blocksize_bits+2)
2180				ext4_mb_simple_scan_group(ac, &e4b);
2181			else if (cr == 1 && sbi->s_stripe &&
2182					!(ac->ac_g_ex.fe_len % sbi->s_stripe))
2183				ext4_mb_scan_aligned(ac, &e4b);
2184			else
2185				ext4_mb_complex_scan_group(ac, &e4b);
2186
2187			ext4_unlock_group(sb, group);
2188			ext4_mb_unload_buddy(&e4b);
2189
2190			if (ac->ac_status != AC_STATUS_CONTINUE)
2191				break;
2192		}
2193	}
2194
2195	if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
2196	    !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2197		/*
2198		 * We've been searching too long. Let's try to allocate
2199		 * the best chunk we've found so far
2200		 */
2201
2202		ext4_mb_try_best_found(ac, &e4b);
2203		if (ac->ac_status != AC_STATUS_FOUND) {
2204			/*
2205			 * Someone more lucky has already allocated it.
2206			 * The only thing we can do is just take first
2207			 * found block(s)
2208			printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
2209			 */
2210			ac->ac_b_ex.fe_group = 0;
2211			ac->ac_b_ex.fe_start = 0;
2212			ac->ac_b_ex.fe_len = 0;
2213			ac->ac_status = AC_STATUS_CONTINUE;
2214			ac->ac_flags |= EXT4_MB_HINT_FIRST;
2215			cr = 3;
2216			atomic_inc(&sbi->s_mb_lost_chunks);
2217			goto repeat;
2218		}
2219	}
2220out:
2221	return err;
2222}
2223
2224static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2225{
2226	struct super_block *sb = seq->private;
2227	ext4_group_t group;
2228
2229	if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2230		return NULL;
2231	group = *pos + 1;
2232	return (void *) ((unsigned long) group);
2233}
2234
2235static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2236{
2237	struct super_block *sb = seq->private;
2238	ext4_group_t group;
2239
2240	++*pos;
2241	if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2242		return NULL;
2243	group = *pos + 1;
2244	return (void *) ((unsigned long) group);
2245}
2246
2247static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2248{
2249	struct super_block *sb = seq->private;
2250	ext4_group_t group = (ext4_group_t) ((unsigned long) v);
2251	int i;
2252	int err, buddy_loaded = 0;
2253	struct ext4_buddy e4b;
2254	struct ext4_group_info *grinfo;
2255	struct sg {
2256		struct ext4_group_info info;
2257		ext4_grpblk_t counters[16];
2258	} sg;
2259
2260	group--;
2261	if (group == 0)
2262		seq_printf(seq, "#%-5s: %-5s %-5s %-5s "
2263				"[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s "
2264				  "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n",
2265			   "group", "free", "frags", "first",
2266			   "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6",
2267			   "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13");
2268
2269	i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
2270		sizeof(struct ext4_group_info);
2271	grinfo = ext4_get_group_info(sb, group);
2272	/* Load the group info in memory only if not already loaded. */
2273	if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
2274		err = ext4_mb_load_buddy(sb, group, &e4b);
2275		if (err) {
2276			seq_printf(seq, "#%-5u: I/O error\n", group);
2277			return 0;
2278		}
2279		buddy_loaded = 1;
2280	}
2281
2282	memcpy(&sg, ext4_get_group_info(sb, group), i);
2283
2284	if (buddy_loaded)
2285		ext4_mb_unload_buddy(&e4b);
2286
2287	seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
2288			sg.info.bb_fragments, sg.info.bb_first_free);
2289	for (i = 0; i <= 13; i++)
2290		seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
2291				sg.info.bb_counters[i] : 0);
2292	seq_printf(seq, " ]\n");
2293
2294	return 0;
2295}
2296
2297static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
2298{
2299}
2300
2301static const struct seq_operations ext4_mb_seq_groups_ops = {
2302	.start  = ext4_mb_seq_groups_start,
2303	.next   = ext4_mb_seq_groups_next,
2304	.stop   = ext4_mb_seq_groups_stop,
2305	.show   = ext4_mb_seq_groups_show,
2306};
2307
2308static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
2309{
2310	struct super_block *sb = PDE_DATA(inode);
2311	int rc;
2312
2313	rc = seq_open(file, &ext4_mb_seq_groups_ops);
2314	if (rc == 0) {
2315		struct seq_file *m = file->private_data;
2316		m->private = sb;
2317	}
2318	return rc;
2319
2320}
2321
2322static const struct file_operations ext4_mb_seq_groups_fops = {
2323	.owner		= THIS_MODULE,
2324	.open		= ext4_mb_seq_groups_open,
2325	.read		= seq_read,
2326	.llseek		= seq_lseek,
2327	.release	= seq_release,
2328};
2329
2330static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
2331{
2332	int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2333	struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
2334
2335	BUG_ON(!cachep);
2336	return cachep;
2337}
2338
2339/*
2340 * Allocate the top-level s_group_info array for the specified number
2341 * of groups
2342 */
2343int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
2344{
2345	struct ext4_sb_info *sbi = EXT4_SB(sb);
2346	unsigned size;
2347	struct ext4_group_info ***new_groupinfo;
2348
2349	size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
2350		EXT4_DESC_PER_BLOCK_BITS(sb);
2351	if (size <= sbi->s_group_info_size)
2352		return 0;
2353
2354	size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
2355	new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL);
2356	if (!new_groupinfo) {
2357		ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
2358		return -ENOMEM;
2359	}
2360	if (sbi->s_group_info) {
2361		memcpy(new_groupinfo, sbi->s_group_info,
2362		       sbi->s_group_info_size * sizeof(*sbi->s_group_info));
2363		kvfree(sbi->s_group_info);
2364	}
2365	sbi->s_group_info = new_groupinfo;
2366	sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
2367	ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
2368		   sbi->s_group_info_size);
2369	return 0;
2370}
2371
2372/* Create and initialize ext4_group_info data for the given group. */
2373int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2374			  struct ext4_group_desc *desc)
2375{
2376	int i;
2377	int metalen = 0;
2378	struct ext4_sb_info *sbi = EXT4_SB(sb);
2379	struct ext4_group_info **meta_group_info;
2380	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2381
2382	/*
2383	 * First check if this group is the first of a reserved block.
2384	 * If it's true, we have to allocate a new table of pointers
2385	 * to ext4_group_info structures
2386	 */
2387	if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2388		metalen = sizeof(*meta_group_info) <<
2389			EXT4_DESC_PER_BLOCK_BITS(sb);
2390		meta_group_info = kmalloc(metalen, GFP_NOFS);
2391		if (meta_group_info == NULL) {
2392			ext4_msg(sb, KERN_ERR, "can't allocate mem "
2393				 "for a buddy group");
2394			goto exit_meta_group_info;
2395		}
2396		sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
2397			meta_group_info;
2398	}
2399
2400	meta_group_info =
2401		sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2402	i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2403
2404	meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
2405	if (meta_group_info[i] == NULL) {
2406		ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
2407		goto exit_group_info;
2408	}
2409	set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
2410		&(meta_group_info[i]->bb_state));
2411
2412	/*
2413	 * initialize bb_free to be able to skip
2414	 * empty groups without initialization
2415	 */
2416	if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2417		meta_group_info[i]->bb_free =
2418			ext4_free_clusters_after_init(sb, group, desc);
2419	} else {
2420		meta_group_info[i]->bb_free =
2421			ext4_free_group_clusters(sb, desc);
2422	}
2423
2424	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2425	init_rwsem(&meta_group_info[i]->alloc_sem);
2426	meta_group_info[i]->bb_free_root = RB_ROOT;
2427	meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
2428
2429#ifdef DOUBLE_CHECK
2430	{
2431		struct buffer_head *bh;
2432		meta_group_info[i]->bb_bitmap =
2433			kmalloc(sb->s_blocksize, GFP_NOFS);
2434		BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
2435		bh = ext4_read_block_bitmap(sb, group);
2436		BUG_ON(bh == NULL);
2437		memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
2438			sb->s_blocksize);
2439		put_bh(bh);
2440	}
2441#endif
2442
2443	return 0;
2444
2445exit_group_info:
2446	/* If a meta_group_info table has been allocated, release it now */
2447	if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2448		kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
2449		sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL;
2450	}
2451exit_meta_group_info:
2452	return -ENOMEM;
2453} /* ext4_mb_add_groupinfo */
2454
2455static int ext4_mb_init_backend(struct super_block *sb)
2456{
2457	ext4_group_t ngroups = ext4_get_groups_count(sb);
2458	ext4_group_t i;
2459	struct ext4_sb_info *sbi = EXT4_SB(sb);
2460	int err;
2461	struct ext4_group_desc *desc;
2462	struct kmem_cache *cachep;
2463
2464	err = ext4_mb_alloc_groupinfo(sb, ngroups);
2465	if (err)
2466		return err;
2467
2468	sbi->s_buddy_cache = new_inode(sb);
2469	if (sbi->s_buddy_cache == NULL) {
2470		ext4_msg(sb, KERN_ERR, "can't get new inode");
2471		goto err_freesgi;
2472	}
2473	/* To avoid potentially colliding with an valid on-disk inode number,
2474	 * use EXT4_BAD_INO for the buddy cache inode number.  This inode is
2475	 * not in the inode hash, so it should never be found by iget(), but
2476	 * this will avoid confusion if it ever shows up during debugging. */
2477	sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
2478	EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2479	for (i = 0; i < ngroups; i++) {
2480		desc = ext4_get_group_desc(sb, i, NULL);
2481		if (desc == NULL) {
2482			ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
2483			goto err_freebuddy;
2484		}
2485		if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
2486			goto err_freebuddy;
2487	}
2488
2489	return 0;
2490
2491err_freebuddy:
2492	cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2493	while (i-- > 0)
2494		kmem_cache_free(cachep, ext4_get_group_info(sb, i));
2495	i = sbi->s_group_info_size;
2496	while (i-- > 0)
2497		kfree(sbi->s_group_info[i]);
2498	iput(sbi->s_buddy_cache);
2499err_freesgi:
2500	kvfree(sbi->s_group_info);
2501	return -ENOMEM;
2502}
2503
2504static void ext4_groupinfo_destroy_slabs(void)
2505{
2506	int i;
2507
2508	for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2509		if (ext4_groupinfo_caches[i])
2510			kmem_cache_destroy(ext4_groupinfo_caches[i]);
2511		ext4_groupinfo_caches[i] = NULL;
2512	}
2513}
2514
2515static int ext4_groupinfo_create_slab(size_t size)
2516{
2517	static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
2518	int slab_size;
2519	int blocksize_bits = order_base_2(size);
2520	int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2521	struct kmem_cache *cachep;
2522
2523	if (cache_index >= NR_GRPINFO_CACHES)
2524		return -EINVAL;
2525
2526	if (unlikely(cache_index < 0))
2527		cache_index = 0;
2528
2529	mutex_lock(&ext4_grpinfo_slab_create_mutex);
2530	if (ext4_groupinfo_caches[cache_index]) {
2531		mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2532		return 0;	/* Already created */
2533	}
2534
2535	slab_size = offsetof(struct ext4_group_info,
2536				bb_counters[blocksize_bits + 2]);
2537
2538	cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
2539					slab_size, 0, SLAB_RECLAIM_ACCOUNT,
2540					NULL);
2541
2542	ext4_groupinfo_caches[cache_index] = cachep;
2543
2544	mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2545	if (!cachep) {
2546		printk(KERN_EMERG
2547		       "EXT4-fs: no memory for groupinfo slab cache\n");
2548		return -ENOMEM;
2549	}
2550
2551	return 0;
2552}
2553
2554int ext4_mb_init(struct super_block *sb)
2555{
2556	struct ext4_sb_info *sbi = EXT4_SB(sb);
2557	unsigned i, j;
2558	unsigned offset, offset_incr;
2559	unsigned max;
2560	int ret;
2561
2562	i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
2563
2564	sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2565	if (sbi->s_mb_offsets == NULL) {
2566		ret = -ENOMEM;
2567		goto out;
2568	}
2569
2570	i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
2571	sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2572	if (sbi->s_mb_maxs == NULL) {
2573		ret = -ENOMEM;
2574		goto out;
2575	}
2576
2577	ret = ext4_groupinfo_create_slab(sb->s_blocksize);
2578	if (ret < 0)
2579		goto out;
2580
2581	/* order 0 is regular bitmap */
2582	sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
2583	sbi->s_mb_offsets[0] = 0;
2584
2585	i = 1;
2586	offset = 0;
2587	offset_incr = 1 << (sb->s_blocksize_bits - 1);
2588	max = sb->s_blocksize << 2;
2589	do {
2590		sbi->s_mb_offsets[i] = offset;
2591		sbi->s_mb_maxs[i] = max;
2592		offset += offset_incr;
2593		offset_incr = offset_incr >> 1;
2594		max = max >> 1;
2595		i++;
2596	} while (i <= sb->s_blocksize_bits + 1);
2597
2598	spin_lock_init(&sbi->s_md_lock);
2599	spin_lock_init(&sbi->s_bal_lock);
2600
2601	sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
2602	sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
2603	sbi->s_mb_stats = MB_DEFAULT_STATS;
2604	sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2605	sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2606	/*
2607	 * The default group preallocation is 512, which for 4k block
2608	 * sizes translates to 2 megabytes.  However for bigalloc file
2609	 * systems, this is probably too big (i.e, if the cluster size
2610	 * is 1 megabyte, then group preallocation size becomes half a
2611	 * gigabyte!).  As a default, we will keep a two megabyte
2612	 * group pralloc size for cluster sizes up to 64k, and after
2613	 * that, we will force a minimum group preallocation size of
2614	 * 32 clusters.  This translates to 8 megs when the cluster
2615	 * size is 256k, and 32 megs when the cluster size is 1 meg,
2616	 * which seems reasonable as a default.
2617	 */
2618	sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
2619				       sbi->s_cluster_bits, 32);
2620	/*
2621	 * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
2622	 * to the lowest multiple of s_stripe which is bigger than
2623	 * the s_mb_group_prealloc as determined above. We want
2624	 * the preallocation size to be an exact multiple of the
2625	 * RAID stripe size so that preallocations don't fragment
2626	 * the stripes.
2627	 */
2628	if (sbi->s_stripe > 1) {
2629		sbi->s_mb_group_prealloc = roundup(
2630			sbi->s_mb_group_prealloc, sbi->s_stripe);
2631	}
2632
2633	sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
2634	if (sbi->s_locality_groups == NULL) {
2635		ret = -ENOMEM;
2636		goto out;
2637	}
2638	for_each_possible_cpu(i) {
2639		struct ext4_locality_group *lg;
2640		lg = per_cpu_ptr(sbi->s_locality_groups, i);
2641		mutex_init(&lg->lg_mutex);
2642		for (j = 0; j < PREALLOC_TB_SIZE; j++)
2643			INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
2644		spin_lock_init(&lg->lg_prealloc_lock);
2645	}
2646
2647	/* init file for buddy data */
2648	ret = ext4_mb_init_backend(sb);
2649	if (ret != 0)
2650		goto out_free_locality_groups;
2651
2652	if (sbi->s_proc)
2653		proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2654				 &ext4_mb_seq_groups_fops, sb);
2655
2656	return 0;
2657
2658out_free_locality_groups:
2659	free_percpu(sbi->s_locality_groups);
2660	sbi->s_locality_groups = NULL;
2661out:
2662	kfree(sbi->s_mb_offsets);
2663	sbi->s_mb_offsets = NULL;
2664	kfree(sbi->s_mb_maxs);
2665	sbi->s_mb_maxs = NULL;
2666	return ret;
2667}
2668
2669/* need to called with the ext4 group lock held */
2670static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2671{
2672	struct ext4_prealloc_space *pa;
2673	struct list_head *cur, *tmp;
2674	int count = 0;
2675
2676	list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
2677		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
2678		list_del(&pa->pa_group_list);
2679		count++;
2680		kmem_cache_free(ext4_pspace_cachep, pa);
2681	}
2682	if (count)
2683		mb_debug(1, "mballoc: %u PAs left\n", count);
2684
2685}
2686
2687int ext4_mb_release(struct super_block *sb)
2688{
2689	ext4_group_t ngroups = ext4_get_groups_count(sb);
2690	ext4_group_t i;
2691	int num_meta_group_infos;
2692	struct ext4_group_info *grinfo;
2693	struct ext4_sb_info *sbi = EXT4_SB(sb);
2694	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2695
2696	if (sbi->s_proc)
2697		remove_proc_entry("mb_groups", sbi->s_proc);
2698
2699	if (sbi->s_group_info) {
2700		for (i = 0; i < ngroups; i++) {
2701			grinfo = ext4_get_group_info(sb, i);
2702#ifdef DOUBLE_CHECK
2703			kfree(grinfo->bb_bitmap);
2704#endif
2705			ext4_lock_group(sb, i);
2706			ext4_mb_cleanup_pa(grinfo);
2707			ext4_unlock_group(sb, i);
2708			kmem_cache_free(cachep, grinfo);
2709		}
2710		num_meta_group_infos = (ngroups +
2711				EXT4_DESC_PER_BLOCK(sb) - 1) >>
2712			EXT4_DESC_PER_BLOCK_BITS(sb);
2713		for (i = 0; i < num_meta_group_infos; i++)
2714			kfree(sbi->s_group_info[i]);
2715		kvfree(sbi->s_group_info);
2716	}
2717	kfree(sbi->s_mb_offsets);
2718	kfree(sbi->s_mb_maxs);
2719	iput(sbi->s_buddy_cache);
2720	if (sbi->s_mb_stats) {
2721		ext4_msg(sb, KERN_INFO,
2722		       "mballoc: %u blocks %u reqs (%u success)",
2723				atomic_read(&sbi->s_bal_allocated),
2724				atomic_read(&sbi->s_bal_reqs),
2725				atomic_read(&sbi->s_bal_success));
2726		ext4_msg(sb, KERN_INFO,
2727		      "mballoc: %u extents scanned, %u goal hits, "
2728				"%u 2^N hits, %u breaks, %u lost",
2729				atomic_read(&sbi->s_bal_ex_scanned),
2730				atomic_read(&sbi->s_bal_goals),
2731				atomic_read(&sbi->s_bal_2orders),
2732				atomic_read(&sbi->s_bal_breaks),
2733				atomic_read(&sbi->s_mb_lost_chunks));
2734		ext4_msg(sb, KERN_INFO,
2735		       "mballoc: %lu generated and it took %Lu",
2736				sbi->s_mb_buddies_generated,
2737				sbi->s_mb_generation_time);
2738		ext4_msg(sb, KERN_INFO,
2739		       "mballoc: %u preallocated, %u discarded",
2740				atomic_read(&sbi->s_mb_preallocated),
2741				atomic_read(&sbi->s_mb_discarded));
2742	}
2743
2744	free_percpu(sbi->s_locality_groups);
2745
2746	return 0;
2747}
2748
2749static inline int ext4_issue_discard(struct super_block *sb,
2750		ext4_group_t block_group, ext4_grpblk_t cluster, int count)
2751{
2752	ext4_fsblk_t discard_block;
2753
2754	discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
2755			 ext4_group_first_block_no(sb, block_group));
2756	count = EXT4_C2B(EXT4_SB(sb), count);
2757	trace_ext4_discard_blocks(sb,
2758			(unsigned long long) discard_block, count);
2759	return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
2760}
2761
2762/*
2763 * This function is called by the jbd2 layer once the commit has finished,
2764 * so we know we can free the blocks that were released with that commit.
2765 */
2766static void ext4_free_data_callback(struct super_block *sb,
2767				    struct ext4_journal_cb_entry *jce,
2768				    int rc)
2769{
2770	struct ext4_free_data *entry = (struct ext4_free_data *)jce;
2771	struct ext4_buddy e4b;
2772	struct ext4_group_info *db;
2773	int err, count = 0, count2 = 0;
2774
2775	mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2776		 entry->efd_count, entry->efd_group, entry);
2777
2778	if (test_opt(sb, DISCARD)) {
2779		err = ext4_issue_discard(sb, entry->efd_group,
2780					 entry->efd_start_cluster,
2781					 entry->efd_count);
2782		if (err && err != -EOPNOTSUPP)
2783			ext4_msg(sb, KERN_WARNING, "discard request in"
2784				 " group:%d block:%d count:%d failed"
2785				 " with %d", entry->efd_group,
2786				 entry->efd_start_cluster,
2787				 entry->efd_count, err);
2788	}
2789
2790	err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
2791	/* we expect to find existing buddy because it's pinned */
2792	BUG_ON(err != 0);
2793
2794
2795	db = e4b.bd_info;
2796	/* there are blocks to put in buddy to make them really free */
2797	count += entry->efd_count;
2798	count2++;
2799	ext4_lock_group(sb, entry->efd_group);
2800	/* Take it out of per group rb tree */
2801	rb_erase(&entry->efd_node, &(db->bb_free_root));
2802	mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
2803
2804	/*
2805	 * Clear the trimmed flag for the group so that the next
2806	 * ext4_trim_fs can trim it.
2807	 * If the volume is mounted with -o discard, online discard
2808	 * is supported and the free blocks will be trimmed online.
2809	 */
2810	if (!test_opt(sb, DISCARD))
2811		EXT4_MB_GRP_CLEAR_TRIMMED(db);
2812
2813	if (!db->bb_free_root.rb_node) {
2814		/* No more items in the per group rb tree
2815		 * balance refcounts from ext4_mb_free_metadata()
2816		 */
2817		page_cache_release(e4b.bd_buddy_page);
2818		page_cache_release(e4b.bd_bitmap_page);
2819	}
2820	ext4_unlock_group(sb, entry->efd_group);
2821	kmem_cache_free(ext4_free_data_cachep, entry);
2822	ext4_mb_unload_buddy(&e4b);
2823
2824	mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
2825}
2826
2827int __init ext4_init_mballoc(void)
2828{
2829	ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
2830					SLAB_RECLAIM_ACCOUNT);
2831	if (ext4_pspace_cachep == NULL)
2832		return -ENOMEM;
2833
2834	ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
2835				    SLAB_RECLAIM_ACCOUNT);
2836	if (ext4_ac_cachep == NULL) {
2837		kmem_cache_destroy(ext4_pspace_cachep);
2838		return -ENOMEM;
2839	}
2840
2841	ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
2842					   SLAB_RECLAIM_ACCOUNT);
2843	if (ext4_free_data_cachep == NULL) {
2844		kmem_cache_destroy(ext4_pspace_cachep);
2845		kmem_cache_destroy(ext4_ac_cachep);
2846		return -ENOMEM;
2847	}
2848	return 0;
2849}
2850
2851void ext4_exit_mballoc(void)
2852{
2853	/*
2854	 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2855	 * before destroying the slab cache.
2856	 */
2857	rcu_barrier();
2858	kmem_cache_destroy(ext4_pspace_cachep);
2859	kmem_cache_destroy(ext4_ac_cachep);
2860	kmem_cache_destroy(ext4_free_data_cachep);
2861	ext4_groupinfo_destroy_slabs();
2862}
2863
2864
2865/*
2866 * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
2867 * Returns 0 if success or error code
2868 */
2869static noinline_for_stack int
2870ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2871				handle_t *handle, unsigned int reserv_clstrs)
2872{
2873	struct buffer_head *bitmap_bh = NULL;
2874	struct ext4_group_desc *gdp;
2875	struct buffer_head *gdp_bh;
2876	struct ext4_sb_info *sbi;
2877	struct super_block *sb;
2878	ext4_fsblk_t block;
2879	int err, len;
2880
2881	BUG_ON(ac->ac_status != AC_STATUS_FOUND);
2882	BUG_ON(ac->ac_b_ex.fe_len <= 0);
2883
2884	sb = ac->ac_sb;
2885	sbi = EXT4_SB(sb);
2886
2887	err = -EIO;
2888	bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
2889	if (!bitmap_bh)
2890		goto out_err;
2891
2892	BUFFER_TRACE(bitmap_bh, "getting write access");
2893	err = ext4_journal_get_write_access(handle, bitmap_bh);
2894	if (err)
2895		goto out_err;
2896
2897	err = -EIO;
2898	gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
2899	if (!gdp)
2900		goto out_err;
2901
2902	ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
2903			ext4_free_group_clusters(sb, gdp));
2904
2905	BUFFER_TRACE(gdp_bh, "get_write_access");
2906	err = ext4_journal_get_write_access(handle, gdp_bh);
2907	if (err)
2908		goto out_err;
2909
2910	block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
2911
2912	len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
2913	if (!ext4_data_block_valid(sbi, block, len)) {
2914		ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
2915			   "fs metadata", block, block+len);
2916		/* File system mounted not to panic on error
2917		 * Fix the bitmap and repeat the block allocation
2918		 * We leak some of the blocks here.
2919		 */
2920		ext4_lock_group(sb, ac->ac_b_ex.fe_group);
2921		ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2922			      ac->ac_b_ex.fe_len);
2923		ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
2924		err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2925		if (!err)
2926			err = -EAGAIN;
2927		goto out_err;
2928	}
2929
2930	ext4_lock_group(sb, ac->ac_b_ex.fe_group);
2931#ifdef AGGRESSIVE_CHECK
2932	{
2933		int i;
2934		for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
2935			BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
2936						bitmap_bh->b_data));
2937		}
2938	}
2939#endif
2940	ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2941		      ac->ac_b_ex.fe_len);
2942	if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2943		gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
2944		ext4_free_group_clusters_set(sb, gdp,
2945					     ext4_free_clusters_after_init(sb,
2946						ac->ac_b_ex.fe_group, gdp));
2947	}
2948	len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
2949	ext4_free_group_clusters_set(sb, gdp, len);
2950	ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh);
2951	ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
2952
2953	ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
2954	percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
2955	/*
2956	 * Now reduce the dirty block count also. Should not go negative
2957	 */
2958	if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
2959		/* release all the reserved blocks if non delalloc */
2960		percpu_counter_sub(&sbi->s_dirtyclusters_counter,
2961				   reserv_clstrs);
2962
2963	if (sbi->s_log_groups_per_flex) {
2964		ext4_group_t flex_group = ext4_flex_group(sbi,
2965							  ac->ac_b_ex.fe_group);
2966		atomic64_sub(ac->ac_b_ex.fe_len,
2967			     &sbi->s_flex_groups[flex_group].free_clusters);
2968	}
2969
2970	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2971	if (err)
2972		goto out_err;
2973	err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
2974
2975out_err:
2976	brelse(bitmap_bh);
2977	return err;
2978}
2979
2980/*
2981 * here we normalize request for locality group
2982 * Group request are normalized to s_mb_group_prealloc, which goes to
2983 * s_strip if we set the same via mount option.
2984 * s_mb_group_prealloc can be configured via
2985 * /sys/fs/ext4/<partition>/mb_group_prealloc
2986 *
2987 * XXX: should we try to preallocate more than the group has now?
2988 */
2989static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
2990{
2991	struct super_block *sb = ac->ac_sb;
2992	struct ext4_locality_group *lg = ac->ac_lg;
2993
2994	BUG_ON(lg == NULL);
2995	ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
2996	mb_debug(1, "#%u: goal %u blocks for locality group\n",
2997		current->pid, ac->ac_g_ex.fe_len);
2998}
2999
3000/*
3001 * Normalization means making request better in terms of
3002 * size and alignment
3003 */
3004static noinline_for_stack void
3005ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3006				struct ext4_allocation_request *ar)
3007{
3008	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3009	int bsbits, max;
3010	ext4_lblk_t end;
3011	loff_t size, start_off;
3012	loff_t orig_size __maybe_unused;
3013	ext4_lblk_t start;
3014	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3015	struct ext4_prealloc_space *pa;
3016
3017	/* do normalize only data requests, metadata requests
3018	   do not need preallocation */
3019	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3020		return;
3021
3022	/* sometime caller may want exact blocks */
3023	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
3024		return;
3025
3026	/* caller may indicate that preallocation isn't
3027	 * required (it's a tail, for example) */
3028	if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
3029		return;
3030
3031	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
3032		ext4_mb_normalize_group_request(ac);
3033		return ;
3034	}
3035
3036	bsbits = ac->ac_sb->s_blocksize_bits;
3037
3038	/* first, let's learn actual file size
3039	 * given current request is allocated */
3040	size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
3041	size = size << bsbits;
3042	if (size < i_size_read(ac->ac_inode))
3043		size = i_size_read(ac->ac_inode);
3044	orig_size = size;
3045
3046	/* max size of free chunks */
3047	max = 2 << bsbits;
3048
3049#define NRL_CHECK_SIZE(req, size, max, chunk_size)	\
3050		(req <= (size) || max <= (chunk_size))
3051
3052	/* first, try to predict filesize */
3053	/* XXX: should this table be tunable? */
3054	start_off = 0;
3055	if (size <= 16 * 1024) {
3056		size = 16 * 1024;
3057	} else if (size <= 32 * 1024) {
3058		size = 32 * 1024;
3059	} else if (size <= 64 * 1024) {
3060		size = 64 * 1024;
3061	} else if (size <= 128 * 1024) {
3062		size = 128 * 1024;
3063	} else if (size <= 256 * 1024) {
3064		size = 256 * 1024;
3065	} else if (size <= 512 * 1024) {
3066		size = 512 * 1024;
3067	} else if (size <= 1024 * 1024) {
3068		size = 1024 * 1024;
3069	} else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
3070		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3071						(21 - bsbits)) << 21;
3072		size = 2 * 1024 * 1024;
3073	} else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
3074		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3075							(22 - bsbits)) << 22;
3076		size = 4 * 1024 * 1024;
3077	} else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
3078					(8<<20)>>bsbits, max, 8 * 1024)) {
3079		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3080							(23 - bsbits)) << 23;
3081		size = 8 * 1024 * 1024;
3082	} else {
3083		start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
3084		size	  = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb),
3085					      ac->ac_o_ex.fe_len) << bsbits;
3086	}
3087	size = size >> bsbits;
3088	start = start_off >> bsbits;
3089
3090	/* don't cover already allocated blocks in selected range */
3091	if (ar->pleft && start <= ar->lleft) {
3092		size -= ar->lleft + 1 - start;
3093		start = ar->lleft + 1;
3094	}
3095	if (ar->pright && start + size - 1 >= ar->lright)
3096		size -= start + size - ar->lright;
3097
3098	end = start + size;
3099
3100	/* check we don't cross already preallocated blocks */
3101	rcu_read_lock();
3102	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3103		ext4_lblk_t pa_end;
3104
3105		if (pa->pa_deleted)
3106			continue;
3107		spin_lock(&pa->pa_lock);
3108		if (pa->pa_deleted) {
3109			spin_unlock(&pa->pa_lock);
3110			continue;
3111		}
3112
3113		pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
3114						  pa->pa_len);
3115
3116		/* PA must not overlap original request */
3117		BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
3118			ac->ac_o_ex.fe_logical < pa->pa_lstart));
3119
3120		/* skip PAs this normalized request doesn't overlap with */
3121		if (pa->pa_lstart >= end || pa_end <= start) {
3122			spin_unlock(&pa->pa_lock);
3123			continue;
3124		}
3125		BUG_ON(pa->pa_lstart <= start && pa_end >= end);
3126
3127		/* adjust start or end to be adjacent to this pa */
3128		if (pa_end <= ac->ac_o_ex.fe_logical) {
3129			BUG_ON(pa_end < start);
3130			start = pa_end;
3131		} else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
3132			BUG_ON(pa->pa_lstart > end);
3133			end = pa->pa_lstart;
3134		}
3135		spin_unlock(&pa->pa_lock);
3136	}
3137	rcu_read_unlock();
3138	size = end - start;
3139
3140	/* XXX: extra loop to check we really don't overlap preallocations */
3141	rcu_read_lock();
3142	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3143		ext4_lblk_t pa_end;
3144
3145		spin_lock(&pa->pa_lock);
3146		if (pa->pa_deleted == 0) {
3147			pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
3148							  pa->pa_len);
3149			BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
3150		}
3151		spin_unlock(&pa->pa_lock);
3152	}
3153	rcu_read_unlock();
3154
3155	if (start + size <= ac->ac_o_ex.fe_logical &&
3156			start > ac->ac_o_ex.fe_logical) {
3157		ext4_msg(ac->ac_sb, KERN_ERR,
3158			 "start %lu, size %lu, fe_logical %lu",
3159			 (unsigned long) start, (unsigned long) size,
3160			 (unsigned long) ac->ac_o_ex.fe_logical);
3161		BUG();
3162	}
3163	BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
3164
3165	/* now prepare goal request */
3166
3167	/* XXX: is it better to align blocks WRT to logical
3168	 * placement or satisfy big request as is */
3169	ac->ac_g_ex.fe_logical = start;
3170	ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
3171
3172	/* define goal start in order to merge */
3173	if (ar->pright && (ar->lright == (start + size))) {
3174		/* merge to the right */
3175		ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
3176						&ac->ac_f_ex.fe_group,
3177						&ac->ac_f_ex.fe_start);
3178		ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3179	}
3180	if (ar->pleft && (ar->lleft + 1 == start)) {
3181		/* merge to the left */
3182		ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
3183						&ac->ac_f_ex.fe_group,
3184						&ac->ac_f_ex.fe_start);
3185		ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3186	}
3187
3188	mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
3189		(unsigned) orig_size, (unsigned) start);
3190}
3191
3192static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3193{
3194	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3195
3196	if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
3197		atomic_inc(&sbi->s_bal_reqs);
3198		atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
3199		if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
3200			atomic_inc(&sbi->s_bal_success);
3201		atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
3202		if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
3203				ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
3204			atomic_inc(&sbi->s_bal_goals);
3205		if (ac->ac_found > sbi->s_mb_max_to_scan)
3206			atomic_inc(&sbi->s_bal_breaks);
3207	}
3208
3209	if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
3210		trace_ext4_mballoc_alloc(ac);
3211	else
3212		trace_ext4_mballoc_prealloc(ac);
3213}
3214
3215/*
3216 * Called on failure; free up any blocks from the inode PA for this
3217 * context.  We don't need this for MB_GROUP_PA because we only change
3218 * pa_free in ext4_mb_release_context(), but on failure, we've already
3219 * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
3220 */
3221static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
3222{
3223	struct ext4_prealloc_space *pa = ac->ac_pa;
3224	struct ext4_buddy e4b;
3225	int err;
3226
3227	if (pa == NULL) {
3228		if (ac->ac_f_ex.fe_len == 0)
3229			return;
3230		err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
3231		if (err) {
3232			/*
3233			 * This should never happen since we pin the
3234			 * pages in the ext4_allocation_context so
3235			 * ext4_mb_load_buddy() should never fail.
3236			 */
3237			WARN(1, "mb_load_buddy failed (%d)", err);
3238			return;
3239		}
3240		ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
3241		mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
3242			       ac->ac_f_ex.fe_len);
3243		ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
3244		ext4_mb_unload_buddy(&e4b);
3245		return;
3246	}
3247	if (pa->pa_type == MB_INODE_PA)
3248		pa->pa_free += ac->ac_b_ex.fe_len;
3249}
3250
3251/*
3252 * use blocks preallocated to inode
3253 */
3254static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3255				struct ext4_prealloc_space *pa)
3256{
3257	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3258	ext4_fsblk_t start;
3259	ext4_fsblk_t end;
3260	int len;
3261
3262	/* found preallocated blocks, use them */
3263	start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
3264	end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
3265		  start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
3266	len = EXT4_NUM_B2C(sbi, end - start);
3267	ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
3268					&ac->ac_b_ex.fe_start);
3269	ac->ac_b_ex.fe_len = len;
3270	ac->ac_status = AC_STATUS_FOUND;
3271	ac->ac_pa = pa;
3272
3273	BUG_ON(start < pa->pa_pstart);
3274	BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
3275	BUG_ON(pa->pa_free < len);
3276	pa->pa_free -= len;
3277
3278	mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
3279}
3280
3281/*
3282 * use blocks preallocated to locality group
3283 */
3284static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
3285				struct ext4_prealloc_space *pa)
3286{
3287	unsigned int len = ac->ac_o_ex.fe_len;
3288
3289	ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
3290					&ac->ac_b_ex.fe_group,
3291					&ac->ac_b_ex.fe_start);
3292	ac->ac_b_ex.fe_len = len;
3293	ac->ac_status = AC_STATUS_FOUND;
3294	ac->ac_pa = pa;
3295
3296	/* we don't correct pa_pstart or pa_plen here to avoid
3297	 * possible race when the group is being loaded concurrently
3298	 * instead we correct pa later, after blocks are marked
3299	 * in on-disk bitmap -- see ext4_mb_release_context()
3300	 * Other CPUs are prevented from allocating from this pa by lg_mutex
3301	 */
3302	mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
3303}
3304
3305/*
3306 * Return the prealloc space that have minimal distance
3307 * from the goal block. @cpa is the prealloc
3308 * space that is having currently known minimal distance
3309 * from the goal block.
3310 */
3311static struct ext4_prealloc_space *
3312ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
3313			struct ext4_prealloc_space *pa,
3314			struct ext4_prealloc_space *cpa)
3315{
3316	ext4_fsblk_t cur_distance, new_distance;
3317
3318	if (cpa == NULL) {
3319		atomic_inc(&pa->pa_count);
3320		return pa;
3321	}
3322	cur_distance = abs(goal_block - cpa->pa_pstart);
3323	new_distance = abs(goal_block - pa->pa_pstart);
3324
3325	if (cur_distance <= new_distance)
3326		return cpa;
3327
3328	/* drop the previous reference */
3329	atomic_dec(&cpa->pa_count);
3330	atomic_inc(&pa->pa_count);
3331	return pa;
3332}
3333
3334/*
3335 * search goal blocks in preallocated space
3336 */
3337static noinline_for_stack int
3338ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3339{
3340	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3341	int order, i;
3342	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3343	struct ext4_locality_group *lg;
3344	struct ext4_prealloc_space *pa, *cpa = NULL;
3345	ext4_fsblk_t goal_block;
3346
3347	/* only data can be preallocated */
3348	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3349		return 0;
3350
3351	/* first, try per-file preallocation */
3352	rcu_read_lock();
3353	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3354
3355		/* all fields in this condition don't change,
3356		 * so we can skip locking for them */
3357		if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
3358		    ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
3359					       EXT4_C2B(sbi, pa->pa_len)))
3360			continue;
3361
3362		/* non-extent files can't have physical blocks past 2^32 */
3363		if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
3364		    (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
3365		     EXT4_MAX_BLOCK_FILE_PHYS))
3366			continue;
3367
3368		/* found preallocated blocks, use them */
3369		spin_lock(&pa->pa_lock);
3370		if (pa->pa_deleted == 0 && pa->pa_free) {
3371			atomic_inc(&pa->pa_count);
3372			ext4_mb_use_inode_pa(ac, pa);
3373			spin_unlock(&pa->pa_lock);
3374			ac->ac_criteria = 10;
3375			rcu_read_unlock();
3376			return 1;
3377		}
3378		spin_unlock(&pa->pa_lock);
3379	}
3380	rcu_read_unlock();
3381
3382	/* can we use group allocation? */
3383	if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
3384		return 0;
3385
3386	/* inode may have no locality group for some reason */
3387	lg = ac->ac_lg;
3388	if (lg == NULL)
3389		return 0;
3390	order  = fls(ac->ac_o_ex.fe_len) - 1;
3391	if (order > PREALLOC_TB_SIZE - 1)
3392		/* The max size of hash table is PREALLOC_TB_SIZE */
3393		order = PREALLOC_TB_SIZE - 1;
3394
3395	goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
3396	/*
3397	 * search for the prealloc space that is having
3398	 * minimal distance from the goal block.
3399	 */
3400	for (i = order; i < PREALLOC_TB_SIZE; i++) {
3401		rcu_read_lock();
3402		list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
3403					pa_inode_list) {
3404			spin_lock(&pa->pa_lock);
3405			if (pa->pa_deleted == 0 &&
3406					pa->pa_free >= ac->ac_o_ex.fe_len) {
3407
3408				cpa = ext4_mb_check_group_pa(goal_block,
3409								pa, cpa);
3410			}
3411			spin_unlock(&pa->pa_lock);
3412		}
3413		rcu_read_unlock();
3414	}
3415	if (cpa) {
3416		ext4_mb_use_group_pa(ac, cpa);
3417		ac->ac_criteria = 20;
3418		return 1;
3419	}
3420	return 0;
3421}
3422
3423/*
3424 * the function goes through all block freed in the group
3425 * but not yet committed and marks them used in in-core bitmap.
3426 * buddy must be generated from this bitmap
3427 * Need to be called with the ext4 group lock held
3428 */
3429static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3430						ext4_group_t group)
3431{
3432	struct rb_node *n;
3433	struct ext4_group_info *grp;
3434	struct ext4_free_data *entry;
3435
3436	grp = ext4_get_group_info(sb, group);
3437	n = rb_first(&(grp->bb_free_root));
3438
3439	while (n) {
3440		entry = rb_entry(n, struct ext4_free_data, efd_node);
3441		ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
3442		n = rb_next(n);
3443	}
3444	return;
3445}
3446
3447/*
3448 * the function goes through all preallocation in this group and marks them
3449 * used in in-core bitmap. buddy must be generated from this bitmap
3450 * Need to be called with ext4 group lock held
3451 */
3452static noinline_for_stack
3453void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3454					ext4_group_t group)
3455{
3456	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3457	struct ext4_prealloc_space *pa;
3458	struct list_head *cur;
3459	ext4_group_t groupnr;
3460	ext4_grpblk_t start;
3461	int preallocated = 0;
3462	int len;
3463
3464	/* all form of preallocation discards first load group,
3465	 * so the only competing code is preallocation use.
3466	 * we don't need any locking here
3467	 * notice we do NOT ignore preallocations with pa_deleted
3468	 * otherwise we could leave used blocks available for
3469	 * allocation in buddy when concurrent ext4_mb_put_pa()
3470	 * is dropping preallocation
3471	 */
3472	list_for_each(cur, &grp->bb_prealloc_list) {
3473		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
3474		spin_lock(&pa->pa_lock);
3475		ext4_get_group_no_and_offset(sb, pa->pa_pstart,
3476					     &groupnr, &start);
3477		len = pa->pa_len;
3478		spin_unlock(&pa->pa_lock);
3479		if (unlikely(len == 0))
3480			continue;
3481		BUG_ON(groupnr != group);
3482		ext4_set_bits(bitmap, start, len);
3483		preallocated += len;
3484	}
3485	mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
3486}
3487
3488static void ext4_mb_pa_callback(struct rcu_head *head)
3489{
3490	struct ext4_prealloc_space *pa;
3491	pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
3492
3493	BUG_ON(atomic_read(&pa->pa_count));
3494	BUG_ON(pa->pa_deleted == 0);
3495	kmem_cache_free(ext4_pspace_cachep, pa);
3496}
3497
3498/*
3499 * drops a reference to preallocated space descriptor
3500 * if this was the last reference and the space is consumed
3501 */
3502static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3503			struct super_block *sb, struct ext4_prealloc_space *pa)
3504{
3505	ext4_group_t grp;
3506	ext4_fsblk_t grp_blk;
3507
3508	/* in this short window concurrent discard can set pa_deleted */
3509	spin_lock(&pa->pa_lock);
3510	if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
3511		spin_unlock(&pa->pa_lock);
3512		return;
3513	}
3514
3515	if (pa->pa_deleted == 1) {
3516		spin_unlock(&pa->pa_lock);
3517		return;
3518	}
3519
3520	pa->pa_deleted = 1;
3521	spin_unlock(&pa->pa_lock);
3522
3523	grp_blk = pa->pa_pstart;
3524	/*
3525	 * If doing group-based preallocation, pa_pstart may be in the
3526	 * next group when pa is used up
3527	 */
3528	if (pa->pa_type == MB_GROUP_PA)
3529		grp_blk--;
3530
3531	grp = ext4_get_group_number(sb, grp_blk);
3532
3533	/*
3534	 * possible race:
3535	 *
3536	 *  P1 (buddy init)			P2 (regular allocation)
3537	 *					find block B in PA
3538	 *  copy on-disk bitmap to buddy
3539	 *  					mark B in on-disk bitmap
3540	 *					drop PA from group
3541	 *  mark all PAs in buddy
3542	 *
3543	 * thus, P1 initializes buddy with B available. to prevent this
3544	 * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
3545	 * against that pair
3546	 */
3547	ext4_lock_group(sb, grp);
3548	list_del(&pa->pa_group_list);
3549	ext4_unlock_group(sb, grp);
3550
3551	spin_lock(pa->pa_obj_lock);
3552	list_del_rcu(&pa->pa_inode_list);
3553	spin_unlock(pa->pa_obj_lock);
3554
3555	call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3556}
3557
3558/*
3559 * creates new preallocated space for given inode
3560 */
3561static noinline_for_stack int
3562ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3563{
3564	struct super_block *sb = ac->ac_sb;
3565	struct ext4_sb_info *sbi = EXT4_SB(sb);
3566	struct ext4_prealloc_space *pa;
3567	struct ext4_group_info *grp;
3568	struct ext4_inode_info *ei;
3569
3570	/* preallocate only when found space is larger then requested */
3571	BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3572	BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3573	BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3574
3575	pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3576	if (pa == NULL)
3577		return -ENOMEM;
3578
3579	if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
3580		int winl;
3581		int wins;
3582		int win;
3583		int offs;
3584
3585		/* we can't allocate as much as normalizer wants.
3586		 * so, found space must get proper lstart
3587		 * to cover original request */
3588		BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
3589		BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
3590
3591		/* we're limited by original request in that
3592		 * logical block must be covered any way
3593		 * winl is window we can move our chunk within */
3594		winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
3595
3596		/* also, we should cover whole original request */
3597		wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
3598
3599		/* the smallest one defines real window */
3600		win = min(winl, wins);
3601
3602		offs = ac->ac_o_ex.fe_logical %
3603			EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
3604		if (offs && offs < win)
3605			win = offs;
3606
3607		ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
3608			EXT4_NUM_B2C(sbi, win);
3609		BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
3610		BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
3611	}
3612
3613	/* preallocation can change ac_b_ex, thus we store actually
3614	 * allocated blocks for history */
3615	ac->ac_f_ex = ac->ac_b_ex;
3616
3617	pa->pa_lstart = ac->ac_b_ex.fe_logical;
3618	pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3619	pa->pa_len = ac->ac_b_ex.fe_len;
3620	pa->pa_free = pa->pa_len;
3621	atomic_set(&pa->pa_count, 1);
3622	spin_lock_init(&pa->pa_lock);
3623	INIT_LIST_HEAD(&pa->pa_inode_list);
3624	INIT_LIST_HEAD(&pa->pa_group_list);
3625	pa->pa_deleted = 0;
3626	pa->pa_type = MB_INODE_PA;
3627
3628	mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
3629			pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3630	trace_ext4_mb_new_inode_pa(ac, pa);
3631
3632	ext4_mb_use_inode_pa(ac, pa);
3633	atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
3634
3635	ei = EXT4_I(ac->ac_inode);
3636	grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3637
3638	pa->pa_obj_lock = &ei->i_prealloc_lock;
3639	pa->pa_inode = ac->ac_inode;
3640
3641	ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3642	list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3643	ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3644
3645	spin_lock(pa->pa_obj_lock);
3646	list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
3647	spin_unlock(pa->pa_obj_lock);
3648
3649	return 0;
3650}
3651
3652/*
3653 * creates new preallocated space for locality group inodes belongs to
3654 */
3655static noinline_for_stack int
3656ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3657{
3658	struct super_block *sb = ac->ac_sb;
3659	struct ext4_locality_group *lg;
3660	struct ext4_prealloc_space *pa;
3661	struct ext4_group_info *grp;
3662
3663	/* preallocate only when found space is larger then requested */
3664	BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3665	BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3666	BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3667
3668	BUG_ON(ext4_pspace_cachep == NULL);
3669	pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3670	if (pa == NULL)
3671		return -ENOMEM;
3672
3673	/* preallocation can change ac_b_ex, thus we store actually
3674	 * allocated blocks for history */
3675	ac->ac_f_ex = ac->ac_b_ex;
3676
3677	pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3678	pa->pa_lstart = pa->pa_pstart;
3679	pa->pa_len = ac->ac_b_ex.fe_len;
3680	pa->pa_free = pa->pa_len;
3681	atomic_set(&pa->pa_count, 1);
3682	spin_lock_init(&pa->pa_lock);
3683	INIT_LIST_HEAD(&pa->pa_inode_list);
3684	INIT_LIST_HEAD(&pa->pa_group_list);
3685	pa->pa_deleted = 0;
3686	pa->pa_type = MB_GROUP_PA;
3687
3688	mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
3689			pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3690	trace_ext4_mb_new_group_pa(ac, pa);
3691
3692	ext4_mb_use_group_pa(ac, pa);
3693	atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
3694
3695	grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3696	lg = ac->ac_lg;
3697	BUG_ON(lg == NULL);
3698
3699	pa->pa_obj_lock = &lg->lg_prealloc_lock;
3700	pa->pa_inode = NULL;
3701
3702	ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3703	list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3704	ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3705
3706	/*
3707	 * We will later add the new pa to the right bucket
3708	 * after updating the pa_free in ext4_mb_release_context
3709	 */
3710	return 0;
3711}
3712
3713static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
3714{
3715	int err;
3716
3717	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
3718		err = ext4_mb_new_group_pa(ac);
3719	else
3720		err = ext4_mb_new_inode_pa(ac);
3721	return err;
3722}
3723
3724/*
3725 * finds all unused blocks in on-disk bitmap, frees them in
3726 * in-core bitmap and buddy.
3727 * @pa must be unlinked from inode and group lists, so that
3728 * nobody else can find/use it.
3729 * the caller MUST hold group/inode locks.
3730 * TODO: optimize the case when there are no in-core structures yet
3731 */
3732static noinline_for_stack int
3733ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3734			struct ext4_prealloc_space *pa)
3735{
3736	struct super_block *sb = e4b->bd_sb;
3737	struct ext4_sb_info *sbi = EXT4_SB(sb);
3738	unsigned int end;
3739	unsigned int next;
3740	ext4_group_t group;
3741	ext4_grpblk_t bit;
3742	unsigned long long grp_blk_start;
3743	int err = 0;
3744	int free = 0;
3745
3746	BUG_ON(pa->pa_deleted == 0);
3747	ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3748	grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
3749	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3750	end = bit + pa->pa_len;
3751
3752	while (bit < end) {
3753		bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
3754		if (bit >= end)
3755			break;
3756		next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3757		mb_debug(1, "    free preallocated %u/%u in group %u\n",
3758			 (unsigned) ext4_group_first_block_no(sb, group) + bit,
3759			 (unsigned) next - bit, (unsigned) group);
3760		free += next - bit;
3761
3762		trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
3763		trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
3764						    EXT4_C2B(sbi, bit)),
3765					       next - bit);
3766		mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3767		bit = next + 1;
3768	}
3769	if (free != pa->pa_free) {
3770		ext4_msg(e4b->bd_sb, KERN_CRIT,
3771			 "pa %p: logic %lu, phys. %lu, len %lu",
3772			 pa, (unsigned long) pa->pa_lstart,
3773			 (unsigned long) pa->pa_pstart,
3774			 (unsigned long) pa->pa_len);
3775		ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
3776					free, pa->pa_free);
3777		/*
3778		 * pa is already deleted so we use the value obtained
3779		 * from the bitmap and continue.
3780		 */
3781	}
3782	atomic_add(free, &sbi->s_mb_discarded);
3783
3784	return err;
3785}
3786
3787static noinline_for_stack int
3788ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3789				struct ext4_prealloc_space *pa)
3790{
3791	struct super_block *sb = e4b->bd_sb;
3792	ext4_group_t group;
3793	ext4_grpblk_t bit;
3794
3795	trace_ext4_mb_release_group_pa(sb, pa);
3796	BUG_ON(pa->pa_deleted == 0);
3797	ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3798	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3799	mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
3800	atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
3801	trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
3802
3803	return 0;
3804}
3805
3806/*
3807 * releases all preallocations in given group
3808 *
3809 * first, we need to decide discard policy:
3810 * - when do we discard
3811 *   1) ENOSPC
3812 * - how many do we discard
3813 *   1) how many requested
3814 */
3815static noinline_for_stack int
3816ext4_mb_discard_group_preallocations(struct super_block *sb,
3817					ext4_group_t group, int needed)
3818{
3819	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3820	struct buffer_head *bitmap_bh = NULL;
3821	struct ext4_prealloc_space *pa, *tmp;
3822	struct list_head list;
3823	struct ext4_buddy e4b;
3824	int err;
3825	int busy = 0;
3826	int free = 0;
3827
3828	mb_debug(1, "discard preallocation for group %u\n", group);
3829
3830	if (list_empty(&grp->bb_prealloc_list))
3831		return 0;
3832
3833	bitmap_bh = ext4_read_block_bitmap(sb, group);
3834	if (bitmap_bh == NULL) {
3835		ext4_error(sb, "Error reading block bitmap for %u", group);
3836		return 0;
3837	}
3838
3839	err = ext4_mb_load_buddy(sb, group, &e4b);
3840	if (err) {
3841		ext4_error(sb, "Error loading buddy information for %u", group);
3842		put_bh(bitmap_bh);
3843		return 0;
3844	}
3845
3846	if (needed == 0)
3847		needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
3848
3849	INIT_LIST_HEAD(&list);
3850repeat:
3851	ext4_lock_group(sb, group);
3852	list_for_each_entry_safe(pa, tmp,
3853				&grp->bb_prealloc_list, pa_group_list) {
3854		spin_lock(&pa->pa_lock);
3855		if (atomic_read(&pa->pa_count)) {
3856			spin_unlock(&pa->pa_lock);
3857			busy = 1;
3858			continue;
3859		}
3860		if (pa->pa_deleted) {
3861			spin_unlock(&pa->pa_lock);
3862			continue;
3863		}
3864
3865		/* seems this one can be freed ... */
3866		pa->pa_deleted = 1;
3867
3868		/* we can trust pa_free ... */
3869		free += pa->pa_free;
3870
3871		spin_unlock(&pa->pa_lock);
3872
3873		list_del(&pa->pa_group_list);
3874		list_add(&pa->u.pa_tmp_list, &list);
3875	}
3876
3877	/* if we still need more blocks and some PAs were used, try again */
3878	if (free < needed && busy) {
3879		busy = 0;
3880		ext4_unlock_group(sb, group);
3881		cond_resched();
3882		goto repeat;
3883	}
3884
3885	/* found anything to free? */
3886	if (list_empty(&list)) {
3887		BUG_ON(free != 0);
3888		goto out;
3889	}
3890
3891	/* now free all selected PAs */
3892	list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
3893
3894		/* remove from object (inode or locality group) */
3895		spin_lock(pa->pa_obj_lock);
3896		list_del_rcu(&pa->pa_inode_list);
3897		spin_unlock(pa->pa_obj_lock);
3898
3899		if (pa->pa_type == MB_GROUP_PA)
3900			ext4_mb_release_group_pa(&e4b, pa);
3901		else
3902			ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3903
3904		list_del(&pa->u.pa_tmp_list);
3905		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3906	}
3907
3908out:
3909	ext4_unlock_group(sb, group);
3910	ext4_mb_unload_buddy(&e4b);
3911	put_bh(bitmap_bh);
3912	return free;
3913}
3914
3915/*
3916 * releases all non-used preallocated blocks for given inode
3917 *
3918 * It's important to discard preallocations under i_data_sem
3919 * We don't want another block to be served from the prealloc
3920 * space when we are discarding the inode prealloc space.
3921 *
3922 * FIXME!! Make sure it is valid at all the call sites
3923 */
3924void ext4_discard_preallocations(struct inode *inode)
3925{
3926	struct ext4_inode_info *ei = EXT4_I(inode);
3927	struct super_block *sb = inode->i_sb;
3928	struct buffer_head *bitmap_bh = NULL;
3929	struct ext4_prealloc_space *pa, *tmp;
3930	ext4_group_t group = 0;
3931	struct list_head list;
3932	struct ext4_buddy e4b;
3933	int err;
3934
3935	if (!S_ISREG(inode->i_mode)) {
3936		/*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
3937		return;
3938	}
3939
3940	mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
3941	trace_ext4_discard_preallocations(inode);
3942
3943	INIT_LIST_HEAD(&list);
3944
3945repeat:
3946	/* first, collect all pa's in the inode */
3947	spin_lock(&ei->i_prealloc_lock);
3948	while (!list_empty(&ei->i_prealloc_list)) {
3949		pa = list_entry(ei->i_prealloc_list.next,
3950				struct ext4_prealloc_space, pa_inode_list);
3951		BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
3952		spin_lock(&pa->pa_lock);
3953		if (atomic_read(&pa->pa_count)) {
3954			/* this shouldn't happen often - nobody should
3955			 * use preallocation while we're discarding it */
3956			spin_unlock(&pa->pa_lock);
3957			spin_unlock(&ei->i_prealloc_lock);
3958			ext4_msg(sb, KERN_ERR,
3959				 "uh-oh! used pa while discarding");
3960			WARN_ON(1);
3961			schedule_timeout_uninterruptible(HZ);
3962			goto repeat;
3963
3964		}
3965		if (pa->pa_deleted == 0) {
3966			pa->pa_deleted = 1;
3967			spin_unlock(&pa->pa_lock);
3968			list_del_rcu(&pa->pa_inode_list);
3969			list_add(&pa->u.pa_tmp_list, &list);
3970			continue;
3971		}
3972
3973		/* someone is deleting pa right now */
3974		spin_unlock(&pa->pa_lock);
3975		spin_unlock(&ei->i_prealloc_lock);
3976
3977		/* we have to wait here because pa_deleted
3978		 * doesn't mean pa is already unlinked from
3979		 * the list. as we might be called from
3980		 * ->clear_inode() the inode will get freed
3981		 * and concurrent thread which is unlinking
3982		 * pa from inode's list may access already
3983		 * freed memory, bad-bad-bad */
3984
3985		/* XXX: if this happens too often, we can
3986		 * add a flag to force wait only in case
3987		 * of ->clear_inode(), but not in case of
3988		 * regular truncate */
3989		schedule_timeout_uninterruptible(HZ);
3990		goto repeat;
3991	}
3992	spin_unlock(&ei->i_prealloc_lock);
3993
3994	list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
3995		BUG_ON(pa->pa_type != MB_INODE_PA);
3996		group = ext4_get_group_number(sb, pa->pa_pstart);
3997
3998		err = ext4_mb_load_buddy(sb, group, &e4b);
3999		if (err) {
4000			ext4_error(sb, "Error loading buddy information for %u",
4001					group);
4002			continue;
4003		}
4004
4005		bitmap_bh = ext4_read_block_bitmap(sb, group);
4006		if (bitmap_bh == NULL) {
4007			ext4_error(sb, "Error reading block bitmap for %u",
4008					group);
4009			ext4_mb_unload_buddy(&e4b);
4010			continue;
4011		}
4012
4013		ext4_lock_group(sb, group);
4014		list_del(&pa->pa_group_list);
4015		ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
4016		ext4_unlock_group(sb, group);
4017
4018		ext4_mb_unload_buddy(&e4b);
4019		put_bh(bitmap_bh);
4020
4021		list_del(&pa->u.pa_tmp_list);
4022		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4023	}
4024}
4025
4026#ifdef CONFIG_EXT4_DEBUG
4027static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4028{
4029	struct super_block *sb = ac->ac_sb;
4030	ext4_group_t ngroups, i;
4031
4032	if (!ext4_mballoc_debug ||
4033	    (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
4034		return;
4035
4036	ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"
4037			" Allocation context details:");
4038	ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",
4039			ac->ac_status, ac->ac_flags);
4040	ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "
4041		 	"goal %lu/%lu/%lu@%lu, "
4042			"best %lu/%lu/%lu@%lu cr %d",
4043			(unsigned long)ac->ac_o_ex.fe_group,
4044			(unsigned long)ac->ac_o_ex.fe_start,
4045			(unsigned long)ac->ac_o_ex.fe_len,
4046			(unsigned long)ac->ac_o_ex.fe_logical,
4047			(unsigned long)ac->ac_g_ex.fe_group,
4048			(unsigned long)ac->ac_g_ex.fe_start,
4049			(unsigned long)ac->ac_g_ex.fe_len,
4050			(unsigned long)ac->ac_g_ex.fe_logical,
4051			(unsigned long)ac->ac_b_ex.fe_group,
4052			(unsigned long)ac->ac_b_ex.fe_start,
4053			(unsigned long)ac->ac_b_ex.fe_len,
4054			(unsigned long)ac->ac_b_ex.fe_logical,
4055			(int)ac->ac_criteria);
4056	ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found);
4057	ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
4058	ngroups = ext4_get_groups_count(sb);
4059	for (i = 0; i < ngroups; i++) {
4060		struct ext4_group_info *grp = ext4_get_group_info(sb, i);
4061		struct ext4_prealloc_space *pa;
4062		ext4_grpblk_t start;
4063		struct list_head *cur;
4064		ext4_lock_group(sb, i);
4065		list_for_each(cur, &grp->bb_prealloc_list) {
4066			pa = list_entry(cur, struct ext4_prealloc_space,
4067					pa_group_list);
4068			spin_lock(&pa->pa_lock);
4069			ext4_get_group_no_and_offset(sb, pa->pa_pstart,
4070						     NULL, &start);
4071			spin_unlock(&pa->pa_lock);
4072			printk(KERN_ERR "PA:%u:%d:%u \n", i,
4073			       start, pa->pa_len);
4074		}
4075		ext4_unlock_group(sb, i);
4076
4077		if (grp->bb_free == 0)
4078			continue;
4079		printk(KERN_ERR "%u: %d/%d \n",
4080		       i, grp->bb_free, grp->bb_fragments);
4081	}
4082	printk(KERN_ERR "\n");
4083}
4084#else
4085static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4086{
4087	return;
4088}
4089#endif
4090
4091/*
4092 * We use locality group preallocation for small size file. The size of the
4093 * file is determined by the current size or the resulting size after
4094 * allocation which ever is larger
4095 *
4096 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
4097 */
4098static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4099{
4100	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4101	int bsbits = ac->ac_sb->s_blocksize_bits;
4102	loff_t size, isize;
4103
4104	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
4105		return;
4106
4107	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
4108		return;
4109
4110	size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
4111	isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
4112		>> bsbits;
4113
4114	if ((size == isize) &&
4115	    !ext4_fs_is_busy(sbi) &&
4116	    (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
4117		ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
4118		return;
4119	}
4120
4121	if (sbi->s_mb_group_prealloc <= 0) {
4122		ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4123		return;
4124	}
4125
4126	/* don't use group allocation for large files */
4127	size = max(size, isize);
4128	if (size > sbi->s_mb_stream_request) {
4129		ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4130		return;
4131	}
4132
4133	BUG_ON(ac->ac_lg != NULL);
4134	/*
4135	 * locality group prealloc space are per cpu. The reason for having
4136	 * per cpu locality group is to reduce the contention between block
4137	 * request from multiple CPUs.
4138	 */
4139	ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups);
4140
4141	/* we're going to use group allocation */
4142	ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
4143
4144	/* serialize all allocations in the group */
4145	mutex_lock(&ac->ac_lg->lg_mutex);
4146}
4147
4148static noinline_for_stack int
4149ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4150				struct ext4_allocation_request *ar)
4151{
4152	struct super_block *sb = ar->inode->i_sb;
4153	struct ext4_sb_info *sbi = EXT4_SB(sb);
4154	struct ext4_super_block *es = sbi->s_es;
4155	ext4_group_t group;
4156	unsigned int len;
4157	ext4_fsblk_t goal;
4158	ext4_grpblk_t block;
4159
4160	/* we can't allocate > group size */
4161	len = ar->len;
4162
4163	/* just a dirty hack to filter too big requests  */
4164	if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
4165		len = EXT4_CLUSTERS_PER_GROUP(sb);
4166
4167	/* start searching from the goal */
4168	goal = ar->goal;
4169	if (goal < le32_to_cpu(es->s_first_data_block) ||
4170			goal >= ext4_blocks_count(es))
4171		goal = le32_to_cpu(es->s_first_data_block);
4172	ext4_get_group_no_and_offset(sb, goal, &group, &block);
4173
4174	/* set up allocation goals */
4175	ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
4176	ac->ac_status = AC_STATUS_CONTINUE;
4177	ac->ac_sb = sb;
4178	ac->ac_inode = ar->inode;
4179	ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
4180	ac->ac_o_ex.fe_group = group;
4181	ac->ac_o_ex.fe_start = block;
4182	ac->ac_o_ex.fe_len = len;
4183	ac->ac_g_ex = ac->ac_o_ex;
4184	ac->ac_flags = ar->flags;
4185
4186	/* we have to define context: we'll we work with a file or
4187	 * locality group. this is a policy, actually */
4188	ext4_mb_group_or_file(ac);
4189
4190	mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
4191			"left: %u/%u, right %u/%u to %swritable\n",
4192			(unsigned) ar->len, (unsigned) ar->logical,
4193			(unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
4194			(unsigned) ar->lleft, (unsigned) ar->pleft,
4195			(unsigned) ar->lright, (unsigned) ar->pright,
4196			atomic_read(&ar->inode->i_writecount) ? "" : "non-");
4197	return 0;
4198
4199}
4200
4201static noinline_for_stack void
4202ext4_mb_discard_lg_preallocations(struct super_block *sb,
4203					struct ext4_locality_group *lg,
4204					int order, int total_entries)
4205{
4206	ext4_group_t group = 0;
4207	struct ext4_buddy e4b;
4208	struct list_head discard_list;
4209	struct ext4_prealloc_space *pa, *tmp;
4210
4211	mb_debug(1, "discard locality group preallocation\n");
4212
4213	INIT_LIST_HEAD(&discard_list);
4214
4215	spin_lock(&lg->lg_prealloc_lock);
4216	list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
4217						pa_inode_list) {
4218		spin_lock(&pa->pa_lock);
4219		if (atomic_read(&pa->pa_count)) {
4220			/*
4221			 * This is the pa that we just used
4222			 * for block allocation. So don't
4223			 * free that
4224			 */
4225			spin_unlock(&pa->pa_lock);
4226			continue;
4227		}
4228		if (pa->pa_deleted) {
4229			spin_unlock(&pa->pa_lock);
4230			continue;
4231		}
4232		/* only lg prealloc space */
4233		BUG_ON(pa->pa_type != MB_GROUP_PA);
4234
4235		/* seems this one can be freed ... */
4236		pa->pa_deleted = 1;
4237		spin_unlock(&pa->pa_lock);
4238
4239		list_del_rcu(&pa->pa_inode_list);
4240		list_add(&pa->u.pa_tmp_list, &discard_list);
4241
4242		total_entries--;
4243		if (total_entries <= 5) {
4244			/*
4245			 * we want to keep only 5 entries
4246			 * allowing it to grow to 8. This
4247			 * mak sure we don't call discard
4248			 * soon for this list.
4249			 */
4250			break;
4251		}
4252	}
4253	spin_unlock(&lg->lg_prealloc_lock);
4254
4255	list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
4256
4257		group = ext4_get_group_number(sb, pa->pa_pstart);
4258		if (ext4_mb_load_buddy(sb, group, &e4b)) {
4259			ext4_error(sb, "Error loading buddy information for %u",
4260					group);
4261			continue;
4262		}
4263		ext4_lock_group(sb, group);
4264		list_del(&pa->pa_group_list);
4265		ext4_mb_release_group_pa(&e4b, pa);
4266		ext4_unlock_group(sb, group);
4267
4268		ext4_mb_unload_buddy(&e4b);
4269		list_del(&pa->u.pa_tmp_list);
4270		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4271	}
4272}
4273
4274/*
4275 * We have incremented pa_count. So it cannot be freed at this
4276 * point. Also we hold lg_mutex. So no parallel allocation is
4277 * possible from this lg. That means pa_free cannot be updated.
4278 *
4279 * A parallel ext4_mb_discard_group_preallocations is possible.
4280 * which can cause the lg_prealloc_list to be updated.
4281 */
4282
4283static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
4284{
4285	int order, added = 0, lg_prealloc_count = 1;
4286	struct super_block *sb = ac->ac_sb;
4287	struct ext4_locality_group *lg = ac->ac_lg;
4288	struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
4289
4290	order = fls(pa->pa_free) - 1;
4291	if (order > PREALLOC_TB_SIZE - 1)
4292		/* The max size of hash table is PREALLOC_TB_SIZE */
4293		order = PREALLOC_TB_SIZE - 1;
4294	/* Add the prealloc space to lg */
4295	spin_lock(&lg->lg_prealloc_lock);
4296	list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
4297						pa_inode_list) {
4298		spin_lock(&tmp_pa->pa_lock);
4299		if (tmp_pa->pa_deleted) {
4300			spin_unlock(&tmp_pa->pa_lock);
4301			continue;
4302		}
4303		if (!added && pa->pa_free < tmp_pa->pa_free) {
4304			/* Add to the tail of the previous entry */
4305			list_add_tail_rcu(&pa->pa_inode_list,
4306						&tmp_pa->pa_inode_list);
4307			added = 1;
4308			/*
4309			 * we want to count the total
4310			 * number of entries in the list
4311			 */
4312		}
4313		spin_unlock(&tmp_pa->pa_lock);
4314		lg_prealloc_count++;
4315	}
4316	if (!added)
4317		list_add_tail_rcu(&pa->pa_inode_list,
4318					&lg->lg_prealloc_list[order]);
4319	spin_unlock(&lg->lg_prealloc_lock);
4320
4321	/* Now trim the list to be not more than 8 elements */
4322	if (lg_prealloc_count > 8) {
4323		ext4_mb_discard_lg_preallocations(sb, lg,
4324						  order, lg_prealloc_count);
4325		return;
4326	}
4327	return ;
4328}
4329
4330/*
4331 * release all resource we used in allocation
4332 */
4333static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4334{
4335	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4336	struct ext4_prealloc_space *pa = ac->ac_pa;
4337	if (pa) {
4338		if (pa->pa_type == MB_GROUP_PA) {
4339			/* see comment in ext4_mb_use_group_pa() */
4340			spin_lock(&pa->pa_lock);
4341			pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4342			pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4343			pa->pa_free -= ac->ac_b_ex.fe_len;
4344			pa->pa_len -= ac->ac_b_ex.fe_len;
4345			spin_unlock(&pa->pa_lock);
4346		}
4347	}
4348	if (pa) {
4349		/*
4350		 * We want to add the pa to the right bucket.
4351		 * Remove it from the list and while adding
4352		 * make sure the list to which we are adding
4353		 * doesn't grow big.
4354		 */
4355		if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
4356			spin_lock(pa->pa_obj_lock);
4357			list_del_rcu(&pa->pa_inode_list);
4358			spin_unlock(pa->pa_obj_lock);
4359			ext4_mb_add_n_trim(ac);
4360		}
4361		ext4_mb_put_pa(ac, ac->ac_sb, pa);
4362	}
4363	if (ac->ac_bitmap_page)
4364		page_cache_release(ac->ac_bitmap_page);
4365	if (ac->ac_buddy_page)
4366		page_cache_release(ac->ac_buddy_page);
4367	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
4368		mutex_unlock(&ac->ac_lg->lg_mutex);
4369	ext4_mb_collect_stats(ac);
4370	return 0;
4371}
4372
4373static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4374{
4375	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
4376	int ret;
4377	int freed = 0;
4378
4379	trace_ext4_mb_discard_preallocations(sb, needed);
4380	for (i = 0; i < ngroups && needed > 0; i++) {
4381		ret = ext4_mb_discard_group_preallocations(sb, i, needed);
4382		freed += ret;
4383		needed -= ret;
4384	}
4385
4386	return freed;
4387}
4388
4389/*
4390 * Main entry point into mballoc to allocate blocks
4391 * it tries to use preallocation first, then falls back
4392 * to usual allocation
4393 */
4394ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4395				struct ext4_allocation_request *ar, int *errp)
4396{
4397	int freed;
4398	struct ext4_allocation_context *ac = NULL;
4399	struct ext4_sb_info *sbi;
4400	struct super_block *sb;
4401	ext4_fsblk_t block = 0;
4402	unsigned int inquota = 0;
4403	unsigned int reserv_clstrs = 0;
4404
4405	might_sleep();
4406	sb = ar->inode->i_sb;
4407	sbi = EXT4_SB(sb);
4408
4409	trace_ext4_request_blocks(ar);
4410
4411	/* Allow to use superuser reservation for quota file */
4412	if (IS_NOQUOTA(ar->inode))
4413		ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
4414
4415	if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) {
4416		/* Without delayed allocation we need to verify
4417		 * there is enough free blocks to do block allocation
4418		 * and verify allocation doesn't exceed the quota limits.
4419		 */
4420		while (ar->len &&
4421			ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
4422
4423			/* let others to free the space */
4424			cond_resched();
4425			ar->len = ar->len >> 1;
4426		}
4427		if (!ar->len) {
4428			*errp = -ENOSPC;
4429			return 0;
4430		}
4431		reserv_clstrs = ar->len;
4432		if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
4433			dquot_alloc_block_nofail(ar->inode,
4434						 EXT4_C2B(sbi, ar->len));
4435		} else {
4436			while (ar->len &&
4437				dquot_alloc_block(ar->inode,
4438						  EXT4_C2B(sbi, ar->len))) {
4439
4440				ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4441				ar->len--;
4442			}
4443		}
4444		inquota = ar->len;
4445		if (ar->len == 0) {
4446			*errp = -EDQUOT;
4447			goto out;
4448		}
4449	}
4450
4451	ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
4452	if (!ac) {
4453		ar->len = 0;
4454		*errp = -ENOMEM;
4455		goto out;
4456	}
4457
4458	*errp = ext4_mb_initialize_context(ac, ar);
4459	if (*errp) {
4460		ar->len = 0;
4461		goto out;
4462	}
4463
4464	ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
4465	if (!ext4_mb_use_preallocated(ac)) {
4466		ac->ac_op = EXT4_MB_HISTORY_ALLOC;
4467		ext4_mb_normalize_request(ac, ar);
4468repeat:
4469		/* allocate space in core */
4470		*errp = ext4_mb_regular_allocator(ac);
4471		if (*errp)
4472			goto discard_and_exit;
4473
4474		/* as we've just preallocated more space than
4475		 * user requested originally, we store allocated
4476		 * space in a special descriptor */
4477		if (ac->ac_status == AC_STATUS_FOUND &&
4478		    ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
4479			*errp = ext4_mb_new_preallocation(ac);
4480		if (*errp) {
4481		discard_and_exit:
4482			ext4_discard_allocated_blocks(ac);
4483			goto errout;
4484		}
4485	}
4486	if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4487		*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
4488		if (*errp == -EAGAIN) {
4489			/*
4490			 * drop the reference that we took
4491			 * in ext4_mb_use_best_found
4492			 */
4493			ext4_mb_release_context(ac);
4494			ac->ac_b_ex.fe_group = 0;
4495			ac->ac_b_ex.fe_start = 0;
4496			ac->ac_b_ex.fe_len = 0;
4497			ac->ac_status = AC_STATUS_CONTINUE;
4498			goto repeat;
4499		} else if (*errp) {
4500			ext4_discard_allocated_blocks(ac);
4501			goto errout;
4502		} else {
4503			block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
4504			ar->len = ac->ac_b_ex.fe_len;
4505		}
4506	} else {
4507		freed  = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
4508		if (freed)
4509			goto repeat;
4510		*errp = -ENOSPC;
4511	}
4512
4513errout:
4514	if (*errp) {
4515		ac->ac_b_ex.fe_len = 0;
4516		ar->len = 0;
4517		ext4_mb_show_ac(ac);
4518	}
4519	ext4_mb_release_context(ac);
4520out:
4521	if (ac)
4522		kmem_cache_free(ext4_ac_cachep, ac);
4523	if (inquota && ar->len < inquota)
4524		dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
4525	if (!ar->len) {
4526		if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
4527			/* release all the reserved blocks if non delalloc */
4528			percpu_counter_sub(&sbi->s_dirtyclusters_counter,
4529						reserv_clstrs);
4530	}
4531
4532	trace_ext4_allocate_blocks(ar, (unsigned long long)block);
4533
4534	return block;
4535}
4536
4537/*
4538 * We can merge two free data extents only if the physical blocks
4539 * are contiguous, AND the extents were freed by the same transaction,
4540 * AND the blocks are associated with the same group.
4541 */
4542static int can_merge(struct ext4_free_data *entry1,
4543			struct ext4_free_data *entry2)
4544{
4545	if ((entry1->efd_tid == entry2->efd_tid) &&
4546	    (entry1->efd_group == entry2->efd_group) &&
4547	    ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster))
4548		return 1;
4549	return 0;
4550}
4551
4552static noinline_for_stack int
4553ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4554		      struct ext4_free_data *new_entry)
4555{
4556	ext4_group_t group = e4b->bd_group;
4557	ext4_grpblk_t cluster;
4558	struct ext4_free_data *entry;
4559	struct ext4_group_info *db = e4b->bd_info;
4560	struct super_block *sb = e4b->bd_sb;
4561	struct ext4_sb_info *sbi = EXT4_SB(sb);
4562	struct rb_node **n = &db->bb_free_root.rb_node, *node;
4563	struct rb_node *parent = NULL, *new_node;
4564
4565	BUG_ON(!ext4_handle_valid(handle));
4566	BUG_ON(e4b->bd_bitmap_page == NULL);
4567	BUG_ON(e4b->bd_buddy_page == NULL);
4568
4569	new_node = &new_entry->efd_node;
4570	cluster = new_entry->efd_start_cluster;
4571
4572	if (!*n) {
4573		/* first free block exent. We need to
4574		   protect buddy cache from being freed,
4575		 * otherwise we'll refresh it from
4576		 * on-disk bitmap and lose not-yet-available
4577		 * blocks */
4578		page_cache_get(e4b->bd_buddy_page);
4579		page_cache_get(e4b->bd_bitmap_page);
4580	}
4581	while (*n) {
4582		parent = *n;
4583		entry = rb_entry(parent, struct ext4_free_data, efd_node);
4584		if (cluster < entry->efd_start_cluster)
4585			n = &(*n)->rb_left;
4586		else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
4587			n = &(*n)->rb_right;
4588		else {
4589			ext4_grp_locked_error(sb, group, 0,
4590				ext4_group_first_block_no(sb, group) +
4591				EXT4_C2B(sbi, cluster),
4592				"Block already on to-be-freed list");
4593			return 0;
4594		}
4595	}
4596
4597	rb_link_node(new_node, parent, n);
4598	rb_insert_color(new_node, &db->bb_free_root);
4599
4600	/* Now try to see the extent can be merged to left and right */
4601	node = rb_prev(new_node);
4602	if (node) {
4603		entry = rb_entry(node, struct ext4_free_data, efd_node);
4604		if (can_merge(entry, new_entry) &&
4605		    ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
4606			new_entry->efd_start_cluster = entry->efd_start_cluster;
4607			new_entry->efd_count += entry->efd_count;
4608			rb_erase(node, &(db->bb_free_root));
4609			kmem_cache_free(ext4_free_data_cachep, entry);
4610		}
4611	}
4612
4613	node = rb_next(new_node);
4614	if (node) {
4615		entry = rb_entry(node, struct ext4_free_data, efd_node);
4616		if (can_merge(new_entry, entry) &&
4617		    ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
4618			new_entry->efd_count += entry->efd_count;
4619			rb_erase(node, &(db->bb_free_root));
4620			kmem_cache_free(ext4_free_data_cachep, entry);
4621		}
4622	}
4623	/* Add the extent to transaction's private list */
4624	ext4_journal_callback_add(handle, ext4_free_data_callback,
4625				  &new_entry->efd_jce);
4626	return 0;
4627}
4628
4629/**
4630 * ext4_free_blocks() -- Free given blocks and update quota
4631 * @handle:		handle for this transaction
4632 * @inode:		inode
4633 * @block:		start physical block to free
4634 * @count:		number of blocks to count
4635 * @flags:		flags used by ext4_free_blocks
4636 */
4637void ext4_free_blocks(handle_t *handle, struct inode *inode,
4638		      struct buffer_head *bh, ext4_fsblk_t block,
4639		      unsigned long count, int flags)
4640{
4641	struct buffer_head *bitmap_bh = NULL;
4642	struct super_block *sb = inode->i_sb;
4643	struct ext4_group_desc *gdp;
4644	unsigned int overflow;
4645	ext4_grpblk_t bit;
4646	struct buffer_head *gd_bh;
4647	ext4_group_t block_group;
4648	struct ext4_sb_info *sbi;
4649	struct ext4_buddy e4b;
4650	unsigned int count_clusters;
4651	int err = 0;
4652	int ret;
4653
4654	might_sleep();
4655	if (bh) {
4656		if (block)
4657			BUG_ON(block != bh->b_blocknr);
4658		else
4659			block = bh->b_blocknr;
4660	}
4661
4662	sbi = EXT4_SB(sb);
4663	if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
4664	    !ext4_data_block_valid(sbi, block, count)) {
4665		ext4_error(sb, "Freeing blocks not in datazone - "
4666			   "block = %llu, count = %lu", block, count);
4667		goto error_return;
4668	}
4669
4670	ext4_debug("freeing block %llu\n", block);
4671	trace_ext4_free_blocks(inode, block, count, flags);
4672
4673	if (flags & EXT4_FREE_BLOCKS_FORGET) {
4674		struct buffer_head *tbh = bh;
4675		int i;
4676
4677		BUG_ON(bh && (count > 1));
4678
4679		for (i = 0; i < count; i++) {
4680			cond_resched();
4681			if (!bh)
4682				tbh = sb_find_get_block(inode->i_sb,
4683							block + i);
4684			if (!tbh)
4685				continue;
4686			ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4687				    inode, tbh, block + i);
4688		}
4689	}
4690
4691	/*
4692	 * We need to make sure we don't reuse the freed block until
4693	 * after the transaction is committed, which we can do by
4694	 * treating the block as metadata, below.  We make an
4695	 * exception if the inode is to be written in writeback mode
4696	 * since writeback mode has weak data consistency guarantees.
4697	 */
4698	if (!ext4_should_writeback_data(inode))
4699		flags |= EXT4_FREE_BLOCKS_METADATA;
4700
4701	/*
4702	 * If the extent to be freed does not begin on a cluster
4703	 * boundary, we need to deal with partial clusters at the
4704	 * beginning and end of the extent.  Normally we will free
4705	 * blocks at the beginning or the end unless we are explicitly
4706	 * requested to avoid doing so.
4707	 */
4708	overflow = EXT4_PBLK_COFF(sbi, block);
4709	if (overflow) {
4710		if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
4711			overflow = sbi->s_cluster_ratio - overflow;
4712			block += overflow;
4713			if (count > overflow)
4714				count -= overflow;
4715			else
4716				return;
4717		} else {
4718			block -= overflow;
4719			count += overflow;
4720		}
4721	}
4722	overflow = EXT4_LBLK_COFF(sbi, count);
4723	if (overflow) {
4724		if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
4725			if (count > overflow)
4726				count -= overflow;
4727			else
4728				return;
4729		} else
4730			count += sbi->s_cluster_ratio - overflow;
4731	}
4732
4733do_more:
4734	overflow = 0;
4735	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4736
4737	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(
4738			ext4_get_group_info(sb, block_group))))
4739		return;
4740
4741	/*
4742	 * Check to see if we are freeing blocks across a group
4743	 * boundary.
4744	 */
4745	if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4746		overflow = EXT4_C2B(sbi, bit) + count -
4747			EXT4_BLOCKS_PER_GROUP(sb);
4748		count -= overflow;
4749	}
4750	count_clusters = EXT4_NUM_B2C(sbi, count);
4751	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4752	if (!bitmap_bh) {
4753		err = -EIO;
4754		goto error_return;
4755	}
4756	gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
4757	if (!gdp) {
4758		err = -EIO;
4759		goto error_return;
4760	}
4761
4762	if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
4763	    in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
4764	    in_range(block, ext4_inode_table(sb, gdp),
4765		     EXT4_SB(sb)->s_itb_per_group) ||
4766	    in_range(block + count - 1, ext4_inode_table(sb, gdp),
4767		     EXT4_SB(sb)->s_itb_per_group)) {
4768
4769		ext4_error(sb, "Freeing blocks in system zone - "
4770			   "Block = %llu, count = %lu", block, count);
4771		/* err = 0. ext4_std_error should be a no op */
4772		goto error_return;
4773	}
4774
4775	BUFFER_TRACE(bitmap_bh, "getting write access");
4776	err = ext4_journal_get_write_access(handle, bitmap_bh);
4777	if (err)
4778		goto error_return;
4779
4780	/*
4781	 * We are about to modify some metadata.  Call the journal APIs
4782	 * to unshare ->b_data if a currently-committing transaction is
4783	 * using it
4784	 */
4785	BUFFER_TRACE(gd_bh, "get_write_access");
4786	err = ext4_journal_get_write_access(handle, gd_bh);
4787	if (err)
4788		goto error_return;
4789#ifdef AGGRESSIVE_CHECK
4790	{
4791		int i;
4792		for (i = 0; i < count_clusters; i++)
4793			BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4794	}
4795#endif
4796	trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
4797
4798	err = ext4_mb_load_buddy(sb, block_group, &e4b);
4799	if (err)
4800		goto error_return;
4801
4802	if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
4803		struct ext4_free_data *new_entry;
4804		/*
4805		 * blocks being freed are metadata. these blocks shouldn't
4806		 * be used until this transaction is committed
4807		 *
4808		 * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
4809		 * to fail.
4810		 */
4811		new_entry = kmem_cache_alloc(ext4_free_data_cachep,
4812				GFP_NOFS|__GFP_NOFAIL);
4813		new_entry->efd_start_cluster = bit;
4814		new_entry->efd_group = block_group;
4815		new_entry->efd_count = count_clusters;
4816		new_entry->efd_tid = handle->h_transaction->t_tid;
4817
4818		ext4_lock_group(sb, block_group);
4819		mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4820		ext4_mb_free_metadata(handle, &e4b, new_entry);
4821	} else {
4822		/* need to update group_info->bb_free and bitmap
4823		 * with group lock held. generate_buddy look at
4824		 * them with group lock_held
4825		 */
4826		if (test_opt(sb, DISCARD)) {
4827			err = ext4_issue_discard(sb, block_group, bit, count);
4828			if (err && err != -EOPNOTSUPP)
4829				ext4_msg(sb, KERN_WARNING, "discard request in"
4830					 " group:%d block:%d count:%lu failed"
4831					 " with %d", block_group, bit, count,
4832					 err);
4833		} else
4834			EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
4835
4836		ext4_lock_group(sb, block_group);
4837		mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4838		mb_free_blocks(inode, &e4b, bit, count_clusters);
4839	}
4840
4841	ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
4842	ext4_free_group_clusters_set(sb, gdp, ret);
4843	ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
4844	ext4_group_desc_csum_set(sb, block_group, gdp);
4845	ext4_unlock_group(sb, block_group);
4846
4847	if (sbi->s_log_groups_per_flex) {
4848		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4849		atomic64_add(count_clusters,
4850			     &sbi->s_flex_groups[flex_group].free_clusters);
4851	}
4852
4853	if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4854		dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
4855	percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
4856
4857	ext4_mb_unload_buddy(&e4b);
4858
4859	/* We dirtied the bitmap block */
4860	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4861	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4862
4863	/* And the group descriptor block */
4864	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4865	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4866	if (!err)
4867		err = ret;
4868
4869	if (overflow && !err) {
4870		block += count;
4871		count = overflow;
4872		put_bh(bitmap_bh);
4873		goto do_more;
4874	}
4875error_return:
4876	brelse(bitmap_bh);
4877	ext4_std_error(sb, err);
4878	return;
4879}
4880
4881/**
4882 * ext4_group_add_blocks() -- Add given blocks to an existing group
4883 * @handle:			handle to this transaction
4884 * @sb:				super block
4885 * @block:			start physical block to add to the block group
4886 * @count:			number of blocks to free
4887 *
4888 * This marks the blocks as free in the bitmap and buddy.
4889 */
4890int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
4891			 ext4_fsblk_t block, unsigned long count)
4892{
4893	struct buffer_head *bitmap_bh = NULL;
4894	struct buffer_head *gd_bh;
4895	ext4_group_t block_group;
4896	ext4_grpblk_t bit;
4897	unsigned int i;
4898	struct ext4_group_desc *desc;
4899	struct ext4_sb_info *sbi = EXT4_SB(sb);
4900	struct ext4_buddy e4b;
4901	int err = 0, ret, blk_free_count;
4902	ext4_grpblk_t blocks_freed;
4903
4904	ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
4905
4906	if (count == 0)
4907		return 0;
4908
4909	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4910	/*
4911	 * Check to see if we are freeing blocks across a group
4912	 * boundary.
4913	 */
4914	if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4915		ext4_warning(sb, "too much blocks added to group %u\n",
4916			     block_group);
4917		err = -EINVAL;
4918		goto error_return;
4919	}
4920
4921	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4922	if (!bitmap_bh) {
4923		err = -EIO;
4924		goto error_return;
4925	}
4926
4927	desc = ext4_get_group_desc(sb, block_group, &gd_bh);
4928	if (!desc) {
4929		err = -EIO;
4930		goto error_return;
4931	}
4932
4933	if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
4934	    in_range(ext4_inode_bitmap(sb, desc), block, count) ||
4935	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
4936	    in_range(block + count - 1, ext4_inode_table(sb, desc),
4937		     sbi->s_itb_per_group)) {
4938		ext4_error(sb, "Adding blocks in system zones - "
4939			   "Block = %llu, count = %lu",
4940			   block, count);
4941		err = -EINVAL;
4942		goto error_return;
4943	}
4944
4945	BUFFER_TRACE(bitmap_bh, "getting write access");
4946	err = ext4_journal_get_write_access(handle, bitmap_bh);
4947	if (err)
4948		goto error_return;
4949
4950	/*
4951	 * We are about to modify some metadata.  Call the journal APIs
4952	 * to unshare ->b_data if a currently-committing transaction is
4953	 * using it
4954	 */
4955	BUFFER_TRACE(gd_bh, "get_write_access");
4956	err = ext4_journal_get_write_access(handle, gd_bh);
4957	if (err)
4958		goto error_return;
4959
4960	for (i = 0, blocks_freed = 0; i < count; i++) {
4961		BUFFER_TRACE(bitmap_bh, "clear bit");
4962		if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
4963			ext4_error(sb, "bit already cleared for block %llu",
4964				   (ext4_fsblk_t)(block + i));
4965			BUFFER_TRACE(bitmap_bh, "bit already cleared");
4966		} else {
4967			blocks_freed++;
4968		}
4969	}
4970
4971	err = ext4_mb_load_buddy(sb, block_group, &e4b);
4972	if (err)
4973		goto error_return;
4974
4975	/*
4976	 * need to update group_info->bb_free and bitmap
4977	 * with group lock held. generate_buddy look at
4978	 * them with group lock_held
4979	 */
4980	ext4_lock_group(sb, block_group);
4981	mb_clear_bits(bitmap_bh->b_data, bit, count);
4982	mb_free_blocks(NULL, &e4b, bit, count);
4983	blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
4984	ext4_free_group_clusters_set(sb, desc, blk_free_count);
4985	ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh);
4986	ext4_group_desc_csum_set(sb, block_group, desc);
4987	ext4_unlock_group(sb, block_group);
4988	percpu_counter_add(&sbi->s_freeclusters_counter,
4989			   EXT4_NUM_B2C(sbi, blocks_freed));
4990
4991	if (sbi->s_log_groups_per_flex) {
4992		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4993		atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed),
4994			     &sbi->s_flex_groups[flex_group].free_clusters);
4995	}
4996
4997	ext4_mb_unload_buddy(&e4b);
4998
4999	/* We dirtied the bitmap block */
5000	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
5001	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
5002
5003	/* And the group descriptor block */
5004	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
5005	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
5006	if (!err)
5007		err = ret;
5008
5009error_return:
5010	brelse(bitmap_bh);
5011	ext4_std_error(sb, err);
5012	return err;
5013}
5014
5015/**
5016 * ext4_trim_extent -- function to TRIM one single free extent in the group
5017 * @sb:		super block for the file system
5018 * @start:	starting block of the free extent in the alloc. group
5019 * @count:	number of blocks to TRIM
5020 * @group:	alloc. group we are working with
5021 * @e4b:	ext4 buddy for the group
5022 *
5023 * Trim "count" blocks starting at "start" in the "group". To assure that no
5024 * one will allocate those blocks, mark it as used in buddy bitmap. This must
5025 * be called with under the group lock.
5026 */
5027static int ext4_trim_extent(struct super_block *sb, int start, int count,
5028			     ext4_group_t group, struct ext4_buddy *e4b)
5029__releases(bitlock)
5030__acquires(bitlock)
5031{
5032	struct ext4_free_extent ex;
5033	int ret = 0;
5034
5035	trace_ext4_trim_extent(sb, group, start, count);
5036
5037	assert_spin_locked(ext4_group_lock_ptr(sb, group));
5038
5039	ex.fe_start = start;
5040	ex.fe_group = group;
5041	ex.fe_len = count;
5042
5043	/*
5044	 * Mark blocks used, so no one can reuse them while
5045	 * being trimmed.
5046	 */
5047	mb_mark_used(e4b, &ex);
5048	ext4_unlock_group(sb, group);
5049	ret = ext4_issue_discard(sb, group, start, count);
5050	ext4_lock_group(sb, group);
5051	mb_free_blocks(NULL, e4b, start, ex.fe_len);
5052	return ret;
5053}
5054
5055/**
5056 * ext4_trim_all_free -- function to trim all free space in alloc. group
5057 * @sb:			super block for file system
5058 * @group:		group to be trimmed
5059 * @start:		first group block to examine
5060 * @max:		last group block to examine
5061 * @minblocks:		minimum extent block count
5062 *
5063 * ext4_trim_all_free walks through group's buddy bitmap searching for free
5064 * extents. When the free block is found, ext4_trim_extent is called to TRIM
5065 * the extent.
5066 *
5067 *
5068 * ext4_trim_all_free walks through group's block bitmap searching for free
5069 * extents. When the free extent is found, mark it as used in group buddy
5070 * bitmap. Then issue a TRIM command on this extent and free the extent in
5071 * the group buddy bitmap. This is done until whole group is scanned.
5072 */
5073static ext4_grpblk_t
5074ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
5075		   ext4_grpblk_t start, ext4_grpblk_t max,
5076		   ext4_grpblk_t minblocks)
5077{
5078	void *bitmap;
5079	ext4_grpblk_t next, count = 0, free_count = 0;
5080	struct ext4_buddy e4b;
5081	int ret = 0;
5082
5083	trace_ext4_trim_all_free(sb, group, start, max);
5084
5085	ret = ext4_mb_load_buddy(sb, group, &e4b);
5086	if (ret) {
5087		ext4_error(sb, "Error in loading buddy "
5088				"information for %u", group);
5089		return ret;
5090	}
5091	bitmap = e4b.bd_bitmap;
5092
5093	ext4_lock_group(sb, group);
5094	if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
5095	    minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
5096		goto out;
5097
5098	start = (e4b.bd_info->bb_first_free > start) ?
5099		e4b.bd_info->bb_first_free : start;
5100
5101	while (start <= max) {
5102		start = mb_find_next_zero_bit(bitmap, max + 1, start);
5103		if (start > max)
5104			break;
5105		next = mb_find_next_bit(bitmap, max + 1, start);
5106
5107		if ((next - start) >= minblocks) {
5108			ret = ext4_trim_extent(sb, start,
5109					       next - start, group, &e4b);
5110			if (ret && ret != -EOPNOTSUPP)
5111				break;
5112			ret = 0;
5113			count += next - start;
5114		}
5115		free_count += next - start;
5116		start = next + 1;
5117
5118		if (fatal_signal_pending(current)) {
5119			count = -ERESTARTSYS;
5120			break;
5121		}
5122
5123		if (need_resched()) {
5124			ext4_unlock_group(sb, group);
5125			cond_resched();
5126			ext4_lock_group(sb, group);
5127		}
5128
5129		if ((e4b.bd_info->bb_free - free_count) < minblocks)
5130			break;
5131	}
5132
5133	if (!ret) {
5134		ret = count;
5135		EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
5136	}
5137out:
5138	ext4_unlock_group(sb, group);
5139	ext4_mb_unload_buddy(&e4b);
5140
5141	ext4_debug("trimmed %d blocks in the group %d\n",
5142		count, group);
5143
5144	return ret;
5145}
5146
5147/**
5148 * ext4_trim_fs() -- trim ioctl handle function
5149 * @sb:			superblock for filesystem
5150 * @range:		fstrim_range structure
5151 *
5152 * start:	First Byte to trim
5153 * len:		number of Bytes to trim from start
5154 * minlen:	minimum extent length in Bytes
5155 * ext4_trim_fs goes through all allocation groups containing Bytes from
5156 * start to start+len. For each such a group ext4_trim_all_free function
5157 * is invoked to trim all free space.
5158 */
5159int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
5160{
5161	struct ext4_group_info *grp;
5162	ext4_group_t group, first_group, last_group;
5163	ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
5164	uint64_t start, end, minlen, trimmed = 0;
5165	ext4_fsblk_t first_data_blk =
5166			le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
5167	ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
5168	int ret = 0;
5169
5170	start = range->start >> sb->s_blocksize_bits;
5171	end = start + (range->len >> sb->s_blocksize_bits) - 1;
5172	minlen = EXT4_NUM_B2C(EXT4_SB(sb),
5173			      range->minlen >> sb->s_blocksize_bits);
5174
5175	if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
5176	    start >= max_blks ||
5177	    range->len < sb->s_blocksize)
5178		return -EINVAL;
5179	if (end >= max_blks)
5180		end = max_blks - 1;
5181	if (end <= first_data_blk)
5182		goto out;
5183	if (start < first_data_blk)
5184		start = first_data_blk;
5185
5186	/* Determine first and last group to examine based on start and end */
5187	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
5188				     &first_group, &first_cluster);
5189	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
5190				     &last_group, &last_cluster);
5191
5192	/* end now represents the last cluster to discard in this group */
5193	end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
5194
5195	for (group = first_group; group <= last_group; group++) {
5196		grp = ext4_get_group_info(sb, group);
5197		/* We only do this if the grp has never been initialized */
5198		if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
5199			ret = ext4_mb_init_group(sb, group);
5200			if (ret)
5201				break;
5202		}
5203
5204		/*
5205		 * For all the groups except the last one, last cluster will
5206		 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
5207		 * change it for the last group, note that last_cluster is
5208		 * already computed earlier by ext4_get_group_no_and_offset()
5209		 */
5210		if (group == last_group)
5211			end = last_cluster;
5212
5213		if (grp->bb_free >= minlen) {
5214			cnt = ext4_trim_all_free(sb, group, first_cluster,
5215						end, minlen);
5216			if (cnt < 0) {
5217				ret = cnt;
5218				break;
5219			}
5220			trimmed += cnt;
5221		}
5222
5223		/*
5224		 * For every group except the first one, we are sure
5225		 * that the first cluster to discard will be cluster #0.
5226		 */
5227		first_cluster = 0;
5228	}
5229
5230	if (!ret)
5231		atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
5232
5233out:
5234	range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
5235	return ret;
5236}
5237