/*
 * btree.c - NILFS B-tree.
 *
 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *
 * Written by Koji Sato <koji@osrg.net>.
 */

#include <linux/slab.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/pagevec.h>
#include "nilfs.h"
#include "page.h"
#include "btnode.h"
#include "btree.h"
#include "alloc.h"
#include "dat.h"

static void __nilfs_btree_init(struct nilfs_bmap *bmap);

static struct nilfs_btree_path *nilfs_btree_alloc_path(void)
{
	struct nilfs_btree_path *path;
	int level = NILFS_BTREE_LEVEL_DATA;

	path = kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
	if (path == NULL)
		goto out;

	for (; level < NILFS_BTREE_LEVEL_MAX; level++) {
		path[level].bp_bh = NULL;
		path[level].bp_sib_bh = NULL;
		path[level].bp_index = 0;
		path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
		path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
		path[level].bp_op = NULL;
	}

out:
	return path;
}

static void nilfs_btree_free_path(struct nilfs_btree_path *path)
{
	int level = NILFS_BTREE_LEVEL_DATA;

	for (; level < NILFS_BTREE_LEVEL_MAX; level++)
		brelse(path[level].bp_bh);

	kmem_cache_free(nilfs_btree_path_cache, path);
}

/*
 * B-tree node operations
 */
static int nilfs_btree_get_new_block(const struct nilfs_bmap *btree,
				     __u64 ptr, struct buffer_head **bhp)
{
	struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache;
	struct buffer_head *bh;

	bh = nilfs_btnode_create_block(btnc, ptr);
	if (!bh)
		return -ENOMEM;

	set_buffer_nilfs_volatile(bh);
	*bhp = bh;
	return 0;
}

static int nilfs_btree_node_get_flags(const struct nilfs_btree_node *node)
{
	return node->bn_flags;
}

static void
nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags)
{
	node->bn_flags = flags;
}

static int nilfs_btree_node_root(const struct nilfs_btree_node *node)
{
	return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT;
}

static int nilfs_btree_node_get_level(const struct nilfs_btree_node *node)
{
	return node->bn_level;
}

static void
nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level)
{
	node->bn_level = level;
}

static int nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node)
{
	return le16_to_cpu(node->bn_nchildren);
}

static void
nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren)
{
	node->bn_nchildren = cpu_to_le16(nchildren);
}

static int nilfs_btree_node_size(const struct nilfs_bmap *btree)
{
	return 1 << btree->b_inode->i_blkbits;
}

static int nilfs_btree_nchildren_per_block(const struct nilfs_bmap *btree)
{
	return btree->b_nchildren_per_block;
}

static __le64 *
nilfs_btree_node_dkeys(const struct nilfs_btree_node *node)
{
	return (__le64 *)((char *)(node + 1) +
			  (nilfs_btree_node_root(node) ?
			   0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE));
}

static __le64 *
nilfs_btree_node_dptrs(const struct nilfs_btree_node *node, int ncmax)
{
	return (__le64 *)(nilfs_btree_node_dkeys(node) + ncmax);
}

static __u64
nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index)
{
	return le64_to_cpu(*(nilfs_btree_node_dkeys(node) + index));
}

static void
nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key)
{
	*(nilfs_btree_node_dkeys(node) + index) = cpu_to_le64(key);
}

static __u64
nilfs_btree_node_get_ptr(const struct nilfs_btree_node *node, int index,
			 int ncmax)
{
	return le64_to_cpu(*(nilfs_btree_node_dptrs(node, ncmax) + index));
}

static void
nilfs_btree_node_set_ptr(struct nilfs_btree_node *node, int index, __u64 ptr,
			 int ncmax)
{
	*(nilfs_btree_node_dptrs(node, ncmax) + index) = cpu_to_le64(ptr);
}

static void nilfs_btree_node_init(struct nilfs_btree_node *node, int flags,
				  int level, int nchildren, int ncmax,
				  const __u64 *keys, const __u64 *ptrs)
{
	__le64 *dkeys;
	__le64 *dptrs;
	int i;

	nilfs_btree_node_set_flags(node, flags);
	nilfs_btree_node_set_level(node, level);
	nilfs_btree_node_set_nchildren(node, nchildren);

	dkeys = nilfs_btree_node_dkeys(node);
	dptrs = nilfs_btree_node_dptrs(node, ncmax);
	for (i = 0; i < nchildren; i++) {
		dkeys[i] = cpu_to_le64(keys[i]);
		dptrs[i] = cpu_to_le64(ptrs[i]);
	}
}

/* Assume the buffer heads corresponding to left and right are locked. */
static void nilfs_btree_node_move_left(struct nilfs_btree_node *left,
				       struct nilfs_btree_node *right,
				       int n, int lncmax, int rncmax)
{
	__le64 *ldkeys, *rdkeys;
	__le64 *ldptrs, *rdptrs;
	int lnchildren, rnchildren;

	ldkeys = nilfs_btree_node_dkeys(left);
	ldptrs = nilfs_btree_node_dptrs(left, lncmax);
	lnchildren = nilfs_btree_node_get_nchildren(left);

	rdkeys = nilfs_btree_node_dkeys(right);
	rdptrs = nilfs_btree_node_dptrs(right, rncmax);
	rnchildren = nilfs_btree_node_get_nchildren(right);

	memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys));
	memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs));
	memmove(rdkeys, rdkeys + n, (rnchildren - n) * sizeof(*rdkeys));
	memmove(rdptrs, rdptrs + n, (rnchildren - n) * sizeof(*rdptrs));

	lnchildren += n;
	rnchildren -= n;
	nilfs_btree_node_set_nchildren(left, lnchildren);
	nilfs_btree_node_set_nchildren(right, rnchildren);
}

/* Assume that the buffer heads corresponding to left and right are locked. */
static void nilfs_btree_node_move_right(struct nilfs_btree_node *left,
					struct nilfs_btree_node *right,
					int n, int lncmax, int rncmax)
{
	__le64 *ldkeys, *rdkeys;
	__le64 *ldptrs, *rdptrs;
	int lnchildren, rnchildren;

	ldkeys = nilfs_btree_node_dkeys(left);
	ldptrs = nilfs_btree_node_dptrs(left, lncmax);
	lnchildren = nilfs_btree_node_get_nchildren(left);

	rdkeys = nilfs_btree_node_dkeys(right);
	rdptrs = nilfs_btree_node_dptrs(right, rncmax);
	rnchildren = nilfs_btree_node_get_nchildren(right);

	memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys));
	memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs));
	memcpy(rdkeys, ldkeys + lnchildren - n, n * sizeof(*rdkeys));
	memcpy(rdptrs, ldptrs + lnchildren - n, n * sizeof(*rdptrs));

	lnchildren -= n;
	rnchildren += n;
	nilfs_btree_node_set_nchildren(left, lnchildren);
	nilfs_btree_node_set_nchildren(right, rnchildren);
}

/* Assume that the buffer head corresponding to node is locked. */
static void nilfs_btree_node_insert(struct nilfs_btree_node *node, int index,
				    __u64 key, __u64 ptr, int ncmax)
{
	__le64 *dkeys;
	__le64 *dptrs;
	int nchildren;

	dkeys = nilfs_btree_node_dkeys(node);
	dptrs = nilfs_btree_node_dptrs(node, ncmax);
	nchildren = nilfs_btree_node_get_nchildren(node);
	if (index < nchildren) {
		memmove(dkeys + index + 1, dkeys + index,
			(nchildren - index) * sizeof(*dkeys));
		memmove(dptrs + index + 1, dptrs + index,
			(nchildren - index) * sizeof(*dptrs));
	}
	dkeys[index] = cpu_to_le64(key);
	dptrs[index] = cpu_to_le64(ptr);
	nchildren++;
	nilfs_btree_node_set_nchildren(node, nchildren);
}

/* Assume that the buffer head corresponding to node is locked. */
static void nilfs_btree_node_delete(struct nilfs_btree_node *node, int index,
				    __u64 *keyp, __u64 *ptrp, int ncmax)
{
	__u64 key;
	__u64 ptr;
	__le64 *dkeys;
	__le64 *dptrs;
	int nchildren;

	dkeys = nilfs_btree_node_dkeys(node);
	dptrs = nilfs_btree_node_dptrs(node, ncmax);
	key = le64_to_cpu(dkeys[index]);
	ptr = le64_to_cpu(dptrs[index]);
	nchildren = nilfs_btree_node_get_nchildren(node);
	if (keyp != NULL)
		*keyp = key;
	if (ptrp != NULL)
		*ptrp = ptr;

	if (index < nchildren - 1) {
		memmove(dkeys + index, dkeys + index + 1,
			(nchildren - index - 1) * sizeof(*dkeys));
		memmove(dptrs + index, dptrs + index + 1,
			(nchildren - index - 1) * sizeof(*dptrs));
	}
	nchildren--;
	nilfs_btree_node_set_nchildren(node, nchildren);
}

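/*
 * nilfs_btree_node_lookup - binary search for a key within a node
 *
 * Searches the sorted key array of @node for @key.  On return, *@indexp
 * holds the slot of the matching child or, when @key is not present, the
 * slot where @key should be handled: the covering child for internal
 * nodes, or the insertion point for bottom-level nodes.  Returns nonzero
 * if @key was found, zero otherwise.
 */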
static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node,
				   __u64 key, int *indexp)
{
	__u64 nkey;
	int index, low, high, s;

	/* binary search */
	low = 0;
	high = nilfs_btree_node_get_nchildren(node) - 1;
	index = 0;
	s = 0;
	while (low <= high) {
		index = (low + high) / 2;
		nkey = nilfs_btree_node_get_key(node, index);
		if (nkey == key) {
			s = 0;
			goto out;
		} else if (nkey < key) {
			low = index + 1;
			s = -1;
		} else {
			high = index - 1;
			s = 1;
		}
	}

	/* adjust index */
	if (nilfs_btree_node_get_level(node) > NILFS_BTREE_LEVEL_NODE_MIN) {
		if (s > 0 && index > 0)
			index--;
	} else if (s < 0)
		index++;

 out:
	*indexp = index;

	return s == 0;
}

/**
 * nilfs_btree_node_broken - verify consistency of btree node
 * @node: btree node block to be examined
 * @size: node size (in bytes)
 * @blocknr: block number
 *
 * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned.
 */
static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
				   size_t size, sector_t blocknr)
{
	int level, flags, nchildren;
	int ret = 0;

	level = nilfs_btree_node_get_level(node);
	flags = nilfs_btree_node_get_flags(node);
	nchildren = nilfs_btree_node_get_nchildren(node);

	if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN ||
		     level >= NILFS_BTREE_LEVEL_MAX ||
		     (flags & NILFS_BTREE_NODE_ROOT) ||
		     nchildren < 0 ||
		     nchildren > NILFS_BTREE_NODE_NCHILDREN_MAX(size))) {
		printk(KERN_CRIT "NILFS: bad btree node (blocknr=%llu): "
		       "level = %d, flags = 0x%x, nchildren = %d\n",
		       (unsigned long long)blocknr, level, flags, nchildren);
		ret = 1;
	}
	return ret;
}

/**
 * nilfs_btree_root_broken - verify consistency of btree root node
 * @node: btree root node to be examined
 * @ino: inode number
 *
 * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned.
 */
static int nilfs_btree_root_broken(const struct nilfs_btree_node *node,
				   unsigned long ino)
{
	int level, flags, nchildren;
	int ret = 0;

	level = nilfs_btree_node_get_level(node);
	flags = nilfs_btree_node_get_flags(node);
	nchildren = nilfs_btree_node_get_nchildren(node);

	if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN ||
		     level >= NILFS_BTREE_LEVEL_MAX ||
		     nchildren < 0 ||
		     nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) {
		pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, flags = 0x%x, nchildren = %d\n",
			ino, level, flags, nchildren);
		ret = 1;
	}
	return ret;
}

int nilfs_btree_broken_node_block(struct buffer_head *bh)
{
	int ret;

	if (buffer_nilfs_checked(bh))
		return 0;

	ret = nilfs_btree_node_broken((struct nilfs_btree_node *)bh->b_data,
				       bh->b_size, bh->b_blocknr);
	if (likely(!ret))
		set_buffer_nilfs_checked(bh);
	return ret;
}

static struct nilfs_btree_node *
nilfs_btree_get_root(const struct nilfs_bmap *btree)
{
	return (struct nilfs_btree_node *)btree->b_u.u_data;
}

static struct nilfs_btree_node *
nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level)
{
	return (struct nilfs_btree_node *)path[level].bp_bh->b_data;
}

static struct nilfs_btree_node *
nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level)
{
	return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data;
}

static int nilfs_btree_height(const struct nilfs_bmap *btree)
{
	return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1;
}

static struct nilfs_btree_node *
nilfs_btree_get_node(const struct nilfs_bmap *btree,
		     const struct nilfs_btree_path *path,
		     int level, int *ncmaxp)
{
	struct nilfs_btree_node *node;

	if (level == nilfs_btree_height(btree) - 1) {
		node = nilfs_btree_get_root(btree);
		*ncmaxp = NILFS_BTREE_ROOT_NCHILDREN_MAX;
	} else {
		node = nilfs_btree_get_nonroot_node(path, level);
		*ncmaxp = nilfs_btree_nchildren_per_block(btree);
	}
	return node;
}

static int
nilfs_btree_bad_node(struct nilfs_btree_node *node, int level)
{
	if (unlikely(nilfs_btree_node_get_level(node) != level)) {
		dump_stack();
		printk(KERN_CRIT "NILFS: btree level mismatch: %d != %d\n",
		       nilfs_btree_node_get_level(node), level);
		return 1;
	}
	return 0;
}

struct nilfs_btree_readahead_info {
	struct nilfs_btree_node *node;	/* parent node */
	int max_ra_blocks;		/* max nof blocks to read ahead */
	int index;			/* current index on the parent node */
	int ncmax;			/* nof children in the parent node */
};

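/*
 * __nilfs_btree_get_block - read in a btree node block, optionally reading
 * ahead the sibling blocks that follow it
 *
 * The block designated by @ptr is submitted for read through the btnode
 * cache.  When @ra is given, up to ra->max_ra_blocks blocks following
 * ra->index in the parent node are submitted as readahead while the
 * target block is still locked for I/O.  The block is verified with
 * nilfs_btree_broken_node_block() before being returned in *@bhp.
 */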
static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
				   struct buffer_head **bhp,
				   const struct nilfs_btree_readahead_info *ra)
{
	struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache;
	struct buffer_head *bh, *ra_bh;
	sector_t submit_ptr = 0;
	int ret;

	ret = nilfs_btnode_submit_block(btnc, ptr, 0, READ, &bh, &submit_ptr);
	if (ret) {
		if (ret != -EEXIST)
			return ret;
		goto out_check;
	}

	if (ra) {
		int i, n;
		__u64 ptr2;

		/* read ahead sibling nodes */
		for (n = ra->max_ra_blocks, i = ra->index + 1;
		     n > 0 && i < ra->ncmax; n--, i++) {
			ptr2 = nilfs_btree_node_get_ptr(ra->node, i, ra->ncmax);

			ret = nilfs_btnode_submit_block(btnc, ptr2, 0, READA,
							&ra_bh, &submit_ptr);
			if (likely(!ret || ret == -EEXIST))
				brelse(ra_bh);
			else if (ret != -EBUSY)
				break;
			if (!buffer_locked(bh))
				goto out_no_wait;
		}
	}

	wait_on_buffer(bh);

 out_no_wait:
	if (!buffer_uptodate(bh)) {
		brelse(bh);
		return -EIO;
	}

 out_check:
	if (nilfs_btree_broken_node_block(bh)) {
		clear_buffer_uptodate(bh);
		brelse(bh);
		return -EINVAL;
	}

	*bhp = bh;
	return 0;
}

static int nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
				   struct buffer_head **bhp)
{
	return __nilfs_btree_get_block(btree, ptr, bhp, NULL);
}

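/*
 * nilfs_btree_do_lookup - descend from the root down to @minlevel looking
 * up @key
 *
 * The traversed nodes and child indices are recorded in @path so that
 * insert/delete operations can later modify the tree along the same
 * route.  If @readahead is set, sibling blocks at the bottom node level
 * are prefetched.  Returns 0 and stores the found pointer in *@ptrp (when
 * non-NULL) on success, or -ENOENT if @key is not present.
 */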
static int nilfs_btree_do_lookup(const struct nilfs_bmap *btree,
				 struct nilfs_btree_path *path,
				 __u64 key, __u64 *ptrp, int minlevel,
				 int readahead)
{
	struct nilfs_btree_node *node;
	struct nilfs_btree_readahead_info p, *ra;
	__u64 ptr;
	int level, index, found, ncmax, ret;

	node = nilfs_btree_get_root(btree);
	level = nilfs_btree_node_get_level(node);
	if (level < minlevel || nilfs_btree_node_get_nchildren(node) <= 0)
		return -ENOENT;

	found = nilfs_btree_node_lookup(node, key, &index);
	ptr = nilfs_btree_node_get_ptr(node, index,
				       NILFS_BTREE_ROOT_NCHILDREN_MAX);
	path[level].bp_bh = NULL;
	path[level].bp_index = index;

	ncmax = nilfs_btree_nchildren_per_block(btree);

	while (--level >= minlevel) {
		ra = NULL;
		if (level == NILFS_BTREE_LEVEL_NODE_MIN && readahead) {
			p.node = nilfs_btree_get_node(btree, path, level + 1,
						      &p.ncmax);
			p.index = index;
			p.max_ra_blocks = 7;
			ra = &p;
		}
		ret = __nilfs_btree_get_block(btree, ptr, &path[level].bp_bh,
					      ra);
		if (ret < 0)
			return ret;

		node = nilfs_btree_get_nonroot_node(path, level);
		if (nilfs_btree_bad_node(node, level))
			return -EINVAL;
		if (!found)
			found = nilfs_btree_node_lookup(node, key, &index);
		else
			index = 0;
		if (index < ncmax) {
			ptr = nilfs_btree_node_get_ptr(node, index, ncmax);
		} else {
			WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN);
			/* insert */
			ptr = NILFS_BMAP_INVALID_PTR;
		}
		path[level].bp_index = index;
	}
	if (!found)
		return -ENOENT;

	if (ptrp != NULL)
		*ptrp = ptr;

	return 0;
}

static int nilfs_btree_do_lookup_last(const struct nilfs_bmap *btree,
				      struct nilfs_btree_path *path,
				      __u64 *keyp, __u64 *ptrp)
{
	struct nilfs_btree_node *node;
	__u64 ptr;
	int index, level, ncmax, ret;

	node = nilfs_btree_get_root(btree);
	index = nilfs_btree_node_get_nchildren(node) - 1;
	if (index < 0)
		return -ENOENT;
	level = nilfs_btree_node_get_level(node);
	ptr = nilfs_btree_node_get_ptr(node, index,
				       NILFS_BTREE_ROOT_NCHILDREN_MAX);
	path[level].bp_bh = NULL;
	path[level].bp_index = index;
	ncmax = nilfs_btree_nchildren_per_block(btree);

	for (level--; level > 0; level--) {
		ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
		if (ret < 0)
			return ret;
		node = nilfs_btree_get_nonroot_node(path, level);
		if (nilfs_btree_bad_node(node, level))
			return -EINVAL;
		index = nilfs_btree_node_get_nchildren(node) - 1;
		ptr = nilfs_btree_node_get_ptr(node, index, ncmax);
		path[level].bp_index = index;
	}

	if (keyp != NULL)
		*keyp = nilfs_btree_node_get_key(node, index);
	if (ptrp != NULL)
		*ptrp = ptr;

	return 0;
}

/**
 * nilfs_btree_get_next_key - get next valid key from btree path array
 * @btree: bmap struct of btree
 * @path: array of nilfs_btree_path struct
 * @minlevel: start level
 * @nextkey: place to store the next valid key
 *
 * Return Value: If a next key was found, 0 is returned. Otherwise,
 * -ENOENT is returned.
 */
static int nilfs_btree_get_next_key(const struct nilfs_bmap *btree,
				    const struct nilfs_btree_path *path,
				    int minlevel, __u64 *nextkey)
{
	struct nilfs_btree_node *node;
	int maxlevel = nilfs_btree_height(btree) - 1;
	int index, next_adj, level;

	/* Next index is already set to bp_index for leaf nodes. */
	next_adj = 0;
	for (level = minlevel; level <= maxlevel; level++) {
		if (level == maxlevel)
			node = nilfs_btree_get_root(btree);
		else
			node = nilfs_btree_get_nonroot_node(path, level);

		index = path[level].bp_index + next_adj;
		if (index < nilfs_btree_node_get_nchildren(node)) {
			/* Next key is in this node */
			*nextkey = nilfs_btree_node_get_key(node, index);
			return 0;
		}
		/* For non-leaf nodes, next index is stored at bp_index + 1. */
		next_adj = 1;
	}
	return -ENOENT;
}

static int nilfs_btree_lookup(const struct nilfs_bmap *btree,
			      __u64 key, int level, __u64 *ptrp)
{
	struct nilfs_btree_path *path;
	int ret;

	path = nilfs_btree_alloc_path();
	if (path == NULL)
		return -ENOMEM;

	ret = nilfs_btree_do_lookup(btree, path, key, ptrp, level, 0);

	nilfs_btree_free_path(path);

	return ret;
}

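/*
 * nilfs_btree_lookup_contig - look up a range of contiguous blocks
 *
 * Starting from @key, this counts how many of the following keys map to
 * consecutive (DAT-translated, when virtual block numbers are in use)
 * block addresses, walking right through bottom-level nodes as needed
 * and stopping at @maxblocks.  Returns the number of contiguous blocks
 * found (at least 1), with the first block address stored in *@ptrp, or
 * a negative error code.
 */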
static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
				     __u64 key, __u64 *ptrp, unsigned maxblocks)
{
	struct nilfs_btree_path *path;
	struct nilfs_btree_node *node;
	struct inode *dat = NULL;
	__u64 ptr, ptr2;
	sector_t blocknr;
	int level = NILFS_BTREE_LEVEL_NODE_MIN;
	int ret, cnt, index, maxlevel, ncmax;
	struct nilfs_btree_readahead_info p;

	path = nilfs_btree_alloc_path();
	if (path == NULL)
		return -ENOMEM;

	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level, 1);
	if (ret < 0)
		goto out;

	if (NILFS_BMAP_USE_VBN(btree)) {
		dat = nilfs_bmap_get_dat(btree);
		ret = nilfs_dat_translate(dat, ptr, &blocknr);
		if (ret < 0)
			goto out;
		ptr = blocknr;
	}
	cnt = 1;
	if (cnt == maxblocks)
		goto end;

	maxlevel = nilfs_btree_height(btree) - 1;
	node = nilfs_btree_get_node(btree, path, level, &ncmax);
	index = path[level].bp_index + 1;
	for (;;) {
		while (index < nilfs_btree_node_get_nchildren(node)) {
			if (nilfs_btree_node_get_key(node, index) !=
			    key + cnt)
				goto end;
			ptr2 = nilfs_btree_node_get_ptr(node, index, ncmax);
			if (dat) {
				ret = nilfs_dat_translate(dat, ptr2, &blocknr);
				if (ret < 0)
					goto out;
				ptr2 = blocknr;
			}
			if (ptr2 != ptr + cnt || ++cnt == maxblocks)
				goto end;
			index++;
			continue;
		}
		if (level == maxlevel)
			break;

		/* look-up right sibling node */
		p.node = nilfs_btree_get_node(btree, path, level + 1, &p.ncmax);
		p.index = path[level + 1].bp_index + 1;
		p.max_ra_blocks = 7;
		if (p.index >= nilfs_btree_node_get_nchildren(p.node) ||
		    nilfs_btree_node_get_key(p.node, p.index) != key + cnt)
			break;
		ptr2 = nilfs_btree_node_get_ptr(p.node, p.index, p.ncmax);
		path[level + 1].bp_index = p.index;

		brelse(path[level].bp_bh);
		path[level].bp_bh = NULL;

		ret = __nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh,
					      &p);
		if (ret < 0)
			goto out;
		node = nilfs_btree_get_nonroot_node(path, level);
		ncmax = nilfs_btree_nchildren_per_block(btree);
		index = 0;
		path[level].bp_index = index;
	}
 end:
	*ptrp = ptr;
	ret = cnt;
 out:
	nilfs_btree_free_path(path);
	return ret;
}

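/*
 * nilfs_btree_promote_key - propagate a new first key to ancestor nodes
 *
 * When the key at index 0 of a node changes, the index key stored in the
 * parent must be updated as well; this walks up the path as long as the
 * affected slot remains index 0, finishing at the root if necessary.
 */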
static void nilfs_btree_promote_key(struct nilfs_bmap *btree,
				    struct nilfs_btree_path *path,
				    int level, __u64 key)
{
	if (level < nilfs_btree_height(btree) - 1) {
		do {
			nilfs_btree_node_set_key(
				nilfs_btree_get_nonroot_node(path, level),
				path[level].bp_index, key);
			if (!buffer_dirty(path[level].bp_bh))
				mark_buffer_dirty(path[level].bp_bh);
		} while ((path[level].bp_index == 0) &&
			 (++level < nilfs_btree_height(btree) - 1));
	}

	/* root */
	if (level == nilfs_btree_height(btree) - 1) {
		nilfs_btree_node_set_key(nilfs_btree_get_root(btree),
					 path[level].bp_index, key);
	}
}

static void nilfs_btree_do_insert(struct nilfs_bmap *btree,
				  struct nilfs_btree_path *path,
				  int level, __u64 *keyp, __u64 *ptrp)
{
	struct nilfs_btree_node *node;
	int ncblk;

	if (level < nilfs_btree_height(btree) - 1) {
		node = nilfs_btree_get_nonroot_node(path, level);
		ncblk = nilfs_btree_nchildren_per_block(btree);
		nilfs_btree_node_insert(node, path[level].bp_index,
					*keyp, *ptrp, ncblk);
		if (!buffer_dirty(path[level].bp_bh))
			mark_buffer_dirty(path[level].bp_bh);

		if (path[level].bp_index == 0)
			nilfs_btree_promote_key(btree, path, level + 1,
						nilfs_btree_node_get_key(node,
									 0));
	} else {
		node = nilfs_btree_get_root(btree);
		nilfs_btree_node_insert(node, path[level].bp_index,
					*keyp, *ptrp,
					NILFS_BTREE_ROOT_NCHILDREN_MAX);
	}
}

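/*
 * nilfs_btree_carry_left - make room for an insertion by shifting
 * children into the left sibling
 *
 * Roughly half of the surplus children are moved to the left sibling so
 * that both nodes end up balanced; if the insertion point itself moves,
 * the path is switched over to the sibling block before the actual
 * insert is performed by nilfs_btree_do_insert().
 * nilfs_btree_carry_right() below is the mirror image of this.
 */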
static void nilfs_btree_carry_left(struct nilfs_bmap *btree,
				   struct nilfs_btree_path *path,
				   int level, __u64 *keyp, __u64 *ptrp)
{
	struct nilfs_btree_node *node, *left;
	int nchildren, lnchildren, n, move, ncblk;

	node = nilfs_btree_get_nonroot_node(path, level);
	left = nilfs_btree_get_sib_node(path, level);
	nchildren = nilfs_btree_node_get_nchildren(node);
	lnchildren = nilfs_btree_node_get_nchildren(left);
	ncblk = nilfs_btree_nchildren_per_block(btree);
	move = 0;

	n = (nchildren + lnchildren + 1) / 2 - lnchildren;
	if (n > path[level].bp_index) {
		/* move insert point */
		n--;
		move = 1;
	}

	nilfs_btree_node_move_left(left, node, n, ncblk, ncblk);

	if (!buffer_dirty(path[level].bp_bh))
		mark_buffer_dirty(path[level].bp_bh);
	if (!buffer_dirty(path[level].bp_sib_bh))
		mark_buffer_dirty(path[level].bp_sib_bh);

	nilfs_btree_promote_key(btree, path, level + 1,
				nilfs_btree_node_get_key(node, 0));

	if (move) {
		brelse(path[level].bp_bh);
		path[level].bp_bh = path[level].bp_sib_bh;
		path[level].bp_sib_bh = NULL;
		path[level].bp_index += lnchildren;
		path[level + 1].bp_index--;
	} else {
		brelse(path[level].bp_sib_bh);
		path[level].bp_sib_bh = NULL;
		path[level].bp_index -= n;
	}

	nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
}

static void nilfs_btree_carry_right(struct nilfs_bmap *btree,
				    struct nilfs_btree_path *path,
				    int level, __u64 *keyp, __u64 *ptrp)
{
	struct nilfs_btree_node *node, *right;
	int nchildren, rnchildren, n, move, ncblk;

	node = nilfs_btree_get_nonroot_node(path, level);
	right = nilfs_btree_get_sib_node(path, level);
	nchildren = nilfs_btree_node_get_nchildren(node);
	rnchildren = nilfs_btree_node_get_nchildren(right);
	ncblk = nilfs_btree_nchildren_per_block(btree);
	move = 0;

	n = (nchildren + rnchildren + 1) / 2 - rnchildren;
	if (n > nchildren - path[level].bp_index) {
		/* move insert point */
		n--;
		move = 1;
	}

	nilfs_btree_node_move_right(node, right, n, ncblk, ncblk);

	if (!buffer_dirty(path[level].bp_bh))
		mark_buffer_dirty(path[level].bp_bh);
	if (!buffer_dirty(path[level].bp_sib_bh))
		mark_buffer_dirty(path[level].bp_sib_bh);

	path[level + 1].bp_index++;
	nilfs_btree_promote_key(btree, path, level + 1,
				nilfs_btree_node_get_key(right, 0));
	path[level + 1].bp_index--;

	if (move) {
		brelse(path[level].bp_bh);
		path[level].bp_bh = path[level].bp_sib_bh;
		path[level].bp_sib_bh = NULL;
		path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
		path[level + 1].bp_index++;
	} else {
		brelse(path[level].bp_sib_bh);
		path[level].bp_sib_bh = NULL;
	}

	nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
}

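/*
 * nilfs_btree_split - split a full node into two
 *
 * About half of the children are moved into the newly allocated right
 * sibling, the new key/ptr pair is inserted into whichever half now
 * covers the insertion point, and *@keyp/*@ptrp are replaced with the
 * first key of the right node and its newly allocated block pointer so
 * that the caller can insert them at the parent level.
 */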
static void nilfs_btree_split(struct nilfs_bmap *btree,
			      struct nilfs_btree_path *path,
			      int level, __u64 *keyp, __u64 *ptrp)
{
	struct nilfs_btree_node *node, *right;
	__u64 newkey;
	__u64 newptr;
	int nchildren, n, move, ncblk;

	node = nilfs_btree_get_nonroot_node(path, level);
	right = nilfs_btree_get_sib_node(path, level);
	nchildren = nilfs_btree_node_get_nchildren(node);
	ncblk = nilfs_btree_nchildren_per_block(btree);
	move = 0;

	n = (nchildren + 1) / 2;
	if (n > nchildren - path[level].bp_index) {
		n--;
		move = 1;
	}

	nilfs_btree_node_move_right(node, right, n, ncblk, ncblk);

	if (!buffer_dirty(path[level].bp_bh))
		mark_buffer_dirty(path[level].bp_bh);
	if (!buffer_dirty(path[level].bp_sib_bh))
		mark_buffer_dirty(path[level].bp_sib_bh);

	newkey = nilfs_btree_node_get_key(right, 0);
	newptr = path[level].bp_newreq.bpr_ptr;

	if (move) {
		path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
		nilfs_btree_node_insert(right, path[level].bp_index,
					*keyp, *ptrp, ncblk);

		*keyp = nilfs_btree_node_get_key(right, 0);
		*ptrp = path[level].bp_newreq.bpr_ptr;

		brelse(path[level].bp_bh);
		path[level].bp_bh = path[level].bp_sib_bh;
		path[level].bp_sib_bh = NULL;
	} else {
		nilfs_btree_do_insert(btree, path, level, keyp, ptrp);

		*keyp = nilfs_btree_node_get_key(right, 0);
		*ptrp = path[level].bp_newreq.bpr_ptr;

		brelse(path[level].bp_sib_bh);
		path[level].bp_sib_bh = NULL;
	}

	path[level + 1].bp_index++;
}

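/*
 * nilfs_btree_grow - increase the height of the tree by one
 *
 * All children of the root are moved into a freshly allocated child
 * block and the root level is bumped.  *@keyp/*@ptrp are set to the
 * first key of the new child and its block pointer, which the following
 * insert at the root level installs as the root's single entry.
 */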
static void nilfs_btree_grow(struct nilfs_bmap *btree,
			     struct nilfs_btree_path *path,
			     int level, __u64 *keyp, __u64 *ptrp)
{
	struct nilfs_btree_node *root, *child;
	int n, ncblk;

	root = nilfs_btree_get_root(btree);
	child = nilfs_btree_get_sib_node(path, level);
	ncblk = nilfs_btree_nchildren_per_block(btree);

	n = nilfs_btree_node_get_nchildren(root);

	nilfs_btree_node_move_right(root, child, n,
				    NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk);
	nilfs_btree_node_set_level(root, level + 1);

	if (!buffer_dirty(path[level].bp_sib_bh))
		mark_buffer_dirty(path[level].bp_sib_bh);

	path[level].bp_bh = path[level].bp_sib_bh;
	path[level].bp_sib_bh = NULL;

	nilfs_btree_do_insert(btree, path, level, keyp, ptrp);

	*keyp = nilfs_btree_node_get_key(child, 0);
	*ptrp = path[level].bp_newreq.bpr_ptr;
}

static __u64 nilfs_btree_find_near(const struct nilfs_bmap *btree,
				   const struct nilfs_btree_path *path)
{
	struct nilfs_btree_node *node;
	int level, ncmax;

	if (path == NULL)
		return NILFS_BMAP_INVALID_PTR;

	/* left sibling */
	level = NILFS_BTREE_LEVEL_NODE_MIN;
	if (path[level].bp_index > 0) {
		node = nilfs_btree_get_node(btree, path, level, &ncmax);
		return nilfs_btree_node_get_ptr(node,
						path[level].bp_index - 1,
						ncmax);
	}

	/* parent */
	level = NILFS_BTREE_LEVEL_NODE_MIN + 1;
	if (level <= nilfs_btree_height(btree) - 1) {
		node = nilfs_btree_get_node(btree, path, level, &ncmax);
		return nilfs_btree_node_get_ptr(node, path[level].bp_index,
						ncmax);
	}

	return NILFS_BMAP_INVALID_PTR;
}

static __u64 nilfs_btree_find_target_v(const struct nilfs_bmap *btree,
				       const struct nilfs_btree_path *path,
				       __u64 key)
{
	__u64 ptr;

	ptr = nilfs_bmap_find_target_seq(btree, key);
	if (ptr != NILFS_BMAP_INVALID_PTR)
		/* sequential access */
		return ptr;
	else {
		ptr = nilfs_btree_find_near(btree, path);
		if (ptr != NILFS_BMAP_INVALID_PTR)
			/* near */
			return ptr;
	}
	/* block group */
	return nilfs_bmap_find_target_in_group(btree);
}

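/*
 * Insertion is performed in two phases.  nilfs_btree_prepare_insert()
 * walks up from the bottom level and chooses, for each level that is
 * full, one of: plain insert, carrying children to a left or right
 * sibling, splitting the node, or growing the tree at the root.  It also
 * pre-allocates any block pointers that will be needed, so that
 * nilfs_btree_commit_insert() can apply the recorded operations without
 * failing midway.
 */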
static int nilfs_btree_prepare_insert(struct nilfs_bmap *btree,
				      struct nilfs_btree_path *path,
				      int *levelp, __u64 key, __u64 ptr,
				      struct nilfs_bmap_stats *stats)
{
	struct buffer_head *bh;
	struct nilfs_btree_node *node, *parent, *sib;
	__u64 sibptr;
	int pindex, level, ncmax, ncblk, ret;
	struct inode *dat = NULL;

	stats->bs_nblocks = 0;
	level = NILFS_BTREE_LEVEL_DATA;

	/* allocate a new ptr for data block */
	if (NILFS_BMAP_USE_VBN(btree)) {
		path[level].bp_newreq.bpr_ptr =
			nilfs_btree_find_target_v(btree, path, key);
		dat = nilfs_bmap_get_dat(btree);
	}

	ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat);
	if (ret < 0)
		goto err_out_data;

	ncblk = nilfs_btree_nchildren_per_block(btree);

	for (level = NILFS_BTREE_LEVEL_NODE_MIN;
	     level < nilfs_btree_height(btree) - 1;
	     level++) {
		node = nilfs_btree_get_nonroot_node(path, level);
		if (nilfs_btree_node_get_nchildren(node) < ncblk) {
			path[level].bp_op = nilfs_btree_do_insert;
			stats->bs_nblocks++;
			goto out;
		}

		parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
		pindex = path[level + 1].bp_index;

		/* left sibling */
		if (pindex > 0) {
			sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1,
							  ncmax);
			ret = nilfs_btree_get_block(btree, sibptr, &bh);
			if (ret < 0)
				goto err_out_child_node;
			sib = (struct nilfs_btree_node *)bh->b_data;
			if (nilfs_btree_node_get_nchildren(sib) < ncblk) {
				path[level].bp_sib_bh = bh;
				path[level].bp_op = nilfs_btree_carry_left;
				stats->bs_nblocks++;
				goto out;
			} else {
				brelse(bh);
			}
		}

		/* right sibling */
		if (pindex < nilfs_btree_node_get_nchildren(parent) - 1) {
			sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1,
							  ncmax);
			ret = nilfs_btree_get_block(btree, sibptr, &bh);
			if (ret < 0)
				goto err_out_child_node;
			sib = (struct nilfs_btree_node *)bh->b_data;
			if (nilfs_btree_node_get_nchildren(sib) < ncblk) {
				path[level].bp_sib_bh = bh;
				path[level].bp_op = nilfs_btree_carry_right;
				stats->bs_nblocks++;
				goto out;
			} else {
				brelse(bh);
			}
		}

		/* split */
		path[level].bp_newreq.bpr_ptr =
			path[level - 1].bp_newreq.bpr_ptr + 1;
		ret = nilfs_bmap_prepare_alloc_ptr(btree,
						   &path[level].bp_newreq, dat);
		if (ret < 0)
			goto err_out_child_node;
		ret = nilfs_btree_get_new_block(btree,
						path[level].bp_newreq.bpr_ptr,
						&bh);
		if (ret < 0)
			goto err_out_curr_node;

		stats->bs_nblocks++;

		sib = (struct nilfs_btree_node *)bh->b_data;
		nilfs_btree_node_init(sib, 0, level, 0, ncblk, NULL, NULL);
		path[level].bp_sib_bh = bh;
		path[level].bp_op = nilfs_btree_split;
	}

	/* root */
	node = nilfs_btree_get_root(btree);
	if (nilfs_btree_node_get_nchildren(node) <
	    NILFS_BTREE_ROOT_NCHILDREN_MAX) {
		path[level].bp_op = nilfs_btree_do_insert;
		stats->bs_nblocks++;
		goto out;
	}

	/* grow */
	path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
	ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat);
	if (ret < 0)
		goto err_out_child_node;
	ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr,
					&bh);
	if (ret < 0)
		goto err_out_curr_node;

	nilfs_btree_node_init((struct nilfs_btree_node *)bh->b_data,
			      0, level, 0, ncblk, NULL, NULL);
	path[level].bp_sib_bh = bh;
	path[level].bp_op = nilfs_btree_grow;

	level++;
	path[level].bp_op = nilfs_btree_do_insert;

	/* a newly-created node block and a data block are added */
	stats->bs_nblocks += 2;

	/* success */
 out:
	*levelp = level;
	return ret;

	/* error */
 err_out_curr_node:
	nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);
 err_out_child_node:
	for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
		nilfs_btnode_delete(path[level].bp_sib_bh);
		nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);
	}

	nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);
 err_out_data:
	*levelp = level;
	stats->bs_nblocks = 0;
	return ret;
}

static void nilfs_btree_commit_insert(struct nilfs_bmap *btree,
				      struct nilfs_btree_path *path,
				      int maxlevel, __u64 key, __u64 ptr)
{
	struct inode *dat = NULL;
	int level;

	set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
	ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
	if (NILFS_BMAP_USE_VBN(btree)) {
		nilfs_bmap_set_target_v(btree, key, ptr);
		dat = nilfs_bmap_get_dat(btree);
	}

	for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
		nilfs_bmap_commit_alloc_ptr(btree,
					    &path[level - 1].bp_newreq, dat);
		path[level].bp_op(btree, path, level, &key, &ptr);
	}

	if (!nilfs_bmap_dirty(btree))
		nilfs_bmap_set_dirty(btree);
}

static int nilfs_btree_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr)
{
	struct nilfs_btree_path *path;
	struct nilfs_bmap_stats stats;
	int level, ret;

	path = nilfs_btree_alloc_path();
	if (path == NULL)
		return -ENOMEM;

	ret = nilfs_btree_do_lookup(btree, path, key, NULL,
				    NILFS_BTREE_LEVEL_NODE_MIN, 0);
	if (ret != -ENOENT) {
		if (ret == 0)
			ret = -EEXIST;
		goto out;
	}

	ret = nilfs_btree_prepare_insert(btree, path, &level, key, ptr, &stats);
	if (ret < 0)
		goto out;
	nilfs_btree_commit_insert(btree, path, level, key, ptr);
	nilfs_inode_add_blocks(btree->b_inode, stats.bs_nblocks);

 out:
	nilfs_btree_free_path(path);
	return ret;
}

static void nilfs_btree_do_delete(struct nilfs_bmap *btree,
				  struct nilfs_btree_path *path,
				  int level, __u64 *keyp, __u64 *ptrp)
{
	struct nilfs_btree_node *node;
	int ncblk;

	if (level < nilfs_btree_height(btree) - 1) {
		node = nilfs_btree_get_nonroot_node(path, level);
		ncblk = nilfs_btree_nchildren_per_block(btree);
		nilfs_btree_node_delete(node, path[level].bp_index,
					keyp, ptrp, ncblk);
		if (!buffer_dirty(path[level].bp_bh))
			mark_buffer_dirty(path[level].bp_bh);
		if (path[level].bp_index == 0)
			nilfs_btree_promote_key(btree, path, level + 1,
				nilfs_btree_node_get_key(node, 0));
	} else {
		node = nilfs_btree_get_root(btree);
		nilfs_btree_node_delete(node, path[level].bp_index,
					keyp, ptrp,
					NILFS_BTREE_ROOT_NCHILDREN_MAX);
	}
}

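/*
 * After a deletion leaves a node under-filled, it is refilled either by
 * borrowing children from a sibling (borrow_left/borrow_right, which
 * rebalance the two nodes) or, when the sibling has no children to
 * spare, by concatenating the two nodes into one (concat_left/
 * concat_right, which also free the emptied node block).
 */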
static void nilfs_btree_borrow_left(struct nilfs_bmap *btree,
				    struct nilfs_btree_path *path,
				    int level, __u64 *keyp, __u64 *ptrp)
{
	struct nilfs_btree_node *node, *left;
	int nchildren, lnchildren, n, ncblk;

	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);

	node = nilfs_btree_get_nonroot_node(path, level);
	left = nilfs_btree_get_sib_node(path, level);
	nchildren = nilfs_btree_node_get_nchildren(node);
	lnchildren = nilfs_btree_node_get_nchildren(left);
	ncblk = nilfs_btree_nchildren_per_block(btree);

	n = (nchildren + lnchildren) / 2 - nchildren;

	nilfs_btree_node_move_right(left, node, n, ncblk, ncblk);

	if (!buffer_dirty(path[level].bp_bh))
		mark_buffer_dirty(path[level].bp_bh);
	if (!buffer_dirty(path[level].bp_sib_bh))
		mark_buffer_dirty(path[level].bp_sib_bh);

	nilfs_btree_promote_key(btree, path, level + 1,
				nilfs_btree_node_get_key(node, 0));

	brelse(path[level].bp_sib_bh);
	path[level].bp_sib_bh = NULL;
	path[level].bp_index += n;
}

static void nilfs_btree_borrow_right(struct nilfs_bmap *btree,
				     struct nilfs_btree_path *path,
				     int level, __u64 *keyp, __u64 *ptrp)
{
	struct nilfs_btree_node *node, *right;
	int nchildren, rnchildren, n, ncblk;

	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);

	node = nilfs_btree_get_nonroot_node(path, level);
	right = nilfs_btree_get_sib_node(path, level);
	nchildren = nilfs_btree_node_get_nchildren(node);
	rnchildren = nilfs_btree_node_get_nchildren(right);
	ncblk = nilfs_btree_nchildren_per_block(btree);

	n = (nchildren + rnchildren) / 2 - nchildren;

	nilfs_btree_node_move_left(node, right, n, ncblk, ncblk);

	if (!buffer_dirty(path[level].bp_bh))
		mark_buffer_dirty(path[level].bp_bh);
	if (!buffer_dirty(path[level].bp_sib_bh))
		mark_buffer_dirty(path[level].bp_sib_bh);

	path[level + 1].bp_index++;
	nilfs_btree_promote_key(btree, path, level + 1,
				nilfs_btree_node_get_key(right, 0));
	path[level + 1].bp_index--;

	brelse(path[level].bp_sib_bh);
	path[level].bp_sib_bh = NULL;
}

static void nilfs_btree_concat_left(struct nilfs_bmap *btree,
				    struct nilfs_btree_path *path,
				    int level, __u64 *keyp, __u64 *ptrp)
{
	struct nilfs_btree_node *node, *left;
	int n, ncblk;

	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);

	node = nilfs_btree_get_nonroot_node(path, level);
	left = nilfs_btree_get_sib_node(path, level);
	ncblk = nilfs_btree_nchildren_per_block(btree);

	n = nilfs_btree_node_get_nchildren(node);

	nilfs_btree_node_move_left(left, node, n, ncblk, ncblk);

	if (!buffer_dirty(path[level].bp_sib_bh))
		mark_buffer_dirty(path[level].bp_sib_bh);

	nilfs_btnode_delete(path[level].bp_bh);
	path[level].bp_bh = path[level].bp_sib_bh;
	path[level].bp_sib_bh = NULL;
	path[level].bp_index += nilfs_btree_node_get_nchildren(left);
}

static void nilfs_btree_concat_right(struct nilfs_bmap *btree,
				     struct nilfs_btree_path *path,
				     int level, __u64 *keyp, __u64 *ptrp)
{
	struct nilfs_btree_node *node, *right;
	int n, ncblk;

	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);

	node = nilfs_btree_get_nonroot_node(path, level);
	right = nilfs_btree_get_sib_node(path, level);
	ncblk = nilfs_btree_nchildren_per_block(btree);

	n = nilfs_btree_node_get_nchildren(right);

	nilfs_btree_node_move_left(node, right, n, ncblk, ncblk);

	if (!buffer_dirty(path[level].bp_bh))
		mark_buffer_dirty(path[level].bp_bh);

	nilfs_btnode_delete(path[level].bp_sib_bh);
	path[level].bp_sib_bh = NULL;
	path[level + 1].bp_index++;
}

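/*
 * nilfs_btree_shrink - decrease the height of the tree by one
 *
 * Used when the root is left with a single child that itself fits into
 * the root node: the child's entries are pulled up into the root and the
 * child block is deleted.
 */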
static void nilfs_btree_shrink(struct nilfs_bmap *btree,
			       struct nilfs_btree_path *path,
			       int level, __u64 *keyp, __u64 *ptrp)
{
	struct nilfs_btree_node *root, *child;
	int n, ncblk;

	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);

	root = nilfs_btree_get_root(btree);
	child = nilfs_btree_get_nonroot_node(path, level);
	ncblk = nilfs_btree_nchildren_per_block(btree);

	nilfs_btree_node_delete(root, 0, NULL, NULL,
				NILFS_BTREE_ROOT_NCHILDREN_MAX);
	nilfs_btree_node_set_level(root, level);
	n = nilfs_btree_node_get_nchildren(child);
	nilfs_btree_node_move_left(root, child, n,
				   NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk);

	nilfs_btnode_delete(path[level].bp_bh);
	path[level].bp_bh = NULL;
}

static void nilfs_btree_nop(struct nilfs_bmap *btree,
			    struct nilfs_btree_path *path,
			    int level, __u64 *keyp, __u64 *ptrp)
{
}

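/*
 * nilfs_btree_prepare_delete() mirrors nilfs_btree_prepare_insert(): it
 * selects, per level, the operation that nilfs_btree_commit_delete()
 * will apply (plain delete, borrow, concatenate, or shrink) and prepares
 * the release of the block pointers that will disappear, so that the
 * commit phase cannot fail.
 */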
static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree,
				      struct nilfs_btree_path *path,
				      int *levelp,
				      struct nilfs_bmap_stats *stats,
				      struct inode *dat)
{
	struct buffer_head *bh;
	struct nilfs_btree_node *node, *parent, *sib;
	__u64 sibptr;
	int pindex, dindex, level, ncmin, ncmax, ncblk, ret;

	ret = 0;
	stats->bs_nblocks = 0;
	ncmin = NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
	ncblk = nilfs_btree_nchildren_per_block(btree);

	for (level = NILFS_BTREE_LEVEL_NODE_MIN, dindex = path[level].bp_index;
	     level < nilfs_btree_height(btree) - 1;
	     level++) {
		node = nilfs_btree_get_nonroot_node(path, level);
		path[level].bp_oldreq.bpr_ptr =
			nilfs_btree_node_get_ptr(node, dindex, ncblk);
		ret = nilfs_bmap_prepare_end_ptr(btree,
						 &path[level].bp_oldreq, dat);
		if (ret < 0)
			goto err_out_child_node;

		if (nilfs_btree_node_get_nchildren(node) > ncmin) {
			path[level].bp_op = nilfs_btree_do_delete;
			stats->bs_nblocks++;
			goto out;
		}

		parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
		pindex = path[level + 1].bp_index;
		dindex = pindex;

		if (pindex > 0) {
			/* left sibling */
			sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1,
							  ncmax);
			ret = nilfs_btree_get_block(btree, sibptr, &bh);
			if (ret < 0)
				goto err_out_curr_node;
			sib = (struct nilfs_btree_node *)bh->b_data;
			if (nilfs_btree_node_get_nchildren(sib) > ncmin) {
				path[level].bp_sib_bh = bh;
				path[level].bp_op = nilfs_btree_borrow_left;
				stats->bs_nblocks++;
				goto out;
			} else {
				path[level].bp_sib_bh = bh;
				path[level].bp_op = nilfs_btree_concat_left;
				stats->bs_nblocks++;
				/* continue; */
			}
		} else if (pindex <
			   nilfs_btree_node_get_nchildren(parent) - 1) {
			/* right sibling */
			sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1,
							  ncmax);
			ret = nilfs_btree_get_block(btree, sibptr, &bh);
			if (ret < 0)
				goto err_out_curr_node;
			sib = (struct nilfs_btree_node *)bh->b_data;
			if (nilfs_btree_node_get_nchildren(sib) > ncmin) {
				path[level].bp_sib_bh = bh;
				path[level].bp_op = nilfs_btree_borrow_right;
				stats->bs_nblocks++;
				goto out;
			} else {
				path[level].bp_sib_bh = bh;
				path[level].bp_op = nilfs_btree_concat_right;
				stats->bs_nblocks++;
				/*
				 * When merging right sibling node
				 * into the current node, pointer to
				 * the right sibling node must be
				 * terminated instead.  The adjustment
				 * below is required for that.
				 */
				dindex = pindex + 1;
				/* continue; */
			}
		} else {
			/* no siblings */
			/* the only child of the root node */
			WARN_ON(level != nilfs_btree_height(btree) - 2);
			if (nilfs_btree_node_get_nchildren(node) - 1 <=
			    NILFS_BTREE_ROOT_NCHILDREN_MAX) {
				path[level].bp_op = nilfs_btree_shrink;
				stats->bs_nblocks += 2;
				level++;
				path[level].bp_op = nilfs_btree_nop;
				goto shrink_root_child;
			} else {
				path[level].bp_op = nilfs_btree_do_delete;
				stats->bs_nblocks++;
				goto out;
			}
		}
	}

	/* child of the root node is deleted */
	path[level].bp_op = nilfs_btree_do_delete;
	stats->bs_nblocks++;

shrink_root_child:
	node = nilfs_btree_get_root(btree);
	path[level].bp_oldreq.bpr_ptr =
		nilfs_btree_node_get_ptr(node, dindex,
					 NILFS_BTREE_ROOT_NCHILDREN_MAX);

	ret = nilfs_bmap_prepare_end_ptr(btree, &path[level].bp_oldreq, dat);
	if (ret < 0)
		goto err_out_child_node;

	/* success */
 out:
	*levelp = level;
	return ret;

	/* error */
 err_out_curr_node:
	nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat);
 err_out_child_node:
	for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
		brelse(path[level].bp_sib_bh);
		nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat);
	}
	*levelp = level;
	stats->bs_nblocks = 0;
	return ret;
}

static void nilfs_btree_commit_delete(struct nilfs_bmap *btree,
				      struct nilfs_btree_path *path,
				      int maxlevel, struct inode *dat)
{
	int level;

	for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
		nilfs_bmap_commit_end_ptr(btree, &path[level].bp_oldreq, dat);
		path[level].bp_op(btree, path, level, NULL, NULL);
	}

	if (!nilfs_bmap_dirty(btree))
		nilfs_bmap_set_dirty(btree);
}

static int nilfs_btree_delete(struct nilfs_bmap *btree, __u64 key)
{
	struct nilfs_btree_path *path;
	struct nilfs_bmap_stats stats;
	struct inode *dat;
	int level, ret;

	path = nilfs_btree_alloc_path();
	if (path == NULL)
		return -ENOMEM;

	ret = nilfs_btree_do_lookup(btree, path, key, NULL,
				    NILFS_BTREE_LEVEL_NODE_MIN, 0);
	if (ret < 0)
		goto out;

	dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL;

	ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat);
	if (ret < 0)
		goto out;
	nilfs_btree_commit_delete(btree, path, level, dat);
	nilfs_inode_sub_blocks(btree->b_inode, stats.bs_nblocks);

out:
	nilfs_btree_free_path(path);
	return ret;
}

static int nilfs_btree_seek_key(const struct nilfs_bmap *btree, __u64 start,
				__u64 *keyp)
{
	struct nilfs_btree_path *path;
	const int minlevel = NILFS_BTREE_LEVEL_NODE_MIN;
	int ret;

	path = nilfs_btree_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = nilfs_btree_do_lookup(btree, path, start, NULL, minlevel, 0);
	if (!ret)
		*keyp = start;
	else if (ret == -ENOENT)
		ret = nilfs_btree_get_next_key(btree, path, minlevel, keyp);

	nilfs_btree_free_path(path);
	return ret;
}

static int nilfs_btree_last_key(const struct nilfs_bmap *btree, __u64 *keyp)
{
	struct nilfs_btree_path *path;
	int ret;

	path = nilfs_btree_alloc_path();
	if (path == NULL)
		return -ENOMEM;

	ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);

	nilfs_btree_free_path(path);

	return ret;
}

static int nilfs_btree_check_delete(struct nilfs_bmap *btree, __u64 key)
{
	struct buffer_head *bh;
	struct nilfs_btree_node *root, *node;
	__u64 maxkey, nextmaxkey;
	__u64 ptr;
	int nchildren, ret;

	root = nilfs_btree_get_root(btree);
	switch (nilfs_btree_height(btree)) {
	case 2:
		bh = NULL;
		node = root;
		break;
	case 3:
		nchildren = nilfs_btree_node_get_nchildren(root);
		if (nchildren > 1)
			return 0;
		ptr = nilfs_btree_node_get_ptr(root, nchildren - 1,
					       NILFS_BTREE_ROOT_NCHILDREN_MAX);
		ret = nilfs_btree_get_block(btree, ptr, &bh);
		if (ret < 0)
			return ret;
		node = (struct nilfs_btree_node *)bh->b_data;
		break;
	default:
		return 0;
	}

	nchildren = nilfs_btree_node_get_nchildren(node);
	maxkey = nilfs_btree_node_get_key(node, nchildren - 1);
	nextmaxkey = (nchildren > 1) ?
		nilfs_btree_node_get_key(node, nchildren - 2) : 0;
	if (bh != NULL)
		brelse(bh);

	return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW);
}

static int nilfs_btree_gather_data(struct nilfs_bmap *btree,
				   __u64 *keys, __u64 *ptrs, int nitems)
{
	struct buffer_head *bh;
	struct nilfs_btree_node *node, *root;
	__le64 *dkeys;
	__le64 *dptrs;
	__u64 ptr;
	int nchildren, ncmax, i, ret;

	root = nilfs_btree_get_root(btree);
	switch (nilfs_btree_height(btree)) {
	case 2:
		bh = NULL;
		node = root;
		ncmax = NILFS_BTREE_ROOT_NCHILDREN_MAX;
		break;
	case 3:
		nchildren = nilfs_btree_node_get_nchildren(root);
		WARN_ON(nchildren > 1);
		ptr = nilfs_btree_node_get_ptr(root, nchildren - 1,
					       NILFS_BTREE_ROOT_NCHILDREN_MAX);
		ret = nilfs_btree_get_block(btree, ptr, &bh);
		if (ret < 0)
			return ret;
		node = (struct nilfs_btree_node *)bh->b_data;
		ncmax = nilfs_btree_nchildren_per_block(btree);
		break;
	default:
		node = NULL;
		return -EINVAL;
	}

	nchildren = nilfs_btree_node_get_nchildren(node);
	if (nchildren < nitems)
		nitems = nchildren;
	dkeys = nilfs_btree_node_dkeys(node);
	dptrs = nilfs_btree_node_dptrs(node, ncmax);
	for (i = 0; i < nitems; i++) {
		keys[i] = le64_to_cpu(dkeys[i]);
		ptrs[i] = le64_to_cpu(dptrs[i]);
	}

	if (bh != NULL)
		brelse(bh);

	return nitems;
}

static int
nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *btree, __u64 key,
				       union nilfs_bmap_ptr_req *dreq,
				       union nilfs_bmap_ptr_req *nreq,
				       struct buffer_head **bhp,
				       struct nilfs_bmap_stats *stats)
{
	struct buffer_head *bh;
	struct inode *dat = NULL;
	int ret;

	stats->bs_nblocks = 0;

	/* for data */
	/* cannot find near ptr */
	if (NILFS_BMAP_USE_VBN(btree)) {
		dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key);
		dat = nilfs_bmap_get_dat(btree);
	}

	ret = nilfs_bmap_prepare_alloc_ptr(btree, dreq, dat);
	if (ret < 0)
		return ret;

	*bhp = NULL;
	stats->bs_nblocks++;
	if (nreq != NULL) {
		nreq->bpr_ptr = dreq->bpr_ptr + 1;
		ret = nilfs_bmap_prepare_alloc_ptr(btree, nreq, dat);
		if (ret < 0)
			goto err_out_dreq;

		ret = nilfs_btree_get_new_block(btree, nreq->bpr_ptr, &bh);
		if (ret < 0)
			goto err_out_nreq;

		*bhp = bh;
		stats->bs_nblocks++;
	}

	/* success */
	return 0;

	/* error */
 err_out_nreq:
	nilfs_bmap_abort_alloc_ptr(btree, nreq, dat);
 err_out_dreq:
	nilfs_bmap_abort_alloc_ptr(btree, dreq, dat);
	stats->bs_nblocks = 0;
	return ret;
}

static void
nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree,
				      __u64 key, __u64 ptr,
				      const __u64 *keys, const __u64 *ptrs,
				      int n,
				      union nilfs_bmap_ptr_req *dreq,
				      union nilfs_bmap_ptr_req *nreq,
				      struct buffer_head *bh)
{
	struct nilfs_btree_node *node;
	struct inode *dat;
	__u64 tmpptr;
	int ncblk;

	/* free resources */
	if (btree->b_ops->bop_clear != NULL)
		btree->b_ops->bop_clear(btree);

	/* ptr must be a pointer to a buffer head. */
	set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));

	/* convert and insert */
	dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL;
	__nilfs_btree_init(btree);
	if (nreq != NULL) {
		nilfs_bmap_commit_alloc_ptr(btree, dreq, dat);
		nilfs_bmap_commit_alloc_ptr(btree, nreq, dat);

		/* create child node at level 1 */
		node = (struct nilfs_btree_node *)bh->b_data;
		ncblk = nilfs_btree_nchildren_per_block(btree);
		nilfs_btree_node_init(node, 0, 1, n, ncblk, keys, ptrs);
		nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr, ncblk);
		if (!buffer_dirty(bh))
			mark_buffer_dirty(bh);
		if (!nilfs_bmap_dirty(btree))
			nilfs_bmap_set_dirty(btree);

		brelse(bh);

		/* create root node at level 2 */
		node = nilfs_btree_get_root(btree);
		tmpptr = nreq->bpr_ptr;
		nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 2, 1,
				      NILFS_BTREE_ROOT_NCHILDREN_MAX,
				      &keys[0], &tmpptr);
	} else {
		nilfs_bmap_commit_alloc_ptr(btree, dreq, dat);

		/* create root node at level 1 */
		node = nilfs_btree_get_root(btree);
		nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 1, n,
				      NILFS_BTREE_ROOT_NCHILDREN_MAX,
				      keys, ptrs);
		nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr,
					NILFS_BTREE_ROOT_NCHILDREN_MAX);
		if (!nilfs_bmap_dirty(btree))
			nilfs_bmap_set_dirty(btree);
	}

	if (NILFS_BMAP_USE_VBN(btree))
		nilfs_bmap_set_target_v(btree, key, dreq->bpr_ptr);
}

/**
 * nilfs_btree_convert_and_insert - convert a bmap to a btree, then insert a key
 * @btree: bmap struct to be converted into a btree
 * @key: key of the new entry to be inserted after conversion
 * @ptr: pointer associated with @key (address of the new block's buffer head)
 * @keys: array of keys of the entries that already exist in the bmap
 * @ptrs: array of pointers associated with @keys
 * @n: number of entries in @keys and @ptrs
 */
1855int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree,
1856				   __u64 key, __u64 ptr,
1857				   const __u64 *keys, const __u64 *ptrs, int n)
1858{
1859	struct buffer_head *bh;
1860	union nilfs_bmap_ptr_req dreq, nreq, *di, *ni;
1861	struct nilfs_bmap_stats stats;
1862	int ret;
1863
1864	if (n + 1 <= NILFS_BTREE_ROOT_NCHILDREN_MAX) {
1865		di = &dreq;
1866		ni = NULL;
1867	} else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX(
1868			   1 << btree->b_inode->i_blkbits)) {
1869		di = &dreq;
1870		ni = &nreq;
1871	} else {
1872		di = NULL;
1873		ni = NULL;
1874		BUG();
1875	}
1876
1877	ret = nilfs_btree_prepare_convert_and_insert(btree, key, di, ni, &bh,
1878						     &stats);
1879	if (ret < 0)
1880		return ret;
1881	nilfs_btree_commit_convert_and_insert(btree, key, ptr, keys, ptrs, n,
1882					      di, ni, bh);
1883	nilfs_inode_add_blocks(btree->b_inode, stats.bs_nblocks);
1884	return 0;
1885}
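
/*
 * Rough illustration of how the conversion above is expected to be driven
 * from the bmap layer (a simplified sketch, not a verbatim copy of the
 * caller; keys[]/ptrs[] must be large enough to hold every entry of the
 * bmap being converted):
 *
 *	n = bmap->b_ops->bop_gather_data(bmap, keys, ptrs, maxkeys);
 *	if (n >= 0)
 *		ret = nilfs_btree_convert_and_insert(bmap, key, ptr,
 *						     keys, ptrs, n);
 */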
1886
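/*
 * nilfs_btree_propagate_p - propagate dirty state for a B-tree whose nodes
 * are addressed by physical block numbers.  Starting from the parent of
 * @level, intermediate node buffers are marked dirty until an already dirty
 * ancestor is found; the root needs no marking because it is stored in the
 * inode itself.
 */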
1887static int nilfs_btree_propagate_p(struct nilfs_bmap *btree,
1888				   struct nilfs_btree_path *path,
1889				   int level,
1890				   struct buffer_head *bh)
1891{
1892	while ((++level < nilfs_btree_height(btree) - 1) &&
1893	       !buffer_dirty(path[level].bp_bh))
1894		mark_buffer_dirty(path[level].bp_bh);
1895
1896	return 0;
1897}
1898
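/*
 * nilfs_btree_prepare_update_v (with its commit and abort counterparts
 * below) reassigns the virtual block address of the node at @level: the
 * current address is read from the parent, the DAT is asked to prepare an
 * update to a new address, and, if the block is cached as a btnode, the
 * cache key is switched to the new address as well.
 */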
1899static int nilfs_btree_prepare_update_v(struct nilfs_bmap *btree,
1900					struct nilfs_btree_path *path,
1901					int level, struct inode *dat)
1902{
1903	struct nilfs_btree_node *parent;
1904	int ncmax, ret;
1905
1906	parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
1907	path[level].bp_oldreq.bpr_ptr =
1908		nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index,
1909					 ncmax);
1910	path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
1911	ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req,
1912				       &path[level].bp_newreq.bpr_req);
1913	if (ret < 0)
1914		return ret;
1915
1916	if (buffer_nilfs_node(path[level].bp_bh)) {
1917		path[level].bp_ctxt.oldkey = path[level].bp_oldreq.bpr_ptr;
1918		path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr;
1919		path[level].bp_ctxt.bh = path[level].bp_bh;
1920		ret = nilfs_btnode_prepare_change_key(
1921			&NILFS_BMAP_I(btree)->i_btnode_cache,
1922			&path[level].bp_ctxt);
1923		if (ret < 0) {
1924			nilfs_dat_abort_update(dat,
1925					       &path[level].bp_oldreq.bpr_req,
1926					       &path[level].bp_newreq.bpr_req);
1927			return ret;
1928		}
1929	}
1930
1931	return 0;
1932}
1933
1934static void nilfs_btree_commit_update_v(struct nilfs_bmap *btree,
1935					struct nilfs_btree_path *path,
1936					int level, struct inode *dat)
1937{
1938	struct nilfs_btree_node *parent;
1939	int ncmax;
1940
1941	nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req,
1942				&path[level].bp_newreq.bpr_req,
1943				btree->b_ptr_type == NILFS_BMAP_PTR_VS);
1944
1945	if (buffer_nilfs_node(path[level].bp_bh)) {
1946		nilfs_btnode_commit_change_key(
1947			&NILFS_BMAP_I(btree)->i_btnode_cache,
1948			&path[level].bp_ctxt);
1949		path[level].bp_bh = path[level].bp_ctxt.bh;
1950	}
1951	set_buffer_nilfs_volatile(path[level].bp_bh);
1952
1953	parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
1954	nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index,
1955				 path[level].bp_newreq.bpr_ptr, ncmax);
1956}
1957
1958static void nilfs_btree_abort_update_v(struct nilfs_bmap *btree,
1959				       struct nilfs_btree_path *path,
1960				       int level, struct inode *dat)
1961{
1962	nilfs_dat_abort_update(dat, &path[level].bp_oldreq.bpr_req,
1963			       &path[level].bp_newreq.bpr_req);
1964	if (buffer_nilfs_node(path[level].bp_bh))
1965		nilfs_btnode_abort_change_key(
1966			&NILFS_BMAP_I(btree)->i_btnode_cache,
1967			&path[level].bp_ctxt);
1968}
1969
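/*
 * nilfs_btree_prepare_propagate_v - prepare virtual address updates for the
 * node at @minlevel (unless its buffer is already volatile) and for its
 * ancestors.  The walk stops below the root or at the first ancestor whose
 * buffer is already dirty; *maxlevelp receives the highest level actually
 * prepared.  On failure, every level prepared so far is aborted.
 */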
1970static int nilfs_btree_prepare_propagate_v(struct nilfs_bmap *btree,
1971					   struct nilfs_btree_path *path,
1972					   int minlevel, int *maxlevelp,
1973					   struct inode *dat)
1974{
1975	int level, ret;
1976
1977	level = minlevel;
1978	if (!buffer_nilfs_volatile(path[level].bp_bh)) {
1979		ret = nilfs_btree_prepare_update_v(btree, path, level, dat);
1980		if (ret < 0)
1981			return ret;
1982	}
1983	while ((++level < nilfs_btree_height(btree) - 1) &&
1984	       !buffer_dirty(path[level].bp_bh)) {
		WARN_ON(buffer_nilfs_volatile(path[level].bp_bh));
1987		ret = nilfs_btree_prepare_update_v(btree, path, level, dat);
1988		if (ret < 0)
1989			goto out;
1990	}
1991
1992	/* success */
1993	*maxlevelp = level - 1;
1994	return 0;
1995
1996	/* error */
1997 out:
1998	while (--level > minlevel)
1999		nilfs_btree_abort_update_v(btree, path, level, dat);
2000	if (!buffer_nilfs_volatile(path[level].bp_bh))
2001		nilfs_btree_abort_update_v(btree, path, level, dat);
2002	return ret;
2003}
2004
2005static void nilfs_btree_commit_propagate_v(struct nilfs_bmap *btree,
2006					   struct nilfs_btree_path *path,
2007					   int minlevel, int maxlevel,
2008					   struct buffer_head *bh,
2009					   struct inode *dat)
2010{
2011	int level;
2012
2013	if (!buffer_nilfs_volatile(path[minlevel].bp_bh))
2014		nilfs_btree_commit_update_v(btree, path, minlevel, dat);
2015
2016	for (level = minlevel + 1; level <= maxlevel; level++)
2017		nilfs_btree_commit_update_v(btree, path, level, dat);
2018}
2019
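/*
 * nilfs_btree_propagate_v - propagate dirty state for a B-tree whose nodes
 * are addressed by virtual block numbers.  Ancestors that are not yet dirty
 * get fresh DAT entries prepared and committed; if the buffer at @level is
 * already volatile (its address has been renewed before), its existing DAT
 * entry is simply marked dirty instead.
 */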
2020static int nilfs_btree_propagate_v(struct nilfs_bmap *btree,
2021				   struct nilfs_btree_path *path,
2022				   int level, struct buffer_head *bh)
2023{
2024	int maxlevel = 0, ret;
2025	struct nilfs_btree_node *parent;
2026	struct inode *dat = nilfs_bmap_get_dat(btree);
2027	__u64 ptr;
2028	int ncmax;
2029
2030	get_bh(bh);
2031	path[level].bp_bh = bh;
2032	ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel,
2033					      dat);
2034	if (ret < 0)
2035		goto out;
2036
2037	if (buffer_nilfs_volatile(path[level].bp_bh)) {
2038		parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
2039		ptr = nilfs_btree_node_get_ptr(parent,
2040					       path[level + 1].bp_index,
2041					       ncmax);
2042		ret = nilfs_dat_mark_dirty(dat, ptr);
2043		if (ret < 0)
2044			goto out;
2045	}
2046
2047	nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh, dat);
2048
2049 out:
2050	brelse(path[level].bp_bh);
2051	path[level].bp_bh = NULL;
2052	return ret;
2053}
2054
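/*
 * nilfs_btree_propagate - bop_propagate operation of the B-tree.  Looks up
 * the path from the root to the node or data block that the dirty buffer
 * @bh belongs to and propagates the dirty state to its ancestors, using the
 * virtual or the physical variant depending on whether the bmap uses
 * virtual block addresses.
 */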
2055static int nilfs_btree_propagate(struct nilfs_bmap *btree,
2056				 struct buffer_head *bh)
2057{
2058	struct nilfs_btree_path *path;
2059	struct nilfs_btree_node *node;
2060	__u64 key;
2061	int level, ret;
2062
2063	WARN_ON(!buffer_dirty(bh));
2064
2065	path = nilfs_btree_alloc_path();
2066	if (path == NULL)
2067		return -ENOMEM;
2068
2069	if (buffer_nilfs_node(bh)) {
2070		node = (struct nilfs_btree_node *)bh->b_data;
2071		key = nilfs_btree_node_get_key(node, 0);
2072		level = nilfs_btree_node_get_level(node);
2073	} else {
2074		key = nilfs_bmap_data_get_key(btree, bh);
2075		level = NILFS_BTREE_LEVEL_DATA;
2076	}
2077
2078	ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0);
2079	if (ret < 0) {
2080		if (unlikely(ret == -ENOENT))
			printk(KERN_CRIT "%s: key = %llu, level = %d\n",
2082			       __func__, (unsigned long long)key, level);
2083		goto out;
2084	}
2085
2086	ret = NILFS_BMAP_USE_VBN(btree) ?
2087		nilfs_btree_propagate_v(btree, path, level, bh) :
2088		nilfs_btree_propagate_p(btree, path, level, bh);
2089
2090 out:
2091	nilfs_btree_free_path(path);
2092
2093	return ret;
2094}
2095
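/*
 * nilfs_btree_propagate_gc - bop_propagate operation used while the bmap is
 * in GC mode; the block is not reallocated, so marking the DAT entry of its
 * virtual address (carried in b_blocknr for GC buffers) dirty is sufficient.
 */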
2096static int nilfs_btree_propagate_gc(struct nilfs_bmap *btree,
2097				    struct buffer_head *bh)
2098{
2099	return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(btree), bh->b_blocknr);
2100}
2101
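/*
 * nilfs_btree_add_dirty_buffer - add a dirty btnode buffer to the list for
 * its level, keeping each per-level list sorted by the first key of the
 * node.  Buffers reporting an out-of-range level are logged and skipped.
 */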
2102static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree,
2103					 struct list_head *lists,
2104					 struct buffer_head *bh)
2105{
2106	struct list_head *head;
2107	struct buffer_head *cbh;
2108	struct nilfs_btree_node *node, *cnode;
2109	__u64 key, ckey;
2110	int level;
2111
2112	get_bh(bh);
2113	node = (struct nilfs_btree_node *)bh->b_data;
2114	key = nilfs_btree_node_get_key(node, 0);
2115	level = nilfs_btree_node_get_level(node);
2116	if (level < NILFS_BTREE_LEVEL_NODE_MIN ||
2117	    level >= NILFS_BTREE_LEVEL_MAX) {
2118		dump_stack();
2119		printk(KERN_WARNING
2120		       "%s: invalid btree level: %d (key=%llu, ino=%lu, "
2121		       "blocknr=%llu)\n",
2122		       __func__, level, (unsigned long long)key,
2123		       NILFS_BMAP_I(btree)->vfs_inode.i_ino,
2124		       (unsigned long long)bh->b_blocknr);
2125		return;
2126	}
2127
2128	list_for_each(head, &lists[level]) {
2129		cbh = list_entry(head, struct buffer_head, b_assoc_buffers);
2130		cnode = (struct nilfs_btree_node *)cbh->b_data;
2131		ckey = nilfs_btree_node_get_key(cnode, 0);
2132		if (key < ckey)
2133			break;
2134	}
2135	list_add_tail(&bh->b_assoc_buffers, head);
2136}
2137
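/*
 * nilfs_btree_lookup_dirty_buffers - collect the dirty btnode buffers of
 * this B-tree.  Dirty pages are scanned in the btnode cache, their buffers
 * are bucketed per node level, and the per-level lists are spliced onto
 * @listp so that lower levels come first.
 */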
2138static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree,
2139					     struct list_head *listp)
2140{
2141	struct address_space *btcache = &NILFS_BMAP_I(btree)->i_btnode_cache;
2142	struct list_head lists[NILFS_BTREE_LEVEL_MAX];
2143	struct pagevec pvec;
2144	struct buffer_head *bh, *head;
2145	pgoff_t index = 0;
2146	int level, i;
2147
2148	for (level = NILFS_BTREE_LEVEL_NODE_MIN;
2149	     level < NILFS_BTREE_LEVEL_MAX;
2150	     level++)
2151		INIT_LIST_HEAD(&lists[level]);
2152
2153	pagevec_init(&pvec, 0);
2154
2155	while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY,
2156				  PAGEVEC_SIZE)) {
2157		for (i = 0; i < pagevec_count(&pvec); i++) {
2158			bh = head = page_buffers(pvec.pages[i]);
2159			do {
2160				if (buffer_dirty(bh))
2161					nilfs_btree_add_dirty_buffer(btree,
2162								     lists, bh);
2163			} while ((bh = bh->b_this_page) != head);
2164		}
2165		pagevec_release(&pvec);
2166		cond_resched();
2167	}
2168
2169	for (level = NILFS_BTREE_LEVEL_NODE_MIN;
2170	     level < NILFS_BTREE_LEVEL_MAX;
2171	     level++)
2172		list_splice_tail(&lists[level], listp);
2173}
2174
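/*
 * nilfs_btree_assign_p - assign a disk block number to a node or data block
 * of a B-tree addressed by physical block numbers.  The pointer in the
 * parent node is rewritten to @blocknr (for btnode buffers the cache key is
 * moved as well), and the key/level pair is stored in @binfo in on-disk
 * format.
 */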
2175static int nilfs_btree_assign_p(struct nilfs_bmap *btree,
2176				struct nilfs_btree_path *path,
2177				int level,
2178				struct buffer_head **bh,
2179				sector_t blocknr,
2180				union nilfs_binfo *binfo)
2181{
2182	struct nilfs_btree_node *parent;
2183	__u64 key;
2184	__u64 ptr;
2185	int ncmax, ret;
2186
2187	parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
2188	ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index,
2189				       ncmax);
2190	if (buffer_nilfs_node(*bh)) {
2191		path[level].bp_ctxt.oldkey = ptr;
2192		path[level].bp_ctxt.newkey = blocknr;
2193		path[level].bp_ctxt.bh = *bh;
2194		ret = nilfs_btnode_prepare_change_key(
2195			&NILFS_BMAP_I(btree)->i_btnode_cache,
2196			&path[level].bp_ctxt);
2197		if (ret < 0)
2198			return ret;
2199		nilfs_btnode_commit_change_key(
2200			&NILFS_BMAP_I(btree)->i_btnode_cache,
2201			&path[level].bp_ctxt);
2202		*bh = path[level].bp_ctxt.bh;
2203	}
2204
2205	nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index, blocknr,
2206				 ncmax);
2207
2208	key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
2209	/* on-disk format */
2210	binfo->bi_dat.bi_blkoff = cpu_to_le64(key);
2211	binfo->bi_dat.bi_level = level;
2212
2213	return 0;
2214}
2215
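/*
 * nilfs_btree_assign_v - assign a disk block number to a node or data block
 * of a B-tree addressed by virtual block numbers.  The DAT entry of the
 * virtual address found in the parent is bound to @blocknr, and the virtual
 * address and key are stored in @binfo in on-disk format.
 */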
2216static int nilfs_btree_assign_v(struct nilfs_bmap *btree,
2217				struct nilfs_btree_path *path,
2218				int level,
2219				struct buffer_head **bh,
2220				sector_t blocknr,
2221				union nilfs_binfo *binfo)
2222{
2223	struct nilfs_btree_node *parent;
2224	struct inode *dat = nilfs_bmap_get_dat(btree);
2225	__u64 key;
2226	__u64 ptr;
2227	union nilfs_bmap_ptr_req req;
2228	int ncmax, ret;
2229
2230	parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
2231	ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index,
2232				       ncmax);
2233	req.bpr_ptr = ptr;
2234	ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
2235	if (ret < 0)
2236		return ret;
2237	nilfs_dat_commit_start(dat, &req.bpr_req, blocknr);
2238
2239	key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
2240	/* on-disk format */
2241	binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr);
2242	binfo->bi_v.bi_blkoff = cpu_to_le64(key);
2243
2244	return 0;
2245}
2246
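/*
 * nilfs_btree_assign - bop_assign operation of the B-tree.  Called when a
 * dirty buffer is about to be written to disk block @blocknr; it looks up
 * the path for the buffer and delegates to the virtual or physical variant
 * above to update the mapping and fill in @binfo.
 */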
2247static int nilfs_btree_assign(struct nilfs_bmap *btree,
2248			      struct buffer_head **bh,
2249			      sector_t blocknr,
2250			      union nilfs_binfo *binfo)
2251{
2252	struct nilfs_btree_path *path;
2253	struct nilfs_btree_node *node;
2254	__u64 key;
2255	int level, ret;
2256
2257	path = nilfs_btree_alloc_path();
2258	if (path == NULL)
2259		return -ENOMEM;
2260
2261	if (buffer_nilfs_node(*bh)) {
2262		node = (struct nilfs_btree_node *)(*bh)->b_data;
2263		key = nilfs_btree_node_get_key(node, 0);
2264		level = nilfs_btree_node_get_level(node);
2265	} else {
2266		key = nilfs_bmap_data_get_key(btree, *bh);
2267		level = NILFS_BTREE_LEVEL_DATA;
2268	}
2269
2270	ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0);
2271	if (ret < 0) {
2272		WARN_ON(ret == -ENOENT);
2273		goto out;
2274	}
2275
2276	ret = NILFS_BMAP_USE_VBN(btree) ?
2277		nilfs_btree_assign_v(btree, path, level, bh, blocknr, binfo) :
2278		nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
2279
2280 out:
2281	nilfs_btree_free_path(path);
2282
2283	return ret;
2284}
2285
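/*
 * nilfs_btree_assign_gc - bop_assign operation used in GC mode.  The block
 * keeps its virtual address; only its DAT entry is moved to point at the
 * new disk block @blocknr, and the virtual address and key are stored in
 * @binfo in on-disk format.
 */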
2286static int nilfs_btree_assign_gc(struct nilfs_bmap *btree,
2287				 struct buffer_head **bh,
2288				 sector_t blocknr,
2289				 union nilfs_binfo *binfo)
2290{
2291	struct nilfs_btree_node *node;
2292	__u64 key;
2293	int ret;
2294
2295	ret = nilfs_dat_move(nilfs_bmap_get_dat(btree), (*bh)->b_blocknr,
2296			     blocknr);
2297	if (ret < 0)
2298		return ret;
2299
2300	if (buffer_nilfs_node(*bh)) {
2301		node = (struct nilfs_btree_node *)(*bh)->b_data;
2302		key = nilfs_btree_node_get_key(node, 0);
2303	} else
2304		key = nilfs_bmap_data_get_key(btree, *bh);
2305
2306	/* on-disk format */
2307	binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr);
2308	binfo->bi_v.bi_blkoff = cpu_to_le64(key);
2309
2310	return 0;
2311}
2312
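/*
 * nilfs_btree_mark - bop_mark operation of the B-tree.  Looks up the block
 * at @level that contains @key and marks its buffer (and the bmap itself)
 * dirty so that it is included in the next writeback.
 */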
2313static int nilfs_btree_mark(struct nilfs_bmap *btree, __u64 key, int level)
2314{
2315	struct buffer_head *bh;
2316	struct nilfs_btree_path *path;
2317	__u64 ptr;
2318	int ret;
2319
2320	path = nilfs_btree_alloc_path();
2321	if (path == NULL)
2322		return -ENOMEM;
2323
2324	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1, 0);
2325	if (ret < 0) {
2326		WARN_ON(ret == -ENOENT);
2327		goto out;
2328	}
2329	ret = nilfs_btree_get_block(btree, ptr, &bh);
2330	if (ret < 0) {
2331		WARN_ON(ret == -ENOENT);
2332		goto out;
2333	}
2334
2335	if (!buffer_dirty(bh))
2336		mark_buffer_dirty(bh);
2337	brelse(bh);
2338	if (!nilfs_bmap_dirty(btree))
2339		nilfs_bmap_set_dirty(btree);
2340
2341 out:
2342	nilfs_btree_free_path(path);
2343	return ret;
2344}
2345
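/*
 * Operation tables: nilfs_btree_ops is installed for regular B-tree bmaps,
 * while nilfs_btree_ops_gc limits the bmap to the propagate, dirty-buffer
 * lookup and assign operations needed while the garbage collector is moving
 * blocks.
 */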
2346static const struct nilfs_bmap_operations nilfs_btree_ops = {
2347	.bop_lookup		=	nilfs_btree_lookup,
2348	.bop_lookup_contig	=	nilfs_btree_lookup_contig,
2349	.bop_insert		=	nilfs_btree_insert,
2350	.bop_delete		=	nilfs_btree_delete,
2351	.bop_clear		=	NULL,
2352
2353	.bop_propagate		=	nilfs_btree_propagate,
2354
2355	.bop_lookup_dirty_buffers =	nilfs_btree_lookup_dirty_buffers,
2356
2357	.bop_assign		=	nilfs_btree_assign,
2358	.bop_mark		=	nilfs_btree_mark,
2359
2360	.bop_seek_key		=	nilfs_btree_seek_key,
2361	.bop_last_key		=	nilfs_btree_last_key,
2362
2363	.bop_check_insert	=	NULL,
2364	.bop_check_delete	=	nilfs_btree_check_delete,
2365	.bop_gather_data	=	nilfs_btree_gather_data,
2366};
2367
2368static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
2369	.bop_lookup		=	NULL,
2370	.bop_lookup_contig	=	NULL,
2371	.bop_insert		=	NULL,
2372	.bop_delete		=	NULL,
2373	.bop_clear		=	NULL,
2374
2375	.bop_propagate		=	nilfs_btree_propagate_gc,
2376
2377	.bop_lookup_dirty_buffers =	nilfs_btree_lookup_dirty_buffers,
2378
2379	.bop_assign		=	nilfs_btree_assign_gc,
2380	.bop_mark		=	NULL,
2381
2382	.bop_seek_key		=	NULL,
2383	.bop_last_key		=	NULL,
2384
2385	.bop_check_insert	=	NULL,
2386	.bop_check_delete	=	NULL,
2387	.bop_gather_data	=	NULL,
2388};
2389
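/*
 * Initialization helpers: __nilfs_btree_init() installs the regular
 * operation table and the per-block fanout; nilfs_btree_init() additionally
 * sanity-checks the root node read from disk, and nilfs_btree_init_gc()
 * switches the bmap to the restricted GC operation table.
 */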
2390static void __nilfs_btree_init(struct nilfs_bmap *bmap)
2391{
2392	bmap->b_ops = &nilfs_btree_ops;
2393	bmap->b_nchildren_per_block =
2394		NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap));
2395}
2396
2397int nilfs_btree_init(struct nilfs_bmap *bmap)
2398{
2399	int ret = 0;
2400
2401	__nilfs_btree_init(bmap);
2402
2403	if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap),
2404				    bmap->b_inode->i_ino))
2405		ret = -EIO;
2406	return ret;
2407}
2408
2409void nilfs_btree_init_gc(struct nilfs_bmap *bmap)
2410{
2411	bmap->b_ops = &nilfs_btree_ops_gc;
2412	bmap->b_nchildren_per_block =
2413		NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap));
2414}
2415