1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_shared.h"
21#include "xfs_format.h"
22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h"
24#include "xfs_bit.h"
25#include "xfs_mount.h"
26#include "xfs_inode.h"
27#include "xfs_trans.h"
28#include "xfs_inode_item.h"
29#include "xfs_buf_item.h"
30#include "xfs_btree.h"
31#include "xfs_error.h"
32#include "xfs_trace.h"
33#include "xfs_cksum.h"
34#include "xfs_alloc.h"
35
36/*
37 * Cursor allocation zone.
38 */
39kmem_zone_t	*xfs_btree_cur_zone;
40
41/*
42 * Btree magic numbers.
43 */
44static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
45	{ XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
46	  XFS_FIBT_MAGIC },
47	{ XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC,
48	  XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC }
49};
50#define xfs_btree_magic(cur) \
51	xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
52
53
54STATIC int				/* error (0 or EFSCORRUPTED) */
55xfs_btree_check_lblock(
56	struct xfs_btree_cur	*cur,	/* btree cursor */
57	struct xfs_btree_block	*block,	/* btree long form block pointer */
58	int			level,	/* level of the btree block */
59	struct xfs_buf		*bp)	/* buffer for block, if any */
60{
61	int			lblock_ok = 1; /* block passes checks */
62	struct xfs_mount	*mp;	/* file system mount point */
63
64	mp = cur->bc_mp;
65
66	if (xfs_sb_version_hascrc(&mp->m_sb)) {
67		lblock_ok = lblock_ok &&
68			uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) &&
69			block->bb_u.l.bb_blkno == cpu_to_be64(
70				bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
71	}
72
73	lblock_ok = lblock_ok &&
74		be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
75		be16_to_cpu(block->bb_level) == level &&
76		be16_to_cpu(block->bb_numrecs) <=
77			cur->bc_ops->get_maxrecs(cur, level) &&
78		block->bb_u.l.bb_leftsib &&
79		(block->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK) ||
80		 XFS_FSB_SANITY_CHECK(mp,
81			be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
82		block->bb_u.l.bb_rightsib &&
83		(block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK) ||
84		 XFS_FSB_SANITY_CHECK(mp,
85			be64_to_cpu(block->bb_u.l.bb_rightsib)));
86
87	if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
88			XFS_ERRTAG_BTREE_CHECK_LBLOCK,
89			XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
90		if (bp)
91			trace_xfs_btree_corrupt(bp, _RET_IP_);
92		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
93		return -EFSCORRUPTED;
94	}
95	return 0;
96}
97
98STATIC int				/* error (0 or EFSCORRUPTED) */
99xfs_btree_check_sblock(
100	struct xfs_btree_cur	*cur,	/* btree cursor */
101	struct xfs_btree_block	*block,	/* btree short form block pointer */
102	int			level,	/* level of the btree block */
103	struct xfs_buf		*bp)	/* buffer containing block */
104{
105	struct xfs_mount	*mp;	/* file system mount point */
106	struct xfs_buf		*agbp;	/* buffer for ag. freespace struct */
107	struct xfs_agf		*agf;	/* ag. freespace structure */
108	xfs_agblock_t		agflen;	/* native ag. freespace length */
109	int			sblock_ok = 1; /* block passes checks */
110
111	mp = cur->bc_mp;
112	agbp = cur->bc_private.a.agbp;
113	agf = XFS_BUF_TO_AGF(agbp);
114	agflen = be32_to_cpu(agf->agf_length);
115
116	if (xfs_sb_version_hascrc(&mp->m_sb)) {
117		sblock_ok = sblock_ok &&
118			uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) &&
119			block->bb_u.s.bb_blkno == cpu_to_be64(
120				bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
121	}
122
123	sblock_ok = sblock_ok &&
124		be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
125		be16_to_cpu(block->bb_level) == level &&
126		be16_to_cpu(block->bb_numrecs) <=
127			cur->bc_ops->get_maxrecs(cur, level) &&
128		(block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
129		 be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
130		block->bb_u.s.bb_leftsib &&
131		(block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
132		 be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
133		block->bb_u.s.bb_rightsib;
134
135	if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp,
136			XFS_ERRTAG_BTREE_CHECK_SBLOCK,
137			XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
138		if (bp)
139			trace_xfs_btree_corrupt(bp, _RET_IP_);
140		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
141		return -EFSCORRUPTED;
142	}
143	return 0;
144}
145
146/*
147 * Debug routine: check that block header is ok.
148 */
149int
150xfs_btree_check_block(
151	struct xfs_btree_cur	*cur,	/* btree cursor */
152	struct xfs_btree_block	*block,	/* generic btree block pointer */
153	int			level,	/* level of the btree block */
154	struct xfs_buf		*bp)	/* buffer containing block, if any */
155{
156	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
157		return xfs_btree_check_lblock(cur, block, level, bp);
158	else
159		return xfs_btree_check_sblock(cur, block, level, bp);
160}
161
162/*
163 * Check that (long) pointer is ok.
164 */
165int					/* error (0 or EFSCORRUPTED) */
166xfs_btree_check_lptr(
167	struct xfs_btree_cur	*cur,	/* btree cursor */
168	xfs_fsblock_t		bno,	/* btree block disk address */
169	int			level)	/* btree block level */
170{
171	XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
172		level > 0 &&
173		bno != NULLFSBLOCK &&
174		XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
175	return 0;
176}
177
178#ifdef DEBUG
179/*
180 * Check that (short) pointer is ok.
181 */
182STATIC int				/* error (0 or EFSCORRUPTED) */
183xfs_btree_check_sptr(
184	struct xfs_btree_cur	*cur,	/* btree cursor */
185	xfs_agblock_t		bno,	/* btree block disk address */
186	int			level)	/* btree block level */
187{
188	xfs_agblock_t		agblocks = cur->bc_mp->m_sb.sb_agblocks;
189
190	XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
191		level > 0 &&
192		bno != NULLAGBLOCK &&
193		bno != 0 &&
194		bno < agblocks);
195	return 0;
196}
197
198/*
199 * Check that block ptr is ok.
200 */
201STATIC int				/* error (0 or EFSCORRUPTED) */
202xfs_btree_check_ptr(
203	struct xfs_btree_cur	*cur,	/* btree cursor */
204	union xfs_btree_ptr	*ptr,	/* btree block disk address */
205	int			index,	/* offset from ptr to check */
206	int			level)	/* btree block level */
207{
208	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
209		return xfs_btree_check_lptr(cur,
210				be64_to_cpu((&ptr->l)[index]), level);
211	} else {
212		return xfs_btree_check_sptr(cur,
213				be32_to_cpu((&ptr->s)[index]), level);
214	}
215}
216#endif
217
218/*
219 * Calculate CRC on the whole btree block and stuff it into the
220 * long-form btree header.
221 *
222 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
223 * it into the buffer so recovery knows what the last modifcation was that made
224 * it to disk.
225 */
226void
227xfs_btree_lblock_calc_crc(
228	struct xfs_buf		*bp)
229{
230	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
231	struct xfs_buf_log_item	*bip = bp->b_fspriv;
232
233	if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
234		return;
235	if (bip)
236		block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
237	xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
238}
239
240bool
241xfs_btree_lblock_verify_crc(
242	struct xfs_buf		*bp)
243{
244	if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
245		return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
246
247	return true;
248}
249
250/*
251 * Calculate CRC on the whole btree block and stuff it into the
252 * short-form btree header.
253 *
254 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
255 * it into the buffer so recovery knows what the last modifcation was that made
256 * it to disk.
257 */
258void
259xfs_btree_sblock_calc_crc(
260	struct xfs_buf		*bp)
261{
262	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
263	struct xfs_buf_log_item	*bip = bp->b_fspriv;
264
265	if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
266		return;
267	if (bip)
268		block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
269	xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
270}
271
272bool
273xfs_btree_sblock_verify_crc(
274	struct xfs_buf		*bp)
275{
276	if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
277		return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
278
279	return true;
280}
281
282/*
283 * Delete the btree cursor.
284 */
285void
286xfs_btree_del_cursor(
287	xfs_btree_cur_t	*cur,		/* btree cursor */
288	int		error)		/* del because of error */
289{
290	int		i;		/* btree level */
291
292	/*
293	 * Clear the buffer pointers, and release the buffers.
294	 * If we're doing this in the face of an error, we
295	 * need to make sure to inspect all of the entries
296	 * in the bc_bufs array for buffers to be unlocked.
297	 * This is because some of the btree code works from
298	 * level n down to 0, and if we get an error along
299	 * the way we won't have initialized all the entries
300	 * down to 0.
301	 */
302	for (i = 0; i < cur->bc_nlevels; i++) {
303		if (cur->bc_bufs[i])
304			xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
305		else if (!error)
306			break;
307	}
308	/*
309	 * Can't free a bmap cursor without having dealt with the
310	 * allocated indirect blocks' accounting.
311	 */
312	ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
313	       cur->bc_private.b.allocated == 0);
314	/*
315	 * Free the cursor.
316	 */
317	kmem_zone_free(xfs_btree_cur_zone, cur);
318}
319
320/*
321 * Duplicate the btree cursor.
322 * Allocate a new one, copy the record, re-get the buffers.
323 */
324int					/* error */
325xfs_btree_dup_cursor(
326	xfs_btree_cur_t	*cur,		/* input cursor */
327	xfs_btree_cur_t	**ncur)		/* output cursor */
328{
329	xfs_buf_t	*bp;		/* btree block's buffer pointer */
330	int		error;		/* error return value */
331	int		i;		/* level number of btree block */
332	xfs_mount_t	*mp;		/* mount structure for filesystem */
333	xfs_btree_cur_t	*new;		/* new cursor value */
334	xfs_trans_t	*tp;		/* transaction pointer, can be NULL */
335
336	tp = cur->bc_tp;
337	mp = cur->bc_mp;
338
339	/*
340	 * Allocate a new cursor like the old one.
341	 */
342	new = cur->bc_ops->dup_cursor(cur);
343
344	/*
345	 * Copy the record currently in the cursor.
346	 */
347	new->bc_rec = cur->bc_rec;
348
349	/*
350	 * For each level current, re-get the buffer and copy the ptr value.
351	 */
352	for (i = 0; i < new->bc_nlevels; i++) {
353		new->bc_ptrs[i] = cur->bc_ptrs[i];
354		new->bc_ra[i] = cur->bc_ra[i];
355		bp = cur->bc_bufs[i];
356		if (bp) {
357			error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
358						   XFS_BUF_ADDR(bp), mp->m_bsize,
359						   0, &bp,
360						   cur->bc_ops->buf_ops);
361			if (error) {
362				xfs_btree_del_cursor(new, error);
363				*ncur = NULL;
364				return error;
365			}
366		}
367		new->bc_bufs[i] = bp;
368	}
369	*ncur = new;
370	return 0;
371}
372
373/*
374 * XFS btree block layout and addressing:
375 *
376 * There are two types of blocks in the btree: leaf and non-leaf blocks.
377 *
378 * The leaf record start with a header then followed by records containing
379 * the values.  A non-leaf block also starts with the same header, and
380 * then first contains lookup keys followed by an equal number of pointers
381 * to the btree blocks at the previous level.
382 *
383 *		+--------+-------+-------+-------+-------+-------+-------+
384 * Leaf:	| header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
385 *		+--------+-------+-------+-------+-------+-------+-------+
386 *
387 *		+--------+-------+-------+-------+-------+-------+-------+
388 * Non-Leaf:	| header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
389 *		+--------+-------+-------+-------+-------+-------+-------+
390 *
391 * The header is called struct xfs_btree_block for reasons better left unknown
392 * and comes in different versions for short (32bit) and long (64bit) block
393 * pointers.  The record and key structures are defined by the btree instances
394 * and opaque to the btree core.  The block pointers are simple disk endian
395 * integers, available in a short (32bit) and long (64bit) variant.
396 *
397 * The helpers below calculate the offset of a given record, key or pointer
398 * into a btree block (xfs_btree_*_offset) or return a pointer to the given
399 * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
400 * inside the btree block is done using indices starting at one, not zero!
401 */
402
403/*
404 * Return size of the btree block header for this btree instance.
405 */
406static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
407{
408	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
409		if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
410			return XFS_BTREE_LBLOCK_CRC_LEN;
411		return XFS_BTREE_LBLOCK_LEN;
412	}
413	if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
414		return XFS_BTREE_SBLOCK_CRC_LEN;
415	return XFS_BTREE_SBLOCK_LEN;
416}
417
418/*
419 * Return size of btree block pointers for this btree instance.
420 */
421static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
422{
423	return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
424		sizeof(__be64) : sizeof(__be32);
425}
426
427/*
428 * Calculate offset of the n-th record in a btree block.
429 */
430STATIC size_t
431xfs_btree_rec_offset(
432	struct xfs_btree_cur	*cur,
433	int			n)
434{
435	return xfs_btree_block_len(cur) +
436		(n - 1) * cur->bc_ops->rec_len;
437}
438
439/*
440 * Calculate offset of the n-th key in a btree block.
441 */
442STATIC size_t
443xfs_btree_key_offset(
444	struct xfs_btree_cur	*cur,
445	int			n)
446{
447	return xfs_btree_block_len(cur) +
448		(n - 1) * cur->bc_ops->key_len;
449}
450
451/*
452 * Calculate offset of the n-th block pointer in a btree block.
453 */
454STATIC size_t
455xfs_btree_ptr_offset(
456	struct xfs_btree_cur	*cur,
457	int			n,
458	int			level)
459{
460	return xfs_btree_block_len(cur) +
461		cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
462		(n - 1) * xfs_btree_ptr_len(cur);
463}
464
465/*
466 * Return a pointer to the n-th record in the btree block.
467 */
468STATIC union xfs_btree_rec *
469xfs_btree_rec_addr(
470	struct xfs_btree_cur	*cur,
471	int			n,
472	struct xfs_btree_block	*block)
473{
474	return (union xfs_btree_rec *)
475		((char *)block + xfs_btree_rec_offset(cur, n));
476}
477
478/*
479 * Return a pointer to the n-th key in the btree block.
480 */
481STATIC union xfs_btree_key *
482xfs_btree_key_addr(
483	struct xfs_btree_cur	*cur,
484	int			n,
485	struct xfs_btree_block	*block)
486{
487	return (union xfs_btree_key *)
488		((char *)block + xfs_btree_key_offset(cur, n));
489}
490
491/*
492 * Return a pointer to the n-th block pointer in the btree block.
493 */
494STATIC union xfs_btree_ptr *
495xfs_btree_ptr_addr(
496	struct xfs_btree_cur	*cur,
497	int			n,
498	struct xfs_btree_block	*block)
499{
500	int			level = xfs_btree_get_level(block);
501
502	ASSERT(block->bb_level != 0);
503
504	return (union xfs_btree_ptr *)
505		((char *)block + xfs_btree_ptr_offset(cur, n, level));
506}
507
508/*
509 * Get the root block which is stored in the inode.
510 *
511 * For now this btree implementation assumes the btree root is always
512 * stored in the if_broot field of an inode fork.
513 */
514STATIC struct xfs_btree_block *
515xfs_btree_get_iroot(
516       struct xfs_btree_cur    *cur)
517{
518       struct xfs_ifork        *ifp;
519
520       ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
521       return (struct xfs_btree_block *)ifp->if_broot;
522}
523
524/*
525 * Retrieve the block pointer from the cursor at the given level.
526 * This may be an inode btree root or from a buffer.
527 */
528STATIC struct xfs_btree_block *		/* generic btree block pointer */
529xfs_btree_get_block(
530	struct xfs_btree_cur	*cur,	/* btree cursor */
531	int			level,	/* level in btree */
532	struct xfs_buf		**bpp)	/* buffer containing the block */
533{
534	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
535	    (level == cur->bc_nlevels - 1)) {
536		*bpp = NULL;
537		return xfs_btree_get_iroot(cur);
538	}
539
540	*bpp = cur->bc_bufs[level];
541	return XFS_BUF_TO_BLOCK(*bpp);
542}
543
544/*
545 * Get a buffer for the block, return it with no data read.
546 * Long-form addressing.
547 */
548xfs_buf_t *				/* buffer for fsbno */
549xfs_btree_get_bufl(
550	xfs_mount_t	*mp,		/* file system mount point */
551	xfs_trans_t	*tp,		/* transaction pointer */
552	xfs_fsblock_t	fsbno,		/* file system block number */
553	uint		lock)		/* lock flags for get_buf */
554{
555	xfs_daddr_t		d;		/* real disk block address */
556
557	ASSERT(fsbno != NULLFSBLOCK);
558	d = XFS_FSB_TO_DADDR(mp, fsbno);
559	return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
560}
561
562/*
563 * Get a buffer for the block, return it with no data read.
564 * Short-form addressing.
565 */
566xfs_buf_t *				/* buffer for agno/agbno */
567xfs_btree_get_bufs(
568	xfs_mount_t	*mp,		/* file system mount point */
569	xfs_trans_t	*tp,		/* transaction pointer */
570	xfs_agnumber_t	agno,		/* allocation group number */
571	xfs_agblock_t	agbno,		/* allocation group block number */
572	uint		lock)		/* lock flags for get_buf */
573{
574	xfs_daddr_t		d;		/* real disk block address */
575
576	ASSERT(agno != NULLAGNUMBER);
577	ASSERT(agbno != NULLAGBLOCK);
578	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
579	return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
580}
581
582/*
583 * Check for the cursor referring to the last block at the given level.
584 */
585int					/* 1=is last block, 0=not last block */
586xfs_btree_islastblock(
587	xfs_btree_cur_t		*cur,	/* btree cursor */
588	int			level)	/* level to check */
589{
590	struct xfs_btree_block	*block;	/* generic btree block pointer */
591	xfs_buf_t		*bp;	/* buffer containing block */
592
593	block = xfs_btree_get_block(cur, level, &bp);
594	xfs_btree_check_block(cur, block, level, bp);
595	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
596		return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
597	else
598		return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
599}
600
601/*
602 * Change the cursor to point to the first record at the given level.
603 * Other levels are unaffected.
604 */
605STATIC int				/* success=1, failure=0 */
606xfs_btree_firstrec(
607	xfs_btree_cur_t		*cur,	/* btree cursor */
608	int			level)	/* level to change */
609{
610	struct xfs_btree_block	*block;	/* generic btree block pointer */
611	xfs_buf_t		*bp;	/* buffer containing block */
612
613	/*
614	 * Get the block pointer for this level.
615	 */
616	block = xfs_btree_get_block(cur, level, &bp);
617	xfs_btree_check_block(cur, block, level, bp);
618	/*
619	 * It's empty, there is no such record.
620	 */
621	if (!block->bb_numrecs)
622		return 0;
623	/*
624	 * Set the ptr value to 1, that's the first record/key.
625	 */
626	cur->bc_ptrs[level] = 1;
627	return 1;
628}
629
630/*
631 * Change the cursor to point to the last record in the current block
632 * at the given level.  Other levels are unaffected.
633 */
634STATIC int				/* success=1, failure=0 */
635xfs_btree_lastrec(
636	xfs_btree_cur_t		*cur,	/* btree cursor */
637	int			level)	/* level to change */
638{
639	struct xfs_btree_block	*block;	/* generic btree block pointer */
640	xfs_buf_t		*bp;	/* buffer containing block */
641
642	/*
643	 * Get the block pointer for this level.
644	 */
645	block = xfs_btree_get_block(cur, level, &bp);
646	xfs_btree_check_block(cur, block, level, bp);
647	/*
648	 * It's empty, there is no such record.
649	 */
650	if (!block->bb_numrecs)
651		return 0;
652	/*
653	 * Set the ptr value to numrecs, that's the last record/key.
654	 */
655	cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
656	return 1;
657}
658
659/*
660 * Compute first and last byte offsets for the fields given.
661 * Interprets the offsets table, which contains struct field offsets.
662 */
663void
664xfs_btree_offsets(
665	__int64_t	fields,		/* bitmask of fields */
666	const short	*offsets,	/* table of field offsets */
667	int		nbits,		/* number of bits to inspect */
668	int		*first,		/* output: first byte offset */
669	int		*last)		/* output: last byte offset */
670{
671	int		i;		/* current bit number */
672	__int64_t	imask;		/* mask for current bit number */
673
674	ASSERT(fields != 0);
675	/*
676	 * Find the lowest bit, so the first byte offset.
677	 */
678	for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
679		if (imask & fields) {
680			*first = offsets[i];
681			break;
682		}
683	}
684	/*
685	 * Find the highest bit, so the last byte offset.
686	 */
687	for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
688		if (imask & fields) {
689			*last = offsets[i + 1] - 1;
690			break;
691		}
692	}
693}
694
695/*
696 * Get a buffer for the block, return it read in.
697 * Long-form addressing.
698 */
699int
700xfs_btree_read_bufl(
701	struct xfs_mount	*mp,		/* file system mount point */
702	struct xfs_trans	*tp,		/* transaction pointer */
703	xfs_fsblock_t		fsbno,		/* file system block number */
704	uint			lock,		/* lock flags for read_buf */
705	struct xfs_buf		**bpp,		/* buffer for fsbno */
706	int			refval,		/* ref count value for buffer */
707	const struct xfs_buf_ops *ops)
708{
709	struct xfs_buf		*bp;		/* return value */
710	xfs_daddr_t		d;		/* real disk block address */
711	int			error;
712
713	ASSERT(fsbno != NULLFSBLOCK);
714	d = XFS_FSB_TO_DADDR(mp, fsbno);
715	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
716				   mp->m_bsize, lock, &bp, ops);
717	if (error)
718		return error;
719	if (bp)
720		xfs_buf_set_ref(bp, refval);
721	*bpp = bp;
722	return 0;
723}
724
725/*
726 * Read-ahead the block, don't wait for it, don't return a buffer.
727 * Long-form addressing.
728 */
729/* ARGSUSED */
730void
731xfs_btree_reada_bufl(
732	struct xfs_mount	*mp,		/* file system mount point */
733	xfs_fsblock_t		fsbno,		/* file system block number */
734	xfs_extlen_t		count,		/* count of filesystem blocks */
735	const struct xfs_buf_ops *ops)
736{
737	xfs_daddr_t		d;
738
739	ASSERT(fsbno != NULLFSBLOCK);
740	d = XFS_FSB_TO_DADDR(mp, fsbno);
741	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
742}
743
744/*
745 * Read-ahead the block, don't wait for it, don't return a buffer.
746 * Short-form addressing.
747 */
748/* ARGSUSED */
749void
750xfs_btree_reada_bufs(
751	struct xfs_mount	*mp,		/* file system mount point */
752	xfs_agnumber_t		agno,		/* allocation group number */
753	xfs_agblock_t		agbno,		/* allocation group block number */
754	xfs_extlen_t		count,		/* count of filesystem blocks */
755	const struct xfs_buf_ops *ops)
756{
757	xfs_daddr_t		d;
758
759	ASSERT(agno != NULLAGNUMBER);
760	ASSERT(agbno != NULLAGBLOCK);
761	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
762	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
763}
764
765STATIC int
766xfs_btree_readahead_lblock(
767	struct xfs_btree_cur	*cur,
768	int			lr,
769	struct xfs_btree_block	*block)
770{
771	int			rval = 0;
772	xfs_fsblock_t		left = be64_to_cpu(block->bb_u.l.bb_leftsib);
773	xfs_fsblock_t		right = be64_to_cpu(block->bb_u.l.bb_rightsib);
774
775	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
776		xfs_btree_reada_bufl(cur->bc_mp, left, 1,
777				     cur->bc_ops->buf_ops);
778		rval++;
779	}
780
781	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
782		xfs_btree_reada_bufl(cur->bc_mp, right, 1,
783				     cur->bc_ops->buf_ops);
784		rval++;
785	}
786
787	return rval;
788}
789
790STATIC int
791xfs_btree_readahead_sblock(
792	struct xfs_btree_cur	*cur,
793	int			lr,
794	struct xfs_btree_block *block)
795{
796	int			rval = 0;
797	xfs_agblock_t		left = be32_to_cpu(block->bb_u.s.bb_leftsib);
798	xfs_agblock_t		right = be32_to_cpu(block->bb_u.s.bb_rightsib);
799
800
801	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
802		xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
803				     left, 1, cur->bc_ops->buf_ops);
804		rval++;
805	}
806
807	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
808		xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
809				     right, 1, cur->bc_ops->buf_ops);
810		rval++;
811	}
812
813	return rval;
814}
815
816/*
817 * Read-ahead btree blocks, at the given level.
818 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
819 */
820STATIC int
821xfs_btree_readahead(
822	struct xfs_btree_cur	*cur,		/* btree cursor */
823	int			lev,		/* level in btree */
824	int			lr)		/* left/right bits */
825{
826	struct xfs_btree_block	*block;
827
828	/*
829	 * No readahead needed if we are at the root level and the
830	 * btree root is stored in the inode.
831	 */
832	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
833	    (lev == cur->bc_nlevels - 1))
834		return 0;
835
836	if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
837		return 0;
838
839	cur->bc_ra[lev] |= lr;
840	block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
841
842	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
843		return xfs_btree_readahead_lblock(cur, lr, block);
844	return xfs_btree_readahead_sblock(cur, lr, block);
845}
846
847STATIC xfs_daddr_t
848xfs_btree_ptr_to_daddr(
849	struct xfs_btree_cur	*cur,
850	union xfs_btree_ptr	*ptr)
851{
852	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
853		ASSERT(ptr->l != cpu_to_be64(NULLFSBLOCK));
854
855		return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
856	} else {
857		ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
858		ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
859
860		return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
861					be32_to_cpu(ptr->s));
862	}
863}
864
865/*
866 * Readahead @count btree blocks at the given @ptr location.
867 *
868 * We don't need to care about long or short form btrees here as we have a
869 * method of converting the ptr directly to a daddr available to us.
870 */
871STATIC void
872xfs_btree_readahead_ptr(
873	struct xfs_btree_cur	*cur,
874	union xfs_btree_ptr	*ptr,
875	xfs_extlen_t		count)
876{
877	xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
878			  xfs_btree_ptr_to_daddr(cur, ptr),
879			  cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
880}
881
882/*
883 * Set the buffer for level "lev" in the cursor to bp, releasing
884 * any previous buffer.
885 */
886STATIC void
887xfs_btree_setbuf(
888	xfs_btree_cur_t		*cur,	/* btree cursor */
889	int			lev,	/* level in btree */
890	xfs_buf_t		*bp)	/* new buffer to set */
891{
892	struct xfs_btree_block	*b;	/* btree block */
893
894	if (cur->bc_bufs[lev])
895		xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
896	cur->bc_bufs[lev] = bp;
897	cur->bc_ra[lev] = 0;
898
899	b = XFS_BUF_TO_BLOCK(bp);
900	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
901		if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
902			cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
903		if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
904			cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
905	} else {
906		if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
907			cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
908		if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
909			cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
910	}
911}
912
913STATIC int
914xfs_btree_ptr_is_null(
915	struct xfs_btree_cur	*cur,
916	union xfs_btree_ptr	*ptr)
917{
918	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
919		return ptr->l == cpu_to_be64(NULLFSBLOCK);
920	else
921		return ptr->s == cpu_to_be32(NULLAGBLOCK);
922}
923
924STATIC void
925xfs_btree_set_ptr_null(
926	struct xfs_btree_cur	*cur,
927	union xfs_btree_ptr	*ptr)
928{
929	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
930		ptr->l = cpu_to_be64(NULLFSBLOCK);
931	else
932		ptr->s = cpu_to_be32(NULLAGBLOCK);
933}
934
935/*
936 * Get/set/init sibling pointers
937 */
938STATIC void
939xfs_btree_get_sibling(
940	struct xfs_btree_cur	*cur,
941	struct xfs_btree_block	*block,
942	union xfs_btree_ptr	*ptr,
943	int			lr)
944{
945	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
946
947	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
948		if (lr == XFS_BB_RIGHTSIB)
949			ptr->l = block->bb_u.l.bb_rightsib;
950		else
951			ptr->l = block->bb_u.l.bb_leftsib;
952	} else {
953		if (lr == XFS_BB_RIGHTSIB)
954			ptr->s = block->bb_u.s.bb_rightsib;
955		else
956			ptr->s = block->bb_u.s.bb_leftsib;
957	}
958}
959
960STATIC void
961xfs_btree_set_sibling(
962	struct xfs_btree_cur	*cur,
963	struct xfs_btree_block	*block,
964	union xfs_btree_ptr	*ptr,
965	int			lr)
966{
967	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
968
969	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
970		if (lr == XFS_BB_RIGHTSIB)
971			block->bb_u.l.bb_rightsib = ptr->l;
972		else
973			block->bb_u.l.bb_leftsib = ptr->l;
974	} else {
975		if (lr == XFS_BB_RIGHTSIB)
976			block->bb_u.s.bb_rightsib = ptr->s;
977		else
978			block->bb_u.s.bb_leftsib = ptr->s;
979	}
980}
981
982void
983xfs_btree_init_block_int(
984	struct xfs_mount	*mp,
985	struct xfs_btree_block	*buf,
986	xfs_daddr_t		blkno,
987	__u32			magic,
988	__u16			level,
989	__u16			numrecs,
990	__u64			owner,
991	unsigned int		flags)
992{
993	buf->bb_magic = cpu_to_be32(magic);
994	buf->bb_level = cpu_to_be16(level);
995	buf->bb_numrecs = cpu_to_be16(numrecs);
996
997	if (flags & XFS_BTREE_LONG_PTRS) {
998		buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
999		buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
1000		if (flags & XFS_BTREE_CRC_BLOCKS) {
1001			buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
1002			buf->bb_u.l.bb_owner = cpu_to_be64(owner);
1003			uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid);
1004			buf->bb_u.l.bb_pad = 0;
1005			buf->bb_u.l.bb_lsn = 0;
1006		}
1007	} else {
1008		/* owner is a 32 bit value on short blocks */
1009		__u32 __owner = (__u32)owner;
1010
1011		buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1012		buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1013		if (flags & XFS_BTREE_CRC_BLOCKS) {
1014			buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
1015			buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
1016			uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid);
1017			buf->bb_u.s.bb_lsn = 0;
1018		}
1019	}
1020}
1021
1022void
1023xfs_btree_init_block(
1024	struct xfs_mount *mp,
1025	struct xfs_buf	*bp,
1026	__u32		magic,
1027	__u16		level,
1028	__u16		numrecs,
1029	__u64		owner,
1030	unsigned int	flags)
1031{
1032	xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1033				 magic, level, numrecs, owner, flags);
1034}
1035
1036STATIC void
1037xfs_btree_init_block_cur(
1038	struct xfs_btree_cur	*cur,
1039	struct xfs_buf		*bp,
1040	int			level,
1041	int			numrecs)
1042{
1043	__u64 owner;
1044
1045	/*
1046	 * we can pull the owner from the cursor right now as the different
1047	 * owners align directly with the pointer size of the btree. This may
1048	 * change in future, but is safe for current users of the generic btree
1049	 * code.
1050	 */
1051	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1052		owner = cur->bc_private.b.ip->i_ino;
1053	else
1054		owner = cur->bc_private.a.agno;
1055
1056	xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1057				 xfs_btree_magic(cur), level, numrecs,
1058				 owner, cur->bc_flags);
1059}
1060
1061/*
1062 * Return true if ptr is the last record in the btree and
1063 * we need to track updates to this record.  The decision
1064 * will be further refined in the update_lastrec method.
1065 */
1066STATIC int
1067xfs_btree_is_lastrec(
1068	struct xfs_btree_cur	*cur,
1069	struct xfs_btree_block	*block,
1070	int			level)
1071{
1072	union xfs_btree_ptr	ptr;
1073
1074	if (level > 0)
1075		return 0;
1076	if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
1077		return 0;
1078
1079	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1080	if (!xfs_btree_ptr_is_null(cur, &ptr))
1081		return 0;
1082	return 1;
1083}
1084
1085STATIC void
1086xfs_btree_buf_to_ptr(
1087	struct xfs_btree_cur	*cur,
1088	struct xfs_buf		*bp,
1089	union xfs_btree_ptr	*ptr)
1090{
1091	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1092		ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
1093					XFS_BUF_ADDR(bp)));
1094	else {
1095		ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
1096					XFS_BUF_ADDR(bp)));
1097	}
1098}
1099
1100STATIC void
1101xfs_btree_set_refs(
1102	struct xfs_btree_cur	*cur,
1103	struct xfs_buf		*bp)
1104{
1105	switch (cur->bc_btnum) {
1106	case XFS_BTNUM_BNO:
1107	case XFS_BTNUM_CNT:
1108		xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
1109		break;
1110	case XFS_BTNUM_INO:
1111	case XFS_BTNUM_FINO:
1112		xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
1113		break;
1114	case XFS_BTNUM_BMAP:
1115		xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
1116		break;
1117	default:
1118		ASSERT(0);
1119	}
1120}
1121
1122STATIC int
1123xfs_btree_get_buf_block(
1124	struct xfs_btree_cur	*cur,
1125	union xfs_btree_ptr	*ptr,
1126	int			flags,
1127	struct xfs_btree_block	**block,
1128	struct xfs_buf		**bpp)
1129{
1130	struct xfs_mount	*mp = cur->bc_mp;
1131	xfs_daddr_t		d;
1132
1133	/* need to sort out how callers deal with failures first */
1134	ASSERT(!(flags & XBF_TRYLOCK));
1135
1136	d = xfs_btree_ptr_to_daddr(cur, ptr);
1137	*bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
1138				 mp->m_bsize, flags);
1139
1140	if (!*bpp)
1141		return -ENOMEM;
1142
1143	(*bpp)->b_ops = cur->bc_ops->buf_ops;
1144	*block = XFS_BUF_TO_BLOCK(*bpp);
1145	return 0;
1146}
1147
1148/*
1149 * Read in the buffer at the given ptr and return the buffer and
1150 * the block pointer within the buffer.
1151 */
1152STATIC int
1153xfs_btree_read_buf_block(
1154	struct xfs_btree_cur	*cur,
1155	union xfs_btree_ptr	*ptr,
1156	int			flags,
1157	struct xfs_btree_block	**block,
1158	struct xfs_buf		**bpp)
1159{
1160	struct xfs_mount	*mp = cur->bc_mp;
1161	xfs_daddr_t		d;
1162	int			error;
1163
1164	/* need to sort out how callers deal with failures first */
1165	ASSERT(!(flags & XBF_TRYLOCK));
1166
1167	d = xfs_btree_ptr_to_daddr(cur, ptr);
1168	error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1169				   mp->m_bsize, flags, bpp,
1170				   cur->bc_ops->buf_ops);
1171	if (error)
1172		return error;
1173
1174	xfs_btree_set_refs(cur, *bpp);
1175	*block = XFS_BUF_TO_BLOCK(*bpp);
1176	return 0;
1177}
1178
1179/*
1180 * Copy keys from one btree block to another.
1181 */
1182STATIC void
1183xfs_btree_copy_keys(
1184	struct xfs_btree_cur	*cur,
1185	union xfs_btree_key	*dst_key,
1186	union xfs_btree_key	*src_key,
1187	int			numkeys)
1188{
1189	ASSERT(numkeys >= 0);
1190	memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1191}
1192
1193/*
1194 * Copy records from one btree block to another.
1195 */
1196STATIC void
1197xfs_btree_copy_recs(
1198	struct xfs_btree_cur	*cur,
1199	union xfs_btree_rec	*dst_rec,
1200	union xfs_btree_rec	*src_rec,
1201	int			numrecs)
1202{
1203	ASSERT(numrecs >= 0);
1204	memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1205}
1206
1207/*
1208 * Copy block pointers from one btree block to another.
1209 */
1210STATIC void
1211xfs_btree_copy_ptrs(
1212	struct xfs_btree_cur	*cur,
1213	union xfs_btree_ptr	*dst_ptr,
1214	union xfs_btree_ptr	*src_ptr,
1215	int			numptrs)
1216{
1217	ASSERT(numptrs >= 0);
1218	memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1219}
1220
1221/*
1222 * Shift keys one index left/right inside a single btree block.
1223 */
1224STATIC void
1225xfs_btree_shift_keys(
1226	struct xfs_btree_cur	*cur,
1227	union xfs_btree_key	*key,
1228	int			dir,
1229	int			numkeys)
1230{
1231	char			*dst_key;
1232
1233	ASSERT(numkeys >= 0);
1234	ASSERT(dir == 1 || dir == -1);
1235
1236	dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1237	memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1238}
1239
1240/*
1241 * Shift records one index left/right inside a single btree block.
1242 */
1243STATIC void
1244xfs_btree_shift_recs(
1245	struct xfs_btree_cur	*cur,
1246	union xfs_btree_rec	*rec,
1247	int			dir,
1248	int			numrecs)
1249{
1250	char			*dst_rec;
1251
1252	ASSERT(numrecs >= 0);
1253	ASSERT(dir == 1 || dir == -1);
1254
1255	dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1256	memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1257}
1258
1259/*
1260 * Shift block pointers one index left/right inside a single btree block.
1261 */
1262STATIC void
1263xfs_btree_shift_ptrs(
1264	struct xfs_btree_cur	*cur,
1265	union xfs_btree_ptr	*ptr,
1266	int			dir,
1267	int			numptrs)
1268{
1269	char			*dst_ptr;
1270
1271	ASSERT(numptrs >= 0);
1272	ASSERT(dir == 1 || dir == -1);
1273
1274	dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1275	memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1276}
1277
1278/*
1279 * Log key values from the btree block.
1280 */
1281STATIC void
1282xfs_btree_log_keys(
1283	struct xfs_btree_cur	*cur,
1284	struct xfs_buf		*bp,
1285	int			first,
1286	int			last)
1287{
1288	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1289	XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1290
1291	if (bp) {
1292		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1293		xfs_trans_log_buf(cur->bc_tp, bp,
1294				  xfs_btree_key_offset(cur, first),
1295				  xfs_btree_key_offset(cur, last + 1) - 1);
1296	} else {
1297		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1298				xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1299	}
1300
1301	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1302}
1303
1304/*
1305 * Log record values from the btree block.
1306 */
1307void
1308xfs_btree_log_recs(
1309	struct xfs_btree_cur	*cur,
1310	struct xfs_buf		*bp,
1311	int			first,
1312	int			last)
1313{
1314	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1315	XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1316
1317	xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1318	xfs_trans_log_buf(cur->bc_tp, bp,
1319			  xfs_btree_rec_offset(cur, first),
1320			  xfs_btree_rec_offset(cur, last + 1) - 1);
1321
1322	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1323}
1324
1325/*
1326 * Log block pointer fields from a btree block (nonleaf).
1327 */
1328STATIC void
1329xfs_btree_log_ptrs(
1330	struct xfs_btree_cur	*cur,	/* btree cursor */
1331	struct xfs_buf		*bp,	/* buffer containing btree block */
1332	int			first,	/* index of first pointer to log */
1333	int			last)	/* index of last pointer to log */
1334{
1335	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1336	XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1337
1338	if (bp) {
1339		struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
1340		int			level = xfs_btree_get_level(block);
1341
1342		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1343		xfs_trans_log_buf(cur->bc_tp, bp,
1344				xfs_btree_ptr_offset(cur, first, level),
1345				xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1346	} else {
1347		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1348			xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1349	}
1350
1351	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1352}
1353
1354/*
1355 * Log fields from a btree block header.
1356 */
1357void
1358xfs_btree_log_block(
1359	struct xfs_btree_cur	*cur,	/* btree cursor */
1360	struct xfs_buf		*bp,	/* buffer containing btree block */
1361	int			fields)	/* mask of fields: XFS_BB_... */
1362{
1363	int			first;	/* first byte offset logged */
1364	int			last;	/* last byte offset logged */
1365	static const short	soffsets[] = {	/* table of offsets (short) */
1366		offsetof(struct xfs_btree_block, bb_magic),
1367		offsetof(struct xfs_btree_block, bb_level),
1368		offsetof(struct xfs_btree_block, bb_numrecs),
1369		offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
1370		offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
1371		offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
1372		offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
1373		offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
1374		offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
1375		offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
1376		XFS_BTREE_SBLOCK_CRC_LEN
1377	};
1378	static const short	loffsets[] = {	/* table of offsets (long) */
1379		offsetof(struct xfs_btree_block, bb_magic),
1380		offsetof(struct xfs_btree_block, bb_level),
1381		offsetof(struct xfs_btree_block, bb_numrecs),
1382		offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
1383		offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
1384		offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
1385		offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
1386		offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
1387		offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
1388		offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
1389		offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
1390		XFS_BTREE_LBLOCK_CRC_LEN
1391	};
1392
1393	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1394	XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
1395
1396	if (bp) {
1397		int nbits;
1398
1399		if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
1400			/*
1401			 * We don't log the CRC when updating a btree
1402			 * block but instead recreate it during log
1403			 * recovery.  As the log buffers have checksums
1404			 * of their own this is safe and avoids logging a crc
1405			 * update in a lot of places.
1406			 */
1407			if (fields == XFS_BB_ALL_BITS)
1408				fields = XFS_BB_ALL_BITS_CRC;
1409			nbits = XFS_BB_NUM_BITS_CRC;
1410		} else {
1411			nbits = XFS_BB_NUM_BITS;
1412		}
1413		xfs_btree_offsets(fields,
1414				  (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
1415					loffsets : soffsets,
1416				  nbits, &first, &last);
1417		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1418		xfs_trans_log_buf(cur->bc_tp, bp, first, last);
1419	} else {
1420		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1421			xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1422	}
1423
1424	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1425}
1426
1427/*
1428 * Increment cursor by one record at the level.
1429 * For nonzero levels the leaf-ward information is untouched.
1430 */
1431int						/* error */
1432xfs_btree_increment(
1433	struct xfs_btree_cur	*cur,
1434	int			level,
1435	int			*stat)		/* success/failure */
1436{
1437	struct xfs_btree_block	*block;
1438	union xfs_btree_ptr	ptr;
1439	struct xfs_buf		*bp;
1440	int			error;		/* error return value */
1441	int			lev;
1442
1443	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1444	XFS_BTREE_TRACE_ARGI(cur, level);
1445
1446	ASSERT(level < cur->bc_nlevels);
1447
1448	/* Read-ahead to the right at this level. */
1449	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1450
1451	/* Get a pointer to the btree block. */
1452	block = xfs_btree_get_block(cur, level, &bp);
1453
1454#ifdef DEBUG
1455	error = xfs_btree_check_block(cur, block, level, bp);
1456	if (error)
1457		goto error0;
1458#endif
1459
1460	/* We're done if we remain in the block after the increment. */
1461	if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
1462		goto out1;
1463
1464	/* Fail if we just went off the right edge of the tree. */
1465	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1466	if (xfs_btree_ptr_is_null(cur, &ptr))
1467		goto out0;
1468
1469	XFS_BTREE_STATS_INC(cur, increment);
1470
1471	/*
1472	 * March up the tree incrementing pointers.
1473	 * Stop when we don't go off the right edge of a block.
1474	 */
1475	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1476		block = xfs_btree_get_block(cur, lev, &bp);
1477
1478#ifdef DEBUG
1479		error = xfs_btree_check_block(cur, block, lev, bp);
1480		if (error)
1481			goto error0;
1482#endif
1483
1484		if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
1485			break;
1486
1487		/* Read-ahead the right block for the next loop. */
1488		xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1489	}
1490
1491	/*
1492	 * If we went off the root then we are either seriously
1493	 * confused or have the tree root in an inode.
1494	 */
1495	if (lev == cur->bc_nlevels) {
1496		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1497			goto out0;
1498		ASSERT(0);
1499		error = -EFSCORRUPTED;
1500		goto error0;
1501	}
1502	ASSERT(lev < cur->bc_nlevels);
1503
1504	/*
1505	 * Now walk back down the tree, fixing up the cursor's buffer
1506	 * pointers and key numbers.
1507	 */
1508	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1509		union xfs_btree_ptr	*ptrp;
1510
1511		ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1512		--lev;
1513		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1514		if (error)
1515			goto error0;
1516
1517		xfs_btree_setbuf(cur, lev, bp);
1518		cur->bc_ptrs[lev] = 1;
1519	}
1520out1:
1521	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1522	*stat = 1;
1523	return 0;
1524
1525out0:
1526	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1527	*stat = 0;
1528	return 0;
1529
1530error0:
1531	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1532	return error;
1533}
1534
1535/*
1536 * Decrement cursor by one record at the level.
1537 * For nonzero levels the leaf-ward information is untouched.
1538 */
1539int						/* error */
1540xfs_btree_decrement(
1541	struct xfs_btree_cur	*cur,
1542	int			level,
1543	int			*stat)		/* success/failure */
1544{
1545	struct xfs_btree_block	*block;
1546	xfs_buf_t		*bp;
1547	int			error;		/* error return value */
1548	int			lev;
1549	union xfs_btree_ptr	ptr;
1550
1551	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1552	XFS_BTREE_TRACE_ARGI(cur, level);
1553
1554	ASSERT(level < cur->bc_nlevels);
1555
1556	/* Read-ahead to the left at this level. */
1557	xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1558
1559	/* We're done if we remain in the block after the decrement. */
1560	if (--cur->bc_ptrs[level] > 0)
1561		goto out1;
1562
1563	/* Get a pointer to the btree block. */
1564	block = xfs_btree_get_block(cur, level, &bp);
1565
1566#ifdef DEBUG
1567	error = xfs_btree_check_block(cur, block, level, bp);
1568	if (error)
1569		goto error0;
1570#endif
1571
1572	/* Fail if we just went off the left edge of the tree. */
1573	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
1574	if (xfs_btree_ptr_is_null(cur, &ptr))
1575		goto out0;
1576
1577	XFS_BTREE_STATS_INC(cur, decrement);
1578
1579	/*
1580	 * March up the tree decrementing pointers.
1581	 * Stop when we don't go off the left edge of a block.
1582	 */
1583	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1584		if (--cur->bc_ptrs[lev] > 0)
1585			break;
1586		/* Read-ahead the left block for the next loop. */
1587		xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1588	}
1589
1590	/*
1591	 * If we went off the root then we are seriously confused.
1592	 * or the root of the tree is in an inode.
1593	 */
1594	if (lev == cur->bc_nlevels) {
1595		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1596			goto out0;
1597		ASSERT(0);
1598		error = -EFSCORRUPTED;
1599		goto error0;
1600	}
1601	ASSERT(lev < cur->bc_nlevels);
1602
1603	/*
1604	 * Now walk back down the tree, fixing up the cursor's buffer
1605	 * pointers and key numbers.
1606	 */
1607	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1608		union xfs_btree_ptr	*ptrp;
1609
1610		ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1611		--lev;
1612		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1613		if (error)
1614			goto error0;
1615		xfs_btree_setbuf(cur, lev, bp);
1616		cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
1617	}
1618out1:
1619	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1620	*stat = 1;
1621	return 0;
1622
1623out0:
1624	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1625	*stat = 0;
1626	return 0;
1627
1628error0:
1629	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1630	return error;
1631}
1632
1633STATIC int
1634xfs_btree_lookup_get_block(
1635	struct xfs_btree_cur	*cur,	/* btree cursor */
1636	int			level,	/* level in the btree */
1637	union xfs_btree_ptr	*pp,	/* ptr to btree block */
1638	struct xfs_btree_block	**blkp) /* return btree block */
1639{
1640	struct xfs_buf		*bp;	/* buffer pointer for btree block */
1641	int			error = 0;
1642
1643	/* special case the root block if in an inode */
1644	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1645	    (level == cur->bc_nlevels - 1)) {
1646		*blkp = xfs_btree_get_iroot(cur);
1647		return 0;
1648	}
1649
1650	/*
1651	 * If the old buffer at this level for the disk address we are
1652	 * looking for re-use it.
1653	 *
1654	 * Otherwise throw it away and get a new one.
1655	 */
1656	bp = cur->bc_bufs[level];
1657	if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
1658		*blkp = XFS_BUF_TO_BLOCK(bp);
1659		return 0;
1660	}
1661
1662	error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
1663	if (error)
1664		return error;
1665
1666	xfs_btree_setbuf(cur, level, bp);
1667	return 0;
1668}
1669
1670/*
1671 * Get current search key.  For level 0 we don't actually have a key
1672 * structure so we make one up from the record.  For all other levels
1673 * we just return the right key.
1674 */
1675STATIC union xfs_btree_key *
1676xfs_lookup_get_search_key(
1677	struct xfs_btree_cur	*cur,
1678	int			level,
1679	int			keyno,
1680	struct xfs_btree_block	*block,
1681	union xfs_btree_key	*kp)
1682{
1683	if (level == 0) {
1684		cur->bc_ops->init_key_from_rec(kp,
1685				xfs_btree_rec_addr(cur, keyno, block));
1686		return kp;
1687	}
1688
1689	return xfs_btree_key_addr(cur, keyno, block);
1690}
1691
1692/*
1693 * Lookup the record.  The cursor is made to point to it, based on dir.
1694 * stat is set to 0 if can't find any such record, 1 for success.
1695 */
1696int					/* error */
1697xfs_btree_lookup(
1698	struct xfs_btree_cur	*cur,	/* btree cursor */
1699	xfs_lookup_t		dir,	/* <=, ==, or >= */
1700	int			*stat)	/* success/failure */
1701{
1702	struct xfs_btree_block	*block;	/* current btree block */
1703	__int64_t		diff;	/* difference for the current key */
1704	int			error;	/* error return value */
1705	int			keyno;	/* current key number */
1706	int			level;	/* level in the btree */
1707	union xfs_btree_ptr	*pp;	/* ptr to btree block */
1708	union xfs_btree_ptr	ptr;	/* ptr to btree block */
1709
1710	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1711	XFS_BTREE_TRACE_ARGI(cur, dir);
1712
1713	XFS_BTREE_STATS_INC(cur, lookup);
1714
1715	block = NULL;
1716	keyno = 0;
1717
1718	/* initialise start pointer from cursor */
1719	cur->bc_ops->init_ptr_from_cur(cur, &ptr);
1720	pp = &ptr;
1721
1722	/*
1723	 * Iterate over each level in the btree, starting at the root.
1724	 * For each level above the leaves, find the key we need, based
1725	 * on the lookup record, then follow the corresponding block
1726	 * pointer down to the next level.
1727	 */
1728	for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1729		/* Get the block we need to do the lookup on. */
1730		error = xfs_btree_lookup_get_block(cur, level, pp, &block);
1731		if (error)
1732			goto error0;
1733
1734		if (diff == 0) {
1735			/*
1736			 * If we already had a key match at a higher level, we
1737			 * know we need to use the first entry in this block.
1738			 */
1739			keyno = 1;
1740		} else {
1741			/* Otherwise search this block. Do a binary search. */
1742
1743			int	high;	/* high entry number */
1744			int	low;	/* low entry number */
1745
1746			/* Set low and high entry numbers, 1-based. */
1747			low = 1;
1748			high = xfs_btree_get_numrecs(block);
1749			if (!high) {
1750				/* Block is empty, must be an empty leaf. */
1751				ASSERT(level == 0 && cur->bc_nlevels == 1);
1752
1753				cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1754				XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1755				*stat = 0;
1756				return 0;
1757			}
1758
1759			/* Binary search the block. */
1760			while (low <= high) {
1761				union xfs_btree_key	key;
1762				union xfs_btree_key	*kp;
1763
1764				XFS_BTREE_STATS_INC(cur, compare);
1765
1766				/* keyno is average of low and high. */
1767				keyno = (low + high) >> 1;
1768
1769				/* Get current search key */
1770				kp = xfs_lookup_get_search_key(cur, level,
1771						keyno, block, &key);
1772
1773				/*
1774				 * Compute difference to get next direction:
1775				 *  - less than, move right
1776				 *  - greater than, move left
1777				 *  - equal, we're done
1778				 */
1779				diff = cur->bc_ops->key_diff(cur, kp);
1780				if (diff < 0)
1781					low = keyno + 1;
1782				else if (diff > 0)
1783					high = keyno - 1;
1784				else
1785					break;
1786			}
1787		}
1788
1789		/*
1790		 * If there are more levels, set up for the next level
1791		 * by getting the block number and filling in the cursor.
1792		 */
1793		if (level > 0) {
1794			/*
1795			 * If we moved left, need the previous key number,
1796			 * unless there isn't one.
1797			 */
1798			if (diff > 0 && --keyno < 1)
1799				keyno = 1;
1800			pp = xfs_btree_ptr_addr(cur, keyno, block);
1801
1802#ifdef DEBUG
1803			error = xfs_btree_check_ptr(cur, pp, 0, level);
1804			if (error)
1805				goto error0;
1806#endif
1807			cur->bc_ptrs[level] = keyno;
1808		}
1809	}
1810
1811	/* Done with the search. See if we need to adjust the results. */
1812	if (dir != XFS_LOOKUP_LE && diff < 0) {
1813		keyno++;
1814		/*
1815		 * If ge search and we went off the end of the block, but it's
1816		 * not the last block, we're in the wrong block.
1817		 */
1818		xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1819		if (dir == XFS_LOOKUP_GE &&
1820		    keyno > xfs_btree_get_numrecs(block) &&
1821		    !xfs_btree_ptr_is_null(cur, &ptr)) {
1822			int	i;
1823
1824			cur->bc_ptrs[0] = keyno;
1825			error = xfs_btree_increment(cur, 0, &i);
1826			if (error)
1827				goto error0;
1828			XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1829			XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1830			*stat = 1;
1831			return 0;
1832		}
1833	} else if (dir == XFS_LOOKUP_LE && diff > 0)
1834		keyno--;
1835	cur->bc_ptrs[0] = keyno;
1836
1837	/* Return if we succeeded or not. */
1838	if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
1839		*stat = 0;
1840	else if (dir != XFS_LOOKUP_EQ || diff == 0)
1841		*stat = 1;
1842	else
1843		*stat = 0;
1844	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1845	return 0;
1846
1847error0:
1848	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1849	return error;
1850}
1851
1852/*
1853 * Update keys at all levels from here to the root along the cursor's path.
1854 */
1855STATIC int
1856xfs_btree_updkey(
1857	struct xfs_btree_cur	*cur,
1858	union xfs_btree_key	*keyp,
1859	int			level)
1860{
1861	struct xfs_btree_block	*block;
1862	struct xfs_buf		*bp;
1863	union xfs_btree_key	*kp;
1864	int			ptr;
1865
1866	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1867	XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
1868
1869	ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
1870
1871	/*
1872	 * Go up the tree from this level toward the root.
1873	 * At each level, update the key value to the value input.
1874	 * Stop when we reach a level where the cursor isn't pointing
1875	 * at the first entry in the block.
1876	 */
1877	for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1878#ifdef DEBUG
1879		int		error;
1880#endif
1881		block = xfs_btree_get_block(cur, level, &bp);
1882#ifdef DEBUG
1883		error = xfs_btree_check_block(cur, block, level, bp);
1884		if (error) {
1885			XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1886			return error;
1887		}
1888#endif
1889		ptr = cur->bc_ptrs[level];
1890		kp = xfs_btree_key_addr(cur, ptr, block);
1891		xfs_btree_copy_keys(cur, kp, keyp, 1);
1892		xfs_btree_log_keys(cur, bp, ptr, ptr);
1893	}
1894
1895	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1896	return 0;
1897}
1898
1899/*
1900 * Update the record referred to by cur to the value in the
1901 * given record. This either works (return 0) or gets an
1902 * EFSCORRUPTED error.
1903 */
1904int
1905xfs_btree_update(
1906	struct xfs_btree_cur	*cur,
1907	union xfs_btree_rec	*rec)
1908{
1909	struct xfs_btree_block	*block;
1910	struct xfs_buf		*bp;
1911	int			error;
1912	int			ptr;
1913	union xfs_btree_rec	*rp;
1914
1915	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1916	XFS_BTREE_TRACE_ARGR(cur, rec);
1917
1918	/* Pick up the current block. */
1919	block = xfs_btree_get_block(cur, 0, &bp);
1920
1921#ifdef DEBUG
1922	error = xfs_btree_check_block(cur, block, 0, bp);
1923	if (error)
1924		goto error0;
1925#endif
1926	/* Get the address of the rec to be updated. */
1927	ptr = cur->bc_ptrs[0];
1928	rp = xfs_btree_rec_addr(cur, ptr, block);
1929
1930	/* Fill in the new contents and log them. */
1931	xfs_btree_copy_recs(cur, rp, rec, 1);
1932	xfs_btree_log_recs(cur, bp, ptr, ptr);
1933
1934	/*
1935	 * If we are tracking the last record in the tree and
1936	 * we are at the far right edge of the tree, update it.
1937	 */
1938	if (xfs_btree_is_lastrec(cur, block, 0)) {
1939		cur->bc_ops->update_lastrec(cur, block, rec,
1940					    ptr, LASTREC_UPDATE);
1941	}
1942
1943	/* Updating first rec in leaf. Pass new key value up to our parent. */
1944	if (ptr == 1) {
1945		union xfs_btree_key	key;
1946
1947		cur->bc_ops->init_key_from_rec(&key, rec);
1948		error = xfs_btree_updkey(cur, &key, 1);
1949		if (error)
1950			goto error0;
1951	}
1952
1953	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1954	return 0;
1955
1956error0:
1957	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1958	return error;
1959}
1960
1961/*
1962 * Move 1 record left from cur/level if possible.
1963 * Update cur to reflect the new path.
1964 */
1965STATIC int					/* error */
1966xfs_btree_lshift(
1967	struct xfs_btree_cur	*cur,
1968	int			level,
1969	int			*stat)		/* success/failure */
1970{
1971	union xfs_btree_key	key;		/* btree key */
1972	struct xfs_buf		*lbp;		/* left buffer pointer */
1973	struct xfs_btree_block	*left;		/* left btree block */
1974	int			lrecs;		/* left record count */
1975	struct xfs_buf		*rbp;		/* right buffer pointer */
1976	struct xfs_btree_block	*right;		/* right btree block */
1977	int			rrecs;		/* right record count */
1978	union xfs_btree_ptr	lptr;		/* left btree pointer */
1979	union xfs_btree_key	*rkp = NULL;	/* right btree key */
1980	union xfs_btree_ptr	*rpp = NULL;	/* right address pointer */
1981	union xfs_btree_rec	*rrp = NULL;	/* right record pointer */
1982	int			error;		/* error return value */
1983
1984	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1985	XFS_BTREE_TRACE_ARGI(cur, level);
1986
1987	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1988	    level == cur->bc_nlevels - 1)
1989		goto out0;
1990
1991	/* Set up variables for this block as "right". */
1992	right = xfs_btree_get_block(cur, level, &rbp);
1993
1994#ifdef DEBUG
1995	error = xfs_btree_check_block(cur, right, level, rbp);
1996	if (error)
1997		goto error0;
1998#endif
1999
2000	/* If we've got no left sibling then we can't shift an entry left. */
2001	xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2002	if (xfs_btree_ptr_is_null(cur, &lptr))
2003		goto out0;
2004
2005	/*
2006	 * If the cursor entry is the one that would be moved, don't
2007	 * do it... it's too complicated.
2008	 */
2009	if (cur->bc_ptrs[level] <= 1)
2010		goto out0;
2011
2012	/* Set up the left neighbor as "left". */
2013	error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2014	if (error)
2015		goto error0;
2016
2017	/* If it's full, it can't take another entry. */
2018	lrecs = xfs_btree_get_numrecs(left);
2019	if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
2020		goto out0;
2021
2022	rrecs = xfs_btree_get_numrecs(right);
2023
2024	/*
2025	 * We add one entry to the left side and remove one for the right side.
2026	 * Account for it here, the changes will be updated on disk and logged
2027	 * later.
2028	 */
2029	lrecs++;
2030	rrecs--;
2031
2032	XFS_BTREE_STATS_INC(cur, lshift);
2033	XFS_BTREE_STATS_ADD(cur, moves, 1);
2034
2035	/*
2036	 * If non-leaf, copy a key and a ptr to the left block.
2037	 * Log the changes to the left block.
2038	 */
2039	if (level > 0) {
2040		/* It's a non-leaf.  Move keys and pointers. */
2041		union xfs_btree_key	*lkp;	/* left btree key */
2042		union xfs_btree_ptr	*lpp;	/* left address pointer */
2043
2044		lkp = xfs_btree_key_addr(cur, lrecs, left);
2045		rkp = xfs_btree_key_addr(cur, 1, right);
2046
2047		lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2048		rpp = xfs_btree_ptr_addr(cur, 1, right);
2049#ifdef DEBUG
2050		error = xfs_btree_check_ptr(cur, rpp, 0, level);
2051		if (error)
2052			goto error0;
2053#endif
2054		xfs_btree_copy_keys(cur, lkp, rkp, 1);
2055		xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
2056
2057		xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
2058		xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
2059
2060		ASSERT(cur->bc_ops->keys_inorder(cur,
2061			xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
2062	} else {
2063		/* It's a leaf.  Move records.  */
2064		union xfs_btree_rec	*lrp;	/* left record pointer */
2065
2066		lrp = xfs_btree_rec_addr(cur, lrecs, left);
2067		rrp = xfs_btree_rec_addr(cur, 1, right);
2068
2069		xfs_btree_copy_recs(cur, lrp, rrp, 1);
2070		xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
2071
2072		ASSERT(cur->bc_ops->recs_inorder(cur,
2073			xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
2074	}
2075
2076	xfs_btree_set_numrecs(left, lrecs);
2077	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2078
2079	xfs_btree_set_numrecs(right, rrecs);
2080	xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2081
2082	/*
2083	 * Slide the contents of right down one entry.
2084	 */
2085	XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
2086	if (level > 0) {
2087		/* It's a nonleaf. operate on keys and ptrs */
2088#ifdef DEBUG
2089		int			i;		/* loop index */
2090
2091		for (i = 0; i < rrecs; i++) {
2092			error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
2093			if (error)
2094				goto error0;
2095		}
2096#endif
2097		xfs_btree_shift_keys(cur,
2098				xfs_btree_key_addr(cur, 2, right),
2099				-1, rrecs);
2100		xfs_btree_shift_ptrs(cur,
2101				xfs_btree_ptr_addr(cur, 2, right),
2102				-1, rrecs);
2103
2104		xfs_btree_log_keys(cur, rbp, 1, rrecs);
2105		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2106	} else {
2107		/* It's a leaf. operate on records */
2108		xfs_btree_shift_recs(cur,
2109			xfs_btree_rec_addr(cur, 2, right),
2110			-1, rrecs);
2111		xfs_btree_log_recs(cur, rbp, 1, rrecs);
2112
2113		/*
2114		 * If it's the first record in the block, we'll need a key
2115		 * structure to pass up to the next level (updkey).
2116		 */
2117		cur->bc_ops->init_key_from_rec(&key,
2118			xfs_btree_rec_addr(cur, 1, right));
2119		rkp = &key;
2120	}
2121
2122	/* Update the parent key values of right. */
2123	error = xfs_btree_updkey(cur, rkp, level + 1);
2124	if (error)
2125		goto error0;
2126
2127	/* Slide the cursor value left one. */
2128	cur->bc_ptrs[level]--;
2129
2130	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2131	*stat = 1;
2132	return 0;
2133
2134out0:
2135	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2136	*stat = 0;
2137	return 0;
2138
2139error0:
2140	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2141	return error;
2142}
2143
2144/*
2145 * Move 1 record right from cur/level if possible.
2146 * Update cur to reflect the new path.
2147 */
2148STATIC int					/* error */
2149xfs_btree_rshift(
2150	struct xfs_btree_cur	*cur,
2151	int			level,
2152	int			*stat)		/* success/failure */
2153{
2154	union xfs_btree_key	key;		/* btree key */
2155	struct xfs_buf		*lbp;		/* left buffer pointer */
2156	struct xfs_btree_block	*left;		/* left btree block */
2157	struct xfs_buf		*rbp;		/* right buffer pointer */
2158	struct xfs_btree_block	*right;		/* right btree block */
2159	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
2160	union xfs_btree_ptr	rptr;		/* right block pointer */
2161	union xfs_btree_key	*rkp;		/* right btree key */
2162	int			rrecs;		/* right record count */
2163	int			lrecs;		/* left record count */
2164	int			error;		/* error return value */
2165	int			i;		/* loop counter */
2166
2167	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2168	XFS_BTREE_TRACE_ARGI(cur, level);
2169
2170	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2171	    (level == cur->bc_nlevels - 1))
2172		goto out0;
2173
2174	/* Set up variables for this block as "left". */
2175	left = xfs_btree_get_block(cur, level, &lbp);
2176
2177#ifdef DEBUG
2178	error = xfs_btree_check_block(cur, left, level, lbp);
2179	if (error)
2180		goto error0;
2181#endif
2182
2183	/* If we've got no right sibling then we can't shift an entry right. */
2184	xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2185	if (xfs_btree_ptr_is_null(cur, &rptr))
2186		goto out0;
2187
2188	/*
2189	 * If the cursor entry is the one that would be moved, don't
2190	 * do it... it's too complicated.
2191	 */
2192	lrecs = xfs_btree_get_numrecs(left);
2193	if (cur->bc_ptrs[level] >= lrecs)
2194		goto out0;
2195
2196	/* Set up the right neighbor as "right". */
2197	error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2198	if (error)
2199		goto error0;
2200
2201	/* If it's full, it can't take another entry. */
2202	rrecs = xfs_btree_get_numrecs(right);
2203	if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2204		goto out0;
2205
2206	XFS_BTREE_STATS_INC(cur, rshift);
2207	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2208
2209	/*
2210	 * Make a hole at the start of the right neighbor block, then
2211	 * copy the last left block entry to the hole.
2212	 */
2213	if (level > 0) {
2214		/* It's a nonleaf. make a hole in the keys and ptrs */
2215		union xfs_btree_key	*lkp;
2216		union xfs_btree_ptr	*lpp;
2217		union xfs_btree_ptr	*rpp;
2218
2219		lkp = xfs_btree_key_addr(cur, lrecs, left);
2220		lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2221		rkp = xfs_btree_key_addr(cur, 1, right);
2222		rpp = xfs_btree_ptr_addr(cur, 1, right);
2223
2224#ifdef DEBUG
2225		for (i = rrecs - 1; i >= 0; i--) {
2226			error = xfs_btree_check_ptr(cur, rpp, i, level);
2227			if (error)
2228				goto error0;
2229		}
2230#endif
2231
2232		xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2233		xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2234
2235#ifdef DEBUG
2236		error = xfs_btree_check_ptr(cur, lpp, 0, level);
2237		if (error)
2238			goto error0;
2239#endif
2240
2241		/* Now put the new data in, and log it. */
2242		xfs_btree_copy_keys(cur, rkp, lkp, 1);
2243		xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2244
2245		xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2246		xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2247
2248		ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2249			xfs_btree_key_addr(cur, 2, right)));
2250	} else {
2251		/* It's a leaf. make a hole in the records */
2252		union xfs_btree_rec	*lrp;
2253		union xfs_btree_rec	*rrp;
2254
2255		lrp = xfs_btree_rec_addr(cur, lrecs, left);
2256		rrp = xfs_btree_rec_addr(cur, 1, right);
2257
2258		xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2259
2260		/* Now put the new data in, and log it. */
2261		xfs_btree_copy_recs(cur, rrp, lrp, 1);
2262		xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2263
2264		cur->bc_ops->init_key_from_rec(&key, rrp);
2265		rkp = &key;
2266
2267		ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
2268			xfs_btree_rec_addr(cur, 2, right)));
2269	}
2270
2271	/*
2272	 * Decrement and log left's numrecs, bump and log right's numrecs.
2273	 */
2274	xfs_btree_set_numrecs(left, --lrecs);
2275	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2276
2277	xfs_btree_set_numrecs(right, ++rrecs);
2278	xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2279
2280	/*
2281	 * Using a temporary cursor, update the parent key values of the
2282	 * block on the right.
2283	 */
2284	error = xfs_btree_dup_cursor(cur, &tcur);
2285	if (error)
2286		goto error0;
2287	i = xfs_btree_lastrec(tcur, level);
2288	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
2289
2290	error = xfs_btree_increment(tcur, level, &i);
2291	if (error)
2292		goto error1;
2293
2294	error = xfs_btree_updkey(tcur, rkp, level + 1);
2295	if (error)
2296		goto error1;
2297
2298	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2299
2300	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2301	*stat = 1;
2302	return 0;
2303
2304out0:
2305	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2306	*stat = 0;
2307	return 0;
2308
2309error0:
2310	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2311	return error;
2312
2313error1:
2314	XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
2315	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2316	return error;
2317}
2318
2319/*
2320 * Split cur/level block in half.
2321 * Return new block number and the key to its first
2322 * record (to be inserted into parent).
2323 */
2324STATIC int					/* error */
2325__xfs_btree_split(
2326	struct xfs_btree_cur	*cur,
2327	int			level,
2328	union xfs_btree_ptr	*ptrp,
2329	union xfs_btree_key	*key,
2330	struct xfs_btree_cur	**curp,
2331	int			*stat)		/* success/failure */
2332{
2333	union xfs_btree_ptr	lptr;		/* left sibling block ptr */
2334	struct xfs_buf		*lbp;		/* left buffer pointer */
2335	struct xfs_btree_block	*left;		/* left btree block */
2336	union xfs_btree_ptr	rptr;		/* right sibling block ptr */
2337	struct xfs_buf		*rbp;		/* right buffer pointer */
2338	struct xfs_btree_block	*right;		/* right btree block */
2339	union xfs_btree_ptr	rrptr;		/* right-right sibling ptr */
2340	struct xfs_buf		*rrbp;		/* right-right buffer pointer */
2341	struct xfs_btree_block	*rrblock;	/* right-right btree block */
2342	int			lrecs;
2343	int			rrecs;
2344	int			src_index;
2345	int			error;		/* error return value */
2346#ifdef DEBUG
2347	int			i;
2348#endif
2349
2350	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2351	XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
2352
2353	XFS_BTREE_STATS_INC(cur, split);
2354
2355	/* Set up left block (current one). */
2356	left = xfs_btree_get_block(cur, level, &lbp);
2357
2358#ifdef DEBUG
2359	error = xfs_btree_check_block(cur, left, level, lbp);
2360	if (error)
2361		goto error0;
2362#endif
2363
2364	xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2365
2366	/* Allocate the new block. If we can't do it, we're toast. Give up. */
2367	error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
2368	if (error)
2369		goto error0;
2370	if (*stat == 0)
2371		goto out0;
2372	XFS_BTREE_STATS_INC(cur, alloc);
2373
2374	/* Set up the new block as "right". */
2375	error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
2376	if (error)
2377		goto error0;
2378
2379	/* Fill in the btree header for the new right block. */
2380	xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
2381
2382	/*
2383	 * Split the entries between the old and the new block evenly.
2384	 * Make sure that if there's an odd number of entries now, that
2385	 * each new block will have the same number of entries.
2386	 */
2387	lrecs = xfs_btree_get_numrecs(left);
2388	rrecs = lrecs / 2;
2389	if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
2390		rrecs++;
2391	src_index = (lrecs - rrecs + 1);
2392
2393	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2394
2395	/*
2396	 * Copy btree block entries from the left block over to the
2397	 * new block, the right. Update the right block and log the
2398	 * changes.
2399	 */
2400	if (level > 0) {
2401		/* It's a non-leaf.  Move keys and pointers. */
2402		union xfs_btree_key	*lkp;	/* left btree key */
2403		union xfs_btree_ptr	*lpp;	/* left address pointer */
2404		union xfs_btree_key	*rkp;	/* right btree key */
2405		union xfs_btree_ptr	*rpp;	/* right address pointer */
2406
2407		lkp = xfs_btree_key_addr(cur, src_index, left);
2408		lpp = xfs_btree_ptr_addr(cur, src_index, left);
2409		rkp = xfs_btree_key_addr(cur, 1, right);
2410		rpp = xfs_btree_ptr_addr(cur, 1, right);
2411
2412#ifdef DEBUG
2413		for (i = src_index; i < rrecs; i++) {
2414			error = xfs_btree_check_ptr(cur, lpp, i, level);
2415			if (error)
2416				goto error0;
2417		}
2418#endif
2419
2420		xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2421		xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2422
2423		xfs_btree_log_keys(cur, rbp, 1, rrecs);
2424		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2425
2426		/* Grab the keys to the entries moved to the right block */
2427		xfs_btree_copy_keys(cur, key, rkp, 1);
2428	} else {
2429		/* It's a leaf.  Move records.  */
2430		union xfs_btree_rec	*lrp;	/* left record pointer */
2431		union xfs_btree_rec	*rrp;	/* right record pointer */
2432
2433		lrp = xfs_btree_rec_addr(cur, src_index, left);
2434		rrp = xfs_btree_rec_addr(cur, 1, right);
2435
2436		xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2437		xfs_btree_log_recs(cur, rbp, 1, rrecs);
2438
2439		cur->bc_ops->init_key_from_rec(key,
2440			xfs_btree_rec_addr(cur, 1, right));
2441	}
2442
2443
2444	/*
2445	 * Find the left block number by looking in the buffer.
2446	 * Adjust numrecs, sibling pointers.
2447	 */
2448	xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2449	xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2450	xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2451	xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2452
2453	lrecs -= rrecs;
2454	xfs_btree_set_numrecs(left, lrecs);
2455	xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2456
2457	xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2458	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2459
2460	/*
2461	 * If there's a block to the new block's right, make that block
2462	 * point back to right instead of to left.
2463	 */
2464	if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2465		error = xfs_btree_read_buf_block(cur, &rrptr,
2466							0, &rrblock, &rrbp);
2467		if (error)
2468			goto error0;
2469		xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2470		xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2471	}
2472	/*
2473	 * If the cursor is really in the right block, move it there.
2474	 * If it's just pointing past the last entry in left, then we'll
2475	 * insert there, so don't change anything in that case.
2476	 */
2477	if (cur->bc_ptrs[level] > lrecs + 1) {
2478		xfs_btree_setbuf(cur, level, rbp);
2479		cur->bc_ptrs[level] -= lrecs;
2480	}
2481	/*
2482	 * If there are more levels, we'll need another cursor which refers
2483	 * the right block, no matter where this cursor was.
2484	 */
2485	if (level + 1 < cur->bc_nlevels) {
2486		error = xfs_btree_dup_cursor(cur, curp);
2487		if (error)
2488			goto error0;
2489		(*curp)->bc_ptrs[level + 1]++;
2490	}
2491	*ptrp = rptr;
2492	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2493	*stat = 1;
2494	return 0;
2495out0:
2496	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2497	*stat = 0;
2498	return 0;
2499
2500error0:
2501	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2502	return error;
2503}
2504
2505struct xfs_btree_split_args {
2506	struct xfs_btree_cur	*cur;
2507	int			level;
2508	union xfs_btree_ptr	*ptrp;
2509	union xfs_btree_key	*key;
2510	struct xfs_btree_cur	**curp;
2511	int			*stat;		/* success/failure */
2512	int			result;
2513	bool			kswapd;	/* allocation in kswapd context */
2514	struct completion	*done;
2515	struct work_struct	work;
2516};
2517
2518/*
2519 * Stack switching interfaces for allocation
2520 */
2521static void
2522xfs_btree_split_worker(
2523	struct work_struct	*work)
2524{
2525	struct xfs_btree_split_args	*args = container_of(work,
2526						struct xfs_btree_split_args, work);
2527	unsigned long		pflags;
2528	unsigned long		new_pflags = PF_FSTRANS;
2529
2530	/*
2531	 * we are in a transaction context here, but may also be doing work
2532	 * in kswapd context, and hence we may need to inherit that state
2533	 * temporarily to ensure that we don't block waiting for memory reclaim
2534	 * in any way.
2535	 */
2536	if (args->kswapd)
2537		new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2538
2539	current_set_flags_nested(&pflags, new_pflags);
2540
2541	args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
2542					 args->key, args->curp, args->stat);
2543	complete(args->done);
2544
2545	current_restore_flags_nested(&pflags, new_pflags);
2546}
2547
2548/*
2549 * BMBT split requests often come in with little stack to work on. Push
2550 * them off to a worker thread so there is lots of stack to use. For the other
2551 * btree types, just call directly to avoid the context switch overhead here.
2552 */
2553STATIC int					/* error */
2554xfs_btree_split(
2555	struct xfs_btree_cur	*cur,
2556	int			level,
2557	union xfs_btree_ptr	*ptrp,
2558	union xfs_btree_key	*key,
2559	struct xfs_btree_cur	**curp,
2560	int			*stat)		/* success/failure */
2561{
2562	struct xfs_btree_split_args	args;
2563	DECLARE_COMPLETION_ONSTACK(done);
2564
2565	if (cur->bc_btnum != XFS_BTNUM_BMAP)
2566		return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
2567
2568	args.cur = cur;
2569	args.level = level;
2570	args.ptrp = ptrp;
2571	args.key = key;
2572	args.curp = curp;
2573	args.stat = stat;
2574	args.done = &done;
2575	args.kswapd = current_is_kswapd();
2576	INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
2577	queue_work(xfs_alloc_wq, &args.work);
2578	wait_for_completion(&done);
2579	destroy_work_on_stack(&args.work);
2580	return args.result;
2581}
2582
2583
2584/*
2585 * Copy the old inode root contents into a real block and make the
2586 * broot point to it.
2587 */
2588int						/* error */
2589xfs_btree_new_iroot(
2590	struct xfs_btree_cur	*cur,		/* btree cursor */
2591	int			*logflags,	/* logging flags for inode */
2592	int			*stat)		/* return status - 0 fail */
2593{
2594	struct xfs_buf		*cbp;		/* buffer for cblock */
2595	struct xfs_btree_block	*block;		/* btree block */
2596	struct xfs_btree_block	*cblock;	/* child btree block */
2597	union xfs_btree_key	*ckp;		/* child key pointer */
2598	union xfs_btree_ptr	*cpp;		/* child ptr pointer */
2599	union xfs_btree_key	*kp;		/* pointer to btree key */
2600	union xfs_btree_ptr	*pp;		/* pointer to block addr */
2601	union xfs_btree_ptr	nptr;		/* new block addr */
2602	int			level;		/* btree level */
2603	int			error;		/* error return code */
2604#ifdef DEBUG
2605	int			i;		/* loop counter */
2606#endif
2607
2608	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2609	XFS_BTREE_STATS_INC(cur, newroot);
2610
2611	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2612
2613	level = cur->bc_nlevels - 1;
2614
2615	block = xfs_btree_get_iroot(cur);
2616	pp = xfs_btree_ptr_addr(cur, 1, block);
2617
2618	/* Allocate the new block. If we can't do it, we're toast. Give up. */
2619	error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
2620	if (error)
2621		goto error0;
2622	if (*stat == 0) {
2623		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2624		return 0;
2625	}
2626	XFS_BTREE_STATS_INC(cur, alloc);
2627
2628	/* Copy the root into a real block. */
2629	error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
2630	if (error)
2631		goto error0;
2632
2633	/*
2634	 * we can't just memcpy() the root in for CRC enabled btree blocks.
2635	 * In that case have to also ensure the blkno remains correct
2636	 */
2637	memcpy(cblock, block, xfs_btree_block_len(cur));
2638	if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
2639		if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
2640			cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
2641		else
2642			cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
2643	}
2644
2645	be16_add_cpu(&block->bb_level, 1);
2646	xfs_btree_set_numrecs(block, 1);
2647	cur->bc_nlevels++;
2648	cur->bc_ptrs[level + 1] = 1;
2649
2650	kp = xfs_btree_key_addr(cur, 1, block);
2651	ckp = xfs_btree_key_addr(cur, 1, cblock);
2652	xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
2653
2654	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
2655#ifdef DEBUG
2656	for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2657		error = xfs_btree_check_ptr(cur, pp, i, level);
2658		if (error)
2659			goto error0;
2660	}
2661#endif
2662	xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
2663
2664#ifdef DEBUG
2665	error = xfs_btree_check_ptr(cur, &nptr, 0, level);
2666	if (error)
2667		goto error0;
2668#endif
2669	xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
2670
2671	xfs_iroot_realloc(cur->bc_private.b.ip,
2672			  1 - xfs_btree_get_numrecs(cblock),
2673			  cur->bc_private.b.whichfork);
2674
2675	xfs_btree_setbuf(cur, level, cbp);
2676
2677	/*
2678	 * Do all this logging at the end so that
2679	 * the root is at the right level.
2680	 */
2681	xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
2682	xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2683	xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2684
2685	*logflags |=
2686		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
2687	*stat = 1;
2688	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2689	return 0;
2690error0:
2691	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2692	return error;
2693}
2694
2695/*
2696 * Allocate a new root block, fill it in.
2697 */
2698STATIC int				/* error */
2699xfs_btree_new_root(
2700	struct xfs_btree_cur	*cur,	/* btree cursor */
2701	int			*stat)	/* success/failure */
2702{
2703	struct xfs_btree_block	*block;	/* one half of the old root block */
2704	struct xfs_buf		*bp;	/* buffer containing block */
2705	int			error;	/* error return value */
2706	struct xfs_buf		*lbp;	/* left buffer pointer */
2707	struct xfs_btree_block	*left;	/* left btree block */
2708	struct xfs_buf		*nbp;	/* new (root) buffer */
2709	struct xfs_btree_block	*new;	/* new (root) btree block */
2710	int			nptr;	/* new value for key index, 1 or 2 */
2711	struct xfs_buf		*rbp;	/* right buffer pointer */
2712	struct xfs_btree_block	*right;	/* right btree block */
2713	union xfs_btree_ptr	rptr;
2714	union xfs_btree_ptr	lptr;
2715
2716	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2717	XFS_BTREE_STATS_INC(cur, newroot);
2718
2719	/* initialise our start point from the cursor */
2720	cur->bc_ops->init_ptr_from_cur(cur, &rptr);
2721
2722	/* Allocate the new block. If we can't do it, we're toast. Give up. */
2723	error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
2724	if (error)
2725		goto error0;
2726	if (*stat == 0)
2727		goto out0;
2728	XFS_BTREE_STATS_INC(cur, alloc);
2729
2730	/* Set up the new block. */
2731	error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
2732	if (error)
2733		goto error0;
2734
2735	/* Set the root in the holding structure  increasing the level by 1. */
2736	cur->bc_ops->set_root(cur, &lptr, 1);
2737
2738	/*
2739	 * At the previous root level there are now two blocks: the old root,
2740	 * and the new block generated when it was split.  We don't know which
2741	 * one the cursor is pointing at, so we set up variables "left" and
2742	 * "right" for each case.
2743	 */
2744	block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
2745
2746#ifdef DEBUG
2747	error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
2748	if (error)
2749		goto error0;
2750#endif
2751
2752	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
2753	if (!xfs_btree_ptr_is_null(cur, &rptr)) {
2754		/* Our block is left, pick up the right block. */
2755		lbp = bp;
2756		xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2757		left = block;
2758		error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2759		if (error)
2760			goto error0;
2761		bp = rbp;
2762		nptr = 1;
2763	} else {
2764		/* Our block is right, pick up the left block. */
2765		rbp = bp;
2766		xfs_btree_buf_to_ptr(cur, rbp, &rptr);
2767		right = block;
2768		xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2769		error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2770		if (error)
2771			goto error0;
2772		bp = lbp;
2773		nptr = 2;
2774	}
2775	/* Fill in the new block's btree header and log it. */
2776	xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
2777	xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
2778	ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
2779			!xfs_btree_ptr_is_null(cur, &rptr));
2780
2781	/* Fill in the key data in the new root. */
2782	if (xfs_btree_get_level(left) > 0) {
2783		xfs_btree_copy_keys(cur,
2784				xfs_btree_key_addr(cur, 1, new),
2785				xfs_btree_key_addr(cur, 1, left), 1);
2786		xfs_btree_copy_keys(cur,
2787				xfs_btree_key_addr(cur, 2, new),
2788				xfs_btree_key_addr(cur, 1, right), 1);
2789	} else {
2790		cur->bc_ops->init_key_from_rec(
2791				xfs_btree_key_addr(cur, 1, new),
2792				xfs_btree_rec_addr(cur, 1, left));
2793		cur->bc_ops->init_key_from_rec(
2794				xfs_btree_key_addr(cur, 2, new),
2795				xfs_btree_rec_addr(cur, 1, right));
2796	}
2797	xfs_btree_log_keys(cur, nbp, 1, 2);
2798
2799	/* Fill in the pointer data in the new root. */
2800	xfs_btree_copy_ptrs(cur,
2801		xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
2802	xfs_btree_copy_ptrs(cur,
2803		xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
2804	xfs_btree_log_ptrs(cur, nbp, 1, 2);
2805
2806	/* Fix up the cursor. */
2807	xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
2808	cur->bc_ptrs[cur->bc_nlevels] = nptr;
2809	cur->bc_nlevels++;
2810	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2811	*stat = 1;
2812	return 0;
2813error0:
2814	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2815	return error;
2816out0:
2817	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2818	*stat = 0;
2819	return 0;
2820}
2821
2822STATIC int
2823xfs_btree_make_block_unfull(
2824	struct xfs_btree_cur	*cur,	/* btree cursor */
2825	int			level,	/* btree level */
2826	int			numrecs,/* # of recs in block */
2827	int			*oindex,/* old tree index */
2828	int			*index,	/* new tree index */
2829	union xfs_btree_ptr	*nptr,	/* new btree ptr */
2830	struct xfs_btree_cur	**ncur,	/* new btree cursor */
2831	union xfs_btree_rec	*nrec,	/* new record */
2832	int			*stat)
2833{
2834	union xfs_btree_key	key;	/* new btree key value */
2835	int			error = 0;
2836
2837	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2838	    level == cur->bc_nlevels - 1) {
2839	    	struct xfs_inode *ip = cur->bc_private.b.ip;
2840
2841		if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
2842			/* A root block that can be made bigger. */
2843			xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
2844		} else {
2845			/* A root block that needs replacing */
2846			int	logflags = 0;
2847
2848			error = xfs_btree_new_iroot(cur, &logflags, stat);
2849			if (error || *stat == 0)
2850				return error;
2851
2852			xfs_trans_log_inode(cur->bc_tp, ip, logflags);
2853		}
2854
2855		return 0;
2856	}
2857
2858	/* First, try shifting an entry to the right neighbor. */
2859	error = xfs_btree_rshift(cur, level, stat);
2860	if (error || *stat)
2861		return error;
2862
2863	/* Next, try shifting an entry to the left neighbor. */
2864	error = xfs_btree_lshift(cur, level, stat);
2865	if (error)
2866		return error;
2867
2868	if (*stat) {
2869		*oindex = *index = cur->bc_ptrs[level];
2870		return 0;
2871	}
2872
2873	/*
2874	 * Next, try splitting the current block in half.
2875	 *
2876	 * If this works we have to re-set our variables because we
2877	 * could be in a different block now.
2878	 */
2879	error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
2880	if (error || *stat == 0)
2881		return error;
2882
2883
2884	*index = cur->bc_ptrs[level];
2885	cur->bc_ops->init_rec_from_key(&key, nrec);
2886	return 0;
2887}
2888
2889/*
2890 * Insert one record/level.  Return information to the caller
2891 * allowing the next level up to proceed if necessary.
2892 */
2893STATIC int
2894xfs_btree_insrec(
2895	struct xfs_btree_cur	*cur,	/* btree cursor */
2896	int			level,	/* level to insert record at */
2897	union xfs_btree_ptr	*ptrp,	/* i/o: block number inserted */
2898	union xfs_btree_rec	*recp,	/* i/o: record data inserted */
2899	struct xfs_btree_cur	**curp,	/* output: new cursor replacing cur */
2900	int			*stat)	/* success/failure */
2901{
2902	struct xfs_btree_block	*block;	/* btree block */
2903	struct xfs_buf		*bp;	/* buffer for block */
2904	union xfs_btree_key	key;	/* btree key */
2905	union xfs_btree_ptr	nptr;	/* new block ptr */
2906	struct xfs_btree_cur	*ncur;	/* new btree cursor */
2907	union xfs_btree_rec	nrec;	/* new record count */
2908	int			optr;	/* old key/record index */
2909	int			ptr;	/* key/record index */
2910	int			numrecs;/* number of records */
2911	int			error;	/* error return value */
2912#ifdef DEBUG
2913	int			i;
2914#endif
2915
2916	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2917	XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);
2918
2919	ncur = NULL;
2920
2921	/*
2922	 * If we have an external root pointer, and we've made it to the
2923	 * root level, allocate a new root block and we're done.
2924	 */
2925	if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2926	    (level >= cur->bc_nlevels)) {
2927		error = xfs_btree_new_root(cur, stat);
2928		xfs_btree_set_ptr_null(cur, ptrp);
2929
2930		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2931		return error;
2932	}
2933
2934	/* If we're off the left edge, return failure. */
2935	ptr = cur->bc_ptrs[level];
2936	if (ptr == 0) {
2937		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2938		*stat = 0;
2939		return 0;
2940	}
2941
2942	/* Make a key out of the record data to be inserted, and save it. */
2943	cur->bc_ops->init_key_from_rec(&key, recp);
2944
2945	optr = ptr;
2946
2947	XFS_BTREE_STATS_INC(cur, insrec);
2948
2949	/* Get pointers to the btree buffer and block. */
2950	block = xfs_btree_get_block(cur, level, &bp);
2951	numrecs = xfs_btree_get_numrecs(block);
2952
2953#ifdef DEBUG
2954	error = xfs_btree_check_block(cur, block, level, bp);
2955	if (error)
2956		goto error0;
2957
2958	/* Check that the new entry is being inserted in the right place. */
2959	if (ptr <= numrecs) {
2960		if (level == 0) {
2961			ASSERT(cur->bc_ops->recs_inorder(cur, recp,
2962				xfs_btree_rec_addr(cur, ptr, block)));
2963		} else {
2964			ASSERT(cur->bc_ops->keys_inorder(cur, &key,
2965				xfs_btree_key_addr(cur, ptr, block)));
2966		}
2967	}
2968#endif
2969
2970	/*
2971	 * If the block is full, we can't insert the new entry until we
2972	 * make the block un-full.
2973	 */
2974	xfs_btree_set_ptr_null(cur, &nptr);
2975	if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
2976		error = xfs_btree_make_block_unfull(cur, level, numrecs,
2977					&optr, &ptr, &nptr, &ncur, &nrec, stat);
2978		if (error || *stat == 0)
2979			goto error0;
2980	}
2981
2982	/*
2983	 * The current block may have changed if the block was
2984	 * previously full and we have just made space in it.
2985	 */
2986	block = xfs_btree_get_block(cur, level, &bp);
2987	numrecs = xfs_btree_get_numrecs(block);
2988
2989#ifdef DEBUG
2990	error = xfs_btree_check_block(cur, block, level, bp);
2991	if (error)
2992		return error;
2993#endif
2994
2995	/*
2996	 * At this point we know there's room for our new entry in the block
2997	 * we're pointing at.
2998	 */
2999	XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
3000
3001	if (level > 0) {
3002		/* It's a nonleaf. make a hole in the keys and ptrs */
3003		union xfs_btree_key	*kp;
3004		union xfs_btree_ptr	*pp;
3005
3006		kp = xfs_btree_key_addr(cur, ptr, block);
3007		pp = xfs_btree_ptr_addr(cur, ptr, block);
3008
3009#ifdef DEBUG
3010		for (i = numrecs - ptr; i >= 0; i--) {
3011			error = xfs_btree_check_ptr(cur, pp, i, level);
3012			if (error)
3013				return error;
3014		}
3015#endif
3016
3017		xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
3018		xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
3019
3020#ifdef DEBUG
3021		error = xfs_btree_check_ptr(cur, ptrp, 0, level);
3022		if (error)
3023			goto error0;
3024#endif
3025
3026		/* Now put the new data in, bump numrecs and log it. */
3027		xfs_btree_copy_keys(cur, kp, &key, 1);
3028		xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
3029		numrecs++;
3030		xfs_btree_set_numrecs(block, numrecs);
3031		xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
3032		xfs_btree_log_keys(cur, bp, ptr, numrecs);
3033#ifdef DEBUG
3034		if (ptr < numrecs) {
3035			ASSERT(cur->bc_ops->keys_inorder(cur, kp,
3036				xfs_btree_key_addr(cur, ptr + 1, block)));
3037		}
3038#endif
3039	} else {
3040		/* It's a leaf. make a hole in the records */
3041		union xfs_btree_rec             *rp;
3042
3043		rp = xfs_btree_rec_addr(cur, ptr, block);
3044
3045		xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
3046
3047		/* Now put the new data in, bump numrecs and log it. */
3048		xfs_btree_copy_recs(cur, rp, recp, 1);
3049		xfs_btree_set_numrecs(block, ++numrecs);
3050		xfs_btree_log_recs(cur, bp, ptr, numrecs);
3051#ifdef DEBUG
3052		if (ptr < numrecs) {
3053			ASSERT(cur->bc_ops->recs_inorder(cur, rp,
3054				xfs_btree_rec_addr(cur, ptr + 1, block)));
3055		}
3056#endif
3057	}
3058
3059	/* Log the new number of records in the btree header. */
3060	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3061
3062	/* If we inserted at the start of a block, update the parents' keys. */
3063	if (optr == 1) {
3064		error = xfs_btree_updkey(cur, &key, level + 1);
3065		if (error)
3066			goto error0;
3067	}
3068
3069	/*
3070	 * If we are tracking the last record in the tree and
3071	 * we are at the far right edge of the tree, update it.
3072	 */
3073	if (xfs_btree_is_lastrec(cur, block, level)) {
3074		cur->bc_ops->update_lastrec(cur, block, recp,
3075					    ptr, LASTREC_INSREC);
3076	}
3077
3078	/*
3079	 * Return the new block number, if any.
3080	 * If there is one, give back a record value and a cursor too.
3081	 */
3082	*ptrp = nptr;
3083	if (!xfs_btree_ptr_is_null(cur, &nptr)) {
3084		*recp = nrec;
3085		*curp = ncur;
3086	}
3087
3088	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3089	*stat = 1;
3090	return 0;
3091
3092error0:
3093	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3094	return error;
3095}
3096
3097/*
3098 * Insert the record at the point referenced by cur.
3099 *
3100 * A multi-level split of the tree on insert will invalidate the original
3101 * cursor.  All callers of this function should assume that the cursor is
3102 * no longer valid and revalidate it.
3103 */
3104int
3105xfs_btree_insert(
3106	struct xfs_btree_cur	*cur,
3107	int			*stat)
3108{
3109	int			error;	/* error return value */
3110	int			i;	/* result value, 0 for failure */
3111	int			level;	/* current level number in btree */
3112	union xfs_btree_ptr	nptr;	/* new block number (split result) */
3113	struct xfs_btree_cur	*ncur;	/* new cursor (split result) */
3114	struct xfs_btree_cur	*pcur;	/* previous level's cursor */
3115	union xfs_btree_rec	rec;	/* record to insert */
3116
3117	level = 0;
3118	ncur = NULL;
3119	pcur = cur;
3120
3121	xfs_btree_set_ptr_null(cur, &nptr);
3122	cur->bc_ops->init_rec_from_cur(cur, &rec);
3123
3124	/*
3125	 * Loop going up the tree, starting at the leaf level.
3126	 * Stop when we don't get a split block, that must mean that
3127	 * the insert is finished with this level.
3128	 */
3129	do {
3130		/*
3131		 * Insert nrec/nptr into this level of the tree.
3132		 * Note if we fail, nptr will be null.
3133		 */
3134		error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
3135		if (error) {
3136			if (pcur != cur)
3137				xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
3138			goto error0;
3139		}
3140
3141		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3142		level++;
3143
3144		/*
3145		 * See if the cursor we just used is trash.
3146		 * Can't trash the caller's cursor, but otherwise we should
3147		 * if ncur is a new cursor or we're about to be done.
3148		 */
3149		if (pcur != cur &&
3150		    (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
3151			/* Save the state from the cursor before we trash it */
3152			if (cur->bc_ops->update_cursor)
3153				cur->bc_ops->update_cursor(pcur, cur);
3154			cur->bc_nlevels = pcur->bc_nlevels;
3155			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
3156		}
3157		/* If we got a new cursor, switch to it. */
3158		if (ncur) {
3159			pcur = ncur;
3160			ncur = NULL;
3161		}
3162	} while (!xfs_btree_ptr_is_null(cur, &nptr));
3163
3164	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3165	*stat = i;
3166	return 0;
3167error0:
3168	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3169	return error;
3170}
3171
3172/*
3173 * Try to merge a non-leaf block back into the inode root.
3174 *
3175 * Note: the killroot names comes from the fact that we're effectively
3176 * killing the old root block.  But because we can't just delete the
3177 * inode we have to copy the single block it was pointing to into the
3178 * inode.
3179 */
3180STATIC int
3181xfs_btree_kill_iroot(
3182	struct xfs_btree_cur	*cur)
3183{
3184	int			whichfork = cur->bc_private.b.whichfork;
3185	struct xfs_inode	*ip = cur->bc_private.b.ip;
3186	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
3187	struct xfs_btree_block	*block;
3188	struct xfs_btree_block	*cblock;
3189	union xfs_btree_key	*kp;
3190	union xfs_btree_key	*ckp;
3191	union xfs_btree_ptr	*pp;
3192	union xfs_btree_ptr	*cpp;
3193	struct xfs_buf		*cbp;
3194	int			level;
3195	int			index;
3196	int			numrecs;
3197#ifdef DEBUG
3198	union xfs_btree_ptr	ptr;
3199	int			i;
3200#endif
3201
3202	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3203
3204	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3205	ASSERT(cur->bc_nlevels > 1);
3206
3207	/*
3208	 * Don't deal with the root block needs to be a leaf case.
3209	 * We're just going to turn the thing back into extents anyway.
3210	 */
3211	level = cur->bc_nlevels - 1;
3212	if (level == 1)
3213		goto out0;
3214
3215	/*
3216	 * Give up if the root has multiple children.
3217	 */
3218	block = xfs_btree_get_iroot(cur);
3219	if (xfs_btree_get_numrecs(block) != 1)
3220		goto out0;
3221
3222	cblock = xfs_btree_get_block(cur, level - 1, &cbp);
3223	numrecs = xfs_btree_get_numrecs(cblock);
3224
3225	/*
3226	 * Only do this if the next level will fit.
3227	 * Then the data must be copied up to the inode,
3228	 * instead of freeing the root you free the next level.
3229	 */
3230	if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3231		goto out0;
3232
3233	XFS_BTREE_STATS_INC(cur, killroot);
3234
3235#ifdef DEBUG
3236	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3237	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3238	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3239	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3240#endif
3241
3242	index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3243	if (index) {
3244		xfs_iroot_realloc(cur->bc_private.b.ip, index,
3245				  cur->bc_private.b.whichfork);
3246		block = ifp->if_broot;
3247	}
3248
3249	be16_add_cpu(&block->bb_numrecs, index);
3250	ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3251
3252	kp = xfs_btree_key_addr(cur, 1, block);
3253	ckp = xfs_btree_key_addr(cur, 1, cblock);
3254	xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3255
3256	pp = xfs_btree_ptr_addr(cur, 1, block);
3257	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3258#ifdef DEBUG
3259	for (i = 0; i < numrecs; i++) {
3260		int		error;
3261
3262		error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
3263		if (error) {
3264			XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3265			return error;
3266		}
3267	}
3268#endif
3269	xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3270
3271	cur->bc_ops->free_block(cur, cbp);
3272	XFS_BTREE_STATS_INC(cur, free);
3273
3274	cur->bc_bufs[level - 1] = NULL;
3275	be16_add_cpu(&block->bb_level, -1);
3276	xfs_trans_log_inode(cur->bc_tp, ip,
3277		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
3278	cur->bc_nlevels--;
3279out0:
3280	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3281	return 0;
3282}
3283
3284/*
3285 * Kill the current root node, and replace it with it's only child node.
3286 */
3287STATIC int
3288xfs_btree_kill_root(
3289	struct xfs_btree_cur	*cur,
3290	struct xfs_buf		*bp,
3291	int			level,
3292	union xfs_btree_ptr	*newroot)
3293{
3294	int			error;
3295
3296	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3297	XFS_BTREE_STATS_INC(cur, killroot);
3298
3299	/*
3300	 * Update the root pointer, decreasing the level by 1 and then
3301	 * free the old root.
3302	 */
3303	cur->bc_ops->set_root(cur, newroot, -1);
3304
3305	error = cur->bc_ops->free_block(cur, bp);
3306	if (error) {
3307		XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3308		return error;
3309	}
3310
3311	XFS_BTREE_STATS_INC(cur, free);
3312
3313	cur->bc_bufs[level] = NULL;
3314	cur->bc_ra[level] = 0;
3315	cur->bc_nlevels--;
3316
3317	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3318	return 0;
3319}
3320
3321STATIC int
3322xfs_btree_dec_cursor(
3323	struct xfs_btree_cur	*cur,
3324	int			level,
3325	int			*stat)
3326{
3327	int			error;
3328	int			i;
3329
3330	if (level > 0) {
3331		error = xfs_btree_decrement(cur, level, &i);
3332		if (error)
3333			return error;
3334	}
3335
3336	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3337	*stat = 1;
3338	return 0;
3339}
3340
3341/*
3342 * Single level of the btree record deletion routine.
3343 * Delete record pointed to by cur/level.
3344 * Remove the record from its block then rebalance the tree.
3345 * Return 0 for error, 1 for done, 2 to go on to the next level.
3346 */
3347STATIC int					/* error */
3348xfs_btree_delrec(
3349	struct xfs_btree_cur	*cur,		/* btree cursor */
3350	int			level,		/* level removing record from */
3351	int			*stat)		/* fail/done/go-on */
3352{
3353	struct xfs_btree_block	*block;		/* btree block */
3354	union xfs_btree_ptr	cptr;		/* current block ptr */
3355	struct xfs_buf		*bp;		/* buffer for block */
3356	int			error;		/* error return value */
3357	int			i;		/* loop counter */
3358	union xfs_btree_key	key;		/* storage for keyp */
3359	union xfs_btree_key	*keyp = &key;	/* passed to the next level */
3360	union xfs_btree_ptr	lptr;		/* left sibling block ptr */
3361	struct xfs_buf		*lbp;		/* left buffer pointer */
3362	struct xfs_btree_block	*left;		/* left btree block */
3363	int			lrecs = 0;	/* left record count */
3364	int			ptr;		/* key/record index */
3365	union xfs_btree_ptr	rptr;		/* right sibling block ptr */
3366	struct xfs_buf		*rbp;		/* right buffer pointer */
3367	struct xfs_btree_block	*right;		/* right btree block */
3368	struct xfs_btree_block	*rrblock;	/* right-right btree block */
3369	struct xfs_buf		*rrbp;		/* right-right buffer pointer */
3370	int			rrecs = 0;	/* right record count */
3371	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
3372	int			numrecs;	/* temporary numrec count */
3373
3374	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3375	XFS_BTREE_TRACE_ARGI(cur, level);
3376
3377	tcur = NULL;
3378
3379	/* Get the index of the entry being deleted, check for nothing there. */
3380	ptr = cur->bc_ptrs[level];
3381	if (ptr == 0) {
3382		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3383		*stat = 0;
3384		return 0;
3385	}
3386
3387	/* Get the buffer & block containing the record or key/ptr. */
3388	block = xfs_btree_get_block(cur, level, &bp);
3389	numrecs = xfs_btree_get_numrecs(block);
3390
3391#ifdef DEBUG
3392	error = xfs_btree_check_block(cur, block, level, bp);
3393	if (error)
3394		goto error0;
3395#endif
3396
3397	/* Fail if we're off the end of the block. */
3398	if (ptr > numrecs) {
3399		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3400		*stat = 0;
3401		return 0;
3402	}
3403
3404	XFS_BTREE_STATS_INC(cur, delrec);
3405	XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3406
3407	/* Excise the entries being deleted. */
3408	if (level > 0) {
3409		/* It's a nonleaf. operate on keys and ptrs */
3410		union xfs_btree_key	*lkp;
3411		union xfs_btree_ptr	*lpp;
3412
3413		lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3414		lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3415
3416#ifdef DEBUG
3417		for (i = 0; i < numrecs - ptr; i++) {
3418			error = xfs_btree_check_ptr(cur, lpp, i, level);
3419			if (error)
3420				goto error0;
3421		}
3422#endif
3423
3424		if (ptr < numrecs) {
3425			xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3426			xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3427			xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3428			xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3429		}
3430
3431		/*
3432		 * If it's the first record in the block, we'll need to pass a
3433		 * key up to the next level (updkey).
3434		 */
3435		if (ptr == 1)
3436			keyp = xfs_btree_key_addr(cur, 1, block);
3437	} else {
3438		/* It's a leaf. operate on records */
3439		if (ptr < numrecs) {
3440			xfs_btree_shift_recs(cur,
3441				xfs_btree_rec_addr(cur, ptr + 1, block),
3442				-1, numrecs - ptr);
3443			xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3444		}
3445
3446		/*
3447		 * If it's the first record in the block, we'll need a key
3448		 * structure to pass up to the next level (updkey).
3449		 */
3450		if (ptr == 1) {
3451			cur->bc_ops->init_key_from_rec(&key,
3452					xfs_btree_rec_addr(cur, 1, block));
3453			keyp = &key;
3454		}
3455	}
3456
3457	/*
3458	 * Decrement and log the number of entries in the block.
3459	 */
3460	xfs_btree_set_numrecs(block, --numrecs);
3461	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3462
3463	/*
3464	 * If we are tracking the last record in the tree and
3465	 * we are at the far right edge of the tree, update it.
3466	 */
3467	if (xfs_btree_is_lastrec(cur, block, level)) {
3468		cur->bc_ops->update_lastrec(cur, block, NULL,
3469					    ptr, LASTREC_DELREC);
3470	}
3471
3472	/*
3473	 * We're at the root level.  First, shrink the root block in-memory.
3474	 * Try to get rid of the next level down.  If we can't then there's
3475	 * nothing left to do.
3476	 */
3477	if (level == cur->bc_nlevels - 1) {
3478		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3479			xfs_iroot_realloc(cur->bc_private.b.ip, -1,
3480					  cur->bc_private.b.whichfork);
3481
3482			error = xfs_btree_kill_iroot(cur);
3483			if (error)
3484				goto error0;
3485
3486			error = xfs_btree_dec_cursor(cur, level, stat);
3487			if (error)
3488				goto error0;
3489			*stat = 1;
3490			return 0;
3491		}
3492
3493		/*
3494		 * If this is the root level, and there's only one entry left,
3495		 * and it's NOT the leaf level, then we can get rid of this
3496		 * level.
3497		 */
3498		if (numrecs == 1 && level > 0) {
3499			union xfs_btree_ptr	*pp;
3500			/*
3501			 * pp is still set to the first pointer in the block.
3502			 * Make it the new root of the btree.
3503			 */
3504			pp = xfs_btree_ptr_addr(cur, 1, block);
3505			error = xfs_btree_kill_root(cur, bp, level, pp);
3506			if (error)
3507				goto error0;
3508		} else if (level > 0) {
3509			error = xfs_btree_dec_cursor(cur, level, stat);
3510			if (error)
3511				goto error0;
3512		}
3513		*stat = 1;
3514		return 0;
3515	}
3516
3517	/*
3518	 * If we deleted the leftmost entry in the block, update the
3519	 * key values above us in the tree.
3520	 */
3521	if (ptr == 1) {
3522		error = xfs_btree_updkey(cur, keyp, level + 1);
3523		if (error)
3524			goto error0;
3525	}
3526
3527	/*
3528	 * If the number of records remaining in the block is at least
3529	 * the minimum, we're done.
3530	 */
3531	if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3532		error = xfs_btree_dec_cursor(cur, level, stat);
3533		if (error)
3534			goto error0;
3535		return 0;
3536	}
3537
3538	/*
3539	 * Otherwise, we have to move some records around to keep the
3540	 * tree balanced.  Look at the left and right sibling blocks to
3541	 * see if we can re-balance by moving only one record.
3542	 */
3543	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3544	xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3545
3546	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3547		/*
3548		 * One child of root, need to get a chance to copy its contents
3549		 * into the root and delete it. Can't go up to next level,
3550		 * there's nothing to delete there.
3551		 */
3552		if (xfs_btree_ptr_is_null(cur, &rptr) &&
3553		    xfs_btree_ptr_is_null(cur, &lptr) &&
3554		    level == cur->bc_nlevels - 2) {
3555			error = xfs_btree_kill_iroot(cur);
3556			if (!error)
3557				error = xfs_btree_dec_cursor(cur, level, stat);
3558			if (error)
3559				goto error0;
3560			return 0;
3561		}
3562	}
3563
3564	ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3565	       !xfs_btree_ptr_is_null(cur, &lptr));
3566
3567	/*
3568	 * Duplicate the cursor so our btree manipulations here won't
3569	 * disrupt the next level up.
3570	 */
3571	error = xfs_btree_dup_cursor(cur, &tcur);
3572	if (error)
3573		goto error0;
3574
3575	/*
3576	 * If there's a right sibling, see if it's ok to shift an entry
3577	 * out of it.
3578	 */
3579	if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3580		/*
3581		 * Move the temp cursor to the last entry in the next block.
3582		 * Actually any entry but the first would suffice.
3583		 */
3584		i = xfs_btree_lastrec(tcur, level);
3585		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3586
3587		error = xfs_btree_increment(tcur, level, &i);
3588		if (error)
3589			goto error0;
3590		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3591
3592		i = xfs_btree_lastrec(tcur, level);
3593		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3594
3595		/* Grab a pointer to the block. */
3596		right = xfs_btree_get_block(tcur, level, &rbp);
3597#ifdef DEBUG
3598		error = xfs_btree_check_block(tcur, right, level, rbp);
3599		if (error)
3600			goto error0;
3601#endif
3602		/* Grab the current block number, for future use. */
3603		xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3604
3605		/*
3606		 * If right block is full enough so that removing one entry
3607		 * won't make it too empty, and left-shifting an entry out
3608		 * of right to us works, we're done.
3609		 */
3610		if (xfs_btree_get_numrecs(right) - 1 >=
3611		    cur->bc_ops->get_minrecs(tcur, level)) {
3612			error = xfs_btree_lshift(tcur, level, &i);
3613			if (error)
3614				goto error0;
3615			if (i) {
3616				ASSERT(xfs_btree_get_numrecs(block) >=
3617				       cur->bc_ops->get_minrecs(tcur, level));
3618
3619				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3620				tcur = NULL;
3621
3622				error = xfs_btree_dec_cursor(cur, level, stat);
3623				if (error)
3624					goto error0;
3625				return 0;
3626			}
3627		}
3628
3629		/*
3630		 * Otherwise, grab the number of records in right for
3631		 * future reference, and fix up the temp cursor to point
3632		 * to our block again (last record).
3633		 */
3634		rrecs = xfs_btree_get_numrecs(right);
3635		if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3636			i = xfs_btree_firstrec(tcur, level);
3637			XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3638
3639			error = xfs_btree_decrement(tcur, level, &i);
3640			if (error)
3641				goto error0;
3642			XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3643		}
3644	}
3645
3646	/*
3647	 * If there's a left sibling, see if it's ok to shift an entry
3648	 * out of it.
3649	 */
3650	if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3651		/*
3652		 * Move the temp cursor to the first entry in the
3653		 * previous block.
3654		 */
3655		i = xfs_btree_firstrec(tcur, level);
3656		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3657
3658		error = xfs_btree_decrement(tcur, level, &i);
3659		if (error)
3660			goto error0;
3661		i = xfs_btree_firstrec(tcur, level);
3662		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3663
3664		/* Grab a pointer to the block. */
3665		left = xfs_btree_get_block(tcur, level, &lbp);
3666#ifdef DEBUG
3667		error = xfs_btree_check_block(cur, left, level, lbp);
3668		if (error)
3669			goto error0;
3670#endif
3671		/* Grab the current block number, for future use. */
3672		xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
3673
3674		/*
3675		 * If left block is full enough so that removing one entry
3676		 * won't make it too empty, and right-shifting an entry out
3677		 * of left to us works, we're done.
3678		 */
3679		if (xfs_btree_get_numrecs(left) - 1 >=
3680		    cur->bc_ops->get_minrecs(tcur, level)) {
3681			error = xfs_btree_rshift(tcur, level, &i);
3682			if (error)
3683				goto error0;
3684			if (i) {
3685				ASSERT(xfs_btree_get_numrecs(block) >=
3686				       cur->bc_ops->get_minrecs(tcur, level));
3687				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3688				tcur = NULL;
3689				if (level == 0)
3690					cur->bc_ptrs[0]++;
3691				XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3692				*stat = 1;
3693				return 0;
3694			}
3695		}
3696
3697		/*
3698		 * Otherwise, grab the number of records in right for
3699		 * future reference.
3700		 */
3701		lrecs = xfs_btree_get_numrecs(left);
3702	}
3703
3704	/* Delete the temp cursor, we're done with it. */
3705	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3706	tcur = NULL;
3707
3708	/* If here, we need to do a join to keep the tree balanced. */
3709	ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
3710
3711	if (!xfs_btree_ptr_is_null(cur, &lptr) &&
3712	    lrecs + xfs_btree_get_numrecs(block) <=
3713			cur->bc_ops->get_maxrecs(cur, level)) {
3714		/*
3715		 * Set "right" to be the starting block,
3716		 * "left" to be the left neighbor.
3717		 */
3718		rptr = cptr;
3719		right = block;
3720		rbp = bp;
3721		error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
3722		if (error)
3723			goto error0;
3724
3725	/*
3726	 * If that won't work, see if we can join with the right neighbor block.
3727	 */
3728	} else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
3729		   rrecs + xfs_btree_get_numrecs(block) <=
3730			cur->bc_ops->get_maxrecs(cur, level)) {
3731		/*
3732		 * Set "left" to be the starting block,
3733		 * "right" to be the right neighbor.
3734		 */
3735		lptr = cptr;
3736		left = block;
3737		lbp = bp;
3738		error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
3739		if (error)
3740			goto error0;
3741
3742	/*
3743	 * Otherwise, we can't fix the imbalance.
3744	 * Just return.  This is probably a logic error, but it's not fatal.
3745	 */
3746	} else {
3747		error = xfs_btree_dec_cursor(cur, level, stat);
3748		if (error)
3749			goto error0;
3750		return 0;
3751	}
3752
3753	rrecs = xfs_btree_get_numrecs(right);
3754	lrecs = xfs_btree_get_numrecs(left);
3755
3756	/*
3757	 * We're now going to join "left" and "right" by moving all the stuff
3758	 * in "right" to "left" and deleting "right".
3759	 */
3760	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
3761	if (level > 0) {
3762		/* It's a non-leaf.  Move keys and pointers. */
3763		union xfs_btree_key	*lkp;	/* left btree key */
3764		union xfs_btree_ptr	*lpp;	/* left address pointer */
3765		union xfs_btree_key	*rkp;	/* right btree key */
3766		union xfs_btree_ptr	*rpp;	/* right address pointer */
3767
3768		lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
3769		lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
3770		rkp = xfs_btree_key_addr(cur, 1, right);
3771		rpp = xfs_btree_ptr_addr(cur, 1, right);
3772#ifdef DEBUG
3773		for (i = 1; i < rrecs; i++) {
3774			error = xfs_btree_check_ptr(cur, rpp, i, level);
3775			if (error)
3776				goto error0;
3777		}
3778#endif
3779		xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
3780		xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
3781
3782		xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
3783		xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
3784	} else {
3785		/* It's a leaf.  Move records.  */
3786		union xfs_btree_rec	*lrp;	/* left record pointer */
3787		union xfs_btree_rec	*rrp;	/* right record pointer */
3788
3789		lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
3790		rrp = xfs_btree_rec_addr(cur, 1, right);
3791
3792		xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
3793		xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
3794	}
3795
3796	XFS_BTREE_STATS_INC(cur, join);
3797
3798	/*
3799	 * Fix up the number of records and right block pointer in the
3800	 * surviving block, and log it.
3801	 */
3802	xfs_btree_set_numrecs(left, lrecs + rrecs);
3803	xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
3804	xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
3805	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
3806
3807	/* If there is a right sibling, point it to the remaining block. */
3808	xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
3809	if (!xfs_btree_ptr_is_null(cur, &cptr)) {
3810		error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
3811		if (error)
3812			goto error0;
3813		xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
3814		xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
3815	}
3816
3817	/* Free the deleted block. */
3818	error = cur->bc_ops->free_block(cur, rbp);
3819	if (error)
3820		goto error0;
3821	XFS_BTREE_STATS_INC(cur, free);
3822
3823	/*
3824	 * If we joined with the left neighbor, set the buffer in the
3825	 * cursor to the left block, and fix up the index.
3826	 */
3827	if (bp != lbp) {
3828		cur->bc_bufs[level] = lbp;
3829		cur->bc_ptrs[level] += lrecs;
3830		cur->bc_ra[level] = 0;
3831	}
3832	/*
3833	 * If we joined with the right neighbor and there's a level above
3834	 * us, increment the cursor at that level.
3835	 */
3836	else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
3837		   (level + 1 < cur->bc_nlevels)) {
3838		error = xfs_btree_increment(cur, level + 1, &i);
3839		if (error)
3840			goto error0;
3841	}
3842
3843	/*
3844	 * Readjust the ptr at this level if it's not a leaf, since it's
3845	 * still pointing at the deletion point, which makes the cursor
3846	 * inconsistent.  If this makes the ptr 0, the caller fixes it up.
3847	 * We can't use decrement because it would change the next level up.
3848	 */
3849	if (level > 0)
3850		cur->bc_ptrs[level]--;
3851
3852	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3853	/* Return value means the next level up has something to do. */
3854	*stat = 2;
3855	return 0;
3856
3857error0:
3858	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3859	if (tcur)
3860		xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
3861	return error;
3862}
3863
3864/*
3865 * Delete the record pointed to by cur.
3866 * The cursor refers to the place where the record was (could be inserted)
3867 * when the operation returns.
3868 */
3869int					/* error */
3870xfs_btree_delete(
3871	struct xfs_btree_cur	*cur,
3872	int			*stat)	/* success/failure */
3873{
3874	int			error;	/* error return value */
3875	int			level;
3876	int			i;
3877
3878	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3879
3880	/*
3881	 * Go up the tree, starting at leaf level.
3882	 *
3883	 * If 2 is returned then a join was done; go to the next level.
3884	 * Otherwise we are done.
3885	 */
3886	for (level = 0, i = 2; i == 2; level++) {
3887		error = xfs_btree_delrec(cur, level, &i);
3888		if (error)
3889			goto error0;
3890	}
3891
3892	if (i == 0) {
3893		for (level = 1; level < cur->bc_nlevels; level++) {
3894			if (cur->bc_ptrs[level] == 0) {
3895				error = xfs_btree_decrement(cur, level, &i);
3896				if (error)
3897					goto error0;
3898				break;
3899			}
3900		}
3901	}
3902
3903	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3904	*stat = i;
3905	return 0;
3906error0:
3907	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3908	return error;
3909}
3910
3911/*
3912 * Get the data from the pointed-to record.
3913 */
3914int					/* error */
3915xfs_btree_get_rec(
3916	struct xfs_btree_cur	*cur,	/* btree cursor */
3917	union xfs_btree_rec	**recp,	/* output: btree record */
3918	int			*stat)	/* output: success/failure */
3919{
3920	struct xfs_btree_block	*block;	/* btree block */
3921	struct xfs_buf		*bp;	/* buffer pointer */
3922	int			ptr;	/* record number */
3923#ifdef DEBUG
3924	int			error;	/* error return value */
3925#endif
3926
3927	ptr = cur->bc_ptrs[0];
3928	block = xfs_btree_get_block(cur, 0, &bp);
3929
3930#ifdef DEBUG
3931	error = xfs_btree_check_block(cur, block, 0, bp);
3932	if (error)
3933		return error;
3934#endif
3935
3936	/*
3937	 * Off the right end or left end, return failure.
3938	 */
3939	if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
3940		*stat = 0;
3941		return 0;
3942	}
3943
3944	/*
3945	 * Point to the record and extract its data.
3946	 */
3947	*recp = xfs_btree_rec_addr(cur, ptr, block);
3948	*stat = 1;
3949	return 0;
3950}
3951
3952/*
3953 * Change the owner of a btree.
3954 *
3955 * The mechanism we use here is ordered buffer logging. Because we don't know
3956 * how many buffers were are going to need to modify, we don't really want to
3957 * have to make transaction reservations for the worst case of every buffer in a
3958 * full size btree as that may be more space that we can fit in the log....
3959 *
3960 * We do the btree walk in the most optimal manner possible - we have sibling
3961 * pointers so we can just walk all the blocks on each level from left to right
3962 * in a single pass, and then move to the next level and do the same. We can
3963 * also do readahead on the sibling pointers to get IO moving more quickly,
3964 * though for slow disks this is unlikely to make much difference to performance
3965 * as the amount of CPU work we have to do before moving to the next block is
3966 * relatively small.
3967 *
3968 * For each btree block that we load, modify the owner appropriately, set the
3969 * buffer as an ordered buffer and log it appropriately. We need to ensure that
3970 * we mark the region we change dirty so that if the buffer is relogged in
3971 * a subsequent transaction the changes we make here as an ordered buffer are
3972 * correctly relogged in that transaction.  If we are in recovery context, then
3973 * just queue the modified buffer as delayed write buffer so the transaction
3974 * recovery completion writes the changes to disk.
3975 */
3976static int
3977xfs_btree_block_change_owner(
3978	struct xfs_btree_cur	*cur,
3979	int			level,
3980	__uint64_t		new_owner,
3981	struct list_head	*buffer_list)
3982{
3983	struct xfs_btree_block	*block;
3984	struct xfs_buf		*bp;
3985	union xfs_btree_ptr     rptr;
3986
3987	/* do right sibling readahead */
3988	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
3989
3990	/* modify the owner */
3991	block = xfs_btree_get_block(cur, level, &bp);
3992	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
3993		block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
3994	else
3995		block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
3996
3997	/*
3998	 * If the block is a root block hosted in an inode, we might not have a
3999	 * buffer pointer here and we shouldn't attempt to log the change as the
4000	 * information is already held in the inode and discarded when the root
4001	 * block is formatted into the on-disk inode fork. We still change it,
4002	 * though, so everything is consistent in memory.
4003	 */
4004	if (bp) {
4005		if (cur->bc_tp) {
4006			xfs_trans_ordered_buf(cur->bc_tp, bp);
4007			xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
4008		} else {
4009			xfs_buf_delwri_queue(bp, buffer_list);
4010		}
4011	} else {
4012		ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
4013		ASSERT(level == cur->bc_nlevels - 1);
4014	}
4015
4016	/* now read rh sibling block for next iteration */
4017	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4018	if (xfs_btree_ptr_is_null(cur, &rptr))
4019		return -ENOENT;
4020
4021	return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
4022}
4023
4024int
4025xfs_btree_change_owner(
4026	struct xfs_btree_cur	*cur,
4027	__uint64_t		new_owner,
4028	struct list_head	*buffer_list)
4029{
4030	union xfs_btree_ptr     lptr;
4031	int			level;
4032	struct xfs_btree_block	*block = NULL;
4033	int			error = 0;
4034
4035	cur->bc_ops->init_ptr_from_cur(cur, &lptr);
4036
4037	/* for each level */
4038	for (level = cur->bc_nlevels - 1; level >= 0; level--) {
4039		/* grab the left hand block */
4040		error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
4041		if (error)
4042			return error;
4043
4044		/* readahead the left most block for the next level down */
4045		if (level > 0) {
4046			union xfs_btree_ptr     *ptr;
4047
4048			ptr = xfs_btree_ptr_addr(cur, 1, block);
4049			xfs_btree_readahead_ptr(cur, ptr, 1);
4050
4051			/* save for the next iteration of the loop */
4052			lptr = *ptr;
4053		}
4054
4055		/* for each buffer in the level */
4056		do {
4057			error = xfs_btree_block_change_owner(cur, level,
4058							     new_owner,
4059							     buffer_list);
4060		} while (!error);
4061
4062		if (error != -ENOENT)
4063			return error;
4064	}
4065
4066	return 0;
4067}
4068