1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_shared.h"
21#include "xfs_format.h"
22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h"
24#include "xfs_bit.h"
25#include "xfs_sb.h"
26#include "xfs_mount.h"
27#include "xfs_da_format.h"
28#include "xfs_da_btree.h"
29#include "xfs_inode.h"
30#include "xfs_trans.h"
31#include "xfs_log.h"
32#include "xfs_log_priv.h"
33#include "xfs_log_recover.h"
34#include "xfs_inode_item.h"
35#include "xfs_extfree_item.h"
36#include "xfs_trans_priv.h"
37#include "xfs_alloc.h"
38#include "xfs_ialloc.h"
39#include "xfs_quota.h"
40#include "xfs_cksum.h"
41#include "xfs_trace.h"
42#include "xfs_icache.h"
43#include "xfs_bmap_btree.h"
44#include "xfs_error.h"
45#include "xfs_dir2.h"
46
47#define BLK_AVG(blk1, blk2)	((blk1+blk2) >> 1)
48
49STATIC int
50xlog_find_zeroed(
51	struct xlog	*,
52	xfs_daddr_t	*);
53STATIC int
54xlog_clear_stale_blocks(
55	struct xlog	*,
56	xfs_lsn_t);
57#if defined(DEBUG)
58STATIC void
59xlog_recover_check_summary(
60	struct xlog *);
61#else
62#define	xlog_recover_check_summary(log)
63#endif
64
65/*
66 * This structure is used during recovery to record the buf log items which
67 * have been canceled and should not be replayed.
68 */
69struct xfs_buf_cancel {
70	xfs_daddr_t		bc_blkno;
71	uint			bc_len;
72	int			bc_refcount;
73	struct list_head	bc_list;
74};
75
76/*
77 * Sector aligned buffer routines for buffer create/read/write/access
78 */
79
80/*
81 * Verify the given count of basic blocks is valid number of blocks
82 * to specify for an operation involving the given XFS log buffer.
83 * Returns nonzero if the count is valid, 0 otherwise.
84 */
85
86static inline int
87xlog_buf_bbcount_valid(
88	struct xlog	*log,
89	int		bbcount)
90{
91	return bbcount > 0 && bbcount <= log->l_logBBsize;
92}
93
94/*
95 * Allocate a buffer to hold log data.  The buffer needs to be able
96 * to map to a range of nbblks basic blocks at any valid (basic
97 * block) offset within the log.
98 */
99STATIC xfs_buf_t *
100xlog_get_bp(
101	struct xlog	*log,
102	int		nbblks)
103{
104	struct xfs_buf	*bp;
105
106	if (!xlog_buf_bbcount_valid(log, nbblks)) {
107		xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
108			nbblks);
109		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
110		return NULL;
111	}
112
113	/*
114	 * We do log I/O in units of log sectors (a power-of-2
115	 * multiple of the basic block size), so we round up the
116	 * requested size to accommodate the basic blocks required
117	 * for complete log sectors.
118	 *
119	 * In addition, the buffer may be used for a non-sector-
120	 * aligned block offset, in which case an I/O of the
121	 * requested size could extend beyond the end of the
122	 * buffer.  If the requested size is only 1 basic block it
123	 * will never straddle a sector boundary, so this won't be
124	 * an issue.  Nor will this be a problem if the log I/O is
125	 * done in basic blocks (sector size 1).  But otherwise we
126	 * extend the buffer by one extra log sector to ensure
127	 * there's space to accommodate this possibility.
128	 */
129	if (nbblks > 1 && log->l_sectBBsize > 1)
130		nbblks += log->l_sectBBsize;
131	nbblks = round_up(nbblks, log->l_sectBBsize);
132
133	bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0);
134	if (bp)
135		xfs_buf_unlock(bp);
136	return bp;
137}
138
139STATIC void
140xlog_put_bp(
141	xfs_buf_t	*bp)
142{
143	xfs_buf_free(bp);
144}
145
146/*
147 * Return the address of the start of the given block number's data
148 * in a log buffer.  The buffer covers a log sector-aligned region.
149 */
150STATIC xfs_caddr_t
151xlog_align(
152	struct xlog	*log,
153	xfs_daddr_t	blk_no,
154	int		nbblks,
155	struct xfs_buf	*bp)
156{
157	xfs_daddr_t	offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
158
159	ASSERT(offset + nbblks <= bp->b_length);
160	return bp->b_addr + BBTOB(offset);
161}
162
163
164/*
165 * nbblks should be uint, but oh well.  Just want to catch that 32-bit length.
166 */
167STATIC int
168xlog_bread_noalign(
169	struct xlog	*log,
170	xfs_daddr_t	blk_no,
171	int		nbblks,
172	struct xfs_buf	*bp)
173{
174	int		error;
175
176	if (!xlog_buf_bbcount_valid(log, nbblks)) {
177		xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
178			nbblks);
179		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
180		return -EFSCORRUPTED;
181	}
182
183	blk_no = round_down(blk_no, log->l_sectBBsize);
184	nbblks = round_up(nbblks, log->l_sectBBsize);
185
186	ASSERT(nbblks > 0);
187	ASSERT(nbblks <= bp->b_length);
188
189	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
190	XFS_BUF_READ(bp);
191	bp->b_io_length = nbblks;
192	bp->b_error = 0;
193
194	error = xfs_buf_submit_wait(bp);
195	if (error && !XFS_FORCED_SHUTDOWN(log->l_mp))
196		xfs_buf_ioerror_alert(bp, __func__);
197	return error;
198}
199
200STATIC int
201xlog_bread(
202	struct xlog	*log,
203	xfs_daddr_t	blk_no,
204	int		nbblks,
205	struct xfs_buf	*bp,
206	xfs_caddr_t	*offset)
207{
208	int		error;
209
210	error = xlog_bread_noalign(log, blk_no, nbblks, bp);
211	if (error)
212		return error;
213
214	*offset = xlog_align(log, blk_no, nbblks, bp);
215	return 0;
216}
217
218/*
219 * Read at an offset into the buffer. Returns with the buffer in it's original
220 * state regardless of the result of the read.
221 */
222STATIC int
223xlog_bread_offset(
224	struct xlog	*log,
225	xfs_daddr_t	blk_no,		/* block to read from */
226	int		nbblks,		/* blocks to read */
227	struct xfs_buf	*bp,
228	xfs_caddr_t	offset)
229{
230	xfs_caddr_t	orig_offset = bp->b_addr;
231	int		orig_len = BBTOB(bp->b_length);
232	int		error, error2;
233
234	error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
235	if (error)
236		return error;
237
238	error = xlog_bread_noalign(log, blk_no, nbblks, bp);
239
240	/* must reset buffer pointer even on error */
241	error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len);
242	if (error)
243		return error;
244	return error2;
245}
246
247/*
248 * Write out the buffer at the given block for the given number of blocks.
249 * The buffer is kept locked across the write and is returned locked.
250 * This can only be used for synchronous log writes.
251 */
252STATIC int
253xlog_bwrite(
254	struct xlog	*log,
255	xfs_daddr_t	blk_no,
256	int		nbblks,
257	struct xfs_buf	*bp)
258{
259	int		error;
260
261	if (!xlog_buf_bbcount_valid(log, nbblks)) {
262		xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
263			nbblks);
264		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
265		return -EFSCORRUPTED;
266	}
267
268	blk_no = round_down(blk_no, log->l_sectBBsize);
269	nbblks = round_up(nbblks, log->l_sectBBsize);
270
271	ASSERT(nbblks > 0);
272	ASSERT(nbblks <= bp->b_length);
273
274	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
275	XFS_BUF_ZEROFLAGS(bp);
276	xfs_buf_hold(bp);
277	xfs_buf_lock(bp);
278	bp->b_io_length = nbblks;
279	bp->b_error = 0;
280
281	error = xfs_bwrite(bp);
282	if (error)
283		xfs_buf_ioerror_alert(bp, __func__);
284	xfs_buf_relse(bp);
285	return error;
286}
287
288#ifdef DEBUG
289/*
290 * dump debug superblock and log record information
291 */
292STATIC void
293xlog_header_check_dump(
294	xfs_mount_t		*mp,
295	xlog_rec_header_t	*head)
296{
297	xfs_debug(mp, "%s:  SB : uuid = %pU, fmt = %d",
298		__func__, &mp->m_sb.sb_uuid, XLOG_FMT);
299	xfs_debug(mp, "    log : uuid = %pU, fmt = %d",
300		&head->h_fs_uuid, be32_to_cpu(head->h_fmt));
301}
302#else
303#define xlog_header_check_dump(mp, head)
304#endif
305
306/*
307 * check log record header for recovery
308 */
309STATIC int
310xlog_header_check_recover(
311	xfs_mount_t		*mp,
312	xlog_rec_header_t	*head)
313{
314	ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
315
316	/*
317	 * IRIX doesn't write the h_fmt field and leaves it zeroed
318	 * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
319	 * a dirty log created in IRIX.
320	 */
321	if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) {
322		xfs_warn(mp,
323	"dirty log written in incompatible format - can't recover");
324		xlog_header_check_dump(mp, head);
325		XFS_ERROR_REPORT("xlog_header_check_recover(1)",
326				 XFS_ERRLEVEL_HIGH, mp);
327		return -EFSCORRUPTED;
328	} else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
329		xfs_warn(mp,
330	"dirty log entry has mismatched uuid - can't recover");
331		xlog_header_check_dump(mp, head);
332		XFS_ERROR_REPORT("xlog_header_check_recover(2)",
333				 XFS_ERRLEVEL_HIGH, mp);
334		return -EFSCORRUPTED;
335	}
336	return 0;
337}
338
339/*
340 * read the head block of the log and check the header
341 */
342STATIC int
343xlog_header_check_mount(
344	xfs_mount_t		*mp,
345	xlog_rec_header_t	*head)
346{
347	ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
348
349	if (uuid_is_nil(&head->h_fs_uuid)) {
350		/*
351		 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
352		 * h_fs_uuid is nil, we assume this log was last mounted
353		 * by IRIX and continue.
354		 */
355		xfs_warn(mp, "nil uuid in log - IRIX style log");
356	} else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
357		xfs_warn(mp, "log has mismatched uuid - can't recover");
358		xlog_header_check_dump(mp, head);
359		XFS_ERROR_REPORT("xlog_header_check_mount",
360				 XFS_ERRLEVEL_HIGH, mp);
361		return -EFSCORRUPTED;
362	}
363	return 0;
364}
365
366STATIC void
367xlog_recover_iodone(
368	struct xfs_buf	*bp)
369{
370	if (bp->b_error) {
371		/*
372		 * We're not going to bother about retrying
373		 * this during recovery. One strike!
374		 */
375		if (!XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
376			xfs_buf_ioerror_alert(bp, __func__);
377			xfs_force_shutdown(bp->b_target->bt_mount,
378						SHUTDOWN_META_IO_ERROR);
379		}
380	}
381	bp->b_iodone = NULL;
382	xfs_buf_ioend(bp);
383}
384
385/*
386 * This routine finds (to an approximation) the first block in the physical
387 * log which contains the given cycle.  It uses a binary search algorithm.
388 * Note that the algorithm can not be perfect because the disk will not
389 * necessarily be perfect.
390 */
391STATIC int
392xlog_find_cycle_start(
393	struct xlog	*log,
394	struct xfs_buf	*bp,
395	xfs_daddr_t	first_blk,
396	xfs_daddr_t	*last_blk,
397	uint		cycle)
398{
399	xfs_caddr_t	offset;
400	xfs_daddr_t	mid_blk;
401	xfs_daddr_t	end_blk;
402	uint		mid_cycle;
403	int		error;
404
405	end_blk = *last_blk;
406	mid_blk = BLK_AVG(first_blk, end_blk);
407	while (mid_blk != first_blk && mid_blk != end_blk) {
408		error = xlog_bread(log, mid_blk, 1, bp, &offset);
409		if (error)
410			return error;
411		mid_cycle = xlog_get_cycle(offset);
412		if (mid_cycle == cycle)
413			end_blk = mid_blk;   /* last_half_cycle == mid_cycle */
414		else
415			first_blk = mid_blk; /* first_half_cycle == mid_cycle */
416		mid_blk = BLK_AVG(first_blk, end_blk);
417	}
418	ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
419	       (mid_blk == end_blk && mid_blk-1 == first_blk));
420
421	*last_blk = end_blk;
422
423	return 0;
424}
425
426/*
427 * Check that a range of blocks does not contain stop_on_cycle_no.
428 * Fill in *new_blk with the block offset where such a block is
429 * found, or with -1 (an invalid block number) if there is no such
430 * block in the range.  The scan needs to occur from front to back
431 * and the pointer into the region must be updated since a later
432 * routine will need to perform another test.
433 */
434STATIC int
435xlog_find_verify_cycle(
436	struct xlog	*log,
437	xfs_daddr_t	start_blk,
438	int		nbblks,
439	uint		stop_on_cycle_no,
440	xfs_daddr_t	*new_blk)
441{
442	xfs_daddr_t	i, j;
443	uint		cycle;
444	xfs_buf_t	*bp;
445	xfs_daddr_t	bufblks;
446	xfs_caddr_t	buf = NULL;
447	int		error = 0;
448
449	/*
450	 * Greedily allocate a buffer big enough to handle the full
451	 * range of basic blocks we'll be examining.  If that fails,
452	 * try a smaller size.  We need to be able to read at least
453	 * a log sector, or we're out of luck.
454	 */
455	bufblks = 1 << ffs(nbblks);
456	while (bufblks > log->l_logBBsize)
457		bufblks >>= 1;
458	while (!(bp = xlog_get_bp(log, bufblks))) {
459		bufblks >>= 1;
460		if (bufblks < log->l_sectBBsize)
461			return -ENOMEM;
462	}
463
464	for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
465		int	bcount;
466
467		bcount = min(bufblks, (start_blk + nbblks - i));
468
469		error = xlog_bread(log, i, bcount, bp, &buf);
470		if (error)
471			goto out;
472
473		for (j = 0; j < bcount; j++) {
474			cycle = xlog_get_cycle(buf);
475			if (cycle == stop_on_cycle_no) {
476				*new_blk = i+j;
477				goto out;
478			}
479
480			buf += BBSIZE;
481		}
482	}
483
484	*new_blk = -1;
485
486out:
487	xlog_put_bp(bp);
488	return error;
489}
490
491/*
492 * Potentially backup over partial log record write.
493 *
494 * In the typical case, last_blk is the number of the block directly after
495 * a good log record.  Therefore, we subtract one to get the block number
496 * of the last block in the given buffer.  extra_bblks contains the number
497 * of blocks we would have read on a previous read.  This happens when the
498 * last log record is split over the end of the physical log.
499 *
500 * extra_bblks is the number of blocks potentially verified on a previous
501 * call to this routine.
502 */
503STATIC int
504xlog_find_verify_log_record(
505	struct xlog		*log,
506	xfs_daddr_t		start_blk,
507	xfs_daddr_t		*last_blk,
508	int			extra_bblks)
509{
510	xfs_daddr_t		i;
511	xfs_buf_t		*bp;
512	xfs_caddr_t		offset = NULL;
513	xlog_rec_header_t	*head = NULL;
514	int			error = 0;
515	int			smallmem = 0;
516	int			num_blks = *last_blk - start_blk;
517	int			xhdrs;
518
519	ASSERT(start_blk != 0 || *last_blk != start_blk);
520
521	if (!(bp = xlog_get_bp(log, num_blks))) {
522		if (!(bp = xlog_get_bp(log, 1)))
523			return -ENOMEM;
524		smallmem = 1;
525	} else {
526		error = xlog_bread(log, start_blk, num_blks, bp, &offset);
527		if (error)
528			goto out;
529		offset += ((num_blks - 1) << BBSHIFT);
530	}
531
532	for (i = (*last_blk) - 1; i >= 0; i--) {
533		if (i < start_blk) {
534			/* valid log record not found */
535			xfs_warn(log->l_mp,
536		"Log inconsistent (didn't find previous header)");
537			ASSERT(0);
538			error = -EIO;
539			goto out;
540		}
541
542		if (smallmem) {
543			error = xlog_bread(log, i, 1, bp, &offset);
544			if (error)
545				goto out;
546		}
547
548		head = (xlog_rec_header_t *)offset;
549
550		if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
551			break;
552
553		if (!smallmem)
554			offset -= BBSIZE;
555	}
556
557	/*
558	 * We hit the beginning of the physical log & still no header.  Return
559	 * to caller.  If caller can handle a return of -1, then this routine
560	 * will be called again for the end of the physical log.
561	 */
562	if (i == -1) {
563		error = 1;
564		goto out;
565	}
566
567	/*
568	 * We have the final block of the good log (the first block
569	 * of the log record _before_ the head. So we check the uuid.
570	 */
571	if ((error = xlog_header_check_mount(log->l_mp, head)))
572		goto out;
573
574	/*
575	 * We may have found a log record header before we expected one.
576	 * last_blk will be the 1st block # with a given cycle #.  We may end
577	 * up reading an entire log record.  In this case, we don't want to
578	 * reset last_blk.  Only when last_blk points in the middle of a log
579	 * record do we update last_blk.
580	 */
581	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
582		uint	h_size = be32_to_cpu(head->h_size);
583
584		xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
585		if (h_size % XLOG_HEADER_CYCLE_SIZE)
586			xhdrs++;
587	} else {
588		xhdrs = 1;
589	}
590
591	if (*last_blk - i + extra_bblks !=
592	    BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
593		*last_blk = i;
594
595out:
596	xlog_put_bp(bp);
597	return error;
598}
599
600/*
601 * Head is defined to be the point of the log where the next log write
602 * could go.  This means that incomplete LR writes at the end are
603 * eliminated when calculating the head.  We aren't guaranteed that previous
604 * LR have complete transactions.  We only know that a cycle number of
605 * current cycle number -1 won't be present in the log if we start writing
606 * from our current block number.
607 *
608 * last_blk contains the block number of the first block with a given
609 * cycle number.
610 *
611 * Return: zero if normal, non-zero if error.
612 */
613STATIC int
614xlog_find_head(
615	struct xlog	*log,
616	xfs_daddr_t	*return_head_blk)
617{
618	xfs_buf_t	*bp;
619	xfs_caddr_t	offset;
620	xfs_daddr_t	new_blk, first_blk, start_blk, last_blk, head_blk;
621	int		num_scan_bblks;
622	uint		first_half_cycle, last_half_cycle;
623	uint		stop_on_cycle;
624	int		error, log_bbnum = log->l_logBBsize;
625
626	/* Is the end of the log device zeroed? */
627	error = xlog_find_zeroed(log, &first_blk);
628	if (error < 0) {
629		xfs_warn(log->l_mp, "empty log check failed");
630		return error;
631	}
632	if (error == 1) {
633		*return_head_blk = first_blk;
634
635		/* Is the whole lot zeroed? */
636		if (!first_blk) {
637			/* Linux XFS shouldn't generate totally zeroed logs -
638			 * mkfs etc write a dummy unmount record to a fresh
639			 * log so we can store the uuid in there
640			 */
641			xfs_warn(log->l_mp, "totally zeroed log");
642		}
643
644		return 0;
645	}
646
647	first_blk = 0;			/* get cycle # of 1st block */
648	bp = xlog_get_bp(log, 1);
649	if (!bp)
650		return -ENOMEM;
651
652	error = xlog_bread(log, 0, 1, bp, &offset);
653	if (error)
654		goto bp_err;
655
656	first_half_cycle = xlog_get_cycle(offset);
657
658	last_blk = head_blk = log_bbnum - 1;	/* get cycle # of last block */
659	error = xlog_bread(log, last_blk, 1, bp, &offset);
660	if (error)
661		goto bp_err;
662
663	last_half_cycle = xlog_get_cycle(offset);
664	ASSERT(last_half_cycle != 0);
665
666	/*
667	 * If the 1st half cycle number is equal to the last half cycle number,
668	 * then the entire log is stamped with the same cycle number.  In this
669	 * case, head_blk can't be set to zero (which makes sense).  The below
670	 * math doesn't work out properly with head_blk equal to zero.  Instead,
671	 * we set it to log_bbnum which is an invalid block number, but this
672	 * value makes the math correct.  If head_blk doesn't changed through
673	 * all the tests below, *head_blk is set to zero at the very end rather
674	 * than log_bbnum.  In a sense, log_bbnum and zero are the same block
675	 * in a circular file.
676	 */
677	if (first_half_cycle == last_half_cycle) {
678		/*
679		 * In this case we believe that the entire log should have
680		 * cycle number last_half_cycle.  We need to scan backwards
681		 * from the end verifying that there are no holes still
682		 * containing last_half_cycle - 1.  If we find such a hole,
683		 * then the start of that hole will be the new head.  The
684		 * simple case looks like
685		 *        x | x ... | x - 1 | x
686		 * Another case that fits this picture would be
687		 *        x | x + 1 | x ... | x
688		 * In this case the head really is somewhere at the end of the
689		 * log, as one of the latest writes at the beginning was
690		 * incomplete.
691		 * One more case is
692		 *        x | x + 1 | x ... | x - 1 | x
693		 * This is really the combination of the above two cases, and
694		 * the head has to end up at the start of the x-1 hole at the
695		 * end of the log.
696		 *
697		 * In the 256k log case, we will read from the beginning to the
698		 * end of the log and search for cycle numbers equal to x-1.
699		 * We don't worry about the x+1 blocks that we encounter,
700		 * because we know that they cannot be the head since the log
701		 * started with x.
702		 */
703		head_blk = log_bbnum;
704		stop_on_cycle = last_half_cycle - 1;
705	} else {
706		/*
707		 * In this case we want to find the first block with cycle
708		 * number matching last_half_cycle.  We expect the log to be
709		 * some variation on
710		 *        x + 1 ... | x ... | x
711		 * The first block with cycle number x (last_half_cycle) will
712		 * be where the new head belongs.  First we do a binary search
713		 * for the first occurrence of last_half_cycle.  The binary
714		 * search may not be totally accurate, so then we scan back
715		 * from there looking for occurrences of last_half_cycle before
716		 * us.  If that backwards scan wraps around the beginning of
717		 * the log, then we look for occurrences of last_half_cycle - 1
718		 * at the end of the log.  The cases we're looking for look
719		 * like
720		 *                               v binary search stopped here
721		 *        x + 1 ... | x | x + 1 | x ... | x
722		 *                   ^ but we want to locate this spot
723		 * or
724		 *        <---------> less than scan distance
725		 *        x + 1 ... | x ... | x - 1 | x
726		 *                           ^ we want to locate this spot
727		 */
728		stop_on_cycle = last_half_cycle;
729		if ((error = xlog_find_cycle_start(log, bp, first_blk,
730						&head_blk, last_half_cycle)))
731			goto bp_err;
732	}
733
734	/*
735	 * Now validate the answer.  Scan back some number of maximum possible
736	 * blocks and make sure each one has the expected cycle number.  The
737	 * maximum is determined by the total possible amount of buffering
738	 * in the in-core log.  The following number can be made tighter if
739	 * we actually look at the block size of the filesystem.
740	 */
741	num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
742	if (head_blk >= num_scan_bblks) {
743		/*
744		 * We are guaranteed that the entire check can be performed
745		 * in one buffer.
746		 */
747		start_blk = head_blk - num_scan_bblks;
748		if ((error = xlog_find_verify_cycle(log,
749						start_blk, num_scan_bblks,
750						stop_on_cycle, &new_blk)))
751			goto bp_err;
752		if (new_blk != -1)
753			head_blk = new_blk;
754	} else {		/* need to read 2 parts of log */
755		/*
756		 * We are going to scan backwards in the log in two parts.
757		 * First we scan the physical end of the log.  In this part
758		 * of the log, we are looking for blocks with cycle number
759		 * last_half_cycle - 1.
760		 * If we find one, then we know that the log starts there, as
761		 * we've found a hole that didn't get written in going around
762		 * the end of the physical log.  The simple case for this is
763		 *        x + 1 ... | x ... | x - 1 | x
764		 *        <---------> less than scan distance
765		 * If all of the blocks at the end of the log have cycle number
766		 * last_half_cycle, then we check the blocks at the start of
767		 * the log looking for occurrences of last_half_cycle.  If we
768		 * find one, then our current estimate for the location of the
769		 * first occurrence of last_half_cycle is wrong and we move
770		 * back to the hole we've found.  This case looks like
771		 *        x + 1 ... | x | x + 1 | x ...
772		 *                               ^ binary search stopped here
773		 * Another case we need to handle that only occurs in 256k
774		 * logs is
775		 *        x + 1 ... | x ... | x+1 | x ...
776		 *                   ^ binary search stops here
777		 * In a 256k log, the scan at the end of the log will see the
778		 * x + 1 blocks.  We need to skip past those since that is
779		 * certainly not the head of the log.  By searching for
780		 * last_half_cycle-1 we accomplish that.
781		 */
782		ASSERT(head_blk <= INT_MAX &&
783			(xfs_daddr_t) num_scan_bblks >= head_blk);
784		start_blk = log_bbnum - (num_scan_bblks - head_blk);
785		if ((error = xlog_find_verify_cycle(log, start_blk,
786					num_scan_bblks - (int)head_blk,
787					(stop_on_cycle - 1), &new_blk)))
788			goto bp_err;
789		if (new_blk != -1) {
790			head_blk = new_blk;
791			goto validate_head;
792		}
793
794		/*
795		 * Scan beginning of log now.  The last part of the physical
796		 * log is good.  This scan needs to verify that it doesn't find
797		 * the last_half_cycle.
798		 */
799		start_blk = 0;
800		ASSERT(head_blk <= INT_MAX);
801		if ((error = xlog_find_verify_cycle(log,
802					start_blk, (int)head_blk,
803					stop_on_cycle, &new_blk)))
804			goto bp_err;
805		if (new_blk != -1)
806			head_blk = new_blk;
807	}
808
809validate_head:
810	/*
811	 * Now we need to make sure head_blk is not pointing to a block in
812	 * the middle of a log record.
813	 */
814	num_scan_bblks = XLOG_REC_SHIFT(log);
815	if (head_blk >= num_scan_bblks) {
816		start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
817
818		/* start ptr at last block ptr before head_blk */
819		error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
820		if (error == 1)
821			error = -EIO;
822		if (error)
823			goto bp_err;
824	} else {
825		start_blk = 0;
826		ASSERT(head_blk <= INT_MAX);
827		error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
828		if (error < 0)
829			goto bp_err;
830		if (error == 1) {
831			/* We hit the beginning of the log during our search */
832			start_blk = log_bbnum - (num_scan_bblks - head_blk);
833			new_blk = log_bbnum;
834			ASSERT(start_blk <= INT_MAX &&
835				(xfs_daddr_t) log_bbnum-start_blk >= 0);
836			ASSERT(head_blk <= INT_MAX);
837			error = xlog_find_verify_log_record(log, start_blk,
838							&new_blk, (int)head_blk);
839			if (error == 1)
840				error = -EIO;
841			if (error)
842				goto bp_err;
843			if (new_blk != log_bbnum)
844				head_blk = new_blk;
845		} else if (error)
846			goto bp_err;
847	}
848
849	xlog_put_bp(bp);
850	if (head_blk == log_bbnum)
851		*return_head_blk = 0;
852	else
853		*return_head_blk = head_blk;
854	/*
855	 * When returning here, we have a good block number.  Bad block
856	 * means that during a previous crash, we didn't have a clean break
857	 * from cycle number N to cycle number N-1.  In this case, we need
858	 * to find the first block with cycle number N-1.
859	 */
860	return 0;
861
862 bp_err:
863	xlog_put_bp(bp);
864
865	if (error)
866		xfs_warn(log->l_mp, "failed to find log head");
867	return error;
868}
869
870/*
871 * Find the sync block number or the tail of the log.
872 *
873 * This will be the block number of the last record to have its
874 * associated buffers synced to disk.  Every log record header has
875 * a sync lsn embedded in it.  LSNs hold block numbers, so it is easy
876 * to get a sync block number.  The only concern is to figure out which
877 * log record header to believe.
878 *
879 * The following algorithm uses the log record header with the largest
880 * lsn.  The entire log record does not need to be valid.  We only care
881 * that the header is valid.
882 *
883 * We could speed up search by using current head_blk buffer, but it is not
884 * available.
885 */
886STATIC int
887xlog_find_tail(
888	struct xlog		*log,
889	xfs_daddr_t		*head_blk,
890	xfs_daddr_t		*tail_blk)
891{
892	xlog_rec_header_t	*rhead;
893	xlog_op_header_t	*op_head;
894	xfs_caddr_t		offset = NULL;
895	xfs_buf_t		*bp;
896	int			error, i, found;
897	xfs_daddr_t		umount_data_blk;
898	xfs_daddr_t		after_umount_blk;
899	xfs_lsn_t		tail_lsn;
900	int			hblks;
901
902	found = 0;
903
904	/*
905	 * Find previous log record
906	 */
907	if ((error = xlog_find_head(log, head_blk)))
908		return error;
909
910	bp = xlog_get_bp(log, 1);
911	if (!bp)
912		return -ENOMEM;
913	if (*head_blk == 0) {				/* special case */
914		error = xlog_bread(log, 0, 1, bp, &offset);
915		if (error)
916			goto done;
917
918		if (xlog_get_cycle(offset) == 0) {
919			*tail_blk = 0;
920			/* leave all other log inited values alone */
921			goto done;
922		}
923	}
924
925	/*
926	 * Search backwards looking for log record header block
927	 */
928	ASSERT(*head_blk < INT_MAX);
929	for (i = (int)(*head_blk) - 1; i >= 0; i--) {
930		error = xlog_bread(log, i, 1, bp, &offset);
931		if (error)
932			goto done;
933
934		if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
935			found = 1;
936			break;
937		}
938	}
939	/*
940	 * If we haven't found the log record header block, start looking
941	 * again from the end of the physical log.  XXXmiken: There should be
942	 * a check here to make sure we didn't search more than N blocks in
943	 * the previous code.
944	 */
945	if (!found) {
946		for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
947			error = xlog_bread(log, i, 1, bp, &offset);
948			if (error)
949				goto done;
950
951			if (*(__be32 *)offset ==
952			    cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
953				found = 2;
954				break;
955			}
956		}
957	}
958	if (!found) {
959		xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
960		xlog_put_bp(bp);
961		ASSERT(0);
962		return -EIO;
963	}
964
965	/* find blk_no of tail of log */
966	rhead = (xlog_rec_header_t *)offset;
967	*tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
968
969	/*
970	 * Reset log values according to the state of the log when we
971	 * crashed.  In the case where head_blk == 0, we bump curr_cycle
972	 * one because the next write starts a new cycle rather than
973	 * continuing the cycle of the last good log record.  At this
974	 * point we have guaranteed that all partial log records have been
975	 * accounted for.  Therefore, we know that the last good log record
976	 * written was complete and ended exactly on the end boundary
977	 * of the physical log.
978	 */
979	log->l_prev_block = i;
980	log->l_curr_block = (int)*head_blk;
981	log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
982	if (found == 2)
983		log->l_curr_cycle++;
984	atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
985	atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
986	xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
987					BBTOB(log->l_curr_block));
988	xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
989					BBTOB(log->l_curr_block));
990
991	/*
992	 * Look for unmount record.  If we find it, then we know there
993	 * was a clean unmount.  Since 'i' could be the last block in
994	 * the physical log, we convert to a log block before comparing
995	 * to the head_blk.
996	 *
997	 * Save the current tail lsn to use to pass to
998	 * xlog_clear_stale_blocks() below.  We won't want to clear the
999	 * unmount record if there is one, so we pass the lsn of the
1000	 * unmount record rather than the block after it.
1001	 */
1002	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1003		int	h_size = be32_to_cpu(rhead->h_size);
1004		int	h_version = be32_to_cpu(rhead->h_version);
1005
1006		if ((h_version & XLOG_VERSION_2) &&
1007		    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
1008			hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
1009			if (h_size % XLOG_HEADER_CYCLE_SIZE)
1010				hblks++;
1011		} else {
1012			hblks = 1;
1013		}
1014	} else {
1015		hblks = 1;
1016	}
1017	after_umount_blk = (i + hblks + (int)
1018		BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
1019	tail_lsn = atomic64_read(&log->l_tail_lsn);
1020	if (*head_blk == after_umount_blk &&
1021	    be32_to_cpu(rhead->h_num_logops) == 1) {
1022		umount_data_blk = (i + hblks) % log->l_logBBsize;
1023		error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
1024		if (error)
1025			goto done;
1026
1027		op_head = (xlog_op_header_t *)offset;
1028		if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
1029			/*
1030			 * Set tail and last sync so that newly written
1031			 * log records will point recovery to after the
1032			 * current unmount record.
1033			 */
1034			xlog_assign_atomic_lsn(&log->l_tail_lsn,
1035					log->l_curr_cycle, after_umount_blk);
1036			xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
1037					log->l_curr_cycle, after_umount_blk);
1038			*tail_blk = after_umount_blk;
1039
1040			/*
1041			 * Note that the unmount was clean. If the unmount
1042			 * was not clean, we need to know this to rebuild the
1043			 * superblock counters from the perag headers if we
1044			 * have a filesystem using non-persistent counters.
1045			 */
1046			log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
1047		}
1048	}
1049
1050	/*
1051	 * Make sure that there are no blocks in front of the head
1052	 * with the same cycle number as the head.  This can happen
1053	 * because we allow multiple outstanding log writes concurrently,
1054	 * and the later writes might make it out before earlier ones.
1055	 *
1056	 * We use the lsn from before modifying it so that we'll never
1057	 * overwrite the unmount record after a clean unmount.
1058	 *
1059	 * Do this only if we are going to recover the filesystem
1060	 *
1061	 * NOTE: This used to say "if (!readonly)"
1062	 * However on Linux, we can & do recover a read-only filesystem.
1063	 * We only skip recovery if NORECOVERY is specified on mount,
1064	 * in which case we would not be here.
1065	 *
1066	 * But... if the -device- itself is readonly, just skip this.
1067	 * We can't recover this device anyway, so it won't matter.
1068	 */
1069	if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
1070		error = xlog_clear_stale_blocks(log, tail_lsn);
1071
1072done:
1073	xlog_put_bp(bp);
1074
1075	if (error)
1076		xfs_warn(log->l_mp, "failed to locate log tail");
1077	return error;
1078}
1079
1080/*
1081 * Is the log zeroed at all?
1082 *
1083 * The last binary search should be changed to perform an X block read
1084 * once X becomes small enough.  You can then search linearly through
1085 * the X blocks.  This will cut down on the number of reads we need to do.
1086 *
1087 * If the log is partially zeroed, this routine will pass back the blkno
1088 * of the first block with cycle number 0.  It won't have a complete LR
1089 * preceding it.
1090 *
1091 * Return:
1092 *	0  => the log is completely written to
1093 *	1 => use *blk_no as the first block of the log
1094 *	<0 => error has occurred
1095 */
1096STATIC int
1097xlog_find_zeroed(
1098	struct xlog	*log,
1099	xfs_daddr_t	*blk_no)
1100{
1101	xfs_buf_t	*bp;
1102	xfs_caddr_t	offset;
1103	uint	        first_cycle, last_cycle;
1104	xfs_daddr_t	new_blk, last_blk, start_blk;
1105	xfs_daddr_t     num_scan_bblks;
1106	int	        error, log_bbnum = log->l_logBBsize;
1107
1108	*blk_no = 0;
1109
1110	/* check totally zeroed log */
1111	bp = xlog_get_bp(log, 1);
1112	if (!bp)
1113		return -ENOMEM;
1114	error = xlog_bread(log, 0, 1, bp, &offset);
1115	if (error)
1116		goto bp_err;
1117
1118	first_cycle = xlog_get_cycle(offset);
1119	if (first_cycle == 0) {		/* completely zeroed log */
1120		*blk_no = 0;
1121		xlog_put_bp(bp);
1122		return 1;
1123	}
1124
1125	/* check partially zeroed log */
1126	error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
1127	if (error)
1128		goto bp_err;
1129
1130	last_cycle = xlog_get_cycle(offset);
1131	if (last_cycle != 0) {		/* log completely written to */
1132		xlog_put_bp(bp);
1133		return 0;
1134	} else if (first_cycle != 1) {
1135		/*
1136		 * If the cycle of the last block is zero, the cycle of
1137		 * the first block must be 1. If it's not, maybe we're
1138		 * not looking at a log... Bail out.
1139		 */
1140		xfs_warn(log->l_mp,
1141			"Log inconsistent or not a log (last==0, first!=1)");
1142		error = -EINVAL;
1143		goto bp_err;
1144	}
1145
1146	/* we have a partially zeroed log */
1147	last_blk = log_bbnum-1;
1148	if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
1149		goto bp_err;
1150
1151	/*
1152	 * Validate the answer.  Because there is no way to guarantee that
1153	 * the entire log is made up of log records which are the same size,
1154	 * we scan over the defined maximum blocks.  At this point, the maximum
1155	 * is not chosen to mean anything special.   XXXmiken
1156	 */
1157	num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1158	ASSERT(num_scan_bblks <= INT_MAX);
1159
1160	if (last_blk < num_scan_bblks)
1161		num_scan_bblks = last_blk;
1162	start_blk = last_blk - num_scan_bblks;
1163
1164	/*
1165	 * We search for any instances of cycle number 0 that occur before
1166	 * our current estimate of the head.  What we're trying to detect is
1167	 *        1 ... | 0 | 1 | 0...
1168	 *                       ^ binary search ends here
1169	 */
1170	if ((error = xlog_find_verify_cycle(log, start_blk,
1171					 (int)num_scan_bblks, 0, &new_blk)))
1172		goto bp_err;
1173	if (new_blk != -1)
1174		last_blk = new_blk;
1175
1176	/*
1177	 * Potentially backup over partial log record write.  We don't need
1178	 * to search the end of the log because we know it is zero.
1179	 */
1180	error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0);
1181	if (error == 1)
1182		error = -EIO;
1183	if (error)
1184		goto bp_err;
1185
1186	*blk_no = last_blk;
1187bp_err:
1188	xlog_put_bp(bp);
1189	if (error)
1190		return error;
1191	return 1;
1192}
1193
1194/*
1195 * These are simple subroutines used by xlog_clear_stale_blocks() below
1196 * to initialize a buffer full of empty log record headers and write
1197 * them into the log.
1198 */
1199STATIC void
1200xlog_add_record(
1201	struct xlog		*log,
1202	xfs_caddr_t		buf,
1203	int			cycle,
1204	int			block,
1205	int			tail_cycle,
1206	int			tail_block)
1207{
1208	xlog_rec_header_t	*recp = (xlog_rec_header_t *)buf;
1209
1210	memset(buf, 0, BBSIZE);
1211	recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1212	recp->h_cycle = cpu_to_be32(cycle);
1213	recp->h_version = cpu_to_be32(
1214			xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
1215	recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
1216	recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
1217	recp->h_fmt = cpu_to_be32(XLOG_FMT);
1218	memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1219}
1220
1221STATIC int
1222xlog_write_log_records(
1223	struct xlog	*log,
1224	int		cycle,
1225	int		start_block,
1226	int		blocks,
1227	int		tail_cycle,
1228	int		tail_block)
1229{
1230	xfs_caddr_t	offset;
1231	xfs_buf_t	*bp;
1232	int		balign, ealign;
1233	int		sectbb = log->l_sectBBsize;
1234	int		end_block = start_block + blocks;
1235	int		bufblks;
1236	int		error = 0;
1237	int		i, j = 0;
1238
1239	/*
1240	 * Greedily allocate a buffer big enough to handle the full
1241	 * range of basic blocks to be written.  If that fails, try
1242	 * a smaller size.  We need to be able to write at least a
1243	 * log sector, or we're out of luck.
1244	 */
1245	bufblks = 1 << ffs(blocks);
1246	while (bufblks > log->l_logBBsize)
1247		bufblks >>= 1;
1248	while (!(bp = xlog_get_bp(log, bufblks))) {
1249		bufblks >>= 1;
1250		if (bufblks < sectbb)
1251			return -ENOMEM;
1252	}
1253
1254	/* We may need to do a read at the start to fill in part of
1255	 * the buffer in the starting sector not covered by the first
1256	 * write below.
1257	 */
1258	balign = round_down(start_block, sectbb);
1259	if (balign != start_block) {
1260		error = xlog_bread_noalign(log, start_block, 1, bp);
1261		if (error)
1262			goto out_put_bp;
1263
1264		j = start_block - balign;
1265	}
1266
1267	for (i = start_block; i < end_block; i += bufblks) {
1268		int		bcount, endcount;
1269
1270		bcount = min(bufblks, end_block - start_block);
1271		endcount = bcount - j;
1272
1273		/* We may need to do a read at the end to fill in part of
1274		 * the buffer in the final sector not covered by the write.
1275		 * If this is the same sector as the above read, skip it.
1276		 */
1277		ealign = round_down(end_block, sectbb);
1278		if (j == 0 && (start_block + endcount > ealign)) {
1279			offset = bp->b_addr + BBTOB(ealign - start_block);
1280			error = xlog_bread_offset(log, ealign, sectbb,
1281							bp, offset);
1282			if (error)
1283				break;
1284
1285		}
1286
1287		offset = xlog_align(log, start_block, endcount, bp);
1288		for (; j < endcount; j++) {
1289			xlog_add_record(log, offset, cycle, i+j,
1290					tail_cycle, tail_block);
1291			offset += BBSIZE;
1292		}
1293		error = xlog_bwrite(log, start_block, endcount, bp);
1294		if (error)
1295			break;
1296		start_block += endcount;
1297		j = 0;
1298	}
1299
1300 out_put_bp:
1301	xlog_put_bp(bp);
1302	return error;
1303}
1304
1305/*
1306 * This routine is called to blow away any incomplete log writes out
1307 * in front of the log head.  We do this so that we won't become confused
1308 * if we come up, write only a little bit more, and then crash again.
1309 * If we leave the partial log records out there, this situation could
1310 * cause us to think those partial writes are valid blocks since they
1311 * have the current cycle number.  We get rid of them by overwriting them
1312 * with empty log records with the old cycle number rather than the
1313 * current one.
1314 *
1315 * The tail lsn is passed in rather than taken from
1316 * the log so that we will not write over the unmount record after a
1317 * clean unmount in a 512 block log.  Doing so would leave the log without
1318 * any valid log records in it until a new one was written.  If we crashed
1319 * during that time we would not be able to recover.
1320 */
1321STATIC int
1322xlog_clear_stale_blocks(
1323	struct xlog	*log,
1324	xfs_lsn_t	tail_lsn)
1325{
1326	int		tail_cycle, head_cycle;
1327	int		tail_block, head_block;
1328	int		tail_distance, max_distance;
1329	int		distance;
1330	int		error;
1331
1332	tail_cycle = CYCLE_LSN(tail_lsn);
1333	tail_block = BLOCK_LSN(tail_lsn);
1334	head_cycle = log->l_curr_cycle;
1335	head_block = log->l_curr_block;
1336
1337	/*
1338	 * Figure out the distance between the new head of the log
1339	 * and the tail.  We want to write over any blocks beyond the
1340	 * head that we may have written just before the crash, but
1341	 * we don't want to overwrite the tail of the log.
1342	 */
1343	if (head_cycle == tail_cycle) {
1344		/*
1345		 * The tail is behind the head in the physical log,
1346		 * so the distance from the head to the tail is the
1347		 * distance from the head to the end of the log plus
1348		 * the distance from the beginning of the log to the
1349		 * tail.
1350		 */
1351		if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
1352			XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
1353					 XFS_ERRLEVEL_LOW, log->l_mp);
1354			return -EFSCORRUPTED;
1355		}
1356		tail_distance = tail_block + (log->l_logBBsize - head_block);
1357	} else {
1358		/*
1359		 * The head is behind the tail in the physical log,
1360		 * so the distance from the head to the tail is just
1361		 * the tail block minus the head block.
1362		 */
1363		if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
1364			XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
1365					 XFS_ERRLEVEL_LOW, log->l_mp);
1366			return -EFSCORRUPTED;
1367		}
1368		tail_distance = tail_block - head_block;
1369	}
1370
1371	/*
1372	 * If the head is right up against the tail, we can't clear
1373	 * anything.
1374	 */
1375	if (tail_distance <= 0) {
1376		ASSERT(tail_distance == 0);
1377		return 0;
1378	}
1379
1380	max_distance = XLOG_TOTAL_REC_SHIFT(log);
1381	/*
1382	 * Take the smaller of the maximum amount of outstanding I/O
1383	 * we could have and the distance to the tail to clear out.
1384	 * We take the smaller so that we don't overwrite the tail and
1385	 * we don't waste all day writing from the head to the tail
1386	 * for no reason.
1387	 */
1388	max_distance = MIN(max_distance, tail_distance);
1389
1390	if ((head_block + max_distance) <= log->l_logBBsize) {
1391		/*
1392		 * We can stomp all the blocks we need to without
1393		 * wrapping around the end of the log.  Just do it
1394		 * in a single write.  Use the cycle number of the
1395		 * current cycle minus one so that the log will look like:
1396		 *     n ... | n - 1 ...
1397		 */
1398		error = xlog_write_log_records(log, (head_cycle - 1),
1399				head_block, max_distance, tail_cycle,
1400				tail_block);
1401		if (error)
1402			return error;
1403	} else {
1404		/*
1405		 * We need to wrap around the end of the physical log in
1406		 * order to clear all the blocks.  Do it in two separate
1407		 * I/Os.  The first write should be from the head to the
1408		 * end of the physical log, and it should use the current
1409		 * cycle number minus one just like above.
1410		 */
1411		distance = log->l_logBBsize - head_block;
1412		error = xlog_write_log_records(log, (head_cycle - 1),
1413				head_block, distance, tail_cycle,
1414				tail_block);
1415
1416		if (error)
1417			return error;
1418
1419		/*
1420		 * Now write the blocks at the start of the physical log.
1421		 * This writes the remainder of the blocks we want to clear.
1422		 * It uses the current cycle number since we're now on the
1423		 * same cycle as the head so that we get:
1424		 *    n ... n ... | n - 1 ...
1425		 *    ^^^^^ blocks we're writing
1426		 */
1427		distance = max_distance - (log->l_logBBsize - head_block);
1428		error = xlog_write_log_records(log, head_cycle, 0, distance,
1429				tail_cycle, tail_block);
1430		if (error)
1431			return error;
1432	}
1433
1434	return 0;
1435}
1436
1437/******************************************************************************
1438 *
1439 *		Log recover routines
1440 *
1441 ******************************************************************************
1442 */
1443
1444/*
1445 * Sort the log items in the transaction.
1446 *
1447 * The ordering constraints are defined by the inode allocation and unlink
1448 * behaviour. The rules are:
1449 *
1450 *	1. Every item is only logged once in a given transaction. Hence it
1451 *	   represents the last logged state of the item. Hence ordering is
1452 *	   dependent on the order in which operations need to be performed so
1453 *	   required initial conditions are always met.
1454 *
1455 *	2. Cancelled buffers are recorded in pass 1 in a separate table and
1456 *	   there's nothing to replay from them so we can simply cull them
1457 *	   from the transaction. However, we can't do that until after we've
1458 *	   replayed all the other items because they may be dependent on the
1459 *	   cancelled buffer and replaying the cancelled buffer can remove it
1460 *	   form the cancelled buffer table. Hence they have tobe done last.
1461 *
1462 *	3. Inode allocation buffers must be replayed before inode items that
1463 *	   read the buffer and replay changes into it. For filesystems using the
1464 *	   ICREATE transactions, this means XFS_LI_ICREATE objects need to get
1465 *	   treated the same as inode allocation buffers as they create and
1466 *	   initialise the buffers directly.
1467 *
1468 *	4. Inode unlink buffers must be replayed after inode items are replayed.
1469 *	   This ensures that inodes are completely flushed to the inode buffer
1470 *	   in a "free" state before we remove the unlinked inode list pointer.
1471 *
1472 * Hence the ordering needs to be inode allocation buffers first, inode items
1473 * second, inode unlink buffers third and cancelled buffers last.
1474 *
1475 * But there's a problem with that - we can't tell an inode allocation buffer
1476 * apart from a regular buffer, so we can't separate them. We can, however,
1477 * tell an inode unlink buffer from the others, and so we can separate them out
1478 * from all the other buffers and move them to last.
1479 *
1480 * Hence, 4 lists, in order from head to tail:
1481 *	- buffer_list for all buffers except cancelled/inode unlink buffers
1482 *	- item_list for all non-buffer items
1483 *	- inode_buffer_list for inode unlink buffers
1484 *	- cancel_list for the cancelled buffers
1485 *
1486 * Note that we add objects to the tail of the lists so that first-to-last
1487 * ordering is preserved within the lists. Adding objects to the head of the
1488 * list means when we traverse from the head we walk them in last-to-first
1489 * order. For cancelled buffers and inode unlink buffers this doesn't matter,
1490 * but for all other items there may be specific ordering that we need to
1491 * preserve.
1492 */
1493STATIC int
1494xlog_recover_reorder_trans(
1495	struct xlog		*log,
1496	struct xlog_recover	*trans,
1497	int			pass)
1498{
1499	xlog_recover_item_t	*item, *n;
1500	int			error = 0;
1501	LIST_HEAD(sort_list);
1502	LIST_HEAD(cancel_list);
1503	LIST_HEAD(buffer_list);
1504	LIST_HEAD(inode_buffer_list);
1505	LIST_HEAD(inode_list);
1506
1507	list_splice_init(&trans->r_itemq, &sort_list);
1508	list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1509		xfs_buf_log_format_t	*buf_f = item->ri_buf[0].i_addr;
1510
1511		switch (ITEM_TYPE(item)) {
1512		case XFS_LI_ICREATE:
1513			list_move_tail(&item->ri_list, &buffer_list);
1514			break;
1515		case XFS_LI_BUF:
1516			if (buf_f->blf_flags & XFS_BLF_CANCEL) {
1517				trace_xfs_log_recover_item_reorder_head(log,
1518							trans, item, pass);
1519				list_move(&item->ri_list, &cancel_list);
1520				break;
1521			}
1522			if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
1523				list_move(&item->ri_list, &inode_buffer_list);
1524				break;
1525			}
1526			list_move_tail(&item->ri_list, &buffer_list);
1527			break;
1528		case XFS_LI_INODE:
1529		case XFS_LI_DQUOT:
1530		case XFS_LI_QUOTAOFF:
1531		case XFS_LI_EFD:
1532		case XFS_LI_EFI:
1533			trace_xfs_log_recover_item_reorder_tail(log,
1534							trans, item, pass);
1535			list_move_tail(&item->ri_list, &inode_list);
1536			break;
1537		default:
1538			xfs_warn(log->l_mp,
1539				"%s: unrecognized type of log operation",
1540				__func__);
1541			ASSERT(0);
1542			/*
1543			 * return the remaining items back to the transaction
1544			 * item list so they can be freed in caller.
1545			 */
1546			if (!list_empty(&sort_list))
1547				list_splice_init(&sort_list, &trans->r_itemq);
1548			error = -EIO;
1549			goto out;
1550		}
1551	}
1552out:
1553	ASSERT(list_empty(&sort_list));
1554	if (!list_empty(&buffer_list))
1555		list_splice(&buffer_list, &trans->r_itemq);
1556	if (!list_empty(&inode_list))
1557		list_splice_tail(&inode_list, &trans->r_itemq);
1558	if (!list_empty(&inode_buffer_list))
1559		list_splice_tail(&inode_buffer_list, &trans->r_itemq);
1560	if (!list_empty(&cancel_list))
1561		list_splice_tail(&cancel_list, &trans->r_itemq);
1562	return error;
1563}
1564
1565/*
1566 * Build up the table of buf cancel records so that we don't replay
1567 * cancelled data in the second pass.  For buffer records that are
1568 * not cancel records, there is nothing to do here so we just return.
1569 *
1570 * If we get a cancel record which is already in the table, this indicates
1571 * that the buffer was cancelled multiple times.  In order to ensure
1572 * that during pass 2 we keep the record in the table until we reach its
1573 * last occurrence in the log, we keep a reference count in the cancel
1574 * record in the table to tell us how many times we expect to see this
1575 * record during the second pass.
1576 */
1577STATIC int
1578xlog_recover_buffer_pass1(
1579	struct xlog			*log,
1580	struct xlog_recover_item	*item)
1581{
1582	xfs_buf_log_format_t	*buf_f = item->ri_buf[0].i_addr;
1583	struct list_head	*bucket;
1584	struct xfs_buf_cancel	*bcp;
1585
1586	/*
1587	 * If this isn't a cancel buffer item, then just return.
1588	 */
1589	if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1590		trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1591		return 0;
1592	}
1593
1594	/*
1595	 * Insert an xfs_buf_cancel record into the hash table of them.
1596	 * If there is already an identical record, bump its reference count.
1597	 */
1598	bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
1599	list_for_each_entry(bcp, bucket, bc_list) {
1600		if (bcp->bc_blkno == buf_f->blf_blkno &&
1601		    bcp->bc_len == buf_f->blf_len) {
1602			bcp->bc_refcount++;
1603			trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1604			return 0;
1605		}
1606	}
1607
1608	bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
1609	bcp->bc_blkno = buf_f->blf_blkno;
1610	bcp->bc_len = buf_f->blf_len;
1611	bcp->bc_refcount = 1;
1612	list_add_tail(&bcp->bc_list, bucket);
1613
1614	trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1615	return 0;
1616}
1617
1618/*
1619 * Check to see whether the buffer being recovered has a corresponding
1620 * entry in the buffer cancel record table. If it is, return the cancel
1621 * buffer structure to the caller.
1622 */
1623STATIC struct xfs_buf_cancel *
1624xlog_peek_buffer_cancelled(
1625	struct xlog		*log,
1626	xfs_daddr_t		blkno,
1627	uint			len,
1628	ushort			flags)
1629{
1630	struct list_head	*bucket;
1631	struct xfs_buf_cancel	*bcp;
1632
1633	if (!log->l_buf_cancel_table) {
1634		/* empty table means no cancelled buffers in the log */
1635		ASSERT(!(flags & XFS_BLF_CANCEL));
1636		return NULL;
1637	}
1638
1639	bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
1640	list_for_each_entry(bcp, bucket, bc_list) {
1641		if (bcp->bc_blkno == blkno && bcp->bc_len == len)
1642			return bcp;
1643	}
1644
1645	/*
1646	 * We didn't find a corresponding entry in the table, so return 0 so
1647	 * that the buffer is NOT cancelled.
1648	 */
1649	ASSERT(!(flags & XFS_BLF_CANCEL));
1650	return NULL;
1651}
1652
1653/*
1654 * If the buffer is being cancelled then return 1 so that it will be cancelled,
1655 * otherwise return 0.  If the buffer is actually a buffer cancel item
1656 * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the
1657 * table and remove it from the table if this is the last reference.
1658 *
1659 * We remove the cancel record from the table when we encounter its last
1660 * occurrence in the log so that if the same buffer is re-used again after its
1661 * last cancellation we actually replay the changes made at that point.
1662 */
1663STATIC int
1664xlog_check_buffer_cancelled(
1665	struct xlog		*log,
1666	xfs_daddr_t		blkno,
1667	uint			len,
1668	ushort			flags)
1669{
1670	struct xfs_buf_cancel	*bcp;
1671
1672	bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags);
1673	if (!bcp)
1674		return 0;
1675
1676	/*
1677	 * We've go a match, so return 1 so that the recovery of this buffer
1678	 * is cancelled.  If this buffer is actually a buffer cancel log
1679	 * item, then decrement the refcount on the one in the table and
1680	 * remove it if this is the last reference.
1681	 */
1682	if (flags & XFS_BLF_CANCEL) {
1683		if (--bcp->bc_refcount == 0) {
1684			list_del(&bcp->bc_list);
1685			kmem_free(bcp);
1686		}
1687	}
1688	return 1;
1689}
1690
1691/*
1692 * Perform recovery for a buffer full of inodes.  In these buffers, the only
1693 * data which should be recovered is that which corresponds to the
1694 * di_next_unlinked pointers in the on disk inode structures.  The rest of the
1695 * data for the inodes is always logged through the inodes themselves rather
1696 * than the inode buffer and is recovered in xlog_recover_inode_pass2().
1697 *
1698 * The only time when buffers full of inodes are fully recovered is when the
1699 * buffer is full of newly allocated inodes.  In this case the buffer will
1700 * not be marked as an inode buffer and so will be sent to
1701 * xlog_recover_do_reg_buffer() below during recovery.
1702 */
1703STATIC int
1704xlog_recover_do_inode_buffer(
1705	struct xfs_mount	*mp,
1706	xlog_recover_item_t	*item,
1707	struct xfs_buf		*bp,
1708	xfs_buf_log_format_t	*buf_f)
1709{
1710	int			i;
1711	int			item_index = 0;
1712	int			bit = 0;
1713	int			nbits = 0;
1714	int			reg_buf_offset = 0;
1715	int			reg_buf_bytes = 0;
1716	int			next_unlinked_offset;
1717	int			inodes_per_buf;
1718	xfs_agino_t		*logged_nextp;
1719	xfs_agino_t		*buffer_nextp;
1720
1721	trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1722
1723	/*
1724	 * Post recovery validation only works properly on CRC enabled
1725	 * filesystems.
1726	 */
1727	if (xfs_sb_version_hascrc(&mp->m_sb))
1728		bp->b_ops = &xfs_inode_buf_ops;
1729
1730	inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
1731	for (i = 0; i < inodes_per_buf; i++) {
1732		next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
1733			offsetof(xfs_dinode_t, di_next_unlinked);
1734
1735		while (next_unlinked_offset >=
1736		       (reg_buf_offset + reg_buf_bytes)) {
1737			/*
1738			 * The next di_next_unlinked field is beyond
1739			 * the current logged region.  Find the next
1740			 * logged region that contains or is beyond
1741			 * the current di_next_unlinked field.
1742			 */
1743			bit += nbits;
1744			bit = xfs_next_bit(buf_f->blf_data_map,
1745					   buf_f->blf_map_size, bit);
1746
1747			/*
1748			 * If there are no more logged regions in the
1749			 * buffer, then we're done.
1750			 */
1751			if (bit == -1)
1752				return 0;
1753
1754			nbits = xfs_contig_bits(buf_f->blf_data_map,
1755						buf_f->blf_map_size, bit);
1756			ASSERT(nbits > 0);
1757			reg_buf_offset = bit << XFS_BLF_SHIFT;
1758			reg_buf_bytes = nbits << XFS_BLF_SHIFT;
1759			item_index++;
1760		}
1761
1762		/*
1763		 * If the current logged region starts after the current
1764		 * di_next_unlinked field, then move on to the next
1765		 * di_next_unlinked field.
1766		 */
1767		if (next_unlinked_offset < reg_buf_offset)
1768			continue;
1769
1770		ASSERT(item->ri_buf[item_index].i_addr != NULL);
1771		ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
1772		ASSERT((reg_buf_offset + reg_buf_bytes) <=
1773							BBTOB(bp->b_io_length));
1774
1775		/*
1776		 * The current logged region contains a copy of the
1777		 * current di_next_unlinked field.  Extract its value
1778		 * and copy it to the buffer copy.
1779		 */
1780		logged_nextp = item->ri_buf[item_index].i_addr +
1781				next_unlinked_offset - reg_buf_offset;
1782		if (unlikely(*logged_nextp == 0)) {
1783			xfs_alert(mp,
1784		"Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). "
1785		"Trying to replay bad (0) inode di_next_unlinked field.",
1786				item, bp);
1787			XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
1788					 XFS_ERRLEVEL_LOW, mp);
1789			return -EFSCORRUPTED;
1790		}
1791
1792		buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
1793					      next_unlinked_offset);
1794		*buffer_nextp = *logged_nextp;
1795
1796		/*
1797		 * If necessary, recalculate the CRC in the on-disk inode. We
1798		 * have to leave the inode in a consistent state for whoever
1799		 * reads it next....
1800		 */
1801		xfs_dinode_calc_crc(mp, (struct xfs_dinode *)
1802				xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
1803
1804	}
1805
1806	return 0;
1807}
1808
1809/*
1810 * V5 filesystems know the age of the buffer on disk being recovered. We can
1811 * have newer objects on disk than we are replaying, and so for these cases we
1812 * don't want to replay the current change as that will make the buffer contents
1813 * temporarily invalid on disk.
1814 *
1815 * The magic number might not match the buffer type we are going to recover
1816 * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags.  Hence
1817 * extract the LSN of the existing object in the buffer based on its current
1818 * magic number.  If we don't recognise the magic number in the buffer, then
1819 * return an LSN of -1 so that the caller knows it was an unrecognised block and
1820 * so can recover the buffer.
1821 *
1822 * Note: we cannot rely solely on magic number matches to determine that the
1823 * buffer has a valid LSN - we also need to verify that it belongs to this
1824 * filesystem, so we need to extract the object's UUID and compare it to that
1825 * which we read from the superblock. If the UUIDs don't match, then we've got a
1826 * stale metadata block from an old filesystem instance that we need to recover
1827 * over the top of.
1828 */
1829static xfs_lsn_t
1830xlog_recover_get_buf_lsn(
1831	struct xfs_mount	*mp,
1832	struct xfs_buf		*bp)
1833{
1834	__uint32_t		magic32;
1835	__uint16_t		magic16;
1836	__uint16_t		magicda;
1837	void			*blk = bp->b_addr;
1838	uuid_t			*uuid;
1839	xfs_lsn_t		lsn = -1;
1840
1841	/* v4 filesystems always recover immediately */
1842	if (!xfs_sb_version_hascrc(&mp->m_sb))
1843		goto recover_immediately;
1844
1845	magic32 = be32_to_cpu(*(__be32 *)blk);
1846	switch (magic32) {
1847	case XFS_ABTB_CRC_MAGIC:
1848	case XFS_ABTC_CRC_MAGIC:
1849	case XFS_ABTB_MAGIC:
1850	case XFS_ABTC_MAGIC:
1851	case XFS_IBT_CRC_MAGIC:
1852	case XFS_IBT_MAGIC: {
1853		struct xfs_btree_block *btb = blk;
1854
1855		lsn = be64_to_cpu(btb->bb_u.s.bb_lsn);
1856		uuid = &btb->bb_u.s.bb_uuid;
1857		break;
1858	}
1859	case XFS_BMAP_CRC_MAGIC:
1860	case XFS_BMAP_MAGIC: {
1861		struct xfs_btree_block *btb = blk;
1862
1863		lsn = be64_to_cpu(btb->bb_u.l.bb_lsn);
1864		uuid = &btb->bb_u.l.bb_uuid;
1865		break;
1866	}
1867	case XFS_AGF_MAGIC:
1868		lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
1869		uuid = &((struct xfs_agf *)blk)->agf_uuid;
1870		break;
1871	case XFS_AGFL_MAGIC:
1872		lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
1873		uuid = &((struct xfs_agfl *)blk)->agfl_uuid;
1874		break;
1875	case XFS_AGI_MAGIC:
1876		lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
1877		uuid = &((struct xfs_agi *)blk)->agi_uuid;
1878		break;
1879	case XFS_SYMLINK_MAGIC:
1880		lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
1881		uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid;
1882		break;
1883	case XFS_DIR3_BLOCK_MAGIC:
1884	case XFS_DIR3_DATA_MAGIC:
1885	case XFS_DIR3_FREE_MAGIC:
1886		lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
1887		uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
1888		break;
1889	case XFS_ATTR3_RMT_MAGIC:
1890		/*
1891		 * Remote attr blocks are written synchronously, rather than
1892		 * being logged. That means they do not contain a valid LSN
1893		 * (i.e. transactionally ordered) in them, and hence any time we
1894		 * see a buffer to replay over the top of a remote attribute
1895		 * block we should simply do so.
1896		 */
1897		goto recover_immediately;
1898	case XFS_SB_MAGIC:
1899		lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
1900		uuid = &((struct xfs_dsb *)blk)->sb_uuid;
1901		break;
1902	default:
1903		break;
1904	}
1905
1906	if (lsn != (xfs_lsn_t)-1) {
1907		if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
1908			goto recover_immediately;
1909		return lsn;
1910	}
1911
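	/* Next, try the 16-bit magic used by da-format (dir/attr) blocks. */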
1912	magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
1913	switch (magicda) {
1914	case XFS_DIR3_LEAF1_MAGIC:
1915	case XFS_DIR3_LEAFN_MAGIC:
1916	case XFS_DA3_NODE_MAGIC:
1917		lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
1918		uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
1919		break;
1920	default:
1921		break;
1922	}
1923
1924	if (lsn != (xfs_lsn_t)-1) {
1925		if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
1926			goto recover_immediately;
1927		return lsn;
1928	}
1929
1930	/*
1931	 * We do individual object checks on dquot and inode buffers as they
1932	 * have their own individual LSN records. Also, we could have a stale
1933	 * buffer here, so we have to at least recognise these buffer types.
1934	 *
1935	 * A noted complexity here is inode unlinked list processing - it logs
1936	 * the inode directly in the buffer, but we don't know which inodes have
1937	 * been modified, and there is no global buffer LSN. Hence we need to
1938	 * recover all inode buffer types immediately. This problem will be
1939	 * fixed by logical logging of the unlinked list modifications.
1940	 */
1941	magic16 = be16_to_cpu(*(__be16 *)blk);
1942	switch (magic16) {
1943	case XFS_DQUOT_MAGIC:
1944	case XFS_DINODE_MAGIC:
1945		goto recover_immediately;
1946	default:
1947		break;
1948	}
1949
1950	/* unknown buffer contents, recover immediately */
1951
1952recover_immediately:
1953	return (xfs_lsn_t)-1;
1954
1955}
1956
1957/*
1958 * Validate the recovered buffer is of the correct type and attach the
1959 * appropriate buffer operations to it for writeback. Magic numbers are in a
1960 * few places:
1961 *	the first 16 bits of the buffer (inode buffer, dquot buffer),
1962 *	the first 32 bits of the buffer (most blocks),
1963 *	inside a struct xfs_da_blkinfo at the start of the buffer.
1964 */
1965static void
1966xlog_recover_validate_buf_type(
1967	struct xfs_mount	*mp,
1968	struct xfs_buf		*bp,
1969	xfs_buf_log_format_t	*buf_f)
1970{
1971	struct xfs_da_blkinfo	*info = bp->b_addr;
1972	__uint32_t		magic32;
1973	__uint16_t		magic16;
1974	__uint16_t		magicda;
1975
1976	/*
1977	 * We can only do post recovery validation on items on CRC enabled
1978	 * filesystems as we need to know when the buffer was written to be able
1979	 * to determine if we should have replayed the item. If we replay old
1980	 * metadata over a newer buffer, then it will enter a temporarily
1981	 * inconsistent state resulting in verification failures. Hence for now
1982	 * just avoid the verification stage for non-crc filesystems.
1983	 */
1984	if (!xfs_sb_version_hascrc(&mp->m_sb))
1985		return;
1986
1987	magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
1988	magic16 = be16_to_cpu(*(__be16 *)bp->b_addr);
1989	magicda = be16_to_cpu(info->magic);
1990	switch (xfs_blft_from_flags(buf_f)) {
1991	case XFS_BLFT_BTREE_BUF:
1992		switch (magic32) {
1993		case XFS_ABTB_CRC_MAGIC:
1994		case XFS_ABTC_CRC_MAGIC:
1995		case XFS_ABTB_MAGIC:
1996		case XFS_ABTC_MAGIC:
1997			bp->b_ops = &xfs_allocbt_buf_ops;
1998			break;
1999		case XFS_IBT_CRC_MAGIC:
2000		case XFS_FIBT_CRC_MAGIC:
2001		case XFS_IBT_MAGIC:
2002		case XFS_FIBT_MAGIC:
2003			bp->b_ops = &xfs_inobt_buf_ops;
2004			break;
2005		case XFS_BMAP_CRC_MAGIC:
2006		case XFS_BMAP_MAGIC:
2007			bp->b_ops = &xfs_bmbt_buf_ops;
2008			break;
2009		default:
2010			xfs_warn(mp, "Bad btree block magic!");
2011			ASSERT(0);
2012			break;
2013		}
2014		break;
2015	case XFS_BLFT_AGF_BUF:
2016		if (magic32 != XFS_AGF_MAGIC) {
2017			xfs_warn(mp, "Bad AGF block magic!");
2018			ASSERT(0);
2019			break;
2020		}
2021		bp->b_ops = &xfs_agf_buf_ops;
2022		break;
2023	case XFS_BLFT_AGFL_BUF:
2024		if (magic32 != XFS_AGFL_MAGIC) {
2025			xfs_warn(mp, "Bad AGFL block magic!");
2026			ASSERT(0);
2027			break;
2028		}
2029		bp->b_ops = &xfs_agfl_buf_ops;
2030		break;
2031	case XFS_BLFT_AGI_BUF:
2032		if (magic32 != XFS_AGI_MAGIC) {
2033			xfs_warn(mp, "Bad AGI block magic!");
2034			ASSERT(0);
2035			break;
2036		}
2037		bp->b_ops = &xfs_agi_buf_ops;
2038		break;
2039	case XFS_BLFT_UDQUOT_BUF:
2040	case XFS_BLFT_PDQUOT_BUF:
2041	case XFS_BLFT_GDQUOT_BUF:
2042#ifdef CONFIG_XFS_QUOTA
2043		if (magic16 != XFS_DQUOT_MAGIC) {
2044			xfs_warn(mp, "Bad DQUOT block magic!");
2045			ASSERT(0);
2046			break;
2047		}
2048		bp->b_ops = &xfs_dquot_buf_ops;
2049#else
2050		xfs_alert(mp,
2051	"Trying to recover dquots without QUOTA support built in!");
2052		ASSERT(0);
2053#endif
2054		break;
2055	case XFS_BLFT_DINO_BUF:
2056		if (magic16 != XFS_DINODE_MAGIC) {
2057			xfs_warn(mp, "Bad INODE block magic!");
2058			ASSERT(0);
2059			break;
2060		}
2061		bp->b_ops = &xfs_inode_buf_ops;
2062		break;
2063	case XFS_BLFT_SYMLINK_BUF:
2064		if (magic32 != XFS_SYMLINK_MAGIC) {
2065			xfs_warn(mp, "Bad symlink block magic!");
2066			ASSERT(0);
2067			break;
2068		}
2069		bp->b_ops = &xfs_symlink_buf_ops;
2070		break;
2071	case XFS_BLFT_DIR_BLOCK_BUF:
2072		if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
2073		    magic32 != XFS_DIR3_BLOCK_MAGIC) {
2074			xfs_warn(mp, "Bad dir block magic!");
2075			ASSERT(0);
2076			break;
2077		}
2078		bp->b_ops = &xfs_dir3_block_buf_ops;
2079		break;
2080	case XFS_BLFT_DIR_DATA_BUF:
2081		if (magic32 != XFS_DIR2_DATA_MAGIC &&
2082		    magic32 != XFS_DIR3_DATA_MAGIC) {
2083			xfs_warn(mp, "Bad dir data magic!");
2084			ASSERT(0);
2085			break;
2086		}
2087		bp->b_ops = &xfs_dir3_data_buf_ops;
2088		break;
2089	case XFS_BLFT_DIR_FREE_BUF:
2090		if (magic32 != XFS_DIR2_FREE_MAGIC &&
2091		    magic32 != XFS_DIR3_FREE_MAGIC) {
2092			xfs_warn(mp, "Bad dir3 free magic!");
2093			ASSERT(0);
2094			break;
2095		}
2096		bp->b_ops = &xfs_dir3_free_buf_ops;
2097		break;
2098	case XFS_BLFT_DIR_LEAF1_BUF:
2099		if (magicda != XFS_DIR2_LEAF1_MAGIC &&
2100		    magicda != XFS_DIR3_LEAF1_MAGIC) {
2101			xfs_warn(mp, "Bad dir leaf1 magic!");
2102			ASSERT(0);
2103			break;
2104		}
2105		bp->b_ops = &xfs_dir3_leaf1_buf_ops;
2106		break;
2107	case XFS_BLFT_DIR_LEAFN_BUF:
2108		if (magicda != XFS_DIR2_LEAFN_MAGIC &&
2109		    magicda != XFS_DIR3_LEAFN_MAGIC) {
2110			xfs_warn(mp, "Bad dir leafn magic!");
2111			ASSERT(0);
2112			break;
2113		}
2114		bp->b_ops = &xfs_dir3_leafn_buf_ops;
2115		break;
2116	case XFS_BLFT_DA_NODE_BUF:
2117		if (magicda != XFS_DA_NODE_MAGIC &&
2118		    magicda != XFS_DA3_NODE_MAGIC) {
2119			xfs_warn(mp, "Bad da node magic!");
2120			ASSERT(0);
2121			break;
2122		}
2123		bp->b_ops = &xfs_da3_node_buf_ops;
2124		break;
2125	case XFS_BLFT_ATTR_LEAF_BUF:
2126		if (magicda != XFS_ATTR_LEAF_MAGIC &&
2127		    magicda != XFS_ATTR3_LEAF_MAGIC) {
2128			xfs_warn(mp, "Bad attr leaf magic!");
2129			ASSERT(0);
2130			break;
2131		}
2132		bp->b_ops = &xfs_attr3_leaf_buf_ops;
2133		break;
2134	case XFS_BLFT_ATTR_RMT_BUF:
2135		if (magic32 != XFS_ATTR3_RMT_MAGIC) {
2136			xfs_warn(mp, "Bad attr remote magic!");
2137			ASSERT(0);
2138			break;
2139		}
2140		bp->b_ops = &xfs_attr3_rmt_buf_ops;
2141		break;
2142	case XFS_BLFT_SB_BUF:
2143		if (magic32 != XFS_SB_MAGIC) {
2144			xfs_warn(mp, "Bad SB block magic!");
2145			ASSERT(0);
2146			break;
2147		}
2148		bp->b_ops = &xfs_sb_buf_ops;
2149		break;
2150	default:
2151		xfs_warn(mp, "Unknown buffer type %d!",
2152			 xfs_blft_from_flags(buf_f));
2153		break;
2154	}
2155}
2156
2157/*
2158 * Perform a 'normal' buffer recovery.  Each logged region of the
2159 * buffer should be copied over the corresponding region in the
2160 * given buffer.  The bitmap in the buf log format structure indicates
2161 * where to place the logged data.
2162 */
2163STATIC void
2164xlog_recover_do_reg_buffer(
2165	struct xfs_mount	*mp,
2166	xlog_recover_item_t	*item,
2167	struct xfs_buf		*bp,
2168	xfs_buf_log_format_t	*buf_f)
2169{
2170	int			i;
2171	int			bit;
2172	int			nbits;
2173	int                     error;
2174
2175	trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
2176
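	/*
	 * Walk each logged region in the data map and copy it over the
	 * matching range of the buffer.
	 */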
2177	bit = 0;
2178	i = 1;  /* 0 is the buf format structure */
2179	while (1) {
2180		bit = xfs_next_bit(buf_f->blf_data_map,
2181				   buf_f->blf_map_size, bit);
2182		if (bit == -1)
2183			break;
2184		nbits = xfs_contig_bits(buf_f->blf_data_map,
2185					buf_f->blf_map_size, bit);
2186		ASSERT(nbits > 0);
2187		ASSERT(item->ri_buf[i].i_addr != NULL);
2188		ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
2189		ASSERT(BBTOB(bp->b_io_length) >=
2190		       ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
2191
2192		/*
2193		 * The dirty regions logged in the buffer, even though
2194		 * contiguous, may span multiple chunks. This is because the
2195		 * dirty region may span a physical page boundary in a buffer
2196		 * and hence be split into two separate vectors for writing into
2197		 * the log. Hence we need to trim nbits back to the length of
2198		 * the current region being copied out of the log.
2199		 */
2200		if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
2201			nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
2202
2203		/*
2204		 * Do a sanity check if this is a dquot buffer. Just checking
2205		 * the first dquot in the buffer should do. XXX This is
2206		 * probably a good thing to do for other buf types also.
2207		 */
2208		error = 0;
2209		if (buf_f->blf_flags &
2210		   (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2211			if (item->ri_buf[i].i_addr == NULL) {
2212				xfs_alert(mp,
2213					"XFS: NULL dquot in %s.", __func__);
2214				goto next;
2215			}
2216			if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
2217				xfs_alert(mp,
2218					"XFS: dquot too small (%d) in %s.",
2219					item->ri_buf[i].i_len, __func__);
2220				goto next;
2221			}
2222			error = xfs_dqcheck(mp, item->ri_buf[i].i_addr,
2223					       -1, 0, XFS_QMOPT_DOWARN,
2224					       "dquot_buf_recover");
2225			if (error)
2226				goto next;
2227		}
2228
2229		memcpy(xfs_buf_offset(bp,
2230			(uint)bit << XFS_BLF_SHIFT),	/* dest */
2231			item->ri_buf[i].i_addr,		/* source */
2232			nbits<<XFS_BLF_SHIFT);		/* length */
2233 next:
2234		i++;
2235		bit += nbits;
2236	}
2237
2238	/* Shouldn't be any more regions */
2239	ASSERT(i == item->ri_total);
2240
2241	xlog_recover_validate_buf_type(mp, bp, buf_f);
2242}
2243
2244/*
2245 * Perform a dquot buffer recovery.
2246 * Simple algorithm: if we have found a QUOTAOFF log item of the same type
2247 * (i.e. USR or GRP), then just toss this buffer away; don't recover it.
2248 * Else, treat it as a regular buffer and do recovery.
2249 *
2250 * Return false if the buffer was tossed and true if we recovered the buffer to
2251 * indicate to the caller if the buffer needs writing.
2252 */
2253STATIC bool
2254xlog_recover_do_dquot_buffer(
2255	struct xfs_mount		*mp,
2256	struct xlog			*log,
2257	struct xlog_recover_item	*item,
2258	struct xfs_buf			*bp,
2259	struct xfs_buf_log_format	*buf_f)
2260{
2261	uint			type;
2262
2263	trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
2264
2265	/*
2266	 * Filesystems are required to send in quota flags at mount time.
2267	 */
2268	if (!mp->m_qflags)
2269		return false;
2270
2271	type = 0;
2272	if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
2273		type |= XFS_DQ_USER;
2274	if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
2275		type |= XFS_DQ_PROJ;
2276	if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
2277		type |= XFS_DQ_GROUP;
2278	/*
2279	 * This type of quota was turned off, so ignore this buffer
2280	 */
2281	if (log->l_quotaoffs_flag & type)
2282		return false;
2283
2284	xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2285	return true;
2286}
2287
2288/*
2289 * This routine replays a modification made to a buffer at runtime.
2290 * There are actually two types of buffer, regular and inode, which
2291 * are handled differently.  Inode buffers are handled differently
2292 * in that we only recover a specific set of data from them, namely
2293 * the inode di_next_unlinked fields.  This is because all other inode
2294 * data is actually logged via inode records and any data we replay
2295 * here which overlaps that may be stale.
2296 *
2297 * When meta-data buffers are freed at run time we log a buffer item
2298 * with the XFS_BLF_CANCEL bit set to indicate that previous copies
2299 * of the buffer in the log should not be replayed at recovery time.
2300 * This is so that if the blocks covered by the buffer are reused for
2301 * file data before we crash we don't end up replaying old, freed
2302 * meta-data into a user's file.
2303 *
2304 * To handle the cancellation of buffer log items, we make two passes
2305 * over the log during recovery.  During the first we build a table of
2306 * those buffers which have been cancelled, and during the second we
2307 * only replay those buffers which do not have corresponding cancel
2308 * records in the table.  See xlog_recover_buffer_pass[1,2] above
2309 * for more details on the implementation of the table of cancel records.
2310 */
2311STATIC int
2312xlog_recover_buffer_pass2(
2313	struct xlog			*log,
2314	struct list_head		*buffer_list,
2315	struct xlog_recover_item	*item,
2316	xfs_lsn_t			current_lsn)
2317{
2318	xfs_buf_log_format_t	*buf_f = item->ri_buf[0].i_addr;
2319	xfs_mount_t		*mp = log->l_mp;
2320	xfs_buf_t		*bp;
2321	int			error;
2322	uint			buf_flags;
2323	xfs_lsn_t		lsn;
2324
2325	/*
2326	 * In this pass we only want to recover all the buffers which have
2327	 * not been cancelled and are not cancellation buffers themselves.
2328	 */
2329	if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
2330			buf_f->blf_len, buf_f->blf_flags)) {
2331		trace_xfs_log_recover_buf_cancel(log, buf_f);
2332		return 0;
2333	}
2334
2335	trace_xfs_log_recover_buf_recover(log, buf_f);
2336
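	/*
	 * Inode buffers are only partially replayed (just the
	 * di_next_unlinked fields), and that is done via xfs_buf_offset(),
	 * so they can be read unmapped.
	 */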
2337	buf_flags = 0;
2338	if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
2339		buf_flags |= XBF_UNMAPPED;
2340
2341	bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2342			  buf_flags, NULL);
2343	if (!bp)
2344		return -ENOMEM;
2345	error = bp->b_error;
2346	if (error) {
2347		xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
2348		goto out_release;
2349	}
2350
2351	/*
2352	 * Recover the buffer only if we get an LSN from it and it's less than
2353	 * the lsn of the transaction we are replaying.
2354	 *
2355	 * Note that we have to be extremely careful of readahead here.
2356	 * Readahead does not attach verifiers to the buffers, so if we don't
2357	 * actually do any replay after readahead because the LSN we found in
2358	 * the buffer is more recent than the current transaction, then we
2359	 * need to attach the verifier directly. Failure to do so means that
2360	 * future recovery actions (e.g. EFI and unlinked list recovery) can
2361	 * operate on the buffers without the verifier attached. This can
2362	 * lead to blocks on disk having the correct content but a stale
2363	 * CRC.
2364	 *
2365	 * It is safe to assume these clean buffers are currently up to date.
2366	 * If the buffer is dirtied by a later transaction being replayed, then
2367	 * the verifier will be reset to match whatever recovery turns that
2368	 * buffer into.
2369	 */
2370	lsn = xlog_recover_get_buf_lsn(mp, bp);
2371	if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
2372		xlog_recover_validate_buf_type(mp, bp, buf_f);
2373		goto out_release;
2374	}
2375
2376	if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2377		error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2378		if (error)
2379			goto out_release;
2380	} else if (buf_f->blf_flags &
2381		  (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2382		bool	dirty;
2383
2384		dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2385		if (!dirty)
2386			goto out_release;
2387	} else {
2388		xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2389	}
2390
2391	/*
2392	 * Perform delayed write on the buffer.  Asynchronous writes will be
2393	 * slower when taking into account all the buffers to be flushed.
2394	 *
2395	 * Also make sure that only inode buffers with good sizes stay in
2396	 * the buffer cache.  The kernel moves inodes in buffers of 1 block
2397	 * or mp->m_inode_cluster_size bytes, whichever is bigger.  The inode
2398	 * buffers in the log can be a different size if the log was generated
2399	 * by an older kernel using unclustered inode buffers or a newer kernel
2400	 * running with a different inode cluster size.  Regardless, if the
2401	 * inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size)
2402	 * for *our* value of mp->m_inode_cluster_size, then we need to keep
2403	 * the buffer out of the buffer cache so that the buffer won't
2404	 * overlap with future reads of those inodes.
2405	 */
2406	if (XFS_DINODE_MAGIC ==
2407	    be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
2408	    (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize,
2409			(__uint32_t)log->l_mp->m_inode_cluster_size))) {
2410		xfs_buf_stale(bp);
2411		error = xfs_bwrite(bp);
2412	} else {
2413		ASSERT(bp->b_target->bt_mount == mp);
2414		bp->b_iodone = xlog_recover_iodone;
2415		xfs_buf_delwri_queue(bp, buffer_list);
2416	}
2417
2418out_release:
2419	xfs_buf_relse(bp);
2420	return error;
2421}
2422
2423/*
2424 * Inode fork owner changes
2425 *
2426 * If we have been told that we have to reparent the inode fork, it's because an
2427 * extent swap operation on a CRC enabled filesystem has been done and we are
2428 * replaying it. We need to walk the BMBT of the appropriate fork and change the
2429 * owners of it.
2430 *
2431 * The complexity here is that we don't have an inode context to work with, so
2432 * after we've replayed the inode we need to instantiate one.  This is where the
2433 * fun begins.
2434 *
2435 * We are in the middle of log recovery, so we can't run transactions. That
2436 * means we cannot use cache coherent inode instantiation via xfs_iget(), as
2437 * that will result in the corresponding iput() running the inode through
2438 * xfs_inactive(). If we've just replayed an inode core that changes the link
2439 * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
2440 * transactions (bad!).
2441 *
2442 * So, to avoid this, we instantiate an inode directly from the inode core we've
2443 * just recovered. We have the buffer still locked, and all we really need to
2444 * instantiate is the inode core and the forks being modified. We can do this
2445 * manually, then run the inode btree owner change, and then tear down the
2446 * xfs_inode without having to run any transactions at all.
2447 *
2448 * Also, because we don't have a transaction context available here, we need to
2449 * gather all the buffers we modify for writeback, so we pass the buffer_list
2450 * to the operation to use instead.
2451 */
2452
2453STATIC int
2454xfs_recover_inode_owner_change(
2455	struct xfs_mount	*mp,
2456	struct xfs_dinode	*dip,
2457	struct xfs_inode_log_format *in_f,
2458	struct list_head	*buffer_list)
2459{
2460	struct xfs_inode	*ip;
2461	int			error;
2462
2463	ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));
2464
2465	ip = xfs_inode_alloc(mp, in_f->ilf_ino);
2466	if (!ip)
2467		return -ENOMEM;
2468
2469	/* instantiate the inode */
2470	xfs_dinode_from_disk(&ip->i_d, dip);
2471	ASSERT(ip->i_d.di_version >= 3);
2472
2473	error = xfs_iformat_fork(ip, dip);
2474	if (error)
2475		goto out_free_ip;
2476
2477
2478	if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
2479		ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
2480		error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
2481					      ip->i_ino, buffer_list);
2482		if (error)
2483			goto out_free_ip;
2484	}
2485
2486	if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
2487		ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
2488		error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK,
2489					      ip->i_ino, buffer_list);
2490		if (error)
2491			goto out_free_ip;
2492	}
2493
2494out_free_ip:
2495	xfs_inode_free(ip);
2496	return error;
2497}
2498
2499STATIC int
2500xlog_recover_inode_pass2(
2501	struct xlog			*log,
2502	struct list_head		*buffer_list,
2503	struct xlog_recover_item	*item,
2504	xfs_lsn_t			current_lsn)
2505{
2506	xfs_inode_log_format_t	*in_f;
2507	xfs_mount_t		*mp = log->l_mp;
2508	xfs_buf_t		*bp;
2509	xfs_dinode_t		*dip;
2510	int			len;
2511	xfs_caddr_t		src;
2512	xfs_caddr_t		dest;
2513	int			error;
2514	int			attr_index;
2515	uint			fields;
2516	xfs_icdinode_t		*dicp;
2517	uint			isize;
2518	int			need_free = 0;
2519
2520	if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2521		in_f = item->ri_buf[0].i_addr;
2522	} else {
2523		in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP);
2524		need_free = 1;
2525		error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
2526		if (error)
2527			goto error;
2528	}
2529
2530	/*
2531	 * Inode buffers can be freed; look out for that case
2532	 * and do not replay the inode.
2533	 */
2534	if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2535					in_f->ilf_len, 0)) {
2536		error = 0;
2537		trace_xfs_log_recover_inode_cancel(log, in_f);
2538		goto error;
2539	}
2540	trace_xfs_log_recover_inode_recover(log, in_f);
2541
2542	bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
2543			  &xfs_inode_buf_ops);
2544	if (!bp) {
2545		error = -ENOMEM;
2546		goto error;
2547	}
2548	error = bp->b_error;
2549	if (error) {
2550		xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
2551		goto out_release;
2552	}
2553	ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2554	dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
2555
2556	/*
2557	 * Make sure the place we're flushing out to really looks
2558	 * like an inode!
2559	 */
2560	if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
2561		xfs_alert(mp,
2562	"%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
2563			__func__, dip, bp, in_f->ilf_ino);
2564		XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2565				 XFS_ERRLEVEL_LOW, mp);
2566		error = -EFSCORRUPTED;
2567		goto out_release;
2568	}
2569	dicp = item->ri_buf[1].i_addr;
2570	if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2571		xfs_alert(mp,
2572			"%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
2573			__func__, item, in_f->ilf_ino);
2574		XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
2575				 XFS_ERRLEVEL_LOW, mp);
2576		error = -EFSCORRUPTED;
2577		goto out_release;
2578	}
2579
2580	/*
2581	 * If the inode has an LSN in it, recover the inode only if it's less
2582	 * than the lsn of the transaction we are replaying. Note: we still
2583	 * need to replay an owner change even though the inode is more recent
2584	 * than the transaction as there is no guarantee that all the btree
2585	 * blocks are more recent than this transaction, too.
2586	 */
2587	if (dip->di_version >= 3) {
2588		xfs_lsn_t	lsn = be64_to_cpu(dip->di_lsn);
2589
2590		if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
2591			trace_xfs_log_recover_inode_skip(log, in_f);
2592			error = 0;
2593			goto out_owner_change;
2594		}
2595	}
2596
2597	/*
2598	 * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
2599	 * are transactional and if ordering is necessary we can determine that
2600	 * more accurately by the LSN field in the V3 inode core. Don't trust
2601	 * the inode versions as we might be changing them here - use the
2602	 * superblock flag to determine whether we need to look at di_flushiter
2603	 * to skip replay when the on disk inode is newer than the log one.
2604	 */
2605	if (!xfs_sb_version_hascrc(&mp->m_sb) &&
2606	    dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
2607		/*
2608		 * Deal with the wrap case: once the counter wraps,
2609		 * DI_MAX_FLUSH is effectively less than smaller numbers.
2610		 */
2611		if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
2612		    dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
2613			/* do nothing */
2614		} else {
2615			trace_xfs_log_recover_inode_skip(log, in_f);
2616			error = 0;
2617			goto out_release;
2618		}
2619	}
2620
2621	/* Take the opportunity to reset the flush iteration count */
2622	dicp->di_flushiter = 0;
2623
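	/* Sanity check the recovered inode core before copying it into the buffer. */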
2624	if (unlikely(S_ISREG(dicp->di_mode))) {
2625		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2626		    (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2627			XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
2628					 XFS_ERRLEVEL_LOW, mp, dicp);
2629			xfs_alert(mp,
2630		"%s: Bad regular inode log record, rec ptr 0x%p, "
2631		"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2632				__func__, item, dip, bp, in_f->ilf_ino);
2633			error = -EFSCORRUPTED;
2634			goto out_release;
2635		}
2636	} else if (unlikely(S_ISDIR(dicp->di_mode))) {
2637		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2638		    (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2639		    (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2640			XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
2641					     XFS_ERRLEVEL_LOW, mp, dicp);
2642			xfs_alert(mp,
2643		"%s: Bad dir inode log record, rec ptr 0x%p, "
2644		"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2645				__func__, item, dip, bp, in_f->ilf_ino);
2646			error = -EFSCORRUPTED;
2647			goto out_release;
2648		}
2649	}
2650	if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2651		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
2652				     XFS_ERRLEVEL_LOW, mp, dicp);
2653		xfs_alert(mp,
2654	"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2655	"dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2656			__func__, item, dip, bp, in_f->ilf_ino,
2657			dicp->di_nextents + dicp->di_anextents,
2658			dicp->di_nblocks);
2659		error = -EFSCORRUPTED;
2660		goto out_release;
2661	}
2662	if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2663		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
2664				     XFS_ERRLEVEL_LOW, mp, dicp);
2665		xfs_alert(mp,
2666	"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2667	"dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
2668			item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
2669		error = -EFSCORRUPTED;
2670		goto out_release;
2671	}
2672	isize = xfs_icdinode_size(dicp->di_version);
2673	if (unlikely(item->ri_buf[1].i_len > isize)) {
2674		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
2675				     XFS_ERRLEVEL_LOW, mp, dicp);
2676		xfs_alert(mp,
2677			"%s: Bad inode log record length %d, rec ptr 0x%p",
2678			__func__, item->ri_buf[1].i_len, item);
2679		error = -EFSCORRUPTED;
2680		goto out_release;
2681	}
2682
2683	/* The core is in in-core format */
2684	xfs_dinode_to_disk(dip, dicp);
2685
2686	/* the rest is in on-disk format */
2687	if (item->ri_buf[1].i_len > isize) {
2688		memcpy((char *)dip + isize,
2689			item->ri_buf[1].i_addr + isize,
2690			item->ri_buf[1].i_len - isize);
2691	}
2692
2693	fields = in_f->ilf_fields;
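	/* Restore a logged device number or UUID from the log format structure. */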
2694	switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
2695	case XFS_ILOG_DEV:
2696		xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
2697		break;
2698	case XFS_ILOG_UUID:
2699		memcpy(XFS_DFORK_DPTR(dip),
2700		       &in_f->ilf_u.ilfu_uuid,
2701		       sizeof(uuid_t));
2702		break;
2703	}
2704
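	/*
	 * If only the inode log format and the inode core were logged
	 * (ilf_size == 2), there are no data or attr fork regions to replay.
	 */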
2705	if (in_f->ilf_size == 2)
2706		goto out_owner_change;
2707	len = item->ri_buf[2].i_len;
2708	src = item->ri_buf[2].i_addr;
2709	ASSERT(in_f->ilf_size <= 4);
2710	ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
2711	ASSERT(!(fields & XFS_ILOG_DFORK) ||
2712	       (len == in_f->ilf_dsize));
2713
2714	switch (fields & XFS_ILOG_DFORK) {
2715	case XFS_ILOG_DDATA:
2716	case XFS_ILOG_DEXT:
2717		memcpy(XFS_DFORK_DPTR(dip), src, len);
2718		break;
2719
2720	case XFS_ILOG_DBROOT:
2721		xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
2722				 (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
2723				 XFS_DFORK_DSIZE(dip, mp));
2724		break;
2725
2726	default:
2727		/*
2728		 * There are no data fork flags set.
2729		 */
2730		ASSERT((fields & XFS_ILOG_DFORK) == 0);
2731		break;
2732	}
2733
2734	/*
2735	 * If we logged any attribute data, recover it.  There may or
2736	 * may not have been any other non-core data logged in this
2737	 * transaction.
2738	 */
2739	if (in_f->ilf_fields & XFS_ILOG_AFORK) {
2740		if (in_f->ilf_fields & XFS_ILOG_DFORK) {
2741			attr_index = 3;
2742		} else {
2743			attr_index = 2;
2744		}
2745		len = item->ri_buf[attr_index].i_len;
2746		src = item->ri_buf[attr_index].i_addr;
2747		ASSERT(len == in_f->ilf_asize);
2748
2749		switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
2750		case XFS_ILOG_ADATA:
2751		case XFS_ILOG_AEXT:
2752			dest = XFS_DFORK_APTR(dip);
2753			ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
2754			memcpy(dest, src, len);
2755			break;
2756
2757		case XFS_ILOG_ABROOT:
2758			dest = XFS_DFORK_APTR(dip);
2759			xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
2760					 len, (xfs_bmdr_block_t*)dest,
2761					 XFS_DFORK_ASIZE(dip, mp));
2762			break;
2763
2764		default:
2765			xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
2766			ASSERT(0);
2767			error = -EIO;
2768			goto out_release;
2769		}
2770	}
2771
2772out_owner_change:
2773	if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER))
2774		error = xfs_recover_inode_owner_change(mp, dip, in_f,
2775						       buffer_list);
2776	/* re-generate the checksum. */
2777	xfs_dinode_calc_crc(log->l_mp, dip);
2778
2779	ASSERT(bp->b_target->bt_mount == mp);
2780	bp->b_iodone = xlog_recover_iodone;
2781	xfs_buf_delwri_queue(bp, buffer_list);
2782
2783out_release:
2784	xfs_buf_relse(bp);
2785error:
2786	if (need_free)
2787		kmem_free(in_f);
2788	return error;
2789}
2790
2791/*
2792 * Recover QUOTAOFF records. We simply make a note of them in the xlog
2793 * structure, so that we know not to do any dquot item or dquot buffer recovery
2794 * of that type.
2795 */
2796STATIC int
2797xlog_recover_quotaoff_pass1(
2798	struct xlog			*log,
2799	struct xlog_recover_item	*item)
2800{
2801	xfs_qoff_logformat_t	*qoff_f = item->ri_buf[0].i_addr;
2802	ASSERT(qoff_f);
2803
2804	/*
2805	 * The logitem format's flag tells us if this was user quotaoff,
2806	 * group/project quotaoff or both.
2807	 */
2808	if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
2809		log->l_quotaoffs_flag |= XFS_DQ_USER;
2810	if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
2811		log->l_quotaoffs_flag |= XFS_DQ_PROJ;
2812	if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
2813		log->l_quotaoffs_flag |= XFS_DQ_GROUP;
2814
2815	return 0;
2816}
2817
2818/*
2819 * Recover a dquot record
2820 */
2821STATIC int
2822xlog_recover_dquot_pass2(
2823	struct xlog			*log,
2824	struct list_head		*buffer_list,
2825	struct xlog_recover_item	*item,
2826	xfs_lsn_t			current_lsn)
2827{
2828	xfs_mount_t		*mp = log->l_mp;
2829	xfs_buf_t		*bp;
2830	struct xfs_disk_dquot	*ddq, *recddq;
2831	int			error;
2832	xfs_dq_logformat_t	*dq_f;
2833	uint			type;
2834
2835
2836	/*
2837	 * Filesystems are required to send in quota flags at mount time.
2838	 */
2839	if (mp->m_qflags == 0)
2840		return 0;
2841
2842	recddq = item->ri_buf[1].i_addr;
2843	if (recddq == NULL) {
2844		xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
2845		return -EIO;
2846	}
2847	if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
2848		xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
2849			item->ri_buf[1].i_len, __func__);
2850		return -EIO;
2851	}
2852
2853	/*
2854	 * This type of quota was turned off, so ignore this record.
2855	 */
2856	type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
2857	ASSERT(type);
2858	if (log->l_quotaoffs_flag & type)
2859		return 0;
2860
2861	/*
2862	 * At this point we know that quota was _not_ turned off.
2863	 * Since the mount flags are not indicating to us otherwise, this
2864	 * must mean that quota is on, and the dquot needs to be replayed.
2865	 * Remember that we may not have fully recovered the superblock yet,
2866	 * so we can't do the usual trick of looking at the SB quota bits.
2867	 *
2868	 * The other possibility, of course, is that the quota subsystem was
2869	 * removed since the last mount - ENOSYS.
2870	 */
2871	dq_f = item->ri_buf[0].i_addr;
2872	ASSERT(dq_f);
2873	error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2874			   "xlog_recover_dquot_pass2 (log copy)");
2875	if (error)
2876		return -EIO;
2877	ASSERT(dq_f->qlf_len == 1);
2878
2879	/*
2880	 * At this point we are assuming that the dquots have been allocated
2881	 * and hence the buffer has valid dquots stamped in it. It should,
2882	 * therefore, pass verifier validation. If the dquot is bad, then
2883	 * we'll return an error here, so we don't need to specifically check
2884	 * the dquot in the buffer after the verifier has run.
2885	 */
2886	error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
2887				   XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
2888				   &xfs_dquot_buf_ops);
2889	if (error)
2890		return error;
2891
2892	ASSERT(bp);
2893	ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
2894
2895	/*
2896	 * If the dquot has an LSN in it, recover the dquot only if it's less
2897	 * than the lsn of the transaction we are replaying.
2898	 */
2899	if (xfs_sb_version_hascrc(&mp->m_sb)) {
2900		struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
2901		xfs_lsn_t	lsn = be64_to_cpu(dqb->dd_lsn);
2902
2903		if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
2904			goto out_release;
2905		}
2906	}
2907
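	/*
	 * Replay the logged dquot over the on-disk copy and, on v5
	 * filesystems, recompute the dquot block CRC.
	 */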
2908	memcpy(ddq, recddq, item->ri_buf[1].i_len);
2909	if (xfs_sb_version_hascrc(&mp->m_sb)) {
2910		xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
2911				 XFS_DQUOT_CRC_OFF);
2912	}
2913
2914	ASSERT(dq_f->qlf_size == 2);
2915	ASSERT(bp->b_target->bt_mount == mp);
2916	bp->b_iodone = xlog_recover_iodone;
2917	xfs_buf_delwri_queue(bp, buffer_list);
2918
2919out_release:
2920	xfs_buf_relse(bp);
2921	return 0;
2922}
2923
2924/*
2925 * This routine is called to create an in-core extent free intent
2926 * item from the efi format structure which was logged on disk.
2927 * It allocates an in-core efi, copies the extents from the format
2928 * structure into it, and adds the efi to the AIL with the given
2929 * LSN.
2930 */
2931STATIC int
2932xlog_recover_efi_pass2(
2933	struct xlog			*log,
2934	struct xlog_recover_item	*item,
2935	xfs_lsn_t			lsn)
2936{
2937	int			error;
2938	xfs_mount_t		*mp = log->l_mp;
2939	xfs_efi_log_item_t	*efip;
2940	xfs_efi_log_format_t	*efi_formatp;
2941
2942	efi_formatp = item->ri_buf[0].i_addr;
2943
2944	efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
2945	if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
2946					 &(efip->efi_format)))) {
2947		xfs_efi_item_free(efip);
2948		return error;
2949	}
2950	atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
2951
2952	spin_lock(&log->l_ailp->xa_lock);
2953	/*
2954	 * xfs_trans_ail_update() drops the AIL lock.
2955	 */
2956	xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
2957	return 0;
2958}
2959
2960
2961/*
2962 * This routine is called when an efd format structure is found in
2963 * a committed transaction in the log.  Its purpose is to cancel
2964 * the corresponding efi if it was still in the log.  To do this
2965 * it searches the AIL for the efi with an id equal to that in the
2966 * efd format structure.  If we find it, we remove the efi from the
2967 * AIL and free it.
2968 */
2969STATIC int
2970xlog_recover_efd_pass2(
2971	struct xlog			*log,
2972	struct xlog_recover_item	*item)
2973{
2974	xfs_efd_log_format_t	*efd_formatp;
2975	xfs_efi_log_item_t	*efip = NULL;
2976	xfs_log_item_t		*lip;
2977	__uint64_t		efi_id;
2978	struct xfs_ail_cursor	cur;
2979	struct xfs_ail		*ailp = log->l_ailp;
2980
2981	efd_formatp = item->ri_buf[0].i_addr;
2982	ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
2983		((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
2984	       (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
2985		((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
2986	efi_id = efd_formatp->efd_efi_id;
2987
2988	/*
2989	 * Search for the efi with the id in the efd format structure
2990	 * in the AIL.
2991	 */
2992	spin_lock(&ailp->xa_lock);
2993	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2994	while (lip != NULL) {
2995		if (lip->li_type == XFS_LI_EFI) {
2996			efip = (xfs_efi_log_item_t *)lip;
2997			if (efip->efi_format.efi_id == efi_id) {
2998				/*
2999				 * xfs_trans_ail_delete() drops the
3000				 * AIL lock.
3001				 */
3002				xfs_trans_ail_delete(ailp, lip,
3003						     SHUTDOWN_CORRUPT_INCORE);
3004				xfs_efi_item_free(efip);
3005				spin_lock(&ailp->xa_lock);
3006				break;
3007			}
3008		}
3009		lip = xfs_trans_ail_cursor_next(ailp, &cur);
3010	}
3011	xfs_trans_ail_cursor_done(&cur);
3012	spin_unlock(&ailp->xa_lock);
3013
3014	return 0;
3015}
3016
3017/*
3018 * This routine is called when an inode create format structure is found in a
3019 * committed transaction in the log.  Its purpose is to initialise the inodes
3020 * being allocated on disk. This requires us to get inode cluster buffers that
3021 * match the range to be initialised, stamp them with inode templates and write
3022 * them by delayed write so that subsequent modifications will hit the cached buffer
3023 * and only need writing out at the end of recovery.
3024 */
3025STATIC int
3026xlog_recover_do_icreate_pass2(
3027	struct xlog		*log,
3028	struct list_head	*buffer_list,
3029	xlog_recover_item_t	*item)
3030{
3031	struct xfs_mount	*mp = log->l_mp;
3032	struct xfs_icreate_log	*icl;
3033	xfs_agnumber_t		agno;
3034	xfs_agblock_t		agbno;
3035	unsigned int		count;
3036	unsigned int		isize;
3037	xfs_agblock_t		length;
3038
3039	icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
3040	if (icl->icl_type != XFS_LI_ICREATE) {
3041		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
3042		return -EINVAL;
3043	}
3044
3045	if (icl->icl_size != 1) {
3046		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
3047		return -EINVAL;
3048	}
3049
3050	agno = be32_to_cpu(icl->icl_ag);
3051	if (agno >= mp->m_sb.sb_agcount) {
3052		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
3053		return -EINVAL;
3054	}
3055	agbno = be32_to_cpu(icl->icl_agbno);
3056	if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
3057		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
3058		return -EINVAL;
3059	}
3060	isize = be32_to_cpu(icl->icl_isize);
3061	if (isize != mp->m_sb.sb_inodesize) {
3062		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
3063		return -EINVAL;
3064	}
3065	count = be32_to_cpu(icl->icl_count);
3066	if (!count) {
3067		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
3068		return -EINVAL;
3069	}
3070	length = be32_to_cpu(icl->icl_length);
3071	if (!length || length >= mp->m_sb.sb_agblocks) {
3072		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
3073		return -EINVAL;
3074	}
3075
3076	/* existing allocation is fixed value */
3077	ASSERT(count == mp->m_ialloc_inos);
3078	ASSERT(length == mp->m_ialloc_blks);
3079	if (count != mp->m_ialloc_inos ||
3080	     length != mp->m_ialloc_blks) {
3081		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
3082		return -EINVAL;
3083	}
3084
3085	/*
3086	 * Inode buffers can be freed. Do not replay the inode initialisation as
3087	 * we could be overwriting something written after this inode buffer was
3088	 * cancelled.
3089	 *
3090	 * XXX: we need to iterate all buffers and only init those that are not
3091	 * cancelled. I think that a more fine grained factoring of
3092	 * xfs_ialloc_inode_init may be appropriate here to enable this to be
3093	 * done easily.
3094	 */
3095	if (xlog_check_buffer_cancelled(log,
3096			XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0))
3097		return 0;
3098
3099	xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length,
3100					be32_to_cpu(icl->icl_gen));
3101	return 0;
3102}
3103
3104STATIC void
3105xlog_recover_buffer_ra_pass2(
3106	struct xlog                     *log,
3107	struct xlog_recover_item        *item)
3108{
3109	struct xfs_buf_log_format	*buf_f = item->ri_buf[0].i_addr;
3110	struct xfs_mount		*mp = log->l_mp;
3111
3112	if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno,
3113			buf_f->blf_len, buf_f->blf_flags)) {
3114		return;
3115	}
3116
3117	xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno,
3118				buf_f->blf_len, NULL);
3119}
3120
3121STATIC void
3122xlog_recover_inode_ra_pass2(
3123	struct xlog                     *log,
3124	struct xlog_recover_item        *item)
3125{
3126	struct xfs_inode_log_format	ilf_buf;
3127	struct xfs_inode_log_format	*ilfp;
3128	struct xfs_mount		*mp = log->l_mp;
3129	int			error;
3130
3131	if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
3132		ilfp = item->ri_buf[0].i_addr;
3133	} else {
3134		ilfp = &ilf_buf;
3135		memset(ilfp, 0, sizeof(*ilfp));
3136		error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp);
3137		if (error)
3138			return;
3139	}
3140
3141	if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0))
3142		return;
3143
3144	xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno,
3145				ilfp->ilf_len, &xfs_inode_buf_ra_ops);
3146}
3147
3148STATIC void
3149xlog_recover_dquot_ra_pass2(
3150	struct xlog			*log,
3151	struct xlog_recover_item	*item)
3152{
3153	struct xfs_mount	*mp = log->l_mp;
3154	struct xfs_disk_dquot	*recddq;
3155	struct xfs_dq_logformat	*dq_f;
3156	uint			type;
3157	int			len;
3158
3159
3160	if (mp->m_qflags == 0)
3161		return;
3162
3163	recddq = item->ri_buf[1].i_addr;
3164	if (recddq == NULL)
3165		return;
3166	if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
3167		return;
3168
3169	type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
3170	ASSERT(type);
3171	if (log->l_quotaoffs_flag & type)
3172		return;
3173
3174	dq_f = item->ri_buf[0].i_addr;
3175	ASSERT(dq_f);
3176	ASSERT(dq_f->qlf_len == 1);
3177
3178	len = XFS_FSB_TO_BB(mp, dq_f->qlf_len);
3179	if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0))
3180		return;
3181
3182	xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len,
3183			  &xfs_dquot_buf_ra_ops);
3184}
3185
3186STATIC void
3187xlog_recover_ra_pass2(
3188	struct xlog			*log,
3189	struct xlog_recover_item	*item)
3190{
3191	switch (ITEM_TYPE(item)) {
3192	case XFS_LI_BUF:
3193		xlog_recover_buffer_ra_pass2(log, item);
3194		break;
3195	case XFS_LI_INODE:
3196		xlog_recover_inode_ra_pass2(log, item);
3197		break;
3198	case XFS_LI_DQUOT:
3199		xlog_recover_dquot_ra_pass2(log, item);
3200		break;
3201	case XFS_LI_EFI:
3202	case XFS_LI_EFD:
3203	case XFS_LI_QUOTAOFF:
3204	default:
3205		break;
3206	}
3207}
3208
3209STATIC int
3210xlog_recover_commit_pass1(
3211	struct xlog			*log,
3212	struct xlog_recover		*trans,
3213	struct xlog_recover_item	*item)
3214{
3215	trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
3216
3217	switch (ITEM_TYPE(item)) {
3218	case XFS_LI_BUF:
3219		return xlog_recover_buffer_pass1(log, item);
3220	case XFS_LI_QUOTAOFF:
3221		return xlog_recover_quotaoff_pass1(log, item);
3222	case XFS_LI_INODE:
3223	case XFS_LI_EFI:
3224	case XFS_LI_EFD:
3225	case XFS_LI_DQUOT:
3226	case XFS_LI_ICREATE:
3227		/* nothing to do in pass 1 */
3228		return 0;
3229	default:
3230		xfs_warn(log->l_mp, "%s: invalid item type (%d)",
3231			__func__, ITEM_TYPE(item));
3232		ASSERT(0);
3233		return -EIO;
3234	}
3235}
3236
3237STATIC int
3238xlog_recover_commit_pass2(
3239	struct xlog			*log,
3240	struct xlog_recover		*trans,
3241	struct list_head		*buffer_list,
3242	struct xlog_recover_item	*item)
3243{
3244	trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
3245
3246	switch (ITEM_TYPE(item)) {
3247	case XFS_LI_BUF:
3248		return xlog_recover_buffer_pass2(log, buffer_list, item,
3249						 trans->r_lsn);
3250	case XFS_LI_INODE:
3251		return xlog_recover_inode_pass2(log, buffer_list, item,
3252						 trans->r_lsn);
3253	case XFS_LI_EFI:
3254		return xlog_recover_efi_pass2(log, item, trans->r_lsn);
3255	case XFS_LI_EFD:
3256		return xlog_recover_efd_pass2(log, item);
3257	case XFS_LI_DQUOT:
3258		return xlog_recover_dquot_pass2(log, buffer_list, item,
3259						trans->r_lsn);
3260	case XFS_LI_ICREATE:
3261		return xlog_recover_do_icreate_pass2(log, buffer_list, item);
3262	case XFS_LI_QUOTAOFF:
3263		/* nothing to do in pass2 */
3264		return 0;
3265	default:
3266		xfs_warn(log->l_mp, "%s: invalid item type (%d)",
3267			__func__, ITEM_TYPE(item));
3268		ASSERT(0);
3269		return -EIO;
3270	}
3271}
3272
3273STATIC int
3274xlog_recover_items_pass2(
3275	struct xlog                     *log,
3276	struct xlog_recover             *trans,
3277	struct list_head                *buffer_list,
3278	struct list_head                *item_list)
3279{
3280	struct xlog_recover_item	*item;
3281	int				error = 0;
3282
3283	list_for_each_entry(item, item_list, ri_list) {
3284		error = xlog_recover_commit_pass2(log, trans,
3285					  buffer_list, item);
3286		if (error)
3287			return error;
3288	}
3289
3290	return error;
3291}
3292
3293/*
3294 * Perform the transaction.
3295 *
3296 * If the transaction modifies a buffer or inode, do it now.  Otherwise,
3297 * EFIs and EFDs get queued up by adding entries into the AIL for them.
3298 */
3299STATIC int
3300xlog_recover_commit_trans(
3301	struct xlog		*log,
3302	struct xlog_recover	*trans,
3303	int			pass)
3304{
3305	int				error = 0;
3306	int				error2;
3307	int				items_queued = 0;
3308	struct xlog_recover_item	*item;
3309	struct xlog_recover_item	*next;
3310	LIST_HEAD			(buffer_list);
3311	LIST_HEAD			(ra_list);
3312	LIST_HEAD			(done_list);
3313
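	/* Number of items to batch up behind readahead before replaying them. */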
3314	#define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
3315
3316	hlist_del(&trans->r_list);
3317
3318	error = xlog_recover_reorder_trans(log, trans, pass);
3319	if (error)
3320		return error;
3321
3322	list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
3323		switch (pass) {
3324		case XLOG_RECOVER_PASS1:
3325			error = xlog_recover_commit_pass1(log, trans, item);
3326			break;
3327		case XLOG_RECOVER_PASS2:
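			/*
			 * Issue readahead for the item's buffers and batch the
			 * items up so the readahead can complete before they
			 * are replayed.
			 */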
3328			xlog_recover_ra_pass2(log, item);
3329			list_move_tail(&item->ri_list, &ra_list);
3330			items_queued++;
3331			if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
3332				error = xlog_recover_items_pass2(log, trans,
3333						&buffer_list, &ra_list);
3334				list_splice_tail_init(&ra_list, &done_list);
3335				items_queued = 0;
3336			}
3337
3338			break;
3339		default:
3340			ASSERT(0);
3341		}
3342
3343		if (error)
3344			goto out;
3345	}
3346
3347out:
3348	if (!list_empty(&ra_list)) {
3349		if (!error)
3350			error = xlog_recover_items_pass2(log, trans,
3351					&buffer_list, &ra_list);
3352		list_splice_tail_init(&ra_list, &done_list);
3353	}
3354
3355	if (!list_empty(&done_list))
3356		list_splice_init(&done_list, &trans->r_itemq);
3357
3358	error2 = xfs_buf_delwri_submit(&buffer_list);
3359	return error ? error : error2;
3360}
3361
3362STATIC void
3363xlog_recover_add_item(
3364	struct list_head	*head)
3365{
3366	xlog_recover_item_t	*item;
3367
3368	item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
3369	INIT_LIST_HEAD(&item->ri_list);
3370	list_add_tail(&item->ri_list, head);
3371}
3372
3373STATIC int
3374xlog_recover_add_to_cont_trans(
3375	struct xlog		*log,
3376	struct xlog_recover	*trans,
3377	xfs_caddr_t		dp,
3378	int			len)
3379{
3380	xlog_recover_item_t	*item;
3381	xfs_caddr_t		ptr, old_ptr;
3382	int			old_len;
3383
3384	if (list_empty(&trans->r_itemq)) {
3385		/* finish copying rest of trans header */
3386		xlog_recover_add_item(&trans->r_itemq);
3387		ptr = (xfs_caddr_t) &trans->r_theader +
3388				sizeof(xfs_trans_header_t) - len;
3389		memcpy(ptr, dp, len);
3390		return 0;
3391	}
3392	/* take the tail entry */
3393	item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
3394
3395	old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
3396	old_len = item->ri_buf[item->ri_cnt-1].i_len;
3397
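	/*
	 * Extend the last region of the tail item and append the
	 * continuation data to it.
	 */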
3398	ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
3399	memcpy(&ptr[old_len], dp, len);
3400	item->ri_buf[item->ri_cnt-1].i_len += len;
3401	item->ri_buf[item->ri_cnt-1].i_addr = ptr;
3402	trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
3403	return 0;
3404}
3405
3406/*
3407 * The next region to add is the start of a new region.  It could be
3408 * a whole region or it could be the first part of a new region.  Because
3409 * of this, the assumption here is that the type and size fields of all
3410 * format structures fit into the first 32 bits of the structure.
3411 *
3412 * This works because all regions must be 32 bit aligned.  Therefore, we
3413 * either have both fields or we have neither field.  In the case we have
3414 * neither field, the data part of the region is zero length.  We only have
3415 * a log_op_header and can throw away the header since a new one will appear
3416 * later.  If we have at least 4 bytes, then we can determine how many regions
3417 * will appear in the current log item.
3418 */
3419STATIC int
3420xlog_recover_add_to_trans(
3421	struct xlog		*log,
3422	struct xlog_recover	*trans,
3423	xfs_caddr_t		dp,
3424	int			len)
3425{
3426	xfs_inode_log_format_t	*in_f;			/* any will do */
3427	xlog_recover_item_t	*item;
3428	xfs_caddr_t		ptr;
3429
3430	if (!len)
3431		return 0;
3432	if (list_empty(&trans->r_itemq)) {
3433		/* we need to catch log corruptions here */
3434		if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
3435			xfs_warn(log->l_mp, "%s: bad header magic number",
3436				__func__);
3437			ASSERT(0);
3438			return -EIO;
3439		}
3440		if (len == sizeof(xfs_trans_header_t))
3441			xlog_recover_add_item(&trans->r_itemq);
3442		memcpy(&trans->r_theader, dp, len);
3443		return 0;
3444	}
3445
3446	ptr = kmem_alloc(len, KM_SLEEP);
3447	memcpy(ptr, dp, len);
3448	in_f = (xfs_inode_log_format_t *)ptr;
3449
3450	/* take the tail entry */
3451	item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
3452	if (item->ri_total != 0 &&
3453	     item->ri_total == item->ri_cnt) {
3454		/* tail item is in use, get a new one */
3455		xlog_recover_add_item(&trans->r_itemq);
3456		item = list_entry(trans->r_itemq.prev,
3457					xlog_recover_item_t, ri_list);
3458	}
3459
3460	if (item->ri_total == 0) {		/* first region to be added */
3461		if (in_f->ilf_size == 0 ||
3462		    in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
3463			xfs_warn(log->l_mp,
3464		"bad number of regions (%d) in inode log format",
3465				  in_f->ilf_size);
3466			ASSERT(0);
3467			kmem_free(ptr);
3468			return -EIO;
3469		}
3470
3471		item->ri_total = in_f->ilf_size;
3472		item->ri_buf =
3473			kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
3474				    KM_SLEEP);
3475	}
3476	ASSERT(item->ri_total > item->ri_cnt);
3477	/* Description region is ri_buf[0] */
3478	item->ri_buf[item->ri_cnt].i_addr = ptr;
3479	item->ri_buf[item->ri_cnt].i_len  = len;
3480	item->ri_cnt++;
3481	trace_xfs_log_recover_item_add(log, trans, item, 0);
3482	return 0;
3483}
3484
3485/*
3486 * Free up any resources allocated by the transaction
3487 *
3488 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
3489 */
3490STATIC void
3491xlog_recover_free_trans(
3492	struct xlog_recover	*trans)
3493{
3494	xlog_recover_item_t	*item, *n;
3495	int			i;
3496
3497	list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
3498		/* Free the regions in the item. */
3499		list_del(&item->ri_list);
3500		for (i = 0; i < item->ri_cnt; i++)
3501			kmem_free(item->ri_buf[i].i_addr);
3502		/* Free the item's region array and the item itself */
3503		kmem_free(item->ri_buf);
3504		kmem_free(item);
3505	}
3506	/* Free the transaction recover structure */
3507	kmem_free(trans);
3508}
3509
3510/*
3511 * On error or completion, trans is freed.
3512 */
3513STATIC int
3514xlog_recovery_process_trans(
3515	struct xlog		*log,
3516	struct xlog_recover	*trans,
3517	xfs_caddr_t		dp,
3518	unsigned int		len,
3519	unsigned int		flags,
3520	int			pass)
3521{
3522	int			error = 0;
3523	bool			freeit = false;
3524
3525	/* mask off ophdr transaction container flags */
3526	flags &= ~XLOG_END_TRANS;
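	/*
	 * An op that continues a region from the previous log record may
	 * itself be continued into the next record, in which case both
	 * XLOG_WAS_CONT_TRANS and XLOG_CONTINUE_TRANS are set.  Drop the
	 * latter so the switch below treats it purely as a continuation.
	 */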
3527	if (flags & XLOG_WAS_CONT_TRANS)
3528		flags &= ~XLOG_CONTINUE_TRANS;
3529
3530	/*
3531	 * Callees must not free the trans structure. We'll decide if we need to
	 * free it or not based on the operation being done and its result.
3533	 */
3534	switch (flags) {
3535	/* expected flag values */
3536	case 0:
3537	case XLOG_CONTINUE_TRANS:
3538		error = xlog_recover_add_to_trans(log, trans, dp, len);
3539		break;
3540	case XLOG_WAS_CONT_TRANS:
3541		error = xlog_recover_add_to_cont_trans(log, trans, dp, len);
3542		break;
3543	case XLOG_COMMIT_TRANS:
3544		error = xlog_recover_commit_trans(log, trans, pass);
3545		/* success or fail, we are now done with this transaction. */
3546		freeit = true;
3547		break;
3548
3549	/* unexpected flag values */
3550	case XLOG_UNMOUNT_TRANS:
3551		/* just skip trans */
3552		xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
3553		freeit = true;
3554		break;
3555	case XLOG_START_TRANS:
3556	default:
3557		xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags);
3558		ASSERT(0);
3559		error = -EIO;
3560		break;
3561	}
3562	if (error || freeit)
3563		xlog_recover_free_trans(trans);
3564	return error;
3565}
3566
3567/*
3568 * Lookup the transaction recovery structure associated with the ID in the
3569 * current ophdr. If the transaction doesn't exist and the start flag is set in
3570 * the ophdr, then allocate a new transaction for future ID matches to find.
3571 * Either way, return what we found during the lookup - an existing transaction
3572 * or nothing.
3573 */
3574STATIC struct xlog_recover *
3575xlog_recover_ophdr_to_trans(
3576	struct hlist_head	rhash[],
3577	struct xlog_rec_header	*rhead,
3578	struct xlog_op_header	*ohead)
3579{
3580	struct xlog_recover	*trans;
3581	xlog_tid_t		tid;
3582	struct hlist_head	*rhp;
3583
3584	tid = be32_to_cpu(ohead->oh_tid);
3585	rhp = &rhash[XLOG_RHASH(tid)];
3586	hlist_for_each_entry(trans, rhp, r_list) {
3587		if (trans->r_log_tid == tid)
3588			return trans;
3589	}
3590
3591	/*
3592	 * skip over non-start transaction headers - we could be
3593	 * processing slack space before the next transaction starts
3594	 */
3595	if (!(ohead->oh_flags & XLOG_START_TRANS))
3596		return NULL;
3597
3598	ASSERT(be32_to_cpu(ohead->oh_len) == 0);
3599
3600	/*
3601	 * This is a new transaction so allocate a new recovery container to
3602	 * hold the recovery ops that will follow.
3603	 */
3604	trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP);
3605	trans->r_log_tid = tid;
3606	trans->r_lsn = be64_to_cpu(rhead->h_lsn);
3607	INIT_LIST_HEAD(&trans->r_itemq);
3608	INIT_HLIST_NODE(&trans->r_list);
3609	hlist_add_head(&trans->r_list, rhp);
3610
3611	/*
3612	 * Nothing more to do for this ophdr. Items to be added to this new
3613	 * transaction will be in subsequent ophdr containers.
3614	 */
3615	return NULL;
3616}
3617
3618STATIC int
3619xlog_recover_process_ophdr(
3620	struct xlog		*log,
3621	struct hlist_head	rhash[],
3622	struct xlog_rec_header	*rhead,
3623	struct xlog_op_header	*ohead,
3624	xfs_caddr_t		dp,
3625	xfs_caddr_t		end,
3626	int			pass)
3627{
3628	struct xlog_recover	*trans;
3629	unsigned int		len;
3630
3631	/* Do we understand who wrote this op? */
3632	if (ohead->oh_clientid != XFS_TRANSACTION &&
3633	    ohead->oh_clientid != XFS_LOG) {
3634		xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
3635			__func__, ohead->oh_clientid);
3636		ASSERT(0);
3637		return -EIO;
3638	}
3639
	/*
	 * Check that the ophdr contains all the data it is supposed to contain.
	 */
3643	len = be32_to_cpu(ohead->oh_len);
3644	if (dp + len > end) {
3645		xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len);
3646		WARN_ON(1);
3647		return -EIO;
3648	}
3649
3650	trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead);
3651	if (!trans) {
3652		/* nothing to do, so skip over this ophdr */
3653		return 0;
3654	}
3655
3656	return xlog_recovery_process_trans(log, trans, dp, len,
3657					   ohead->oh_flags, pass);
3658}
3659
3660/*
3661 * There are two valid states of the r_state field.  0 indicates that the
3662 * transaction structure is in a normal state.  We have either seen the
3663 * start of the transaction or the last operation we added was not a partial
3664 * operation.  If the last operation we added to the transaction was a
3665 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
3666 *
3667 * NOTE: skip LRs with 0 data length.
3668 */
3669STATIC int
3670xlog_recover_process_data(
3671	struct xlog		*log,
3672	struct hlist_head	rhash[],
3673	struct xlog_rec_header	*rhead,
3674	xfs_caddr_t		dp,
3675	int			pass)
3676{
3677	struct xlog_op_header	*ohead;
3678	xfs_caddr_t		end;
3679	int			num_logops;
3680	int			error;
3681
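	/*
	 * h_len is the number of payload bytes following the record header;
	 * h_num_logops is the number of op headers packed into that payload.
	 */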
3682	end = dp + be32_to_cpu(rhead->h_len);
3683	num_logops = be32_to_cpu(rhead->h_num_logops);
3684
3685	/* check the log format matches our own - else we can't recover */
3686	if (xlog_header_check_recover(log->l_mp, rhead))
3687		return -EIO;
3688
3689	while ((dp < end) && num_logops) {
3690
3691		ohead = (struct xlog_op_header *)dp;
3692		dp += sizeof(*ohead);
3693		ASSERT(dp <= end);
3694
3695		/* errors will abort recovery */
3696		error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
3697						    dp, end, pass);
3698		if (error)
3699			return error;
3700
3701		dp += be32_to_cpu(ohead->oh_len);
3702		num_logops--;
3703	}
3704	return 0;
3705}
3706
3707/*
3708 * Process an extent free intent item that was recovered from
3709 * the log.  We need to free the extents that it describes.
3710 */
3711STATIC int
3712xlog_recover_process_efi(
3713	xfs_mount_t		*mp,
3714	xfs_efi_log_item_t	*efip)
3715{
3716	xfs_efd_log_item_t	*efdp;
3717	xfs_trans_t		*tp;
3718	int			i;
3719	int			error = 0;
3720	xfs_extent_t		*extp;
3721	xfs_fsblock_t		startblock_fsb;
3722
3723	ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
3724
3725	/*
3726	 * First check the validity of the extents described by the
3727	 * EFI.  If any are bad, then assume that all are bad and
3728	 * just toss the EFI.
3729	 */
3730	for (i = 0; i < efip->efi_format.efi_nextents; i++) {
3731		extp = &(efip->efi_format.efi_extents[i]);
3732		startblock_fsb = XFS_BB_TO_FSB(mp,
3733				   XFS_FSB_TO_DADDR(mp, extp->ext_start));
3734		if ((startblock_fsb == 0) ||
3735		    (extp->ext_len == 0) ||
3736		    (startblock_fsb >= mp->m_sb.sb_dblocks) ||
3737		    (extp->ext_len >= mp->m_sb.sb_agblocks)) {
3738			/*
3739			 * This will pull the EFI from the AIL and
3740			 * free the memory associated with it.
3741			 */
3742			set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
3743			xfs_efi_release(efip, efip->efi_format.efi_nextents);
3744			return -EIO;
3745		}
3746	}
3747
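	/*
	 * Free the extents in a new transaction that also logs an EFD.  If we
	 * crash before it commits, none of the frees reach disk and the EFI
	 * will be processed again on the next recovery attempt.
	 */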
3748	tp = xfs_trans_alloc(mp, 0);
3749	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
3750	if (error)
3751		goto abort_error;
3752	efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
3753
3754	for (i = 0; i < efip->efi_format.efi_nextents; i++) {
3755		extp = &(efip->efi_format.efi_extents[i]);
3756		error = xfs_free_extent(tp, extp->ext_start, extp->ext_len);
3757		if (error)
3758			goto abort_error;
3759		xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
3760					 extp->ext_len);
3761	}
3762
3763	set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
3764	error = xfs_trans_commit(tp, 0);
3765	return error;
3766
3767abort_error:
3768	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3769	return error;
3770}
3771
3772/*
3773 * When this is called, all of the EFIs which did not have
3774 * corresponding EFDs should be in the AIL.  What we do now
3775 * is free the extents associated with each one.
3776 *
3777 * Since we process the EFIs in normal transactions, they
3778 * will be removed at some point after the commit.  This prevents
3779 * us from just walking down the list processing each one.
3780 * We'll use a flag in the EFI to skip those that we've already
3781 * processed and use the AIL iteration mechanism's generation
3782 * count to try to speed this up at least a bit.
3783 *
3784 * When we start, we know that the EFIs are the only things in
3785 * the AIL.  As we process them, however, other items are added
3786 * to the AIL.  Since everything added to the AIL must come after
3787 * everything already in the AIL, we stop processing as soon as
3788 * we see something other than an EFI in the AIL.
3789 */
3790STATIC int
3791xlog_recover_process_efis(
3792	struct xlog	*log)
3793{
3794	xfs_log_item_t		*lip;
3795	xfs_efi_log_item_t	*efip;
3796	int			error = 0;
3797	struct xfs_ail_cursor	cur;
3798	struct xfs_ail		*ailp;
3799
3800	ailp = log->l_ailp;
3801	spin_lock(&ailp->xa_lock);
3802	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3803	while (lip != NULL) {
3804		/*
3805		 * We're done when we see something other than an EFI.
3806		 * There should be no EFIs left in the AIL now.
3807		 */
3808		if (lip->li_type != XFS_LI_EFI) {
3809#ifdef DEBUG
3810			for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
3811				ASSERT(lip->li_type != XFS_LI_EFI);
3812#endif
3813			break;
3814		}
3815
3816		/*
3817		 * Skip EFIs that we've already processed.
3818		 */
3819		efip = (xfs_efi_log_item_t *)lip;
3820		if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
3821			lip = xfs_trans_ail_cursor_next(ailp, &cur);
3822			continue;
3823		}
3824
3825		spin_unlock(&ailp->xa_lock);
3826		error = xlog_recover_process_efi(log->l_mp, efip);
3827		spin_lock(&ailp->xa_lock);
3828		if (error)
3829			goto out;
3830		lip = xfs_trans_ail_cursor_next(ailp, &cur);
3831	}
3832out:
3833	xfs_trans_ail_cursor_done(&cur);
3834	spin_unlock(&ailp->xa_lock);
3835	return error;
3836}
3837
3838/*
3839 * This routine performs a transaction to null out a bad inode pointer
3840 * in an agi unlinked inode hash bucket.
3841 */
3842STATIC void
3843xlog_recover_clear_agi_bucket(
3844	xfs_mount_t	*mp,
3845	xfs_agnumber_t	agno,
3846	int		bucket)
3847{
3848	xfs_trans_t	*tp;
3849	xfs_agi_t	*agi;
3850	xfs_buf_t	*agibp;
3851	int		offset;
3852	int		error;
3853
3854	tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
3855	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0);
3856	if (error)
3857		goto out_abort;
3858
3859	error = xfs_read_agi(mp, tp, agno, &agibp);
3860	if (error)
3861		goto out_abort;
3862
3863	agi = XFS_BUF_TO_AGI(agibp);
3864	agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
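	/*
	 * Log just the range of the bucket entry we modified so only those
	 * bytes of the AGI are recorded in the transaction.
	 */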
3865	offset = offsetof(xfs_agi_t, agi_unlinked) +
3866		 (sizeof(xfs_agino_t) * bucket);
3867	xfs_trans_log_buf(tp, agibp, offset,
3868			  (offset + sizeof(xfs_agino_t) - 1));
3869
3870	error = xfs_trans_commit(tp, 0);
3871	if (error)
3872		goto out_error;
3873	return;
3874
3875out_abort:
3876	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3877out_error:
3878	xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
3879	return;
3880}
3881
3882STATIC xfs_agino_t
3883xlog_recover_process_one_iunlink(
3884	struct xfs_mount		*mp,
3885	xfs_agnumber_t			agno,
3886	xfs_agino_t			agino,
3887	int				bucket)
3888{
3889	struct xfs_buf			*ibp;
3890	struct xfs_dinode		*dip;
3891	struct xfs_inode		*ip;
3892	xfs_ino_t			ino;
3893	int				error;
3894
3895	ino = XFS_AGINO_TO_INO(mp, agno, agino);
3896	error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
3897	if (error)
3898		goto fail;
3899
3900	/*
	 * Get the on-disk inode to find the next inode in the bucket.
3902	 */
3903	error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0);
3904	if (error)
3905		goto fail_iput;
3906
3907	ASSERT(ip->i_d.di_nlink == 0);
3908	ASSERT(ip->i_d.di_mode != 0);
3909
3910	/* setup for the next pass */
3911	agino = be32_to_cpu(dip->di_next_unlinked);
3912	xfs_buf_relse(ibp);
3913
3914	/*
3915	 * Prevent any DMAPI event from being sent when the reference on
3916	 * the inode is dropped.
3917	 */
3918	ip->i_d.di_dmevmask = 0;
3919
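	/*
	 * Dropping our inode reference here sends the unlinked inode through
	 * the normal inactivation path, which is what actually truncates and
	 * frees it and removes it from the AGI unlinked list.
	 */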
3920	IRELE(ip);
3921	return agino;
3922
3923 fail_iput:
3924	IRELE(ip);
3925 fail:
3926	/*
3927	 * We can't read in the inode this bucket points to, or this inode
3928	 * is messed up.  Just ditch this bucket of inodes.  We will lose
3929	 * some inodes and space, but at least we won't hang.
3930	 *
3931	 * Call xlog_recover_clear_agi_bucket() to perform a transaction to
3932	 * clear the inode pointer in the bucket.
3933	 */
3934	xlog_recover_clear_agi_bucket(mp, agno, bucket);
3935	return NULLAGINO;
3936}
3937
3938/*
 * xlog_recover_process_iunlinks
3940 *
3941 * This is called during recovery to process any inodes which
 * we unlinked but did not free when the system crashed.  These
3943 * inodes will be on the lists in the AGI blocks.  What we do
3944 * here is scan all the AGIs and fully truncate and free any
3945 * inodes found on the lists.  Each inode is removed from the
3946 * lists when it has been fully truncated and is freed.  The
3947 * freeing of the inode and its removal from the list must be
3948 * atomic.
3949 */
3950STATIC void
3951xlog_recover_process_iunlinks(
3952	struct xlog	*log)
3953{
3954	xfs_mount_t	*mp;
3955	xfs_agnumber_t	agno;
3956	xfs_agi_t	*agi;
3957	xfs_buf_t	*agibp;
3958	xfs_agino_t	agino;
3959	int		bucket;
3960	int		error;
3961	uint		mp_dmevmask;
3962
3963	mp = log->l_mp;
3964
3965	/*
3966	 * Prevent any DMAPI event from being sent while in this function.
3967	 */
3968	mp_dmevmask = mp->m_dmevmask;
3969	mp->m_dmevmask = 0;
3970
3971	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3972		/*
3973		 * Find the agi for this ag.
3974		 */
3975		error = xfs_read_agi(mp, NULL, agno, &agibp);
3976		if (error) {
3977			/*
3978			 * AGI is b0rked. Don't process it.
3979			 *
3980			 * We should probably mark the filesystem as corrupt
3981			 * after we've recovered all the ag's we can....
3982			 */
3983			continue;
3984		}
3985		/*
3986		 * Unlock the buffer so that it can be acquired in the normal
3987		 * course of the transaction to truncate and free each inode.
3988		 * Because we are not racing with anyone else here for the AGI
3989		 * buffer, we don't even need to hold it locked to read the
		 * initial unlinked bucket entries out of the buffer. We keep a
		 * buffer reference, though, so that it stays pinned in memory
		 * while we need it.
3993		 */
3994		agi = XFS_BUF_TO_AGI(agibp);
3995		xfs_buf_unlock(agibp);
3996
3997		for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
3998			agino = be32_to_cpu(agi->agi_unlinked[bucket]);
3999			while (agino != NULLAGINO) {
4000				agino = xlog_recover_process_one_iunlink(mp,
4001							agno, agino, bucket);
4002			}
4003		}
4004		xfs_buf_rele(agibp);
4005	}
4006
4007	mp->m_dmevmask = mp_dmevmask;
4008}
4009
4010/*
 * Unpack the log buffer data and CRC check it. If the check fails, issue a
 * warning if and only if the CRC in the header is non-zero. This makes the
 * check advisory, and the zero-CRC test prevents failure warnings from being
 * emitted when upgrading from a kernel that does not add CRCs by default.
 *
 * When the filesystem is CRC enabled, a CRC mismatch becomes a fatal log
 * corruption failure.
4019 */
4020STATIC int
4021xlog_unpack_data_crc(
4022	struct xlog_rec_header	*rhead,
4023	xfs_caddr_t		dp,
4024	struct xlog		*log)
4025{
4026	__le32			crc;
4027
4028	crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
4029	if (crc != rhead->h_crc) {
4030		if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
4031			xfs_alert(log->l_mp,
4032		"log record CRC mismatch: found 0x%x, expected 0x%x.",
4033					le32_to_cpu(rhead->h_crc),
4034					le32_to_cpu(crc));
4035			xfs_hex_dump(dp, 32);
4036		}
4037
4038		/*
4039		 * If we've detected a log record corruption, then we can't
4040		 * recover past this point. Abort recovery if we are enforcing
4041		 * CRC protection by punting an error back up the stack.
4042		 */
4043		if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
4044			return -EFSCORRUPTED;
4045	}
4046
4047	return 0;
4048}
4049
4050STATIC int
4051xlog_unpack_data(
4052	struct xlog_rec_header	*rhead,
4053	xfs_caddr_t		dp,
4054	struct xlog		*log)
4055{
4056	int			i, j, k;
4057	int			error;
4058
4059	error = xlog_unpack_data_crc(rhead, dp, log);
4060	if (error)
4061		return error;
4062
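	/*
	 * When the record was written, the first four bytes of each basic
	 * block in the payload were replaced with the record's cycle number
	 * and the originals saved in h_cycle_data.  Restore them here.
	 */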
4063	for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
4064		  i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
4065		*(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
4066		dp += BBSIZE;
4067	}
4068
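	/*
	 * Version 2 log records can be larger than XLOG_HEADER_CYCLE_SIZE;
	 * the cycle data for blocks beyond the first header's coverage is
	 * stored in the extended headers that follow the record header.
	 */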
4069	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
4070		xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
4071		for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
4072			j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
4073			k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
4074			*(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
4075			dp += BBSIZE;
4076		}
4077	}
4078
4079	return 0;
4080}
4081
4082STATIC int
4083xlog_valid_rec_header(
4084	struct xlog		*log,
4085	struct xlog_rec_header	*rhead,
4086	xfs_daddr_t		blkno)
4087{
4088	int			hlen;
4089
4090	if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
4091		XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
4092				XFS_ERRLEVEL_LOW, log->l_mp);
4093		return -EFSCORRUPTED;
4094	}
4095	if (unlikely(
4096	    (!rhead->h_version ||
4097	    (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
4098		xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
4099			__func__, be32_to_cpu(rhead->h_version));
4100		return -EIO;
4101	}
4102
4103	/* LR body must have data or it wouldn't have been written */
4104	hlen = be32_to_cpu(rhead->h_len);
	if (unlikely(hlen <= 0 || hlen > INT_MAX)) {
4106		XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
4107				XFS_ERRLEVEL_LOW, log->l_mp);
4108		return -EFSCORRUPTED;
4109	}
	if (unlikely(blkno > log->l_logBBsize || blkno > INT_MAX)) {
4111		XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
4112				XFS_ERRLEVEL_LOW, log->l_mp);
4113		return -EFSCORRUPTED;
4114	}
4115	return 0;
4116}
4117
4118/*
4119 * Read the log from tail to head and process the log records found.
4120 * Handle the two cases where the tail and head are in the same cycle
4121 * and where the active portion of the log wraps around the end of
4122 * the physical log separately.  The pass parameter is passed through
4123 * to the routines called to process the data and is not looked at
4124 * here.
4125 */
4126STATIC int
4127xlog_do_recovery_pass(
4128	struct xlog		*log,
4129	xfs_daddr_t		head_blk,
4130	xfs_daddr_t		tail_blk,
4131	int			pass)
4132{
4133	xlog_rec_header_t	*rhead;
4134	xfs_daddr_t		blk_no;
4135	xfs_caddr_t		offset;
4136	xfs_buf_t		*hbp, *dbp;
4137	int			error = 0, h_size;
4138	int			bblks, split_bblks;
4139	int			hblks, split_hblks, wrapped_hblks;
4140	struct hlist_head	rhash[XLOG_RHASH_SIZE];
4141
4142	ASSERT(head_blk != tail_blk);
4143
4144	/*
4145	 * Read the header of the tail block and get the iclog buffer size from
4146	 * h_size.  Use this to tell how many sectors make up the log header.
4147	 */
4148	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
4149		/*
4150		 * When using variable length iclogs, read first sector of
4151		 * iclog header and extract the header size from it.  Get a
4152		 * new hbp that is the correct size.
4153		 */
4154		hbp = xlog_get_bp(log, 1);
4155		if (!hbp)
4156			return -ENOMEM;
4157
4158		error = xlog_bread(log, tail_blk, 1, hbp, &offset);
4159		if (error)
4160			goto bread_err1;
4161
4162		rhead = (xlog_rec_header_t *)offset;
4163		error = xlog_valid_rec_header(log, rhead, tail_blk);
4164		if (error)
4165			goto bread_err1;
4166		h_size = be32_to_cpu(rhead->h_size);
4167		if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
4168		    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
4169			hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
4170			if (h_size % XLOG_HEADER_CYCLE_SIZE)
4171				hblks++;
4172			xlog_put_bp(hbp);
4173			hbp = xlog_get_bp(log, hblks);
4174		} else {
4175			hblks = 1;
4176		}
4177	} else {
4178		ASSERT(log->l_sectBBsize == 1);
4179		hblks = 1;
4180		hbp = xlog_get_bp(log, 1);
4181		h_size = XLOG_BIG_RECORD_BSIZE;
4182	}
4183
4184	if (!hbp)
4185		return -ENOMEM;
4186	dbp = xlog_get_bp(log, BTOBB(h_size));
4187	if (!dbp) {
4188		xlog_put_bp(hbp);
4189		return -ENOMEM;
4190	}
4191
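	/*
	 * rhash tracks the transactions currently being reassembled, hashed
	 * by transaction ID (see xlog_recover_ophdr_to_trans).
	 */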
4192	memset(rhash, 0, sizeof(rhash));
4193	blk_no = tail_blk;
4194	if (tail_blk > head_blk) {
4195		/*
4196		 * Perform recovery around the end of the physical log.
4197		 * When the head is not on the same cycle number as the tail,
4198		 * we can't do a sequential recovery.
4199		 */
4200		while (blk_no < log->l_logBBsize) {
4201			/*
4202			 * Check for header wrapping around physical end-of-log
4203			 */
4204			offset = hbp->b_addr;
4205			split_hblks = 0;
4206			wrapped_hblks = 0;
4207			if (blk_no + hblks <= log->l_logBBsize) {
4208				/* Read header in one read */
4209				error = xlog_bread(log, blk_no, hblks, hbp,
4210						   &offset);
4211				if (error)
4212					goto bread_err2;
4213			} else {
4214				/* This LR is split across physical log end */
4215				if (blk_no != log->l_logBBsize) {
4216					/* some data before physical log end */
4217					ASSERT(blk_no <= INT_MAX);
4218					split_hblks = log->l_logBBsize - (int)blk_no;
4219					ASSERT(split_hblks > 0);
4220					error = xlog_bread(log, blk_no,
4221							   split_hblks, hbp,
4222							   &offset);
4223					if (error)
4224						goto bread_err2;
4225				}
4226
4227				/*
4228				 * Note: this black magic still works with
4229				 * large sector sizes (non-512) only because:
4230				 * - we increased the buffer size originally
4231				 *   by 1 sector giving us enough extra space
4232				 *   for the second read;
4233				 * - the log start is guaranteed to be sector
4234				 *   aligned;
4235				 * - we read the log end (LR header start)
4236				 *   _first_, then the log start (LR header end)
4237				 *   - order is important.
4238				 */
4239				wrapped_hblks = hblks - split_hblks;
4240				error = xlog_bread_offset(log, 0,
4241						wrapped_hblks, hbp,
4242						offset + BBTOB(split_hblks));
4243				if (error)
4244					goto bread_err2;
4245			}
4246			rhead = (xlog_rec_header_t *)offset;
4247			error = xlog_valid_rec_header(log, rhead,
4248						split_hblks ? blk_no : 0);
4249			if (error)
4250				goto bread_err2;
4251
4252			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
4253			blk_no += hblks;
4254
4255			/* Read in data for log record */
4256			if (blk_no + bblks <= log->l_logBBsize) {
4257				error = xlog_bread(log, blk_no, bblks, dbp,
4258						   &offset);
4259				if (error)
4260					goto bread_err2;
4261			} else {
				/*
				 * This log record is split across the
				 * physical end of log.
				 */
4264				offset = dbp->b_addr;
4265				split_bblks = 0;
4266				if (blk_no != log->l_logBBsize) {
					/*
					 * Some data is before the physical
					 * end of log.
					 */
4269					ASSERT(!wrapped_hblks);
4270					ASSERT(blk_no <= INT_MAX);
4271					split_bblks =
4272						log->l_logBBsize - (int)blk_no;
4273					ASSERT(split_bblks > 0);
4274					error = xlog_bread(log, blk_no,
4275							split_bblks, dbp,
4276							&offset);
4277					if (error)
4278						goto bread_err2;
4279				}
4280
4281				/*
4282				 * Note: this black magic still works with
4283				 * large sector sizes (non-512) only because:
4284				 * - we increased the buffer size originally
4285				 *   by 1 sector giving us enough extra space
4286				 *   for the second read;
4287				 * - the log start is guaranteed to be sector
4288				 *   aligned;
4289				 * - we read the log end (LR header start)
4290				 *   _first_, then the log start (LR header end)
4291				 *   - order is important.
4292				 */
4293				error = xlog_bread_offset(log, 0,
4294						bblks - split_bblks, dbp,
4295						offset + BBTOB(split_bblks));
4296				if (error)
4297					goto bread_err2;
4298			}
4299
4300			error = xlog_unpack_data(rhead, offset, log);
4301			if (error)
4302				goto bread_err2;
4303
4304			error = xlog_recover_process_data(log, rhash,
4305							rhead, offset, pass);
4306			if (error)
4307				goto bread_err2;
4308			blk_no += bblks;
4309		}
4310
4311		ASSERT(blk_no >= log->l_logBBsize);
4312		blk_no -= log->l_logBBsize;
4313	}
4314
4315	/* read first part of physical log */
4316	while (blk_no < head_blk) {
4317		error = xlog_bread(log, blk_no, hblks, hbp, &offset);
4318		if (error)
4319			goto bread_err2;
4320
4321		rhead = (xlog_rec_header_t *)offset;
4322		error = xlog_valid_rec_header(log, rhead, blk_no);
4323		if (error)
4324			goto bread_err2;
4325
4326		/* blocks in data section */
4327		bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
4328		error = xlog_bread(log, blk_no+hblks, bblks, dbp,
4329				   &offset);
4330		if (error)
4331			goto bread_err2;
4332
4333		error = xlog_unpack_data(rhead, offset, log);
4334		if (error)
4335			goto bread_err2;
4336
4337		error = xlog_recover_process_data(log, rhash,
4338						rhead, offset, pass);
4339		if (error)
4340			goto bread_err2;
4341		blk_no += bblks + hblks;
4342	}
4343
4344 bread_err2:
4345	xlog_put_bp(dbp);
4346 bread_err1:
4347	xlog_put_bp(hbp);
4348	return error;
4349}
4350
4351/*
4352 * Do the recovery of the log.  We actually do this in two phases.
4353 * The two passes are necessary in order to implement the function
4354 * of cancelling a record written into the log.  The first pass
4355 * determines those things which have been cancelled, and the
4356 * second pass replays log items normally except for those which
4357 * have been cancelled.  The handling of the replay and cancellations
4358 * takes place in the log item type specific routines.
4359 *
4360 * The table of items which have cancel records in the log is allocated
4361 * and freed at this level, since only here do we know when all of
4362 * the log recovery has been completed.
4363 */
4364STATIC int
4365xlog_do_log_recovery(
4366	struct xlog	*log,
4367	xfs_daddr_t	head_blk,
4368	xfs_daddr_t	tail_blk)
4369{
4370	int		error, i;
4371
4372	ASSERT(head_blk != tail_blk);
4373
4374	/*
4375	 * First do a pass to find all of the cancelled buf log items.
4376	 * Store them in the buf_cancel_table for use in the second pass.
4377	 */
4378	log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
4379						 sizeof(struct list_head),
4380						 KM_SLEEP);
4381	for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
4382		INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
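	/*
	 * Each bucket now heads an empty chain of xfs_buf_cancel records,
	 * hashed by block number; pass 2 consults these chains to skip
	 * replay of buffers whose log items were cancelled.
	 */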
4383
4384	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
4385				      XLOG_RECOVER_PASS1);
4386	if (error != 0) {
4387		kmem_free(log->l_buf_cancel_table);
4388		log->l_buf_cancel_table = NULL;
4389		return error;
4390	}
4391	/*
4392	 * Then do a second pass to actually recover the items in the log.
4393	 * When it is complete free the table of buf cancel items.
4394	 */
4395	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
4396				      XLOG_RECOVER_PASS2);
4397#ifdef DEBUG
4398	if (!error) {
4399		int	i;
4400
4401		for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
4402			ASSERT(list_empty(&log->l_buf_cancel_table[i]));
4403	}
4404#endif	/* DEBUG */
4405
4406	kmem_free(log->l_buf_cancel_table);
4407	log->l_buf_cancel_table = NULL;
4408
4409	return error;
4410}
4411
4412/*
4413 * Do the actual recovery
4414 */
4415STATIC int
4416xlog_do_recover(
4417	struct xlog	*log,
4418	xfs_daddr_t	head_blk,
4419	xfs_daddr_t	tail_blk)
4420{
4421	int		error;
4422	xfs_buf_t	*bp;
4423	xfs_sb_t	*sbp;
4424
4425	/*
4426	 * First replay the images in the log.
4427	 */
4428	error = xlog_do_log_recovery(log, head_blk, tail_blk);
4429	if (error)
4430		return error;
4431
4432	/*
4433	 * If IO errors happened during recovery, bail out.
4434	 */
4435	if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
4436		return -EIO;
4437	}
4438
4439	/*
4440	 * We now update the tail_lsn since much of the recovery has completed
	 * and there may be space available to use.  If there were no extent
	 * frees or iunlinks, we can free up the entire log and set the tail_lsn
	 * to be the last_sync_lsn.  This was set in xlog_find_tail to be the
4444	 * lsn of the last known good LR on disk.  If there are extent frees
4445	 * or iunlinks they will have some entries in the AIL; so we look at
4446	 * the AIL to determine how to set the tail_lsn.
4447	 */
4448	xlog_assign_tail_lsn(log->l_mp);
4449
4450	/*
4451	 * Now that we've finished replaying all buffer and inode
4452	 * updates, re-read in the superblock and reverify it.
4453	 */
4454	bp = xfs_getsb(log->l_mp, 0);
4455	XFS_BUF_UNDONE(bp);
4456	ASSERT(!(XFS_BUF_ISWRITE(bp)));
4457	XFS_BUF_READ(bp);
4458	XFS_BUF_UNASYNC(bp);
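	/*
	 * Attach the superblock verifier so the reread is validated (and
	 * CRC checked on v5 filesystems) before we trust its contents.
	 */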
4459	bp->b_ops = &xfs_sb_buf_ops;
4460
4461	error = xfs_buf_submit_wait(bp);
4462	if (error) {
4463		if (!XFS_FORCED_SHUTDOWN(log->l_mp)) {
4464			xfs_buf_ioerror_alert(bp, __func__);
4465			ASSERT(0);
4466		}
4467		xfs_buf_relse(bp);
4468		return error;
4469	}
4470
4471	/* Convert superblock from on-disk format */
4472	sbp = &log->l_mp->m_sb;
4473	xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
4474	ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
4475	ASSERT(xfs_sb_good_version(sbp));
4476	xfs_reinit_percpu_counters(log->l_mp);
4477
4478	xfs_buf_relse(bp);
4479
4481	xlog_recover_check_summary(log);
4482
4483	/* Normal transactions can now occur */
4484	log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
4485	return 0;
4486}
4487
4488/*
4489 * Perform recovery and re-initialize some log variables in xlog_find_tail.
4490 *
4491 * Return error or zero.
4492 */
4493int
4494xlog_recover(
4495	struct xlog	*log)
4496{
4497	xfs_daddr_t	head_blk, tail_blk;
4498	int		error;
4499
4500	/* find the tail of the log */
4501	if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
4502		return error;
4503
4504	if (tail_blk != head_blk) {
		/*
		 * There used to be a comment here:
		 *
		 *	disallow recovery on read-only mounts.  note -- mount
		 *	checks for ENOSPC and turns it into an intelligent
		 *	error message.
		 *
		 * ...but this is no longer true.  Now, unless you specify
		 * NORECOVERY (in which case this function would never be
		 * called), we just go ahead and recover.  We do this all
		 * under the vfs layer, so we can get away with it unless
		 * the device itself is read-only, in which case we fail.
		 */
4516		if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
4517			return error;
4518		}
4519
4520		/*
4521		 * Version 5 superblock log feature mask validation. We know the
4522		 * log is dirty so check if there are any unknown log features
4523		 * in what we need to recover. If there are unknown features
		 * (e.g. unsupported transactions), then simply reject the
4525		 * attempt at recovery before touching anything.
4526		 */
4527		if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 &&
4528		    xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
4529					XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
4530			xfs_warn(log->l_mp,
4531"Superblock has unknown incompatible log features (0x%x) enabled.\n"
4532"The log can not be fully and/or safely recovered by this kernel.\n"
4533"Please recover the log on a kernel that supports the unknown features.",
4534				(log->l_mp->m_sb.sb_features_log_incompat &
4535					XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
4536			return -EINVAL;
4537		}
4538
4539		/*
4540		 * Delay log recovery if the debug hook is set. This is debug
		 * instrumentation to coordinate simulation of I/O failures with
4542		 * log recovery.
4543		 */
4544		if (xfs_globals.log_recovery_delay) {
4545			xfs_notice(log->l_mp,
4546				"Delaying log recovery for %d seconds.",
4547				xfs_globals.log_recovery_delay);
4548			msleep(xfs_globals.log_recovery_delay * 1000);
4549		}
4550
4551		xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
4552				log->l_mp->m_logname ? log->l_mp->m_logname
4553						     : "internal");
4554
4555		error = xlog_do_recover(log, head_blk, tail_blk);
4556		log->l_flags |= XLOG_RECOVERY_NEEDED;
4557	}
4558	return error;
4559}
4560
4561/*
4562 * In the first part of recovery we replay inodes and buffers and build
4563 * up the list of extent free items which need to be processed.  Here
4564 * we process the extent free items and clean up the on disk unlinked
4565 * inode lists.  This is separated from the first part of recovery so
4566 * that the root and real-time bitmap inodes can be read in from disk in
4567 * between the two stages.  This is necessary so that we can free space
4568 * in the real-time portion of the file system.
4569 */
4570int
4571xlog_recover_finish(
4572	struct xlog	*log)
4573{
4574	/*
4575	 * Now we're ready to do the transactions needed for the
4576	 * rest of recovery.  Start with completing all the extent
4577	 * free intent records and then process the unlinked inode
4578	 * lists.  At this point, we essentially run in normal mode
4579	 * except that we're still performing recovery actions
4580	 * rather than accepting new requests.
4581	 */
4582	if (log->l_flags & XLOG_RECOVERY_NEEDED) {
4583		int	error;
4584		error = xlog_recover_process_efis(log);
4585		if (error) {
4586			xfs_alert(log->l_mp, "Failed to recover EFIs");
4587			return error;
4588		}
4589		/*
4590		 * Sync the log to get all the EFIs out of the AIL.
4591		 * This isn't absolutely necessary, but it helps in
4592		 * case the unlink transactions would have problems
4593		 * pushing the EFIs out of the way.
4594		 */
4595		xfs_log_force(log->l_mp, XFS_LOG_SYNC);
4596
4597		xlog_recover_process_iunlinks(log);
4598
4599		xlog_recover_check_summary(log);
4600
4601		xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
4602				log->l_mp->m_logname ? log->l_mp->m_logname
4603						     : "internal");
4604		log->l_flags &= ~XLOG_RECOVERY_NEEDED;
4605	} else {
4606		xfs_info(log->l_mp, "Ending clean mount");
4607	}
4608	return 0;
4609}
4610
4611
4612#if defined(DEBUG)
4613/*
4614 * Read all of the agf and agi counters and check that they
4615 * are consistent with the superblock counters.
4616 */
4617void
4618xlog_recover_check_summary(
4619	struct xlog	*log)
4620{
4621	xfs_mount_t	*mp;
4622	xfs_agf_t	*agfp;
4623	xfs_buf_t	*agfbp;
4624	xfs_buf_t	*agibp;
4625	xfs_agnumber_t	agno;
4626	__uint64_t	freeblks;
4627	__uint64_t	itotal;
4628	__uint64_t	ifree;
4629	int		error;
4630
4631	mp = log->l_mp;
4632
4633	freeblks = 0LL;
4634	itotal = 0LL;
4635	ifree = 0LL;
4636	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
4637		error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
4638		if (error) {
4639			xfs_alert(mp, "%s agf read failed agno %d error %d",
4640						__func__, agno, error);
4641		} else {
4642			agfp = XFS_BUF_TO_AGF(agfbp);
4643			freeblks += be32_to_cpu(agfp->agf_freeblks) +
4644				    be32_to_cpu(agfp->agf_flcount);
4645			xfs_buf_relse(agfbp);
4646		}
4647
4648		error = xfs_read_agi(mp, NULL, agno, &agibp);
4649		if (error) {
4650			xfs_alert(mp, "%s agi read failed agno %d error %d",
4651						__func__, agno, error);
4652		} else {
4653			struct xfs_agi	*agi = XFS_BUF_TO_AGI(agibp);
4654
4655			itotal += be32_to_cpu(agi->agi_count);
4656			ifree += be32_to_cpu(agi->agi_freecount);
4657			xfs_buf_relse(agibp);
4658		}
4659	}
4660}
4661#endif /* DEBUG */
4662