root/fs/xfs/scrub/fscounters.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. xchk_fscount_warmup
  2. xchk_setup_fscounters
  3. xchk_fscount_aggregate_agcounts
  4. xchk_fscount_within_range
  5. xchk_fscounters

   1 // SPDX-License-Identifier: GPL-2.0+
   2 /*
   3  * Copyright (C) 2019 Oracle.  All Rights Reserved.
   4  * Author: Darrick J. Wong <darrick.wong@oracle.com>
   5  */
   6 #include "xfs.h"
   7 #include "xfs_fs.h"
   8 #include "xfs_shared.h"
   9 #include "xfs_format.h"
  10 #include "xfs_trans_resv.h"
  11 #include "xfs_mount.h"
  12 #include "xfs_sb.h"
  13 #include "xfs_alloc.h"
  14 #include "xfs_ialloc.h"
  15 #include "xfs_health.h"
  16 #include "scrub/scrub.h"
  17 #include "scrub/common.h"
  18 #include "scrub/trace.h"
  19 
  20 /*
  21  * FS Summary Counters
  22  * ===================
  23  *
  24  * The basics of filesystem summary counter checking are that we iterate the
  25  * AGs counting the number of free blocks, free space btree blocks, per-AG
  26  * reservations, inodes, delayed allocation reservations, and free inodes.
  27  * Then we compare what we computed against the in-core counters.
  28  *
  29  * However, the reality is that summary counters are a tricky beast to check.
  30  * While we /could/ freeze the filesystem and scramble around the AGs counting
   31  * the free blocks, in practice we prefer not to do that for a scan because
  32  * freezing is costly.  To get around this, we added a per-cpu counter of the
  33  * delalloc reservations so that we can rotor around the AGs relatively
  34  * quickly, and we allow the counts to be slightly off because we're not taking
  35  * any locks while we do this.
  36  *
  37  * So the first thing we do is warm up the buffer cache in the setup routine by
  38  * walking all the AGs to make sure the incore per-AG structure has been
  39  * initialized.  The expected value calculation then iterates the incore per-AG
  40  * structures as quickly as it can.  We snapshot the percpu counters before and
  41  * after this operation and use the difference in counter values to guess at
  42  * our tolerance for mismatch between expected and actual counter values.
  43  */
  44 
  45 /*
  46  * Since the expected value computation is lockless but only browses incore
  47  * values, the percpu counters should be fairly close to each other.  However,
  48  * we'll allow ourselves to be off by at least this (arbitrary) amount.
  49  */
  50 #define XCHK_FSCOUNT_MIN_VARIANCE       (512)
  51 
  52 /*
  53  * Make sure the per-AG structure has been initialized from the on-disk header
  54  * contents and trust that the incore counters match the ondisk counters.  (The
  55  * AGF and AGI scrubbers check them, and a normal xfs_scrub run checks the
  56  * summary counters after checking all AG headers).  Do this from the setup
  57  * function so that the inner AG aggregation loop runs as quickly as possible.
  58  *
  59  * This function runs during the setup phase /before/ we start checking any
  60  * metadata.
  61  */
  62 STATIC int
  63 xchk_fscount_warmup(
  64         struct xfs_scrub        *sc)
  65 {
  66         struct xfs_mount        *mp = sc->mp;
  67         struct xfs_buf          *agi_bp = NULL;
  68         struct xfs_buf          *agf_bp = NULL;
  69         struct xfs_perag        *pag = NULL;
  70         xfs_agnumber_t          agno;
  71         int                     error = 0;
  72 
  73         for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
  74                 pag = xfs_perag_get(mp, agno);
  75 
  76                 if (pag->pagi_init && pag->pagf_init)
  77                         goto next_loop_perag;
  78 
  79                 /* Lock both AG headers. */
  80                 error = xfs_ialloc_read_agi(mp, sc->tp, agno, &agi_bp);
  81                 if (error)
  82                         break;
  83                 error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp);
  84                 if (error)
  85                         break;
  86                 error = -ENOMEM;
  87                 if (!agf_bp || !agi_bp)
  88                         break;
  89 
  90                 /*
  91                  * These are supposed to be initialized by the header read
  92                  * function.
  93                  */
  94                 error = -EFSCORRUPTED;
  95                 if (!pag->pagi_init || !pag->pagf_init)
  96                         break;
  97 
  98                 xfs_buf_relse(agf_bp);
  99                 agf_bp = NULL;
 100                 xfs_buf_relse(agi_bp);
 101                 agi_bp = NULL;
 102 next_loop_perag:
 103                 xfs_perag_put(pag);
 104                 pag = NULL;
 105                 error = 0;
 106 
 107                 if (fatal_signal_pending(current))
 108                         break;
 109         }
 110 
 111         if (agf_bp)
 112                 xfs_buf_relse(agf_bp);
 113         if (agi_bp)
 114                 xfs_buf_relse(agi_bp);
 115         if (pag)
 116                 xfs_perag_put(pag);
 117         return error;
 118 }
 119 
 120 int
 121 xchk_setup_fscounters(
 122         struct xfs_scrub        *sc,
 123         struct xfs_inode        *ip)
 124 {
 125         struct xchk_fscounters  *fsc;
 126         int                     error;
 127 
 128         sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0);
 129         if (!sc->buf)
 130                 return -ENOMEM;
 131         fsc = sc->buf;
 132 
 133         xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max);
 134 
 135         /* We must get the incore counters set up before we can proceed. */
 136         error = xchk_fscount_warmup(sc);
 137         if (error)
 138                 return error;
 139 
 140         /*
 141          * Pause background reclaim while we're scrubbing to reduce the
 142          * likelihood of background perturbations to the counters throwing off
 143          * our calculations.
 144          */
 145         xchk_stop_reaping(sc);
 146 
 147         return xchk_trans_alloc(sc, 0);
 148 }
 149 
 150 /*
 151  * Calculate what the global in-core counters ought to be from the incore
 152  * per-AG structure.  Callers can compare this to the actual in-core counters
 153  * to estimate by how much both in-core and on-disk counters need to be
 154  * adjusted.
 155  */
 156 STATIC int
 157 xchk_fscount_aggregate_agcounts(
 158         struct xfs_scrub        *sc,
 159         struct xchk_fscounters  *fsc)
 160 {
 161         struct xfs_mount        *mp = sc->mp;
 162         struct xfs_perag        *pag;
 163         uint64_t                delayed;
 164         xfs_agnumber_t          agno;
 165         int                     tries = 8;
 166 
 167 retry:
 168         fsc->icount = 0;
 169         fsc->ifree = 0;
 170         fsc->fdblocks = 0;
 171 
 172         for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
 173                 pag = xfs_perag_get(mp, agno);
 174 
 175                 /* This somehow got unset since the warmup? */
 176                 if (!pag->pagi_init || !pag->pagf_init) {
 177                         xfs_perag_put(pag);
 178                         return -EFSCORRUPTED;
 179                 }
 180 
 181                 /* Count all the inodes */
 182                 fsc->icount += pag->pagi_count;
 183                 fsc->ifree += pag->pagi_freecount;
 184 
 185                 /* Add up the free/freelist/bnobt/cntbt blocks */
 186                 fsc->fdblocks += pag->pagf_freeblks;
 187                 fsc->fdblocks += pag->pagf_flcount;
 188                 fsc->fdblocks += pag->pagf_btreeblks;
 189 
 190                 /*
 191                  * Per-AG reservations are taken out of the incore counters,
 192                  * so they must be left out of the free blocks computation.
 193                  */
 194                 fsc->fdblocks -= pag->pag_meta_resv.ar_reserved;
 195                 fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved;
 196 
 197                 xfs_perag_put(pag);
 198 
 199                 if (fatal_signal_pending(current))
 200                         break;
 201         }
 202 
 203         /*
 204          * The global incore space reservation is taken from the incore
 205          * counters, so leave that out of the computation.
 206          */
 207         fsc->fdblocks -= mp->m_resblks_avail;
 208 
 209         /*
 210          * Delayed allocation reservations are taken out of the incore counters
 211          * but not recorded on disk, so leave them and their indlen blocks out
 212          * of the computation.
 213          */
 214         delayed = percpu_counter_sum(&mp->m_delalloc_blks);
 215         fsc->fdblocks -= delayed;
 216 
 217         trace_xchk_fscounters_calc(mp, fsc->icount, fsc->ifree, fsc->fdblocks,
 218                         delayed);
 219 
 220 
 221         /* Bail out if the values we compute are totally nonsense. */
 222         if (fsc->icount < fsc->icount_min || fsc->icount > fsc->icount_max ||
 223             fsc->fdblocks > mp->m_sb.sb_dblocks ||
 224             fsc->ifree > fsc->icount_max)
 225                 return -EFSCORRUPTED;
 226 
 227         /*
 228          * If ifree > icount then we probably had some perturbation in the
 229          * counters while we were calculating things.  We'll try a few times
 230          * to maintain ifree <= icount before giving up.
 231          */
 232         if (fsc->ifree > fsc->icount) {
 233                 if (tries--)
 234                         goto retry;
 235                 xchk_set_incomplete(sc);
 236                 return 0;
 237         }
 238 
 239         return 0;
 240 }
 241 
 242 /*
 243  * Is the @counter reasonably close to the @expected value?
 244  *
 245  * We neither locked nor froze anything in the filesystem while aggregating the
 246  * per-AG data to compute the @expected value, which means that the counter
 247  * could have changed.  We know the @old_value of the summation of the counter
 248  * before the aggregation, and we re-sum the counter now.  If the expected
 249  * value falls between the two summations, we're ok.
 250  *
 251  * Otherwise, we /might/ have a problem.  If the change in the summations is
 252  * more than we want to tolerate, the filesystem is probably busy and we should
 253  * just send back INCOMPLETE and see if userspace will try again.
 254  */
 255 static inline bool
 256 xchk_fscount_within_range(
 257         struct xfs_scrub        *sc,
 258         const int64_t           old_value,
 259         struct percpu_counter   *counter,
 260         uint64_t                expected)
 261 {
 262         int64_t                 min_value, max_value;
 263         int64_t                 curr_value = percpu_counter_sum(counter);
 264 
 265         trace_xchk_fscounters_within_range(sc->mp, expected, curr_value,
 266                         old_value);
 267 
 268         /* Negative values are always wrong. */
 269         if (curr_value < 0)
 270                 return false;
 271 
 272         /* Exact matches are always ok. */
 273         if (curr_value == expected)
 274                 return true;
 275 
 276         min_value = min(old_value, curr_value);
 277         max_value = max(old_value, curr_value);
 278 
 279         /* Within the before-and-after range is ok. */
 280         if (expected >= min_value && expected <= max_value)
 281                 return true;
 282 
 283         /*
 284          * If the difference between the two summations is too large, the fs
 285          * might just be busy and so we'll mark the scrub incomplete.  Return
 286          * true here so that we don't mark the counter corrupt.
 287          *
 288          * XXX: In the future when userspace can grant scrub permission to
 289          * quiesce the filesystem to solve the outsized variance problem, this
 290          * check should be moved up and the return code changed to signal to
 291          * userspace that we need quiesce permission.
 292          */
 293         if (max_value - min_value >= XCHK_FSCOUNT_MIN_VARIANCE) {
 294                 xchk_set_incomplete(sc);
 295                 return true;
 296         }
 297 
 298         return false;
 299 }
 300 
 301 /* Check the superblock counters. */
 302 int
 303 xchk_fscounters(
 304         struct xfs_scrub        *sc)
 305 {
 306         struct xfs_mount        *mp = sc->mp;
 307         struct xchk_fscounters  *fsc = sc->buf;
 308         int64_t                 icount, ifree, fdblocks;
 309         int                     error;
 310 
 311         /* Snapshot the percpu counters. */
 312         icount = percpu_counter_sum(&mp->m_icount);
 313         ifree = percpu_counter_sum(&mp->m_ifree);
 314         fdblocks = percpu_counter_sum(&mp->m_fdblocks);
 315 
 316         /* No negative values, please! */
 317         if (icount < 0 || ifree < 0 || fdblocks < 0)
 318                 xchk_set_corrupt(sc);
 319 
 320         /* See if icount is obviously wrong. */
 321         if (icount < fsc->icount_min || icount > fsc->icount_max)
 322                 xchk_set_corrupt(sc);
 323 
 324         /* See if fdblocks is obviously wrong. */
 325         if (fdblocks > mp->m_sb.sb_dblocks)
 326                 xchk_set_corrupt(sc);
 327 
 328         /*
 329          * If ifree exceeds icount by more than the minimum variance then
 330          * something's probably wrong with the counters.
 331          */
 332         if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE)
 333                 xchk_set_corrupt(sc);
 334 
 335         /* Walk the incore AG headers to calculate the expected counters. */
 336         error = xchk_fscount_aggregate_agcounts(sc, fsc);
 337         if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
 338                 return error;
 339         if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
 340                 return 0;
 341 
 342         /* Compare the in-core counters with whatever we counted. */
 343         if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount))
 344                 xchk_set_corrupt(sc);
 345 
 346         if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree))
 347                 xchk_set_corrupt(sc);
 348 
 349         if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
 350                         fsc->fdblocks))
 351                 xchk_set_corrupt(sc);
 352 
 353         return 0;
 354 }

/* [<][>][^][v][top][bottom][index][help] */