// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2020 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_btree.h"
#include "xfs_trace.h"
#include "xfs_btree_staging.h"

/*
 * Staging Cursors and Fake Roots for Btrees
 * =========================================
 *
 * A staging btree cursor is a special type of btree cursor that callers must
 * use to construct a new btree index using the btree bulk loader code. The
 * bulk loading code uses the staging btree cursor to abstract the details of
 * initializing new btree blocks and filling them with records or key/ptr
 * pairs. Regular btree operations (e.g. queries and modifications) are not
 * supported with staging cursors, and callers must not invoke them.
 *
 * Fake root structures contain all the information about a btree that is under
 * construction by the bulk loading code. Staging btree cursors point to fake
 * root structures instead of the usual AG header or inode structure.
 *
 * Callers are expected to initialize a fake root structure and pass it into
 * the _stage_cursor function for a specific btree type. When bulk loading is
 * complete, callers should call the _commit_staged_btree function for that
 * specific btree type to commit the new btree into the filesystem.
 */

/*
 * Don't allow staging cursors to be duplicated because they're supposed to be
 * kept private to a single thread.
 */
STATIC struct xfs_btree_cur *
xfs_btree_fakeroot_dup_cursor(
	struct xfs_btree_cur	*cur)
{
	ASSERT(0);
	return NULL;
}

/*
 * Don't allow block allocation for a staging cursor, because staging cursors
 * do not support regular btree modifications.
 *
 * Bulk loading uses a separate callback to obtain new blocks from a
 * preallocated list, which prevents ENOSPC failures during loading.
 */
STATIC int
xfs_btree_fakeroot_alloc_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*start_bno,
	union xfs_btree_ptr	*new_bno,
	int			*stat)
{
	ASSERT(0);
	return -EFSCORRUPTED;
}

/*
 * Don't allow block freeing for a staging cursor, because staging cursors
 * do not support regular btree modifications.
 */
STATIC int
xfs_btree_fakeroot_free_block(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	ASSERT(0);
	return -EFSCORRUPTED;
}

/* Initialize a pointer to the root block from the fakeroot. */
STATIC void
xfs_btree_fakeroot_init_ptr_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr)
{
	struct xbtree_afakeroot	*afake;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	afake = cur->bc_ag.afake;
	ptr->s = cpu_to_be32(afake->af_root);
}

/*
 * Bulk Loading for AG Btrees
 * ==========================
 *
 * For a btree rooted in an AG header, pass an xbtree_afakeroot structure to
 * the staging cursor. Callers should initialize this to zero.
 *
 * The _stage_cursor() function for a specific btree type should call
 * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging
 * cursor. The corresponding _commit_staged_btree() function should log the
 * new root and call xfs_btree_commit_afakeroot() to transform the staging
 * cursor into a regular btree cursor.
 */

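/*
 * For illustration only, a per-AG btree type might wire up this pair of
 * functions roughly as follows. The xfs_foobt_* names and the agf_foo_root/
 * agf_foo_level fields are hypothetical stand-ins, not part of this API;
 * only xfs_btree_stage_afakeroot() and xfs_btree_commit_afakeroot() are the
 * real entry points:
 *
 *	struct xfs_btree_cur *
 *	xfs_foobt_stage_cursor(
 *		struct xfs_mount	*mp,
 *		struct xbtree_afakeroot	*afake,
 *		xfs_agnumber_t		agno)
 *	{
 *		struct xfs_btree_cur	*cur;
 *
 *		cur = xfs_foobt_init_common(mp, NULL, agno);
 *		xfs_btree_stage_afakeroot(cur, afake);
 *		return cur;
 *	}
 *
 *	void
 *	xfs_foobt_commit_staged_btree(
 *		struct xfs_btree_cur	*cur,
 *		struct xfs_trans	*tp,
 *		struct xfs_buf		*agbp)
 *	{
 *		struct xfs_agf		*agf = agbp->b_addr;
 *		struct xbtree_afakeroot	*afake = cur->bc_ag.afake;
 *
 *		agf->agf_foo_root = cpu_to_be32(afake->af_root);
 *		agf->agf_foo_level = cpu_to_be32(afake->af_levels);
 *		xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
 *		xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_foobt_ops);
 *	}
 */
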
/* Update the btree root information for a per-AG fake root. */
STATIC void
xfs_btree_afakeroot_set_root(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	int			inc)
{
	struct xbtree_afakeroot	*afake = cur->bc_ag.afake;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
	afake->af_root = be32_to_cpu(ptr->s);
	afake->af_levels += inc;
}

/*
 * Initialize an AG-rooted btree cursor with the given AG btree fake root.
 * The btree cursor's bc_ops will be overridden as needed to make the staging
 * functionality work.
 */
void
xfs_btree_stage_afakeroot(
	struct xfs_btree_cur	*cur,
	struct xbtree_afakeroot	*afake)
{
	struct xfs_btree_ops	*nops;

	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
	ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE));
	ASSERT(cur->bc_tp == NULL);

	nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
	memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
	nops->alloc_block = xfs_btree_fakeroot_alloc_block;
	nops->free_block = xfs_btree_fakeroot_free_block;
	nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
	nops->set_root = xfs_btree_afakeroot_set_root;
	nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;

	cur->bc_ag.afake = afake;
	cur->bc_nlevels = afake->af_levels;
	cur->bc_ops = nops;
	cur->bc_flags |= XFS_BTREE_STAGING;
}

/*
 * Transform an AG-rooted staging btree cursor back into a regular cursor by
 * substituting a real btree root for the fake one and restoring normal btree
 * cursor ops. The caller must log the btree root change prior to calling
 * this.
 */
void
xfs_btree_commit_afakeroot(
	struct xfs_btree_cur		*cur,
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp,
	const struct xfs_btree_ops	*ops)
{
	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
	ASSERT(cur->bc_tp == NULL);

	trace_xfs_btree_commit_afakeroot(cur);

	kmem_free((void *)cur->bc_ops);
	cur->bc_ag.agbp = agbp;
	cur->bc_ops = ops;
	cur->bc_flags &= ~XFS_BTREE_STAGING;
	cur->bc_tp = tp;
}

/*
 * Bulk Loading for Inode-Rooted Btrees
 * ====================================
 *
 * For a btree rooted in an inode fork, pass an xbtree_ifakeroot structure to
 * the staging cursor. This structure should be initialized as follows:
 *
 * - if_fork_size should be set to the number of bytes available to the fork
 *   in the inode.
 *
 * - if_fork should point to a freshly allocated struct xfs_ifork.
 *
 * - if_format should be set to the appropriate fork type (e.g.
 *   XFS_DINODE_FMT_BTREE).
 *
 * All other fields must be zero.
 *
 * The _stage_cursor() function for a specific btree type should call
 * xfs_btree_stage_ifakeroot to set up the in-memory cursor as a staging
 * cursor. The corresponding _commit_staged_btree() function should log the
 * new root and call xfs_btree_commit_ifakeroot() to transform the staging
 * cursor into a regular btree cursor.
 */

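/*
 * As a purely illustrative sketch (the xfs_foobt_* helpers, the dip/ip/
 * whichfork variables, and the use of XFS_DFORK_SIZE and XFS_ILOG_DBROOT
 * here are assumptions rather than requirements of this API), an
 * inode-rooted btree might be staged and committed like this, where ifp
 * points to a freshly allocated struct xfs_ifork:
 *
 *	struct xbtree_ifakeroot	ifake = {
 *		.if_fork	= ifp,
 *		.if_fork_size	= XFS_DFORK_SIZE(dip, mp, whichfork),
 *		.if_format	= XFS_DINODE_FMT_BTREE,
 *	};
 *
 *	cur = xfs_foobt_init_common(mp, NULL, ip, whichfork);
 *	xfs_btree_stage_ifakeroot(cur, &ifake, NULL);
 *
 *	... bulk load the records ...
 *
 *	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DBROOT);
 *	xfs_btree_commit_ifakeroot(cur, tp, whichfork, &xfs_foobt_ops);
 */
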
/*
 * Initialize an inode-rooted btree cursor with the given inode btree fake
 * root. The btree cursor's bc_ops will be overridden as needed to make the
 * staging functionality work. If new_ops is not NULL, these new ops will be
 * passed out to the caller for further overriding.
 */
void
xfs_btree_stage_ifakeroot(
	struct xfs_btree_cur		*cur,
	struct xbtree_ifakeroot		*ifake,
	struct xfs_btree_ops		**new_ops)
{
	struct xfs_btree_ops		*nops;

	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
	ASSERT(cur->bc_tp == NULL);

	nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
	memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
	nops->alloc_block = xfs_btree_fakeroot_alloc_block;
	nops->free_block = xfs_btree_fakeroot_free_block;
	nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
	nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;

	cur->bc_ino.ifake = ifake;
	cur->bc_nlevels = ifake->if_levels;
	cur->bc_ops = nops;
	cur->bc_flags |= XFS_BTREE_STAGING;

	if (new_ops)
		*new_ops = nops;
}

/*
 * Transform an inode-rooted staging btree cursor back into a regular cursor by
 * substituting a real btree root for the fake one and restoring normal btree
 * cursor ops. The caller must log the btree root change prior to calling
 * this.
 */
void
xfs_btree_commit_ifakeroot(
	struct xfs_btree_cur		*cur,
	struct xfs_trans		*tp,
	int				whichfork,
	const struct xfs_btree_ops	*ops)
{
	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
	ASSERT(cur->bc_tp == NULL);

	trace_xfs_btree_commit_ifakeroot(cur);

	kmem_free((void *)cur->bc_ops);
	cur->bc_ino.ifake = NULL;
	cur->bc_ino.whichfork = whichfork;
	cur->bc_ops = ops;
	cur->bc_flags &= ~XFS_BTREE_STAGING;
	cur->bc_tp = tp;
}

/*
 * Bulk Loading of Staged Btrees
 * =============================
 *
 * This interface is used with a staged btree cursor to create a totally new
 * btree with a large number of records (i.e. more than what would fit in a
 * single root block). When the creation is complete, the new root can be
 * linked atomically into the filesystem by committing the staged cursor.
 *
 * Creation of a new btree proceeds roughly as follows:
 *
 * The first step is to initialize an appropriate fake btree root structure and
 * then construct a staged btree cursor. Refer to the block comments about
 * "Bulk Loading for AG Btrees" and "Bulk Loading for Inode-Rooted Btrees" for
 * more information about how to do this.
 *
 * The second step is to initialize a struct xfs_btree_bload context as
 * documented in the structure definition.
 *
 * The third step is to call xfs_btree_bload_compute_geometry to compute the
 * height of and the number of blocks needed to construct the btree. See the
 * section "Computing the Geometry of the New Btree" for details about this
 * computation.
 *
 * In step four, the caller must allocate xfs_btree_bload.nr_blocks blocks and
 * save them for later use by ->claim_block(). Bulk loading requires all
 * blocks to be allocated beforehand to avoid ENOSPC failures midway through a
 * rebuild, and to minimize seek distances of the new btree.
 *
 * Step five is to call xfs_btree_bload() to start constructing the btree.
 *
 * The final step is to commit the staging btree cursor, which logs the new
 * btree root and turns the staging cursor into a regular cursor. The caller
 * is responsible for cleaning up the previous btree blocks, if any.
 */

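/*
 * Putting the steps above together, a caller-side sketch might look like the
 * following. The xfs_foo_* callback names and the xfs_foobt_stage_cursor()/
 * xfs_foobt_commit_staged_btree() wrappers are hypothetical and the error
 * handling is abbreviated; only the xfs_btree_bload_compute_geometry() and
 * xfs_btree_bload() calls and the struct xfs_btree_bload fields are part of
 * this interface:
 *
 *	struct xbtree_afakeroot	afake = { 0 };
 *	struct xfs_btree_bload	bbl = {
 *		.get_record	= xfs_foo_get_record,
 *		.claim_block	= xfs_foo_claim_block,
 *		.leaf_slack	= -1,
 *		.node_slack	= -1,
 *	};
 *	struct xfs_btree_cur	*cur;
 *	int			error;
 *
 *	cur = xfs_foobt_stage_cursor(mp, &afake, agno);
 *	error = xfs_btree_bload_compute_geometry(cur, &bbl, nr_records);
 *	if (error)
 *		goto out;
 *
 *	... allocate bbl.nr_blocks blocks and remember them so that ...
 *	... ->claim_block() can hand them out one at a time ...
 *
 *	error = xfs_btree_bload(cur, &bbl, priv);
 *	if (error)
 *		goto out;
 *
 *	xfs_foobt_commit_staged_btree(cur, tp, agbp);
 *	xfs_btree_del_cursor(cur, 0);
 */
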
/*
 * Computing the Geometry of the New Btree
 * =======================================
 *
 * The number of items placed in each btree block is computed via the following
 * algorithm: For leaf levels, the number of items for the level is nr_records
 * in the bload structure. For node levels, the number of items for the level
 * is the number of blocks in the next lower level of the tree. For each
 * level, the desired number of items per block is defined as:
 *
 *	desired = max(minrecs, maxrecs - slack factor)
 *
 * The number of blocks for the level is defined to be:
 *
 *	blocks = floor(nr_items / desired)
 *
 * Note this is rounded down so that the npb calculation below will never fall
 * below minrecs. The number of items that will actually be loaded into each
 * btree block is defined as:
 *
 *	npb = nr_items / blocks
 *
 * Some of the leftmost blocks in the level will contain one extra record as
 * needed to handle uneven division. If the number of records in any block
 * would exceed maxrecs for that level, blocks is incremented and npb is
 * recalculated.
 *
 * In other words, we compute the number of blocks needed to satisfy a given
 * loading level, then spread the items as evenly as possible.
 *
 * The height and number of fs blocks required to create the btree are computed
 * and returned via btree_height and nr_blocks.
 */

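/*
 * To make the computation above concrete, here is a worked example with
 * invented numbers (they do not correspond to any particular btree type):
 * suppose a leaf block holds maxrecs = 100 and minrecs = 50 records,
 * leaf_slack is left at -1, and 1000 records are being loaded. The slack
 * default becomes 100 - ((100 + 50) / 2) = 25, so desired = max(50, 100 - 25)
 * = 75, i.e. roughly 75% loading. Then blocks = floor(1000 / 75) = 13 and
 * npb = 1000 / 13 = 76 remainder 12, so the 12 leftmost leaf blocks each get
 * 77 records and the last block gets 76. The node level above repeats the
 * same computation with nr_items = 13 keyptrs.
 */
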
/*
 * Put a btree block that we're loading onto the ordered list and release it.
 * The btree blocks will be written to disk when bulk loading is finished.
 */
static void
xfs_btree_bload_drop_buf(
	struct list_head	*buffers_list,
	struct xfs_buf		**bpp)
{
	if (*bpp == NULL)
		return;

	if (!xfs_buf_delwri_queue(*bpp, buffers_list))
		ASSERT(0);

	xfs_buf_relse(*bpp);
	*bpp = NULL;
}

/*
 * Allocate and initialize one btree block for bulk loading.
 *
 * The new btree block will have its level and numrecs fields set to the values
 * of the level and nr_this_block parameters, respectively.
 *
 * The caller should ensure that ptrp, bpp, and blockp refer to the left
 * sibling of the new block, if there is any. On exit, ptrp, bpp, and blockp
 * will all point to the new block.
 */
STATIC int
xfs_btree_bload_prep_block(
	struct xfs_btree_cur		*cur,
	struct xfs_btree_bload		*bbl,
	struct list_head		*buffers_list,
	unsigned int			level,
	unsigned int			nr_this_block,
	union xfs_btree_ptr		*ptrp, /* in/out */
	struct xfs_buf			**bpp, /* in/out */
	struct xfs_btree_block		**blockp, /* in/out */
	void				*priv)
{
	union xfs_btree_ptr		new_ptr;
	struct xfs_buf			*new_bp;
	struct xfs_btree_block		*new_block;
	int				ret;

	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
	    level == cur->bc_nlevels - 1) {
		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
		size_t			new_size;

		ASSERT(*bpp == NULL);

		/* Allocate a new incore btree root block. */
		new_size = bbl->iroot_size(cur, nr_this_block, priv);
		ifp->if_broot = kmem_zalloc(new_size, 0);
		ifp->if_broot_bytes = (int)new_size;
		ifp->if_flags |= XFS_IFBROOT;

		/* Initialize it and send it out. */
		xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot,
				XFS_BUF_DADDR_NULL, cur->bc_btnum, level,
				nr_this_block, cur->bc_ino.ip->i_ino,
				cur->bc_flags);

		*bpp = NULL;
		*blockp = ifp->if_broot;
		xfs_btree_set_ptr_null(cur, ptrp);
		return 0;
	}

	/* Claim one of the caller's preallocated blocks. */
	xfs_btree_set_ptr_null(cur, &new_ptr);
	ret = bbl->claim_block(cur, &new_ptr, priv);
	if (ret)
		return ret;

	ASSERT(!xfs_btree_ptr_is_null(cur, &new_ptr));

	ret = xfs_btree_get_buf_block(cur, &new_ptr, &new_block, &new_bp);
	if (ret)
		return ret;

	/*
	 * The previous block (if any) is the left sibling of the new block,
	 * so set its right sibling pointer to the new block and drop it.
	 */
	if (*blockp)
		xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB);
	xfs_btree_bload_drop_buf(buffers_list, bpp);

	/* Initialize the new btree block. */
	xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block);
	xfs_btree_set_sibling(cur, new_block, ptrp, XFS_BB_LEFTSIB);

	/* Set the out parameters. */
	*bpp = new_bp;
	*blockp = new_block;
	xfs_btree_copy_ptrs(cur, ptrp, &new_ptr, 1);
	return 0;
}

/* Load one leaf block. */
STATIC int
xfs_btree_bload_leaf(
	struct xfs_btree_cur		*cur,
	unsigned int			recs_this_block,
	xfs_btree_bload_get_record_fn	get_record,
	struct xfs_btree_block		*block,
	void				*priv)
{
	unsigned int			j;
	int				ret;

	/* Fill the leaf block with records. */
	for (j = 1; j <= recs_this_block; j++) {
		union xfs_btree_rec	*block_rec;

		ret = get_record(cur, priv);
		if (ret)
			return ret;
		block_rec = xfs_btree_rec_addr(cur, j, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
	}

	return 0;
}

/*
 * Load one node block with key/ptr pairs.
 *
 * child_ptr must point to a block within the next level down in the tree. A
 * key/ptr entry will be created in the new node block to the block pointed to
 * by child_ptr. On exit, child_ptr points to the next block on the child
 * level that needs processing.
 */
STATIC int
xfs_btree_bload_node(
	struct xfs_btree_cur	*cur,
	unsigned int		recs_this_block,
	union xfs_btree_ptr	*child_ptr,
	struct xfs_btree_block	*block)
{
	unsigned int		j;
	int			ret;

	/* Fill the node block with keys and pointers. */
	for (j = 1; j <= recs_this_block; j++) {
		union xfs_btree_key	child_key;
		union xfs_btree_ptr	*block_ptr;
		union xfs_btree_key	*block_key;
		struct xfs_btree_block	*child_block;
		struct xfs_buf		*child_bp;

		ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr));

		ret = xfs_btree_get_buf_block(cur, child_ptr, &child_block,
				&child_bp);
		if (ret)
			return ret;

		block_ptr = xfs_btree_ptr_addr(cur, j, block);
		xfs_btree_copy_ptrs(cur, block_ptr, child_ptr, 1);

		block_key = xfs_btree_key_addr(cur, j, block);
		xfs_btree_get_keys(cur, child_block, &child_key);
		xfs_btree_copy_keys(cur, block_key, &child_key, 1);

		xfs_btree_get_sibling(cur, child_block, child_ptr,
				XFS_BB_RIGHTSIB);
		xfs_buf_relse(child_bp);
	}

	return 0;
}

/*
 * Compute the maximum number of records (or keyptrs) per block that we want to
 * install at this level in the btree. Caller is responsible for having set
 * @cur->bc_ino.forksize to the desired fork size, if appropriate.
 */
STATIC unsigned int
xfs_btree_bload_max_npb(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level)
{
	unsigned int		ret;

	if (level == cur->bc_nlevels - 1 && cur->bc_ops->get_dmaxrecs)
		return cur->bc_ops->get_dmaxrecs(cur, level);

	ret = cur->bc_ops->get_maxrecs(cur, level);
	if (level == 0)
		ret -= bbl->leaf_slack;
	else
		ret -= bbl->node_slack;
	return ret;
}

/*
 * Compute the desired number of records (or keyptrs) per block that we want to
 * install at this level in the btree, which must be somewhere between minrecs
 * and max_npb. The caller is free to install fewer records per block.
 */
STATIC unsigned int
xfs_btree_bload_desired_npb(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level)
{
	unsigned int		npb = xfs_btree_bload_max_npb(cur, bbl, level);

	/* Root blocks are not subject to minrecs rules. */
	if (level == cur->bc_nlevels - 1)
		return max(1U, npb);

	return max_t(unsigned int, cur->bc_ops->get_minrecs(cur, level), npb);
}

/*
 * Compute the number of records to be stored in each block at this level and
 * the number of blocks for this level. For leaf levels, we must populate an
 * empty root block even if there are no records, so we have to have at least
 * one block.
 */
STATIC void
xfs_btree_bload_level_geometry(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level,
	uint64_t		nr_this_level,
	unsigned int		*avg_per_block,
	uint64_t		*blocks,
	uint64_t		*blocks_with_extra)
{
	uint64_t		npb;
	uint64_t		dontcare;
	unsigned int		desired_npb;
	unsigned int		maxnr;

	maxnr = cur->bc_ops->get_maxrecs(cur, level);

	/*
	 * Compute the number of blocks we need to fill each block with the
	 * desired number of records/keyptrs per block. Because desired_npb
	 * could be minrecs, we use regular integer division (which rounds
	 * the block count down) so that in the next step the effective # of
	 * items per block will never be less than desired_npb.
	 */
	desired_npb = xfs_btree_bload_desired_npb(cur, bbl, level);
	*blocks = div64_u64_rem(nr_this_level, desired_npb, &dontcare);
	*blocks = max(1ULL, *blocks);

	/*
	 * Compute the number of records that we will actually put in each
	 * block, assuming that we want to spread the records evenly between
	 * the blocks. Take care that the effective # of items per block (npb)
	 * won't exceed maxrecs even for the blocks that get an extra record,
	 * since desired_npb could be maxrecs, and in the previous step we
	 * rounded the block count down.
	 */
	npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
	if (npb > maxnr || (npb == maxnr && *blocks_with_extra > 0)) {
		(*blocks)++;
		npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
	}

	*avg_per_block = min_t(uint64_t, npb, nr_this_level);

	trace_xfs_btree_bload_level_geometry(cur, level, nr_this_level,
			*avg_per_block, desired_npb, *blocks,
			*blocks_with_extra);
}

/*
 * Ensure a slack value is appropriate for the btree.
 *
 * If the slack value is negative, set slack so that we fill the block to
 * halfway between minrecs and maxrecs. Make sure the slack is never so large
 * that we can underflow minrecs.
 */
static void
xfs_btree_bload_ensure_slack(
	struct xfs_btree_cur	*cur,
	int			*slack,
	int			level)
{
	int			maxr;
	int			minr;

	maxr = cur->bc_ops->get_maxrecs(cur, level);
	minr = cur->bc_ops->get_minrecs(cur, level);

	/*
	 * If slack is negative, automatically set slack so that we load the
	 * btree block approximately halfway between minrecs and maxrecs.
	 * Generally, this will net us 75% loading.
	 */
	if (*slack < 0)
		*slack = maxr - ((maxr + minr) >> 1);

	*slack = min(*slack, maxr - minr);
}

/*
 * Prepare a btree cursor for a bulk load operation by computing the geometry
 * fields in bbl. Caller must ensure that the btree cursor is a staging
 * cursor. This function can be called multiple times.
 */
int
xfs_btree_bload_compute_geometry(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	uint64_t		nr_records)
{
	uint64_t		nr_blocks = 0;
	uint64_t		nr_this_level;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	/*
	 * Make sure that the slack values make sense for traditional leaf and
	 * node blocks. Inode-rooted btrees will return different minrecs and
	 * maxrecs values for the root block (level == bc_nlevels - 1). We're
	 * checking levels 0 and 1 here, so set bc_nlevels such that the btree
	 * code doesn't interpret either as the root level.
	 */
	cur->bc_nlevels = XFS_BTREE_MAXLEVELS - 1;
	xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0);
	xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1);

	bbl->nr_records = nr_this_level = nr_records;
	for (cur->bc_nlevels = 1; cur->bc_nlevels < XFS_BTREE_MAXLEVELS;) {
		uint64_t	level_blocks;
		uint64_t	dontcare64;
		unsigned int	level = cur->bc_nlevels - 1;
		unsigned int	avg_per_block;

		xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
				&avg_per_block, &level_blocks, &dontcare64);

		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
			/*
			 * If all the items we want to store at this level
			 * would fit in the inode root block, then we have our
			 * btree root and are done.
			 *
			 * Note that bmap btrees forbid records in the root.
			 */
			if (level != 0 && nr_this_level <= avg_per_block) {
				nr_blocks++;
				break;
			}

			/*
			 * Otherwise, we have to store all the items for this
			 * level in traditional btree blocks and therefore need
			 * another level of btree to point to those blocks.
			 *
			 * We have to re-compute the geometry for each level of
			 * an inode-rooted btree because the geometry differs
			 * between a btree root in an inode fork and a
			 * traditional btree block.
			 *
			 * This distinction is made in the btree code based on
			 * whether level == bc_nlevels - 1. Based on the
			 * previous root block size check against the root
			 * block geometry, we know that we aren't yet ready to
			 * populate the root. Increment bc_nlevels and
			 * recalculate the geometry for a traditional
			 * block-based btree level.
			 */
			cur->bc_nlevels++;
			xfs_btree_bload_level_geometry(cur, bbl, level,
					nr_this_level, &avg_per_block,
					&level_blocks, &dontcare64);
		} else {
			/*
			 * If all the items we want to store at this level
			 * would fit in a single root block, we're done.
			 */
			if (nr_this_level <= avg_per_block) {
				nr_blocks++;
				break;
			}

			/* Otherwise, we need another level of btree. */
			cur->bc_nlevels++;
		}

		nr_blocks += level_blocks;
		nr_this_level = level_blocks;
	}

	if (cur->bc_nlevels == XFS_BTREE_MAXLEVELS)
		return -EOVERFLOW;

	bbl->btree_height = cur->bc_nlevels;
	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
		bbl->nr_blocks = nr_blocks - 1;
	else
		bbl->nr_blocks = nr_blocks;
	return 0;
}

/* Bulk load a btree given the parameters and geometry established in bbl. */
int
xfs_btree_bload(
	struct xfs_btree_cur		*cur,
	struct xfs_btree_bload		*bbl,
	void				*priv)
{
	struct list_head		buffers_list;
	union xfs_btree_ptr		child_ptr;
	union xfs_btree_ptr		ptr;
	struct xfs_buf			*bp = NULL;
	struct xfs_btree_block		*block = NULL;
	uint64_t			nr_this_level = bbl->nr_records;
	uint64_t			blocks;
	uint64_t			i;
	uint64_t			blocks_with_extra;
	uint64_t			total_blocks = 0;
	unsigned int			avg_per_block;
	unsigned int			level = 0;
	int				ret;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	INIT_LIST_HEAD(&buffers_list);
	cur->bc_nlevels = bbl->btree_height;
	xfs_btree_set_ptr_null(cur, &child_ptr);
	xfs_btree_set_ptr_null(cur, &ptr);

	xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
			&avg_per_block, &blocks, &blocks_with_extra);

	/* Load each leaf block. */
	for (i = 0; i < blocks; i++) {
		unsigned int		nr_this_block = avg_per_block;

		/*
		 * Due to rounding, btree blocks will not be evenly populated
		 * in most cases. blocks_with_extra tells us how many blocks
		 * will receive an extra record to distribute the excess across
		 * the current level as evenly as possible.
		 */
		if (i < blocks_with_extra)
			nr_this_block++;

		ret = xfs_btree_bload_prep_block(cur, bbl, &buffers_list, level,
				nr_this_block, &ptr, &bp, &block, priv);
		if (ret)
			goto out;

		trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr,
				nr_this_block);

		ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_record,
				block, priv);
		if (ret)
			goto out;

		/*
		 * Record the leftmost leaf pointer so we know where to start
		 * with the first node level.
		 */
		if (i == 0)
			xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1);
	}
	total_blocks += blocks;
	xfs_btree_bload_drop_buf(&buffers_list, &bp);

	/* Populate the internal btree nodes. */
	for (level = 1; level < cur->bc_nlevels; level++) {
		union xfs_btree_ptr	first_ptr;

		nr_this_level = blocks;
		block = NULL;
		xfs_btree_set_ptr_null(cur, &ptr);

		xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
				&avg_per_block, &blocks, &blocks_with_extra);

		/* Load each node block. */
		for (i = 0; i < blocks; i++) {
			unsigned int	nr_this_block = avg_per_block;

			if (i < blocks_with_extra)
				nr_this_block++;

			ret = xfs_btree_bload_prep_block(cur, bbl,
					&buffers_list, level, nr_this_block,
					&ptr, &bp, &block, priv);
			if (ret)
				goto out;

			trace_xfs_btree_bload_block(cur, level, i, blocks,
					&ptr, nr_this_block);

			ret = xfs_btree_bload_node(cur, nr_this_block,
					&child_ptr, block);
			if (ret)
				goto out;

			/*
			 * Record the leftmost node pointer so that we know
			 * where to start the next node level above this one.
			 */
			if (i == 0)
				xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1);
		}
		total_blocks += blocks;
		xfs_btree_bload_drop_buf(&buffers_list, &bp);
		xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1);
	}

	/* Initialize the new root. */
	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
		ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
		cur->bc_ino.ifake->if_levels = cur->bc_nlevels;
		cur->bc_ino.ifake->if_blocks = total_blocks - 1;
	} else {
		cur->bc_ag.afake->af_root = be32_to_cpu(ptr.s);
		cur->bc_ag.afake->af_levels = cur->bc_nlevels;
		cur->bc_ag.afake->af_blocks = total_blocks;
	}

	/*
	 * Write the new blocks to disk. If the ordered list isn't empty after
	 * that, then something went wrong and we have to fail. This should
	 * never happen, but we'll check anyway.
	 */
	ret = xfs_buf_delwri_submit(&buffers_list);
	if (ret)
		goto out;
	if (!list_empty(&buffers_list)) {
		ASSERT(list_empty(&buffers_list));
		ret = -EIO;
	}

out:
	xfs_buf_delwri_cancel(&buffers_list);
	if (bp)
		xfs_buf_relse(bp);
	return ret;
}