From 2f7c68cb55ecb7331f2381deb497c27155f32faf Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Wed, 03 Jan 2024 09:43:39 +0000 Subject: [PATCH] update kernel to 5.10.198 --- kernel/fs/ext4/extents_status.c | 1055 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 files changed, 997 insertions(+), 58 deletions(-) diff --git a/kernel/fs/ext4/extents_status.c b/kernel/fs/ext4/extents_status.c index 027c3e1..fee54ab 100644 --- a/kernel/fs/ext4/extents_status.c +++ b/kernel/fs/ext4/extents_status.c @@ -142,13 +142,16 @@ */ static struct kmem_cache *ext4_es_cachep; +static struct kmem_cache *ext4_pending_cachep; static int __es_insert_extent(struct inode *inode, struct extent_status *newes); static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t end); + ext4_lblk_t end, int *reserved); static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, struct ext4_inode_info *locked_ei); +static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); int __init ext4_init_es(void) { @@ -233,66 +236,186 @@ } /* - * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering - * @es->lblk if it exists, otherwise, the next extent after @es->lblk. 
+ * ext4_es_find_extent_range - find extent with specified status within block + * range or next extent following block range in + * extents status tree * - * @inode: the inode which owns delayed extents - * @lblk: the offset where we start to search - * @end: the offset where we stop to search - * @es: delayed extent that we found + * @inode - file containing the range + * @matching_fn - pointer to function that matches extents with desired status + * @lblk - logical block defining start of range + * @end - logical block defining end of range + * @es - extent found, if any + * + * Find the first extent within the block range specified by @lblk and @end + * in the extents status tree that satisfies @matching_fn. If a match + * is found, it's returned in @es. If not, and a matching extent is found + * beyond the block range, it's returned in @es. If no match is found, an + * extent is returned in @es whose es_lblk, es_len, and es_pblk components + * are 0. */ -void ext4_es_find_delayed_extent_range(struct inode *inode, - ext4_lblk_t lblk, ext4_lblk_t end, - struct extent_status *es) +static void __es_find_extent_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es) { struct ext4_es_tree *tree = NULL; struct extent_status *es1 = NULL; struct rb_node *node; - BUG_ON(es == NULL); - BUG_ON(end < lblk); - trace_ext4_es_find_delayed_extent_range_enter(inode, lblk); + WARN_ON(es == NULL); + WARN_ON(end < lblk); - read_lock(&EXT4_I(inode)->i_es_lock); tree = &EXT4_I(inode)->i_es_tree; - /* find extent in cache firstly */ + /* see if the extent has been cached */ es->es_lblk = es->es_len = es->es_pblk = 0; - if (tree->cache_es) { - es1 = tree->cache_es; - if (in_range(lblk, es1->es_lblk, es1->es_len)) { - es_debug("%u cached by [%u/%u) %llu %x\n", - lblk, es1->es_lblk, es1->es_len, - ext4_es_pblock(es1), ext4_es_status(es1)); - goto out; - } + es1 = READ_ONCE(tree->cache_es); + if (es1 && 
in_range(lblk, es1->es_lblk, es1->es_len)) { + es_debug("%u cached by [%u/%u) %llu %x\n", + lblk, es1->es_lblk, es1->es_len, + ext4_es_pblock(es1), ext4_es_status(es1)); + goto out; } es1 = __es_tree_search(&tree->root, lblk); out: - if (es1 && !ext4_es_is_delayed(es1)) { + if (es1 && !matching_fn(es1)) { while ((node = rb_next(&es1->rb_node)) != NULL) { es1 = rb_entry(node, struct extent_status, rb_node); if (es1->es_lblk > end) { es1 = NULL; break; } - if (ext4_es_is_delayed(es1)) + if (matching_fn(es1)) break; } } - if (es1 && ext4_es_is_delayed(es1)) { - tree->cache_es = es1; + if (es1 && matching_fn(es1)) { + WRITE_ONCE(tree->cache_es, es1); es->es_lblk = es1->es_lblk; es->es_len = es1->es_len; es->es_pblk = es1->es_pblk; } +} + +/* + * Locking for __es_find_extent_range() for external use + */ +void ext4_es_find_extent_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es) +{ + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return; + + trace_ext4_es_find_extent_range_enter(inode, lblk); + + read_lock(&EXT4_I(inode)->i_es_lock); + __es_find_extent_range(inode, matching_fn, lblk, end, es); read_unlock(&EXT4_I(inode)->i_es_lock); - trace_ext4_es_find_delayed_extent_range_exit(inode, es); + trace_ext4_es_find_extent_range_exit(inode, es); +} + +/* + * __es_scan_range - search block range for block with specified status + * in extents status tree + * + * @inode - file containing the range + * @matching_fn - pointer to function that matches extents with desired status + * @start - logical block defining start of range + * @end - logical block defining end of range + * + * Returns true if at least one block in the specified block range satisfies + * the criterion specified by @matching_fn, and false if not. If at least + * one extent has the specified status, then there is at least one block + * in the range with that status. 
Should only be called by code that has + * taken i_es_lock. + */ +static bool __es_scan_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t start, ext4_lblk_t end) +{ + struct extent_status es; + + __es_find_extent_range(inode, matching_fn, start, end, &es); + if (es.es_len == 0) + return false; /* no matching extent in the tree */ + else if (es.es_lblk <= start && + start < es.es_lblk + es.es_len) + return true; + else if (start <= es.es_lblk && es.es_lblk <= end) + return true; + else + return false; +} +/* + * Locking for __es_scan_range() for external use + */ +bool ext4_es_scan_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end) +{ + bool ret; + + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return false; + + read_lock(&EXT4_I(inode)->i_es_lock); + ret = __es_scan_range(inode, matching_fn, lblk, end); + read_unlock(&EXT4_I(inode)->i_es_lock); + + return ret; +} + +/* + * __es_scan_clu - search cluster for block with specified status in + * extents status tree + * + * @inode - file containing the cluster + * @matching_fn - pointer to function that matches extents with desired status + * @lblk - logical block in cluster to be searched + * + * Returns true if at least one extent in the cluster containing @lblk + * satisfies the criterion specified by @matching_fn, and false if not. If at + * least one extent has the specified status, then there is at least one block + * in the cluster with that status. Should only be called by code that has + * taken i_es_lock. 
+ */ +static bool __es_scan_clu(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t lblk_start, lblk_end; + + lblk_start = EXT4_LBLK_CMASK(sbi, lblk); + lblk_end = lblk_start + sbi->s_cluster_ratio - 1; + + return __es_scan_range(inode, matching_fn, lblk_start, lblk_end); +} + +/* + * Locking for __es_scan_clu() for external use + */ +bool ext4_es_scan_clu(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk) +{ + bool ret; + + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return false; + + read_lock(&EXT4_I(inode)->i_es_lock); + ret = __es_scan_clu(inode, matching_fn, lblk); + read_unlock(&EXT4_I(inode)->i_es_lock); + + return ret; } static void ext4_es_list_add(struct inode *inode) @@ -595,7 +718,7 @@ * We don't need to check unwritten extent because * indirect-based file doesn't have it. */ - BUG_ON(1); + BUG(); } } else if (retval == 0) { if (ext4_es_is_written(es)) { @@ -664,7 +787,7 @@ } p = &(*p)->rb_right; } else { - BUG_ON(1); + BUG(); return -EINVAL; } } @@ -694,6 +817,10 @@ struct extent_status newes; ext4_lblk_t end = lblk + len - 1; int err = 0; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return 0; es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n", lblk, len, pblk, status, inode->i_ino); @@ -719,7 +846,7 @@ ext4_es_insert_extent_check(inode, &newes); write_lock(&EXT4_I(inode)->i_es_lock); - err = __es_remove_extent(inode, lblk, end); + err = __es_remove_extent(inode, lblk, end, NULL); if (err != 0) goto error; retry: @@ -729,6 +856,11 @@ goto retry; if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) err = 0; + + if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) && + (status & EXTENT_STATUS_WRITTEN || + status & EXTENT_STATUS_UNWRITTEN)) + __revise_pending(inode, lblk, len); error: 
write_unlock(&EXT4_I(inode)->i_es_lock); @@ -750,6 +882,9 @@ struct extent_status *es; struct extent_status newes; ext4_lblk_t end = lblk + len - 1; + + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return; newes.es_lblk = lblk; newes.es_len = len; @@ -777,6 +912,7 @@ * Return: 1 on found, 0 on not */ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t *next_lblk, struct extent_status *es) { struct ext4_es_tree *tree; @@ -784,6 +920,9 @@ struct extent_status *es1 = NULL; struct rb_node *node; int found = 0; + + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return 0; trace_ext4_es_lookup_extent_enter(inode, lblk); es_debug("lookup extent in block %u\n", lblk); @@ -793,14 +932,12 @@ /* find extent in cache firstly */ es->es_lblk = es->es_len = es->es_pblk = 0; - if (tree->cache_es) { - es1 = tree->cache_es; - if (in_range(lblk, es1->es_lblk, es1->es_len)) { - es_debug("%u cached by [%u/%u)\n", - lblk, es1->es_lblk, es1->es_len); - found = 1; - goto out; - } + es1 = READ_ONCE(tree->cache_es); + if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) { + es_debug("%u cached by [%u/%u)\n", + lblk, es1->es_lblk, es1->es_len); + found = 1; + goto out; } node = tree->root.rb_node; @@ -825,9 +962,18 @@ es->es_pblk = es1->es_pblk; if (!ext4_es_is_referenced(es1)) ext4_es_set_referenced(es1); - stats->es_stats_cache_hits++; + percpu_counter_inc(&stats->es_stats_cache_hits); + if (next_lblk) { + node = rb_next(&es1->rb_node); + if (node) { + es1 = rb_entry(node, struct extent_status, + rb_node); + *next_lblk = es1->es_lblk; + } else + *next_lblk = 0; + } } else { - stats->es_stats_cache_misses++; + percpu_counter_inc(&stats->es_stats_cache_misses); } read_unlock(&EXT4_I(inode)->i_es_lock); @@ -836,8 +982,322 @@ return found; } +struct rsvd_count { + int ndelonly; + bool first_do_lblk_found; + ext4_lblk_t first_do_lblk; + ext4_lblk_t last_do_lblk; + struct extent_status *left_es; + bool partial; + ext4_lblk_t lclu; +}; + +/* 
+ * init_rsvd - initialize reserved count data before removing block range + * in file from extent status tree + * + * @inode - file containing range + * @lblk - first block in range + * @es - pointer to first extent in range + * @rc - pointer to reserved count data + * + * Assumes es is not NULL + */ +static void init_rsvd(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es, struct rsvd_count *rc) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct rb_node *node; + + rc->ndelonly = 0; + + /* + * for bigalloc, note the first delonly block in the range has not + * been found, record the extent containing the block to the left of + * the region to be removed, if any, and note that there's no partial + * cluster to track + */ + if (sbi->s_cluster_ratio > 1) { + rc->first_do_lblk_found = false; + if (lblk > es->es_lblk) { + rc->left_es = es; + } else { + node = rb_prev(&es->rb_node); + rc->left_es = node ? rb_entry(node, + struct extent_status, + rb_node) : NULL; + } + rc->partial = false; + } +} + +/* + * count_rsvd - count the clusters containing delayed and not unwritten + * (delonly) blocks in a range within an extent and add to + * the running tally in rsvd_count + * + * @inode - file containing extent + * @lblk - first block in range + * @len - length of range in blocks + * @es - pointer to extent containing clusters to be counted + * @rc - pointer to reserved count data + * + * Tracks partial clusters found at the beginning and end of extents so + * they aren't overcounted when they span adjacent extents + */ +static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, + struct extent_status *es, struct rsvd_count *rc) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t i, end, nclu; + + if (!ext4_es_is_delonly(es)) + return; + + WARN_ON(len <= 0); + + if (sbi->s_cluster_ratio == 1) { + rc->ndelonly += (int) len; + return; + } + + /* bigalloc */ + + i = (lblk < es->es_lblk) ? 
es->es_lblk : lblk; + end = lblk + (ext4_lblk_t) len - 1; + end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end; + + /* record the first block of the first delonly extent seen */ + if (!rc->first_do_lblk_found) { + rc->first_do_lblk = i; + rc->first_do_lblk_found = true; + } + + /* update the last lblk in the region seen so far */ + rc->last_do_lblk = end; + + /* + * if we're tracking a partial cluster and the current extent + * doesn't start with it, count it and stop tracking + */ + if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) { + rc->ndelonly++; + rc->partial = false; + } + + /* + * if the first cluster doesn't start on a cluster boundary but + * ends on one, count it + */ + if (EXT4_LBLK_COFF(sbi, i) != 0) { + if (end >= EXT4_LBLK_CFILL(sbi, i)) { + rc->ndelonly++; + rc->partial = false; + i = EXT4_LBLK_CFILL(sbi, i) + 1; + } + } + + /* + * if the current cluster starts on a cluster boundary, count the + * number of whole delonly clusters in the extent + */ + if ((i + sbi->s_cluster_ratio - 1) <= end) { + nclu = (end - i + 1) >> sbi->s_cluster_bits; + rc->ndelonly += nclu; + i += nclu << sbi->s_cluster_bits; + } + + /* + * start tracking a partial cluster if there's a partial at the end + * of the current extent and we're not already tracking one + */ + if (!rc->partial && i <= end) { + rc->partial = true; + rc->lclu = EXT4_B2C(sbi, i); + } +} + +/* + * __pr_tree_search - search for a pending cluster reservation + * + * @root - root of pending reservation tree + * @lclu - logical cluster to search for + * + * Returns the pending reservation for the cluster identified by @lclu + * if found. If not, returns a reservation for the next cluster if any, + * and if not, returns NULL. 
+ */ +static struct pending_reservation *__pr_tree_search(struct rb_root *root, + ext4_lblk_t lclu) +{ + struct rb_node *node = root->rb_node; + struct pending_reservation *pr = NULL; + + while (node) { + pr = rb_entry(node, struct pending_reservation, rb_node); + if (lclu < pr->lclu) + node = node->rb_left; + else if (lclu > pr->lclu) + node = node->rb_right; + else + return pr; + } + if (pr && lclu < pr->lclu) + return pr; + if (pr && lclu > pr->lclu) { + node = rb_next(&pr->rb_node); + return node ? rb_entry(node, struct pending_reservation, + rb_node) : NULL; + } + return NULL; +} + +/* + * get_rsvd - calculates and returns the number of cluster reservations to be + * released when removing a block range from the extent status tree + * and releases any pending reservations within the range + * + * @inode - file containing block range + * @end - last block in range + * @right_es - pointer to extent containing next block beyond end or NULL + * @rc - pointer to reserved count data + * + * The number of reservations to be released is equal to the number of + * clusters containing delayed and not unwritten (delonly) blocks within + * the range, minus the number of clusters still containing delonly blocks + * at the ends of the range, and minus the number of pending reservations + * within the range. 
+ */ +static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, + struct extent_status *right_es, + struct rsvd_count *rc) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct pending_reservation *pr; + struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; + struct rb_node *node; + ext4_lblk_t first_lclu, last_lclu; + bool left_delonly, right_delonly, count_pending; + struct extent_status *es; + + if (sbi->s_cluster_ratio > 1) { + /* count any remaining partial cluster */ + if (rc->partial) + rc->ndelonly++; + + if (rc->ndelonly == 0) + return 0; + + first_lclu = EXT4_B2C(sbi, rc->first_do_lblk); + last_lclu = EXT4_B2C(sbi, rc->last_do_lblk); + + /* + * decrease the delonly count by the number of clusters at the + * ends of the range that still contain delonly blocks - + * these clusters still need to be reserved + */ + left_delonly = right_delonly = false; + + es = rc->left_es; + while (es && ext4_es_end(es) >= + EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) { + if (ext4_es_is_delonly(es)) { + rc->ndelonly--; + left_delonly = true; + break; + } + node = rb_prev(&es->rb_node); + if (!node) + break; + es = rb_entry(node, struct extent_status, rb_node); + } + if (right_es && (!left_delonly || first_lclu != last_lclu)) { + if (end < ext4_es_end(right_es)) { + es = right_es; + } else { + node = rb_next(&right_es->rb_node); + es = node ? rb_entry(node, struct extent_status, + rb_node) : NULL; + } + while (es && es->es_lblk <= + EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) { + if (ext4_es_is_delonly(es)) { + rc->ndelonly--; + right_delonly = true; + break; + } + node = rb_next(&es->rb_node); + if (!node) + break; + es = rb_entry(node, struct extent_status, + rb_node); + } + } + + /* + * Determine the block range that should be searched for + * pending reservations, if any. Clusters on the ends of the + * original removed range containing delonly blocks are + * excluded. 
They've already been accounted for and it's not + * possible to determine if an associated pending reservation + * should be released with the information available in the + * extents status tree. + */ + if (first_lclu == last_lclu) { + if (left_delonly | right_delonly) + count_pending = false; + else + count_pending = true; + } else { + if (left_delonly) + first_lclu++; + if (right_delonly) + last_lclu--; + if (first_lclu <= last_lclu) + count_pending = true; + else + count_pending = false; + } + + /* + * a pending reservation found between first_lclu and last_lclu + * represents an allocated cluster that contained at least one + * delonly block, so the delonly total must be reduced by one + * for each pending reservation found and released + */ + if (count_pending) { + pr = __pr_tree_search(&tree->root, first_lclu); + while (pr && pr->lclu <= last_lclu) { + rc->ndelonly--; + node = rb_next(&pr->rb_node); + rb_erase(&pr->rb_node, &tree->root); + kmem_cache_free(ext4_pending_cachep, pr); + if (!node) + break; + pr = rb_entry(node, struct pending_reservation, + rb_node); + } + } + } + return rc->ndelonly; +} + + +/* + * __es_remove_extent - removes block range from extent status tree + * + * @inode - file containing range + * @lblk - first block in range + * @end - last block in range + * @reserved - number of cluster reservations released + * + * If @reserved is not NULL and delayed allocation is enabled, counts + * block/cluster reservations freed by removing range and if bigalloc + * enabled cancels pending reservations as needed. Returns 0 on success, + * error code on failure. 
+ */ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t end) + ext4_lblk_t end, int *reserved) { struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; struct rb_node *node; @@ -846,9 +1306,14 @@ ext4_lblk_t len1, len2; ext4_fsblk_t block; int err; + bool count_reserved = true; + struct rsvd_count rc; + if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC)) + count_reserved = false; retry: err = 0; + es = __es_tree_search(&tree->root, lblk); if (!es) goto out; @@ -857,6 +1322,8 @@ /* Simply invalidate cache_es. */ tree->cache_es = NULL; + if (count_reserved) + init_rsvd(inode, lblk, es, &rc); orig_es.es_lblk = es->es_lblk; orig_es.es_len = es->es_len; @@ -898,10 +1365,16 @@ ext4_es_store_pblock(es, block); } } - goto out; + if (count_reserved) + count_rsvd(inode, lblk, orig_es.es_len - len1 - len2, + &orig_es, &rc); + goto out_get_reserved; } if (len1 > 0) { + if (count_reserved) + count_rsvd(inode, lblk, orig_es.es_len - len1, + &orig_es, &rc); node = rb_next(&es->rb_node); if (node) es = rb_entry(node, struct extent_status, rb_node); @@ -910,6 +1383,8 @@ } while (es && ext4_es_end(es) <= end) { + if (count_reserved) + count_rsvd(inode, es->es_lblk, es->es_len, es, &rc); node = rb_next(&es->rb_node); rb_erase(&es->rb_node, &tree->root); ext4_es_free_extent(inode, es); @@ -924,6 +1399,9 @@ ext4_lblk_t orig_len = es->es_len; len1 = ext4_es_end(es) - end; + if (count_reserved) + count_rsvd(inode, es->es_lblk, orig_len - len1, + es, &rc); es->es_lblk = end + 1; es->es_len = len1; if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) { @@ -932,20 +1410,32 @@ } } +out_get_reserved: + if (count_reserved) + *reserved = get_rsvd(inode, end, es, &rc); out: return err; } /* - * ext4_es_remove_extent() removes a space from a extent status tree. + * ext4_es_remove_extent - removes block range from extent status tree * - * Return 0 on success, error code on failure. 
+ * @inode - file containing range + * @lblk - first block in range + * @len - number of blocks to remove + * + * Reduces block/cluster reservation count and for bigalloc cancels pending + * reservations as needed. Returns 0 on success, error code on failure. */ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len) { ext4_lblk_t end; int err = 0; + int reserved = 0; + + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return 0; trace_ext4_es_remove_extent(inode, lblk, len); es_debug("remove [%u/%u) from extent status tree of inode %lu\n", @@ -963,9 +1453,10 @@ * is reclaimed. */ write_lock(&EXT4_I(inode)->i_es_lock); - err = __es_remove_extent(inode, lblk, end); + err = __es_remove_extent(inode, lblk, end, &reserved); write_unlock(&EXT4_I(inode)->i_es_lock); ext4_es_print_tree(inode); + ext4_da_release_space(inode, reserved); return err; } @@ -1111,9 +1602,9 @@ seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt)); - seq_printf(seq, " %lu/%lu cache hits/misses\n", - es_stats->es_stats_cache_hits, - es_stats->es_stats_cache_misses); + seq_printf(seq, " %lld/%lld cache hits/misses\n", + percpu_counter_sum_positive(&es_stats->es_stats_cache_hits), + percpu_counter_sum_positive(&es_stats->es_stats_cache_misses)); if (inode_cnt) seq_printf(seq, " %d inodes on list\n", inode_cnt); @@ -1140,35 +1631,46 @@ sbi->s_es_nr_inode = 0; spin_lock_init(&sbi->s_es_lock); sbi->s_es_stats.es_stats_shrunk = 0; - sbi->s_es_stats.es_stats_cache_hits = 0; - sbi->s_es_stats.es_stats_cache_misses = 0; + err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_hits, 0, + GFP_KERNEL); + if (err) + return err; + err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_misses, 0, + GFP_KERNEL); + if (err) + goto err1; sbi->s_es_stats.es_stats_scan_time = 0; sbi->s_es_stats.es_stats_max_scan_time = 0; err = 
percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL); if (err) - return err; + goto err2; err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL); if (err) - goto err1; + goto err3; sbi->s_es_shrinker.scan_objects = ext4_es_scan; sbi->s_es_shrinker.count_objects = ext4_es_count; sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; err = register_shrinker(&sbi->s_es_shrinker); if (err) - goto err2; + goto err4; return 0; - -err2: +err4: percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); -err1: +err3: percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); +err2: + percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses); +err1: + percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits); return err; } void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) { + percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits); + percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses); percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); unregister_shrinker(&sbi->s_es_shrinker); @@ -1193,7 +1695,7 @@ es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk); if (!es) goto out_wrap; - node = &es->rb_node; + while (*nr_to_scan > 0) { if (es->es_lblk > end) { ei->i_es_shrink_lblk = end + 1; @@ -1250,3 +1752,440 @@ ei->i_es_tree.cache_es = NULL; return nr_shrunk; } + +/* + * Called to support EXT4_IOC_CLEAR_ES_CACHE. We can only remove + * discretionary entries from the extent status cache. (Some entries + * must be present for proper operations.) 
+ */ +void ext4_clear_inode_es(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct extent_status *es; + struct ext4_es_tree *tree; + struct rb_node *node; + + write_lock(&ei->i_es_lock); + tree = &EXT4_I(inode)->i_es_tree; + tree->cache_es = NULL; + node = rb_first(&tree->root); + while (node) { + es = rb_entry(node, struct extent_status, rb_node); + node = rb_next(node); + if (!ext4_es_is_delayed(es)) { + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(inode, es); + } + } + ext4_clear_inode_state(inode, EXT4_STATE_EXT_PRECACHED); + write_unlock(&ei->i_es_lock); +} + +#ifdef ES_DEBUG__ +static void ext4_print_pending_tree(struct inode *inode) +{ + struct ext4_pending_tree *tree; + struct rb_node *node; + struct pending_reservation *pr; + + printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino); + tree = &EXT4_I(inode)->i_pending_tree; + node = rb_first(&tree->root); + while (node) { + pr = rb_entry(node, struct pending_reservation, rb_node); + printk(KERN_DEBUG " %u", pr->lclu); + node = rb_next(node); + } + printk(KERN_DEBUG "\n"); +} +#else +#define ext4_print_pending_tree(inode) +#endif + +int __init ext4_init_pending(void) +{ + ext4_pending_cachep = kmem_cache_create("ext4_pending_reservation", + sizeof(struct pending_reservation), + 0, (SLAB_RECLAIM_ACCOUNT), NULL); + if (ext4_pending_cachep == NULL) + return -ENOMEM; + return 0; +} + +void ext4_exit_pending(void) +{ + kmem_cache_destroy(ext4_pending_cachep); +} + +void ext4_init_pending_tree(struct ext4_pending_tree *tree) +{ + tree->root = RB_ROOT; +} + +/* + * __get_pending - retrieve a pointer to a pending reservation + * + * @inode - file containing the pending cluster reservation + * @lclu - logical cluster of interest + * + * Returns a pointer to a pending reservation if it's a member of + * the set, and NULL if not. Must be called holding i_es_lock. 
+ */ +static struct pending_reservation *__get_pending(struct inode *inode, + ext4_lblk_t lclu) +{ + struct ext4_pending_tree *tree; + struct rb_node *node; + struct pending_reservation *pr = NULL; + + tree = &EXT4_I(inode)->i_pending_tree; + node = (&tree->root)->rb_node; + + while (node) { + pr = rb_entry(node, struct pending_reservation, rb_node); + if (lclu < pr->lclu) + node = node->rb_left; + else if (lclu > pr->lclu) + node = node->rb_right; + else if (lclu == pr->lclu) + return pr; + } + return NULL; +} + +/* + * __insert_pending - adds a pending cluster reservation to the set of + * pending reservations + * + * @inode - file containing the cluster + * @lblk - logical block in the cluster to be added + * + * Returns 0 on successful insertion and -ENOMEM on failure. If the + * pending reservation is already in the set, returns successfully. + */ +static int __insert_pending(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; + struct rb_node **p = &tree->root.rb_node; + struct rb_node *parent = NULL; + struct pending_reservation *pr; + ext4_lblk_t lclu; + int ret = 0; + + lclu = EXT4_B2C(sbi, lblk); + /* search to find parent for insertion */ + while (*p) { + parent = *p; + pr = rb_entry(parent, struct pending_reservation, rb_node); + + if (lclu < pr->lclu) { + p = &(*p)->rb_left; + } else if (lclu > pr->lclu) { + p = &(*p)->rb_right; + } else { + /* pending reservation already inserted */ + goto out; + } + } + + pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC); + if (pr == NULL) { + ret = -ENOMEM; + goto out; + } + pr->lclu = lclu; + + rb_link_node(&pr->rb_node, parent, p); + rb_insert_color(&pr->rb_node, &tree->root); + +out: + return ret; +} + +/* + * __remove_pending - removes a pending cluster reservation from the set + * of pending reservations + * + * @inode - file containing the cluster + * @lblk - logical block in the pending cluster 
reservation to be removed + * + * Returns successfully if pending reservation is not a member of the set. + */ +static void __remove_pending(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct pending_reservation *pr; + struct ext4_pending_tree *tree; + + pr = __get_pending(inode, EXT4_B2C(sbi, lblk)); + if (pr != NULL) { + tree = &EXT4_I(inode)->i_pending_tree; + rb_erase(&pr->rb_node, &tree->root); + kmem_cache_free(ext4_pending_cachep, pr); + } +} + +/* + * ext4_remove_pending - removes a pending cluster reservation from the set + * of pending reservations + * + * @inode - file containing the cluster + * @lblk - logical block in the pending cluster reservation to be removed + * + * Locking for external use of __remove_pending. + */ +void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + write_lock(&ei->i_es_lock); + __remove_pending(inode, lblk); + write_unlock(&ei->i_es_lock); +} + +/* + * ext4_is_pending - determine whether a cluster has a pending reservation + * on it + * + * @inode - file containing the cluster + * @lblk - logical block in the cluster + * + * Returns true if there's a pending reservation for the cluster in the + * set of pending reservations, and false if not. 
+ */ +bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + bool ret; + + read_lock(&ei->i_es_lock); + ret = (bool)(__get_pending(inode, EXT4_B2C(sbi, lblk)) != NULL); + read_unlock(&ei->i_es_lock); + + return ret; +} + +/* + * ext4_es_insert_delayed_block - adds a delayed block to the extents status + * tree, adding a pending reservation where + * needed + * + * @inode - file containing the newly added block + * @lblk - logical block to be added + * @allocated - indicates whether a physical cluster has been allocated for + * the logical cluster that contains the block + * + * Returns 0 on success, negative error code on failure. + */ +int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, + bool allocated) +{ + struct extent_status newes; + int err = 0; + + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return 0; + + es_debug("add [%u/1) delayed to extent status tree of inode %lu\n", + lblk, inode->i_ino); + + newes.es_lblk = lblk; + newes.es_len = 1; + ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED); + trace_ext4_es_insert_delayed_block(inode, &newes, allocated); + + ext4_es_insert_extent_check(inode, &newes); + + write_lock(&EXT4_I(inode)->i_es_lock); + + err = __es_remove_extent(inode, lblk, lblk, NULL); + if (err != 0) + goto error; +retry: + err = __es_insert_extent(inode, &newes); + if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb), + 128, EXT4_I(inode))) + goto retry; + if (err != 0) + goto error; + + if (allocated) + __insert_pending(inode, lblk); + +error: + write_unlock(&EXT4_I(inode)->i_es_lock); + + ext4_es_print_tree(inode); + ext4_print_pending_tree(inode); + + return err; +} + +/* + * __es_delayed_clu - count number of clusters containing blocks that + * are delayed only + * + * @inode - file containing block range + * @start - logical block defining start of range + * @end - logical block 
defining end of range + * + * Returns the number of clusters containing only delayed (not delayed + * and unwritten) blocks in the range specified by @start and @end. Any + * cluster or part of a cluster within the range and containing a delayed + * and not unwritten block within the range is counted as a whole cluster. + */ +static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end) +{ + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; + struct extent_status *es; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct rb_node *node; + ext4_lblk_t first_lclu, last_lclu; + unsigned long long last_counted_lclu; + unsigned int n = 0; + + /* guaranteed to be unequal to any ext4_lblk_t value */ + last_counted_lclu = ~0ULL; + + es = __es_tree_search(&tree->root, start); + + while (es && (es->es_lblk <= end)) { + if (ext4_es_is_delonly(es)) { + if (es->es_lblk <= start) + first_lclu = EXT4_B2C(sbi, start); + else + first_lclu = EXT4_B2C(sbi, es->es_lblk); + + if (ext4_es_end(es) >= end) + last_lclu = EXT4_B2C(sbi, end); + else + last_lclu = EXT4_B2C(sbi, ext4_es_end(es)); + + if (first_lclu == last_counted_lclu) + n += last_lclu - first_lclu; + else + n += last_lclu - first_lclu + 1; + last_counted_lclu = last_lclu; + } + node = rb_next(&es->rb_node); + if (!node) + break; + es = rb_entry(node, struct extent_status, rb_node); + } + + return n; +} + +/* + * ext4_es_delayed_clu - count number of clusters containing blocks that + * are delayed only + * + * @inode - file containing block range + * @lblk - logical block defining start of range + * @len - number of blocks in range + * + * Locking for external use of __es_delayed_clu(). 
+ */ +unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + ext4_lblk_t end; + unsigned int n; + + if (len == 0) + return 0; + + end = lblk + len - 1; + WARN_ON(end < lblk); + + read_lock(&ei->i_es_lock); + + n = __es_delayed_clu(inode, lblk, end); + + read_unlock(&ei->i_es_lock); + + return n; +} + +/* + * __revise_pending - makes, cancels, or leaves unchanged pending cluster + * reservations for a specified block range depending + * upon the presence or absence of delayed blocks + * outside the range within clusters at the ends of the + * range + * + * @inode - file containing the range + * @lblk - logical block defining the start of range + * @len - length of range in blocks + * + * Used after a newly allocated extent is added to the extents status tree. + * Requires that the extents in the range have either written or unwritten + * status. Must be called while holding i_es_lock. + */ +static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t end = lblk + len - 1; + ext4_lblk_t first, last; + bool f_del = false, l_del = false; + + if (len == 0) + return; + + /* + * Two cases - block range within single cluster and block range + * spanning two or more clusters. Note that a cluster belonging + * to a range starting and/or ending on a cluster boundary is treated + * as if it does not contain a delayed extent. The new range may + * have allocated space for previously delayed blocks out to the + * cluster boundary, requiring that any pre-existing pending + * reservation be canceled. Because this code only looks at blocks + * outside the range, it should revise pending reservations + * correctly even if the extent represented by the range can't be + * inserted in the extents status tree due to ENOSPC. 
+ */ + + if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) { + first = EXT4_LBLK_CMASK(sbi, lblk); + if (first != lblk) + f_del = __es_scan_range(inode, &ext4_es_is_delonly, + first, lblk - 1); + if (f_del) { + __insert_pending(inode, first); + } else { + last = EXT4_LBLK_CMASK(sbi, end) + + sbi->s_cluster_ratio - 1; + if (last != end) + l_del = __es_scan_range(inode, + &ext4_es_is_delonly, + end + 1, last); + if (l_del) + __insert_pending(inode, last); + else + __remove_pending(inode, last); + } + } else { + first = EXT4_LBLK_CMASK(sbi, lblk); + if (first != lblk) + f_del = __es_scan_range(inode, &ext4_es_is_delonly, + first, lblk - 1); + if (f_del) + __insert_pending(inode, first); + else + __remove_pending(inode, first); + + last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1; + if (last != end) + l_del = __es_scan_range(inode, &ext4_es_is_delonly, + end + 1, last); + if (l_del) + __insert_pending(inode, last); + else + __remove_pending(inode, last); + } +} -- Gitblit v1.6.2