From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Tue, 20 Feb 2024 01:20:52 +0000 Subject: [PATCH] add new system file --- kernel/mm/vmstat.c | 259 +++++++++++++++++++++++++++++---------------------- 1 files changed, 148 insertions(+), 111 deletions(-) diff --git a/kernel/mm/vmstat.c b/kernel/mm/vmstat.c index d139fd9..604a993 100644 --- a/kernel/mm/vmstat.c +++ b/kernel/mm/vmstat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/vmstat.c * @@ -75,7 +76,7 @@ static DEFINE_MUTEX(vm_numa_stat_lock); int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int ret, oldval; @@ -227,7 +228,7 @@ * 125 1024 10 16-32 GB 9 */ - mem = zone->managed_pages >> (27 - PAGE_SHIFT); + mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT); threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); @@ -324,7 +325,7 @@ t = __this_cpu_read(pcp->stat_threshold); - if (unlikely(x > t || x < -t)) { + if (unlikely(abs(x) > t)) { zone_page_state_add(x, zone, item); x = 0; } @@ -340,11 +341,16 @@ long x; long t; + if (vmstat_item_in_bytes(item)) { + VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); + delta >>= PAGE_SHIFT; + } + x = delta + __this_cpu_read(*p); t = __this_cpu_read(pcp->stat_threshold); - if (unlikely(x > t || x < -t)) { + if (unlikely(abs(x) > t)) { node_page_state_add(x, pgdat, item); x = 0; } @@ -397,6 +403,8 @@ s8 __percpu *p = pcp->vm_node_stat_diff + item; s8 v, t; + VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); + v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v > t)) { @@ -440,6 +448,8 @@ struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; s8 __percpu *p = pcp->vm_node_stat_diff + item; s8 v, t; + + VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -501,7 +511,7 @@ o = this_cpu_read(*p); n = delta + o; - if (n > t || n < -t) { + if (abs(n) > t) { int os = overstep_mode * (t >> 1) ; /* Overflow must be added to zone counters */ @@ -540,6 +550,11 @@ s8 __percpu *p = pcp->vm_node_stat_diff + item; long o, n, t, z; + if (vmstat_item_in_bytes(item)) { + VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); + delta >>= PAGE_SHIFT; + } + do { z = 0; /* overflow to node counters */ @@ -558,7 +573,7 @@ o = this_cpu_read(*p); n = delta + o; - if (n > t || n < -t) { + if (abs(n) > t) { int os = overstep_mode * (t >> 1) ; /* Overflow must be added to node counters */ @@ -988,8 +1003,8 @@ /* * Determine the per node value of a stat item. */ -unsigned long node_page_state(struct pglist_data *pgdat, - enum node_stat_item item) +unsigned long node_page_state_pages(struct pglist_data *pgdat, + enum node_stat_item item) { long x = atomic_long_read(&pgdat->vm_stat[item]); #ifdef CONFIG_SMP @@ -997,6 +1012,14 @@ x = 0; #endif return x; +} + +unsigned long node_page_state(struct pglist_data *pgdat, + enum node_stat_item item) +{ + VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); + + return node_page_state_pages(pgdat, item); } #endif @@ -1073,6 +1096,24 @@ return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total); } +/* + * Calculates external fragmentation within a zone wrt the given order. + * It is defined as the percentage of pages found in blocks of size + * less than 1 << order. It returns values in range [0, 100]. + */ +unsigned int extfrag_for_order(struct zone *zone, unsigned int order) +{ + struct contig_page_info info; + + fill_contig_page_info(zone, order, &info); + if (info.free_pages == 0) + return 0; + + return div_u64((info.free_pages - + (info.free_blocks_suitable << order)) * 100, + info.free_pages); +} + /* Same as __fragmentation index but allocs contig_page_info on stack */ int fragmentation_index(struct zone *zone, unsigned int order) { @@ -1083,7 +1124,8 @@ } #endif -#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA) +#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \ + defined(CONFIG_NUMA) || defined(CONFIG_MEMCG) #ifdef CONFIG_ZONE_DMA #define TEXT_FOR_DMA(xx) xx "_dma", #else @@ -1106,7 +1148,7 @@ TEXT_FOR_HIGHMEM(xx) xx "_movable", const char * const vmstat_text[] = { - /* enum zone_stat_item countes */ + /* enum zone_stat_item counters */ "nr_free_pages", "nr_zone_inactive_anon", "nr_zone_active_anon", @@ -1116,14 +1158,8 @@ "nr_zone_write_pending", "nr_mlock", "nr_page_table_pages", - "nr_kernel_stack", -#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) - "nr_shadow_call_stack_bytes", -#endif "nr_bounce", -#if IS_ENABLED(CONFIG_ZSMALLOC) "nr_zspages", -#endif "nr_free_cma", /* enum numa_stat_item counters */ @@ -1136,7 +1172,7 @@ "numa_other", #endif - /* Node-based counters */ + /* enum node_stat_item counters */ "nr_inactive_anon", "nr_active_anon", "nr_inactive_file", @@ -1146,9 +1182,13 @@ "nr_slab_unreclaimable", "nr_isolated_anon", "nr_isolated_file", - "workingset_refault", - "workingset_activate", - "workingset_restore", + "workingset_nodes", + "workingset_refault_anon", + "workingset_refault_file", + "workingset_activate_anon", + "workingset_activate_file", + "workingset_restore_anon", + "workingset_restore_file", "workingset_nodereclaim", "nr_anon_pages", "nr_mapped", @@ -1159,28 +1199,29 @@ "nr_shmem", "nr_shmem_hugepages", "nr_shmem_pmdmapped", + "nr_file_hugepages", + "nr_file_pmdmapped", "nr_anon_transparent_hugepages", - "nr_unstable", "nr_vmscan_write", "nr_vmscan_immediate_reclaim", "nr_dirtied", "nr_written", "nr_kernel_misc_reclaimable", - "nr_unreclaimable_pages", + "nr_foll_pin_acquired", + "nr_foll_pin_released", + "nr_kernel_stack", +#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) + "nr_shadow_call_stack", +#endif - - "nr_ion_heap", - "nr_ion_heap_pool", - "nr_gpu_heap", /* enum writeback_stat_item counters */ "nr_dirty_threshold", "nr_dirty_background_threshold", -#ifdef CONFIG_VM_EVENT_COUNTERS +#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG) /* enum vm_event_item counters */ "pgpgin", "pgpgout", - "pgpgoutclean", "pswpin", "pswpout", @@ -1198,11 +1239,16 @@ "pglazyfreed", "pgrefill", + "pgreuse", "pgsteal_kswapd", "pgsteal_direct", "pgscan_kswapd", "pgscan_direct", "pgscan_direct_throttle", + "pgscan_anon", + "pgscan_file", + "pgsteal_anon", + "pgsteal_file", #ifdef CONFIG_NUMA "zone_reclaim_failed", @@ -1230,6 +1276,9 @@ #ifdef CONFIG_MIGRATION "pgmigrate_success", "pgmigrate_fail", + "thp_migration_success", + "thp_migration_fail", + "thp_migration_split", #endif #ifdef CONFIG_COMPACTION "compact_migrate_scanned", @@ -1247,6 +1296,10 @@ "htlb_buddy_alloc_success", "htlb_buddy_alloc_fail", #endif +#ifdef CONFIG_CMA + "cma_alloc_success", + "cma_alloc_fail", +#endif "unevictable_pgs_culled", "unevictable_pgs_scanned", "unevictable_pgs_rescued", @@ -1258,9 +1311,12 @@ #ifdef CONFIG_TRANSPARENT_HUGEPAGE "thp_fault_alloc", "thp_fault_fallback", + "thp_fault_fallback_charge", "thp_collapse_alloc", "thp_collapse_alloc_failed", "thp_file_alloc", + "thp_file_fallback", + "thp_file_fallback_charge", "thp_file_mapped", "thp_split_page", "thp_split_page_failed", @@ -1296,9 +1352,13 @@ "swap_ra", "swap_ra_hit", #endif -#endif /* CONFIG_VM_EVENTS_COUNTERS */ +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + "speculative_pgfault", + "speculative_pgfault_file" +#endif +#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ }; -#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ +#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */ #if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \ defined(CONFIG_PROC_FS) @@ -1388,12 +1448,26 @@ unsigned long freecount = 0; struct free_area *area; struct list_head *curr; + bool overflow = false; area = &(zone->free_area[order]); - list_for_each(curr, &area->free_list[mtype]) - freecount++; - seq_printf(m, "%6lu ", freecount); + list_for_each(curr, &area->free_list[mtype]) { + /* + * Cap the free_list iteration because it might + * be really large and we are under a spinlock + * so a long time spent here could trigger a + * hard lockup detector. Anyway this is a + * debugging tool so knowing there is a handful + * of pages of this order should be more than + * sufficient. + */ + if (++freecount >= 100000) { + overflow = true; + break; + } + } + seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount); spin_unlock_irq(&zone->lock); cond_resched(); spin_lock_irq(&zone->lock); @@ -1433,10 +1507,6 @@ page = pfn_to_online_page(pfn); if (!page) - continue; - - /* Watch for unexpected holes punched in the memmap */ - if (!memmap_valid_within(pfn, page, zone)) continue; if (page_zone(page) != zone) @@ -1555,14 +1625,8 @@ if (is_zone_first_populated(pgdat, zone)) { seq_printf(m, "\n per-node stats"); for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { - /* Skip hidden vmstat items. */ - if (*vmstat_text[i + NR_VM_ZONE_STAT_ITEMS + - NR_VM_NUMA_STAT_ITEMS] == '\0') - continue; - seq_printf(m, "\n %-12s %lu", - vmstat_text[i + NR_VM_ZONE_STAT_ITEMS + - NR_VM_NUMA_STAT_ITEMS], - node_page_state(pgdat, i)); + seq_printf(m, "\n %-12s %lu", node_stat_name(i), + node_page_state_pages(pgdat, i)); } } seq_printf(m, @@ -1572,14 +1636,16 @@ "\n high %lu" "\n spanned %lu" "\n present %lu" - "\n managed %lu", + "\n managed %lu" + "\n cma %lu", zone_page_state(zone, NR_FREE_PAGES), min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), zone->spanned_pages, zone->present_pages, - zone->managed_pages); + zone_managed_pages(zone), + zone_cma_pages(zone)); seq_printf(m, "\n protection: (%ld", @@ -1595,14 +1661,13 @@ } for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - seq_printf(m, "\n %-12s %lu", vmstat_text[i], - zone_page_state(zone, i)); + seq_printf(m, "\n %-12s %lu", zone_stat_name(i), + zone_page_state(zone, i)); #ifdef CONFIG_NUMA for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) - seq_printf(m, "\n %-12s %lu", - vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], - zone_numa_state_snapshot(zone, i)); + seq_printf(m, "\n %-12s %lu", numa_stat_name(i), + zone_numa_state_snapshot(zone, i)); #endif seq_printf(m, "\n pagesets"); @@ -1653,29 +1718,23 @@ .show = zoneinfo_show, }; -enum writeback_stat_item { - NR_DIRTY_THRESHOLD, - NR_DIRTY_BG_THRESHOLD, - NR_VM_WRITEBACK_STAT_ITEMS, -}; +#define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \ + NR_VM_NUMA_STAT_ITEMS + \ + NR_VM_NODE_STAT_ITEMS + \ + NR_VM_WRITEBACK_STAT_ITEMS + \ + (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \ + NR_VM_EVENT_ITEMS : 0)) static void *vmstat_start(struct seq_file *m, loff_t *pos) { unsigned long *v; - int i, stat_items_size; + int i; - if (*pos >= ARRAY_SIZE(vmstat_text)) + if (*pos >= NR_VMSTAT_ITEMS) return NULL; - stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + - NR_VM_NUMA_STAT_ITEMS * sizeof(unsigned long) + - NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) + - NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long); -#ifdef CONFIG_VM_EVENT_COUNTERS - stat_items_size += sizeof(struct vm_event_state); -#endif - - v = kmalloc(stat_items_size, GFP_KERNEL); + BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS); + v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL); m->private = v; if (!v) return ERR_PTR(-ENOMEM); @@ -1690,7 +1749,7 @@ #endif for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) - v[i] = global_node_page_state(i); + v[i] = global_node_page_state_pages(i); v += NR_VM_NODE_STAT_ITEMS; global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD, @@ -1708,10 +1767,7 @@ static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) { (*pos)++; - //nr_gpu_heap is out-of-tree now so we don't want to export it. - if (*pos == NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_STAT_ITEMS + NR_GPU_HEAP) - (*pos)++; - if (*pos >= ARRAY_SIZE(vmstat_text)) + if (*pos >= NR_VMSTAT_ITEMS) return NULL; return (unsigned long *)m->private + *pos; } @@ -1724,6 +1780,14 @@ seq_puts(m, vmstat_text[off]); seq_put_decimal_ull(m, " ", *l); seq_putc(m, '\n'); + + if (off == NR_VMSTAT_ITEMS - 1) { + /* + * We've come to the end - add any deprecated counters to avoid + * breaking userspace which might depend on them being present. + */ + seq_puts(m, "nr_unstable 0\n"); + } return 0; } @@ -1752,7 +1816,7 @@ } int vmstat_refresh(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { long val; int err; @@ -1777,7 +1841,7 @@ val = atomic_long_read(&vm_zone_stat[i]); if (val < 0) { pr_warn("%s: %s %ld\n", - __func__, vmstat_text[i], val); + __func__, zone_stat_name(i), val); err = -EINVAL; } } @@ -1786,7 +1850,7 @@ val = atomic_long_read(&vm_numa_stat[i]); if (val < 0) { pr_warn("%s: %s %ld\n", - __func__, vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], val); + __func__, numa_stat_name(i), val); err = -EINVAL; } } @@ -2056,24 +2120,14 @@ return 0; } -static const struct seq_operations unusable_op = { +static const struct seq_operations unusable_sops = { .start = frag_start, .next = frag_next, .stop = frag_stop, .show = unusable_show, }; -static int unusable_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &unusable_op); -} - -static const struct file_operations unusable_file_ops = { - .open = unusable_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(unusable); static void extfrag_show_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) @@ -2108,45 +2162,28 @@ return 0; } -static const struct seq_operations extfrag_op = { +static const struct seq_operations extfrag_sops = { .start = frag_start, .next = frag_next, .stop = frag_stop, .show = extfrag_show, }; -static int extfrag_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &extfrag_op); -} - -static const struct file_operations extfrag_file_ops = { - .open = extfrag_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(extfrag); static int __init extfrag_debug_init(void) { struct dentry *extfrag_debug_root; extfrag_debug_root = debugfs_create_dir("extfrag", NULL); - if (!extfrag_debug_root) - return -ENOMEM; - if (!debugfs_create_file("unusable_index", 0444, - extfrag_debug_root, NULL, &unusable_file_ops)) - goto fail; + debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL, + &unusable_fops); - if (!debugfs_create_file("extfrag_index", 0444, - extfrag_debug_root, NULL, &extfrag_file_ops)) - goto fail; + debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL, + &extfrag_fops); return 0; -fail: - debugfs_remove_recursive(extfrag_debug_root); - return -ENOMEM; } module_init(extfrag_debug_init); -- Gitblit v1.6.2