2023-11-20 2e7bd41e4e8ab3d1efdabd9e263a2f7fe79bff8c
--- a/kernel/mm/page_alloc.c
+++ b/kernel/mm/page_alloc.c
@@ -60,6 +60,7 @@
 #include <linux/hugetlb.h>
 #include <linux/sched/rt.h>
 #include <linux/sched/mm.h>
+#include <linux/locallock.h>
 #include <linux/page_owner.h>
 #include <linux/kthread.h>
 #include <linux/memcontrol.h>
@@ -352,6 +353,18 @@
 int nr_online_nodes __read_mostly = 1;
 EXPORT_SYMBOL(nr_node_ids);
 EXPORT_SYMBOL(nr_online_nodes);
+#endif
+
+static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
+
+#ifdef CONFIG_PREEMPT_RT_BASE
+# define cpu_lock_irqsave(cpu, flags)		\
+	local_lock_irqsave_on(pa_lock, flags, cpu)
+# define cpu_unlock_irqrestore(cpu, flags)	\
+	local_unlock_irqrestore_on(pa_lock, flags, cpu)
+#else
+# define cpu_lock_irqsave(cpu, flags)		local_irq_save(flags)
+# define cpu_unlock_irqrestore(cpu, flags)	local_irq_restore(flags)
 #endif
 
 int page_group_by_mobility_disabled __read_mostly;
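
A note on the primitives introduced above: DEFINE_LOCAL_IRQ_LOCK() and the local_lock_irqsave*() helpers come from the RT tree's <linux/locallock.h>, which the first hunk includes. Roughly speaking, on PREEMPT_RT they guard the per-CPU page-allocator state with a real per-CPU lock, so another CPU may acquire it remotely (which is what cpu_lock_irqsave(cpu, flags) relies on), while on !RT they collapse to plain local_irq_save()/local_irq_restore() and exclusion comes from running on the target CPU with interrupts off. The userspace sketch below only models that difference; NCPUS, struct pcp_model and the helper names are invented for illustration and are not kernel APIs.

/*
 * Sketch: a per-"CPU" lock lets one caller drain any CPU's cache directly,
 * which is why the RT configuration in this patch can skip the
 * workqueue-based drain.  Invented names, not kernel code.
 */
#include <pthread.h>
#include <stdio.h>

#define NCPUS 4

struct pcp_model {
	pthread_mutex_t lock;	/* stands in for this CPU's pa_lock */
	int count;		/* stands in for pcp->count */
};

static struct pcp_model pcp[NCPUS];

static void cpu_lock(int cpu)   { pthread_mutex_lock(&pcp[cpu].lock); }
static void cpu_unlock(int cpu) { pthread_mutex_unlock(&pcp[cpu].lock); }

static void drain_cpu(int cpu)
{
	cpu_lock(cpu);		/* RT: take the remote CPU's lock */
	printf("drained %d pages from cpu %d\n", pcp[cpu].count, cpu);
	pcp[cpu].count = 0;
	cpu_unlock(cpu);
}

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++) {
		pthread_mutex_init(&pcp[cpu].lock, NULL);
		pcp[cpu].count = cpu * 10;
	}
	/* One caller can walk every CPU; no IPIs or workqueue needed. */
	for (cpu = 0; cpu < NCPUS; cpu++)
		drain_cpu(cpu);
	return 0;
}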
@@ -1172,7 +1185,7 @@
 }
 
 /*
- * Frees a number of pages from the PCP lists
+ * Frees a number of pages which have been collected from the pcp lists.
  * Assumes all pages on list are in same zone, and of same order.
  * count is the number of pages to free.
  *
@@ -1182,15 +1195,57 @@
  * And clear the zone's pages_scanned counter, to hold off the "all pages are
  * pinned" detection logic.
  */
-static void free_pcppages_bulk(struct zone *zone, int count,
-					struct per_cpu_pages *pcp)
+static void free_pcppages_bulk(struct zone *zone, struct list_head *head,
+			       bool zone_retry)
+{
+	bool isolated_pageblocks;
+	struct page *page, *tmp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&zone->lock, flags);
+	isolated_pageblocks = has_isolate_pageblock(zone);
+
+	/*
+	 * Use safe version since after __free_one_page(),
+	 * page->lru.next will not point to original list.
+	 */
+	list_for_each_entry_safe(page, tmp, head, lru) {
+		int mt = get_pcppage_migratetype(page);
+
+		if (page_zone(page) != zone) {
+			/*
+			 * free_unref_page_list() sorts pages by zone. If we end
+			 * up with pages from a different NUMA nodes belonging
+			 * to the same ZONE index then we need to redo with the
+			 * correct ZONE pointer. Skip the page for now, redo it
+			 * on the next iteration.
+			 */
+			WARN_ON_ONCE(zone_retry == false);
+			if (zone_retry)
+				continue;
+		}
+
+		/* MIGRATE_ISOLATE page should not go to pcplists */
+		VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
+		/* Pageblock could have been isolated meanwhile */
+		if (unlikely(isolated_pageblocks))
+			mt = get_pageblock_migratetype(page);
+
+		list_del(&page->lru);
+		__free_one_page(page, page_to_pfn(page), zone, 0, mt);
+		trace_mm_page_pcpu_drain(page, 0, mt);
+	}
+	spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+static void isolate_pcp_pages(int count, struct per_cpu_pages *pcp,
+			      struct list_head *dst)
+
 {
 	int migratetype = 0;
 	int batch_free = 0;
 	int prefetch_nr = 0;
-	bool isolated_pageblocks;
-	struct page *page, *tmp;
-	LIST_HEAD(head);
+	struct page *page;
 
 	/*
 	 * Ensure proper count is passed which otherwise would stuck in the
@@ -1227,7 +1282,7 @@
 		if (bulkfree_pcp_prepare(page))
 			continue;
 
-		list_add_tail(&page->lru, &head);
+		list_add_tail(&page->lru, dst);
 
 		/*
 		 * We are going to put the page back to the global
@@ -1242,26 +1297,6 @@
 			prefetch_buddy(page);
 	} while (--count && --batch_free && !list_empty(list));
 	}
-
-	spin_lock(&zone->lock);
-	isolated_pageblocks = has_isolate_pageblock(zone);
-
-	/*
-	 * Use safe version since after __free_one_page(),
-	 * page->lru.next will not point to original list.
-	 */
-	list_for_each_entry_safe(page, tmp, &head, lru) {
-		int mt = get_pcppage_migratetype(page);
-		/* MIGRATE_ISOLATE page should not go to pcplists */
-		VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
-		/* Pageblock could have been isolated meanwhile */
-		if (unlikely(isolated_pageblocks))
-			mt = get_pageblock_migratetype(page);
-
-		__free_one_page(page, page_to_pfn(page), zone, 0, mt);
-		trace_mm_page_pcpu_drain(page, 0, mt);
-	}
-	spin_unlock(&zone->lock);
 }
 
 static void free_one_page(struct zone *zone,
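
The two hunks above are the core of the restructuring: the old free_pcppages_bulk() both unlinked pages from the pcp lists and returned them to the buddy allocator in one pass with interrupts disabled. It is now split into isolate_pcp_pages(), which only moves up to count pages onto a caller-supplied list while the per-CPU lock (or IRQ-off section) is held, and a new free_pcppages_bulk(), which later returns that private list to the buddy allocator under zone->lock with its own irqsave. The standalone sketch below is only a model of that two-phase pattern, not kernel code; struct item, isolate_items() and free_items_bulk() are invented names.

/*
 * Phase 1 detaches work items onto a private list under the "pcp" lock;
 * phase 2 processes them under the "zone" lock with the pcp lock already
 * dropped.  Userspace model of the split above, invented names.
 */
#include <pthread.h>
#include <stdio.h>

struct item { struct item *next; int id; };

static pthread_mutex_t pcp_lock  = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t zone_lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *pcp_list;	/* per-cpu cache, guarded by pcp_lock */

/* Phase 1: like isolate_pcp_pages() -- move up to @count items away. */
static struct item *isolate_items(int count)
{
	struct item *dst = NULL;

	pthread_mutex_lock(&pcp_lock);
	while (count-- && pcp_list) {
		struct item *it = pcp_list;

		pcp_list = it->next;
		it->next = dst;
		dst = it;
	}
	pthread_mutex_unlock(&pcp_lock);
	return dst;
}

/* Phase 2: like free_pcppages_bulk() -- hand items back under zone_lock. */
static void free_items_bulk(struct item *head)
{
	pthread_mutex_lock(&zone_lock);
	for (; head; head = head->next)
		printf("freeing item %d to the buddy allocator\n", head->id);
	pthread_mutex_unlock(&zone_lock);
}

int main(void)
{
	static struct item items[3];
	int i;

	for (i = 0; i < 3; i++) {
		items[i].id = i;
		items[i].next = pcp_list;
		pcp_list = &items[i];
	}
	free_items_bulk(isolate_items(2));
	return 0;
}

The point of the split is that the pcp-side lock is dropped before the potentially long walk under the zone lock, which keeps the local-lock (and, on !RT, the IRQ-off) section short.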
@@ -1363,10 +1398,10 @@
 		return;
 
 	migratetype = get_pfnblock_migratetype(page, pfn);
-	local_irq_save(flags);
+	local_lock_irqsave(pa_lock, flags);
 	__count_vm_events(PGFREE, 1 << order);
 	free_one_page(page_zone(page), page, pfn, order, migratetype);
-	local_irq_restore(flags);
+	local_unlock_irqrestore(pa_lock, flags);
 }
 
 static void __init __free_pages_boot_core(struct page *page, unsigned int order)
@@ -2673,13 +2708,18 @@
 {
 	unsigned long flags;
 	int to_drain, batch;
+	LIST_HEAD(dst);
 
-	local_irq_save(flags);
+	local_lock_irqsave(pa_lock, flags);
 	batch = READ_ONCE(pcp->batch);
 	to_drain = min(pcp->count, batch);
 	if (to_drain > 0)
-		free_pcppages_bulk(zone, to_drain, pcp);
-	local_irq_restore(flags);
+		isolate_pcp_pages(to_drain, pcp, &dst);
+
+	local_unlock_irqrestore(pa_lock, flags);
+
+	if (to_drain > 0)
+		free_pcppages_bulk(zone, &dst, false);
 }
 #endif
 
@@ -2695,14 +2735,21 @@
 	unsigned long flags;
 	struct per_cpu_pageset *pset;
 	struct per_cpu_pages *pcp;
+	LIST_HEAD(dst);
+	int count;
 
-	local_irq_save(flags);
+	cpu_lock_irqsave(cpu, flags);
 	pset = per_cpu_ptr(zone->pageset, cpu);
 
 	pcp = &pset->pcp;
-	if (pcp->count)
-		free_pcppages_bulk(zone, pcp->count, pcp);
-	local_irq_restore(flags);
+	count = pcp->count;
+	if (count)
+		isolate_pcp_pages(count, pcp, &dst);
+
+	cpu_unlock_irqrestore(cpu, flags);
+
+	if (count)
+		free_pcppages_bulk(zone, &dst, false);
 }
 
 /*
@@ -2737,6 +2784,7 @@
 	drain_pages(cpu);
 }
 
+#ifndef CONFIG_PREEMPT_RT_BASE
 static void drain_local_pages_wq(struct work_struct *work)
 {
 	/*
@@ -2750,6 +2798,7 @@
 	drain_local_pages(NULL);
 	preempt_enable();
 }
+#endif
 
 /*
  * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
@@ -2816,7 +2865,14 @@
 		else
 			cpumask_clear_cpu(cpu, &cpus_with_pcps);
 	}
-
+#ifdef CONFIG_PREEMPT_RT_BASE
+	for_each_cpu(cpu, &cpus_with_pcps) {
+		if (zone)
+			drain_pages_zone(cpu, zone);
+		else
+			drain_pages(cpu);
+	}
+#else
 	for_each_cpu(cpu, &cpus_with_pcps) {
 		struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
 		INIT_WORK(work, drain_local_pages_wq);
@@ -2824,6 +2880,7 @@
 	}
 	for_each_cpu(cpu, &cpus_with_pcps)
 		flush_work(per_cpu_ptr(&pcpu_drain, cpu));
+#endif
 
 	mutex_unlock(&pcpu_drain_mutex);
 }
@@ -2895,7 +2952,8 @@
 	return true;
 }
 
-static void free_unref_page_commit(struct page *page, unsigned long pfn)
+static void free_unref_page_commit(struct page *page, unsigned long pfn,
+				   struct list_head *dst)
 {
 	struct zone *zone = page_zone(page);
 	struct per_cpu_pages *pcp;
@@ -2924,7 +2982,8 @@
 	pcp->count++;
 	if (pcp->count >= pcp->high) {
 		unsigned long batch = READ_ONCE(pcp->batch);
-		free_pcppages_bulk(zone, batch, pcp);
+
+		isolate_pcp_pages(batch, pcp, dst);
 	}
 }
 
@@ -2935,13 +2994,17 @@
 {
 	unsigned long flags;
 	unsigned long pfn = page_to_pfn(page);
+	struct zone *zone = page_zone(page);
+	LIST_HEAD(dst);
 
 	if (!free_unref_page_prepare(page, pfn))
 		return;
 
-	local_irq_save(flags);
-	free_unref_page_commit(page, pfn);
-	local_irq_restore(flags);
+	local_lock_irqsave(pa_lock, flags);
+	free_unref_page_commit(page, pfn, &dst);
+	local_unlock_irqrestore(pa_lock, flags);
+	if (!list_empty(&dst))
+		free_pcppages_bulk(zone, &dst, false);
 }
 
 /*
@@ -2952,6 +3015,11 @@
 	struct page *page, *next;
 	unsigned long flags, pfn;
 	int batch_count = 0;
+	struct list_head dsts[__MAX_NR_ZONES];
+	int i;
+
+	for (i = 0; i < __MAX_NR_ZONES; i++)
+		INIT_LIST_HEAD(&dsts[i]);
 
 	/* Prepare pages for freeing */
 	list_for_each_entry_safe(page, next, list, lru) {
@@ -2961,25 +3029,42 @@
 		set_page_private(page, pfn);
 	}
 
-	local_irq_save(flags);
+	local_lock_irqsave(pa_lock, flags);
 	list_for_each_entry_safe(page, next, list, lru) {
 		unsigned long pfn = page_private(page);
+		enum zone_type type;
 
 		set_page_private(page, 0);
 		trace_mm_page_free_batched(page);
-		free_unref_page_commit(page, pfn);
+		type = page_zonenum(page);
+		free_unref_page_commit(page, pfn, &dsts[type]);
 
 		/*
 		 * Guard against excessive IRQ disabled times when we get
 		 * a large list of pages to free.
 		 */
 		if (++batch_count == SWAP_CLUSTER_MAX) {
-			local_irq_restore(flags);
+			local_unlock_irqrestore(pa_lock, flags);
 			batch_count = 0;
-			local_irq_save(flags);
+			local_lock_irqsave(pa_lock, flags);
 		}
 	}
-	local_irq_restore(flags);
+	local_unlock_irqrestore(pa_lock, flags);
+
+	for (i = 0; i < __MAX_NR_ZONES; ) {
+		struct page *page;
+		struct zone *zone;
+
+		if (list_empty(&dsts[i])) {
+			i++;
+			continue;
+		}
+
+		page = list_first_entry(&dsts[i], struct page, lru);
+		zone = page_zone(page);
+
+		free_pcppages_bulk(zone, &dsts[i], true);
+	}
 }
 
 /*
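
One subtlety in the free_unref_page_list() hunk above: pages are bucketed by zone type (page_zonenum()), but zones of the same type on different NUMA nodes share that index, so a bucket can still mix struct zones. The new free_pcppages_bulk() is therefore called with zone_retry == true; it frees only the pages whose page_zone() matches, leaves the rest on the list, and the outer loop keeps the same bucket index until the list is empty. A rough userspace model of that retry loop (invented names: struct fpage, drain_bucket()) might look like this:

/*
 * Model only, not kernel code: "zone" stands for page_zone() (node + type),
 * while the single bucket stands for one zone type shared by two nodes.
 */
#include <stdio.h>

struct fpage { int zone; int id; struct fpage *next; };

/* Free the pages of @head that belong to @zone; keep the rest queued. */
static struct fpage *drain_bucket(struct fpage *head, int zone)
{
	struct fpage *keep = NULL, *next;

	for (; head; head = next) {
		next = head->next;
		if (head->zone != zone) {	/* wrong zone: retry later */
			head->next = keep;
			keep = head;
			continue;
		}
		printf("freeing page %d to zone %d\n", head->id, zone);
	}
	return keep;
}

int main(void)
{
	struct fpage pages[3] = {
		{ .zone = 0, .id = 0 },
		{ .zone = 1, .id = 1 },
		{ .zone = 0, .id = 2 },
	};
	struct fpage *bucket = NULL;
	int i;

	/* Two actual zones (0 and 1) of the same type land in one bucket. */
	for (i = 0; i < 3; i++) {
		pages[i].next = bucket;
		bucket = &pages[i];
	}
	while (bucket)	/* mirrors "do not advance i until empty" */
		bucket = drain_bucket(bucket, bucket->zone);
	return 0;
}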
@@ -3124,7 +3209,7 @@
 	struct page *page;
 	unsigned long flags;
 
-	local_irq_save(flags);
+	local_lock_irqsave(pa_lock, flags);
 	pcp = &this_cpu_ptr(zone->pageset)->pcp;
 	page = __rmqueue_pcplist(zone, migratetype, pcp,
 				 gfp_flags);
@@ -3132,7 +3217,7 @@
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
 		zone_statistics(preferred_zone, zone);
 	}
-	local_irq_restore(flags);
+	local_unlock_irqrestore(pa_lock, flags);
 	return page;
 }
 
@@ -3159,7 +3244,7 @@
 	 * allocate greater than order-1 page units with __GFP_NOFAIL.
 	 */
 	WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
-	spin_lock_irqsave(&zone->lock, flags);
+	local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
 
 	do {
 		page = NULL;
@@ -3186,14 +3271,14 @@
 
 	__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
 	zone_statistics(preferred_zone, zone);
-	local_irq_restore(flags);
+	local_unlock_irqrestore(pa_lock, flags);
 
 out:
 	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
 	return page;
 
 failed:
-	local_irq_restore(flags);
+	local_unlock_irqrestore(pa_lock, flags);
 	return NULL;
 }
 
@@ -7333,8 +7418,9 @@
 
 static int page_alloc_cpu_dead(unsigned int cpu)
 {
-
+	local_lock_irq_on(swapvec_lock, cpu);
 	lru_add_drain_cpu(cpu);
+	local_unlock_irq_on(swapvec_lock, cpu);
 	drain_pages(cpu);
 
 	/*
@@ -8257,7 +8343,7 @@
 	struct per_cpu_pageset *pset;
 
 	/* avoid races with drain_pages() */
-	local_irq_save(flags);
+	local_lock_irqsave(pa_lock, flags);
 	if (zone->pageset != &boot_pageset) {
 		for_each_online_cpu(cpu) {
 			pset = per_cpu_ptr(zone->pageset, cpu);
@@ -8266,7 +8352,7 @@
 		free_percpu(zone->pageset);
 		zone->pageset = &boot_pageset;
 	}
-	local_irq_restore(flags);
+	local_unlock_irqrestore(pa_lock, flags);
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE