2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/mm/hugetlb.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Generic hugetlb support.
  * (C) Nadia Yvette Chambers, April 2004
@@ -15,9 +16,10 @@
 #include <linux/compiler.h>
 #include <linux/cpuset.h>
 #include <linux/mutex.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/sysfs.h>
 #include <linux/slab.h>
+#include <linux/sched/mm.h>
 #include <linux/mmdebug.h>
 #include <linux/sched/signal.h>
 #include <linux/rmap.h>
@@ -25,22 +27,30 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/jhash.h>
+#include <linux/numa.h>
+#include <linux/llist.h>
+#include <linux/cma.h>
 
 #include <asm/page.h>
-#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
 #include <asm/tlb.h>
 
 #include <linux/io.h>
 #include <linux/hugetlb.h>
 #include <linux/hugetlb_cgroup.h>
 #include <linux/node.h>
-#include <linux/userfaultfd_k.h>
 #include <linux/page_owner.h>
 #include "internal.h"
 
 int hugetlb_max_hstate __read_mostly;
 unsigned int default_hstate_idx;
 struct hstate hstates[HUGE_MAX_HSTATE];
+
+#ifdef CONFIG_CMA
+static struct cma *hugetlb_cma[MAX_NUMNODES];
+#endif
+static unsigned long hugetlb_cma_size __initdata;
+
 /*
  * Minimum page order among possible hugepage sizes, set to a proper value
  * at boot time.
@@ -52,8 +62,8 @@
 /* for command line parsing */
 static struct hstate * __initdata parsed_hstate;
 static unsigned long __initdata default_hstate_max_huge_pages;
-static unsigned long __initdata default_hstate_size;
 static bool __initdata parsed_valid_hugepagesz = true;
+static bool __initdata parsed_default_hugepagesz;
 
 /*
  * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
@@ -67,6 +77,9 @@
  */
 static int num_fault_mutexes;
 struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
+
+static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end);
 
 static inline bool PageHugeFreed(struct page *head)
 {
@@ -93,7 +106,7 @@
 	spin_unlock(&spool->lock);
 
 	/* If no pages are used, and no other handles to the subpool
-	 * remain, give up any reservations mased on minimum size and
+	 * remain, give up any reservations based on minimum size and
 	 * free the subpool */
 	if (free) {
 		if (spool->min_hpages != -1)
@@ -138,10 +151,10 @@
 /*
  * Subpool accounting for allocating and reserving pages.
  * Return -ENOMEM if there are not enough resources to satisfy the
- * the request. Otherwise, return the number of pages by which the
+ * request. Otherwise, return the number of pages by which the
 * global pools must be adjusted (upward). The returned value may
 * only be different than the passed value (delta) in the case where
- * a subpool minimum size must be manitained.
+ * a subpool minimum size must be maintained.
 */
 static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
				       long delta)
@@ -232,114 +245,317 @@
 	return subpool_inode(file_inode(vma->vm_file));
 }
 
-/*
- * Region tracking -- allows tracking of reservations and instantiated pages
- * across the pages in a mapping.
- *
- * The region data structures are embedded into a resv_map and protected
- * by a resv_map's lock. The set of regions within the resv_map represent
- * reservations for huge pages, or huge pages that have already been
- * instantiated within the map. The from and to elements are huge page
- * indicies into the associated mapping. from indicates the starting index
- * of the region. to represents the first index past the end of the region.
- *
- * For example, a file region structure with from == 0 and to == 4 represents
- * four huge pages in a mapping. It is important to note that the to element
- * represents the first element past the end of the region. This is used in
- * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
- *
- * Interval notation of the form [from, to) will be used to indicate that
- * the endpoint from is inclusive and to is exclusive.
+/* Helper that removes a struct file_region from the resv_map cache and returns
+ * it for use.
  */
-struct file_region {
-	struct list_head link;
-	long from;
-	long to;
-};
-
-/*
- * Add the huge page range represented by [f, t) to the reserve
- * map. In the normal case, existing regions will be expanded
- * to accommodate the specified range. Sufficient regions should
- * exist for expansion due to the previous call to region_chg
- * with the same range. However, it is possible that region_del
- * could have been called after region_chg and modifed the map
- * in such a way that no region exists to be expanded. In this
- * case, pull a region descriptor from the cache associated with
- * the map and use that for the new range.
- *
- * Return the number of new huge pages added to the map. This
- * number is greater than or equal to zero.
- */
-static long region_add(struct resv_map *resv, long f, long t)
+static struct file_region *
+get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
 {
-	struct list_head *head = &resv->regions;
-	struct file_region *rg, *nrg, *trg;
-	long add = 0;
+	struct file_region *nrg = NULL;
 
-	spin_lock(&resv->lock);
-	/* Locate the region we are either in or before. */
-	list_for_each_entry(rg, head, link)
-		if (f <= rg->to)
-			break;
+	VM_BUG_ON(resv->region_cache_count <= 0);
 
-	/*
-	 * If no region exists which can be expanded to include the
-	 * specified range, the list must have been modified by an
-	 * interleving call to region_del(). Pull a region descriptor
-	 * from the cache and use it for this range.
-	 */
-	if (&rg->link == head || t < rg->from) {
-		VM_BUG_ON(resv->region_cache_count <= 0);
+	resv->region_cache_count--;
+	nrg = list_first_entry(&resv->region_cache, struct file_region, link);
+	list_del(&nrg->link);
 
-		resv->region_cache_count--;
-		nrg = list_first_entry(&resv->region_cache, struct file_region,
-					link);
-		list_del(&nrg->link);
+	nrg->from = from;
+	nrg->to = to;
 
-		nrg->from = f;
-		nrg->to = t;
-		list_add(&nrg->link, rg->link.prev);
+	return nrg;
+}
 
-		add += t - f;
-		goto out_locked;
+static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
+					      struct file_region *rg)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+	nrg->reservation_counter = rg->reservation_counter;
+	nrg->css = rg->css;
+	if (rg->css)
+		css_get(rg->css);
+#endif
+}
+
+/* Helper that records hugetlb_cgroup uncharge info. */
+static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
+						struct hstate *h,
+						struct resv_map *resv,
+						struct file_region *nrg)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+	if (h_cg) {
+		nrg->reservation_counter =
+			&h_cg->rsvd_hugepage[hstate_index(h)];
+		nrg->css = &h_cg->css;
+		/*
+		 * The caller will hold exactly one h_cg->css reference for the
+		 * whole contiguous reservation region. But this area might be
+		 * scattered when there are already some file_regions reside in
+		 * it. As a result, many file_regions may share only one css
+		 * reference. In order to ensure that one file_region must hold
+		 * exactly one h_cg->css reference, we should do css_get for
+		 * each file_region and leave the reference held by caller
+		 * untouched.
+		 */
+		css_get(&h_cg->css);
+		if (!resv->pages_per_hpage)
+			resv->pages_per_hpage = pages_per_huge_page(h);
+		/* pages_per_hpage should be the same for all entries in
+		 * a resv_map.
+		 */
+		VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
+	} else {
+		nrg->reservation_counter = NULL;
+		nrg->css = NULL;
+	}
+#endif
+}
+
+static void put_uncharge_info(struct file_region *rg)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+	if (rg->css)
+		css_put(rg->css);
+#endif
+}
+
+static bool has_same_uncharge_info(struct file_region *rg,
+				   struct file_region *org)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+	return rg && org &&
+	       rg->reservation_counter == org->reservation_counter &&
+	       rg->css == org->css;
+
+#else
+	return true;
+#endif
+}
+
+static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
+{
+	struct file_region *nrg = NULL, *prg = NULL;
+
+	prg = list_prev_entry(rg, link);
+	if (&prg->link != &resv->regions && prg->to == rg->from &&
+	    has_same_uncharge_info(prg, rg)) {
+		prg->to = rg->to;
+
+		list_del(&rg->link);
+		put_uncharge_info(rg);
+		kfree(rg);
+
+		rg = prg;
 	}
 
-	/* Round our left edge to the current segment if it encloses us. */
-	if (f > rg->from)
-		f = rg->from;
+	nrg = list_next_entry(rg, link);
+	if (&nrg->link != &resv->regions && nrg->from == rg->to &&
+	    has_same_uncharge_info(nrg, rg)) {
+		nrg->from = rg->from;
 
-	/* Check for and consume any regions we now overlap with. */
-	nrg = rg;
-	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
-		if (&rg->link == head)
-			break;
+		list_del(&rg->link);
+		put_uncharge_info(rg);
+		kfree(rg);
+	}
+}
+
+/*
+ * Must be called with resv->lock held.
+ *
+ * Calling this with regions_needed != NULL will count the number of pages
+ * to be added but will not modify the linked list. And regions_needed will
+ * indicate the number of file_regions needed in the cache to carry out to add
+ * the regions for this range.
+ */
+static long add_reservation_in_range(struct resv_map *resv, long f, long t,
+				     struct hugetlb_cgroup *h_cg,
+				     struct hstate *h, long *regions_needed)
+{
+	long add = 0;
+	struct list_head *head = &resv->regions;
+	long last_accounted_offset = f;
+	struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;
+
+	if (regions_needed)
+		*regions_needed = 0;
+
+	/* In this loop, we essentially handle an entry for the range
+	 * [last_accounted_offset, rg->from), at every iteration, with some
+	 * bounds checking.
+	 */
+	list_for_each_entry_safe(rg, trg, head, link) {
+		/* Skip irrelevant regions that start before our range. */
+		if (rg->from < f) {
+			/* If this region ends after the last accounted offset,
+			 * then we need to update last_accounted_offset.
+			 */
+			if (rg->to > last_accounted_offset)
+				last_accounted_offset = rg->to;
+			continue;
+		}
+
+		/* When we find a region that starts beyond our range, we've
+		 * finished.
+		 */
 		if (rg->from > t)
 			break;
 
-		/* If this area reaches higher then extend our area to
-		 * include it completely. If this is not the first area
-		 * which we intend to reuse, free it. */
-		if (rg->to > t)
-			t = rg->to;
-		if (rg != nrg) {
-			/* Decrement return value by the deleted range.
-			 * Another range will span this area so that by
-			 * end of routine add will be >= zero
-			 */
-			add -= (rg->to - rg->from);
-			list_del(&rg->link);
-			kfree(rg);
+		/* Add an entry for last_accounted_offset -> rg->from, and
+		 * update last_accounted_offset.
+		 */
+		if (rg->from > last_accounted_offset) {
+			add += rg->from - last_accounted_offset;
+			if (!regions_needed) {
+				nrg = get_file_region_entry_from_cache(
+					resv, last_accounted_offset, rg->from);
+				record_hugetlb_cgroup_uncharge_info(h_cg, h,
+								    resv, nrg);
+				list_add(&nrg->link, rg->link.prev);
+				coalesce_file_region(resv, nrg);
+			} else
+				*regions_needed += 1;
 		}
+
+		last_accounted_offset = rg->to;
 	}
 
-	add += (nrg->from - f); /* Added to beginning of region */
-	nrg->from = f;
-	add += t - nrg->to; /* Added to end of region */
-	nrg->to = t;
+	/* Handle the case where our range extends beyond
+	 * last_accounted_offset.
+	 */
+	if (last_accounted_offset < t) {
+		add += t - last_accounted_offset;
+		if (!regions_needed) {
+			nrg = get_file_region_entry_from_cache(
+				resv, last_accounted_offset, t);
+			record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg);
+			list_add(&nrg->link, rg->link.prev);
+			coalesce_file_region(resv, nrg);
+		} else
+			*regions_needed += 1;
	}
 
-out_locked:
-	resv->adds_in_progress--;
+	VM_BUG_ON(add < 0);
+	return add;
+}
+
+/* Must be called with resv->lock acquired. Will drop lock to allocate entries.
+ */
+static int allocate_file_region_entries(struct resv_map *resv,
+					int regions_needed)
+	__must_hold(&resv->lock)
+{
+	struct list_head allocated_regions;
+	int to_allocate = 0, i = 0;
+	struct file_region *trg = NULL, *rg = NULL;
+
+	VM_BUG_ON(regions_needed < 0);
+
+	INIT_LIST_HEAD(&allocated_regions);
+
+	/*
+	 * Check for sufficient descriptors in the cache to accommodate
+	 * the number of in progress add operations plus regions_needed.
+	 *
+	 * This is a while loop because when we drop the lock, some other call
+	 * to region_add or region_del may have consumed some region_entries,
+	 * so we keep looping here until we finally have enough entries for
+	 * (adds_in_progress + regions_needed).
+	 */
+	while (resv->region_cache_count <
+	       (resv->adds_in_progress + regions_needed)) {
+		to_allocate = resv->adds_in_progress + regions_needed -
+			      resv->region_cache_count;
+
+		/* At this point, we should have enough entries in the cache
+		 * for all the existings adds_in_progress. We should only be
+		 * needing to allocate for regions_needed.
+		 */
+		VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
+
+		spin_unlock(&resv->lock);
+		for (i = 0; i < to_allocate; i++) {
+			trg = kmalloc(sizeof(*trg), GFP_KERNEL);
+			if (!trg)
+				goto out_of_memory;
+			list_add(&trg->link, &allocated_regions);
+		}
+
+		spin_lock(&resv->lock);
+
+		list_splice(&allocated_regions, &resv->region_cache);
+		resv->region_cache_count += to_allocate;
+	}
+
+	return 0;
+
+out_of_memory:
+	list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
+		list_del(&rg->link);
+		kfree(rg);
+	}
+	return -ENOMEM;
+}
+
+/*
+ * Add the huge page range represented by [f, t) to the reserve
+ * map. Regions will be taken from the cache to fill in this range.
+ * Sufficient regions should exist in the cache due to the previous
+ * call to region_chg with the same range, but in some cases the cache will not
+ * have sufficient entries due to races with other code doing region_add or
+ * region_del. The extra needed entries will be allocated.
+ *
+ * regions_needed is the out value provided by a previous call to region_chg.
+ *
+ * Return the number of new huge pages added to the map. This number is greater
+ * than or equal to zero. If file_region entries needed to be allocated for
+ * this operation and we were not able to allocate, it returns -ENOMEM.
+ * region_add of regions of length 1 never allocate file_regions and cannot
+ * fail; region_chg will always allocate at least 1 entry and a region_add for
+ * 1 page will only require at most 1 entry.
+ */
+static long region_add(struct resv_map *resv, long f, long t,
+		       long in_regions_needed, struct hstate *h,
+		       struct hugetlb_cgroup *h_cg)
+{
+	long add = 0, actual_regions_needed = 0;
+
+	spin_lock(&resv->lock);
+retry:
+
+	/* Count how many regions are actually needed to execute this add. */
+	add_reservation_in_range(resv, f, t, NULL, NULL,
+				 &actual_regions_needed);
+
+	/*
+	 * Check for sufficient descriptors in the cache to accommodate
+	 * this add operation. Note that actual_regions_needed may be greater
+	 * than in_regions_needed, as the resv_map may have been modified since
+	 * the region_chg call. In this case, we need to make sure that we
+	 * allocate extra entries, such that we have enough for all the
+	 * existing adds_in_progress, plus the excess needed for this
+	 * operation.
+	 */
+	if (actual_regions_needed > in_regions_needed &&
+	    resv->region_cache_count <
+		    resv->adds_in_progress +
+			    (actual_regions_needed - in_regions_needed)) {
+		/* region_add operation of range 1 should never need to
+		 * allocate file_region entries.
+		 */
+		VM_BUG_ON(t - f <= 1);
+
+		if (allocate_file_region_entries(
+			    resv, actual_regions_needed - in_regions_needed)) {
+			return -ENOMEM;
+		}
+
+		goto retry;
+	}
+
+	add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);
+
+	resv->adds_in_progress -= in_regions_needed;
+
 	spin_unlock(&resv->lock);
 	VM_BUG_ON(add < 0);
 	return add;
@@ -352,111 +568,38 @@
 * call to region_add that will actually modify the reserve
 * map to add the specified range [f, t). region_chg does
 * not change the number of huge pages represented by the
- * map. However, if the existing regions in the map can not
- * be expanded to represent the new range, a new file_region
- * structure is added to the map as a placeholder. This is
- * so that the subsequent region_add call will have all the
- * regions it needs and will not fail.
+ * map. A number of new file_region structures is added to the cache as a
+ * placeholder, for the subsequent region_add call to use. At least 1
+ * file_region structure is added.
 *
- * Upon entry, region_chg will also examine the cache of region descriptors
- * associated with the map. If there are not enough descriptors cached, one
- * will be allocated for the in progress add operation.
+ * out_regions_needed is the number of regions added to the
+ * resv->adds_in_progress. This value needs to be provided to a follow up call
+ * to region_add or region_abort for proper accounting.
 *
 * Returns the number of huge pages that need to be added to the existing
 * reservation map for the range [f, t). This number is greater or equal to
 * zero. -ENOMEM is returned if a new file_region structure or cache entry
 * is needed and can not be allocated.
 */
-static long region_chg(struct resv_map *resv, long f, long t)
+static long region_chg(struct resv_map *resv, long f, long t,
+		       long *out_regions_needed)
 {
-	struct list_head *head = &resv->regions;
-	struct file_region *rg, *nrg = NULL;
 	long chg = 0;
 
-retry:
 	spin_lock(&resv->lock);
-retry_locked:
-	resv->adds_in_progress++;
 
-	/*
-	 * Check for sufficient descriptors in the cache to accommodate
-	 * the number of in progress add operations.
-	 */
-	if (resv->adds_in_progress > resv->region_cache_count) {
-		struct file_region *trg;
+	/* Count how many hugepages in this range are NOT represented. */
+	chg = add_reservation_in_range(resv, f, t, NULL, NULL,
+				       out_regions_needed);
 
-		VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
-		/* Must drop lock to allocate a new descriptor. */
-		resv->adds_in_progress--;
-		spin_unlock(&resv->lock);
+	if (*out_regions_needed == 0)
+		*out_regions_needed = 1;
 
-		trg = kmalloc(sizeof(*trg), GFP_KERNEL);
-		if (!trg) {
-			kfree(nrg);
-			return -ENOMEM;
-		}
+	if (allocate_file_region_entries(resv, *out_regions_needed))
+		return -ENOMEM;
 
-		spin_lock(&resv->lock);
-		list_add(&trg->link, &resv->region_cache);
-		resv->region_cache_count++;
-		goto retry_locked;
-	}
+	resv->adds_in_progress += *out_regions_needed;
 
-	/* Locate the region we are before or in. */
-	list_for_each_entry(rg, head, link)
-		if (f <= rg->to)
-			break;
-
-	/* If we are below the current region then a new region is required.
-	 * Subtle, allocate a new region at the position but make it zero
-	 * size such that we can guarantee to record the reservation. */
-	if (&rg->link == head || t < rg->from) {
-		if (!nrg) {
-			resv->adds_in_progress--;
-			spin_unlock(&resv->lock);
-			nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
-			if (!nrg)
-				return -ENOMEM;
-
-			nrg->from = f;
-			nrg->to = f;
-			INIT_LIST_HEAD(&nrg->link);
-			goto retry;
-		}
-
-		list_add(&nrg->link, rg->link.prev);
-		chg = t - f;
-		goto out_nrg;
-	}
-
-	/* Round our left edge to the current segment if it encloses us. */
-	if (f > rg->from)
-		f = rg->from;
-	chg = t - f;
-
-	/* Check for and consume any regions we now overlap with. */
-	list_for_each_entry(rg, rg->link.prev, link) {
-		if (&rg->link == head)
-			break;
-		if (rg->from > t)
-			goto out;
-
-		/* We overlap with this area, if it extends further than
-		 * us then we must extend ourselves. Account for its
-		 * existing reservation. */
-		if (rg->to > t) {
-			chg += rg->to - t;
-			t = rg->to;
-		}
-		chg -= rg->to - rg->from;
-	}
-
-out:
-	spin_unlock(&resv->lock);
-	/* We already know we raced and no longer need the new region */
-	kfree(nrg);
-	return chg;
-out_nrg:
 	spin_unlock(&resv->lock);
 	return chg;
 }
@@ -466,17 +609,20 @@
 * of the resv_map keeps track of the operations in progress between
 * calls to region_chg and region_add. Operations are sometimes
 * aborted after the call to region_chg. In such cases, region_abort
- * is called to decrement the adds_in_progress counter.
+ * is called to decrement the adds_in_progress counter. regions_needed
+ * is the value returned by the region_chg call, it is used to decrement
+ * the adds_in_progress counter.
 *
 * NOTE: The range arguments [f, t) are not needed or used in this
 * routine. They are kept to make reading the calling code easier as
 * arguments will match the associated region_chg call.
 */
-static void region_abort(struct resv_map *resv, long f, long t)
+static void region_abort(struct resv_map *resv, long f, long t,
+			 long regions_needed)
 {
 	spin_lock(&resv->lock);
 	VM_BUG_ON(!resv->region_cache_count);
-	resv->adds_in_progress--;
+	resv->adds_in_progress -= regions_needed;
 	spin_unlock(&resv->lock);
 }
 
@@ -540,10 +686,15 @@
 			}
 
 			del += t - f;
+			hugetlb_cgroup_uncharge_file_region(
+				resv, rg, t - f, false);
 
 			/* New entry for end of split region */
 			nrg->from = t;
 			nrg->to = rg->to;
+
+			copy_hugetlb_cgroup_uncharge_info(nrg, rg);
+
 			INIT_LIST_HEAD(&nrg->link);
 
 			/* Original entry is trimmed */
@@ -556,15 +707,23 @@
 
 		if (f <= rg->from && t >= rg->to) { /* Remove entire region */
 			del += rg->to - rg->from;
+			hugetlb_cgroup_uncharge_file_region(resv, rg,
+						rg->to - rg->from, true);
 			list_del(&rg->link);
 			kfree(rg);
 			continue;
 		}
 
 		if (f <= rg->from) {	/* Trim beginning of region */
+			hugetlb_cgroup_uncharge_file_region(resv, rg,
+						t - rg->from, false);
+
 			del += t - rg->from;
 			rg->from = t;
 		} else {		/* Trim end of region */
+			hugetlb_cgroup_uncharge_file_region(resv, rg,
+						rg->to - f, false);
+
 			del += rg->to - f;
 			rg->to = f;
 		}
@@ -715,6 +874,25 @@
 	vma->vm_private_data = (void *)value;
 }
 
+static void
+resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
+					  struct hugetlb_cgroup *h_cg,
+					  struct hstate *h)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+	if (!h_cg || !h) {
+		resv_map->reservation_counter = NULL;
+		resv_map->pages_per_hpage = 0;
+		resv_map->css = NULL;
+	} else {
+		resv_map->reservation_counter =
+			&h_cg->rsvd_hugepage[hstate_index(h)];
+		resv_map->pages_per_hpage = pages_per_huge_page(h);
+		resv_map->css = &h_cg->css;
+	}
+#endif
+}
+
 struct resv_map *resv_map_alloc(void)
 {
 	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
@@ -731,6 +909,13 @@
 	INIT_LIST_HEAD(&resv_map->regions);
 
 	resv_map->adds_in_progress = 0;
+	/*
+	 * Initialize these to 0. On shared mappings, 0's here indicate these
+	 * fields don't do cgroup accounting. On private mappings, these will be
+	 * re-initialized to the proper values, to indicate that hugetlb cgroup
+	 * reservations are to be un-charged from here.
+	 */
+	resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);
 
 	INIT_LIST_HEAD(&resv_map->region_cache);
 	list_add(&rg->link, &resv_map->region_cache);
@@ -761,7 +946,15 @@
 
 static inline struct resv_map *inode_resv_map(struct inode *inode)
 {
-	return inode->i_mapping->private_data;
+	/*
+	 * At inode evict time, i_mapping may not point to the original
+	 * address space within the inode. This original address space
+	 * contains the pointer to the resv_map. So, always use the
+	 * address space embedded within the inode.
+	 * The VERY common case is inode->mapping == &inode->i_data but,
+	 * this may not be true for device special inodes.
+	 */
+	return (struct resv_map *)(&inode->i_data)->private_data;
 }
 
 static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
@@ -836,7 +1029,7 @@
 	 * We know VM_NORESERVE is not set. Therefore, there SHOULD
 	 * be a region map for all pages. The only situation where
 	 * there is no region map is if a hole was punched via
-	 * fallocate. In this case, there really are no reverves to
+	 * fallocate. In this case, there really are no reserves to
 	 * use. This situation is indicated if chg != 0.
 	 */
 	if (chg)
@@ -886,22 +1079,24 @@
 static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
 {
 	struct page *page;
+	bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA);
 
-	list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
-		if (!PageHWPoison(page))
-			break;
-	/*
-	 * if 'non-isolated free hugepage' not found on the list,
-	 * the allocation fails.
-	 */
-	if (&h->hugepage_freelists[nid] == &page->lru)
-		return NULL;
-	list_move(&page->lru, &h->hugepage_activelist);
-	set_page_refcounted(page);
-	ClearPageHugeFreed(page);
-	h->free_huge_pages--;
-	h->free_huge_pages_node[nid]--;
-	return page;
+	list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
+		if (nocma && is_migrate_cma_page(page))
+			continue;
+
+		if (PageHWPoison(page))
+			continue;
+
+		list_move(&page->lru, &h->hugepage_activelist);
+		set_page_refcounted(page);
+		ClearPageHugeFreed(page);
+		h->free_huge_pages--;
+		h->free_huge_pages_node[nid]--;
+		return page;
+	}
+
+	return NULL;
 }
 
 static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
@@ -911,7 +1106,7 @@
 	struct zonelist *zonelist;
 	struct zone *zone;
 	struct zoneref *z;
-	int node = -1;
+	int node = NUMA_NO_NODE;
 
 	zonelist = node_zonelist(nid, gfp_mask);
 
@@ -938,15 +1133,6 @@
 		goto retry_cpuset;
 
 	return NULL;
-}
-
-/* Movability of hugepages depends on migration support. */
-static inline gfp_t htlb_alloc_mask(struct hstate *h)
-{
-	if (hugepage_migration_supported(h))
-		return GFP_HIGHUSER_MOVABLE;
-	else
-		return GFP_HIGHUSER;
 }
 
 static struct page *dequeue_huge_page_vma(struct hstate *h,
@@ -1068,108 +1254,85 @@
 	struct page *p = page + 1;
 
 	atomic_set(compound_mapcount_ptr(page), 0);
+	atomic_set(compound_pincount_ptr(page), 0);
+
 	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
 		clear_compound_head(p);
 		set_page_refcounted(p);
 	}
 
 	set_compound_order(page, 0);
+	page[1].compound_nr = 0;
 	__ClearPageHead(page);
 }
 
 static void free_gigantic_page(struct page *page, unsigned int order)
 {
+	/*
+	 * If the page isn't allocated using the cma allocator,
+	 * cma_release() returns false.
+	 */
+#ifdef CONFIG_CMA
+	if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
+		return;
+#endif
+
 	free_contig_range(page_to_pfn(page), 1 << order);
 }
 
-static int __alloc_gigantic_page(unsigned long start_pfn,
-				unsigned long nr_pages, gfp_t gfp_mask)
-{
-	unsigned long end_pfn = start_pfn + nr_pages;
-	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
-				  gfp_mask);
-}
-
-static bool pfn_range_valid_gigantic(struct zone *z,
-			unsigned long start_pfn, unsigned long nr_pages)
-{
-	unsigned long i, end_pfn = start_pfn + nr_pages;
-	struct page *page;
-
-	for (i = start_pfn; i < end_pfn; i++) {
-		page = pfn_to_online_page(i);
-		if (!page)
-			return false;
-
-		if (page_zone(page) != z)
-			return false;
-
-		if (PageReserved(page))
-			return false;
-
-		if (page_count(page) > 0)
-			return false;
-
-		if (PageHuge(page))
-			return false;
-	}
-
-	return true;
-}
-
-static bool zone_spans_last_pfn(const struct zone *zone,
-			unsigned long start_pfn, unsigned long nr_pages)
-{
-	unsigned long last_pfn = start_pfn + nr_pages - 1;
-	return zone_spans_pfn(zone, last_pfn);
-}
-
+#ifdef CONFIG_CONTIG_ALLOC
 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
		int nid, nodemask_t *nodemask)
 {
-	unsigned int order = huge_page_order(h);
-	unsigned long nr_pages = 1 << order;
-	unsigned long ret, pfn, flags;
-	struct zonelist *zonelist;
-	struct zone *zone;
-	struct zoneref *z;
+	unsigned long nr_pages = 1UL << huge_page_order(h);
+	if (nid == NUMA_NO_NODE)
+		nid = numa_mem_id();
 
-	zonelist = node_zonelist(nid, gfp_mask);
-	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) {
-		spin_lock_irqsave(&zone->lock, flags);
+#ifdef CONFIG_CMA
+	{
+		struct page *page;
+		int node;
 
-		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
-		while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
-			if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) {
-				/*
-				 * We release the zone lock here because
-				 * alloc_contig_range() will also lock the zone
-				 * at some point. If there's an allocation
-				 * spinning on this lock, it may win the race
-				 * and cause alloc_contig_range() to fail...
-				 */
-				spin_unlock_irqrestore(&zone->lock, flags);
-				ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask);
-				if (!ret)
-					return pfn_to_page(pfn);
-				spin_lock_irqsave(&zone->lock, flags);
-			}
-			pfn += nr_pages;
+		if (hugetlb_cma[nid]) {
+			page = cma_alloc(hugetlb_cma[nid], nr_pages,
+					 huge_page_order(h),
+					 GFP_KERNEL | __GFP_NOWARN);
+			if (page)
+				return page;
 		}
 
-		spin_unlock_irqrestore(&zone->lock, flags);
-	}
+		if (!(gfp_mask & __GFP_THISNODE)) {
+			for_each_node_mask(node, *nodemask) {
+				if (node == nid || !hugetlb_cma[node])
+					continue;
 
-	return NULL;
+				page = cma_alloc(hugetlb_cma[node], nr_pages,
+						 huge_page_order(h),
+						 GFP_KERNEL | __GFP_NOWARN);
+				if (page)
+					return page;
+			}
+		}
+	}
+#endif
+
+	return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
 }
 
-static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
-static void prep_compound_gigantic_page(struct page *page, unsigned int order);
+#else /* !CONFIG_CONTIG_ALLOC */
+static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+					int nid, nodemask_t *nodemask)
+{
+	return NULL;
+}
+#endif /* CONFIG_CONTIG_ALLOC */
 
 #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
-static inline bool gigantic_page_supported(void) { return false; }
 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
-		int nid, nodemask_t *nodemask) { return NULL; }
+					int nid, nodemask_t *nodemask)
+{
+	return NULL;
+}
 static inline void free_gigantic_page(struct page *page, unsigned int order) { }
 static inline void destroy_compound_gigantic_page(struct page *page,
						unsigned int order) { }
@@ -1180,7 +1343,7 @@
 	int i;
 	struct page *subpage = page;
 
-	if (hstate_is_gigantic(h) && !gigantic_page_supported())
+	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
 		return;
 
 	h->nr_huge_pages--;
@@ -1193,11 +1356,18 @@
				1 << PG_writeback);
 	}
 	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
+	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
 	set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
 	set_page_refcounted(page);
 	if (hstate_is_gigantic(h)) {
+		/*
+		 * Temporarily drop the hugetlb_lock, because
+		 * we might block in free_gigantic_page().
+		 */
+		spin_unlock(&hugetlb_lock);
 		destroy_compound_gigantic_page(page, huge_page_order(h));
 		free_gigantic_page(page, huge_page_order(h));
+		spin_lock(&hugetlb_lock);
 	} else {
 		__free_pages(page, huge_page_order(h));
 	}
@@ -1260,7 +1430,7 @@
 	page[2].mapping = NULL;
 }
 
-void free_huge_page(struct page *page)
+static void __free_huge_page(struct page *page)
 {
 	/*
 	 * Can't pass hstate in here because it is called from the
@@ -1272,10 +1442,11 @@
		(struct hugepage_subpool *)page_private(page);
 	bool restore_reserve;
 
-	set_page_private(page, 0);
-	page->mapping = NULL;
 	VM_BUG_ON_PAGE(page_count(page), page);
 	VM_BUG_ON_PAGE(page_mapcount(page), page);
+
+	set_page_private(page, 0);
+	page->mapping = NULL;
 	restore_reserve = PagePrivate(page);
 	ClearPagePrivate(page);
 
@@ -1302,6 +1473,8 @@
 		clear_page_huge_active(page);
 		hugetlb_cgroup_uncharge_page(hstate_index(h),
					     pages_per_huge_page(h), page);
+		hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
						  pages_per_huge_page(h), page);
 		if (restore_reserve)
 			h->resv_huge_pages++;
 
@@ -1322,12 +1495,61 @@
 	spin_unlock(&hugetlb_lock);
 }
 
+/*
+ * As free_huge_page() can be called from a non-task context, we have
+ * to defer the actual freeing in a workqueue to prevent potential
+ * hugetlb_lock deadlock.
+ *
+ * free_hpage_workfn() locklessly retrieves the linked list of pages to
+ * be freed and frees them one-by-one. As the page->mapping pointer is
+ * going to be cleared in __free_huge_page() anyway, it is reused as the
+ * llist_node structure of a lockless linked list of huge pages to be freed.
+ */
+static LLIST_HEAD(hpage_freelist);
+
+static void free_hpage_workfn(struct work_struct *work)
+{
+	struct llist_node *node;
+	struct page *page;
+
+	node = llist_del_all(&hpage_freelist);
+
+	while (node) {
+		page = container_of((struct address_space **)node,
+				    struct page, mapping);
+		node = node->next;
+		__free_huge_page(page);
+	}
+}
+static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
+
+void free_huge_page(struct page *page)
+{
+	/*
+	 * Defer freeing if in non-task context to avoid hugetlb_lock deadlock.
+	 */
+	if (!in_task()) {
+		/*
+		 * Only call schedule_work() if hpage_freelist is previously
+		 * empty. Otherwise, schedule_work() had been called but the
+		 * workfn hasn't retrieved the list yet.
+		 */
+		if (llist_add((struct llist_node *)&page->mapping,
+			      &hpage_freelist))
+			schedule_work(&free_hpage_work);
+		return;
+	}
+
+	__free_huge_page(page);
+}
+
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
 	INIT_LIST_HEAD(&page->lru);
 	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
-	spin_lock(&hugetlb_lock);
 	set_hugetlb_cgroup(page, NULL);
+	set_hugetlb_cgroup_rsvd(page, NULL);
+	spin_lock(&hugetlb_lock);
 	h->nr_huge_pages++;
 	h->nr_huge_pages_node[nid]++;
 	ClearPageHugeFreed(page);
@@ -1349,7 +1571,7 @@
 	 * For gigantic hugepages allocated through bootmem at
 	 * boot, it's safer to be consistent with the not-gigantic
 	 * hugepages and clear the PG_reserved bit from all tail pages
-	 * too. Otherwse drivers using get_user_pages() to access tail
+	 * too. Otherwise drivers using get_user_pages() to access tail
 	 * pages may get the reference counting wrong if they see
 	 * PG_reserved set on a tail page (despite the head page not
 	 * having PG_reserved set). Enforcing this consistency between
@@ -1362,6 +1584,7 @@
 		set_compound_head(p, page);
 	}
 	atomic_set(compound_mapcount_ptr(page), -1);
+	atomic_set(compound_pincount_ptr(page), 0);
 }
 
 /*
@@ -1388,7 +1611,27 @@
 	if (!PageHead(page_head))
 		return 0;
 
-	return get_compound_page_dtor(page_head) == free_huge_page;
+	return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR;
+}
+
+/*
+ * Find and lock address space (mapping) in write mode.
+ *
+ * Upon entry, the page is locked which means that page_mapping() is
+ * stable. Due to locking order, we can only trylock_write. If we can
+ * not get the lock, simply return NULL to caller.
+ */
+struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
+{
+	struct address_space *mapping = page_mapping(hpage);
+
+	if (!mapping)
+		return mapping;
+
+	if (i_mmap_trylock_write(mapping))
+		return mapping;
+
+	return NULL;
 }
 
 pgoff_t hugetlb_basepage_index(struct page *page)
@@ -1406,12 +1649,25 @@
 }
 
 static struct page *alloc_buddy_huge_page(struct hstate *h,
-		gfp_t gfp_mask, int nid, nodemask_t *nmask)
+		gfp_t gfp_mask, int nid, nodemask_t *nmask,
+		nodemask_t *node_alloc_noretry)
 {
 	int order = huge_page_order(h);
 	struct page *page;
+	bool alloc_try_hard = true;
 
-	gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
+	/*
+	 * By default we always try hard to allocate the page with
+	 * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in
+	 * a loop (to adjust global huge page counts) and previous allocation
+	 * failed, do not continue to try hard on the same node. Use the
+	 * node_alloc_noretry bitmap to manage this state information.
+	 */
+	if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
+		alloc_try_hard = false;
+	gfp_mask |= __GFP_COMP|__GFP_NOWARN;
+	if (alloc_try_hard)
+		gfp_mask |= __GFP_RETRY_MAYFAIL;
 	if (nid == NUMA_NO_NODE)
 		nid = numa_mem_id();
 	page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
@@ -1419,6 +1675,22 @@
 		__count_vm_event(HTLB_BUDDY_PGALLOC);
 	else
 		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+
+	/*
+	 * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this
+	 * indicates an overall state change. Clear bit so that we resume
+	 * normal 'try hard' allocations.
+	 */
+	if (node_alloc_noretry && page && !alloc_try_hard)
+		node_clear(nid, *node_alloc_noretry);
+
+	/*
+	 * If we tried hard to get a page but failed, set bit so that
+	 * subsequent attempts will not try as hard until there is an
+	 * overall state change.
+	 */
+	if (node_alloc_noretry && !page && alloc_try_hard)
+		node_set(nid, *node_alloc_noretry);
 
 	return page;
 }
@@ -1428,7 +1700,8 @@
 * should use this function to get new hugetlb pages
 */
 static struct page *alloc_fresh_huge_page(struct hstate *h,
-		gfp_t gfp_mask, int nid, nodemask_t *nmask)
+		gfp_t gfp_mask, int nid, nodemask_t *nmask,
+		nodemask_t *node_alloc_noretry)
 {
 	struct page *page;
 
@@ -1436,7 +1709,7 @@
 		page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
 	else
 		page = alloc_buddy_huge_page(h, gfp_mask,
-				nid, nmask);
+				nid, nmask, node_alloc_noretry);
 	if (!page)
 		return NULL;
 
@@ -1451,14 +1724,16 @@
 * Allocates a fresh page to the hugetlb allocator pool in the node interleaved
 * manner.
 */
-static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
+				nodemask_t *node_alloc_noretry)
 {
 	struct page *page;
 	int nr_nodes, node;
 	gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
 
 	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
-		page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed);
+		page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
+					     node_alloc_noretry);
 		if (page)
 			break;
 	}
16231898 goto out_unlock;
16241899 spin_unlock(&hugetlb_lock);
16251900
1626
- page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
1901
+ page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
16271902 if (!page)
16281903 return NULL;
16291904
....@@ -1652,14 +1927,14 @@
16521927 }
16531928
16541929 static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
1655
- int nid, nodemask_t *nmask)
1930
+ int nid, nodemask_t *nmask)
16561931 {
16571932 struct page *page;
16581933
16591934 if (hstate_is_gigantic(h))
16601935 return NULL;
16611936
1662
- page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
1937
+ page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
16631938 if (!page)
16641939 return NULL;
16651940
....@@ -1693,31 +1968,9 @@
16931968 }
16941969
16951970 /* page migration callback function */
1696
-struct page *alloc_huge_page_node(struct hstate *h, int nid)
1697
-{
1698
- gfp_t gfp_mask = htlb_alloc_mask(h);
1699
- struct page *page = NULL;
1700
-
1701
- if (nid != NUMA_NO_NODE)
1702
- gfp_mask |= __GFP_THISNODE;
1703
-
1704
- spin_lock(&hugetlb_lock);
1705
- if (h->free_huge_pages - h->resv_huge_pages > 0)
1706
- page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL);
1707
- spin_unlock(&hugetlb_lock);
1708
-
1709
- if (!page)
1710
- page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
1711
-
1712
- return page;
1713
-}
1714
-
1715
-/* page migration callback function */
17161971 struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
1717
- nodemask_t *nmask)
1972
+ nodemask_t *nmask, gfp_t gfp_mask)
17181973 {
1719
- gfp_t gfp_mask = htlb_alloc_mask(h);
1720
-
17211974 spin_lock(&hugetlb_lock);
17221975 if (h->free_huge_pages - h->resv_huge_pages > 0) {
17231976 struct page *page;
....@@ -1745,7 +1998,7 @@
17451998
17461999 gfp_mask = htlb_alloc_mask(h);
17472000 node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
1748
- page = alloc_huge_page_nodemask(h, node, nodemask);
2001
+ page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask);
17492002 mpol_cond_put(mpol);
17502003
17512004 return page;
....@@ -1756,6 +2009,7 @@
17562009 * of size 'delta'.
17572010 */
17582011 static int gather_surplus_pages(struct hstate *h, int delta)
2012
+ __must_hold(&hugetlb_lock)
17592013 {
17602014 struct list_head surplus_list;
17612015 struct page *page, *tmp;
....@@ -1873,7 +2127,7 @@
18732127 * evenly across all nodes with memory. Iterate across these nodes
18742128 * until we can no longer free unreserved surplus pages. This occurs
18752129 * when the nodes with surplus pages have no free pages.
1876
- * free_pool_huge_page() will balance the the freed pages across the
2130
+ * free_pool_huge_page() will balance the freed pages across the
18772131 * on-line nodes with memory and will handle the hstate accounting.
18782132 *
18792133 * Note that we decrement resv_huge_pages as we free the pages. If
....@@ -1931,6 +2185,7 @@
19312185 struct resv_map *resv;
19322186 pgoff_t idx;
19332187 long ret;
2188
+ long dummy_out_regions_needed;
19342189
19352190 resv = vma_resv_map(vma);
19362191 if (!resv)
....@@ -1939,20 +2194,29 @@
19392194 idx = vma_hugecache_offset(h, vma, addr);
19402195 switch (mode) {
19412196 case VMA_NEEDS_RESV:
1942
- ret = region_chg(resv, idx, idx + 1);
2197
+ ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed);
2198
+ /* We assume that vma_reservation_* routines always operate on
2199
+ * 1 page, and that adding to resv map a 1 page entry can only
2200
+ * ever require 1 region.
2201
+ */
2202
+ VM_BUG_ON(dummy_out_regions_needed != 1);
19432203 break;
19442204 case VMA_COMMIT_RESV:
1945
- ret = region_add(resv, idx, idx + 1);
2205
+ ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2206
+ /* region_add calls of range 1 should never fail. */
2207
+ VM_BUG_ON(ret < 0);
19462208 break;
19472209 case VMA_END_RESV:
1948
- region_abort(resv, idx, idx + 1);
2210
+ region_abort(resv, idx, idx + 1, 1);
19492211 ret = 0;
19502212 break;
19512213 case VMA_ADD_RESV:
1952
- if (vma->vm_flags & VM_MAYSHARE)
1953
- ret = region_add(resv, idx, idx + 1);
1954
- else {
1955
- region_abort(resv, idx, idx + 1);
2214
+ if (vma->vm_flags & VM_MAYSHARE) {
2215
+ ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2216
+ /* region_add calls of range 1 should never fail. */
2217
+ VM_BUG_ON(ret < 0);
2218
+ } else {
2219
+ region_abort(resv, idx, idx + 1, 1);
19562220 ret = region_del(resv, idx, idx + 1);
19572221 }
19582222 break;
....@@ -2063,6 +2327,7 @@
20632327 long gbl_chg;
20642328 int ret, idx;
20652329 struct hugetlb_cgroup *h_cg;
2330
+ bool deferred_reserve;
20662331
20672332 idx = hstate_index(h);
20682333 /*
....@@ -2100,9 +2365,19 @@
21002365 gbl_chg = 1;
21012366 }
21022367
2368
+ /* If this allocation is not consuming a reservation, charge it now.
2369
+ */
2370
+ deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma);
2371
+ if (deferred_reserve) {
2372
+ ret = hugetlb_cgroup_charge_cgroup_rsvd(
2373
+ idx, pages_per_huge_page(h), &h_cg);
2374
+ if (ret)
2375
+ goto out_subpool_put;
2376
+ }
2377
+
21032378 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
21042379 if (ret)
2105
- goto out_subpool_put;
2380
+ goto out_uncharge_cgroup_reservation;
21062381
21072382 spin_lock(&hugetlb_lock);
21082383 /*
....@@ -2116,15 +2391,23 @@
21162391 page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
21172392 if (!page)
21182393 goto out_uncharge_cgroup;
2394
+ spin_lock(&hugetlb_lock);
21192395 if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
21202396 SetPagePrivate(page);
21212397 h->resv_huge_pages--;
21222398 }
2123
- spin_lock(&hugetlb_lock);
2124
- list_move(&page->lru, &h->hugepage_activelist);
2399
+ list_add(&page->lru, &h->hugepage_activelist);
21252400 /* Fall through */
21262401 }
21272402 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
2403
+ /* If allocation is not consuming a reservation, also store the
2404
+ * hugetlb_cgroup pointer on the page.
2405
+ */
2406
+ if (deferred_reserve) {
2407
+ hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
2408
+ h_cg, page);
2409
+ }
2410
+
21282411 spin_unlock(&hugetlb_lock);
21292412
21302413 set_page_private(page, (unsigned long)spool);
....@@ -2144,11 +2427,18 @@
21442427
21452428 rsv_adjust = hugepage_subpool_put_pages(spool, 1);
21462429 hugetlb_acct_memory(h, -rsv_adjust);
2430
+ if (deferred_reserve)
2431
+ hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
2432
+ pages_per_huge_page(h), page);
21472433 }
21482434 return page;
21492435
21502436 out_uncharge_cgroup:
21512437 hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
2438
+out_uncharge_cgroup_reservation:
2439
+ if (deferred_reserve)
2440
+ hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
2441
+ h_cg);
21522442 out_subpool_put:
21532443 if (map_chg || avoid_reserve)
21542444 hugepage_subpool_put_pages(spool, 1);
....@@ -2166,9 +2456,9 @@
21662456 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
21672457 void *addr;
21682458
2169
- addr = memblock_virt_alloc_try_nid_raw(
2459
+ addr = memblock_alloc_try_nid_raw(
21702460 huge_page_size(h), huge_page_size(h),
2171
- 0, BOOTMEM_ALLOC_ACCESSIBLE, node);
2461
+ 0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
21722462 if (addr) {
21732463 /*
21742464 * Use the beginning of the huge page to store the
....@@ -2190,16 +2480,10 @@
21902480 return 1;
21912481 }
21922482
2193
-static void __init prep_compound_huge_page(struct page *page,
2194
- unsigned int order)
2195
-{
2196
- if (unlikely(order > (MAX_ORDER - 1)))
2197
- prep_compound_gigantic_page(page, order);
2198
- else
2199
- prep_compound_page(page, order);
2200
-}
2201
-
2202
-/* Put bootmem huge pages into the standard lists after mem_map is up */
2483
+/*
2484
+ * Put bootmem huge pages into the standard lists after mem_map is up.
2485
+ * Note: This only applies to gigantic (order > MAX_ORDER) pages.
2486
+ */
22032487 static void __init gather_bootmem_prealloc(void)
22042488 {
22052489 struct huge_bootmem_page *m;
....@@ -2208,20 +2492,19 @@
22082492 struct page *page = virt_to_page(m);
22092493 struct hstate *h = m->hstate;
22102494
2495
+ VM_BUG_ON(!hstate_is_gigantic(h));
22112496 WARN_ON(page_count(page) != 1);
2212
- prep_compound_huge_page(page, h->order);
2497
+ prep_compound_gigantic_page(page, huge_page_order(h));
22132498 WARN_ON(PageReserved(page));
22142499 prep_new_huge_page(h, page, page_to_nid(page));
22152500 put_page(page); /* free it into the hugepage allocator */
22162501
22172502 /*
2218
- * If we had gigantic hugepages allocated at boot time, we need
2219
- * to restore the 'stolen' pages to totalram_pages in order to
2220
- * fix confusing memory reports from free(1) and another
2221
- * side-effects, like CommitLimit going negative.
2503
+ * We need to restore the 'stolen' pages to totalram_pages
2504
+ * in order to fix confusing memory reports from free(1) and
2505
+ * other side-effects, like CommitLimit going negative.
22222506 */
2223
- if (hstate_is_gigantic(h))
2224
- adjust_managed_page_count(page, 1 << h->order);
2507
+ adjust_managed_page_count(page, pages_per_huge_page(h));
22252508 cond_resched();
22262509 }
22272510 }
....@@ -2229,13 +2512,37 @@
22292512 static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
22302513 {
22312514 unsigned long i;
2515
+ nodemask_t *node_alloc_noretry;
2516
+
2517
+ if (!hstate_is_gigantic(h)) {
2518
+ /*
2519
+ * Bit mask controlling how hard we retry per-node allocations.
2520
+ * Ignore errors as lower level routines can deal with
2521
+ * node_alloc_noretry == NULL. If this kmalloc fails at boot
2522
+ * time, we are likely in bigger trouble.
2523
+ */
2524
+ node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
2525
+ GFP_KERNEL);
2526
+ } else {
2527
+ /* allocations done at boot time */
2528
+ node_alloc_noretry = NULL;
2529
+ }
2530
+
2531
+ /* bit mask controlling how hard we retry per-node allocations */
2532
+ if (node_alloc_noretry)
2533
+ nodes_clear(*node_alloc_noretry);
22322534
22332535 for (i = 0; i < h->max_huge_pages; ++i) {
22342536 if (hstate_is_gigantic(h)) {
2537
+ if (hugetlb_cma_size) {
2538
+ pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
2539
+ goto free;
2540
+ }
22352541 if (!alloc_bootmem_huge_page(h))
22362542 break;
22372543 } else if (!alloc_pool_huge_page(h,
2238
- &node_states[N_MEMORY]))
2544
+ &node_states[N_MEMORY],
2545
+ node_alloc_noretry))
22392546 break;
22402547 cond_resched();
22412548 }
....@@ -2247,6 +2554,8 @@
22472554 h->max_huge_pages, buf, i);
22482555 h->max_huge_pages = i;
22492556 }
2557
+free:
2558
+ kfree(node_alloc_noretry);
22502559 }
22512560
22522561 static void __init hugetlb_init_hstates(void)
....@@ -2341,13 +2650,59 @@
23412650 }
23422651
23432652 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
2344
-static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
2345
- nodemask_t *nodes_allowed)
2653
+static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
2654
+ nodemask_t *nodes_allowed)
23462655 {
23472656 unsigned long min_count, ret;
2657
+ NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
23482658
2349
- if (hstate_is_gigantic(h) && !gigantic_page_supported())
2350
- return h->max_huge_pages;
2659
+ /*
2660
+ * Bit mask controlling how hard we retry per-node allocations.
2661
+ * If we can not allocate the bit mask, do not attempt to allocate
2662
+ * the requested huge pages.
2663
+ */
2664
+ if (node_alloc_noretry)
2665
+ nodes_clear(*node_alloc_noretry);
2666
+ else
2667
+ return -ENOMEM;
2668
+
2669
+ spin_lock(&hugetlb_lock);
2670
+
2671
+ /*
2672
+ * Check for a node specific request.
2673
+ * Changing node specific huge page count may require a corresponding
2674
+ * change to the global count. In any case, the passed node mask
2675
+ * (nodes_allowed) will restrict alloc/free to the specified node.
2676
+ */
2677
+ if (nid != NUMA_NO_NODE) {
2678
+ unsigned long old_count = count;
2679
+
2680
+ count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
2681
+ /*
2682
+ * User may have specified a large count value which caused the
2683
+ * above calculation to overflow. In this case, they wanted
2684
+ * to allocate as many huge pages as possible. Set count to
2685
+ * largest possible value to align with their intention.
2686
+ */
2687
+ if (count < old_count)
2688
+ count = ULONG_MAX;
2689
+ }
2690
+
2691
+ /*
2692
+ * Gigantic pages runtime allocation depend on the capability for large
2693
+ * page range allocation.
2694
+ * If the system does not provide this feature, return an error when
2695
+ * the user tries to allocate gigantic pages but let the user free the
2696
+ * boottime allocated gigantic pages.
2697
+ */
2698
+ if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
2699
+ if (count > persistent_huge_pages(h)) {
2700
+ spin_unlock(&hugetlb_lock);
2701
+ NODEMASK_FREE(node_alloc_noretry);
2702
+ return -EINVAL;
2703
+ }
2704
+ /* Fall through to decrease pool */
2705
+ }
23512706
23522707 /*
23532708 * Increase the pool size
....@@ -2360,7 +2715,6 @@
23602715 * pool might be one hugepage larger than it needs to be, but
23612716 * within all the constraints specified by the sysctls.
23622717 */
2363
- spin_lock(&hugetlb_lock);
23642718 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
23652719 if (!adjust_pool_surplus(h, nodes_allowed, -1))
23662720 break;
....@@ -2377,7 +2731,8 @@
23772731 /* yield cpu to avoid soft lockup */
23782732 cond_resched();
23792733
2380
- ret = alloc_pool_huge_page(h, nodes_allowed);
2734
+ ret = alloc_pool_huge_page(h, nodes_allowed,
2735
+ node_alloc_noretry);
23812736 spin_lock(&hugetlb_lock);
23822737 if (!ret)
23832738 goto out;
....@@ -2415,9 +2770,12 @@
24152770 break;
24162771 }
24172772 out:
2418
- ret = persistent_huge_pages(h);
2773
+ h->max_huge_pages = persistent_huge_pages(h);
24192774 spin_unlock(&hugetlb_lock);
2420
- return ret;
2775
+
2776
+ NODEMASK_FREE(node_alloc_noretry);
2777
+
2778
+ return 0;
24212779 }
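
A minimal userspace sketch of the per-node count adjustment performed in set_max_huge_pages() above; the pool sizes and the node id are assumed example values chosen only to illustrate the arithmetic.

#include <stdio.h>
#include <limits.h>

int main(void)
{
	unsigned long nr_huge_pages = 30;	/* assumed global pool size        */
	unsigned long nr_on_node1   = 10;	/* assumed pages already on node 1 */
	unsigned long count         = 20;	/* user asks for 20 on node 1      */
	unsigned long old_count     = count;

	/* translate the node-local request into a global target */
	count += nr_huge_pages - nr_on_node1;	/* 20 + 30 - 10 = 40 */

	/* a huge request may overflow; treat that as "as many as possible" */
	if (count < old_count)
		count = ULONG_MAX;

	printf("global max_huge_pages target: %lu\n", count);
	return 0;
}
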
24222780
24232781 #define HSTATE_ATTR_RO(_name) \
....@@ -2467,41 +2825,32 @@
24672825 unsigned long count, size_t len)
24682826 {
24692827 int err;
2470
- NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
2828
+ nodemask_t nodes_allowed, *n_mask;
24712829
2472
- if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
2473
- err = -EINVAL;
2474
- goto out;
2475
- }
2830
+ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
2831
+ return -EINVAL;
24762832
24772833 if (nid == NUMA_NO_NODE) {
24782834 /*
24792835 * global hstate attribute
24802836 */
24812837 if (!(obey_mempolicy &&
2482
- init_nodemask_of_mempolicy(nodes_allowed))) {
2483
- NODEMASK_FREE(nodes_allowed);
2484
- nodes_allowed = &node_states[N_MEMORY];
2485
- }
2486
- } else if (nodes_allowed) {
2838
+ init_nodemask_of_mempolicy(&nodes_allowed)))
2839
+ n_mask = &node_states[N_MEMORY];
2840
+ else
2841
+ n_mask = &nodes_allowed;
2842
+ } else {
24872843 /*
2488
- * per node hstate attribute: adjust count to global,
2489
- * but restrict alloc/free to the specified node.
2844
+ * Node specific request. count adjustment happens in
2845
+ * set_max_huge_pages() after acquiring hugetlb_lock.
24902846 */
2491
- count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
2492
- init_nodemask_of_node(nodes_allowed, nid);
2493
- } else
2494
- nodes_allowed = &node_states[N_MEMORY];
2847
+ init_nodemask_of_node(&nodes_allowed, nid);
2848
+ n_mask = &nodes_allowed;
2849
+ }
24952850
2496
- h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
2851
+ err = set_max_huge_pages(h, count, nid, n_mask);
24972852
2498
- if (nodes_allowed != &node_states[N_MEMORY])
2499
- NODEMASK_FREE(nodes_allowed);
2500
-
2501
- return len;
2502
-out:
2503
- NODEMASK_FREE(nodes_allowed);
2504
- return err;
2853
+ return err ? err : len;
25052854 }
25062855
25072856 static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
....@@ -2675,7 +3024,7 @@
26753024 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
26763025 hstate_kobjs, &hstate_attr_group);
26773026 if (err)
2678
- pr_err("Hugetlb: Unable to add hstate %s", h->name);
3027
+ pr_err("HugeTLB: Unable to add hstate %s", h->name);
26793028 }
26803029 }
26813030
....@@ -2779,7 +3128,7 @@
27793128 nhs->hstate_kobjs,
27803129 &per_node_hstate_attr_group);
27813130 if (err) {
2782
- pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
3131
+ pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
27833132 h->name, node->dev.id);
27843133 hugetlb_unregister_node(node);
27853134 break;
....@@ -2827,25 +3176,44 @@
28273176 {
28283177 int i;
28293178
2830
- if (!hugepages_supported())
3179
+ if (!hugepages_supported()) {
3180
+ if (hugetlb_max_hstate || default_hstate_max_huge_pages)
3181
+ pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
28313182 return 0;
3183
+ }
28323184
2833
- if (!size_to_hstate(default_hstate_size)) {
2834
- if (default_hstate_size != 0) {
2835
- pr_err("HugeTLB: unsupported default_hugepagesz %lu. Reverting to %lu\n",
2836
- default_hstate_size, HPAGE_SIZE);
3185
+ /*
3186
+ * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some
3187
+ * architectures depend on setup being done here.
3188
+ */
3189
+ hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
3190
+ if (!parsed_default_hugepagesz) {
3191
+ /*
3192
+ * If we did not parse a default huge page size, set
3193
+ * default_hstate_idx to HPAGE_SIZE hstate. And, if the
3194
+ * number of huge pages for this default size was implicitly
3195
+ * specified, set that here as well.
3196
+ * Note that the implicit setting will overwrite an explicit
3197
+ * setting. A warning will be printed in this case.
3198
+ */
3199
+ default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE));
3200
+ if (default_hstate_max_huge_pages) {
3201
+ if (default_hstate.max_huge_pages) {
3202
+ char buf[32];
3203
+
3204
+ string_get_size(huge_page_size(&default_hstate),
3205
+ 1, STRING_UNITS_2, buf, 32);
3206
+ pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n",
3207
+ default_hstate.max_huge_pages, buf);
3208
+ pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n",
3209
+ default_hstate_max_huge_pages);
3210
+ }
3211
+ default_hstate.max_huge_pages =
3212
+ default_hstate_max_huge_pages;
28373213 }
2838
-
2839
- default_hstate_size = HPAGE_SIZE;
2840
- if (!size_to_hstate(default_hstate_size))
2841
- hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
2842
- }
2843
- default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
2844
- if (default_hstate_max_huge_pages) {
2845
- if (!default_hstate.max_huge_pages)
2846
- default_hstate.max_huge_pages = default_hstate_max_huge_pages;
28473214 }
28483215
3216
+ hugetlb_cma_check();
28493217 hugetlb_init_hstates();
28503218 gather_bootmem_prealloc();
28513219 report_hugepages();
....@@ -2870,10 +3238,10 @@
28703238 }
28713239 subsys_initcall(hugetlb_init);
28723240
2873
-/* Should be called on processing a hugepagesz=... option */
2874
-void __init hugetlb_bad_size(void)
3241
+/* Overwritten by architectures with more huge page sizes */
3242
+bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
28753243 {
2876
- parsed_valid_hugepagesz = false;
3244
+ return size == HPAGE_SIZE;
28773245 }
28783246
28793247 void __init hugetlb_add_hstate(unsigned int order)
....@@ -2882,7 +3250,6 @@
28823250 unsigned long i;
28833251
28843252 if (size_to_hstate(PAGE_SIZE << order)) {
2885
- pr_warn("hugepagesz= specified twice, ignoring\n");
28863253 return;
28873254 }
28883255 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
....@@ -2903,20 +3270,29 @@
29033270 parsed_hstate = h;
29043271 }
29053272
2906
-static int __init hugetlb_nrpages_setup(char *s)
3273
+/*
3274
+ * hugepages command line processing
3275
+ * hugepages normally follows a valid hugepagesz or default_hugepagesz
3276
+ * specification. If not, ignore the hugepages value. hugepages can also
3277
+ * be the first huge page command line option in which case it implicitly
3278
+ * specifies the number of huge pages for the default size.
3279
+ */
3280
+static int __init hugepages_setup(char *s)
29073281 {
29083282 unsigned long *mhp;
29093283 static unsigned long *last_mhp;
29103284
29113285 if (!parsed_valid_hugepagesz) {
2912
- pr_warn("hugepages = %s preceded by "
2913
- "an unsupported hugepagesz, ignoring\n", s);
3286
+ pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
29143287 parsed_valid_hugepagesz = true;
2915
- return 1;
3288
+ return 0;
29163289 }
3290
+
29173291 /*
2918
- * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
2919
- * so this hugepages= parameter goes to the "default hstate".
3292
+ * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter
3293
+ * yet, so this hugepages= parameter goes to the "default hstate".
3294
+ * Otherwise, it goes with the previously parsed hugepagesz or
3295
+ * default_hugepagesz.
29203296 */
29213297 else if (!hugetlb_max_hstate)
29223298 mhp = &default_hstate_max_huge_pages;
....@@ -2924,8 +3300,8 @@
29243300 mhp = &parsed_hstate->max_huge_pages;
29253301
29263302 if (mhp == last_mhp) {
2927
- pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n");
2928
- return 1;
3303
+ pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
3304
+ return 0;
29293305 }
29303306
29313307 if (sscanf(s, "%lu", mhp) <= 0)
....@@ -2943,22 +3319,118 @@
29433319
29443320 return 1;
29453321 }
2946
-__setup("hugepages=", hugetlb_nrpages_setup);
3322
+__setup("hugepages=", hugepages_setup);
29473323
2948
-static int __init hugetlb_default_setup(char *s)
3324
+/*
3325
+ * hugepagesz command line processing
3326
+ * A specific huge page size can only be specified once with hugepagesz.
3327
+ * hugepagesz is followed by hugepages on the command line. The global
3328
+ * variable 'parsed_valid_hugepagesz' is used to determine if prior
3329
+ * hugepagesz argument was valid.
3330
+ */
3331
+static int __init hugepagesz_setup(char *s)
29493332 {
2950
- default_hstate_size = memparse(s, &s);
3333
+ unsigned long size;
3334
+ struct hstate *h;
3335
+
3336
+ parsed_valid_hugepagesz = false;
3337
+ size = (unsigned long)memparse(s, NULL);
3338
+
3339
+ if (!arch_hugetlb_valid_size(size)) {
3340
+ pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
3341
+ return 0;
3342
+ }
3343
+
3344
+ h = size_to_hstate(size);
3345
+ if (h) {
3346
+ /*
3347
+ * hstate for this size already exists. This is normally
3348
+ * an error, but is allowed if the existing hstate is the
3349
+ * default hstate. More specifically, it is only allowed if
3350
+ * the number of huge pages for the default hstate was not
3351
+ * previously specified.
3352
+ */
3353
+ if (!parsed_default_hugepagesz || h != &default_hstate ||
3354
+ default_hstate.max_huge_pages) {
3355
+ pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
3356
+ return 0;
3357
+ }
3358
+
3359
+ /*
3360
+ * No need to call hugetlb_add_hstate() as hstate already
3361
+ * exists. But, do set parsed_hstate so that a following
3362
+ * hugepages= parameter will be applied to this hstate.
3363
+ */
3364
+ parsed_hstate = h;
3365
+ parsed_valid_hugepagesz = true;
3366
+ return 1;
3367
+ }
3368
+
3369
+ hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
3370
+ parsed_valid_hugepagesz = true;
29513371 return 1;
29523372 }
2953
-__setup("default_hugepagesz=", hugetlb_default_setup);
3373
+__setup("hugepagesz=", hugepagesz_setup);
29543374
2955
-static unsigned int cpuset_mems_nr(unsigned int *array)
3375
+/*
3376
+ * default_hugepagesz command line input
3377
+ * Only one instance of default_hugepagesz allowed on command line.
3378
+ */
3379
+static int __init default_hugepagesz_setup(char *s)
3380
+{
3381
+ unsigned long size;
3382
+
3383
+ parsed_valid_hugepagesz = false;
3384
+ if (parsed_default_hugepagesz) {
3385
+ pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
3386
+ return 0;
3387
+ }
3388
+
3389
+ size = (unsigned long)memparse(s, NULL);
3390
+
3391
+ if (!arch_hugetlb_valid_size(size)) {
3392
+ pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
3393
+ return 0;
3394
+ }
3395
+
3396
+ hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
3397
+ parsed_valid_hugepagesz = true;
3398
+ parsed_default_hugepagesz = true;
3399
+ default_hstate_idx = hstate_index(size_to_hstate(size));
3400
+
3401
+ /*
3402
+ * The number of default huge pages (for this size) could have been
3403
+ * specified as the first hugetlb parameter: hugepages=X. If so,
3404
+ * then default_hstate_max_huge_pages is set. If the default huge
3405
+ * page size is gigantic (>= MAX_ORDER), then the pages must be
3406
+ * allocated here from bootmem allocator.
3407
+ */
3408
+ if (default_hstate_max_huge_pages) {
3409
+ default_hstate.max_huge_pages = default_hstate_max_huge_pages;
3410
+ if (hstate_is_gigantic(&default_hstate))
3411
+ hugetlb_hstate_alloc_pages(&default_hstate);
3412
+ default_hstate_max_huge_pages = 0;
3413
+ }
3414
+
3415
+ return 1;
3416
+}
3417
+__setup("default_hugepagesz=", default_hugepagesz_setup);
3418
+
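
A small userspace sketch of the ordering rules enforced by hugepages_setup() and hugepagesz_setup() above: hugepages= applies to the most recently parsed size, and a repeated hugepages= without an interleaving size option is ignored. The two fixed sizes and the option sequence are assumptions for illustration only.

#include <stdio.h>
#include <string.h>

static unsigned long default_pages, gig_pages;		/* assumed 2M and 1G pools     */
static unsigned long *parsed_mhp = &default_pages;	/* stands in for parsed_hstate */
static unsigned long *last_mhp;				/* stands in for last_mhp      */

static void option(const char *opt, const char *val)
{
	if (!strcmp(opt, "hugepagesz") || !strcmp(opt, "default_hugepagesz")) {
		parsed_mhp = strcmp(val, "1G") ? &default_pages : &gig_pages;
	} else if (!strcmp(opt, "hugepages")) {
		if (parsed_mhp == last_mhp) {
			printf("hugepages=%s without a new hugepagesz, ignoring\n", val);
			return;
		}
		sscanf(val, "%lu", parsed_mhp);
		last_mhp = parsed_mhp;
	}
}

int main(void)
{
	/* hugepagesz=1G hugepages=16 hugepagesz=2M hugepages=512 hugepages=9 */
	option("hugepagesz", "1G"); option("hugepages", "16");
	option("hugepagesz", "2M"); option("hugepages", "512");
	option("hugepages", "9");	/* ignored: no interleaving size option */

	printf("2M pages: %lu, 1G pages: %lu\n", default_pages, gig_pages);
	return 0;
}
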
3419
+static unsigned int allowed_mems_nr(struct hstate *h)
29563420 {
29573421 int node;
29583422 unsigned int nr = 0;
3423
+ nodemask_t *mpol_allowed;
3424
+ unsigned int *array = h->free_huge_pages_node;
3425
+ gfp_t gfp_mask = htlb_alloc_mask(h);
29593426
2960
- for_each_node_mask(node, cpuset_current_mems_allowed)
2961
- nr += array[node];
3427
+ mpol_allowed = policy_nodemask_current(gfp_mask);
3428
+
3429
+ for_each_node_mask(node, cpuset_current_mems_allowed) {
3430
+ if (!mpol_allowed ||
3431
+ (mpol_allowed && node_isset(node, *mpol_allowed)))
3432
+ nr += array[node];
3433
+ }
29623434
29633435 return nr;
29643436 }
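
A sketch of the node filtering done by allowed_mems_nr() above: free huge pages are counted only on nodes permitted by both the cpuset mask and the task's memory policy. The node count, the two masks, and the per-node free counts are made-up example values.

#include <stdio.h>

#define EXAMPLE_NODES 4

int main(void)
{
	unsigned int free_on_node[EXAMPLE_NODES] = { 10, 4, 0, 7 };
	int cpuset_allowed[EXAMPLE_NODES]        = {  1, 1, 1, 0 };
	int mpol_allowed[EXAMPLE_NODES]          = {  1, 0, 1, 1 };
	unsigned int nr = 0;

	for (int node = 0; node < EXAMPLE_NODES; node++)
		if (cpuset_allowed[node] && mpol_allowed[node])
			nr += free_on_node[node];

	printf("usable free huge pages: %u\n", nr);	/* nodes 0 and 2: 10 + 0 */
	return 0;
}
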
....@@ -2982,7 +3454,7 @@
29823454
29833455 static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
29843456 struct ctl_table *table, int write,
2985
- void __user *buffer, size_t *length, loff_t *ppos)
3457
+ void *buffer, size_t *length, loff_t *ppos)
29863458 {
29873459 struct hstate *h = &default_hstate;
29883460 unsigned long tmp = h->max_huge_pages;
....@@ -3004,7 +3476,7 @@
30043476 }
30053477
30063478 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
3007
- void __user *buffer, size_t *length, loff_t *ppos)
3479
+ void *buffer, size_t *length, loff_t *ppos)
30083480 {
30093481
30103482 return hugetlb_sysctl_handler_common(false, table, write,
....@@ -3013,7 +3485,7 @@
30133485
30143486 #ifdef CONFIG_NUMA
30153487 int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
3016
- void __user *buffer, size_t *length, loff_t *ppos)
3488
+ void *buffer, size_t *length, loff_t *ppos)
30173489 {
30183490 return hugetlb_sysctl_handler_common(true, table, write,
30193491 buffer, length, ppos);
....@@ -3021,8 +3493,7 @@
30213493 #endif /* CONFIG_NUMA */
30223494
30233495 int hugetlb_overcommit_handler(struct ctl_table *table, int write,
3024
- void __user *buffer,
3025
- size_t *length, loff_t *ppos)
3496
+ void *buffer, size_t *length, loff_t *ppos)
30263497 {
30273498 struct hstate *h = &default_hstate;
30283499 unsigned long tmp;
....@@ -3082,18 +3553,20 @@
30823553 seq_printf(m, "Hugetlb: %8lu kB\n", total / 1024);
30833554 }
30843555
3085
-int hugetlb_report_node_meminfo(int nid, char *buf)
3556
+int hugetlb_report_node_meminfo(char *buf, int len, int nid)
30863557 {
30873558 struct hstate *h = &default_hstate;
3559
+
30883560 if (!hugepages_supported())
30893561 return 0;
3090
- return sprintf(buf,
3091
- "Node %d HugePages_Total: %5u\n"
3092
- "Node %d HugePages_Free: %5u\n"
3093
- "Node %d HugePages_Surp: %5u\n",
3094
- nid, h->nr_huge_pages_node[nid],
3095
- nid, h->free_huge_pages_node[nid],
3096
- nid, h->surplus_huge_pages_node[nid]);
3562
+
3563
+ return sysfs_emit_at(buf, len,
3564
+ "Node %d HugePages_Total: %5u\n"
3565
+ "Node %d HugePages_Free: %5u\n"
3566
+ "Node %d HugePages_Surp: %5u\n",
3567
+ nid, h->nr_huge_pages_node[nid],
3568
+ nid, h->free_huge_pages_node[nid],
3569
+ nid, h->surplus_huge_pages_node[nid]);
30973570 }
30983571
30993572 void hugetlb_show_meminfo(void)
....@@ -3152,12 +3625,18 @@
31523625 * we fall back to check against current free page availability as
31533626 * a best attempt and hopefully to minimize the impact of changing
31543627 * semantics that cpuset has.
3628
+ *
3629
+ * Apart from cpuset, the memory policy mechanism also
3630
+ * determines from which node the kernel will allocate memory
3631
+ * in a NUMA system. So, similar to cpuset, we should also
3632
+ * consider the memory policy of the current task, just as
3633
+ * described above.
31553634 */
31563635 if (delta > 0) {
31573636 if (gather_surplus_pages(h, delta) < 0)
31583637 goto out;
31593638
3160
- if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
3639
+ if (delta > allowed_mems_nr(h)) {
31613640 return_unused_surplus_pages(h, delta);
31623641 goto out;
31633642 }
....@@ -3184,8 +3663,10 @@
31843663 * after this open call completes. It is therefore safe to take a
31853664 * new reference here without additional locking.
31863665 */
3187
- if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
3666
+ if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
3667
+ resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
31883668 kref_get(&resv->refs);
3669
+ }
31893670 }
31903671
31913672 static void hugetlb_vm_op_close(struct vm_area_struct *vma)
....@@ -3203,9 +3684,7 @@
32033684 end = vma_hugecache_offset(h, vma, vma->vm_end);
32043685
32053686 reserve = (end - start) - region_count(resv, start, end);
3206
-
3207
- kref_put(&resv->refs, resv_map_release);
3208
-
3687
+ hugetlb_cgroup_uncharge_counter(resv, start, end);
32093688 if (reserve) {
32103689 /*
32113690 * Decrement reserve counts. The global reserve count may be
....@@ -3214,12 +3693,33 @@
32143693 gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
32153694 hugetlb_acct_memory(h, -gbl_reserve);
32163695 }
3696
+
3697
+ kref_put(&resv->refs, resv_map_release);
32173698 }
32183699
32193700 static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
32203701 {
32213702 if (addr & ~(huge_page_mask(hstate_vma(vma))))
32223703 return -EINVAL;
3704
+
3705
+ /*
3706
+ * PMD sharing is only possible for PUD_SIZE-aligned address ranges
3707
+ * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
3708
+ * split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
3709
+ */
3710
+ if (addr & ~PUD_MASK) {
3711
+ /*
3712
+ * hugetlb_vm_op_split is called right before we attempt to
3713
+ * split the VMA. We will need to unshare PMDs in the old and
3714
+ * new VMAs, so let's unshare before we split.
3715
+ */
3716
+ unsigned long floor = addr & PUD_MASK;
3717
+ unsigned long ceil = floor + PUD_SIZE;
3718
+
3719
+ if (floor >= vma->vm_start && ceil <= vma->vm_end)
3720
+ hugetlb_unshare_pmds(vma, floor, ceil);
3721
+ }
3722
+
32233723 return 0;
32243724 }
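
A worked example of the unshare window computed in hugetlb_vm_op_split() above, assuming x86_64-style sizes (PUD_SIZE = 1 GiB); the split address is an arbitrary illustration.

#include <stdio.h>

#define PUD_SIZE (1UL << 30)
#define PUD_MASK (~(PUD_SIZE - 1))

int main(void)
{
	unsigned long addr  = 0x40200000UL;	/* split point, not 1 GiB aligned */
	unsigned long floor = addr & PUD_MASK;	/* 0x40000000 */
	unsigned long ceil  = floor + PUD_SIZE;	/* 0x80000000 */

	printf("unshare PMDs in [%#lx, %#lx)\n", floor, ceil);
	return 0;
}
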
32253725
....@@ -3293,23 +3793,23 @@
32933793 if (huge_pte_none(pte) || pte_present(pte))
32943794 return false;
32953795 swp = pte_to_swp_entry(pte);
3296
- if (non_swap_entry(swp) && is_migration_entry(swp))
3796
+ if (is_migration_entry(swp))
32973797 return true;
32983798 else
32993799 return false;
33003800 }
33013801
3302
-static int is_hugetlb_entry_hwpoisoned(pte_t pte)
3802
+static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
33033803 {
33043804 swp_entry_t swp;
33053805
33063806 if (huge_pte_none(pte) || pte_present(pte))
3307
- return 0;
3807
+ return false;
33083808 swp = pte_to_swp_entry(pte);
3309
- if (non_swap_entry(swp) && is_hwpoison_entry(swp))
3310
- return 1;
3809
+ if (is_hwpoison_entry(swp))
3810
+ return true;
33113811 else
3312
- return 0;
3812
+ return false;
33133813 }
33143814
33153815 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
....@@ -3321,23 +3821,33 @@
33213821 int cow;
33223822 struct hstate *h = hstate_vma(vma);
33233823 unsigned long sz = huge_page_size(h);
3324
- unsigned long mmun_start; /* For mmu_notifiers */
3325
- unsigned long mmun_end; /* For mmu_notifiers */
3824
+ struct address_space *mapping = vma->vm_file->f_mapping;
3825
+ struct mmu_notifier_range range;
33263826 int ret = 0;
33273827
33283828 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
33293829
3330
- mmun_start = vma->vm_start;
3331
- mmun_end = vma->vm_end;
3332
- if (cow)
3333
- mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
3830
+ if (cow) {
3831
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
3832
+ vma->vm_start,
3833
+ vma->vm_end);
3834
+ mmu_notifier_invalidate_range_start(&range);
3835
+ } else {
3836
+ /*
3837
+ * For shared mappings i_mmap_rwsem must be held to call
3838
+ * huge_pte_alloc, otherwise the returned ptep could go
3839
+ * away if part of a shared pmd and another thread calls
3840
+ * huge_pmd_unshare.
3841
+ */
3842
+ i_mmap_lock_read(mapping);
3843
+ }
33343844
33353845 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
33363846 spinlock_t *src_ptl, *dst_ptl;
33373847 src_pte = huge_pte_offset(src, addr, sz);
33383848 if (!src_pte)
33393849 continue;
3340
- dst_pte = huge_pte_alloc(dst, addr, sz);
3850
+ dst_pte = huge_pte_alloc(dst, vma, addr, sz);
33413851 if (!dst_pte) {
33423852 ret = -ENOMEM;
33433853 break;
....@@ -3406,7 +3916,9 @@
34063916 }
34073917
34083918 if (cow)
3409
- mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
3919
+ mmu_notifier_invalidate_range_end(&range);
3920
+ else
3921
+ i_mmap_unlock_read(mapping);
34103922
34113923 return ret;
34123924 }
....@@ -3423,8 +3935,7 @@
34233935 struct page *page;
34243936 struct hstate *h = hstate_vma(vma);
34253937 unsigned long sz = huge_page_size(h);
3426
- unsigned long mmun_start = start; /* For mmu_notifiers */
3427
- unsigned long mmun_end = end; /* For mmu_notifiers */
3938
+ struct mmu_notifier_range range;
34283939 bool force_flush = false;
34293940
34303941 WARN_ON(!is_vm_hugetlb_page(vma));
....@@ -3435,14 +3946,16 @@
34353946 * This is a hugetlb vma, all the pte entries should point
34363947 * to huge page.
34373948 */
3438
- tlb_remove_check_page_size_change(tlb, sz);
3949
+ tlb_change_page_size(tlb, sz);
34393950 tlb_start_vma(tlb, vma);
34403951
34413952 /*
34423953 * If sharing possible, alert mmu notifiers of worst case.
34433954 */
3444
- adjust_range_if_pmd_sharing_possible(vma, &mmun_start, &mmun_end);
3445
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
3955
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
3956
+ end);
3957
+ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
3958
+ mmu_notifier_invalidate_range_start(&range);
34463959 address = start;
34473960 for (; address < end; address += sz) {
34483961 ptep = huge_pte_offset(mm, address, sz);
....@@ -3450,7 +3963,7 @@
34503963 continue;
34513964
34523965 ptl = huge_pte_lock(h, mm, ptep);
3453
- if (huge_pmd_unshare(mm, &address, ptep)) {
3966
+ if (huge_pmd_unshare(mm, vma, &address, ptep)) {
34543967 spin_unlock(ptl);
34553968 tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
34563969 force_flush = true;
....@@ -3508,7 +4021,7 @@
35084021 if (ref_page)
35094022 break;
35104023 }
3511
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
4024
+ mmu_notifier_invalidate_range_end(&range);
35124025 tlb_end_vma(tlb, vma);
35134026
35144027 /*
....@@ -3642,9 +4155,8 @@
36424155 struct page *old_page, *new_page;
36434156 int outside_reserve = 0;
36444157 vm_fault_t ret = 0;
3645
- unsigned long mmun_start; /* For mmu_notifiers */
3646
- unsigned long mmun_end; /* For mmu_notifiers */
36474158 unsigned long haddr = address & huge_page_mask(h);
4159
+ struct mmu_notifier_range range;
36484160
36494161 pte = huge_ptep_get(ptep);
36504162 old_page = pte_page(pte);
....@@ -3689,10 +4201,30 @@
36894201 * may get SIGKILLed if it later faults.
36904202 */
36914203 if (outside_reserve) {
4204
+ struct address_space *mapping = vma->vm_file->f_mapping;
4205
+ pgoff_t idx;
4206
+ u32 hash;
4207
+
36924208 put_page(old_page);
36934209 BUG_ON(huge_pte_none(pte));
4210
+ /*
4211
+ * Drop hugetlb_fault_mutex and i_mmap_rwsem before
4212
+ * unmapping. unmapping needs to hold i_mmap_rwsem
4213
+ * in write mode. Dropping i_mmap_rwsem in read mode
4214
+ * here is OK as COW mappings do not interact with
4215
+ * PMD sharing.
4216
+ *
4217
+ * Reacquire both after unmap operation.
4218
+ */
4219
+ idx = vma_hugecache_offset(h, vma, haddr);
4220
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
4221
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
4222
+ i_mmap_unlock_read(mapping);
4223
+
36944224 unmap_ref_private(mm, vma, old_page, haddr);
3695
- BUG_ON(huge_pte_none(pte));
4225
+
4226
+ i_mmap_lock_read(mapping);
4227
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
36964228 spin_lock(ptl);
36974229 ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
36984230 if (likely(ptep &&
....@@ -3722,9 +4254,9 @@
37224254 pages_per_huge_page(h));
37234255 __SetPageUptodate(new_page);
37244256
3725
- mmun_start = haddr;
3726
- mmun_end = mmun_start + huge_page_size(h);
3727
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
4257
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
4258
+ haddr + huge_page_size(h));
4259
+ mmu_notifier_invalidate_range_start(&range);
37284260
37294261 /*
37304262 * Retake the page table lock to check for racing updates
....@@ -3737,7 +4269,7 @@
37374269
37384270 /* Break COW */
37394271 huge_ptep_clear_flush(vma, haddr, ptep);
3740
- mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
4272
+ mmu_notifier_invalidate_range(mm, range.start, range.end);
37414273 set_huge_pte_at(mm, haddr, ptep,
37424274 make_huge_pte(vma, new_page, 1));
37434275 page_remove_rmap(old_page, true);
....@@ -3747,7 +4279,7 @@
37474279 new_page = old_page;
37484280 }
37494281 spin_unlock(ptl);
3750
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
4282
+ mmu_notifier_invalidate_range_end(&range);
37514283 out_release_all:
37524284 restore_reserve_on_error(h, vma, haddr, new_page);
37534285 put_page(new_page);
....@@ -3814,6 +4346,38 @@
38144346 return 0;
38154347 }
38164348
4349
+static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
4350
+ struct address_space *mapping,
4351
+ pgoff_t idx,
4352
+ unsigned int flags,
4353
+ unsigned long haddr,
4354
+ unsigned long reason)
4355
+{
4356
+ u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
4357
+ struct vm_fault vmf = {
4358
+ .vma = vma,
4359
+ .address = haddr,
4360
+ .flags = flags,
4361
+ /*
4362
+ * Hard to debug if it ends up being
4363
+ * used by a callee that assumes
4364
+ * something about the other
4365
+ * uninitialized fields... same as in
4366
+ * memory.c
4367
+ */
4368
+ };
4369
+
4370
+ /*
4371
+ * vma_lock and hugetlb_fault_mutex must be dropped
4372
+ * before handling userfault. Also, mmap_lock will be
4373
+ * dropped while handling userfault, so any vma
4374
+ * operation after this point must be careful.
4375
+ */
4376
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
4377
+ i_mmap_unlock_read(mapping);
4378
+ return handle_userfault(&vmf, VM_UFFD_MISSING);
4379
+}
4380
+
38174381 static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
38184382 struct vm_area_struct *vma,
38194383 struct address_space *mapping, pgoff_t idx,
....@@ -3828,6 +4392,7 @@
38284392 spinlock_t *ptl;
38294393 unsigned long haddr = address & huge_page_mask(h);
38304394 bool new_page = false;
4395
+ u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
38314396
38324397 /*
38334398 * Currently, we are forced to kill the process in the event the
....@@ -3837,52 +4402,50 @@
38374402 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
38384403 pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
38394404 current->pid);
3840
- return ret;
4405
+ goto out;
38414406 }
38424407
38434408 /*
3844
- * Use page lock to guard against racing truncation
3845
- * before we get page_table_lock.
4409
+ * We can not race with truncation due to holding i_mmap_rwsem.
4410
+ * i_size is modified when holding i_mmap_rwsem, so check here
4411
+ * once for faults beyond end of file.
38464412 */
4413
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
4414
+ if (idx >= size)
4415
+ goto out;
4416
+
38474417 retry:
38484418 page = find_lock_page(mapping, idx);
38494419 if (!page) {
3850
- size = i_size_read(mapping->host) >> huge_page_shift(h);
3851
- if (idx >= size)
3852
- goto out;
3853
-
3854
- /*
3855
- * Check for page in userfault range
3856
- */
4420
+ /* Check for page in userfault range */
38574421 if (userfaultfd_missing(vma)) {
3858
- u32 hash;
3859
- struct vm_fault vmf = {
3860
- .vma = vma,
3861
- .address = haddr,
3862
- .flags = flags,
3863
- /*
3864
- * Hard to debug if it ends up being
3865
- * used by a callee that assumes
3866
- * something about the other
3867
- * uninitialized fields... same as in
3868
- * memory.c
3869
- */
3870
- };
3871
-
3872
- /*
3873
- * hugetlb_fault_mutex must be dropped before
3874
- * handling userfault. Reacquire after handling
3875
- * fault to make calling code simpler.
3876
- */
3877
- hash = hugetlb_fault_mutex_hash(h, mapping, idx);
3878
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
3879
- ret = handle_userfault(&vmf, VM_UFFD_MISSING);
3880
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
4422
+ ret = hugetlb_handle_userfault(vma, mapping, idx,
4423
+ flags, haddr,
4424
+ VM_UFFD_MISSING);
38814425 goto out;
38824426 }
38834427
38844428 page = alloc_huge_page(vma, haddr, 0);
38854429 if (IS_ERR(page)) {
4430
+ /*
4431
+ * Returning error will result in faulting task being
4432
+ * sent SIGBUS. The hugetlb fault mutex prevents two
4433
+ * tasks from racing to fault in the same page which
4434
+ * could result in false unable to allocate errors.
4435
+ * Page migration does not take the fault mutex, but
4436
+ * does a clear then write of pte's under page table
4437
+ * lock. Page fault code could race with migration,
4438
+ * notice the clear pte and try to allocate a page
4439
+ * here. Before returning error, get ptl and make
4440
+ * sure there really is no pte entry.
4441
+ */
4442
+ ptl = huge_pte_lock(h, mm, ptep);
4443
+ if (!huge_pte_none(huge_ptep_get(ptep))) {
4444
+ ret = 0;
4445
+ spin_unlock(ptl);
4446
+ goto out;
4447
+ }
4448
+ spin_unlock(ptl);
38864449 ret = vmf_error(PTR_ERR(page));
38874450 goto out;
38884451 }
....@@ -3917,6 +4480,16 @@
39174480 VM_FAULT_SET_HINDEX(hstate_index(h));
39184481 goto backout_unlocked;
39194482 }
4483
+
4484
+ /* Check for page in userfault range. */
4485
+ if (userfaultfd_minor(vma)) {
4486
+ unlock_page(page);
4487
+ put_page(page);
4488
+ ret = hugetlb_handle_userfault(vma, mapping, idx,
4489
+ flags, haddr,
4490
+ VM_UFFD_MINOR);
4491
+ goto out;
4492
+ }
39204493 }
39214494
39224495 /*
....@@ -3935,10 +4508,6 @@
39354508 }
39364509
39374510 ptl = huge_pte_lock(h, mm, ptep);
3938
- size = i_size_read(mapping->host) >> huge_page_shift(h);
3939
- if (idx >= size)
3940
- goto backout;
3941
-
39424511 ret = 0;
39434512 if (!huge_pte_none(huge_ptep_get(ptep)))
39444513 goto backout;
....@@ -3970,6 +4539,8 @@
39704539
39714540 unlock_page(page);
39724541 out:
4542
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
4543
+ i_mmap_unlock_read(mapping);
39734544 return ret;
39744545
39754546 backout:
....@@ -3982,8 +4553,7 @@
39824553 }
39834554
39844555 #ifdef CONFIG_SMP
3985
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
3986
- pgoff_t idx)
4556
+u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
39874557 {
39884558 unsigned long key[2];
39894559 u32 hash;
....@@ -4000,8 +4570,7 @@
40004570 * For uniprocesor systems we always use a single mutex, so just
40014571 * return 0 and avoid the hashing overhead.
40024572 */
4003
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
4004
- pgoff_t idx)
4573
+u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
40054574 {
40064575 return 0;
40074576 }
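
A sketch of how hugetlb_fault_mutex_hash() maps a (mapping, index) pair to a mutex slot so that concurrent faults on the same page serialize on the same lock. The table size and the mixing step are assumptions standing in for the kernel's jhash-based implementation.

#include <stdio.h>
#include <stdint.h>

#define NUM_FAULT_MUTEXES 64	/* assumed, must be a power of two */

static uint32_t fault_mutex_hash(const void *mapping, unsigned long idx)
{
	unsigned long key[2] = { (unsigned long)mapping, idx };
	uint32_t hash = (uint32_t)(key[0] * 2654435761UL) ^ (uint32_t)key[1];

	return hash & (NUM_FAULT_MUTEXES - 1);
}

int main(void)
{
	static int dummy_mapping;

	printf("mutex slot: %u\n", fault_mutex_hash(&dummy_mapping, 42));
	return 0;
}
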
....@@ -4024,6 +4593,11 @@
40244593
40254594 ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
40264595 if (ptep) {
4596
+ /*
4597
+ * Since we hold no locks, ptep could be stale. That is
4598
+ * OK as we are only making decisions based on content and
4599
+ * not actually modifying content here.
4600
+ */
40274601 entry = huge_ptep_get(ptep);
40284602 if (unlikely(is_hugetlb_entry_migration(entry))) {
40294603 migration_entry_wait_huge(vma, mm, ptep);
....@@ -4031,37 +4605,52 @@
40314605 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
40324606 return VM_FAULT_HWPOISON_LARGE |
40334607 VM_FAULT_SET_HINDEX(hstate_index(h));
4034
- } else {
4035
- ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
4036
- if (!ptep)
4037
- return VM_FAULT_OOM;
40384608 }
40394609
4610
+ /*
4611
+ * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
4612
+ * until finished with ptep. This serves two purposes:
4613
+ * 1) It prevents huge_pmd_unshare from being called elsewhere
4614
+ * and making the ptep no longer valid.
4615
+ * 2) It synchronizes us with i_size modifications during truncation.
4616
+ *
4617
+ * ptep could have already been assigned via huge_pte_offset. That
4618
+ * is OK, as huge_pte_alloc will return the same value unless
4619
+ * something has changed.
4620
+ */
40404621 mapping = vma->vm_file->f_mapping;
4041
- idx = vma_hugecache_offset(h, vma, haddr);
4622
+ i_mmap_lock_read(mapping);
4623
+ ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
4624
+ if (!ptep) {
4625
+ i_mmap_unlock_read(mapping);
4626
+ return VM_FAULT_OOM;
4627
+ }
40424628
40434629 /*
40444630 * Serialize hugepage allocation and instantiation, so that we don't
40454631 * get spurious allocation failures if two CPUs race to instantiate
40464632 * the same page in the page cache.
40474633 */
4048
- hash = hugetlb_fault_mutex_hash(h, mapping, idx);
4634
+ idx = vma_hugecache_offset(h, vma, haddr);
4635
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
40494636 mutex_lock(&hugetlb_fault_mutex_table[hash]);
40504637
40514638 entry = huge_ptep_get(ptep);
4052
- if (huge_pte_none(entry)) {
4053
- ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
4054
- goto out_mutex;
4055
- }
4639
+ if (huge_pte_none(entry))
4640
+ /*
4641
+ * hugetlb_no_page will drop vma lock and hugetlb fault
4642
+ * mutex internally, so we must return immediately.
4643
+ */
4644
+ return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
40564645
40574646 ret = 0;
40584647
40594648 /*
40604649 * entry could be a migration/hwpoison entry at this point, so this
40614650 * check prevents the kernel from going below assuming that we have
4062
- * a active hugepage in pagecache. This goto expects the 2nd page fault,
4063
- * and is_hugetlb_entry_(migration|hwpoisoned) check will properly
4064
- * handle it.
4651
+ * an active hugepage in pagecache. This goto expects the 2nd page
4652
+ * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
4653
+ * properly handle it.
40654654 */
40664655 if (!pte_present(entry))
40674656 goto out_mutex;
....@@ -4132,6 +4721,7 @@
41324721 }
41334722 out_mutex:
41344723 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
4724
+ i_mmap_unlock_read(mapping);
41354725 /*
41364726 * Generally it's safe to hold refcount during waiting page lock. But
41374727 * here we just wait to defer the next page fault to avoid busy loop and
....@@ -4144,6 +4734,7 @@
41444734 return ret;
41454735 }
41464736
4737
+#ifdef CONFIG_USERFAULTFD
41474738 /*
41484739 * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with
41494740 * modifications for huge pages.
....@@ -4153,8 +4744,10 @@
41534744 struct vm_area_struct *dst_vma,
41544745 unsigned long dst_addr,
41554746 unsigned long src_addr,
4747
+ enum mcopy_atomic_mode mode,
41564748 struct page **pagep)
41574749 {
4750
+ bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
41584751 struct address_space *mapping;
41594752 pgoff_t idx;
41604753 unsigned long size;
....@@ -4164,8 +4757,17 @@
41644757 spinlock_t *ptl;
41654758 int ret;
41664759 struct page *page;
4760
+ int writable;
41674761
4168
- if (!*pagep) {
4762
+ mapping = dst_vma->vm_file->f_mapping;
4763
+ idx = vma_hugecache_offset(h, dst_vma, dst_addr);
4764
+
4765
+ if (is_continue) {
4766
+ ret = -EFAULT;
4767
+ page = find_lock_page(mapping, idx);
4768
+ if (!page)
4769
+ goto out;
4770
+ } else if (!*pagep) {
41694771 /* If a page already exists, then it's UFFDIO_COPY for
41704772 * a non-missing case. Return -EEXIST.
41714773 */
....@@ -4185,7 +4787,7 @@
41854787 (const void __user *) src_addr,
41864788 pages_per_huge_page(h), false);
41874789
4188
- /* fallback to copy_from_user outside mmap_sem */
4790
+ /* fallback to copy_from_user outside mmap_lock */
41894791 if (unlikely(ret)) {
41904792 ret = -ENOENT;
41914793 *pagep = page;
....@@ -4204,13 +4806,8 @@
42044806 */
42054807 __SetPageUptodate(page);
42064808
4207
- mapping = dst_vma->vm_file->f_mapping;
4208
- idx = vma_hugecache_offset(h, dst_vma, dst_addr);
4209
-
4210
- /*
4211
- * If shared, add to page cache
4212
- */
4213
- if (vm_shared) {
4809
+ /* Add shared, newly allocated pages to the page cache. */
4810
+ if (vm_shared && !is_continue) {
42144811 size = i_size_read(mapping->host) >> huge_page_shift(h);
42154812 ret = -EFAULT;
42164813 if (idx >= size)
....@@ -4255,8 +4852,14 @@
42554852 hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
42564853 }
42574854
4258
- _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
4259
- if (dst_vma->vm_flags & VM_WRITE)
4855
+ /* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
4856
+ if (is_continue && !vm_shared)
4857
+ writable = 0;
4858
+ else
4859
+ writable = dst_vma->vm_flags & VM_WRITE;
4860
+
4861
+ _dst_pte = make_huge_pte(dst_vma, page, writable);
4862
+ if (writable)
42604863 _dst_pte = huge_pte_mkdirty(_dst_pte);
42614864 _dst_pte = pte_mkyoung(_dst_pte);
42624865
....@@ -4270,25 +4873,27 @@
42704873 update_mmu_cache(dst_vma, dst_addr, dst_pte);
42714874
42724875 spin_unlock(ptl);
4273
- set_page_huge_active(page);
4274
- if (vm_shared)
4876
+ if (!is_continue)
4877
+ set_page_huge_active(page);
4878
+ if (vm_shared || is_continue)
42754879 unlock_page(page);
42764880 ret = 0;
42774881 out:
42784882 return ret;
42794883 out_release_unlock:
42804884 spin_unlock(ptl);
4281
- if (vm_shared)
4885
+ if (vm_shared || is_continue)
42824886 unlock_page(page);
42834887 out_release_nounlock:
42844888 put_page(page);
42854889 goto out;
42864890 }
4891
+#endif /* CONFIG_USERFAULTFD */
42874892
42884893 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
42894894 struct page **pages, struct vm_area_struct **vmas,
42904895 unsigned long *position, unsigned long *nr_pages,
4291
- long i, unsigned int flags, int *nonblocking)
4896
+ long i, unsigned int flags, int *locked)
42924897 {
42934898 unsigned long pfn_offset;
42944899 unsigned long vaddr = *position;
....@@ -4306,7 +4911,7 @@
43064911 * If we have a pending SIGKILL, don't keep faulting pages and
43074912 * potentially allocating memory.
43084913 */
4309
- if (unlikely(fatal_signal_pending(current))) {
4914
+ if (fatal_signal_pending(current)) {
43104915 remainder = 0;
43114916 break;
43124917 }
....@@ -4359,14 +4964,17 @@
43594964 spin_unlock(ptl);
43604965 if (flags & FOLL_WRITE)
43614966 fault_flags |= FAULT_FLAG_WRITE;
4362
- if (nonblocking)
4363
- fault_flags |= FAULT_FLAG_ALLOW_RETRY;
4967
+ if (locked)
4968
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY |
4969
+ FAULT_FLAG_KILLABLE;
43644970 if (flags & FOLL_NOWAIT)
43654971 fault_flags |= FAULT_FLAG_ALLOW_RETRY |
43664972 FAULT_FLAG_RETRY_NOWAIT;
43674973 if (flags & FOLL_TRIED) {
4368
- VM_WARN_ON_ONCE(fault_flags &
4369
- FAULT_FLAG_ALLOW_RETRY);
4974
+ /*
4975
+ * Note: FAULT_FLAG_ALLOW_RETRY and
4976
+ * FAULT_FLAG_TRIED can co-exist
4977
+ */
43704978 fault_flags |= FAULT_FLAG_TRIED;
43714979 }
43724980 ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
....@@ -4376,9 +4984,9 @@
43764984 break;
43774985 }
43784986 if (ret & VM_FAULT_RETRY) {
4379
- if (nonblocking &&
4987
+ if (locked &&
43804988 !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
4381
- *nonblocking = 0;
4989
+ *locked = 0;
43824990 *nr_pages = 0;
43834991 /*
43844992 * VM_FAULT_RETRY must not return an
....@@ -4398,21 +5006,38 @@
43985006 page = pte_page(huge_ptep_get(pte));
43995007
44005008 /*
4401
- * Instead of doing 'try_get_page()' below in the same_page
4402
- * loop, just check the count once here.
5009
+ * If subpage information is not requested, update counters
5010
+ * and skip the same_page loop below.
44035011 */
4404
- if (unlikely(page_count(page) <= 0)) {
4405
- if (pages) {
5012
+ if (!pages && !vmas && !pfn_offset &&
5013
+ (vaddr + huge_page_size(h) < vma->vm_end) &&
5014
+ (remainder >= pages_per_huge_page(h))) {
5015
+ vaddr += huge_page_size(h);
5016
+ remainder -= pages_per_huge_page(h);
5017
+ i += pages_per_huge_page(h);
5018
+ spin_unlock(ptl);
5019
+ continue;
5020
+ }
5021
+
5022
+same_page:
5023
+ if (pages) {
5024
+ pages[i] = mem_map_offset(page, pfn_offset);
5025
+ /*
5026
+ * try_grab_page() should always succeed here, because:
5027
+ * a) we hold the ptl lock, and b) we've just checked
5028
+ * that the huge page is present in the page tables. If
5029
+ * the huge page is present, then the tail pages must
5030
+ * also be present. The ptl prevents the head page and
5031
+ * tail pages from being rearranged in any way. So this
5032
+ * page must be available at this point, unless the page
5033
+ * refcount overflowed:
5034
+ */
5035
+ if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) {
44065036 spin_unlock(ptl);
44075037 remainder = 0;
44085038 err = -ENOMEM;
44095039 break;
44105040 }
4411
- }
4412
-same_page:
4413
- if (pages) {
4414
- pages[i] = mem_map_offset(page, pfn_offset);
4415
- get_page(pages[i]);
44165041 }
44175042
44185043 if (vmas)
....@@ -4443,14 +5068,6 @@
44435068 return i ? i : err;
44445069 }
44455070
4446
-#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
4447
-/*
4448
- * ARCHes with special requirements for evicting HUGETLB backing TLB entries can
4449
- * implement this.
4450
- */
4451
-#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
4452
-#endif
4453
-
44545071 unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
44555072 unsigned long address, unsigned long end, pgprot_t newprot)
44565073 {
....@@ -4460,21 +5077,22 @@
44605077 pte_t pte;
44615078 struct hstate *h = hstate_vma(vma);
44625079 unsigned long pages = 0;
4463
- unsigned long f_start = start;
4464
- unsigned long f_end = end;
44655080 bool shared_pmd = false;
5081
+ struct mmu_notifier_range range;
44665082
44675083 /*
44685084 * In the case of shared PMDs, the area to flush could be beyond
4469
- * start/end. Set f_start/f_end to cover the maximum possible
5085
+ * start/end. Set range.start/range.end to cover the maximum possible
44705086 * range if PMD sharing is possible.
44715087 */
4472
- adjust_range_if_pmd_sharing_possible(vma, &f_start, &f_end);
5088
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
5089
+ 0, vma, mm, start, end);
5090
+ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
44735091
44745092 BUG_ON(address >= end);
4475
- flush_cache_range(vma, f_start, f_end);
5093
+ flush_cache_range(vma, range.start, range.end);
44765094
4477
- mmu_notifier_invalidate_range_start(mm, f_start, f_end);
5095
+ mmu_notifier_invalidate_range_start(&range);
44785096 i_mmap_lock_write(vma->vm_file->f_mapping);
44795097 for (; address < end; address += huge_page_size(h)) {
44805098 spinlock_t *ptl;
....@@ -4482,7 +5100,7 @@
44825100 if (!ptep)
44835101 continue;
44845102 ptl = huge_pte_lock(h, mm, ptep);
4485
- if (huge_pmd_unshare(mm, &address, ptep)) {
5103
+ if (huge_pmd_unshare(mm, vma, &address, ptep)) {
44865104 pages++;
44875105 spin_unlock(ptl);
44885106 shared_pmd = true;
....@@ -4509,10 +5127,12 @@
45095127 continue;
45105128 }
45115129 if (!huge_pte_none(pte)) {
4512
- pte = huge_ptep_get_and_clear(mm, address, ptep);
4513
- pte = pte_mkhuge(huge_pte_modify(pte, newprot));
5130
+ pte_t old_pte;
5131
+
5132
+ old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
5133
+ pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
45145134 pte = arch_make_huge_pte(pte, vma, NULL, 0);
4515
- set_huge_pte_at(mm, address, ptep, pte);
5135
+ huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
45165136 pages++;
45175137 }
45185138 spin_unlock(ptl);
....@@ -4525,7 +5145,7 @@
45255145 * did unshare a page of pmds, flush the range corresponding to the pud.
45265146 */
45275147 if (shared_pmd)
4528
- flush_hugetlb_tlb_range(vma, f_start, f_end);
5148
+ flush_hugetlb_tlb_range(vma, range.start, range.end);
45295149 else
45305150 flush_hugetlb_tlb_range(vma, start, end);
45315151 /*
....@@ -4535,7 +5155,7 @@
45355155 * See Documentation/vm/mmu_notifier.rst
45365156 */
45375157 i_mmap_unlock_write(vma->vm_file->f_mapping);
4538
- mmu_notifier_invalidate_range_end(mm, f_start, f_end);
5158
+ mmu_notifier_invalidate_range_end(&range);
45395159
45405160 return pages << h->order;
45415161 }
....@@ -4545,11 +5165,12 @@
45455165 struct vm_area_struct *vma,
45465166 vm_flags_t vm_flags)
45475167 {
4548
- long ret, chg;
5168
+ long ret, chg, add = -1;
45495169 struct hstate *h = hstate_inode(inode);
45505170 struct hugepage_subpool *spool = subpool_inode(inode);
45515171 struct resv_map *resv_map;
4552
- long gbl_reserve;
5172
+ struct hugetlb_cgroup *h_cg = NULL;
5173
+ long gbl_reserve, regions_needed = 0;
45535174
45545175 /* This should never happen */
45555176 if (from > to) {
....@@ -4572,11 +5193,17 @@
45725193 * called to make the mapping read-write. Assume !vma is a shm mapping
45735194 */
45745195 if (!vma || vma->vm_flags & VM_MAYSHARE) {
5196
+ /*
5197
+ * resv_map can not be NULL as hugetlb_reserve_pages is only
5198
+ * called for inodes for which resv_maps were created (see
5199
+ * hugetlbfs_get_inode).
5200
+ */
45755201 resv_map = inode_resv_map(inode);
45765202
4577
- chg = region_chg(resv_map, from, to);
5203
+ chg = region_chg(resv_map, from, to, &regions_needed);
45785204
45795205 } else {
5206
+ /* Private mapping. */
45805207 resv_map = resv_map_alloc();
45815208 if (!resv_map)
45825209 return -ENOMEM;
....@@ -4592,6 +5219,21 @@
45925219 goto out_err;
45935220 }
45945221
5222
+ ret = hugetlb_cgroup_charge_cgroup_rsvd(
5223
+ hstate_index(h), chg * pages_per_huge_page(h), &h_cg);
5224
+
5225
+ if (ret < 0) {
5226
+ ret = -ENOMEM;
5227
+ goto out_err;
5228
+ }
5229
+
5230
+ if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
5231
+ /* For private mappings, the hugetlb_cgroup uncharge info hangs
5232
+ * off the resv_map.
5233
+ */
5234
+ resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h);
5235
+ }
5236
+
45955237 /*
45965238 * There must be enough pages in the subpool for the mapping. If
45975239 * the subpool has a minimum size, there may be some global
....@@ -4600,7 +5242,7 @@
46005242 gbl_reserve = hugepage_subpool_get_pages(spool, chg);
46015243 if (gbl_reserve < 0) {
46025244 ret = -ENOSPC;
4603
- goto out_err;
5245
+ goto out_uncharge_cgroup;
46045246 }
46055247
46065248 /*
....@@ -4609,9 +5251,7 @@
46095251 */
46105252 ret = hugetlb_acct_memory(h, gbl_reserve);
46115253 if (ret < 0) {
4612
- /* put back original number of pages, chg */
4613
- (void)hugepage_subpool_put_pages(spool, chg);
4614
- goto out_err;
5254
+ goto out_put_pages;
46155255 }
46165256
46175257 /*
....@@ -4626,9 +5266,13 @@
46265266 * else has to be done for private mappings here
46275267 */
46285268 if (!vma || vma->vm_flags & VM_MAYSHARE) {
4629
- long add = region_add(resv_map, from, to);
5269
+ add = region_add(resv_map, from, to, regions_needed, h, h_cg);
46305270
4631
- if (unlikely(chg > add)) {
5271
+ if (unlikely(add < 0)) {
5272
+ hugetlb_acct_memory(h, -gbl_reserve);
5273
+ ret = add;
5274
+ goto out_put_pages;
5275
+ } else if (unlikely(chg > add)) {
46325276 /*
46335277 * pages in this range were added to the reserve
46345278 * map between region_chg and region_add. This
....@@ -4638,17 +5282,41 @@
46385282 */
46395283 long rsv_adjust;
46405284
5285
+ /*
5286
+ * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the
5287
+ * reference to h_cg->css. See comment below for detail.
5288
+ */
5289
+ hugetlb_cgroup_uncharge_cgroup_rsvd(
5290
+ hstate_index(h),
5291
+ (chg - add) * pages_per_huge_page(h), h_cg);
5292
+
46415293 rsv_adjust = hugepage_subpool_put_pages(spool,
46425294 chg - add);
46435295 hugetlb_acct_memory(h, -rsv_adjust);
5296
+ } else if (h_cg) {
5297
+ /*
5298
+ * The file_regions will hold their own reference to
5299
+ * h_cg->css. So we should release the reference held
5300
+ * via hugetlb_cgroup_charge_cgroup_rsvd() when we are
5301
+ * done.
5302
+ */
5303
+ hugetlb_cgroup_put_rsvd_cgroup(h_cg);
46445304 }
46455305 }
46465306 return 0;
5307
+out_put_pages:
5308
+ /* put back original number of pages, chg */
5309
+ (void)hugepage_subpool_put_pages(spool, chg);
5310
+out_uncharge_cgroup:
5311
+ hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
5312
+ chg * pages_per_huge_page(h), h_cg);
46475313 out_err:
46485314 if (!vma || vma->vm_flags & VM_MAYSHARE)
4649
- /* Don't call region_abort if region_chg failed */
4650
- if (chg >= 0)
4651
- region_abort(resv_map, from, to);
5315
+ /* Only call region_abort if the region_chg succeeded but the
5316
+ * region_add failed or didn't run.
5317
+ */
5318
+ if (chg >= 0 && add < 0)
5319
+ region_abort(resv_map, from, to, regions_needed);
46525320 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
46535321 kref_put(&resv_map->refs, resv_map_release);
46545322 return ret;
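
Illustrative arithmetic for the window handled in hugetlb_reserve_pages() above: region_chg() sizes the reservation before the region is actually inserted, so if region_add() ends up covering fewer pages the surplus is handed back. The page counts are assumed example values.

#include <stdio.h>

int main(void)
{
	long chg = 8;	/* pages region_chg() estimated were needed */
	long add = 5;	/* pages region_add() actually added        */

	if (chg > add) {
		long rsv_adjust = chg - add;	/* returned to the subpool */
		printf("give back %ld reserved page(s)\n", rsv_adjust);
	}
	return 0;
}
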
....@@ -4663,6 +5331,10 @@
46635331 struct hugepage_subpool *spool = subpool_inode(inode);
46645332 long gbl_reserve;
46655333
5334
+ /*
5335
+ * Since this routine can be called in the evict inode path for all
5336
+ * hugetlbfs inodes, resv_map could be NULL.
5337
+ */
46665338 if (resv_map) {
46675339 chg = region_del(resv_map, start, end);
46685340 /*
....@@ -4727,6 +5399,15 @@
47275399 return false;
47285400 }
47295401
5402
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
5403
+{
5404
+#ifdef CONFIG_USERFAULTFD
5405
+ if (uffd_disable_huge_pmd_share(vma))
5406
+ return false;
5407
+#endif
5408
+ return vma_shareable(vma, addr);
5409
+}
5410
+
47305411 /*
47315412 * Determine if start,end range within vma could be mapped by shared pmd.
47325413 * If yes, adjust start and end to cover range associated with possible
....@@ -4758,14 +5439,22 @@
47585439 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
47595440 * and returns the corresponding pte. While this is not necessary for the
47605441 * !shared pmd case because we can allocate the pmd later as well, it makes the
4761
- * code much cleaner. pmd allocation is essential for the shared case because
4762
- * pud has to be populated inside the same i_mmap_rwsem section - otherwise
4763
- * racing tasks could either miss the sharing (see huge_pte_offset) or select a
4764
- * bad pmd for sharing.
5442
+ * code much cleaner.
5443
+ *
5444
+ * This routine must be called with i_mmap_rwsem held in at least read mode if
5445
+ * sharing is possible. For hugetlbfs, this prevents removal of any page
5446
+ * table entries associated with the address space. This is important as we
5447
+ * are setting up sharing based on existing page table entries (mappings).
5448
+ *
5449
+ * NOTE: This routine is only called from huge_pte_alloc. Some callers of
5450
+ * huge_pte_alloc know that sharing is not possible and do not take
5451
+ * i_mmap_rwsem as a performance optimization. This is handled by the
5452
+ * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is
5453
+ * only required for subsequent processing.
47655454 */
4766
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
5455
+pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
5456
+ unsigned long addr, pud_t *pud)
47675457 {
4768
- struct vm_area_struct *vma = find_vma(mm, addr);
47695458 struct address_space *mapping = vma->vm_file->f_mapping;
47705459 pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
47715460 vma->vm_pgoff;
....@@ -4775,10 +5464,7 @@
47755464 pte_t *pte;
47765465 spinlock_t *ptl;
47775466
4778
- if (!vma_shareable(vma, addr))
4779
- return (pte_t *)pmd_alloc(mm, pud, addr);
4780
-
4781
- i_mmap_lock_write(mapping);
5467
+ i_mmap_assert_locked(mapping);
47825468 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
47835469 if (svma == vma)
47845470 continue;
....@@ -4808,7 +5494,6 @@
48085494 spin_unlock(ptl);
48095495 out:
48105496 pte = (pte_t *)pmd_alloc(mm, pud, addr);
4811
- i_mmap_unlock_write(mapping);
48125497 return pte;
48135498 }
48145499
....@@ -4819,17 +5504,19 @@
48195504 * indicated by page_count > 1, unmap is achieved by clearing pud and
48205505 * decrementing the ref count. If count == 1, the pte page is not shared.
48215506 *
4822
- * called with page table lock held.
5507
+ * Called with page table lock held and i_mmap_rwsem held in write mode.
48235508 *
48245509 * returns: 1 successfully unmapped a shared pte page
48255510 * 0 the underlying pte page is not shared, or it is the last user
48265511 */
4827
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
5512
+int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
5513
+ unsigned long *addr, pte_t *ptep)
48285514 {
48295515 pgd_t *pgd = pgd_offset(mm, *addr);
48305516 p4d_t *p4d = p4d_offset(pgd, *addr);
48315517 pud_t *pud = pud_offset(p4d, *addr);
48325518
5519
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
48335520 BUG_ON(page_count(virt_to_page(ptep)) == 0);
48345521 if (page_count(virt_to_page(ptep)) == 1)
48355522 return 0;
....@@ -4837,17 +5524,26 @@
48375524 pud_clear(pud);
48385525 put_page(virt_to_page(ptep));
48395526 mm_dec_nr_pmds(mm);
4840
- *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
5527
+ /*
5528
+ * This update of passed address optimizes loops sequentially
5529
+ * processing addresses in increments of huge page size (PMD_SIZE
5530
+ * in this case). By clearing the pud, a PUD_SIZE area is unmapped.
5531
+ * Update address to the 'last page' in the cleared area so that
5532
+ * calling loop can move to first page past this area.
5533
+ */
5534
+ *addr |= PUD_SIZE - PMD_SIZE;
48415535 return 1;
48425536 }
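
A worked example of the address bump in huge_pmd_unshare() above, assuming 2 MiB PMDs inside a 1 GiB PUD (x86_64-style sizes); the starting address is arbitrary.

#include <stdio.h>

#define PMD_SIZE (1UL << 21)
#define PUD_SIZE (1UL << 30)

int main(void)
{
	unsigned long addr = 0x40000000UL;	/* some address in the cleared PUD range */

	addr |= PUD_SIZE - PMD_SIZE;		/* 'last page' of the PUD_SIZE area */
	addr += PMD_SIZE;			/* the caller's loop increment */
	printf("next address: %#lx\n", addr);	/* 0x80000000, first page past the area */
	return 0;
}
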
4843
-#define want_pmd_share() (1)
5537
+
48445538 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
4845
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
5539
+pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
5540
+ unsigned long addr, pud_t *pud)
48465541 {
48475542 return NULL;
48485543 }
48495544
4850
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
5545
+int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
5546
+ unsigned long *addr, pte_t *ptep)
48515547 {
48525548 return 0;
48535549 }
....@@ -4856,11 +5552,15 @@
48565552 unsigned long *start, unsigned long *end)
48575553 {
48585554 }
4859
-#define want_pmd_share() (0)
5555
+
5556
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
5557
+{
5558
+ return false;
5559
+}
48605560 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
48615561
48625562 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
4863
-pte_t *huge_pte_alloc(struct mm_struct *mm,
5563
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
48645564 unsigned long addr, unsigned long sz)
48655565 {
48665566 pgd_t *pgd;
....@@ -4878,8 +5578,8 @@
48785578 pte = (pte_t *)pud;
48795579 } else {
48805580 BUG_ON(sz != PMD_SIZE);
4881
- if (want_pmd_share() && pud_none(*pud))
4882
- pte = huge_pmd_share(mm, addr, pud);
5581
+ if (want_pmd_share(vma, addr) && pud_none(*pud))
5582
+ pte = huge_pmd_share(mm, vma, addr, pud);
48835583 else
48845584 pte = (pte_t *)pmd_alloc(mm, pud, addr);
48855585 }
....@@ -4893,8 +5593,8 @@
48935593 * huge_pte_offset() - Walk the page table to resolve the hugepage
48945594 * entry at address @addr
48955595 *
4896
- * Return: Pointer to page table or swap entry (PUD or PMD) for
4897
- * address @addr, or NULL if a p*d_none() entry is encountered and the
5596
+ * Return: Pointer to page table entry (PUD or PMD) for
5597
+ * address @addr, or NULL if a !p*d_present() entry is encountered and the
48985598 * size @sz doesn't match the hugepage size at this level of the page
48995599 * table.
49005600 */
....@@ -4903,8 +5603,8 @@
49035603 {
49045604 pgd_t *pgd;
49055605 p4d_t *p4d;
4906
- pud_t *pud, pud_entry;
4907
- pmd_t *pmd, pmd_entry;
5606
+ pud_t *pud;
5607
+ pmd_t *pmd;
49085608
49095609 pgd = pgd_offset(mm, addr);
49105610 if (!pgd_present(*pgd))
....@@ -4914,22 +5614,16 @@
49145614 return NULL;
49155615
49165616 pud = pud_offset(p4d, addr);
4917
- pud_entry = READ_ONCE(*pud);
4918
- if (sz != PUD_SIZE && pud_none(pud_entry))
4919
- return NULL;
4920
- /* hugepage or swap? */
4921
- if (pud_huge(pud_entry) || !pud_present(pud_entry))
5617
+ if (sz == PUD_SIZE)
5618
+ /* must be pud huge, non-present or none */
49225619 return (pte_t *)pud;
5620
+ if (!pud_present(*pud))
5621
+ return NULL;
5622
+ /* must have a valid entry and size to go further */
49235623
49245624 pmd = pmd_offset(pud, addr);
4925
- pmd_entry = READ_ONCE(*pmd);
4926
- if (sz != PMD_SIZE && pmd_none(pmd_entry))
4927
- return NULL;
4928
- /* hugepage or swap? */
4929
- if (pmd_huge(pmd_entry) || !pmd_present(pmd_entry))
4930
- return (pte_t *)pmd;
4931
-
4932
- return NULL;
5625
+ /* must be pmd huge, non-present or none */
5626
+ return (pte_t *)pmd;
49335627 }
49345628
49355629 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
....@@ -4954,30 +5648,45 @@
49545648 }
49555649
49565650 struct page * __weak
4957
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
4958
- pmd_t *pmd, int flags)
5651
+follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags)
49595652 {
5653
+ struct hstate *h = hstate_vma(vma);
5654
+ struct mm_struct *mm = vma->vm_mm;
49605655 struct page *page = NULL;
49615656 spinlock_t *ptl;
4962
- pte_t pte;
5657
+ pte_t *ptep, pte;
5658
+
5659
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
5660
+ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
5661
+ (FOLL_PIN | FOLL_GET)))
5662
+ return NULL;
5663
+
49635664 retry:
4964
- ptl = pmd_lockptr(mm, pmd);
4965
- spin_lock(ptl);
4966
- /*
4967
- * make sure that the address range covered by this pmd is not
4968
- * unmapped from other threads.
4969
- */
4970
- if (!pmd_huge(*pmd))
4971
- goto out;
4972
- pte = huge_ptep_get((pte_t *)pmd);
5665
+ ptep = huge_pte_offset(mm, address, huge_page_size(h));
5666
+ if (!ptep)
5667
+ return NULL;
5668
+
5669
+ ptl = huge_pte_lock(h, mm, ptep);
5670
+ pte = huge_ptep_get(ptep);
49735671 if (pte_present(pte)) {
4974
- page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
4975
- if (flags & FOLL_GET)
4976
- get_page(page);
5672
+ page = pte_page(pte) +
5673
+ ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
5674
+ /*
5675
+ * try_grab_page() should always succeed here, because: a) we
5676
+ * hold the pmd (ptl) lock, and b) we've just checked that the
5677
+ * huge pmd (head) page is present in the page tables. The ptl
5678
+ * prevents the head page and tail pages from being rearranged
5679
+ * in any way. So this page must be available at this point,
5680
+ * unless the page refcount overflowed:
5681
+ */
5682
+ if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
5683
+ page = NULL;
5684
+ goto out;
5685
+ }
49775686 } else {
49785687 if (is_hugetlb_entry_migration(pte)) {
49795688 spin_unlock(ptl);
4980
- __migration_entry_wait(mm, (pte_t *)pmd, ptl);
5689
+ __migration_entry_wait(mm, ptep, ptl);
49815690 goto retry;
49825691 }
49835692 /*
....@@ -4994,7 +5703,7 @@
49945703 follow_huge_pud(struct mm_struct *mm, unsigned long address,
49955704 pud_t *pud, int flags)
49965705 {
4997
- if (flags & FOLL_GET)
5706
+ if (flags & (FOLL_GET | FOLL_PIN))
49985707 return NULL;
49995708
50005709 return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
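/*
 * Worked example (editorial note): assuming 4 KiB base pages and a
 * 2 MiB PMD-level huge page, huge_page_mask(h) == ~(2 MiB - 1), so an
 * address 0x123000 bytes into the huge page yields
 * (address & ~huge_page_mask(h)) >> PAGE_SHIFT == 0x123, i.e. the
 * 0x123rd tail page of the compound page returned by pte_page().
 * follow_huge_pud() above computes the same offset with ~PUD_MASK for
 * PUD-sized (e.g. 1 GiB) pages.  The FOLL_PIN | FOLL_GET check in
 * follow_huge_pmd_pte() rejects callers asking for both reference
 * modes at once, mirroring the rule that FOLL_GET and FOLL_PIN are
 * mutually exclusive.
 */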
....@@ -5003,20 +5712,20 @@
50035712 struct page * __weak
50045713 follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
50055714 {
5006
- if (flags & FOLL_GET)
5715
+ if (flags & (FOLL_GET | FOLL_PIN))
50075716 return NULL;
50085717
50095718 return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
50105719 }
50115720
5012
-bool isolate_huge_page(struct page *page, struct list_head *list)
5721
+int isolate_hugetlb(struct page *page, struct list_head *list)
50135722 {
5014
- bool ret = true;
5723
+ int ret = 0;
50155724
50165725 spin_lock(&hugetlb_lock);
50175726 if (!PageHeadHuge(page) || !page_huge_active(page) ||
50185727 !get_page_unless_zero(page)) {
5019
- ret = false;
5728
+ ret = -EBUSY;
50205729 goto unlock;
50215730 }
50225731 clear_page_huge_active(page);
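/*
 * Editorial sketch, not part of the patch: with the rename from
 * isolate_huge_page() to isolate_hugetlb() and the switch from a bool
 * to an errno-style int return, call sites change roughly as below.
 * example_isolate_for_migration() is a hypothetical caller used purely
 * for illustration.
 */
static int example_isolate_for_migration(struct page *page,
					 struct list_head *pagelist)
{
	int err;

	/* old pattern:
	 *	if (!isolate_huge_page(page, pagelist))
	 *		return -EBUSY;
	 */
	err = isolate_hugetlb(page, pagelist);	/* 0 on success, -EBUSY otherwise */
	if (err)
		return err;

	/* the page is now off the hugetlb active list and queued on @pagelist */
	return 0;
}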
....@@ -5068,3 +5777,132 @@
50685777 spin_unlock(&hugetlb_lock);
50695778 }
50705779 }
5780
+
5781
+static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
5782
+ unsigned long start,
5783
+ unsigned long end)
5784
+{
5785
+ struct hstate *h = hstate_vma(vma);
5786
+ unsigned long sz = huge_page_size(h);
5787
+ struct mm_struct *mm = vma->vm_mm;
5788
+ struct mmu_notifier_range range;
5789
+ unsigned long address;
5790
+ spinlock_t *ptl;
5791
+ pte_t *ptep;
5792
+
5793
+ if (!(vma->vm_flags & VM_MAYSHARE))
5794
+ return;
5795
+
5796
+ if (start >= end)
5797
+ return;
5798
+
5799
+ flush_cache_range(vma, start, end);
5800
+ /*
5801
+ * No need to call adjust_range_if_pmd_sharing_possible(), because
5802
+ * we have already done the PUD_SIZE alignment.
5803
+ */
5804
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
5805
+ start, end);
5806
+ mmu_notifier_invalidate_range_start(&range);
5807
+ i_mmap_lock_write(vma->vm_file->f_mapping);
5808
+ for (address = start; address < end; address += PUD_SIZE) {
5809
+ unsigned long tmp = address;
5810
+
5811
+ ptep = huge_pte_offset(mm, address, sz);
5812
+ if (!ptep)
5813
+ continue;
5814
+ ptl = huge_pte_lock(h, mm, ptep);
5815
+ /* We don't want 'address' to be changed */
5816
+ huge_pmd_unshare(mm, vma, &tmp, ptep);
5817
+ spin_unlock(ptl);
5818
+ }
5819
+ flush_hugetlb_tlb_range(vma, start, end);
5820
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
5821
+ /*
5822
+ * No need to call mmu_notifier_invalidate_range(), see
5823
+ * Documentation/vm/mmu_notifier.rst.
5824
+ */
5825
+ mmu_notifier_invalidate_range_end(&range);
5826
+}
5827
+
5828
+/*
5829
+ * This function will unconditionally remove all the shared pmd pgtable entries
5830
+ * within the specific vma for a hugetlbfs memory range.
5831
+ */
5832
+void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
5833
+{
5834
+ hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
5835
+ ALIGN_DOWN(vma->vm_end, PUD_SIZE));
5836
+}
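/*
 * Worked example (editorial note, assuming PUD_SIZE == 1 GiB as on
 * x86-64): for a VMA spanning [0x6000_0000, 0x1_5000_0000),
 * ALIGN(vm_start, PUD_SIZE) == 0x8000_0000 and
 * ALIGN_DOWN(vm_end, PUD_SIZE) == 0x1_4000_0000, so only the three
 * fully covered 1 GiB regions are walked; PMD sharing never spans a
 * partially covered PUD entry.  If the VMA covers no full PUD range,
 * the rounded start reaches or passes the rounded end and
 * hugetlb_unshare_pmds() returns early via its start >= end check.
 */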
5837
+
5838
+#ifdef CONFIG_CMA
5839
+static bool cma_reserve_called __initdata;
5840
+
5841
+static int __init cmdline_parse_hugetlb_cma(char *p)
5842
+{
5843
+ hugetlb_cma_size = memparse(p, &p);
5844
+ return 0;
5845
+}
5846
+
5847
+early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
5848
+
5849
+void __init hugetlb_cma_reserve(int order)
5850
+{
5851
+ unsigned long size, reserved, per_node;
5852
+ int nid;
5853
+
5854
+ cma_reserve_called = true;
5855
+
5856
+ if (!hugetlb_cma_size)
5857
+ return;
5858
+
5859
+ if (hugetlb_cma_size < (PAGE_SIZE << order)) {
5860
+ pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
5861
+ (PAGE_SIZE << order) / SZ_1M);
5862
+ return;
5863
+ }
5864
+
5865
+ /*
5866
+ * If 3 GB area is requested on a machine with 4 numa nodes,
5867
+ * let's allocate 1 GB on first three nodes and ignore the last one.
5868
+ */
5869
+ per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
5870
+ pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
5871
+ hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
5872
+
5873
+ reserved = 0;
5874
+ for_each_node_state(nid, N_ONLINE) {
5875
+ int res;
5876
+ char name[CMA_MAX_NAME];
5877
+
5878
+ size = min(per_node, hugetlb_cma_size - reserved);
5879
+ size = round_up(size, PAGE_SIZE << order);
5880
+
5881
+ snprintf(name, sizeof(name), "hugetlb%d", nid);
5882
+ res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order,
5883
+ 0, false, name,
5884
+ &hugetlb_cma[nid], nid);
5885
+ if (res) {
5886
+ pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
5887
+ res, nid);
5888
+ continue;
5889
+ }
5890
+
5891
+ reserved += size;
5892
+ pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
5893
+ size / SZ_1M, nid);
5894
+
5895
+ if (reserved >= hugetlb_cma_size)
5896
+ break;
5897
+ }
5898
+}
5899
+
5900
+void __init hugetlb_cma_check(void)
5901
+{
5902
+ if (!hugetlb_cma_size || cma_reserve_called)
5903
+ return;
5904
+
5905
+ pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
5906
+}
5907
+
5908
+#endif /* CONFIG_CMA */
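/*
 * Usage note (editorial, not part of the patch): the reservation above
 * is driven by the "hugetlb_cma=" early parameter, e.g.
 *
 *	hugetlb_cma=3G
 *
 * With 1 GiB gigantic pages (PAGE_SIZE << order == 1 GiB) on a 4-node
 * machine, per_node = DIV_ROUND_UP(3G, 4) = 768 MiB, which round_up()
 * raises to 1 GiB per node, so nodes 0-2 each get a 1 GiB CMA area and
 * the loop stops once reserved >= 3 GiB, matching the example in the
 * comment inside hugetlb_cma_reserve().
 */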