2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/mm/util.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/string.h>
@@ -6,6 +7,7 @@
 #include <linux/err.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/task_stack.h>
 #include <linux/security.h>
 #include <linux/swap.h>
@@ -14,17 +16,21 @@
 #include <linux/hugetlb.h>
 #include <linux/vmalloc.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/elf.h>
+#include <linux/elf-randomize.h>
+#include <linux/personality.h>
+#include <linux/random.h>
+#include <linux/processor.h>
+#include <linux/sizes.h>
+#include <linux/compat.h>
 
-#include <asm/sections.h>
 #include <linux/uaccess.h>
 
 #include "internal.h"
-
-static inline int is_kernel_rodata(unsigned long addr)
-{
-        return addr >= (unsigned long)__start_rodata &&
-                addr < (unsigned long)__end_rodata;
-}
+#ifndef __GENKSYMS__
+#include <trace/hooks/syscall_check.h>
+#include <trace/hooks/mm.h>
+#endif
 
 /**
  * kfree_const - conditionally free memory
@@ -43,6 +49,8 @@
  * kstrdup - allocate space for and copy an existing string
  * @s: the string to duplicate
  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Return: newly allocated copy of @s or %NULL in case of error
  */
 char *kstrdup(const char *s, gfp_t gfp)
 {
@@ -65,9 +73,11 @@
  * @s: the string to duplicate
  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
  *
- * Function returns source string if it is in .rodata section otherwise it
- * fallbacks to kstrdup.
- * Strings allocated by kstrdup_const should be freed by kfree_const.
+ * Note: Strings allocated by kstrdup_const should be freed by kfree_const and
+ * must not be passed to krealloc().
+ *
+ * Return: source string if it is in .rodata section otherwise
+ * fallback to kstrdup.
  */
 const char *kstrdup_const(const char *s, gfp_t gfp)
 {
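
The hunk above tightens the kstrdup_const() kernel-doc: strings it hands out must be released with kfree_const() and never passed to krealloc(). As a hedged illustration of that pairing (the demo_attr struct and function names are invented for this note, they are not part of the patch):

    #include <linux/errno.h>
    #include <linux/slab.h>
    #include <linux/string.h>

    struct demo_attr {                      /* hypothetical container */
            const char *name;               /* may point into .rodata */
    };

    static int demo_attr_set_name(struct demo_attr *attr, const char *name)
    {
            const char *copy = kstrdup_const(name, GFP_KERNEL);

            if (!copy)
                    return -ENOMEM;
            kfree_const(attr->name);        /* not kfree()/krealloc() */
            attr->name = copy;
            return 0;
    }
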
@@ -85,6 +95,8 @@
  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
  *
  * Note: Use kmemdup_nul() instead if the size is known exactly.
+ *
+ * Return: newly allocated copy of @s or %NULL in case of error
  */
 char *kstrndup(const char *s, size_t max, gfp_t gfp)
 {
@@ -110,6 +122,8 @@
  * @src: memory region to duplicate
  * @len: memory region length
  * @gfp: GFP mask to use
+ *
+ * Return: newly allocated copy of @src or %NULL in case of error
  */
 void *kmemdup(const void *src, size_t len, gfp_t gfp)
 {
@@ -127,6 +141,9 @@
  * @s: The data to stringify
  * @len: The size of the data
  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Return: newly allocated copy of @s with NUL-termination or %NULL in
+ * case of error
  */
 char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
 {
@@ -150,14 +167,14 @@
  * @src: source address in user space
  * @len: number of bytes to copy
  *
- * Returns an ERR_PTR() on failure. Result is physically
+ * Return: an ERR_PTR() on failure. Result is physically
  * contiguous, to be freed by kfree().
  */
 void *memdup_user(const void __user *src, size_t len)
 {
        void *p;
 
-       p = kmalloc_track_caller(len, GFP_USER);
+       p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);
        if (!p)
                return ERR_PTR(-ENOMEM);
 
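
Because memdup_user() reports failure through ERR_PTR() rather than NULL, callers check the result with IS_ERR()/PTR_ERR(). A minimal sketch of that calling convention (demo_copy_blob and its parameters are invented, not part of this patch):

    #include <linux/err.h>
    #include <linux/slab.h>
    #include <linux/string.h>

    static long demo_copy_blob(const void __user *ubuf, size_t len)
    {
            void *kbuf = memdup_user(ubuf, len);

            if (IS_ERR(kbuf))
                    return PTR_ERR(kbuf);   /* e.g. -ENOMEM or -EFAULT */

            /* ... operate on the kernel copy ... */
            kfree(kbuf);
            return 0;
    }

With the __GFP_NOWARN change above, an oversized len no longer triggers an allocation-failure warning in the log; the caller simply sees -ENOMEM.
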
@@ -176,7 +193,7 @@
  * @src: source address in user space
  * @len: number of bytes to copy
  *
- * Returns an ERR_PTR() on failure. Result may be not
+ * Return: an ERR_PTR() on failure. Result may be not
  * physically contiguous. Use kvfree() to free.
  */
 void *vmemdup_user(const void __user *src, size_t len)
@@ -200,6 +217,8 @@
  * strndup_user - duplicate an existing string from user space
  * @s: The string to duplicate
  * @n: Maximum number of bytes to copy, including the trailing NUL.
+ *
+ * Return: newly allocated copy of @s or an ERR_PTR() in case of error
  */
 char *strndup_user(const char __user *s, long n)
 {
@@ -231,7 +250,7 @@
  * @src: source address in user space
  * @len: number of bytes to copy
  *
- * Returns an ERR_PTR() on failure.
+ * Return: an ERR_PTR() on failure.
  */
 void *memdup_user_nul(const void __user *src, size_t len)
 {
@@ -257,7 +276,7 @@
 EXPORT_SYMBOL(memdup_user_nul);
 
 void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
-               struct vm_area_struct *prev, struct rb_node *rb_parent)
+               struct vm_area_struct *prev)
 {
        struct vm_area_struct *next;
 
@@ -266,16 +285,26 @@
                next = prev->vm_next;
                prev->vm_next = vma;
        } else {
+               next = mm->mmap;
                mm->mmap = vma;
-               if (rb_parent)
-                       next = rb_entry(rb_parent,
-                                       struct vm_area_struct, vm_rb);
-               else
-                       next = NULL;
        }
        vma->vm_next = next;
        if (next)
                next->vm_prev = vma;
+}
+
+void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+       struct vm_area_struct *prev, *next;
+
+       next = vma->vm_next;
+       prev = vma->vm_prev;
+       if (prev)
+               prev->vm_next = next;
+       else
+               mm->mmap = next;
+       if (next)
+               next->vm_prev = prev;
 }
 
 /* Check if the vma is being used as a stack by this task */
@@ -286,7 +315,138 @@
        return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
 }
 
-#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
+#ifndef STACK_RND_MASK
+#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))     /* 8MB of VA */
+#endif
+
+unsigned long randomize_stack_top(unsigned long stack_top)
+{
+       unsigned long random_variable = 0;
+
+       if (current->flags & PF_RANDOMIZE) {
+               random_variable = get_random_long();
+               random_variable &= STACK_RND_MASK;
+               random_variable <<= PAGE_SHIFT;
+       }
+#ifdef CONFIG_STACK_GROWSUP
+       return PAGE_ALIGN(stack_top) + random_variable;
+#else
+       return PAGE_ALIGN(stack_top) - random_variable;
+#endif
+}
+
+/**
+ * randomize_page - Generate a random, page aligned address
+ * @start:     The smallest acceptable address the caller will take.
+ * @range:     The size of the area, starting at @start, within which the
+ *             random address must fall.
+ *
+ * If @start + @range would overflow, @range is capped.
+ *
+ * NOTE: Historical use of randomize_range, which this replaces, presumed that
+ * @start was already page aligned.  We now align it regardless.
+ *
+ * Return: A page aligned address within [start, start + range).  On error,
+ * @start is returned.
+ */
+unsigned long randomize_page(unsigned long start, unsigned long range)
+{
+       if (!PAGE_ALIGNED(start)) {
+               range -= PAGE_ALIGN(start) - start;
+               start = PAGE_ALIGN(start);
+       }
+
+       if (start > ULONG_MAX - range)
+               range = ULONG_MAX - start;
+
+       range >>= PAGE_SHIFT;
+
+       if (range == 0)
+               return start;
+
+       return start + (get_random_long() % range << PAGE_SHIFT);
+}
+
+#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+       /* Is the current task 32bit ? */
+       if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
+               return randomize_page(mm->brk, SZ_32M);
+
+       return randomize_page(mm->brk, SZ_1G);
+}
+
+unsigned long arch_mmap_rnd(void)
+{
+       unsigned long rnd;
+
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+       if (is_compat_task())
+               rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
+       else
+#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
+               rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
+
+       return rnd << PAGE_SHIFT;
+}
+EXPORT_SYMBOL_GPL(arch_mmap_rnd);
+
+static int mmap_is_legacy(struct rlimit *rlim_stack)
+{
+       if (current->personality & ADDR_COMPAT_LAYOUT)
+               return 1;
+
+       if (rlim_stack->rlim_cur == RLIM_INFINITY)
+               return 1;
+
+       return sysctl_legacy_va_layout;
+}
+
+/*
+ * Leave enough space between the mmap area and the stack to honour ulimit in
+ * the face of randomisation.
+ */
+#define MIN_GAP                (SZ_128M)
+#define MAX_GAP                (STACK_TOP / 6 * 5)
+
+static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
+{
+       unsigned long gap = rlim_stack->rlim_cur;
+       unsigned long pad = stack_guard_gap;
+
+       /* Account for stack randomization if necessary */
+       if (current->flags & PF_RANDOMIZE)
+               pad += (STACK_RND_MASK << PAGE_SHIFT);
+
+       /* Values close to RLIM_INFINITY can overflow. */
+       if (gap + pad > gap)
+               gap += pad;
+
+       if (gap < MIN_GAP)
+               gap = MIN_GAP;
+       else if (gap > MAX_GAP)
+               gap = MAX_GAP;
+
+       return PAGE_ALIGN(STACK_TOP - gap - rnd);
+}
+
+void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
+{
+       unsigned long random_factor = 0UL;
+
+       if (current->flags & PF_RANDOMIZE)
+               random_factor = arch_mmap_rnd();
+
+       if (mmap_is_legacy(rlim_stack)) {
+               mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
+               mm->get_unmapped_area = arch_get_unmapped_area;
+       } else {
+               mm->mmap_base = mmap_base(random_factor, rlim_stack);
+               mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+       }
+}
+#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
 void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
 {
        mm->mmap_base = TASK_UNMAPPED_BASE;
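
randomize_stack_top() keeps at most STACK_RND_MASK + 1 page-sized offsets of entropy, i.e. 0x7ff + 1 = 2048 pages (8 MB) on a 4 KB-page build. A hedged, userspace-style sketch of the same mask-and-shift arithmetic (plain C with made-up values; it only mirrors the logic, it is not code from this patch):

    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SHIFT      12                              /* assume 4 KB pages */
    #define STACK_RND_MASK  (0x7ff >> (PAGE_SHIFT - 12))    /* 8MB of VA */

    int main(void)
    {
            unsigned long stack_top = 0x7ffffffff000UL;     /* example, page aligned */
            unsigned long rnd = (unsigned long)rand() & STACK_RND_MASK;

            rnd <<= PAGE_SHIFT;                             /* pages -> bytes */
            /* the stack grows down on most architectures: subtract the offset */
            printf("randomized top: %#lx (offset %#lx)\n", stack_top - rnd, rnd);
            return 0;
    }
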
@@ -294,52 +454,79 @@
 }
 #endif
 
-/*
- * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
- * back to the regular GUP.
- * Note a difference with get_user_pages_fast: this always returns the
- * number of pages pinned, 0 if no pages were pinned.
- * If the architecture does not support this function, simply return with no
- * pages pinned.
+/**
+ * __account_locked_vm - account locked pages to an mm's locked_vm
+ * @mm:          mm to account against
+ * @pages:       number of pages to account
+ * @inc:         %true if @pages should be considered positive, %false if not
+ * @task:        task used to check RLIMIT_MEMLOCK
+ * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
+ *
+ * Assumes @task and @mm are valid (i.e. at least one reference on each), and
+ * that mmap_lock is held as writer.
+ *
+ * Return:
+ * * 0       on success
+ * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
  */
-int __weak __get_user_pages_fast(unsigned long start,
-                                int nr_pages, int write, struct page **pages)
+int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
+                       struct task_struct *task, bool bypass_rlim)
 {
-       return 0;
+       unsigned long locked_vm, limit;
+       int ret = 0;
+
+       mmap_assert_write_locked(mm);
+
+       locked_vm = mm->locked_vm;
+       if (inc) {
+               if (!bypass_rlim) {
+                       limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+                       if (locked_vm + pages > limit)
+                               ret = -ENOMEM;
+               }
+               if (!ret)
+                       mm->locked_vm = locked_vm + pages;
+       } else {
+               WARN_ON_ONCE(pages > locked_vm);
+               mm->locked_vm = locked_vm - pages;
+       }
+
+       pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
+                (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
+                locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
+                ret ? " - exceeded" : "");
+
+       return ret;
 }
-EXPORT_SYMBOL_GPL(__get_user_pages_fast);
+EXPORT_SYMBOL_GPL(__account_locked_vm);
 
 /**
- * get_user_pages_fast() - pin user pages in memory
- * @start:     starting user address
- * @nr_pages:  number of pages from start to pin
- * @write:     whether pages will be written to
- * @pages:     array that receives pointers to the pages pinned.
- *             Should be at least nr_pages long.
+ * account_locked_vm - account locked pages to an mm's locked_vm
+ * @mm:          mm to account against, may be NULL
+ * @pages:       number of pages to account
+ * @inc:         %true if @pages should be considered positive, %false if not
  *
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno.
+ * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
  *
- * get_user_pages_fast provides equivalent functionality to get_user_pages,
- * operating on current and current->mm, with force=0 and vma=NULL. However
- * unlike get_user_pages, it must be called without mmap_sem held.
- *
- * get_user_pages_fast may take mmap_sem and page table locks, so no
- * assumptions can be made about lack of locking. get_user_pages_fast is to be
- * implemented in a way that is advantageous (vs get_user_pages()) when the
- * user memory area is already faulted in and present in ptes. However if the
- * pages have to be faulted in, it may turn out to be slightly slower so
- * callers need to carefully consider what to use. On many architectures,
- * get_user_pages_fast simply falls back to get_user_pages.
+ * Return:
+ * * 0       on success, or if mm is NULL
+ * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
  */
-int __weak get_user_pages_fast(unsigned long start,
-                               int nr_pages, int write, struct page **pages)
+int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
 {
-       return get_user_pages_unlocked(start, nr_pages, pages,
-                                      write ? FOLL_WRITE : 0);
+       int ret;
+
+       if (pages == 0 || !mm)
+               return 0;
+
+       mmap_write_lock(mm);
+       ret = __account_locked_vm(mm, pages, inc, current,
+                                 capable(CAP_IPC_LOCK));
+       mmap_write_unlock(mm);
+
+       return ret;
 }
-EXPORT_SYMBOL_GPL(get_user_pages_fast);
+EXPORT_SYMBOL_GPL(account_locked_vm);
 
 unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
        unsigned long len, unsigned long prot,
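
account_locked_vm() is what a pinning driver calls before pinning user pages and again (with inc == false) on teardown; __account_locked_vm() is for callers that already hold mmap_lock for writing, as the kernel-doc above states. A hedged caller sketch (the demo_* helpers are invented and the actual pinning step is left out):

    #include <linux/mm.h>
    #include <linux/sched/mm.h>

    static int demo_charge(struct mm_struct *mm, unsigned long npages)
    {
            int ret = account_locked_vm(mm, npages, true);

            if (ret)
                    return ret;     /* -ENOMEM: RLIMIT_MEMLOCK would be exceeded */

            /* ... pin the pages here ... */
            return 0;
    }

    static void demo_uncharge(struct mm_struct *mm, unsigned long npages)
    {
            account_locked_vm(mm, npages, false);
    }
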
@@ -352,15 +539,16 @@
 
        ret = security_mmap_file(file, prot, flag);
        if (!ret) {
-               if (down_write_killable(&mm->mmap_sem))
+               if (mmap_write_lock_killable(mm))
                        return -EINTR;
-               ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
-                                   &populate, &uf);
-               up_write(&mm->mmap_sem);
+               ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate,
+                             &uf);
+               mmap_write_unlock(mm);
                userfaultfd_unmap_complete(mm, &uf);
                if (populate)
                        mm_populate(ret, populate);
        }
+       trace_android_vh_check_mmap_file(file, prot, flag, ret);
        return ret;
 }
 
@@ -393,11 +581,14 @@
  *
  * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not
  * fall back to vmalloc.
+ *
+ * Return: pointer to the allocated memory of %NULL in case of failure
  */
 void *kvmalloc_node(size_t size, gfp_t flags, int node)
 {
        gfp_t kmalloc_flags = flags;
        void *ret;
+       bool use_vmalloc = false;
 
        /*
         * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables)
@@ -405,6 +596,10 @@
         */
        if ((flags & GFP_KERNEL) != GFP_KERNEL)
                return kmalloc_node(size, flags, node);
+
+       trace_android_vh_kvmalloc_node_use_vmalloc(size, &kmalloc_flags, &use_vmalloc);
+       if (use_vmalloc)
+               goto use_vmalloc_node;
 
        /*
         * We want to attempt a large physically contiguous block first because
@@ -429,7 +624,14 @@
        if (ret || size <= PAGE_SIZE)
                return ret;
 
-       return __vmalloc_node_flags_caller(size, node, flags,
+       /* Don't even allow crazy sizes */
+       if (unlikely(size > INT_MAX)) {
+               WARN_ON_ONCE(!(flags & __GFP_NOWARN));
+               return NULL;
+       }
+
+use_vmalloc_node:
+       return __vmalloc_node(size, 1, flags, node,
                        __builtin_return_address(0));
 }
 EXPORT_SYMBOL(kvmalloc_node);
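
The kvmalloc_node() changes above keep one property intact: whether the allocation was satisfied by the slab path or fell back to vmalloc, the caller frees it with kvfree() and never needs to know which path was taken. A hedged usage sketch (the demo_* names and the u32 table are arbitrary):

    #include <linux/mm.h>
    #include <linux/slab.h>

    static u32 *demo_alloc_table(size_t nr_entries)
    {
            /* physically contiguous if possible, vmalloc-backed otherwise */
            return kvmalloc_array(nr_entries, sizeof(u32),
                                  GFP_KERNEL | __GFP_ZERO);
    }

    static void demo_free_table(u32 *table)
    {
            kvfree(table);          /* works for either backing store */
    }
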
@@ -442,7 +644,7 @@
  * It is slightly more efficient to use kfree() or vfree() if you are certain
  * that you know which one to use.
  *
- * Context: Any context except NMI.
+ * Context: Either preemptible task context or not-NMI interrupt.
  */
 void kvfree(const void *addr)
 {
@@ -470,6 +672,21 @@
        }
 }
 EXPORT_SYMBOL(kvfree_sensitive);
+
+void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
+{
+       void *newp;
+
+       if (oldsize >= newsize)
+               return (void *)p;
+       newp = kvmalloc(newsize, flags);
+       if (!newp)
+               return NULL;
+       memcpy(newp, p, oldsize);
+       kvfree(p);
+       return newp;
+}
+EXPORT_SYMBOL(kvrealloc);
 
 static inline void *__page_rmapping(struct page *page)
 {
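
As added here, kvrealloc() copies the old contents into a fresh kvmalloc() buffer and frees the old one only on success; on failure it returns NULL and leaves the original allocation untouched, so a caller must not overwrite its only pointer with the return value. A hedged sketch of that pattern (demo_grow is invented):

    #include <linux/errno.h>
    #include <linux/mm.h>

    static int demo_grow(void **bufp, size_t oldsize, size_t newsize)
    {
            void *newbuf = kvrealloc(*bufp, oldsize, newsize, GFP_KERNEL);

            if (!newbuf)
                    return -ENOMEM; /* *bufp is still valid and still owned */

            *bufp = newbuf;         /* old buffer was freed by kvrealloc() */
            return 0;
    }
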
@@ -503,7 +720,7 @@
                return true;
        if (PageHuge(page))
                return false;
-       for (i = 0; i < (1 << compound_order(page)); i++) {
+       for (i = 0; i < compound_nr(page); i++) {
                if (atomic_read(&page[i]._mapcount) >= 0)
                        return true;
        }
@@ -584,9 +801,8 @@
 unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
 unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
 
-int overcommit_ratio_handler(struct ctl_table *table, int write,
-                            void __user *buffer, size_t *lenp,
-                            loff_t *ppos)
+int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer,
+               size_t *lenp, loff_t *ppos)
 {
        int ret;
 
@@ -596,9 +812,49 @@
        return ret;
 }
 
-int overcommit_kbytes_handler(struct ctl_table *table, int write,
-                             void __user *buffer, size_t *lenp,
-                             loff_t *ppos)
+static void sync_overcommit_as(struct work_struct *dummy)
+{
+       percpu_counter_sync(&vm_committed_as);
+}
+
+int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
+               size_t *lenp, loff_t *ppos)
+{
+       struct ctl_table t;
+       int new_policy = -1;
+       int ret;
+
+       /*
+        * The deviation of sync_overcommit_as could be big with loose policy
+        * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
+        * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
+        * with the strict "NEVER", and to avoid possible race condtion (even
+        * though user usually won't too frequently do the switching to policy
+        * OVERCOMMIT_NEVER), the switch is done in the following order:
+        *      1. changing the batch
+        *      2. sync percpu count on each CPU
+        *      3. switch the policy
+        */
+       if (write) {
+               t = *table;
+               t.data = &new_policy;
+               ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+               if (ret || new_policy == -1)
+                       return ret;
+
+               mm_compute_batch(new_policy);
+               if (new_policy == OVERCOMMIT_NEVER)
+                       schedule_on_each_cpu(sync_overcommit_as);
+               sysctl_overcommit_memory = new_policy;
+       } else {
+               ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+       }
+
+       return ret;
+}
+
+int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer,
+               size_t *lenp, loff_t *ppos)
 {
        int ret;
 
@@ -618,7 +874,7 @@
        if (sysctl_overcommit_kbytes)
                allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
        else
-               allowed = ((totalram_pages - hugetlb_total_pages())
+               allowed = ((totalram_pages() - hugetlb_total_pages())
                           * sysctl_overcommit_ratio / 100);
        allowed += total_swap_pages;
 
@@ -638,10 +894,15 @@
  * balancing memory across competing virtual machines that are hosted.
  * Several metrics drive this policy engine including the guest reported
  * memory commitment.
+ *
+ * The time cost of this is very low for small platforms, and for big
+ * platform like a 2S/36C/72T Skylake server, in worst case where
+ * vm_committed_as's spinlock is under severe contention, the time cost
+ * could be about 30~40 microseconds.
  */
 unsigned long vm_memory_committed(void)
 {
-       return percpu_counter_read_positive(&vm_committed_as);
+       return percpu_counter_sum_positive(&vm_committed_as);
 }
 EXPORT_SYMBOL_GPL(vm_memory_committed);
 
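
The switch from percpu_counter_read_positive() to percpu_counter_sum_positive() trades speed for precision: the former returns the cached global count, which can lag by roughly the batch size times the number of CPUs, while the latter folds in every CPU's local delta, which is the cost the new kernel-doc paragraph above puts at about 30~40 microseconds under heavy contention. A hedged sketch of the difference on an arbitrary, already-initialised counter (demo_report is invented):

    #include <linux/percpu_counter.h>
    #include <linux/printk.h>

    static void demo_report(struct percpu_counter *fbc)
    {
            s64 approx = percpu_counter_read_positive(fbc); /* cheap, may be stale */
            s64 exact = percpu_counter_sum_positive(fbc);   /* folds per-CPU deltas */

            pr_info("approx=%lld exact=%lld\n", approx, exact);
    }
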
@@ -663,11 +924,7 @@
  */
 int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 {
-       long free, allowed, reserve;
-
-       VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
-                       -(s64)vm_committed_as_batch * num_online_cpus(),
-                       "memory commitment underflow");
+       long allowed;
 
        vm_acct_memory(pages);
 
@@ -678,51 +935,9 @@
                return 0;
 
        if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
-               free = global_zone_page_state(NR_FREE_PAGES);
-               free += global_node_page_state(NR_FILE_PAGES);
-
-               /*
-                * shmem pages shouldn't be counted as free in this
-                * case, they can't be purged, only swapped out, and
-                * that won't affect the overall amount of available
-                * memory in the system.
-                */
-               free -= global_node_page_state(NR_SHMEM);
-
-               free += get_nr_swap_pages();
-
-               /*
-                * Any slabs which are created with the
-                * SLAB_RECLAIM_ACCOUNT flag claim to have contents
-                * which are reclaimable, under pressure. The dentry
-                * cache and most inode caches should fall into this
-                */
-               free += global_node_page_state(NR_SLAB_RECLAIMABLE);
-
-               /*
-                * Part of the kernel memory, which can be released
-                * under memory pressure.
-                */
-               free += global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
-
-               /*
-                * Leave reserved pages. The pages are not for anonymous pages.
-                */
-               if (free <= totalreserve_pages)
+               if (pages > totalram_pages() + total_swap_pages)
                        goto error;
-               else
-                       free -= totalreserve_pages;
-
-               /*
-                * Reserve some for root
-                */
-               if (!cap_sys_admin)
-                       free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-
-               if (free > pages)
-                       return 0;
-
-               goto error;
+               return 0;
        }
 
        allowed = vm_commit_limit();
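
With the simplified OVERCOMMIT_GUESS heuristic above, a single request is refused only when it exceeds the sum of all RAM and all swap, rather than being checked against an estimate of currently free and reclaimable memory. A hedged back-of-the-envelope check in plain C (the page counts are invented example figures, not kernel values):

    #include <stdbool.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned long totalram = 4UL << 20;     /* 4M pages = 16 GiB at 4 KiB/page */
            unsigned long totalswap = 2UL << 20;    /* 2M pages =  8 GiB */
            unsigned long request = 7UL << 20;      /* 28 GiB, expressed in pages */
            bool denied = request > totalram + totalswap;

            printf("request is %s\n", denied ? "denied" : "allowed");
            return 0;
    }
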
@@ -736,7 +951,8 @@
         * Don't let a single process grow so big a user can't recover
         */
        if (mm) {
-               reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+               long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+
                allowed -= min_t(long, mm->total_vm / 32, reserve);
        }
 
@@ -754,7 +970,8 @@
  * @buffer: the buffer to copy to.
  * @buflen: the length of the buffer. Larger cmdline values are truncated
  * to this length.
- * Returns the size of the cmdline field copied. Note that the copy does
+ *
+ * Return: the size of the cmdline field copied. Note that the copy does
  * not guarantee an ending NULL byte.
  */
 int get_cmdline(struct task_struct *task, char *buffer, int buflen)
@@ -768,12 +985,12 @@
        if (!mm->arg_end)
                goto out_mm;    /* Shh! No looking before we're done */
 
-       down_read(&mm->mmap_sem);
+       spin_lock(&mm->arg_lock);
        arg_start = mm->arg_start;
        arg_end = mm->arg_end;
        env_start = mm->env_start;
        env_end = mm->env_end;
-       up_read(&mm->mmap_sem);
+       spin_unlock(&mm->arg_lock);
 
        len = arg_end - arg_start;
 
@@ -805,3 +1022,16 @@
 out:
        return res;
 }
+
+int __weak memcmp_pages(struct page *page1, struct page *page2)
+{
+       char *addr1, *addr2;
+       int ret;
+
+       addr1 = kmap_atomic(page1);
+       addr2 = kmap_atomic(page2);
+       ret = memcmp(addr1, addr2, PAGE_SIZE);
+       kunmap_atomic(addr2);
+       kunmap_atomic(addr1);
+       return ret;
+}
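
memcmp_pages() is defined __weak so an architecture with a cheaper or stricter page-comparison primitive can override this generic kmap_atomic()-based version; callers simply get memcmp() semantics over one page. A hedged caller sketch, mirroring the kind of identity check KSM performs (demo_pages_identical is invented; linux/mm.h provides a similar pages_identical() helper):

    #include <linux/mm.h>

    static bool demo_pages_identical(struct page *a, struct page *b)
    {
            return memcmp_pages(a, b) == 0; /* 0 means byte-for-byte equal */
    }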