2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/mm/util.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/string.h>
@@ -6,6 +7,7 @@
 #include <linux/err.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/task_stack.h>
 #include <linux/security.h>
 #include <linux/swap.h>
@@ -14,17 +16,21 @@
 #include <linux/hugetlb.h>
 #include <linux/vmalloc.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/elf.h>
+#include <linux/elf-randomize.h>
+#include <linux/personality.h>
+#include <linux/random.h>
+#include <linux/processor.h>
+#include <linux/sizes.h>
+#include <linux/compat.h>
 
-#include <asm/sections.h>
 #include <linux/uaccess.h>
 
 #include "internal.h"
-
-static inline int is_kernel_rodata(unsigned long addr)
-{
-        return addr >= (unsigned long)__start_rodata &&
-                addr < (unsigned long)__end_rodata;
-}
+#ifndef __GENKSYMS__
+#include <trace/hooks/syscall_check.h>
+#include <trace/hooks/mm.h>
+#endif
 
 /**
  * kfree_const - conditionally free memory
@@ -43,6 +49,8 @@
  * kstrdup - allocate space for and copy an existing string
  * @s: the string to duplicate
  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Return: newly allocated copy of @s or %NULL in case of error
  */
 char *kstrdup(const char *s, gfp_t gfp)
 {
@@ -65,9 +73,11 @@
  * @s: the string to duplicate
  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
  *
- * Function returns source string if it is in .rodata section otherwise it
- * fallbacks to kstrdup.
- * Strings allocated by kstrdup_const should be freed by kfree_const.
+ * Note: Strings allocated by kstrdup_const should be freed by kfree_const and
+ * must not be passed to krealloc().
+ *
+ * Return: source string if it is in .rodata section otherwise
+ * fallback to kstrdup.
  */
 const char *kstrdup_const(const char *s, gfp_t gfp)
 {
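
The hunk above tightens the kstrdup_const() kernel-doc: strings it hands out must be released with kfree_const() and never passed to krealloc(). As a hedged illustration of that pairing (the demo_attr struct and function names are invented for this note, they are not part of the patch):

    #include <linux/errno.h>
    #include <linux/slab.h>
    #include <linux/string.h>

    struct demo_attr {                      /* hypothetical container */
            const char *name;               /* may point into .rodata */
    };

    static int demo_attr_set_name(struct demo_attr *attr, const char *name)
    {
            const char *copy = kstrdup_const(name, GFP_KERNEL);

            if (!copy)
                    return -ENOMEM;
            kfree_const(attr->name);        /* not kfree()/krealloc() */
            attr->name = copy;
            return 0;
    }
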
@@ -85,6 +95,8 @@
  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
  *
  * Note: Use kmemdup_nul() instead if the size is known exactly.
+ *
+ * Return: newly allocated copy of @s or %NULL in case of error
  */
 char *kstrndup(const char *s, size_t max, gfp_t gfp)
 {
@@ -110,6 +122,8 @@
  * @src: memory region to duplicate
  * @len: memory region length
  * @gfp: GFP mask to use
+ *
+ * Return: newly allocated copy of @src or %NULL in case of error
  */
 void *kmemdup(const void *src, size_t len, gfp_t gfp)
 {
@@ -127,6 +141,9 @@
  * @s: The data to stringify
  * @len: The size of the data
  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Return: newly allocated copy of @s with NUL-termination or %NULL in
+ * case of error
  */
 char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
 {
@@ -150,14 +167,14 @@
  * @src: source address in user space
  * @len: number of bytes to copy
  *
- * Returns an ERR_PTR() on failure. Result is physically
+ * Return: an ERR_PTR() on failure. Result is physically
  * contiguous, to be freed by kfree().
  */
 void *memdup_user(const void __user *src, size_t len)
 {
        void *p;
 
-       p = kmalloc_track_caller(len, GFP_USER);
+       p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);
        if (!p)
                return ERR_PTR(-ENOMEM);
 
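
Because memdup_user() reports failure through ERR_PTR() rather than NULL, callers check the result with IS_ERR()/PTR_ERR(). A minimal sketch of that calling convention (demo_copy_blob and its parameters are invented, not part of this patch):

    #include <linux/err.h>
    #include <linux/slab.h>
    #include <linux/string.h>

    static long demo_copy_blob(const void __user *ubuf, size_t len)
    {
            void *kbuf = memdup_user(ubuf, len);

            if (IS_ERR(kbuf))
                    return PTR_ERR(kbuf);   /* e.g. -ENOMEM or -EFAULT */

            /* ... operate on the kernel copy ... */
            kfree(kbuf);
            return 0;
    }

With the __GFP_NOWARN change above, an oversized len no longer triggers an allocation-failure warning in the log; the caller simply sees -ENOMEM.
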
@@ -176,7 +193,7 @@
  * @src: source address in user space
  * @len: number of bytes to copy
  *
- * Returns an ERR_PTR() on failure. Result may be not
+ * Return: an ERR_PTR() on failure. Result may be not
  * physically contiguous. Use kvfree() to free.
  */
 void *vmemdup_user(const void __user *src, size_t len)
@@ -200,6 +217,8 @@
  * strndup_user - duplicate an existing string from user space
  * @s: The string to duplicate
  * @n: Maximum number of bytes to copy, including the trailing NUL.
+ *
+ * Return: newly allocated copy of @s or an ERR_PTR() in case of error
  */
 char *strndup_user(const char __user *s, long n)
 {
@@ -231,7 +250,7 @@
  * @src: source address in user space
  * @len: number of bytes to copy
  *
- * Returns an ERR_PTR() on failure.
+ * Return: an ERR_PTR() on failure.
  */
 void *memdup_user_nul(const void __user *src, size_t len)
 {
@@ -257,7 +276,7 @@
 EXPORT_SYMBOL(memdup_user_nul);
 
 void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
-               struct vm_area_struct *prev, struct rb_node *rb_parent)
+               struct vm_area_struct *prev)
 {
        struct vm_area_struct *next;
 
@@ -266,16 +285,26 @@
                next = prev->vm_next;
                prev->vm_next = vma;
        } else {
+               next = mm->mmap;
                mm->mmap = vma;
-               if (rb_parent)
-                       next = rb_entry(rb_parent,
-                                       struct vm_area_struct, vm_rb);
-               else
-                       next = NULL;
        }
        vma->vm_next = next;
        if (next)
                next->vm_prev = vma;
+}
+
+void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+       struct vm_area_struct *prev, *next;
+
+       next = vma->vm_next;
+       prev = vma->vm_prev;
+       if (prev)
+               prev->vm_next = next;
+       else
+               mm->mmap = next;
+       if (next)
+               next->vm_prev = prev;
 }
 
 /* Check if the vma is being used as a stack by this task */
@@ -286,7 +315,138 @@
        return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
 }
 
-#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
+#ifndef STACK_RND_MASK
+#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))     /* 8MB of VA */
+#endif
+
+unsigned long randomize_stack_top(unsigned long stack_top)
+{
+       unsigned long random_variable = 0;
+
+       if (current->flags & PF_RANDOMIZE) {
+               random_variable = get_random_long();
+               random_variable &= STACK_RND_MASK;
+               random_variable <<= PAGE_SHIFT;
+       }
+#ifdef CONFIG_STACK_GROWSUP
+       return PAGE_ALIGN(stack_top) + random_variable;
+#else
+       return PAGE_ALIGN(stack_top) - random_variable;
+#endif
+}
+
+/**
+ * randomize_page - Generate a random, page aligned address
+ * @start:     The smallest acceptable address the caller will take.
+ * @range:     The size of the area, starting at @start, within which the
+ *             random address must fall.
+ *
+ * If @start + @range would overflow, @range is capped.
+ *
+ * NOTE: Historical use of randomize_range, which this replaces, presumed that
+ * @start was already page aligned.  We now align it regardless.
+ *
+ * Return: A page aligned address within [start, start + range).  On error,
+ * @start is returned.
+ */
+unsigned long randomize_page(unsigned long start, unsigned long range)
+{
+       if (!PAGE_ALIGNED(start)) {
+               range -= PAGE_ALIGN(start) - start;
+               start = PAGE_ALIGN(start);
+       }
+
+       if (start > ULONG_MAX - range)
+               range = ULONG_MAX - start;
+
+       range >>= PAGE_SHIFT;
+
+       if (range == 0)
+               return start;
+
+       return start + (get_random_long() % range << PAGE_SHIFT);
+}
+
+#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+       /* Is the current task 32bit ? */
+       if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
+               return randomize_page(mm->brk, SZ_32M);
+
+       return randomize_page(mm->brk, SZ_1G);
+}
+
+unsigned long arch_mmap_rnd(void)
+{
+       unsigned long rnd;
+
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+       if (is_compat_task())
+               rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
+       else
+#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
+               rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
+
+       return rnd << PAGE_SHIFT;
+}
+EXPORT_SYMBOL_GPL(arch_mmap_rnd);
+
+static int mmap_is_legacy(struct rlimit *rlim_stack)
+{
+       if (current->personality & ADDR_COMPAT_LAYOUT)
+               return 1;
+
+       if (rlim_stack->rlim_cur == RLIM_INFINITY)
+               return 1;
+
+       return sysctl_legacy_va_layout;
+}
+
+/*
+ * Leave enough space between the mmap area and the stack to honour ulimit in
+ * the face of randomisation.
+ */
+#define MIN_GAP                (SZ_128M)
+#define MAX_GAP                (STACK_TOP / 6 * 5)
+
+static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
+{
+       unsigned long gap = rlim_stack->rlim_cur;
+       unsigned long pad = stack_guard_gap;
+
+       /* Account for stack randomization if necessary */
+       if (current->flags & PF_RANDOMIZE)
+               pad += (STACK_RND_MASK << PAGE_SHIFT);
+
+       /* Values close to RLIM_INFINITY can overflow. */
+       if (gap + pad > gap)
+               gap += pad;
+
+       if (gap < MIN_GAP)
+               gap = MIN_GAP;
+       else if (gap > MAX_GAP)
+               gap = MAX_GAP;
+
+       return PAGE_ALIGN(STACK_TOP - gap - rnd);
+}
+
+void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
+{
+       unsigned long random_factor = 0UL;
+
+       if (current->flags & PF_RANDOMIZE)
+               random_factor = arch_mmap_rnd();
+
+       if (mmap_is_legacy(rlim_stack)) {
+               mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
+               mm->get_unmapped_area = arch_get_unmapped_area;
+       } else {
+               mm->mmap_base = mmap_base(random_factor, rlim_stack);
+               mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+       }
+}
+#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
 void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
 {
        mm->mmap_base = TASK_UNMAPPED_BASE;
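
randomize_stack_top() keeps at most STACK_RND_MASK + 1 page-sized offsets of entropy, i.e. 0x7ff + 1 = 2048 pages (8 MB) on a 4 KB-page build. A hedged, userspace-style sketch of the same mask-and-shift arithmetic (plain C with made-up values; it only mirrors the logic, it is not code from this patch):

    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SHIFT      12                              /* assume 4 KB pages */
    #define STACK_RND_MASK  (0x7ff >> (PAGE_SHIFT - 12))    /* 8MB of VA */

    int main(void)
    {
            unsigned long stack_top = 0x7ffffffff000UL;     /* example, page aligned */
            unsigned long rnd = (unsigned long)rand() & STACK_RND_MASK;

            rnd <<= PAGE_SHIFT;                             /* pages -> bytes */
            /* the stack grows down on most architectures: subtract the offset */
            printf("randomized top: %#lx (offset %#lx)\n", stack_top - rnd, rnd);
            return 0;
    }
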
@@ -294,52 +454,79 @@
 }
 #endif
 
-/*
- * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
- * back to the regular GUP.
- * Note a difference with get_user_pages_fast: this always returns the
- * number of pages pinned, 0 if no pages were pinned.
- * If the architecture does not support this function, simply return with no
- * pages pinned.
+/**
+ * __account_locked_vm - account locked pages to an mm's locked_vm
+ * @mm:          mm to account against
+ * @pages:       number of pages to account
+ * @inc:         %true if @pages should be considered positive, %false if not
+ * @task:        task used to check RLIMIT_MEMLOCK
+ * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
+ *
+ * Assumes @task and @mm are valid (i.e. at least one reference on each), and
+ * that mmap_lock is held as writer.
+ *
+ * Return:
+ * * 0       on success
+ * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
  */
-int __weak __get_user_pages_fast(unsigned long start,
-                                int nr_pages, int write, struct page **pages)
+int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
+                       struct task_struct *task, bool bypass_rlim)
 {
-       return 0;
+       unsigned long locked_vm, limit;
+       int ret = 0;
+
+       mmap_assert_write_locked(mm);
+
+       locked_vm = mm->locked_vm;
+       if (inc) {
+               if (!bypass_rlim) {
+                       limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+                       if (locked_vm + pages > limit)
+                               ret = -ENOMEM;
+               }
+               if (!ret)
+                       mm->locked_vm = locked_vm + pages;
+       } else {
+               WARN_ON_ONCE(pages > locked_vm);
+               mm->locked_vm = locked_vm - pages;
+       }
+
+       pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
+                (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
+                locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
+                ret ? " - exceeded" : "");
+
+       return ret;
 }
-EXPORT_SYMBOL_GPL(__get_user_pages_fast);
+EXPORT_SYMBOL_GPL(__account_locked_vm);
 
 /**
- * get_user_pages_fast() - pin user pages in memory
- * @start:     starting user address
- * @nr_pages:  number of pages from start to pin
- * @write:     whether pages will be written to
- * @pages:     array that receives pointers to the pages pinned.
- *             Should be at least nr_pages long.
+ * account_locked_vm - account locked pages to an mm's locked_vm
+ * @mm:          mm to account against, may be NULL
+ * @pages:       number of pages to account
+ * @inc:         %true if @pages should be considered positive, %false if not
  *
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno.
+ * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
  *
- * get_user_pages_fast provides equivalent functionality to get_user_pages,
- * operating on current and current->mm, with force=0 and vma=NULL. However
- * unlike get_user_pages, it must be called without mmap_sem held.
- *
- * get_user_pages_fast may take mmap_sem and page table locks, so no
- * assumptions can be made about lack of locking. get_user_pages_fast is to be
- * implemented in a way that is advantageous (vs get_user_pages()) when the
- * user memory area is already faulted in and present in ptes. However if the
- * pages have to be faulted in, it may turn out to be slightly slower so
- * callers need to carefully consider what to use. On many architectures,
- * get_user_pages_fast simply falls back to get_user_pages.
+ * Return:
+ * * 0       on success, or if mm is NULL
+ * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
  */
-int __weak get_user_pages_fast(unsigned long start,
-                               int nr_pages, int write, struct page **pages)
+int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
 {
-       return get_user_pages_unlocked(start, nr_pages, pages,
-                                      write ? FOLL_WRITE : 0);
+       int ret;
+
+       if (pages == 0 || !mm)
+               return 0;
+
+       mmap_write_lock(mm);
+       ret = __account_locked_vm(mm, pages, inc, current,
+                                 capable(CAP_IPC_LOCK));
+       mmap_write_unlock(mm);
+
+       return ret;
 }
-EXPORT_SYMBOL_GPL(get_user_pages_fast);
+EXPORT_SYMBOL_GPL(account_locked_vm);
 
 unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
        unsigned long len, unsigned long prot,
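
account_locked_vm() is what a pinning driver calls before pinning user pages and again (with inc == false) on teardown; __account_locked_vm() is for callers that already hold mmap_lock for writing, as the kernel-doc above states. A hedged caller sketch (the demo_* helpers are invented and the actual pinning step is left out):

    #include <linux/mm.h>
    #include <linux/sched/mm.h>

    static int demo_charge(struct mm_struct *mm, unsigned long npages)
    {
            int ret = account_locked_vm(mm, npages, true);

            if (ret)
                    return ret;     /* -ENOMEM: RLIMIT_MEMLOCK would be exceeded */

            /* ... pin the pages here ... */
            return 0;
    }

    static void demo_uncharge(struct mm_struct *mm, unsigned long npages)
    {
            account_locked_vm(mm, npages, false);
    }
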
@@ -352,15 +539,16 @@
 
        ret = security_mmap_file(file, prot, flag);
        if (!ret) {
-               if (down_write_killable(&mm->mmap_sem))
+               if (mmap_write_lock_killable(mm))
                        return -EINTR;
-               ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
-                                   &populate, &uf);
-               up_write(&mm->mmap_sem);
+               ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate,
+                             &uf);
+               mmap_write_unlock(mm);
                userfaultfd_unmap_complete(mm, &uf);
                if (populate)
                        mm_populate(ret, populate);
        }
+       trace_android_vh_check_mmap_file(file, prot, flag, ret);
        return ret;
 }
 
@@ -393,11 +581,14 @@
  *
  * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not
  * fall back to vmalloc.
+ *
+ * Return: pointer to the allocated memory of %NULL in case of failure
  */
 void *kvmalloc_node(size_t size, gfp_t flags, int node)
 {
        gfp_t kmalloc_flags = flags;
        void *ret;
+       bool use_vmalloc = false;
 
        /*
         * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables)
@@ -405,6 +596,10 @@
         */
        if ((flags & GFP_KERNEL) != GFP_KERNEL)
                return kmalloc_node(size, flags, node);
+
+       trace_android_vh_kvmalloc_node_use_vmalloc(size, &kmalloc_flags, &use_vmalloc);
+       if (use_vmalloc)
+               goto use_vmalloc_node;
 
        /*
         * We want to attempt a large physically contiguous block first because
@@ -429,7 +624,14 @@
        if (ret || size <= PAGE_SIZE)
                return ret;
 
-       return __vmalloc_node_flags_caller(size, node, flags,
+       /* Don't even allow crazy sizes */
+       if (unlikely(size > INT_MAX)) {
+               WARN_ON_ONCE(!(flags & __GFP_NOWARN));
+               return NULL;
+       }
+
+use_vmalloc_node:
+       return __vmalloc_node(size, 1, flags, node,
                        __builtin_return_address(0));
 }
 EXPORT_SYMBOL(kvmalloc_node);
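
The kvmalloc_node() changes above keep one property intact: whether the allocation was satisfied by the slab path or fell back to vmalloc, the caller frees it with kvfree() and never needs to know which path was taken. A hedged usage sketch (the demo_* names and the u32 table are arbitrary):

    #include <linux/mm.h>
    #include <linux/slab.h>

    static u32 *demo_alloc_table(size_t nr_entries)
    {
            /* physically contiguous if possible, vmalloc-backed otherwise */
            return kvmalloc_array(nr_entries, sizeof(u32),
                                  GFP_KERNEL | __GFP_ZERO);
    }

    static void demo_free_table(u32 *table)
    {
            kvfree(table);          /* works for either backing store */
    }
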
@@ -442,7 +644,7 @@
  * It is slightly more efficient to use kfree() or vfree() if you are certain
  * that you know which one to use.
  *
- * Context: Any context except NMI.
+ * Context: Either preemptible task context or not-NMI interrupt.
  */
 void kvfree(const void *addr)
 {
@@ -470,6 +672,21 @@
        }
 }
 EXPORT_SYMBOL(kvfree_sensitive);
+
+void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
+{
+       void *newp;
+
+       if (oldsize >= newsize)
+               return (void *)p;
+       newp = kvmalloc(newsize, flags);
+       if (!newp)
+               return NULL;
+       memcpy(newp, p, oldsize);
+       kvfree(p);
+       return newp;
+}
+EXPORT_SYMBOL(kvrealloc);
 
 static inline void *__page_rmapping(struct page *page)
 {
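
As added here, kvrealloc() copies the old contents into a fresh kvmalloc() buffer and frees the old one only on success; on failure it returns NULL and leaves the original allocation untouched, so a caller must not overwrite its only pointer with the return value. A hedged sketch of that pattern (demo_grow is invented):

    #include <linux/errno.h>
    #include <linux/mm.h>

    static int demo_grow(void **bufp, size_t oldsize, size_t newsize)
    {
            void *newbuf = kvrealloc(*bufp, oldsize, newsize, GFP_KERNEL);

            if (!newbuf)
                    return -ENOMEM; /* *bufp is still valid and still owned */

            *bufp = newbuf;         /* old buffer was freed by kvrealloc() */
            return 0;
    }
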
@@ -503,7 +720,7 @@
                return true;
        if (PageHuge(page))
                return false;
-       for (i = 0; i < (1 << compound_order(page)); i++) {
+       for (i = 0; i < compound_nr(page); i++) {
                if (atomic_read(&page[i]._mapcount) >= 0)
                        return true;
        }
@@ -584,9 +801,8 @@
 unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
 unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
 
-int overcommit_ratio_handler(struct ctl_table *table, int write,
-                            void __user *buffer, size_t *lenp,
-                            loff_t *ppos)
+int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer,
+               size_t *lenp, loff_t *ppos)
 {
        int ret;
 
@@ -596,9 +812,49 @@
        return ret;
 }
 
-int overcommit_kbytes_handler(struct ctl_table *table, int write,
-                             void __user *buffer, size_t *lenp,
-                             loff_t *ppos)
+static void sync_overcommit_as(struct work_struct *dummy)
+{
+       percpu_counter_sync(&vm_committed_as);
+}
+
+int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
+               size_t *lenp, loff_t *ppos)
+{
+       struct ctl_table t;
+       int new_policy = -1;
+       int ret;
+
+       /*
+        * The deviation of sync_overcommit_as could be big with loose policy
+        * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
+        * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
+        * with the strict "NEVER", and to avoid possible race condtion (even
+        * though user usually won't too frequently do the switching to policy
+        * OVERCOMMIT_NEVER), the switch is done in the following order:
+        *      1. changing the batch
+        *      2. sync percpu count on each CPU
+        *      3. switch the policy
+        */
+       if (write) {
+               t = *table;
+               t.data = &new_policy;
+               ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+               if (ret || new_policy == -1)
+                       return ret;
+
+               mm_compute_batch(new_policy);
+               if (new_policy == OVERCOMMIT_NEVER)
+                       schedule_on_each_cpu(sync_overcommit_as);
+               sysctl_overcommit_memory = new_policy;
+       } else {
+               ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+       }
+
+       return ret;
+}
+
+int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer,
+               size_t *lenp, loff_t *ppos)
 {
        int ret;
 
@@ -618,7 +874,7 @@
        if (sysctl_overcommit_kbytes)
                allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
        else
-               allowed = ((totalram_pages - hugetlb_total_pages())
+               allowed = ((totalram_pages() - hugetlb_total_pages())
                           * sysctl_overcommit_ratio / 100);
        allowed += total_swap_pages;
 
@@ -638,10 +894,15 @@
  * balancing memory across competing virtual machines that are hosted.
  * Several metrics drive this policy engine including the guest reported
  * memory commitment.
+ *
+ * The time cost of this is very low for small platforms, and for big
+ * platform like a 2S/36C/72T Skylake server, in worst case where
+ * vm_committed_as's spinlock is under severe contention, the time cost
+ * could be about 30~40 microseconds.
  */
 unsigned long vm_memory_committed(void)
 {
-       return percpu_counter_read_positive(&vm_committed_as);
+       return percpu_counter_sum_positive(&vm_committed_as);
 }
 EXPORT_SYMBOL_GPL(vm_memory_committed);
 
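
The switch from percpu_counter_read_positive() to percpu_counter_sum_positive() trades speed for precision: the former returns the cached global count, which can lag by roughly the batch size times the number of CPUs, while the latter folds in every CPU's local delta, which is the cost the new kernel-doc paragraph above puts at about 30~40 microseconds under heavy contention. A hedged sketch of the difference on an arbitrary, already-initialised counter (demo_report is invented):

    #include <linux/percpu_counter.h>
    #include <linux/printk.h>

    static void demo_report(struct percpu_counter *fbc)
    {
            s64 approx = percpu_counter_read_positive(fbc); /* cheap, may be stale */
            s64 exact = percpu_counter_sum_positive(fbc);   /* folds per-CPU deltas */

            pr_info("approx=%lld exact=%lld\n", approx, exact);
    }
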
@@ -663,11 +924,7 @@
  */
 int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 {
-       long free, allowed, reserve;
-
-       VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
-                       -(s64)vm_committed_as_batch * num_online_cpus(),
-                       "memory commitment underflow");
+       long allowed;
 
        vm_acct_memory(pages);
 
@@ -678,51 +935,9 @@
                return 0;
 
        if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
-               free = global_zone_page_state(NR_FREE_PAGES);
-               free += global_node_page_state(NR_FILE_PAGES);
-
-               /*
-                * shmem pages shouldn't be counted as free in this
-                * case, they can't be purged, only swapped out, and
-                * that won't affect the overall amount of available
-                * memory in the system.
-                */
-               free -= global_node_page_state(NR_SHMEM);
-
-               free += get_nr_swap_pages();
-
-               /*
-                * Any slabs which are created with the
-                * SLAB_RECLAIM_ACCOUNT flag claim to have contents
-                * which are reclaimable, under pressure. The dentry
-                * cache and most inode caches should fall into this
-                */
-               free += global_node_page_state(NR_SLAB_RECLAIMABLE);
-
-               /*
-                * Part of the kernel memory, which can be released
-                * under memory pressure.
-                */
-               free += global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
-
-               /*
-                * Leave reserved pages. The pages are not for anonymous pages.
-                */
-               if (free <= totalreserve_pages)
+               if (pages > totalram_pages() + total_swap_pages)
                        goto error;
-               else
-                       free -= totalreserve_pages;
-
-               /*
-                * Reserve some for root
-                */
-               if (!cap_sys_admin)
-                       free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-
-               if (free > pages)
-                       return 0;
-
-               goto error;
+               return 0;
        }
 
        allowed = vm_commit_limit();
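
With the simplified OVERCOMMIT_GUESS heuristic above, a single request is refused only when it exceeds the sum of all RAM and all swap, rather than being checked against an estimate of currently free and reclaimable memory. A hedged back-of-the-envelope check in plain C (the page counts are invented example figures, not kernel values):

    #include <stdbool.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned long totalram = 4UL << 20;     /* 4M pages = 16 GiB at 4 KiB/page */
            unsigned long totalswap = 2UL << 20;    /* 2M pages =  8 GiB */
            unsigned long request = 7UL << 20;      /* 28 GiB, expressed in pages */
            bool denied = request > totalram + totalswap;

            printf("request is %s\n", denied ? "denied" : "allowed");
            return 0;
    }
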
@@ -736,7 +951,8 @@
         * Don't let a single process grow so big a user can't recover
         */
        if (mm) {
-               reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+               long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+
                allowed -= min_t(long, mm->total_vm / 32, reserve);
        }
 
@@ -754,7 +970,8 @@
  * @buffer: the buffer to copy to.
  * @buflen: the length of the buffer. Larger cmdline values are truncated
  * to this length.
- * Returns the size of the cmdline field copied. Note that the copy does
+ *
+ * Return: the size of the cmdline field copied. Note that the copy does
  * not guarantee an ending NULL byte.
  */
 int get_cmdline(struct task_struct *task, char *buffer, int buflen)
@@ -768,12 +985,12 @@
        if (!mm->arg_end)
                goto out_mm;    /* Shh! No looking before we're done */
 
-       down_read(&mm->mmap_sem);
+       spin_lock(&mm->arg_lock);
        arg_start = mm->arg_start;
        arg_end = mm->arg_end;
        env_start = mm->env_start;
        env_end = mm->env_end;
-       up_read(&mm->mmap_sem);
+       spin_unlock(&mm->arg_lock);
 
        len = arg_end - arg_start;
 
@@ -805,3 +1022,16 @@
 out:
        return res;
 }
+
+int __weak memcmp_pages(struct page *page1, struct page *page2)
+{
+       char *addr1, *addr2;
+       int ret;
+
+       addr1 = kmap_atomic(page1);
+       addr2 = kmap_atomic(page2);
+       ret = memcmp(addr1, addr2, PAGE_SIZE);
+       kunmap_atomic(addr2);
+       kunmap_atomic(addr1);
+       return ret;
+}
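
memcmp_pages() is defined __weak so an architecture with a cheaper or stricter page-comparison primitive can override this generic kmap_atomic()-based version; callers simply get memcmp() semantics over one page. A hedged caller sketch, mirroring the kind of identity check KSM performs (demo_pages_identical is invented; linux/mm.h provides a similar pages_identical() helper):

    #include <linux/mm.h>

    static bool demo_pages_identical(struct page *a, struct page *b)
    {
            return memcmp_pages(a, b) == 0; /* 0 means byte-for-byte equal */
    }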