[...]

 #include "internal.h"

-static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
+static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
 	pud_t *pud;
-	pmd_t *pmd;

 	pgd = pgd_offset(mm, addr);
 	if (pgd_none_or_clear_bad(pgd))
[...]
 	if (pud_none_or_clear_bad(pud))
 		return NULL;

+	return pud;
+}
+
+static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pud = get_old_pud(mm, addr);
+	if (!pud)
+		return NULL;
+
 	pmd = pmd_offset(pud, addr);
 	if (pmd_none(*pmd))
 		return NULL;
[...]
 	return pmd;
 }

-static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long addr)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;

 	pgd = pgd_offset(mm, addr);
 	p4d = p4d_alloc(mm, pgd, addr);
 	if (!p4d)
 		return NULL;
-	pud = pud_alloc(mm, p4d, addr);
+
+	return pud_alloc(mm, p4d, addr);
+}
+
+static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long addr)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pud = alloc_new_pud(mm, vma, addr);
 	if (!pud)
 		return NULL;

[...]
 	 * such races:
 	 *
 	 * - During exec() shift_arg_pages(), we use a specially tagged vma
-	 *   which rmap call sites look for using is_vma_temporary_stack().
+	 *   which rmap call sites look for using vma_is_temporary_stack().
 	 *
 	 * - During mremap(), new_vma is often known to be placed after vma
 	 *   in rmap traversal order. This ensures rmap will always observe
[...]

 	/*
 	 * We don't have to worry about the ordering of src and dst
-	 * pte locks because exclusive mmap_sem prevents deadlock.
+	 * pte locks because exclusive mmap_lock prevents deadlock.
 	 */
 	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
 	new_pte = pte_offset_map(new_pmd, new_addr);
[...]
 		drop_rmap_locks(vma);
 }

+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+static inline bool trylock_vma_ref_count(struct vm_area_struct *vma)
+{
+	/*
+	 * If we have the only reference, swap the refcount to -1. This
+	 * will prevent other concurrent references by get_vma() for SPFs.
+	 */
+	return atomic_cmpxchg(&vma->vm_ref_count, 1, -1) == 1;
+}
+
+/*
+ * Restore the VMA reference count to 1 after a fast mremap.
+ */
+static inline void unlock_vma_ref_count(struct vm_area_struct *vma)
+{
+	/*
+	 * This should only be called after a corresponding,
+	 * successful trylock_vma_ref_count().
+	 */
+	VM_BUG_ON_VMA(atomic_cmpxchg(&vma->vm_ref_count, -1, 1) != -1,
+		      vma);
+}
+#else /* !CONFIG_SPECULATIVE_PAGE_FAULT */
+static inline bool trylock_vma_ref_count(struct vm_area_struct *vma)
+{
+	return true;
+}
+static inline void unlock_vma_ref_count(struct vm_area_struct *vma)
+{
+}
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
+
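The compare-and-swap trick used by these helpers is easier to see outside the kernel. Below is a minimal userspace sketch of the same idea using C11 atomics; the names and the plain atomic counter are invented for illustration and are not the kernel's implementation. The swap from 1 to -1 can only succeed for the sole reference holder, and the negative value keeps any other path from taking a new reference until the count is restored.

```c
/* Illustrative userspace sketch, not kernel code. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int ref_count = 1;	/* 1 == only the owner holds a reference */

/* Succeeds only if we are the sole reference holder; -1 blocks new references. */
static bool trylock_ref_count(void)
{
	int expected = 1;
	return atomic_compare_exchange_strong(&ref_count, &expected, -1);
}

/* Must pair with a successful trylock_ref_count(). */
static void unlock_ref_count(void)
{
	int expected = -1;
	atomic_compare_exchange_strong(&ref_count, &expected, 1);
}

int main(void)
{
	printf("first trylock:  %d\n", trylock_ref_count());	/* 1: we own it */
	printf("second trylock: %d\n", trylock_ref_count());	/* 0: already -1 */
	unlock_ref_count();
	printf("after unlock:   %d\n", trylock_ref_count());	/* 1 again */
	return 0;
}
```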
+#ifdef CONFIG_HAVE_MOVE_PMD
+static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
+		unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
+{
+	spinlock_t *old_ptl, *new_ptl;
+	struct mm_struct *mm = vma->vm_mm;
+	pmd_t pmd;
+
+	/*
+	 * The destination pmd shouldn't be established, free_pgtables()
+	 * should have released it.
+	 *
+	 * However, there's a case during execve() where we use mremap
+	 * to move the initial stack, and in that case the target area
+	 * may overlap the source area (always moving down).
+	 *
+	 * If everything is PMD-aligned, that works fine, as moving
+	 * each pmd down will clear the source pmd. But if we first
+	 * have a few 4kB-only pages that get moved down, and then
+	 * hit the "now the rest is PMD-aligned, let's do everything
+	 * one pmd at a time", we will still have the old (now empty
+	 * of any 4kB pages, but still there) PMD in the page table
+	 * tree.
+	 *
+	 * Warn on it once - because we really should try to figure
+	 * out how to do this better - but then say "I won't move
+	 * this pmd".
+	 *
+	 * One alternative might be to just unmap the target pmd at
+	 * this point, and verify that it really is empty. We'll see.
+	 */
+	if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
+		return false;
+
+	/*
+	 * We hold both exclusive mmap_lock and rmap_lock at this point and
+	 * cannot block. If we cannot immediately take exclusive ownership
+	 * of the VMA, fall back to move_ptes().
+	 */
+	if (!trylock_vma_ref_count(vma))
+		return false;
+
+	/*
+	 * We don't have to worry about the ordering of src and dst
+	 * ptlocks because exclusive mmap_lock prevents deadlock.
+	 */
+	old_ptl = pmd_lock(vma->vm_mm, old_pmd);
+	new_ptl = pmd_lockptr(mm, new_pmd);
+	if (new_ptl != old_ptl)
+		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+
+	/* Clear the pmd */
+	pmd = *old_pmd;
+	pmd_clear(old_pmd);
+
+	VM_BUG_ON(!pmd_none(*new_pmd));
+
+	/* Set the new pmd */
+	set_pmd_at(mm, new_addr, new_pmd, pmd);
+	flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
+	if (new_ptl != old_ptl)
+		spin_unlock(new_ptl);
+	spin_unlock(old_ptl);
+
+	unlock_vma_ref_count(vma);
+	return true;
+}
+#else
+static inline bool move_normal_pmd(struct vm_area_struct *vma,
+		unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd,
+		pmd_t *new_pmd)
+{
+	return false;
+}
+#endif
+
+#ifdef CONFIG_HAVE_MOVE_PUD
+static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
+		unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
+{
+	spinlock_t *old_ptl, *new_ptl;
+	struct mm_struct *mm = vma->vm_mm;
+	pud_t pud;
+
+	/*
+	 * The destination pud shouldn't be established, free_pgtables()
+	 * should have released it.
+	 */
+	if (WARN_ON_ONCE(!pud_none(*new_pud)))
+		return false;
+
+	/*
+	 * We hold both exclusive mmap_lock and rmap_lock at this point and
+	 * cannot block. If we cannot immediately take exclusive ownership
+	 * of the VMA, fall back to move_ptes().
+	 */
+	if (!trylock_vma_ref_count(vma))
+		return false;
+
+	/*
+	 * We don't have to worry about the ordering of src and dst
+	 * ptlocks because exclusive mmap_lock prevents deadlock.
+	 */
+	old_ptl = pud_lock(vma->vm_mm, old_pud);
+	new_ptl = pud_lockptr(mm, new_pud);
+	if (new_ptl != old_ptl)
+		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+
+	/* Clear the pud */
+	pud = *old_pud;
+	pud_clear(old_pud);
+
+	VM_BUG_ON(!pud_none(*new_pud));
+
+	/* Set the new pud */
+	set_pud_at(mm, new_addr, new_pud, pud);
+	flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
+	if (new_ptl != old_ptl)
+		spin_unlock(new_ptl);
+	spin_unlock(old_ptl);
+
+	unlock_vma_ref_count(vma);
+	return true;
+}
+#else
+static inline bool move_normal_pud(struct vm_area_struct *vma,
+		unsigned long old_addr, unsigned long new_addr, pud_t *old_pud,
+		pud_t *new_pud)
+{
+	return false;
+}
+#endif
+
+enum pgt_entry {
+	NORMAL_PMD,
+	HPAGE_PMD,
+	NORMAL_PUD,
+};
+
+/*
+ * Returns an extent of the corresponding size for the pgt_entry specified if
+ * valid. Else returns a smaller extent bounded by the end of the source and
+ * destination pgt_entry.
+ */
+static __always_inline unsigned long get_extent(enum pgt_entry entry,
+			unsigned long old_addr, unsigned long old_end,
+			unsigned long new_addr)
+{
+	unsigned long next, extent, mask, size;
+
+	switch (entry) {
+	case HPAGE_PMD:
+	case NORMAL_PMD:
+		mask = PMD_MASK;
+		size = PMD_SIZE;
+		break;
+	case NORMAL_PUD:
+		mask = PUD_MASK;
+		size = PUD_SIZE;
+		break;
+	default:
+		BUILD_BUG();
+		break;
+	}
+
+	next = (old_addr + size) & mask;
+	/* even if next overflowed, extent below will be ok */
+	extent = next - old_addr;
+	if (extent > old_end - old_addr)
+		extent = old_end - old_addr;
+	next = (new_addr + size) & mask;
+	if (extent > next - new_addr)
+		extent = next - new_addr;
+	return extent;
+}
+
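To make the masking arithmetic concrete, here is a small standalone userspace sketch of the PMD case of get_extent() with a hard-coded 2 MiB PMD size (the x86-64 value); the addresses and the helper name are made up for the example.

```c
/* Worked example of the PMD-extent arithmetic; illustrative only. */
#include <stdio.h>

#define PMD_SIZE 0x200000UL
#define PMD_MASK (~(PMD_SIZE - 1))

static unsigned long pmd_extent(unsigned long old_addr, unsigned long old_end,
				unsigned long new_addr)
{
	unsigned long next, extent;

	next = (old_addr + PMD_SIZE) & PMD_MASK;	/* next boundary on the source side */
	extent = next - old_addr;
	if (extent > old_end - old_addr)
		extent = old_end - old_addr;
	next = (new_addr + PMD_SIZE) & PMD_MASK;	/* the destination may allow less */
	if (extent > next - new_addr)
		extent = next - new_addr;
	return extent;
}

int main(void)
{
	/* Unaligned destination: only 0x1000 bytes can be stepped this iteration. */
	printf("%#lx\n", pmd_extent(0x2ff000, 0x600000, 0x5ff000));
	/* Both sides PMD-aligned: a full PMD_SIZE step, eligible for move_normal_pmd(). */
	printf("%#lx\n", pmd_extent(0x400000, 0x800000, 0x600000));
	return 0;
}
```

Only when both the source and destination addresses line up on a boundary (and enough of the range remains) does the extent reach PMD_SIZE or PUD_SIZE, which is what makes the fast move_normal_pmd()/move_normal_pud() paths eligible in move_page_tables() below.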
+/*
+ * Attempts to speedup the move by moving entry at the level corresponding to
+ * pgt_entry. Returns true if the move was successful, else false.
+ */
+static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
+			unsigned long old_addr, unsigned long new_addr,
+			void *old_entry, void *new_entry, bool need_rmap_locks)
+{
+	bool moved = false;
+
+	/* See comment in move_ptes() */
+	if (need_rmap_locks)
+		take_rmap_locks(vma);
+
+	switch (entry) {
+	case NORMAL_PMD:
+		moved = move_normal_pmd(vma, old_addr, new_addr, old_entry,
+					new_entry);
+		break;
+	case NORMAL_PUD:
+		moved = move_normal_pud(vma, old_addr, new_addr, old_entry,
+					new_entry);
+		break;
+	case HPAGE_PMD:
+		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+			move_huge_pmd(vma, old_addr, new_addr, old_entry,
+				      new_entry);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		break;
+	}
+
+	if (need_rmap_locks)
+		drop_rmap_locks(vma);
+
+	return moved;
+}
+
 unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
 		unsigned long new_addr, unsigned long len,
 		bool need_rmap_locks)
 {
-	unsigned long extent, next, old_end;
+	unsigned long extent, old_end;
+	struct mmu_notifier_range range;
 	pmd_t *old_pmd, *new_pmd;
-	unsigned long mmun_start;	/* For mmu_notifiers */
-	unsigned long mmun_end;	/* For mmu_notifiers */
+
+	if (!len)
+		return 0;

 	old_end = old_addr + len;
 	flush_cache_range(vma, old_addr, old_end);

-	mmun_start = old_addr;
-	mmun_end = old_end;
-	mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
+	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
+				old_addr, old_end);
+	mmu_notifier_invalidate_range_start(&range);

 	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
 		cond_resched();
-		next = (old_addr + PMD_SIZE) & PMD_MASK;
-		/* even if next overflowed, extent below will be ok */
-		extent = next - old_addr;
-		if (extent > old_end - old_addr)
-			extent = old_end - old_addr;
+		/*
+		 * If the extent is PUD-sized, try to speed up the move by
+		 * moving at the PUD level if possible.
+		 */
+		extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);
+		if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
+			pud_t *old_pud, *new_pud;
+
+			old_pud = get_old_pud(vma->vm_mm, old_addr);
+			if (!old_pud)
+				continue;
+			new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
+			if (!new_pud)
+				break;
+			if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
+					   old_pud, new_pud, true))
+				continue;
+		}
+
+		extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
 		old_pmd = get_old_pmd(vma->vm_mm, old_addr);
 		if (!old_pmd)
 			continue;
 		new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
 		if (!new_pmd)
 			break;
-		if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) || pmd_devmap(*old_pmd)) {
-			if (extent == HPAGE_PMD_SIZE) {
-				bool moved;
-				/* See comment in move_ptes() */
-				if (need_rmap_locks)
-					take_rmap_locks(vma);
-				moved = move_huge_pmd(vma, old_addr, new_addr,
-						      old_end, old_pmd, new_pmd);
-				if (need_rmap_locks)
-					drop_rmap_locks(vma);
-				if (moved)
-					continue;
-			}
+		if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
+		    pmd_devmap(*old_pmd)) {
+			if (extent == HPAGE_PMD_SIZE &&
+			    move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
+					   old_pmd, new_pmd, need_rmap_locks))
+				continue;
 			split_huge_pmd(vma, old_pmd, old_addr);
 			if (pmd_trans_unstable(old_pmd))
 				continue;
+		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
+			   extent == PMD_SIZE) {
+			/*
+			 * If the extent is PMD-sized, try to speed the move by
+			 * moving at the PMD level if possible.
+			 */
+			if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
+					   old_pmd, new_pmd, true))
+				continue;
 		}
-		if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr))
+
+		if (pte_alloc(new_vma->vm_mm, new_pmd))
 			break;
-		next = (new_addr + PMD_SIZE) & PMD_MASK;
-		if (extent > next - new_addr)
-			extent = next - new_addr;
 		move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
 			  new_pmd, new_addr, need_rmap_locks);
 	}

-	mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
+	mmu_notifier_invalidate_range_end(&range);

 	return len + old_addr - old_end; /* how much done */
 }
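To see how this ladder plays out, take x86-64 sizes (PUD_SIZE = 1 GiB, PMD_SIZE = 2 MiB) as an illustration: moving a 1 GiB range whose old and new addresses are both 1 GiB-aligned can be a single move_normal_pud() call; if the two addresses are only 2 MiB-aligned relative to each other, the same range is moved in 512 PMD-sized steps via move_normal_pmd() (or move_huge_pmd() where THP entries are present); and if old_addr and new_addr differ by something that is not a multiple of 2 MiB, no full-size extent ever lines up and every iteration falls through to move_ptes(), one page table entry at a time.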
[...]
 static unsigned long move_vma(struct vm_area_struct *vma,
 		unsigned long old_addr, unsigned long old_len,
 		unsigned long new_len, unsigned long new_addr,
-		bool *locked, struct vm_userfaultfd_ctx *uf,
-		struct list_head *uf_unmap)
+		bool *locked, unsigned long flags,
+		struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct vm_area_struct *new_vma;
[...]
 	if (!new_vma)
 		return -ENOMEM;

+	/*
+	 * new_vma is returned protected by copy_vma, to prevent a speculative
+	 * page fault from being handled in the destination area before we move
+	 * the PTEs. Now we must also protect the source VMA, since we don't
+	 * want pages to be mapped behind our back while we are copying the
+	 * PTEs.
+	 */
+	if (vma != new_vma)
+		vm_write_begin(vma);
+
 	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
 				     need_rmap_locks);
 	if (moved_len < old_len) {
[...]
 		 */
 		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
 				 true);
+		if (vma != new_vma)
+			vm_write_end(vma);
 		vma = new_vma;
 		old_len = new_len;
 		old_addr = new_addr;
[...]
 		mremap_userfaultfd_prep(new_vma, uf);
 		arch_remap(mm, old_addr, old_addr + old_len,
 			   new_addr, new_addr + new_len);
+		if (vma != new_vma)
+			vm_write_end(vma);
 	}
+	vm_write_end(new_vma);

 	/* Conceal VM_ACCOUNT so old reservation is not undone */
 	if (vm_flags & VM_ACCOUNT) {
[...]
 	if (unlikely(vma->vm_flags & VM_PFNMAP))
 		untrack_pfn_moved(vma);

+	if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
+		if (vm_flags & VM_ACCOUNT) {
+			/* Always put back VM_ACCOUNT since we won't unmap */
+			vma->vm_flags |= VM_ACCOUNT;
+
+			vm_acct_memory(new_len >> PAGE_SHIFT);
+		}
+
+		/*
+		 * VMAs can actually be merged back together in copy_vma
+		 * calling merge_vma. This can happen with anonymous vmas
+		 * which have not yet been faulted, so if we were to consider
+		 * this VMA split we'll end up adding VM_ACCOUNT on the
+		 * next VMA, which is completely unrelated if this VMA
+		 * was re-merged.
+		 */
+		if (split && new_vma == vma)
+			split = 0;
+
+		/* We always clear VM_LOCKED[ONFAULT] on the old vma */
+		vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+
+		/* Because we won't unmap we don't need to touch locked_vm */
+		goto out;
+	}
+
 	if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
 		/* OOM: unable to split vma, just get accounts right */
 		vm_unacct_memory(excess >> PAGE_SHIFT);
 		excess = 0;
 	}
+
+	if (vm_flags & VM_LOCKED) {
+		mm->locked_vm += new_len >> PAGE_SHIFT;
+		*locked = true;
+	}
+out:
 	mm->hiwater_vm = hiwater_vm;

 	/* Restore VM_ACCOUNT if one or two pieces of vma left */
[...]
 		vma->vm_next->vm_flags |= VM_ACCOUNT;
 	}

-	if (vm_flags & VM_LOCKED) {
-		mm->locked_vm += new_len >> PAGE_SHIFT;
-		*locked = true;
-	}
-
 	return new_addr;
 }

 static struct vm_area_struct *vma_to_resize(unsigned long addr,
-	unsigned long old_len, unsigned long new_len, unsigned long *p)
+	unsigned long old_len, unsigned long new_len, unsigned long flags,
+	unsigned long *p)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma = find_vma(mm, addr);
[...]
 		pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid);
 		return ERR_PTR(-EINVAL);
 	}
+
+	if ((flags & MREMAP_DONTUNMAP) &&
+			(vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
+		return ERR_PTR(-EINVAL);

 	if (is_vm_hugetlb_page(vma))
 		return ERR_PTR(-EINVAL);
[...]

 static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 		unsigned long new_addr, unsigned long new_len, bool *locked,
-		struct vm_userfaultfd_ctx *uf,
+		unsigned long flags, struct vm_userfaultfd_ctx *uf,
 		struct list_head *uf_unmap_early,
 		struct list_head *uf_unmap)
 {
[...]
 	struct vm_area_struct *vma;
 	unsigned long ret = -EINVAL;
 	unsigned long charged = 0;
-	unsigned long map_flags;
+	unsigned long map_flags = 0;

 	if (offset_in_page(new_addr))
 		goto out;
[...]
 	if (addr + old_len > new_addr && new_addr + new_len > addr)
 		goto out;

-	ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
-	if (ret)
-		goto out;
+	/*
+	 * move_vma() needs us to stay 4 maps below the threshold, otherwise
+	 * it will bail out at the very beginning.
+	 * That is a problem if we have already unmapped the regions here
+	 * (new_addr, and old_addr), because userspace will not know the
+	 * state of the VMAs after it gets -ENOMEM.
+	 * So, to avoid such a scenario we can pre-compute if the whole
+	 * operation has high chances to succeed map-wise.
+	 * Worst-case scenario is when both VMAs (new_addr and old_addr) get
+	 * split in 3 before unmapping them.
+	 * That means 2 more maps (1 for each) to the ones we already hold.
+	 * Check whether the current map count plus 2 still leads us to 4 maps
+	 * below the threshold, otherwise return -ENOMEM here to be more safe.
+	 */
+	if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
+		return -ENOMEM;
+
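For example, with the default sysctl_max_map_count of 65530 this pre-check starts returning -ENOMEM once mm->map_count reaches 65525, since 65525 + 2 >= 65530 - 3; the two extra entries stand in for the splits the operation may still create, so move_vma()'s own later headroom check is not the first place the caller learns that the address space ran out of map entries.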
+	if (flags & MREMAP_FIXED) {
+		ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
+		if (ret)
+			goto out;
+	}

 	if (old_len >= new_len) {
 		ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
[...]
 		old_len = new_len;
 	}

-	vma = vma_to_resize(addr, old_len, new_len, &charged);
+	vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
 	if (IS_ERR(vma)) {
 		ret = PTR_ERR(vma);
 		goto out;
 	}

-	map_flags = MAP_FIXED;
+	/* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
+	if (flags & MREMAP_DONTUNMAP &&
+	    !may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (flags & MREMAP_FIXED)
+		map_flags |= MAP_FIXED;
+
 	if (vma->vm_flags & VM_MAYSHARE)
 		map_flags |= MAP_SHARED;

 	ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
 				((addr - vma->vm_start) >> PAGE_SHIFT),
 				map_flags);
-	if (offset_in_page(ret))
+	if (IS_ERR_VALUE(ret))
 		goto out1;

-	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,
+	/* We got a new mapping */
+	if (!(flags & MREMAP_FIXED))
+		new_addr = ret;
+
+	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
 		       uf_unmap);
+
 	if (!(offset_in_page(ret)))
 		goto out;
+
 out1:
 	vm_unacct_memory(charged);

[...]
 	unsigned long ret = -EINVAL;
 	unsigned long charged = 0;
 	bool locked = false;
+	bool downgraded = false;
 	struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
 	LIST_HEAD(uf_unmap_early);
 	LIST_HEAD(uf_unmap);

+	/*
+	 * There is a deliberate asymmetry here: we strip the pointer tag
+	 * from the old address but leave the new address alone. This is
+	 * for consistency with mmap(), where we prevent the creation of
+	 * aliasing mappings in userspace by leaving the tag bits of the
+	 * mapping address intact. A non-zero tag will cause the subsequent
+	 * range checks to reject the address as invalid.
+	 *
+	 * See Documentation/arm64/tagged-address-abi.rst for more information.
+	 */
 	addr = untagged_addr(addr);

-	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
+	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
 		return ret;

 	if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
 		return ret;
+
+	/*
+	 * MREMAP_DONTUNMAP is always a move and it does not allow resizing
+	 * in the process.
+	 */
+	if (flags & MREMAP_DONTUNMAP &&
+	    (!(flags & MREMAP_MAYMOVE) || old_len != new_len))
+		return ret;
+

 	if (offset_in_page(addr))
 		return ret;
[...]
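As a userspace-level illustration of the rules enforced above (a minimal sketch, not part of the patch): MREMAP_DONTUNMAP must be combined with MREMAP_MAYMOVE, old_len must equal new_len, and on success the pages land at the new address while the old range stays mapped. The fallback #define is only for older uapi headers, and the mapping size is arbitrary.

```c
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MREMAP_DONTUNMAP
#define MREMAP_DONTUNMAP 4	/* older libc headers may not define it */
#endif

int main(void)
{
	size_t len = 2 * 1024 * 1024;
	void *old = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (old == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(old, 0xaa, len);

	/* Move the pages elsewhere but keep the old range mapped (now empty). */
	void *new = mremap(old, len, len, MREMAP_MAYMOVE | MREMAP_DONTUNMAP);
	if (new == MAP_FAILED)
		perror("mremap");
	else
		printf("pages moved from %p to %p; %p stays mapped\n", old, new, old);
	return 0;
}
```

On kernels that predate the flag the call fails with EINVAL, since unknown mremap flag bits are rejected by the check above.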
 	if (!new_len)
 		return ret;

-	if (down_write_killable(&current->mm->mmap_sem))
+	if (mmap_write_lock_killable(current->mm))
 		return -EINTR;

-	if (flags & MREMAP_FIXED) {
+	if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
 		ret = mremap_to(addr, old_len, new_addr, new_len,
-				&locked, &uf, &uf_unmap_early, &uf_unmap);
+				&locked, flags, &uf, &uf_unmap_early,
+				&uf_unmap);
 		goto out;
 	}

 	/*
 	 * Always allow a shrinking remap: that just unmaps
 	 * the unnecessary pages..
-	 * do_munmap does all the needed commit accounting
+	 * __do_munmap does all the needed commit accounting, and
+	 * downgrades mmap_lock to read if so directed.
 	 */
 	if (old_len >= new_len) {
-		ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap);
-		if (ret && old_len != new_len)
+		int retval;
+
+		retval = __do_munmap(mm, addr+new_len, old_len - new_len,
+				     &uf_unmap, true);
+		if (retval < 0 && old_len != new_len) {
+			ret = retval;
 			goto out;
+		/* Returning 1 indicates mmap_lock is downgraded to read. */
+		} else if (retval == 1)
+			downgraded = true;
 		ret = addr;
 		goto out;
 	}
[...]
 	/*
 	 * Ok, we need to grow..
 	 */
-	vma = vma_to_resize(addr, old_len, new_len, &charged);
+	vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
 	if (IS_ERR(vma)) {
 		ret = PTR_ERR(vma);
 		goto out;
[...]
 					vma->vm_pgoff +
 					((addr - vma->vm_start) >> PAGE_SHIFT),
 					map_flags);
-		if (offset_in_page(new_addr)) {
+		if (IS_ERR_VALUE(new_addr)) {
 			ret = new_addr;
 			goto out;
 		}

 		ret = move_vma(vma, addr, old_len, new_len, new_addr,
-				&locked, &uf, &uf_unmap);
+				&locked, flags, &uf, &uf_unmap);
 	}
 out:
 	if (offset_in_page(ret)) {
 		vm_unacct_memory(charged);
-		locked = 0;
+		locked = false;
 	}
-	up_write(&current->mm->mmap_sem);
+	if (downgraded)
+		mmap_read_unlock(current->mm);
+	else
+		mmap_write_unlock(current->mm);
 	if (locked && new_len > old_len)
 		mm_populate(new_addr + old_len, new_len - old_len);
 	userfaultfd_unmap_complete(mm, &uf_unmap_early);
-	mremap_userfaultfd_complete(&uf, addr, new_addr, old_len);
+	mremap_userfaultfd_complete(&uf, addr, ret, old_len);
 	userfaultfd_unmap_complete(mm, &uf_unmap);
 	return ret;
 }
---|