2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/mm/hugetlb.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Generic hugetlb support.
  * (C) Nadia Yvette Chambers, April 2004
@@ -15,9 +16,10 @@
 #include <linux/compiler.h>
 #include <linux/cpuset.h>
 #include <linux/mutex.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/sysfs.h>
 #include <linux/slab.h>
+#include <linux/sched/mm.h>
 #include <linux/mmdebug.h>
 #include <linux/sched/signal.h>
 #include <linux/rmap.h>
@@ -25,22 +27,30 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/jhash.h>
+#include <linux/numa.h>
+#include <linux/llist.h>
+#include <linux/cma.h>
 
 #include <asm/page.h>
-#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
 #include <asm/tlb.h>
 
 #include <linux/io.h>
 #include <linux/hugetlb.h>
 #include <linux/hugetlb_cgroup.h>
 #include <linux/node.h>
-#include <linux/userfaultfd_k.h>
 #include <linux/page_owner.h>
 #include "internal.h"
 
 int hugetlb_max_hstate __read_mostly;
 unsigned int default_hstate_idx;
 struct hstate hstates[HUGE_MAX_HSTATE];
+
+#ifdef CONFIG_CMA
+static struct cma *hugetlb_cma[MAX_NUMNODES];
+#endif
+static unsigned long hugetlb_cma_size __initdata;
+
 /*
  * Minimum page order among possible hugepage sizes, set to a proper value
  * at boot time.
@@ -52,8 +62,8 @@
 /* for command line parsing */
 static struct hstate * __initdata parsed_hstate;
 static unsigned long __initdata default_hstate_max_huge_pages;
-static unsigned long __initdata default_hstate_size;
 static bool __initdata parsed_valid_hugepagesz = true;
+static bool __initdata parsed_default_hugepagesz;
 
 /*
  * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
@@ -67,6 +77,9 @@
  */
 static int num_fault_mutexes;
 struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
+
+static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end);
 
 static inline bool PageHugeFreed(struct page *head)
 {
@@ -93,7 +106,7 @@
 	spin_unlock(&spool->lock);
 
 	/* If no pages are used, and no other handles to the subpool
-	 * remain, give up any reservations mased on minimum size and
+	 * remain, give up any reservations based on minimum size and
 	 * free the subpool */
 	if (free) {
 		if (spool->min_hpages != -1)
@@ -138,10 +151,10 @@
 /*
  * Subpool accounting for allocating and reserving pages.
  * Return -ENOMEM if there are not enough resources to satisfy the
- * the request. Otherwise, return the number of pages by which the
+ * request. Otherwise, return the number of pages by which the
 * global pools must be adjusted (upward). The returned value may
 * only be different than the passed value (delta) in the case where
- * a subpool minimum size must be manitained.
+ * a subpool minimum size must be maintained.
 */
 static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
				       long delta)
@@ -232,114 +245,317 @@
 	return subpool_inode(file_inode(vma->vm_file));
 }
 
-/*
- * Region tracking -- allows tracking of reservations and instantiated pages
- * across the pages in a mapping.
- *
- * The region data structures are embedded into a resv_map and protected
- * by a resv_map's lock. The set of regions within the resv_map represent
- * reservations for huge pages, or huge pages that have already been
- * instantiated within the map. The from and to elements are huge page
- * indicies into the associated mapping. from indicates the starting index
- * of the region. to represents the first index past the end of the region.
- *
- * For example, a file region structure with from == 0 and to == 4 represents
- * four huge pages in a mapping. It is important to note that the to element
- * represents the first element past the end of the region. This is used in
- * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
- *
- * Interval notation of the form [from, to) will be used to indicate that
- * the endpoint from is inclusive and to is exclusive.
+/* Helper that removes a struct file_region from the resv_map cache and returns
+ * it for use.
  */
-struct file_region {
-	struct list_head link;
-	long from;
-	long to;
-};
-
-/*
- * Add the huge page range represented by [f, t) to the reserve
- * map. In the normal case, existing regions will be expanded
- * to accommodate the specified range. Sufficient regions should
- * exist for expansion due to the previous call to region_chg
- * with the same range. However, it is possible that region_del
- * could have been called after region_chg and modifed the map
- * in such a way that no region exists to be expanded. In this
- * case, pull a region descriptor from the cache associated with
- * the map and use that for the new range.
- *
- * Return the number of new huge pages added to the map. This
- * number is greater than or equal to zero.
- */
-static long region_add(struct resv_map *resv, long f, long t)
+static struct file_region *
+get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
 {
-	struct list_head *head = &resv->regions;
-	struct file_region *rg, *nrg, *trg;
-	long add = 0;
+	struct file_region *nrg = NULL;
 
-	spin_lock(&resv->lock);
-	/* Locate the region we are either in or before. */
-	list_for_each_entry(rg, head, link)
-		if (f <= rg->to)
-			break;
+	VM_BUG_ON(resv->region_cache_count <= 0);
 
-	/*
-	 * If no region exists which can be expanded to include the
-	 * specified range, the list must have been modified by an
-	 * interleving call to region_del(). Pull a region descriptor
-	 * from the cache and use it for this range.
-	 */
-	if (&rg->link == head || t < rg->from) {
-		VM_BUG_ON(resv->region_cache_count <= 0);
+	resv->region_cache_count--;
+	nrg = list_first_entry(&resv->region_cache, struct file_region, link);
+	list_del(&nrg->link);
 
-		resv->region_cache_count--;
-		nrg = list_first_entry(&resv->region_cache, struct file_region,
-					link);
-		list_del(&nrg->link);
+	nrg->from = from;
+	nrg->to = to;
 
-		nrg->from = f;
-		nrg->to = t;
-		list_add(&nrg->link, rg->link.prev);
+	return nrg;
+}
 
-		add += t - f;
-		goto out_locked;
+static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
+					      struct file_region *rg)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+	nrg->reservation_counter = rg->reservation_counter;
+	nrg->css = rg->css;
+	if (rg->css)
+		css_get(rg->css);
+#endif
+}
+
+/* Helper that records hugetlb_cgroup uncharge info. */
+static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
+						struct hstate *h,
+						struct resv_map *resv,
+						struct file_region *nrg)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+	if (h_cg) {
+		nrg->reservation_counter =
+			&h_cg->rsvd_hugepage[hstate_index(h)];
+		nrg->css = &h_cg->css;
+		/*
+		 * The caller will hold exactly one h_cg->css reference for the
+		 * whole contiguous reservation region. But this area might be
+		 * scattered when there are already some file_regions reside in
+		 * it. As a result, many file_regions may share only one css
+		 * reference. In order to ensure that one file_region must hold
+		 * exactly one h_cg->css reference, we should do css_get for
+		 * each file_region and leave the reference held by caller
+		 * untouched.
+		 */
+		css_get(&h_cg->css);
+		if (!resv->pages_per_hpage)
+			resv->pages_per_hpage = pages_per_huge_page(h);
+		/* pages_per_hpage should be the same for all entries in
+		 * a resv_map.
+		 */
+		VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
+	} else {
+		nrg->reservation_counter = NULL;
+		nrg->css = NULL;
+	}
+#endif
+}
+
+static void put_uncharge_info(struct file_region *rg)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+	if (rg->css)
+		css_put(rg->css);
+#endif
+}
+
+static bool has_same_uncharge_info(struct file_region *rg,
+				   struct file_region *org)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+	return rg && org &&
+	       rg->reservation_counter == org->reservation_counter &&
+	       rg->css == org->css;
+
+#else
+	return true;
+#endif
+}
+
+static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
+{
+	struct file_region *nrg = NULL, *prg = NULL;
+
+	prg = list_prev_entry(rg, link);
+	if (&prg->link != &resv->regions && prg->to == rg->from &&
+	    has_same_uncharge_info(prg, rg)) {
+		prg->to = rg->to;
+
+		list_del(&rg->link);
+		put_uncharge_info(rg);
+		kfree(rg);
+
+		rg = prg;
 	}
 
-	/* Round our left edge to the current segment if it encloses us. */
-	if (f > rg->from)
-		f = rg->from;
+	nrg = list_next_entry(rg, link);
+	if (&nrg->link != &resv->regions && nrg->from == rg->to &&
+	    has_same_uncharge_info(nrg, rg)) {
+		nrg->from = rg->from;
 
-	/* Check for and consume any regions we now overlap with. */
-	nrg = rg;
-	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
-		if (&rg->link == head)
-			break;
+		list_del(&rg->link);
+		put_uncharge_info(rg);
+		kfree(rg);
+	}
+}
+
+/*
+ * Must be called with resv->lock held.
+ *
+ * Calling this with regions_needed != NULL will count the number of pages
+ * to be added but will not modify the linked list. And regions_needed will
+ * indicate the number of file_regions needed in the cache to carry out to add
+ * the regions for this range.
+ */
+static long add_reservation_in_range(struct resv_map *resv, long f, long t,
+				     struct hugetlb_cgroup *h_cg,
+				     struct hstate *h, long *regions_needed)
+{
+	long add = 0;
+	struct list_head *head = &resv->regions;
+	long last_accounted_offset = f;
+	struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;
+
+	if (regions_needed)
+		*regions_needed = 0;
+
+	/* In this loop, we essentially handle an entry for the range
+	 * [last_accounted_offset, rg->from), at every iteration, with some
+	 * bounds checking.
+	 */
+	list_for_each_entry_safe(rg, trg, head, link) {
+		/* Skip irrelevant regions that start before our range. */
+		if (rg->from < f) {
+			/* If this region ends after the last accounted offset,
+			 * then we need to update last_accounted_offset.
+			 */
+			if (rg->to > last_accounted_offset)
+				last_accounted_offset = rg->to;
+			continue;
+		}
+
+		/* When we find a region that starts beyond our range, we've
+		 * finished.
+		 */
 		if (rg->from > t)
 			break;
 
-		/* If this area reaches higher then extend our area to
-		 * include it completely. If this is not the first area
-		 * which we intend to reuse, free it. */
-		if (rg->to > t)
-			t = rg->to;
-		if (rg != nrg) {
-			/* Decrement return value by the deleted range.
-			 * Another range will span this area so that by
-			 * end of routine add will be >= zero
-			 */
-			add -= (rg->to - rg->from);
-			list_del(&rg->link);
-			kfree(rg);
+		/* Add an entry for last_accounted_offset -> rg->from, and
+		 * update last_accounted_offset.
+		 */
+		if (rg->from > last_accounted_offset) {
+			add += rg->from - last_accounted_offset;
+			if (!regions_needed) {
+				nrg = get_file_region_entry_from_cache(
+					resv, last_accounted_offset, rg->from);
+				record_hugetlb_cgroup_uncharge_info(h_cg, h,
+								    resv, nrg);
+				list_add(&nrg->link, rg->link.prev);
+				coalesce_file_region(resv, nrg);
+			} else
+				*regions_needed += 1;
 		}
+
+		last_accounted_offset = rg->to;
 	}
 
-	add += (nrg->from - f); /* Added to beginning of region */
-	nrg->from = f;
-	add += t - nrg->to; /* Added to end of region */
-	nrg->to = t;
+	/* Handle the case where our range extends beyond
+	 * last_accounted_offset.
+	 */
+	if (last_accounted_offset < t) {
+		add += t - last_accounted_offset;
+		if (!regions_needed) {
+			nrg = get_file_region_entry_from_cache(
+				resv, last_accounted_offset, t);
+			record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg);
+			list_add(&nrg->link, rg->link.prev);
+			coalesce_file_region(resv, nrg);
+		} else
+			*regions_needed += 1;
	}
 
-out_locked:
-	resv->adds_in_progress--;
+	VM_BUG_ON(add < 0);
+	return add;
+}
+
+/* Must be called with resv->lock acquired. Will drop lock to allocate entries.
+ */
+static int allocate_file_region_entries(struct resv_map *resv,
+					int regions_needed)
+	__must_hold(&resv->lock)
+{
+	struct list_head allocated_regions;
+	int to_allocate = 0, i = 0;
+	struct file_region *trg = NULL, *rg = NULL;
+
+	VM_BUG_ON(regions_needed < 0);
+
+	INIT_LIST_HEAD(&allocated_regions);
+
+	/*
+	 * Check for sufficient descriptors in the cache to accommodate
+	 * the number of in progress add operations plus regions_needed.
+	 *
+	 * This is a while loop because when we drop the lock, some other call
+	 * to region_add or region_del may have consumed some region_entries,
+	 * so we keep looping here until we finally have enough entries for
+	 * (adds_in_progress + regions_needed).
+	 */
+	while (resv->region_cache_count <
+	       (resv->adds_in_progress + regions_needed)) {
+		to_allocate = resv->adds_in_progress + regions_needed -
+			      resv->region_cache_count;
+
+		/* At this point, we should have enough entries in the cache
+		 * for all the existings adds_in_progress. We should only be
+		 * needing to allocate for regions_needed.
+		 */
+		VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
+
+		spin_unlock(&resv->lock);
+		for (i = 0; i < to_allocate; i++) {
+			trg = kmalloc(sizeof(*trg), GFP_KERNEL);
+			if (!trg)
+				goto out_of_memory;
+			list_add(&trg->link, &allocated_regions);
+		}
+
+		spin_lock(&resv->lock);
+
+		list_splice(&allocated_regions, &resv->region_cache);
+		resv->region_cache_count += to_allocate;
+	}
+
+	return 0;
+
+out_of_memory:
+	list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
+		list_del(&rg->link);
+		kfree(rg);
+	}
+	return -ENOMEM;
+}
+
+/*
+ * Add the huge page range represented by [f, t) to the reserve
+ * map. Regions will be taken from the cache to fill in this range.
+ * Sufficient regions should exist in the cache due to the previous
+ * call to region_chg with the same range, but in some cases the cache will not
+ * have sufficient entries due to races with other code doing region_add or
+ * region_del. The extra needed entries will be allocated.
+ *
+ * regions_needed is the out value provided by a previous call to region_chg.
+ *
+ * Return the number of new huge pages added to the map. This number is greater
+ * than or equal to zero. If file_region entries needed to be allocated for
+ * this operation and we were not able to allocate, it returns -ENOMEM.
+ * region_add of regions of length 1 never allocate file_regions and cannot
+ * fail; region_chg will always allocate at least 1 entry and a region_add for
+ * 1 page will only require at most 1 entry.
+ */
+static long region_add(struct resv_map *resv, long f, long t,
+		       long in_regions_needed, struct hstate *h,
+		       struct hugetlb_cgroup *h_cg)
+{
+	long add = 0, actual_regions_needed = 0;
+
+	spin_lock(&resv->lock);
+retry:
+
+	/* Count how many regions are actually needed to execute this add. */
+	add_reservation_in_range(resv, f, t, NULL, NULL,
+				 &actual_regions_needed);
+
+	/*
+	 * Check for sufficient descriptors in the cache to accommodate
+	 * this add operation. Note that actual_regions_needed may be greater
+	 * than in_regions_needed, as the resv_map may have been modified since
+	 * the region_chg call. In this case, we need to make sure that we
+	 * allocate extra entries, such that we have enough for all the
+	 * existing adds_in_progress, plus the excess needed for this
+	 * operation.
+	 */
+	if (actual_regions_needed > in_regions_needed &&
+	    resv->region_cache_count <
+		    resv->adds_in_progress +
+			    (actual_regions_needed - in_regions_needed)) {
+		/* region_add operation of range 1 should never need to
+		 * allocate file_region entries.
+		 */
+		VM_BUG_ON(t - f <= 1);
+
+		if (allocate_file_region_entries(
+			    resv, actual_regions_needed - in_regions_needed)) {
+			return -ENOMEM;
+		}
+
+		goto retry;
+	}
+
+	add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);
+
+	resv->adds_in_progress -= in_regions_needed;
+
 	spin_unlock(&resv->lock);
 	VM_BUG_ON(add < 0);
 	return add;
@@ -352,111 +568,38 @@
 * call to region_add that will actually modify the reserve
 * map to add the specified range [f, t). region_chg does
 * not change the number of huge pages represented by the
- * map. However, if the existing regions in the map can not
- * be expanded to represent the new range, a new file_region
- * structure is added to the map as a placeholder. This is
- * so that the subsequent region_add call will have all the
- * regions it needs and will not fail.
+ * map. A number of new file_region structures is added to the cache as a
+ * placeholder, for the subsequent region_add call to use. At least 1
+ * file_region structure is added.
 *
- * Upon entry, region_chg will also examine the cache of region descriptors
- * associated with the map. If there are not enough descriptors cached, one
- * will be allocated for the in progress add operation.
+ * out_regions_needed is the number of regions added to the
+ * resv->adds_in_progress. This value needs to be provided to a follow up call
+ * to region_add or region_abort for proper accounting.
 *
 * Returns the number of huge pages that need to be added to the existing
 * reservation map for the range [f, t). This number is greater or equal to
 * zero. -ENOMEM is returned if a new file_region structure or cache entry
 * is needed and can not be allocated.
 */
-static long region_chg(struct resv_map *resv, long f, long t)
+static long region_chg(struct resv_map *resv, long f, long t,
+		       long *out_regions_needed)
 {
-	struct list_head *head = &resv->regions;
-	struct file_region *rg, *nrg = NULL;
 	long chg = 0;
 
-retry:
 	spin_lock(&resv->lock);
-retry_locked:
-	resv->adds_in_progress++;
 
-	/*
-	 * Check for sufficient descriptors in the cache to accommodate
-	 * the number of in progress add operations.
-	 */
-	if (resv->adds_in_progress > resv->region_cache_count) {
-		struct file_region *trg;
+	/* Count how many hugepages in this range are NOT represented. */
+	chg = add_reservation_in_range(resv, f, t, NULL, NULL,
+				       out_regions_needed);
 
-		VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
-		/* Must drop lock to allocate a new descriptor. */
-		resv->adds_in_progress--;
-		spin_unlock(&resv->lock);
+	if (*out_regions_needed == 0)
+		*out_regions_needed = 1;
 
-		trg = kmalloc(sizeof(*trg), GFP_KERNEL);
-		if (!trg) {
-			kfree(nrg);
-			return -ENOMEM;
-		}
+	if (allocate_file_region_entries(resv, *out_regions_needed))
+		return -ENOMEM;
 
-		spin_lock(&resv->lock);
-		list_add(&trg->link, &resv->region_cache);
-		resv->region_cache_count++;
-		goto retry_locked;
-	}
+	resv->adds_in_progress += *out_regions_needed;
 
-	/* Locate the region we are before or in. */
-	list_for_each_entry(rg, head, link)
-		if (f <= rg->to)
-			break;
-
-	/* If we are below the current region then a new region is required.
-	 * Subtle, allocate a new region at the position but make it zero
-	 * size such that we can guarantee to record the reservation. */
-	if (&rg->link == head || t < rg->from) {
-		if (!nrg) {
-			resv->adds_in_progress--;
-			spin_unlock(&resv->lock);
-			nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
-			if (!nrg)
-				return -ENOMEM;
-
-			nrg->from = f;
-			nrg->to = f;
-			INIT_LIST_HEAD(&nrg->link);
-			goto retry;
-		}
-
-		list_add(&nrg->link, rg->link.prev);
-		chg = t - f;
-		goto out_nrg;
-	}
-
-	/* Round our left edge to the current segment if it encloses us. */
-	if (f > rg->from)
-		f = rg->from;
-	chg = t - f;
-
-	/* Check for and consume any regions we now overlap with. */
-	list_for_each_entry(rg, rg->link.prev, link) {
-		if (&rg->link == head)
-			break;
-		if (rg->from > t)
-			goto out;
-
-		/* We overlap with this area, if it extends further than
-		 * us then we must extend ourselves. Account for its
-		 * existing reservation. */
-		if (rg->to > t) {
-			chg += rg->to - t;
-			t = rg->to;
-		}
-		chg -= rg->to - rg->from;
-	}
-
-out:
-	spin_unlock(&resv->lock);
-	/* We already know we raced and no longer need the new region */
-	kfree(nrg);
-	return chg;
-out_nrg:
 	spin_unlock(&resv->lock);
 	return chg;
 }
@@ -466,17 +609,20 @@
 * of the resv_map keeps track of the operations in progress between
 * calls to region_chg and region_add. Operations are sometimes
 * aborted after the call to region_chg. In such cases, region_abort
- * is called to decrement the adds_in_progress counter.
+ * is called to decrement the adds_in_progress counter. regions_needed
+ * is the value returned by the region_chg call, it is used to decrement
+ * the adds_in_progress counter.
 *
 * NOTE: The range arguments [f, t) are not needed or used in this
 * routine. They are kept to make reading the calling code easier as
 * arguments will match the associated region_chg call.
 */
-static void region_abort(struct resv_map *resv, long f, long t)
+static void region_abort(struct resv_map *resv, long f, long t,
+			 long regions_needed)
 {
 	spin_lock(&resv->lock);
 	VM_BUG_ON(!resv->region_cache_count);
-	resv->adds_in_progress--;
+	resv->adds_in_progress -= regions_needed;
 	spin_unlock(&resv->lock);
 }
 
@@ -540,10 +686,15 @@
 			}
 
 			del += t - f;
+			hugetlb_cgroup_uncharge_file_region(
+				resv, rg, t - f, false);
 
 			/* New entry for end of split region */
 			nrg->from = t;
 			nrg->to = rg->to;
+
+			copy_hugetlb_cgroup_uncharge_info(nrg, rg);
+
 			INIT_LIST_HEAD(&nrg->link);
 
 			/* Original entry is trimmed */
@@ -556,15 +707,23 @@
 
 		if (f <= rg->from && t >= rg->to) { /* Remove entire region */
 			del += rg->to - rg->from;
+			hugetlb_cgroup_uncharge_file_region(resv, rg,
+						rg->to - rg->from, true);
 			list_del(&rg->link);
 			kfree(rg);
 			continue;
 		}
 
 		if (f <= rg->from) {	/* Trim beginning of region */
+			hugetlb_cgroup_uncharge_file_region(resv, rg,
+						t - rg->from, false);
+
 			del += t - rg->from;
 			rg->from = t;
 		} else {		/* Trim end of region */
+			hugetlb_cgroup_uncharge_file_region(resv, rg,
+						rg->to - f, false);
+
 			del += rg->to - f;
 			rg->to = f;
 		}
@@ -715,6 +874,25 @@
 	vma->vm_private_data = (void *)value;
 }
 
+static void
+resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
+					  struct hugetlb_cgroup *h_cg,
+					  struct hstate *h)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+	if (!h_cg || !h) {
+		resv_map->reservation_counter = NULL;
+		resv_map->pages_per_hpage = 0;
+		resv_map->css = NULL;
+	} else {
+		resv_map->reservation_counter =
+			&h_cg->rsvd_hugepage[hstate_index(h)];
+		resv_map->pages_per_hpage = pages_per_huge_page(h);
+		resv_map->css = &h_cg->css;
+	}
+#endif
+}
+
 struct resv_map *resv_map_alloc(void)
 {
 	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
@@ -731,6 +909,13 @@
 	INIT_LIST_HEAD(&resv_map->regions);
 
 	resv_map->adds_in_progress = 0;
+	/*
+	 * Initialize these to 0. On shared mappings, 0's here indicate these
+	 * fields don't do cgroup accounting. On private mappings, these will be
+	 * re-initialized to the proper values, to indicate that hugetlb cgroup
+	 * reservations are to be un-charged from here.
+	 */
+	resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);
 
 	INIT_LIST_HEAD(&resv_map->region_cache);
 	list_add(&rg->link, &resv_map->region_cache);
@@ -761,7 +946,15 @@
 
 static inline struct resv_map *inode_resv_map(struct inode *inode)
 {
-	return inode->i_mapping->private_data;
+	/*
+	 * At inode evict time, i_mapping may not point to the original
+	 * address space within the inode. This original address space
+	 * contains the pointer to the resv_map. So, always use the
+	 * address space embedded within the inode.
+	 * The VERY common case is inode->mapping == &inode->i_data but,
+	 * this may not be true for device special inodes.
+	 */
+	return (struct resv_map *)(&inode->i_data)->private_data;
 }
 
 static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
@@ -836,7 +1029,7 @@
 	 * We know VM_NORESERVE is not set. Therefore, there SHOULD
 	 * be a region map for all pages. The only situation where
 	 * there is no region map is if a hole was punched via
-	 * fallocate. In this case, there really are no reverves to
+	 * fallocate. In this case, there really are no reserves to
 	 * use. This situation is indicated if chg != 0.
 	 */
 	if (chg)
@@ -886,22 +1079,24 @@
 static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
 {
 	struct page *page;
+	bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA);
 
-	list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
-		if (!PageHWPoison(page))
-			break;
-	/*
-	 * if 'non-isolated free hugepage' not found on the list,
-	 * the allocation fails.
-	 */
-	if (&h->hugepage_freelists[nid] == &page->lru)
-		return NULL;
-	list_move(&page->lru, &h->hugepage_activelist);
-	set_page_refcounted(page);
-	ClearPageHugeFreed(page);
-	h->free_huge_pages--;
-	h->free_huge_pages_node[nid]--;
-	return page;
+	list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
+		if (nocma && is_migrate_cma_page(page))
+			continue;
+
+		if (PageHWPoison(page))
+			continue;
+
+		list_move(&page->lru, &h->hugepage_activelist);
+		set_page_refcounted(page);
+		ClearPageHugeFreed(page);
+		h->free_huge_pages--;
+		h->free_huge_pages_node[nid]--;
+		return page;
+	}
+
+	return NULL;
 }
 
 static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
@@ -911,7 +1106,7 @@
 	struct zonelist *zonelist;
 	struct zone *zone;
 	struct zoneref *z;
-	int node = -1;
+	int node = NUMA_NO_NODE;
 
 	zonelist = node_zonelist(nid, gfp_mask);
 
@@ -938,15 +1133,6 @@
 		goto retry_cpuset;
 
 	return NULL;
-}
-
-/* Movability of hugepages depends on migration support. */
-static inline gfp_t htlb_alloc_mask(struct hstate *h)
-{
-	if (hugepage_migration_supported(h))
-		return GFP_HIGHUSER_MOVABLE;
-	else
-		return GFP_HIGHUSER;
 }
 
 static struct page *dequeue_huge_page_vma(struct hstate *h,
@@ -1068,108 +1254,85 @@
 	struct page *p = page + 1;
 
 	atomic_set(compound_mapcount_ptr(page), 0);
+	atomic_set(compound_pincount_ptr(page), 0);
+
 	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
 		clear_compound_head(p);
 		set_page_refcounted(p);
 	}
 
 	set_compound_order(page, 0);
+	page[1].compound_nr = 0;
 	__ClearPageHead(page);
 }
 
 static void free_gigantic_page(struct page *page, unsigned int order)
 {
+	/*
+	 * If the page isn't allocated using the cma allocator,
+	 * cma_release() returns false.
+	 */
+#ifdef CONFIG_CMA
+	if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
+		return;
+#endif
+
 	free_contig_range(page_to_pfn(page), 1 << order);
 }
 
-static int __alloc_gigantic_page(unsigned long start_pfn,
-				unsigned long nr_pages, gfp_t gfp_mask)
-{
-	unsigned long end_pfn = start_pfn + nr_pages;
-	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
-				  gfp_mask);
-}
-
-static bool pfn_range_valid_gigantic(struct zone *z,
-			unsigned long start_pfn, unsigned long nr_pages)
-{
-	unsigned long i, end_pfn = start_pfn + nr_pages;
-	struct page *page;
-
-	for (i = start_pfn; i < end_pfn; i++) {
-		page = pfn_to_online_page(i);
-		if (!page)
-			return false;
-
-		if (page_zone(page) != z)
-			return false;
-
-		if (PageReserved(page))
-			return false;
-
-		if (page_count(page) > 0)
-			return false;
-
-		if (PageHuge(page))
-			return false;
-	}
-
-	return true;
-}
-
-static bool zone_spans_last_pfn(const struct zone *zone,
-			unsigned long start_pfn, unsigned long nr_pages)
-{
-	unsigned long last_pfn = start_pfn + nr_pages - 1;
-	return zone_spans_pfn(zone, last_pfn);
-}
-
+#ifdef CONFIG_CONTIG_ALLOC
 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
		int nid, nodemask_t *nodemask)
 {
-	unsigned int order = huge_page_order(h);
-	unsigned long nr_pages = 1 << order;
-	unsigned long ret, pfn, flags;
-	struct zonelist *zonelist;
-	struct zone *zone;
-	struct zoneref *z;
+	unsigned long nr_pages = 1UL << huge_page_order(h);
+	if (nid == NUMA_NO_NODE)
+		nid = numa_mem_id();
 
-	zonelist = node_zonelist(nid, gfp_mask);
-	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) {
-		spin_lock_irqsave(&zone->lock, flags);
+#ifdef CONFIG_CMA
+	{
+		struct page *page;
+		int node;
 
-		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
-		while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
-			if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) {
-				/*
-				 * We release the zone lock here because
-				 * alloc_contig_range() will also lock the zone
-				 * at some point. If there's an allocation
-				 * spinning on this lock, it may win the race
-				 * and cause alloc_contig_range() to fail...
-				 */
-				spin_unlock_irqrestore(&zone->lock, flags);
-				ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask);
-				if (!ret)
-					return pfn_to_page(pfn);
-				spin_lock_irqsave(&zone->lock, flags);
-			}
-			pfn += nr_pages;
+		if (hugetlb_cma[nid]) {
+			page = cma_alloc(hugetlb_cma[nid], nr_pages,
+					 huge_page_order(h),
+					 GFP_KERNEL | __GFP_NOWARN);
+			if (page)
+				return page;
 		}
 
-		spin_unlock_irqrestore(&zone->lock, flags);
-	}
+		if (!(gfp_mask & __GFP_THISNODE)) {
+			for_each_node_mask(node, *nodemask) {
+				if (node == nid || !hugetlb_cma[node])
+					continue;
 
-	return NULL;
+				page = cma_alloc(hugetlb_cma[node], nr_pages,
+						 huge_page_order(h),
+						 GFP_KERNEL | __GFP_NOWARN);
+				if (page)
+					return page;
+			}
+		}
+	}
+#endif
+
+	return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
 }
 
-static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
-static void prep_compound_gigantic_page(struct page *page, unsigned int order);
+#else /* !CONFIG_CONTIG_ALLOC */
+static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+					int nid, nodemask_t *nodemask)
+{
+	return NULL;
+}
+#endif /* CONFIG_CONTIG_ALLOC */
 
 #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
-static inline bool gigantic_page_supported(void) { return false; }
 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
-		int nid, nodemask_t *nodemask) { return NULL; }
+					int nid, nodemask_t *nodemask)
+{
+	return NULL;
+}
 static inline void free_gigantic_page(struct page *page, unsigned int order) { }
 static inline void destroy_compound_gigantic_page(struct page *page,
						unsigned int order) { }
@@ -1180,7 +1343,7 @@
 	int i;
 	struct page *subpage = page;
 
-	if (hstate_is_gigantic(h) && !gigantic_page_supported())
+	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
 		return;
 
 	h->nr_huge_pages--;
@@ -1193,11 +1356,18 @@
				1 << PG_writeback);
 	}
 	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
+	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
 	set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
 	set_page_refcounted(page);
 	if (hstate_is_gigantic(h)) {
+		/*
+		 * Temporarily drop the hugetlb_lock, because
+		 * we might block in free_gigantic_page().
+		 */
+		spin_unlock(&hugetlb_lock);
 		destroy_compound_gigantic_page(page, huge_page_order(h));
 		free_gigantic_page(page, huge_page_order(h));
+		spin_lock(&hugetlb_lock);
 	} else {
 		__free_pages(page, huge_page_order(h));
 	}
@@ -1260,7 +1430,7 @@
 	page[2].mapping = NULL;
 }
 
-void free_huge_page(struct page *page)
+static void __free_huge_page(struct page *page)
 {
 	/*
 	 * Can't pass hstate in here because it is called from the
@@ -1272,10 +1442,11 @@
		(struct hugepage_subpool *)page_private(page);
 	bool restore_reserve;
 
-	set_page_private(page, 0);
-	page->mapping = NULL;
 	VM_BUG_ON_PAGE(page_count(page), page);
 	VM_BUG_ON_PAGE(page_mapcount(page), page);
+
+	set_page_private(page, 0);
+	page->mapping = NULL;
 	restore_reserve = PagePrivate(page);
 	ClearPagePrivate(page);
 
@@ -1302,6 +1473,8 @@
 		clear_page_huge_active(page);
 		hugetlb_cgroup_uncharge_page(hstate_index(h),
					     pages_per_huge_page(h), page);
+		hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
						  pages_per_huge_page(h), page);
 		if (restore_reserve)
 			h->resv_huge_pages++;
 
@@ -1322,12 +1495,61 @@
 	spin_unlock(&hugetlb_lock);
 }
 
+/*
+ * As free_huge_page() can be called from a non-task context, we have
+ * to defer the actual freeing in a workqueue to prevent potential
+ * hugetlb_lock deadlock.
+ *
+ * free_hpage_workfn() locklessly retrieves the linked list of pages to
+ * be freed and frees them one-by-one. As the page->mapping pointer is
+ * going to be cleared in __free_huge_page() anyway, it is reused as the
+ * llist_node structure of a lockless linked list of huge pages to be freed.
+ */
+static LLIST_HEAD(hpage_freelist);
+
+static void free_hpage_workfn(struct work_struct *work)
+{
+	struct llist_node *node;
+	struct page *page;
+
+	node = llist_del_all(&hpage_freelist);
+
+	while (node) {
+		page = container_of((struct address_space **)node,
+				    struct page, mapping);
+		node = node->next;
+		__free_huge_page(page);
+	}
+}
+static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
+
+void free_huge_page(struct page *page)
+{
+	/*
+	 * Defer freeing if in non-task context to avoid hugetlb_lock deadlock.
+	 */
+	if (!in_task()) {
+		/*
+		 * Only call schedule_work() if hpage_freelist is previously
+		 * empty. Otherwise, schedule_work() had been called but the
+		 * workfn hasn't retrieved the list yet.
+		 */
+		if (llist_add((struct llist_node *)&page->mapping,
+			      &hpage_freelist))
+			schedule_work(&free_hpage_work);
+		return;
+	}
+
+	__free_huge_page(page);
+}
+
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
 	INIT_LIST_HEAD(&page->lru);
 	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
-	spin_lock(&hugetlb_lock);
 	set_hugetlb_cgroup(page, NULL);
+	set_hugetlb_cgroup_rsvd(page, NULL);
+	spin_lock(&hugetlb_lock);
 	h->nr_huge_pages++;
 	h->nr_huge_pages_node[nid]++;
 	ClearPageHugeFreed(page);
@@ -1349,7 +1571,7 @@
 	 * For gigantic hugepages allocated through bootmem at
 	 * boot, it's safer to be consistent with the not-gigantic
 	 * hugepages and clear the PG_reserved bit from all tail pages
-	 * too. Otherwse drivers using get_user_pages() to access tail
+	 * too. Otherwise drivers using get_user_pages() to access tail
 	 * pages may get the reference counting wrong if they see
 	 * PG_reserved set on a tail page (despite the head page not
 	 * having PG_reserved set). Enforcing this consistency between
@@ -1362,6 +1584,7 @@
 		set_compound_head(p, page);
 	}
 	atomic_set(compound_mapcount_ptr(page), -1);
+	atomic_set(compound_pincount_ptr(page), 0);
 }
 
 /*
@@ -1388,7 +1611,27 @@
 	if (!PageHead(page_head))
 		return 0;
 
-	return get_compound_page_dtor(page_head) == free_huge_page;
+	return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR;
+}
+
+/*
+ * Find and lock address space (mapping) in write mode.
+ *
+ * Upon entry, the page is locked which means that page_mapping() is
+ * stable. Due to locking order, we can only trylock_write. If we can
+ * not get the lock, simply return NULL to caller.
+ */
+struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
+{
+	struct address_space *mapping = page_mapping(hpage);
+
+	if (!mapping)
+		return mapping;
+
+	if (i_mmap_trylock_write(mapping))
+		return mapping;
+
+	return NULL;
 }
 
 pgoff_t hugetlb_basepage_index(struct page *page)
@@ -1406,12 +1649,25 @@
 }
 
 static struct page *alloc_buddy_huge_page(struct hstate *h,
-		gfp_t gfp_mask, int nid, nodemask_t *nmask)
+		gfp_t gfp_mask, int nid, nodemask_t *nmask,
+		nodemask_t *node_alloc_noretry)
 {
 	int order = huge_page_order(h);
 	struct page *page;
+	bool alloc_try_hard = true;
 
-	gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
+	/*
+	 * By default we always try hard to allocate the page with
+	 * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in
+	 * a loop (to adjust global huge page counts) and previous allocation
+	 * failed, do not continue to try hard on the same node. Use the
+	 * node_alloc_noretry bitmap to manage this state information.
+	 */
+	if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
+		alloc_try_hard = false;
+	gfp_mask |= __GFP_COMP|__GFP_NOWARN;
+	if (alloc_try_hard)
+		gfp_mask |= __GFP_RETRY_MAYFAIL;
 	if (nid == NUMA_NO_NODE)
 		nid = numa_mem_id();
 	page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
@@ -1419,6 +1675,22 @@
 		__count_vm_event(HTLB_BUDDY_PGALLOC);
 	else
 		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+
+	/*
+	 * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this
+	 * indicates an overall state change. Clear bit so that we resume
+	 * normal 'try hard' allocations.
+	 */
+	if (node_alloc_noretry && page && !alloc_try_hard)
+		node_clear(nid, *node_alloc_noretry);
+
+	/*
+	 * If we tried hard to get a page but failed, set bit so that
+	 * subsequent attempts will not try as hard until there is an
+	 * overall state change.
+	 */
+	if (node_alloc_noretry && !page && alloc_try_hard)
+		node_set(nid, *node_alloc_noretry);
 
 	return page;
 }
@@ -1428,7 +1700,8 @@
 * should use this function to get new hugetlb pages
 */
 static struct page *alloc_fresh_huge_page(struct hstate *h,
-		gfp_t gfp_mask, int nid, nodemask_t *nmask)
+		gfp_t gfp_mask, int nid, nodemask_t *nmask,
+		nodemask_t *node_alloc_noretry)
 {
 	struct page *page;
 
@@ -1436,7 +1709,7 @@
 		page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
 	else
 		page = alloc_buddy_huge_page(h, gfp_mask,
-				nid, nmask);
+				nid, nmask, node_alloc_noretry);
 	if (!page)
 		return NULL;
 
@@ -1451,14 +1724,16 @@
 * Allocates a fresh page to the hugetlb allocator pool in the node interleaved
 * manner.
 */
-static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
+				nodemask_t *node_alloc_noretry)
 {
 	struct page *page;
 	int nr_nodes, node;
 	gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
 
 	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
-		page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed);
+		page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
+					     node_alloc_noretry);
 		if (page)
 			break;
 	}
16231898 goto out_unlock;
16241899 spin_unlock(&hugetlb_lock);
16251900
1626
- page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
1901
+ page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
16271902 if (!page)
16281903 return NULL;
16291904
....@@ -1652,14 +1927,14 @@
16521927 }
16531928
16541929 static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
1655
- int nid, nodemask_t *nmask)
1930
+ int nid, nodemask_t *nmask)
16561931 {
16571932 struct page *page;
16581933
16591934 if (hstate_is_gigantic(h))
16601935 return NULL;
16611936
1662
- page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
1937
+ page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
16631938 if (!page)
16641939 return NULL;
16651940
....@@ -1693,31 +1968,9 @@
16931968 }
16941969
16951970 /* page migration callback function */
1696
-struct page *alloc_huge_page_node(struct hstate *h, int nid)
1697
-{
1698
- gfp_t gfp_mask = htlb_alloc_mask(h);
1699
- struct page *page = NULL;
1700
-
1701
- if (nid != NUMA_NO_NODE)
1702
- gfp_mask |= __GFP_THISNODE;
1703
-
1704
- spin_lock(&hugetlb_lock);
1705
- if (h->free_huge_pages - h->resv_huge_pages > 0)
1706
- page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL);
1707
- spin_unlock(&hugetlb_lock);
1708
-
1709
- if (!page)
1710
- page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
1711
-
1712
- return page;
1713
-}
1714
-
1715
-/* page migration callback function */
17161971 struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
1717
- nodemask_t *nmask)
1972
+ nodemask_t *nmask, gfp_t gfp_mask)
17181973 {
1719
- gfp_t gfp_mask = htlb_alloc_mask(h);
1720
-
17211974 spin_lock(&hugetlb_lock);
17221975 if (h->free_huge_pages - h->resv_huge_pages > 0) {
17231976 struct page *page;
....@@ -1745,7 +1998,7 @@
17451998
17461999 gfp_mask = htlb_alloc_mask(h);
17472000 node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
1748
- page = alloc_huge_page_nodemask(h, node, nodemask);
2001
+ page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask);
17492002 mpol_cond_put(mpol);
17502003
17512004 return page;
....@@ -1756,6 +2009,7 @@
17562009 * of size 'delta'.
17572010 */
17582011 static int gather_surplus_pages(struct hstate *h, int delta)
2012
+ __must_hold(&hugetlb_lock)
17592013 {
17602014 struct list_head surplus_list;
17612015 struct page *page, *tmp;
....@@ -1873,7 +2127,7 @@
18732127 * evenly across all nodes with memory. Iterate across these nodes
18742128 * until we can no longer free unreserved surplus pages. This occurs
18752129 * when the nodes with surplus pages have no free pages.
1876
- * free_pool_huge_page() will balance the the freed pages across the
2130
+ * free_pool_huge_page() will balance the freed pages across the
18772131 * on-line nodes with memory and will handle the hstate accounting.
18782132 *
18792133 * Note that we decrement resv_huge_pages as we free the pages. If
....@@ -1931,6 +2185,7 @@
19312185 struct resv_map *resv;
19322186 pgoff_t idx;
19332187 long ret;
2188
+ long dummy_out_regions_needed;
19342189
19352190 resv = vma_resv_map(vma);
19362191 if (!resv)
....@@ -1939,20 +2194,29 @@
19392194 idx = vma_hugecache_offset(h, vma, addr);
19402195 switch (mode) {
19412196 case VMA_NEEDS_RESV:
1942
- ret = region_chg(resv, idx, idx + 1);
2197
+ ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed);
2198
+ /* We assume that vma_reservation_* routines always operate on
2199
+ * 1 page, and that adding to resv map a 1 page entry can only
2200
+ * ever require 1 region.
2201
+ */
2202
+ VM_BUG_ON(dummy_out_regions_needed != 1);
19432203 break;
19442204 case VMA_COMMIT_RESV:
1945
- ret = region_add(resv, idx, idx + 1);
2205
+ ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2206
+ /* region_add calls of range 1 should never fail. */
2207
+ VM_BUG_ON(ret < 0);
19462208 break;
19472209 case VMA_END_RESV:
1948
- region_abort(resv, idx, idx + 1);
2210
+ region_abort(resv, idx, idx + 1, 1);
19492211 ret = 0;
19502212 break;
19512213 case VMA_ADD_RESV:
1952
- if (vma->vm_flags & VM_MAYSHARE)
1953
- ret = region_add(resv, idx, idx + 1);
1954
- else {
1955
- region_abort(resv, idx, idx + 1);
2214
+ if (vma->vm_flags & VM_MAYSHARE) {
2215
+ ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2216
+ /* region_add calls of range 1 should never fail. */
2217
+ VM_BUG_ON(ret < 0);
2218
+ } else {
2219
+ region_abort(resv, idx, idx + 1, 1);
19562220 ret = region_del(resv, idx, idx + 1);
19572221 }
19582222 break;
....@@ -2063,6 +2327,7 @@
20632327 long gbl_chg;
20642328 int ret, idx;
20652329 struct hugetlb_cgroup *h_cg;
2330
+ bool deferred_reserve;
20662331
20672332 idx = hstate_index(h);
20682333 /*
....@@ -2100,9 +2365,19 @@
21002365 gbl_chg = 1;
21012366 }
21022367
2368
+ /* If this allocation is not consuming a reservation, charge it now.
2369
+ */
2370
+ deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma);
2371
+ if (deferred_reserve) {
2372
+ ret = hugetlb_cgroup_charge_cgroup_rsvd(
2373
+ idx, pages_per_huge_page(h), &h_cg);
2374
+ if (ret)
2375
+ goto out_subpool_put;
2376
+ }
2377
+
21032378 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
21042379 if (ret)
2105
- goto out_subpool_put;
2380
+ goto out_uncharge_cgroup_reservation;
21062381
21072382 spin_lock(&hugetlb_lock);
21082383 /*
....@@ -2116,15 +2391,23 @@
21162391 page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
21172392 if (!page)
21182393 goto out_uncharge_cgroup;
2394
+ spin_lock(&hugetlb_lock);
21192395 if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
21202396 SetPagePrivate(page);
21212397 h->resv_huge_pages--;
21222398 }
2123
- spin_lock(&hugetlb_lock);
2124
- list_move(&page->lru, &h->hugepage_activelist);
2399
+ list_add(&page->lru, &h->hugepage_activelist);
21252400 /* Fall through */
21262401 }
21272402 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
2403
+ /* If allocation is not consuming a reservation, also store the
2404
+ * hugetlb_cgroup pointer on the page.
2405
+ */
2406
+ if (deferred_reserve) {
2407
+ hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
2408
+ h_cg, page);
2409
+ }
2410
+
21282411 spin_unlock(&hugetlb_lock);
21292412
21302413 set_page_private(page, (unsigned long)spool);
....@@ -2144,11 +2427,18 @@
21442427
21452428 rsv_adjust = hugepage_subpool_put_pages(spool, 1);
21462429 hugetlb_acct_memory(h, -rsv_adjust);
2430
+ if (deferred_reserve)
2431
+ hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
2432
+ pages_per_huge_page(h), page);
21472433 }
21482434 return page;
21492435
21502436 out_uncharge_cgroup:
21512437 hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
2438
+out_uncharge_cgroup_reservation:
2439
+ if (deferred_reserve)
2440
+ hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
2441
+ h_cg);
21522442 out_subpool_put:
21532443 if (map_chg || avoid_reserve)
21542444 hugepage_subpool_put_pages(spool, 1);
....@@ -2166,9 +2456,9 @@
21662456 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
21672457 void *addr;
21682458
2169
- addr = memblock_virt_alloc_try_nid_raw(
2459
+ addr = memblock_alloc_try_nid_raw(
21702460 huge_page_size(h), huge_page_size(h),
2171
- 0, BOOTMEM_ALLOC_ACCESSIBLE, node);
2461
+ 0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
21722462 if (addr) {
21732463 /*
21742464 * Use the beginning of the huge page to store the
....@@ -2190,16 +2480,10 @@
21902480 return 1;
21912481 }
21922482
2193
-static void __init prep_compound_huge_page(struct page *page,
2194
- unsigned int order)
2195
-{
2196
- if (unlikely(order > (MAX_ORDER - 1)))
2197
- prep_compound_gigantic_page(page, order);
2198
- else
2199
- prep_compound_page(page, order);
2200
-}
2201
-
2202
-/* Put bootmem huge pages into the standard lists after mem_map is up */
2483
+/*
2484
+ * Put bootmem huge pages into the standard lists after mem_map is up.
2485
+ * Note: This only applies to gigantic (order > MAX_ORDER) pages.
2486
+ */
22032487 static void __init gather_bootmem_prealloc(void)
22042488 {
22052489 struct huge_bootmem_page *m;
....@@ -2208,20 +2492,19 @@
22082492 struct page *page = virt_to_page(m);
22092493 struct hstate *h = m->hstate;
22102494
2495
+ VM_BUG_ON(!hstate_is_gigantic(h));
22112496 WARN_ON(page_count(page) != 1);
2212
- prep_compound_huge_page(page, h->order);
2497
+ prep_compound_gigantic_page(page, huge_page_order(h));
22132498 WARN_ON(PageReserved(page));
22142499 prep_new_huge_page(h, page, page_to_nid(page));
22152500 put_page(page); /* free it into the hugepage allocator */
22162501
22172502 /*
2218
- * If we had gigantic hugepages allocated at boot time, we need
2219
- * to restore the 'stolen' pages to totalram_pages in order to
2220
- * fix confusing memory reports from free(1) and another
2221
- * side-effects, like CommitLimit going negative.
2503
+ * We need to restore the 'stolen' pages to totalram_pages
2504
+ * in order to fix confusing memory reports from free(1) and
2505
+ * other side-effects, like CommitLimit going negative.
22222506 */
2223
- if (hstate_is_gigantic(h))
2224
- adjust_managed_page_count(page, 1 << h->order);
2507
+ adjust_managed_page_count(page, pages_per_huge_page(h));
22252508 cond_resched();
22262509 }
22272510 }
....@@ -2229,13 +2512,37 @@
22292512 static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
22302513 {
22312514 unsigned long i;
2515
+ nodemask_t *node_alloc_noretry;
2516
+
2517
+ if (!hstate_is_gigantic(h)) {
2518
+ /*
2519
+ * Bit mask controlling how hard we retry per-node allocations.
2520
+ * Ignore errors as lower level routines can deal with
2521
+ * node_alloc_noretry == NULL. If this kmalloc fails at boot
2522
+ * time, we are likely in bigger trouble.
2523
+ */
2524
+ node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
2525
+ GFP_KERNEL);
2526
+ } else {
2527
+ /* allocations done at boot time */
2528
+ node_alloc_noretry = NULL;
2529
+ }
2530
+
2531
+ /* bit mask controlling how hard we retry per-node allocations */
2532
+ if (node_alloc_noretry)
2533
+ nodes_clear(*node_alloc_noretry);
22322534
22332535 for (i = 0; i < h->max_huge_pages; ++i) {
22342536 if (hstate_is_gigantic(h)) {
2537
+ if (hugetlb_cma_size) {
2538
+ pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
2539
+ goto free;
2540
+ }
22352541 if (!alloc_bootmem_huge_page(h))
22362542 break;
22372543 } else if (!alloc_pool_huge_page(h,
2238
- &node_states[N_MEMORY]))
2544
+ &node_states[N_MEMORY],
2545
+ node_alloc_noretry))
22392546 break;
22402547 cond_resched();
22412548 }
....@@ -2247,6 +2554,8 @@
22472554 h->max_huge_pages, buf, i);
22482555 h->max_huge_pages = i;
22492556 }
2557
+free:
2558
+ kfree(node_alloc_noretry);
22502559 }
22512560
22522561 static void __init hugetlb_init_hstates(void)
....@@ -2341,13 +2650,59 @@
23412650 }
23422651
23432652 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
2344
-static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
2345
- nodemask_t *nodes_allowed)
2653
+static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
2654
+ nodemask_t *nodes_allowed)
23462655 {
23472656 unsigned long min_count, ret;
2657
+ NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
23482658
2349
- if (hstate_is_gigantic(h) && !gigantic_page_supported())
2350
- return h->max_huge_pages;
2659
+ /*
2660
+ * Bit mask controlling how hard we retry per-node allocations.
2661
+ * If we can not allocate the bit mask, do not attempt to allocate
2662
+ * the requested huge pages.
2663
+ */
2664
+ if (node_alloc_noretry)
2665
+ nodes_clear(*node_alloc_noretry);
2666
+ else
2667
+ return -ENOMEM;
2668
+
2669
+ spin_lock(&hugetlb_lock);
2670
+
2671
+ /*
2672
+ * Check for a node specific request.
2673
+ * Changing node specific huge page count may require a corresponding
2674
+ * change to the global count. In any case, the passed node mask
2675
+ * (nodes_allowed) will restrict alloc/free to the specified node.
2676
+ */
2677
+ if (nid != NUMA_NO_NODE) {
2678
+ unsigned long old_count = count;
2679
+
2680
+ count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
2681
+ /*
2682
+ * User may have specified a large count value which caused the
2683
+ * above calculation to overflow. In this case, they wanted
2684
+ * to allocate as many huge pages as possible. Set count to
2685
+ * largest possible value to align with their intention.
2686
+ */
2687
+ if (count < old_count)
2688
+ count = ULONG_MAX;
2689
+ }
2690
+
2691
+ /*
2692
+ * Gigantic pages runtime allocation depend on the capability for large
2693
+ * page range allocation.
2694
+ * If the system does not provide this feature, return an error when
2695
+ * the user tries to allocate gigantic pages but let the user free the
2696
+ * boottime allocated gigantic pages.
2697
+ */
2698
+ if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
2699
+ if (count > persistent_huge_pages(h)) {
2700
+ spin_unlock(&hugetlb_lock);
2701
+ NODEMASK_FREE(node_alloc_noretry);
2702
+ return -EINVAL;
2703
+ }
2704
+ /* Fall through to decrease pool */
2705
+ }
23512706
23522707 /*
23532708 * Increase the pool size
....@@ -2360,7 +2715,6 @@
23602715 * pool might be one hugepage larger than it needs to be, but
23612716 * within all the constraints specified by the sysctls.
23622717 */
2363
- spin_lock(&hugetlb_lock);
23642718 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
23652719 if (!adjust_pool_surplus(h, nodes_allowed, -1))
23662720 break;
....@@ -2377,7 +2731,8 @@
23772731 /* yield cpu to avoid soft lockup */
23782732 cond_resched();
23792733
2380
- ret = alloc_pool_huge_page(h, nodes_allowed);
2734
+ ret = alloc_pool_huge_page(h, nodes_allowed,
2735
+ node_alloc_noretry);
23812736 spin_lock(&hugetlb_lock);
23822737 if (!ret)
23832738 goto out;
....@@ -2415,9 +2770,12 @@
24152770 break;
24162771 }
24172772 out:
2418
- ret = persistent_huge_pages(h);
2773
+ h->max_huge_pages = persistent_huge_pages(h);
24192774 spin_unlock(&hugetlb_lock);
2420
- return ret;
2775
+
2776
+ NODEMASK_FREE(node_alloc_noretry);
2777
+
2778
+ return 0;
24212779 }
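
A minimal userspace sketch of the per-node count adjustment performed in set_max_huge_pages() above; the pool sizes and the node id are assumed example values chosen only to illustrate the arithmetic.

#include <stdio.h>
#include <limits.h>

int main(void)
{
	unsigned long nr_huge_pages = 30;	/* assumed global pool size        */
	unsigned long nr_on_node1   = 10;	/* assumed pages already on node 1 */
	unsigned long count         = 20;	/* user asks for 20 on node 1      */
	unsigned long old_count     = count;

	/* translate the node-local request into a global target */
	count += nr_huge_pages - nr_on_node1;	/* 20 + 30 - 10 = 40 */

	/* a huge request may overflow; treat that as "as many as possible" */
	if (count < old_count)
		count = ULONG_MAX;

	printf("global max_huge_pages target: %lu\n", count);
	return 0;
}
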
24222780
24232781 #define HSTATE_ATTR_RO(_name) \
....@@ -2467,41 +2825,32 @@
24672825 unsigned long count, size_t len)
24682826 {
24692827 int err;
2470
- NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
2828
+ nodemask_t nodes_allowed, *n_mask;
24712829
2472
- if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
2473
- err = -EINVAL;
2474
- goto out;
2475
- }
2830
+ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
2831
+ return -EINVAL;
24762832
24772833 if (nid == NUMA_NO_NODE) {
24782834 /*
24792835 * global hstate attribute
24802836 */
24812837 if (!(obey_mempolicy &&
2482
- init_nodemask_of_mempolicy(nodes_allowed))) {
2483
- NODEMASK_FREE(nodes_allowed);
2484
- nodes_allowed = &node_states[N_MEMORY];
2485
- }
2486
- } else if (nodes_allowed) {
2838
+ init_nodemask_of_mempolicy(&nodes_allowed)))
2839
+ n_mask = &node_states[N_MEMORY];
2840
+ else
2841
+ n_mask = &nodes_allowed;
2842
+ } else {
24872843 /*
2488
- * per node hstate attribute: adjust count to global,
2489
- * but restrict alloc/free to the specified node.
2844
+ * Node specific request. count adjustment happens in
2845
+ * set_max_huge_pages() after acquiring hugetlb_lock.
24902846 */
2491
- count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
2492
- init_nodemask_of_node(nodes_allowed, nid);
2493
- } else
2494
- nodes_allowed = &node_states[N_MEMORY];
2847
+ init_nodemask_of_node(&nodes_allowed, nid);
2848
+ n_mask = &nodes_allowed;
2849
+ }
24952850
2496
- h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
2851
+ err = set_max_huge_pages(h, count, nid, n_mask);
24972852
2498
- if (nodes_allowed != &node_states[N_MEMORY])
2499
- NODEMASK_FREE(nodes_allowed);
2500
-
2501
- return len;
2502
-out:
2503
- NODEMASK_FREE(nodes_allowed);
2504
- return err;
2853
+ return err ? err : len;
25052854 }
25062855
25072856 static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
....@@ -2675,7 +3024,7 @@
26753024 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
26763025 hstate_kobjs, &hstate_attr_group);
26773026 if (err)
2678
- pr_err("Hugetlb: Unable to add hstate %s", h->name);
3027
+ pr_err("HugeTLB: Unable to add hstate %s", h->name);
26793028 }
26803029 }
26813030
....@@ -2779,7 +3128,7 @@
27793128 nhs->hstate_kobjs,
27803129 &per_node_hstate_attr_group);
27813130 if (err) {
2782
- pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
3131
+ pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
27833132 h->name, node->dev.id);
27843133 hugetlb_unregister_node(node);
27853134 break;
....@@ -2827,25 +3176,44 @@
28273176 {
28283177 int i;
28293178
2830
- if (!hugepages_supported())
3179
+ if (!hugepages_supported()) {
3180
+ if (hugetlb_max_hstate || default_hstate_max_huge_pages)
3181
+ pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
28313182 return 0;
3183
+ }
28323184
2833
- if (!size_to_hstate(default_hstate_size)) {
2834
- if (default_hstate_size != 0) {
2835
- pr_err("HugeTLB: unsupported default_hugepagesz %lu. Reverting to %lu\n",
2836
- default_hstate_size, HPAGE_SIZE);
3185
+ /*
3186
+ * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some
3187
+ * architectures depend on setup being done here.
3188
+ */
3189
+ hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
3190
+ if (!parsed_default_hugepagesz) {
3191
+ /*
3192
+ * If we did not parse a default huge page size, set
3193
+ * default_hstate_idx to HPAGE_SIZE hstate. And, if the
3194
+ * number of huge pages for this default size was implicitly
3195
+ * specified, set that here as well.
3196
+ * Note that the implicit setting will overwrite an explicit
3197
+ * setting. A warning will be printed in this case.
3198
+ */
3199
+ default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE));
3200
+ if (default_hstate_max_huge_pages) {
3201
+ if (default_hstate.max_huge_pages) {
3202
+ char buf[32];
3203
+
3204
+ string_get_size(huge_page_size(&default_hstate),
3205
+ 1, STRING_UNITS_2, buf, 32);
3206
+ pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n",
3207
+ default_hstate.max_huge_pages, buf);
3208
+ pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n",
3209
+ default_hstate_max_huge_pages);
3210
+ }
3211
+ default_hstate.max_huge_pages =
3212
+ default_hstate_max_huge_pages;
28373213 }
2838
-
2839
- default_hstate_size = HPAGE_SIZE;
2840
- if (!size_to_hstate(default_hstate_size))
2841
- hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
2842
- }
2843
- default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
2844
- if (default_hstate_max_huge_pages) {
2845
- if (!default_hstate.max_huge_pages)
2846
- default_hstate.max_huge_pages = default_hstate_max_huge_pages;
28473214 }
28483215
3216
+ hugetlb_cma_check();
28493217 hugetlb_init_hstates();
28503218 gather_bootmem_prealloc();
28513219 report_hugepages();
....@@ -2870,10 +3238,10 @@
28703238 }
28713239 subsys_initcall(hugetlb_init);
28723240
2873
-/* Should be called on processing a hugepagesz=... option */
2874
-void __init hugetlb_bad_size(void)
3241
+/* Overwritten by architectures with more huge page sizes */
3242
+bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
28753243 {
2876
- parsed_valid_hugepagesz = false;
3244
+ return size == HPAGE_SIZE;
28773245 }
28783246
28793247 void __init hugetlb_add_hstate(unsigned int order)
....@@ -2882,7 +3250,6 @@
28823250 unsigned long i;
28833251
28843252 if (size_to_hstate(PAGE_SIZE << order)) {
2885
- pr_warn("hugepagesz= specified twice, ignoring\n");
28863253 return;
28873254 }
28883255 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
....@@ -2903,20 +3270,29 @@
29033270 parsed_hstate = h;
29043271 }
29053272
2906
-static int __init hugetlb_nrpages_setup(char *s)
3273
+/*
3274
+ * hugepages command line processing
3275
+ * hugepages normally follows a valid hugepagesz or default_hugepagesz
3276
+ * specification. If not, ignore the hugepages value. hugepages can also
3277
+ * be the first huge page command line option in which case it implicitly
3278
+ * specifies the number of huge pages for the default size.
3279
+ */
3280
+static int __init hugepages_setup(char *s)
29073281 {
29083282 unsigned long *mhp;
29093283 static unsigned long *last_mhp;
29103284
29113285 if (!parsed_valid_hugepagesz) {
2912
- pr_warn("hugepages = %s preceded by "
2913
- "an unsupported hugepagesz, ignoring\n", s);
3286
+ pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
29143287 parsed_valid_hugepagesz = true;
2915
- return 1;
3288
+ return 0;
29163289 }
3290
+
29173291 /*
2918
- * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
2919
- * so this hugepages= parameter goes to the "default hstate".
3292
+ * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter
3293
+ * yet, so this hugepages= parameter goes to the "default hstate".
3294
+ * Otherwise, it goes with the previously parsed hugepagesz or
3295
+ * default_hugepagesz.
29203296 */
29213297 else if (!hugetlb_max_hstate)
29223298 mhp = &default_hstate_max_huge_pages;
....@@ -2924,8 +3300,8 @@
29243300 mhp = &parsed_hstate->max_huge_pages;
29253301
29263302 if (mhp == last_mhp) {
2927
- pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n");
2928
- return 1;
3303
+ pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
3304
+ return 0;
29293305 }
29303306
29313307 if (sscanf(s, "%lu", mhp) <= 0)
....@@ -2943,22 +3319,118 @@
29433319
29443320 return 1;
29453321 }
2946
-__setup("hugepages=", hugetlb_nrpages_setup);
3322
+__setup("hugepages=", hugepages_setup);
29473323
2948
-static int __init hugetlb_default_setup(char *s)
3324
+/*
3325
+ * hugepagesz command line processing
3326
+ * A specific huge page size can only be specified once with hugepagesz.
3327
+ * hugepagesz is followed by hugepages on the command line. The global
3328
+ * variable 'parsed_valid_hugepagesz' is used to determine if prior
3329
+ * hugepagesz argument was valid.
3330
+ */
3331
+static int __init hugepagesz_setup(char *s)
29493332 {
2950
- default_hstate_size = memparse(s, &s);
3333
+ unsigned long size;
3334
+ struct hstate *h;
3335
+
3336
+ parsed_valid_hugepagesz = false;
3337
+ size = (unsigned long)memparse(s, NULL);
3338
+
3339
+ if (!arch_hugetlb_valid_size(size)) {
3340
+ pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
3341
+ return 0;
3342
+ }
3343
+
3344
+ h = size_to_hstate(size);
3345
+ if (h) {
3346
+ /*
3347
+ * hstate for this size already exists. This is normally
3348
+ * an error, but is allowed if the existing hstate is the
3349
+ * default hstate. More specifically, it is only allowed if
3350
+ * the number of huge pages for the default hstate was not
3351
+ * previously specified.
3352
+ */
3353
+ if (!parsed_default_hugepagesz || h != &default_hstate ||
3354
+ default_hstate.max_huge_pages) {
3355
+ pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
3356
+ return 0;
3357
+ }
3358
+
3359
+ /*
3360
+ * No need to call hugetlb_add_hstate() as hstate already
3361
+ * exists. But, do set parsed_hstate so that a following
3362
+ * hugepages= parameter will be applied to this hstate.
3363
+ */
3364
+ parsed_hstate = h;
3365
+ parsed_valid_hugepagesz = true;
3366
+ return 1;
3367
+ }
3368
+
3369
+ hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
3370
+ parsed_valid_hugepagesz = true;
29513371 return 1;
29523372 }
2953
-__setup("default_hugepagesz=", hugetlb_default_setup);
3373
+__setup("hugepagesz=", hugepagesz_setup);
29543374
2955
-static unsigned int cpuset_mems_nr(unsigned int *array)
3375
+/*
3376
+ * default_hugepagesz command line input
3377
+ * Only one instance of default_hugepagesz allowed on command line.
3378
+ */
3379
+static int __init default_hugepagesz_setup(char *s)
3380
+{
3381
+ unsigned long size;
3382
+
3383
+ parsed_valid_hugepagesz = false;
3384
+ if (parsed_default_hugepagesz) {
3385
+ pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
3386
+ return 0;
3387
+ }
3388
+
3389
+ size = (unsigned long)memparse(s, NULL);
3390
+
3391
+ if (!arch_hugetlb_valid_size(size)) {
3392
+ pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
3393
+ return 0;
3394
+ }
3395
+
3396
+ hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
3397
+ parsed_valid_hugepagesz = true;
3398
+ parsed_default_hugepagesz = true;
3399
+ default_hstate_idx = hstate_index(size_to_hstate(size));
3400
+
3401
+ /*
3402
+ * The number of default huge pages (for this size) could have been
3403
+ * specified as the first hugetlb parameter: hugepages=X. If so,
3404
+ * then default_hstate_max_huge_pages is set. If the default huge
3405
+ * page size is gigantic (>= MAX_ORDER), then the pages must be
3406
+ * allocated here from bootmem allocator.
3407
+ */
3408
+ if (default_hstate_max_huge_pages) {
3409
+ default_hstate.max_huge_pages = default_hstate_max_huge_pages;
3410
+ if (hstate_is_gigantic(&default_hstate))
3411
+ hugetlb_hstate_alloc_pages(&default_hstate);
3412
+ default_hstate_max_huge_pages = 0;
3413
+ }
3414
+
3415
+ return 1;
3416
+}
3417
+__setup("default_hugepagesz=", default_hugepagesz_setup);
3418
+
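
A small userspace sketch of the ordering rules enforced by hugepages_setup() and hugepagesz_setup() above: hugepages= applies to the most recently parsed size, and a repeated hugepages= without an interleaving size option is ignored. The two fixed sizes and the option sequence are assumptions for illustration only.

#include <stdio.h>
#include <string.h>

static unsigned long default_pages, gig_pages;		/* assumed 2M and 1G pools     */
static unsigned long *parsed_mhp = &default_pages;	/* stands in for parsed_hstate */
static unsigned long *last_mhp;				/* stands in for last_mhp      */

static void option(const char *opt, const char *val)
{
	if (!strcmp(opt, "hugepagesz") || !strcmp(opt, "default_hugepagesz")) {
		parsed_mhp = strcmp(val, "1G") ? &default_pages : &gig_pages;
	} else if (!strcmp(opt, "hugepages")) {
		if (parsed_mhp == last_mhp) {
			printf("hugepages=%s without a new hugepagesz, ignoring\n", val);
			return;
		}
		sscanf(val, "%lu", parsed_mhp);
		last_mhp = parsed_mhp;
	}
}

int main(void)
{
	/* hugepagesz=1G hugepages=16 hugepagesz=2M hugepages=512 hugepages=9 */
	option("hugepagesz", "1G"); option("hugepages", "16");
	option("hugepagesz", "2M"); option("hugepages", "512");
	option("hugepages", "9");	/* ignored: no interleaving size option */

	printf("2M pages: %lu, 1G pages: %lu\n", default_pages, gig_pages);
	return 0;
}
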
3419
+static unsigned int allowed_mems_nr(struct hstate *h)
29563420 {
29573421 int node;
29583422 unsigned int nr = 0;
3423
+ nodemask_t *mpol_allowed;
3424
+ unsigned int *array = h->free_huge_pages_node;
3425
+ gfp_t gfp_mask = htlb_alloc_mask(h);
29593426
2960
- for_each_node_mask(node, cpuset_current_mems_allowed)
2961
- nr += array[node];
3427
+ mpol_allowed = policy_nodemask_current(gfp_mask);
3428
+
3429
+ for_each_node_mask(node, cpuset_current_mems_allowed) {
3430
+ if (!mpol_allowed ||
3431
+ (mpol_allowed && node_isset(node, *mpol_allowed)))
3432
+ nr += array[node];
3433
+ }
29623434
29633435 return nr;
29643436 }
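
A sketch of the node filtering done by allowed_mems_nr() above: free huge pages are counted only on nodes permitted by both the cpuset mask and the task's memory policy. The node count, the two masks, and the per-node free counts are made-up example values.

#include <stdio.h>

#define EXAMPLE_NODES 4

int main(void)
{
	unsigned int free_on_node[EXAMPLE_NODES] = { 10, 4, 0, 7 };
	int cpuset_allowed[EXAMPLE_NODES]        = {  1, 1, 1, 0 };
	int mpol_allowed[EXAMPLE_NODES]          = {  1, 0, 1, 1 };
	unsigned int nr = 0;

	for (int node = 0; node < EXAMPLE_NODES; node++)
		if (cpuset_allowed[node] && mpol_allowed[node])
			nr += free_on_node[node];

	printf("usable free huge pages: %u\n", nr);	/* nodes 0 and 2: 10 + 0 */
	return 0;
}
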
....@@ -2982,7 +3454,7 @@
29823454
29833455 static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
29843456 struct ctl_table *table, int write,
2985
- void __user *buffer, size_t *length, loff_t *ppos)
3457
+ void *buffer, size_t *length, loff_t *ppos)
29863458 {
29873459 struct hstate *h = &default_hstate;
29883460 unsigned long tmp = h->max_huge_pages;
....@@ -3004,7 +3476,7 @@
30043476 }
30053477
30063478 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
3007
- void __user *buffer, size_t *length, loff_t *ppos)
3479
+ void *buffer, size_t *length, loff_t *ppos)
30083480 {
30093481
30103482 return hugetlb_sysctl_handler_common(false, table, write,
....@@ -3013,7 +3485,7 @@
30133485
30143486 #ifdef CONFIG_NUMA
30153487 int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
3016
- void __user *buffer, size_t *length, loff_t *ppos)
3488
+ void *buffer, size_t *length, loff_t *ppos)
30173489 {
30183490 return hugetlb_sysctl_handler_common(true, table, write,
30193491 buffer, length, ppos);
....@@ -3021,8 +3493,7 @@
30213493 #endif /* CONFIG_NUMA */
30223494
30233495 int hugetlb_overcommit_handler(struct ctl_table *table, int write,
3024
- void __user *buffer,
3025
- size_t *length, loff_t *ppos)
3496
+ void *buffer, size_t *length, loff_t *ppos)
30263497 {
30273498 struct hstate *h = &default_hstate;
30283499 unsigned long tmp;
....@@ -3082,18 +3553,20 @@
30823553 seq_printf(m, "Hugetlb: %8lu kB\n", total / 1024);
30833554 }
30843555
3085
-int hugetlb_report_node_meminfo(int nid, char *buf)
3556
+int hugetlb_report_node_meminfo(char *buf, int len, int nid)
30863557 {
30873558 struct hstate *h = &default_hstate;
3559
+
30883560 if (!hugepages_supported())
30893561 return 0;
3090
- return sprintf(buf,
3091
- "Node %d HugePages_Total: %5u\n"
3092
- "Node %d HugePages_Free: %5u\n"
3093
- "Node %d HugePages_Surp: %5u\n",
3094
- nid, h->nr_huge_pages_node[nid],
3095
- nid, h->free_huge_pages_node[nid],
3096
- nid, h->surplus_huge_pages_node[nid]);
3562
+
3563
+ return sysfs_emit_at(buf, len,
3564
+ "Node %d HugePages_Total: %5u\n"
3565
+ "Node %d HugePages_Free: %5u\n"
3566
+ "Node %d HugePages_Surp: %5u\n",
3567
+ nid, h->nr_huge_pages_node[nid],
3568
+ nid, h->free_huge_pages_node[nid],
3569
+ nid, h->surplus_huge_pages_node[nid]);
30973570 }
30983571
30993572 void hugetlb_show_meminfo(void)
....@@ -3152,12 +3625,18 @@
31523625 * we fall back to check against current free page availability as
31533626 * a best attempt and hopefully to minimize the impact of changing
31543627 * semantics that cpuset has.
3628
+ *
3629
+ * Apart from cpuset, the memory policy mechanism also
3630
+ * determines from which node the kernel will allocate memory
3631
+ * in a NUMA system. So, similar to cpuset, we should also
3632
+ * consider the memory policy of the current task, just as
3633
+ * described above.
31553634 */
31563635 if (delta > 0) {
31573636 if (gather_surplus_pages(h, delta) < 0)
31583637 goto out;
31593638
3160
- if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
3639
+ if (delta > allowed_mems_nr(h)) {
31613640 return_unused_surplus_pages(h, delta);
31623641 goto out;
31633642 }
....@@ -3184,8 +3663,10 @@
31843663 * after this open call completes. It is therefore safe to take a
31853664 * new reference here without additional locking.
31863665 */
3187
- if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
3666
+ if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
3667
+ resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
31883668 kref_get(&resv->refs);
3669
+ }
31893670 }
31903671
31913672 static void hugetlb_vm_op_close(struct vm_area_struct *vma)
....@@ -3203,9 +3684,7 @@
32033684 end = vma_hugecache_offset(h, vma, vma->vm_end);
32043685
32053686 reserve = (end - start) - region_count(resv, start, end);
3206
-
3207
- kref_put(&resv->refs, resv_map_release);
3208
-
3687
+ hugetlb_cgroup_uncharge_counter(resv, start, end);
32093688 if (reserve) {
32103689 /*
32113690 * Decrement reserve counts. The global reserve count may be
....@@ -3214,12 +3693,33 @@
32143693 gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
32153694 hugetlb_acct_memory(h, -gbl_reserve);
32163695 }
3696
+
3697
+ kref_put(&resv->refs, resv_map_release);
32173698 }
32183699
32193700 static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
32203701 {
32213702 if (addr & ~(huge_page_mask(hstate_vma(vma))))
32223703 return -EINVAL;
3704
+
3705
+ /*
3706
+ * PMD sharing is only possible for PUD_SIZE-aligned address ranges
3707
+ * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
3708
+ * split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
3709
+ */
3710
+ if (addr & ~PUD_MASK) {
3711
+ /*
3712
+ * hugetlb_vm_op_split is called right before we attempt to
3713
+ * split the VMA. We will need to unshare PMDs in the old and
3714
+ * new VMAs, so let's unshare before we split.
3715
+ */
3716
+ unsigned long floor = addr & PUD_MASK;
3717
+ unsigned long ceil = floor + PUD_SIZE;
3718
+
3719
+ if (floor >= vma->vm_start && ceil <= vma->vm_end)
3720
+ hugetlb_unshare_pmds(vma, floor, ceil);
3721
+ }
3722
+
32233723 return 0;
32243724 }
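
A worked example of the unshare window computed in hugetlb_vm_op_split() above, assuming x86_64-style sizes (PUD_SIZE = 1 GiB); the split address is an arbitrary illustration.

#include <stdio.h>

#define PUD_SIZE (1UL << 30)
#define PUD_MASK (~(PUD_SIZE - 1))

int main(void)
{
	unsigned long addr  = 0x40200000UL;	/* split point, not 1 GiB aligned */
	unsigned long floor = addr & PUD_MASK;	/* 0x40000000 */
	unsigned long ceil  = floor + PUD_SIZE;	/* 0x80000000 */

	printf("unshare PMDs in [%#lx, %#lx)\n", floor, ceil);
	return 0;
}
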
32253725
....@@ -3293,23 +3793,23 @@
32933793 if (huge_pte_none(pte) || pte_present(pte))
32943794 return false;
32953795 swp = pte_to_swp_entry(pte);
3296
- if (non_swap_entry(swp) && is_migration_entry(swp))
3796
+ if (is_migration_entry(swp))
32973797 return true;
32983798 else
32993799 return false;
33003800 }
33013801
3302
-static int is_hugetlb_entry_hwpoisoned(pte_t pte)
3802
+static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
33033803 {
33043804 swp_entry_t swp;
33053805
33063806 if (huge_pte_none(pte) || pte_present(pte))
3307
- return 0;
3807
+ return false;
33083808 swp = pte_to_swp_entry(pte);
3309
- if (non_swap_entry(swp) && is_hwpoison_entry(swp))
3310
- return 1;
3809
+ if (is_hwpoison_entry(swp))
3810
+ return true;
33113811 else
3312
- return 0;
3812
+ return false;
33133813 }
33143814
33153815 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
....@@ -3321,23 +3821,33 @@
33213821 int cow;
33223822 struct hstate *h = hstate_vma(vma);
33233823 unsigned long sz = huge_page_size(h);
3324
- unsigned long mmun_start; /* For mmu_notifiers */
3325
- unsigned long mmun_end; /* For mmu_notifiers */
3824
+ struct address_space *mapping = vma->vm_file->f_mapping;
3825
+ struct mmu_notifier_range range;
33263826 int ret = 0;
33273827
33283828 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
33293829
3330
- mmun_start = vma->vm_start;
3331
- mmun_end = vma->vm_end;
3332
- if (cow)
3333
- mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
3830
+ if (cow) {
3831
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
3832
+ vma->vm_start,
3833
+ vma->vm_end);
3834
+ mmu_notifier_invalidate_range_start(&range);
3835
+ } else {
3836
+ /*
3837
+ * For shared mappings i_mmap_rwsem must be held to call
3838
+ * huge_pte_alloc, otherwise the returned ptep could go
3839
+ * away if part of a shared pmd and another thread calls
3840
+ * huge_pmd_unshare.
3841
+ */
3842
+ i_mmap_lock_read(mapping);
3843
+ }
33343844
33353845 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
33363846 spinlock_t *src_ptl, *dst_ptl;
33373847 src_pte = huge_pte_offset(src, addr, sz);
33383848 if (!src_pte)
33393849 continue;
3340
- dst_pte = huge_pte_alloc(dst, addr, sz);
3850
+ dst_pte = huge_pte_alloc(dst, vma, addr, sz);
33413851 if (!dst_pte) {
33423852 ret = -ENOMEM;
33433853 break;
....@@ -3406,7 +3916,9 @@
34063916 }
34073917
34083918 if (cow)
3409
- mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
3919
+ mmu_notifier_invalidate_range_end(&range);
3920
+ else
3921
+ i_mmap_unlock_read(mapping);
34103922
34113923 return ret;
34123924 }
....@@ -3423,8 +3935,7 @@
34233935 struct page *page;
34243936 struct hstate *h = hstate_vma(vma);
34253937 unsigned long sz = huge_page_size(h);
3426
- unsigned long mmun_start = start; /* For mmu_notifiers */
3427
- unsigned long mmun_end = end; /* For mmu_notifiers */
3938
+ struct mmu_notifier_range range;
34283939 bool force_flush = false;
34293940
34303941 WARN_ON(!is_vm_hugetlb_page(vma));
....@@ -3435,14 +3946,16 @@
34353946 * This is a hugetlb vma, all the pte entries should point
34363947 * to huge page.
34373948 */
3438
- tlb_remove_check_page_size_change(tlb, sz);
3949
+ tlb_change_page_size(tlb, sz);
34393950 tlb_start_vma(tlb, vma);
34403951
34413952 /*
34423953 * If sharing possible, alert mmu notifiers of worst case.
34433954 */
3444
- adjust_range_if_pmd_sharing_possible(vma, &mmun_start, &mmun_end);
3445
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
3955
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
3956
+ end);
3957
+ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
3958
+ mmu_notifier_invalidate_range_start(&range);
34463959 address = start;
34473960 for (; address < end; address += sz) {
34483961 ptep = huge_pte_offset(mm, address, sz);
....@@ -3450,7 +3963,7 @@
34503963 continue;
34513964
34523965 ptl = huge_pte_lock(h, mm, ptep);
3453
- if (huge_pmd_unshare(mm, &address, ptep)) {
3966
+ if (huge_pmd_unshare(mm, vma, &address, ptep)) {
34543967 spin_unlock(ptl);
34553968 tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
34563969 force_flush = true;
....@@ -3508,7 +4021,7 @@
35084021 if (ref_page)
35094022 break;
35104023 }
3511
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
4024
+ mmu_notifier_invalidate_range_end(&range);
35124025 tlb_end_vma(tlb, vma);
35134026
35144027 /*
....@@ -3642,9 +4155,8 @@
36424155 struct page *old_page, *new_page;
36434156 int outside_reserve = 0;
36444157 vm_fault_t ret = 0;
3645
- unsigned long mmun_start; /* For mmu_notifiers */
3646
- unsigned long mmun_end; /* For mmu_notifiers */
36474158 unsigned long haddr = address & huge_page_mask(h);
4159
+ struct mmu_notifier_range range;
36484160
36494161 pte = huge_ptep_get(ptep);
36504162 old_page = pte_page(pte);
....@@ -3689,10 +4201,30 @@
36894201 * may get SIGKILLed if it later faults.
36904202 */
36914203 if (outside_reserve) {
4204
+ struct address_space *mapping = vma->vm_file->f_mapping;
4205
+ pgoff_t idx;
4206
+ u32 hash;
4207
+
36924208 put_page(old_page);
36934209 BUG_ON(huge_pte_none(pte));
4210
+ /*
4211
+ * Drop hugetlb_fault_mutex and i_mmap_rwsem before
4212
+ * unmapping. unmapping needs to hold i_mmap_rwsem
4213
+ * in write mode. Dropping i_mmap_rwsem in read mode
4214
+ * here is OK as COW mappings do not interact with
4215
+ * PMD sharing.
4216
+ *
4217
+ * Reacquire both after unmap operation.
4218
+ */
4219
+ idx = vma_hugecache_offset(h, vma, haddr);
4220
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
4221
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
4222
+ i_mmap_unlock_read(mapping);
4223
+
36944224 unmap_ref_private(mm, vma, old_page, haddr);
3695
- BUG_ON(huge_pte_none(pte));
4225
+
4226
+ i_mmap_lock_read(mapping);
4227
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
36964228 spin_lock(ptl);
36974229 ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
36984230 if (likely(ptep &&
....@@ -3722,9 +4254,9 @@
37224254 pages_per_huge_page(h));
37234255 __SetPageUptodate(new_page);
37244256
3725
- mmun_start = haddr;
3726
- mmun_end = mmun_start + huge_page_size(h);
3727
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
4257
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
4258
+ haddr + huge_page_size(h));
4259
+ mmu_notifier_invalidate_range_start(&range);
37284260
37294261 /*
37304262 * Retake the page table lock to check for racing updates
....@@ -3737,7 +4269,7 @@
37374269
37384270 /* Break COW */
37394271 huge_ptep_clear_flush(vma, haddr, ptep);
3740
- mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
4272
+ mmu_notifier_invalidate_range(mm, range.start, range.end);
37414273 set_huge_pte_at(mm, haddr, ptep,
37424274 make_huge_pte(vma, new_page, 1));
37434275 page_remove_rmap(old_page, true);
....@@ -3747,7 +4279,7 @@
37474279 new_page = old_page;
37484280 }
37494281 spin_unlock(ptl);
3750
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
4282
+ mmu_notifier_invalidate_range_end(&range);
37514283 out_release_all:
37524284 restore_reserve_on_error(h, vma, haddr, new_page);
37534285 put_page(new_page);
....@@ -3814,6 +4346,38 @@
38144346 return 0;
38154347 }
38164348
4349
+static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
4350
+ struct address_space *mapping,
4351
+ pgoff_t idx,
4352
+ unsigned int flags,
4353
+ unsigned long haddr,
4354
+ unsigned long reason)
4355
+{
4356
+ u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
4357
+ struct vm_fault vmf = {
4358
+ .vma = vma,
4359
+ .address = haddr,
4360
+ .flags = flags,
4361
+ /*
4362
+ * Hard to debug if it ends up being
4363
+ * used by a callee that assumes
4364
+ * something about the other
4365
+ * uninitialized fields... same as in
4366
+ * memory.c
4367
+ */
4368
+ };
4369
+
4370
+ /*
4371
+ * vma_lock and hugetlb_fault_mutex must be dropped
4372
+ * before handling userfault. Also, mmap_lock will be
4373
+ * dropped while handling userfault, so any vma
4374
+ * operation after this point must be careful.
4375
+ */
4376
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
4377
+ i_mmap_unlock_read(mapping);
4378
+ return handle_userfault(&vmf, VM_UFFD_MISSING);
4379
+}
4380
+
38174381 static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
38184382 struct vm_area_struct *vma,
38194383 struct address_space *mapping, pgoff_t idx,
....@@ -3828,6 +4392,7 @@
38284392 spinlock_t *ptl;
38294393 unsigned long haddr = address & huge_page_mask(h);
38304394 bool new_page = false;
4395
+ u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
38314396
38324397 /*
38334398 * Currently, we are forced to kill the process in the event the
....@@ -3837,52 +4402,50 @@
38374402 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
38384403 pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
38394404 current->pid);
3840
- return ret;
4405
+ goto out;
38414406 }
38424407
38434408 /*
3844
- * Use page lock to guard against racing truncation
3845
- * before we get page_table_lock.
4409
+ * We can not race with truncation due to holding i_mmap_rwsem.
4410
+ * i_size is modified when holding i_mmap_rwsem, so check here
4411
+ * once for faults beyond end of file.
38464412 */
4413
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
4414
+ if (idx >= size)
4415
+ goto out;
4416
+
38474417 retry:
38484418 page = find_lock_page(mapping, idx);
38494419 if (!page) {
3850
- size = i_size_read(mapping->host) >> huge_page_shift(h);
3851
- if (idx >= size)
3852
- goto out;
3853
-
3854
- /*
3855
- * Check for page in userfault range
3856
- */
4420
+ /* Check for page in userfault range */
38574421 if (userfaultfd_missing(vma)) {
3858
- u32 hash;
3859
- struct vm_fault vmf = {
3860
- .vma = vma,
3861
- .address = haddr,
3862
- .flags = flags,
3863
- /*
3864
- * Hard to debug if it ends up being
3865
- * used by a callee that assumes
3866
- * something about the other
3867
- * uninitialized fields... same as in
3868
- * memory.c
3869
- */
3870
- };
3871
-
3872
- /*
3873
- * hugetlb_fault_mutex must be dropped before
3874
- * handling userfault. Reacquire after handling
3875
- * fault to make calling code simpler.
3876
- */
3877
- hash = hugetlb_fault_mutex_hash(h, mapping, idx);
3878
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
3879
- ret = handle_userfault(&vmf, VM_UFFD_MISSING);
3880
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
4422
+ ret = hugetlb_handle_userfault(vma, mapping, idx,
4423
+ flags, haddr,
4424
+ VM_UFFD_MISSING);
38814425 goto out;
38824426 }
38834427
38844428 page = alloc_huge_page(vma, haddr, 0);
38854429 if (IS_ERR(page)) {
4430
+ /*
4431
+ * Returning error will result in faulting task being
4432
+ * sent SIGBUS. The hugetlb fault mutex prevents two
4433
+ * tasks from racing to fault in the same page which
4434
+ * could result in false unable to allocate errors.
4435
+ * Page migration does not take the fault mutex, but
4436
+ * does a clear then write of pte's under page table
4437
+ * lock. Page fault code could race with migration,
4438
+ * notice the clear pte and try to allocate a page
4439
+ * here. Before returning error, get ptl and make
4440
+ * sure there really is no pte entry.
4441
+ */
4442
+ ptl = huge_pte_lock(h, mm, ptep);
4443
+ if (!huge_pte_none(huge_ptep_get(ptep))) {
4444
+ ret = 0;
4445
+ spin_unlock(ptl);
4446
+ goto out;
4447
+ }
4448
+ spin_unlock(ptl);
38864449 ret = vmf_error(PTR_ERR(page));
38874450 goto out;
38884451 }
....@@ -3917,6 +4480,16 @@
39174480 VM_FAULT_SET_HINDEX(hstate_index(h));
39184481 goto backout_unlocked;
39194482 }
4483
+
4484
+ /* Check for page in userfault range. */
4485
+ if (userfaultfd_minor(vma)) {
4486
+ unlock_page(page);
4487
+ put_page(page);
4488
+ ret = hugetlb_handle_userfault(vma, mapping, idx,
4489
+ flags, haddr,
4490
+ VM_UFFD_MINOR);
4491
+ goto out;
4492
+ }
39204493 }
39214494
39224495 /*
....@@ -3935,10 +4508,6 @@
39354508 }
39364509
39374510 ptl = huge_pte_lock(h, mm, ptep);
3938
- size = i_size_read(mapping->host) >> huge_page_shift(h);
3939
- if (idx >= size)
3940
- goto backout;
3941
-
39424511 ret = 0;
39434512 if (!huge_pte_none(huge_ptep_get(ptep)))
39444513 goto backout;
....@@ -3970,6 +4539,8 @@
39704539
39714540 unlock_page(page);
39724541 out:
4542
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
4543
+ i_mmap_unlock_read(mapping);
39734544 return ret;
39744545
39754546 backout:
....@@ -3982,8 +4553,7 @@
39824553 }
39834554
39844555 #ifdef CONFIG_SMP
3985
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
3986
- pgoff_t idx)
4556
+u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
39874557 {
39884558 unsigned long key[2];
39894559 u32 hash;
....@@ -4000,8 +4570,7 @@
40004570 * For uniprocesor systems we always use a single mutex, so just
40014571 * return 0 and avoid the hashing overhead.
40024572 */
4003
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
4004
- pgoff_t idx)
4573
+u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
40054574 {
40064575 return 0;
40074576 }
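
A sketch of how hugetlb_fault_mutex_hash() maps a (mapping, index) pair to a mutex slot so that concurrent faults on the same page serialize on the same lock. The table size and the mixing step are assumptions standing in for the kernel's jhash-based implementation.

#include <stdio.h>
#include <stdint.h>

#define NUM_FAULT_MUTEXES 64	/* assumed, must be a power of two */

static uint32_t fault_mutex_hash(const void *mapping, unsigned long idx)
{
	unsigned long key[2] = { (unsigned long)mapping, idx };
	uint32_t hash = (uint32_t)(key[0] * 2654435761UL) ^ (uint32_t)key[1];

	return hash & (NUM_FAULT_MUTEXES - 1);
}

int main(void)
{
	static int dummy_mapping;

	printf("mutex slot: %u\n", fault_mutex_hash(&dummy_mapping, 42));
	return 0;
}
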
....@@ -4024,6 +4593,11 @@
40244593
40254594 ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
40264595 if (ptep) {
4596
+ /*
4597
+ * Since we hold no locks, ptep could be stale. That is
4598
+ * OK as we are only making decisions based on content and
4599
+ * not actually modifying content here.
4600
+ */
40274601 entry = huge_ptep_get(ptep);
40284602 if (unlikely(is_hugetlb_entry_migration(entry))) {
40294603 migration_entry_wait_huge(vma, mm, ptep);
....@@ -4031,37 +4605,52 @@
40314605 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
40324606 return VM_FAULT_HWPOISON_LARGE |
40334607 VM_FAULT_SET_HINDEX(hstate_index(h));
4034
- } else {
4035
- ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
4036
- if (!ptep)
4037
- return VM_FAULT_OOM;
40384608 }
40394609
4610
+ /*
4611
+ * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
4612
+ * until finished with ptep. This serves two purposes:
4613
+ * 1) It prevents huge_pmd_unshare from being called elsewhere
4614
+ * and making the ptep no longer valid.
4615
+ * 2) It synchronizes us with i_size modifications during truncation.
4616
+ *
4617
+ * ptep could have already been assigned via huge_pte_offset. That
4618
+ * is OK, as huge_pte_alloc will return the same value unless
4619
+ * something has changed.
4620
+ */
40404621 mapping = vma->vm_file->f_mapping;
4041
- idx = vma_hugecache_offset(h, vma, haddr);
4622
+ i_mmap_lock_read(mapping);
4623
+ ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
4624
+ if (!ptep) {
4625
+ i_mmap_unlock_read(mapping);
4626
+ return VM_FAULT_OOM;
4627
+ }
40424628
40434629 /*
40444630 * Serialize hugepage allocation and instantiation, so that we don't
40454631 * get spurious allocation failures if two CPUs race to instantiate
40464632 * the same page in the page cache.
40474633 */
4048
- hash = hugetlb_fault_mutex_hash(h, mapping, idx);
4634
+ idx = vma_hugecache_offset(h, vma, haddr);
4635
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
40494636 mutex_lock(&hugetlb_fault_mutex_table[hash]);
40504637
40514638 entry = huge_ptep_get(ptep);
4052
- if (huge_pte_none(entry)) {
4053
- ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
4054
- goto out_mutex;
4055
- }
4639
+ if (huge_pte_none(entry))
4640
+ /*
4641
+ * hugetlb_no_page will drop vma lock and hugetlb fault
4642
+ * mutex internally, so we must return immediately.
4643
+ */
4644
+ return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
40564645
40574646 ret = 0;
40584647
40594648 /*
40604649 * entry could be a migration/hwpoison entry at this point, so this
40614650 * check prevents the kernel from going below assuming that we have
4062
- * a active hugepage in pagecache. This goto expects the 2nd page fault,
4063
- * and is_hugetlb_entry_(migration|hwpoisoned) check will properly
4064
- * handle it.
4651
+ * an active hugepage in pagecache. This goto expects the 2nd page
4652
+ * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
4653
+ * properly handle it.
40654654 */
40664655 if (!pte_present(entry))
40674656 goto out_mutex;
....@@ -4132,6 +4721,7 @@
41324721 }
41334722 out_mutex:
41344723 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
4724
+ i_mmap_unlock_read(mapping);
41354725 /*
41364726 * Generally it's safe to hold refcount during waiting page lock. But
41374727 * here we just wait to defer the next page fault to avoid busy loop and
....@@ -4144,6 +4734,7 @@
41444734 return ret;
41454735 }
41464736
4737
+#ifdef CONFIG_USERFAULTFD
41474738 /*
41484739 * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with
41494740 * modifications for huge pages.
....@@ -4153,8 +4744,10 @@
41534744 struct vm_area_struct *dst_vma,
41544745 unsigned long dst_addr,
41554746 unsigned long src_addr,
4747
+ enum mcopy_atomic_mode mode,
41564748 struct page **pagep)
41574749 {
4750
+ bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
41584751 struct address_space *mapping;
41594752 pgoff_t idx;
41604753 unsigned long size;
....@@ -4164,8 +4757,17 @@
41644757 spinlock_t *ptl;
41654758 int ret;
41664759 struct page *page;
4760
+ int writable;
41674761
4168
- if (!*pagep) {
4762
+ mapping = dst_vma->vm_file->f_mapping;
4763
+ idx = vma_hugecache_offset(h, dst_vma, dst_addr);
4764
+
4765
+ if (is_continue) {
4766
+ ret = -EFAULT;
4767
+ page = find_lock_page(mapping, idx);
4768
+ if (!page)
4769
+ goto out;
4770
+ } else if (!*pagep) {
41694771 /* If a page already exists, then it's UFFDIO_COPY for
41704772 * a non-missing case. Return -EEXIST.
41714773 */
....@@ -4185,7 +4787,7 @@
41854787 (const void __user *) src_addr,
41864788 pages_per_huge_page(h), false);
41874789
4188
- /* fallback to copy_from_user outside mmap_sem */
4790
+ /* fallback to copy_from_user outside mmap_lock */
41894791 if (unlikely(ret)) {
41904792 ret = -ENOENT;
41914793 *pagep = page;
....@@ -4204,13 +4806,8 @@
42044806 */
42054807 __SetPageUptodate(page);
42064808
4207
- mapping = dst_vma->vm_file->f_mapping;
4208
- idx = vma_hugecache_offset(h, dst_vma, dst_addr);
4209
-
4210
- /*
4211
- * If shared, add to page cache
4212
- */
4213
- if (vm_shared) {
4809
+ /* Add shared, newly allocated pages to the page cache. */
4810
+ if (vm_shared && !is_continue) {
42144811 size = i_size_read(mapping->host) >> huge_page_shift(h);
42154812 ret = -EFAULT;
42164813 if (idx >= size)
....@@ -4255,8 +4852,14 @@
42554852 hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
42564853 }
42574854
4258
- _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
4259
- if (dst_vma->vm_flags & VM_WRITE)
4855
+ /* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
4856
+ if (is_continue && !vm_shared)
4857
+ writable = 0;
4858
+ else
4859
+ writable = dst_vma->vm_flags & VM_WRITE;
4860
+
4861
+ _dst_pte = make_huge_pte(dst_vma, page, writable);
4862
+ if (writable)
42604863 _dst_pte = huge_pte_mkdirty(_dst_pte);
42614864 _dst_pte = pte_mkyoung(_dst_pte);
42624865
....@@ -4270,25 +4873,27 @@
42704873 update_mmu_cache(dst_vma, dst_addr, dst_pte);
42714874
42724875 spin_unlock(ptl);
4273
- set_page_huge_active(page);
4274
- if (vm_shared)
4876
+ if (!is_continue)
4877
+ set_page_huge_active(page);
4878
+ if (vm_shared || is_continue)
42754879 unlock_page(page);
42764880 ret = 0;
42774881 out:
42784882 return ret;
42794883 out_release_unlock:
42804884 spin_unlock(ptl);
4281
- if (vm_shared)
4885
+ if (vm_shared || is_continue)
42824886 unlock_page(page);
42834887 out_release_nounlock:
42844888 put_page(page);
42854889 goto out;
42864890 }
4891
+#endif /* CONFIG_USERFAULTFD */
42874892
42884893 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
42894894 struct page **pages, struct vm_area_struct **vmas,
42904895 unsigned long *position, unsigned long *nr_pages,
4291
- long i, unsigned int flags, int *nonblocking)
4896
+ long i, unsigned int flags, int *locked)
42924897 {
42934898 unsigned long pfn_offset;
42944899 unsigned long vaddr = *position;
....@@ -4306,7 +4911,7 @@
43064911 * If we have a pending SIGKILL, don't keep faulting pages and
43074912 * potentially allocating memory.
43084913 */
4309
- if (unlikely(fatal_signal_pending(current))) {
4914
+ if (fatal_signal_pending(current)) {
43104915 remainder = 0;
43114916 break;
43124917 }
....@@ -4359,14 +4964,17 @@
43594964 spin_unlock(ptl);
43604965 if (flags & FOLL_WRITE)
43614966 fault_flags |= FAULT_FLAG_WRITE;
4362
- if (nonblocking)
4363
- fault_flags |= FAULT_FLAG_ALLOW_RETRY;
4967
+ if (locked)
4968
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY |
4969
+ FAULT_FLAG_KILLABLE;
43644970 if (flags & FOLL_NOWAIT)
43654971 fault_flags |= FAULT_FLAG_ALLOW_RETRY |
43664972 FAULT_FLAG_RETRY_NOWAIT;
43674973 if (flags & FOLL_TRIED) {
4368
- VM_WARN_ON_ONCE(fault_flags &
4369
- FAULT_FLAG_ALLOW_RETRY);
4974
+ /*
4975
+ * Note: FAULT_FLAG_ALLOW_RETRY and
4976
+ * FAULT_FLAG_TRIED can co-exist
4977
+ */
43704978 fault_flags |= FAULT_FLAG_TRIED;
43714979 }
43724980 ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
....@@ -4376,9 +4984,9 @@
43764984 break;
43774985 }
43784986 if (ret & VM_FAULT_RETRY) {
4379
- if (nonblocking &&
4987
+ if (locked &&
43804988 !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
4381
- *nonblocking = 0;
4989
+ *locked = 0;
43824990 *nr_pages = 0;
43834991 /*
43844992 * VM_FAULT_RETRY must not return an
....@@ -4398,21 +5006,38 @@
43985006 page = pte_page(huge_ptep_get(pte));
43995007
44005008 /*
4401
- * Instead of doing 'try_get_page()' below in the same_page
4402
- * loop, just check the count once here.
5009
+ * If subpage information is not requested, update counters
5010
+ * and skip the same_page loop below.
44035011 */
4404
- if (unlikely(page_count(page) <= 0)) {
4405
- if (pages) {
5012
+ if (!pages && !vmas && !pfn_offset &&
5013
+ (vaddr + huge_page_size(h) < vma->vm_end) &&
5014
+ (remainder >= pages_per_huge_page(h))) {
5015
+ vaddr += huge_page_size(h);
5016
+ remainder -= pages_per_huge_page(h);
5017
+ i += pages_per_huge_page(h);
5018
+ spin_unlock(ptl);
5019
+ continue;
5020
+ }
5021
+
5022
+same_page:
5023
+ if (pages) {
5024
+ pages[i] = mem_map_offset(page, pfn_offset);
5025
+ /*
5026
+ * try_grab_page() should always succeed here, because:
5027
+ * a) we hold the ptl lock, and b) we've just checked
5028
+ * that the huge page is present in the page tables. If
5029
+ * the huge page is present, then the tail pages must
5030
+ * also be present. The ptl prevents the head page and
5031
+ * tail pages from being rearranged in any way. So this
5032
+ * page must be available at this point, unless the page
5033
+ * refcount overflowed:
5034
+ */
5035
+ if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) {
44065036 spin_unlock(ptl);
44075037 remainder = 0;
44085038 err = -ENOMEM;
44095039 break;
44105040 }
4411
- }
4412
-same_page:
4413
- if (pages) {
4414
- pages[i] = mem_map_offset(page, pfn_offset);
4415
- get_page(pages[i]);
44165041 }
44175042
44185043 if (vmas)
....@@ -4443,14 +5068,6 @@
44435068 return i ? i : err;
44445069 }
44455070
4446
-#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
4447
-/*
4448
- * ARCHes with special requirements for evicting HUGETLB backing TLB entries can
4449
- * implement this.
4450
- */
4451
-#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
4452
-#endif
4453
-
44545071 unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
44555072 unsigned long address, unsigned long end, pgprot_t newprot)
44565073 {
....@@ -4460,21 +5077,22 @@
44605077 pte_t pte;
44615078 struct hstate *h = hstate_vma(vma);
44625079 unsigned long pages = 0;
4463
- unsigned long f_start = start;
4464
- unsigned long f_end = end;
44655080 bool shared_pmd = false;
5081
+ struct mmu_notifier_range range;
44665082
44675083 /*
44685084 * In the case of shared PMDs, the area to flush could be beyond
4469
- * start/end. Set f_start/f_end to cover the maximum possible
5085
+ * start/end. Set range.start/range.end to cover the maximum possible
44705086 * range if PMD sharing is possible.
44715087 */
4472
- adjust_range_if_pmd_sharing_possible(vma, &f_start, &f_end);
5088
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
5089
+ 0, vma, mm, start, end);
5090
+ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
44735091
44745092 BUG_ON(address >= end);
4475
- flush_cache_range(vma, f_start, f_end);
5093
+ flush_cache_range(vma, range.start, range.end);
44765094
4477
- mmu_notifier_invalidate_range_start(mm, f_start, f_end);
5095
+ mmu_notifier_invalidate_range_start(&range);
44785096 i_mmap_lock_write(vma->vm_file->f_mapping);
44795097 for (; address < end; address += huge_page_size(h)) {
44805098 spinlock_t *ptl;
....@@ -4482,7 +5100,7 @@
44825100 if (!ptep)
44835101 continue;
44845102 ptl = huge_pte_lock(h, mm, ptep);
4485
- if (huge_pmd_unshare(mm, &address, ptep)) {
5103
+ if (huge_pmd_unshare(mm, vma, &address, ptep)) {
44865104 pages++;
44875105 spin_unlock(ptl);
44885106 shared_pmd = true;
....@@ -4509,10 +5127,12 @@
45095127 continue;
45105128 }
45115129 if (!huge_pte_none(pte)) {
4512
- pte = huge_ptep_get_and_clear(mm, address, ptep);
4513
- pte = pte_mkhuge(huge_pte_modify(pte, newprot));
5130
+ pte_t old_pte;
5131
+
5132
+ old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
5133
+ pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
45145134 pte = arch_make_huge_pte(pte, vma, NULL, 0);
4515
- set_huge_pte_at(mm, address, ptep, pte);
5135
+ huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
45165136 pages++;
45175137 }
45185138 spin_unlock(ptl);
....@@ -4525,7 +5145,7 @@
45255145 * did unshare a page of pmds, flush the range corresponding to the pud.
45265146 */
45275147 if (shared_pmd)
4528
- flush_hugetlb_tlb_range(vma, f_start, f_end);
5148
+ flush_hugetlb_tlb_range(vma, range.start, range.end);
45295149 else
45305150 flush_hugetlb_tlb_range(vma, start, end);
45315151 /*
....@@ -4535,7 +5155,7 @@
45355155 * See Documentation/vm/mmu_notifier.rst
45365156 */
45375157 i_mmap_unlock_write(vma->vm_file->f_mapping);
4538
- mmu_notifier_invalidate_range_end(mm, f_start, f_end);
5158
+ mmu_notifier_invalidate_range_end(&range);
45395159
45405160 return pages << h->order;
45415161 }
....@@ -4545,11 +5165,12 @@
45455165 struct vm_area_struct *vma,
45465166 vm_flags_t vm_flags)
45475167 {
4548
- long ret, chg;
5168
+ long ret, chg, add = -1;
45495169 struct hstate *h = hstate_inode(inode);
45505170 struct hugepage_subpool *spool = subpool_inode(inode);
45515171 struct resv_map *resv_map;
4552
- long gbl_reserve;
5172
+ struct hugetlb_cgroup *h_cg = NULL;
5173
+ long gbl_reserve, regions_needed = 0;
45535174
45545175 /* This should never happen */
45555176 if (from > to) {
....@@ -4572,11 +5193,17 @@
45725193 * called to make the mapping read-write. Assume !vma is a shm mapping
45735194 */
45745195 if (!vma || vma->vm_flags & VM_MAYSHARE) {
5196
+ /*
5197
+ * resv_map can not be NULL as hugetlb_reserve_pages is only
5198
+ * called for inodes for which resv_maps were created (see
5199
+ * hugetlbfs_get_inode).
5200
+ */
45755201 resv_map = inode_resv_map(inode);
45765202
4577
- chg = region_chg(resv_map, from, to);
5203
+ chg = region_chg(resv_map, from, to, &regions_needed);
45785204
45795205 } else {
5206
+ /* Private mapping. */
45805207 resv_map = resv_map_alloc();
45815208 if (!resv_map)
45825209 return -ENOMEM;
....@@ -4592,6 +5219,21 @@
45925219 goto out_err;
45935220 }
45945221
5222
+ ret = hugetlb_cgroup_charge_cgroup_rsvd(
5223
+ hstate_index(h), chg * pages_per_huge_page(h), &h_cg);
5224
+
5225
+ if (ret < 0) {
5226
+ ret = -ENOMEM;
5227
+ goto out_err;
5228
+ }
5229
+
5230
+ if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
5231
+ /* For private mappings, the hugetlb_cgroup uncharge info hangs
5232
+ * off the resv_map.
5233
+ */
5234
+ resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h);
5235
+ }
5236
+
45955237 /*
45965238 * There must be enough pages in the subpool for the mapping. If
45975239 * the subpool has a minimum size, there may be some global
....@@ -4600,7 +5242,7 @@
46005242 gbl_reserve = hugepage_subpool_get_pages(spool, chg);
46015243 if (gbl_reserve < 0) {
46025244 ret = -ENOSPC;
4603
- goto out_err;
5245
+ goto out_uncharge_cgroup;
46045246 }
46055247
46065248 /*
....@@ -4609,9 +5251,7 @@
46095251 */
46105252 ret = hugetlb_acct_memory(h, gbl_reserve);
46115253 if (ret < 0) {
4612
- /* put back original number of pages, chg */
4613
- (void)hugepage_subpool_put_pages(spool, chg);
4614
- goto out_err;
5254
+ goto out_put_pages;
46155255 }
46165256
46175257 /*
....@@ -4626,9 +5266,13 @@
46265266 * else has to be done for private mappings here
46275267 */
46285268 if (!vma || vma->vm_flags & VM_MAYSHARE) {
4629
- long add = region_add(resv_map, from, to);
5269
+ add = region_add(resv_map, from, to, regions_needed, h, h_cg);
46305270
4631
- if (unlikely(chg > add)) {
5271
+ if (unlikely(add < 0)) {
5272
+ hugetlb_acct_memory(h, -gbl_reserve);
5273
+ ret = add;
5274
+ goto out_put_pages;
5275
+ } else if (unlikely(chg > add)) {
46325276 /*
46335277 * pages in this range were added to the reserve
46345278 * map between region_chg and region_add. This
....@@ -4638,17 +5282,41 @@
46385282 */
46395283 long rsv_adjust;
46405284
5285
+ /*
5286
+ * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the
5287
+ * reference to h_cg->css. See comment below for detail.
5288
+ */
5289
+ hugetlb_cgroup_uncharge_cgroup_rsvd(
5290
+ hstate_index(h),
5291
+ (chg - add) * pages_per_huge_page(h), h_cg);
5292
+
46415293 rsv_adjust = hugepage_subpool_put_pages(spool,
46425294 chg - add);
46435295 hugetlb_acct_memory(h, -rsv_adjust);
5296
+ } else if (h_cg) {
5297
+ /*
5298
+ * The file_regions will hold their own reference to
5299
+ * h_cg->css. So we should release the reference held
5300
+ * via hugetlb_cgroup_charge_cgroup_rsvd() when we are
5301
+ * done.
5302
+ */
5303
+ hugetlb_cgroup_put_rsvd_cgroup(h_cg);
46445304 }
46455305 }
46465306 return 0;
5307
+out_put_pages:
5308
+ /* put back original number of pages, chg */
5309
+ (void)hugepage_subpool_put_pages(spool, chg);
5310
+out_uncharge_cgroup:
5311
+ hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
5312
+ chg * pages_per_huge_page(h), h_cg);
46475313 out_err:
46485314 if (!vma || vma->vm_flags & VM_MAYSHARE)
4649
- /* Don't call region_abort if region_chg failed */
4650
- if (chg >= 0)
4651
- region_abort(resv_map, from, to);
5315
+ /* Only call region_abort if the region_chg succeeded but the
5316
+ * region_add failed or didn't run.
5317
+ */
5318
+ if (chg >= 0 && add < 0)
5319
+ region_abort(resv_map, from, to, regions_needed);
46525320 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
46535321 kref_put(&resv_map->refs, resv_map_release);
46545322 return ret;
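
Illustrative arithmetic for the window handled in hugetlb_reserve_pages() above: region_chg() sizes the reservation before the region is actually inserted, so if region_add() ends up covering fewer pages the surplus is handed back. The page counts are assumed example values.

#include <stdio.h>

int main(void)
{
	long chg = 8;	/* pages region_chg() estimated were needed */
	long add = 5;	/* pages region_add() actually added        */

	if (chg > add) {
		long rsv_adjust = chg - add;	/* returned to the subpool */
		printf("give back %ld reserved page(s)\n", rsv_adjust);
	}
	return 0;
}
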
....@@ -4663,6 +5331,10 @@
46635331 struct hugepage_subpool *spool = subpool_inode(inode);
46645332 long gbl_reserve;
46655333
5334
+ /*
5335
+ * Since this routine can be called in the evict inode path for all
5336
+ * hugetlbfs inodes, resv_map could be NULL.
5337
+ */
46665338 if (resv_map) {
46675339 chg = region_del(resv_map, start, end);
46685340 /*
....@@ -4727,6 +5399,15 @@
47275399 return false;
47285400 }
47295401
5402
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
5403
+{
5404
+#ifdef CONFIG_USERFAULTFD
5405
+ if (uffd_disable_huge_pmd_share(vma))
5406
+ return false;
5407
+#endif
5408
+ return vma_shareable(vma, addr);
5409
+}
5410
+
47305411 /*
47315412 * Determine if start,end range within vma could be mapped by shared pmd.
47325413 * If yes, adjust start and end to cover range associated with possible
....@@ -4758,14 +5439,22 @@
47585439 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
47595440 * and returns the corresponding pte. While this is not necessary for the
47605441 * !shared pmd case because we can allocate the pmd later as well, it makes the
4761
- * code much cleaner. pmd allocation is essential for the shared case because
4762
- * pud has to be populated inside the same i_mmap_rwsem section - otherwise
4763
- * racing tasks could either miss the sharing (see huge_pte_offset) or select a
4764
- * bad pmd for sharing.
5442
+ * code much cleaner.
5443
+ *
5444
+ * This routine must be called with i_mmap_rwsem held in at least read mode if
5445
+ * sharing is possible. For hugetlbfs, this prevents removal of any page
5446
+ * table entries associated with the address space. This is important as we
5447
+ * are setting up sharing based on existing page table entries (mappings).
5448
+ *
5449
+ * NOTE: This routine is only called from huge_pte_alloc. Some callers of
5450
+ * huge_pte_alloc know that sharing is not possible and do not take
5451
+ * i_mmap_rwsem as a performance optimization. This is handled by the
5452
+ * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is
5453
+ * only required for subsequent processing.
47655454 */
4766
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
5455
+pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
5456
+ unsigned long addr, pud_t *pud)
47675457 {
4768
- struct vm_area_struct *vma = find_vma(mm, addr);
47695458 struct address_space *mapping = vma->vm_file->f_mapping;
47705459 pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
47715460 vma->vm_pgoff;
....@@ -4775,10 +5464,7 @@
47755464 pte_t *pte;
47765465 spinlock_t *ptl;
47775466
4778
- if (!vma_shareable(vma, addr))
4779
- return (pte_t *)pmd_alloc(mm, pud, addr);
4780
-
4781
- i_mmap_lock_write(mapping);
5467
+ i_mmap_assert_locked(mapping);
47825468 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
47835469 if (svma == vma)
47845470 continue;
....@@ -4808,7 +5494,6 @@
48085494 spin_unlock(ptl);
48095495 out:
48105496 pte = (pte_t *)pmd_alloc(mm, pud, addr);
4811
- i_mmap_unlock_write(mapping);
48125497 return pte;
48135498 }
48145499
....@@ -4819,17 +5504,19 @@
48195504 * indicated by page_count > 1, unmap is achieved by clearing pud and
48205505 * decrementing the ref count. If count == 1, the pte page is not shared.
48215506 *
4822
- * called with page table lock held.
5507
+ * Called with page table lock held and i_mmap_rwsem held in write mode.
48235508 *
48245509 * returns: 1 successfully unmapped a shared pte page
48255510 * 0 the underlying pte page is not shared, or it is the last user
48265511 */
4827
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
5512
+int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
5513
+ unsigned long *addr, pte_t *ptep)
48285514 {
48295515 pgd_t *pgd = pgd_offset(mm, *addr);
48305516 p4d_t *p4d = p4d_offset(pgd, *addr);
48315517 pud_t *pud = pud_offset(p4d, *addr);
48325518
5519
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
48335520 BUG_ON(page_count(virt_to_page(ptep)) == 0);
48345521 if (page_count(virt_to_page(ptep)) == 1)
48355522 return 0;
....@@ -4837,17 +5524,26 @@
48375524 pud_clear(pud);
48385525 put_page(virt_to_page(ptep));
48395526 mm_dec_nr_pmds(mm);
4840
- *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
5527
+ /*
5528
+ * This update of passed address optimizes loops sequentially
5529
+ * processing addresses in increments of huge page size (PMD_SIZE
5530
+ * in this case). By clearing the pud, a PUD_SIZE area is unmapped.
5531
+ * Update address to the 'last page' in the cleared area so that
5532
+ * calling loop can move to first page past this area.
5533
+ */
5534
+ *addr |= PUD_SIZE - PMD_SIZE;
48415535 return 1;
48425536 }
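
A worked example of the address bump in huge_pmd_unshare() above, assuming 2 MiB PMDs inside a 1 GiB PUD (x86_64-style sizes); the starting address is arbitrary.

#include <stdio.h>

#define PMD_SIZE (1UL << 21)
#define PUD_SIZE (1UL << 30)

int main(void)
{
	unsigned long addr = 0x40000000UL;	/* some address in the cleared PUD range */

	addr |= PUD_SIZE - PMD_SIZE;		/* 'last page' of the PUD_SIZE area */
	addr += PMD_SIZE;			/* the caller's loop increment */
	printf("next address: %#lx\n", addr);	/* 0x80000000, first page past the area */
	return 0;
}
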
4843
-#define want_pmd_share() (1)
5537
+
48445538 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
4845
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
5539
+pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
5540
+ unsigned long addr, pud_t *pud)
48465541 {
48475542 return NULL;
48485543 }
48495544
4850
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
5545
+int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
5546
+ unsigned long *addr, pte_t *ptep)
48515547 {
48525548 return 0;
48535549 }
....@@ -4856,11 +5552,15 @@
48565552 unsigned long *start, unsigned long *end)
48575553 {
48585554 }
4859
-#define want_pmd_share() (0)
5555
+
5556
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
5557
+{
5558
+ return false;
5559
+}
48605560 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
48615561
48625562 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
4863
-pte_t *huge_pte_alloc(struct mm_struct *mm,
5563
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
48645564 unsigned long addr, unsigned long sz)
48655565 {
48665566 pgd_t *pgd;
....@@ -4878,8 +5578,8 @@
48785578 pte = (pte_t *)pud;
48795579 } else {
48805580 BUG_ON(sz != PMD_SIZE);
4881
- if (want_pmd_share() && pud_none(*pud))
4882
- pte = huge_pmd_share(mm, addr, pud);
5581
+ if (want_pmd_share(vma, addr) && pud_none(*pud))
5582
+ pte = huge_pmd_share(mm, vma, addr, pud);
48835583 else
48845584 pte = (pte_t *)pmd_alloc(mm, pud, addr);
48855585 }
....@@ -4893,8 +5593,8 @@
48935593 * huge_pte_offset() - Walk the page table to resolve the hugepage
48945594 * entry at address @addr
48955595 *
4896
- * Return: Pointer to page table or swap entry (PUD or PMD) for
4897
- * address @addr, or NULL if a p*d_none() entry is encountered and the
5596
+ * Return: Pointer to page table entry (PUD or PMD) for
5597
+ * address @addr, or NULL if a !p*d_present() entry is encountered and the
48985598 * size @sz doesn't match the hugepage size at this level of the page
48995599 * table.
49005600 */
....@@ -4903,8 +5603,8 @@
49035603 {
49045604 pgd_t *pgd;
49055605 p4d_t *p4d;
4906
- pud_t *pud, pud_entry;
4907
- pmd_t *pmd, pmd_entry;
5606
+ pud_t *pud;
5607
+ pmd_t *pmd;
49085608
49095609 pgd = pgd_offset(mm, addr);
49105610 if (!pgd_present(*pgd))
....@@ -4914,22 +5614,16 @@
49145614 return NULL;
49155615
49165616 pud = pud_offset(p4d, addr);
4917
- pud_entry = READ_ONCE(*pud);
4918
- if (sz != PUD_SIZE && pud_none(pud_entry))
4919
- return NULL;
4920
- /* hugepage or swap? */
4921
- if (pud_huge(pud_entry) || !pud_present(pud_entry))
5617
+ if (sz == PUD_SIZE)
5618
+ /* must be pud huge, non-present or none */
49225619 return (pte_t *)pud;
5620
+ if (!pud_present(*pud))
5621
+ return NULL;
5622
+ /* must have a valid entry and size to go further */
49235623
49245624 pmd = pmd_offset(pud, addr);
4925
- pmd_entry = READ_ONCE(*pmd);
4926
- if (sz != PMD_SIZE && pmd_none(pmd_entry))
4927
- return NULL;
4928
- /* hugepage or swap? */
4929
- if (pmd_huge(pmd_entry) || !pmd_present(pmd_entry))
4930
- return (pte_t *)pmd;
4931
-
4932
- return NULL;
5625
+ /* must be pmd huge, non-present or none */
5626
+ return (pte_t *)pmd;
49335627 }
49345628
49355629 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
....@@ -4954,30 +5648,45 @@
49545648 }
49555649
49565650 struct page * __weak
4957
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
4958
- pmd_t *pmd, int flags)
5651
+follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags)
49595652 {
5653
+ struct hstate *h = hstate_vma(vma);
5654
+ struct mm_struct *mm = vma->vm_mm;
49605655 struct page *page = NULL;
49615656 spinlock_t *ptl;
4962
- pte_t pte;
5657
+ pte_t *ptep, pte;
5658
+
5659
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
5660
+ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
5661
+ (FOLL_PIN | FOLL_GET)))
5662
+ return NULL;
5663
+
49635664 retry:
4964
- ptl = pmd_lockptr(mm, pmd);
4965
- spin_lock(ptl);
4966
- /*
4967
- * make sure that the address range covered by this pmd is not
4968
- * unmapped from other threads.
4969
- */
4970
- if (!pmd_huge(*pmd))
4971
- goto out;
4972
- pte = huge_ptep_get((pte_t *)pmd);
5665
+ ptep = huge_pte_offset(mm, address, huge_page_size(h));
5666
+ if (!ptep)
5667
+ return NULL;
5668
+
5669
+ ptl = huge_pte_lock(h, mm, ptep);
5670
+ pte = huge_ptep_get(ptep);
49735671 if (pte_present(pte)) {
4974
- page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
4975
- if (flags & FOLL_GET)
4976
- get_page(page);
5672
+ page = pte_page(pte) +
5673
+ ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
5674
+ /*
5675
+ * try_grab_page() should always succeed here, because: a) we
5676
+ * hold the pmd (ptl) lock, and b) we've just checked that the
5677
+ * huge pmd (head) page is present in the page tables. The ptl
5678
+ * prevents the head page and tail pages from being rearranged
5679
+ * in any way. So this page must be available at this point,
5680
+ * unless the page refcount overflowed:
5681
+ */
5682
+ if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
5683
+ page = NULL;
5684
+ goto out;
5685
+ }
49775686 } else {
49785687 if (is_hugetlb_entry_migration(pte)) {
49795688 spin_unlock(ptl);
4980
- __migration_entry_wait(mm, (pte_t *)pmd, ptl);
5689
+ __migration_entry_wait(mm, ptep, ptl);
49815690 goto retry;
49825691 }
49835692 /*
....@@ -4994,7 +5703,7 @@
49945703 follow_huge_pud(struct mm_struct *mm, unsigned long address,
49955704 pud_t *pud, int flags)
49965705 {
4997
- if (flags & FOLL_GET)
5706
+ if (flags & (FOLL_GET | FOLL_PIN))
49985707 return NULL;
49995708
50005709 return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
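/*
 * Worked example (editorial note): assuming 4 KiB base pages and a
 * 2 MiB PMD-level huge page, huge_page_mask(h) == ~(2 MiB - 1), so an
 * address 0x123000 bytes into the huge page yields
 * (address & ~huge_page_mask(h)) >> PAGE_SHIFT == 0x123, i.e. the
 * 0x123rd tail page of the compound page returned by pte_page().
 * follow_huge_pud() above computes the same offset with ~PUD_MASK for
 * PUD-sized (e.g. 1 GiB) pages.  The FOLL_PIN | FOLL_GET check in
 * follow_huge_pmd_pte() rejects callers asking for both reference
 * modes at once, mirroring the rule that FOLL_GET and FOLL_PIN are
 * mutually exclusive.
 */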
....@@ -5003,20 +5712,20 @@
50035712 struct page * __weak
50045713 follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
50055714 {
5006
- if (flags & FOLL_GET)
5715
+ if (flags & (FOLL_GET | FOLL_PIN))
50075716 return NULL;
50085717
50095718 return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
50105719 }
50115720
5012
-bool isolate_huge_page(struct page *page, struct list_head *list)
5721
+int isolate_hugetlb(struct page *page, struct list_head *list)
50135722 {
5014
- bool ret = true;
5723
+ int ret = 0;
50155724
50165725 spin_lock(&hugetlb_lock);
50175726 if (!PageHeadHuge(page) || !page_huge_active(page) ||
50185727 !get_page_unless_zero(page)) {
5019
- ret = false;
5728
+ ret = -EBUSY;
50205729 goto unlock;
50215730 }
50225731 clear_page_huge_active(page);
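/*
 * Editorial sketch, not part of the patch: with the rename from
 * isolate_huge_page() to isolate_hugetlb() and the switch from a bool
 * to an errno-style int return, call sites change roughly as below.
 * example_isolate_for_migration() is a hypothetical caller used purely
 * for illustration.
 */
static int example_isolate_for_migration(struct page *page,
					 struct list_head *pagelist)
{
	int err;

	/* old pattern:
	 *	if (!isolate_huge_page(page, pagelist))
	 *		return -EBUSY;
	 */
	err = isolate_hugetlb(page, pagelist);	/* 0 on success, -EBUSY otherwise */
	if (err)
		return err;

	/* the page is now off the hugetlb active list and queued on @pagelist */
	return 0;
}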
....@@ -5068,3 +5777,132 @@
50685777 spin_unlock(&hugetlb_lock);
50695778 }
50705779 }
5780
+
5781
+static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
5782
+ unsigned long start,
5783
+ unsigned long end)
5784
+{
5785
+ struct hstate *h = hstate_vma(vma);
5786
+ unsigned long sz = huge_page_size(h);
5787
+ struct mm_struct *mm = vma->vm_mm;
5788
+ struct mmu_notifier_range range;
5789
+ unsigned long address;
5790
+ spinlock_t *ptl;
5791
+ pte_t *ptep;
5792
+
5793
+ if (!(vma->vm_flags & VM_MAYSHARE))
5794
+ return;
5795
+
5796
+ if (start >= end)
5797
+ return;
5798
+
5799
+ flush_cache_range(vma, start, end);
5800
+ /*
5801
+ * No need to call adjust_range_if_pmd_sharing_possible(), because
5802
+ * we have already done the PUD_SIZE alignment.
5803
+ */
5804
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
5805
+ start, end);
5806
+ mmu_notifier_invalidate_range_start(&range);
5807
+ i_mmap_lock_write(vma->vm_file->f_mapping);
5808
+ for (address = start; address < end; address += PUD_SIZE) {
5809
+ unsigned long tmp = address;
5810
+
5811
+ ptep = huge_pte_offset(mm, address, sz);
5812
+ if (!ptep)
5813
+ continue;
5814
+ ptl = huge_pte_lock(h, mm, ptep);
5815
+ /* We don't want 'address' to be changed */
5816
+ huge_pmd_unshare(mm, vma, &tmp, ptep);
5817
+ spin_unlock(ptl);
5818
+ }
5819
+ flush_hugetlb_tlb_range(vma, start, end);
5820
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
5821
+ /*
5822
+ * No need to call mmu_notifier_invalidate_range(), see
5823
+ * Documentation/vm/mmu_notifier.rst.
5824
+ */
5825
+ mmu_notifier_invalidate_range_end(&range);
5826
+}
5827
+
5828
+/*
5829
+ * This function will unconditionally remove all the shared pmd pgtable entries
5830
+ * within the specific vma for a hugetlbfs memory range.
5831
+ */
5832
+void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
5833
+{
5834
+ hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
5835
+ ALIGN_DOWN(vma->vm_end, PUD_SIZE));
5836
+}
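/*
 * Worked example (editorial note, assuming PUD_SIZE == 1 GiB as on
 * x86-64): for a VMA spanning [0x6000_0000, 0x1_5000_0000),
 * ALIGN(vm_start, PUD_SIZE) == 0x8000_0000 and
 * ALIGN_DOWN(vm_end, PUD_SIZE) == 0x1_4000_0000, so only the three
 * fully covered 1 GiB regions are walked; PMD sharing never spans a
 * partially covered PUD entry.  If the VMA covers no full PUD range,
 * the rounded start reaches or passes the rounded end and
 * hugetlb_unshare_pmds() returns early via its start >= end check.
 */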
5837
+
5838
+#ifdef CONFIG_CMA
5839
+static bool cma_reserve_called __initdata;
5840
+
5841
+static int __init cmdline_parse_hugetlb_cma(char *p)
5842
+{
5843
+ hugetlb_cma_size = memparse(p, &p);
5844
+ return 0;
5845
+}
5846
+
5847
+early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
5848
+
5849
+void __init hugetlb_cma_reserve(int order)
5850
+{
5851
+ unsigned long size, reserved, per_node;
5852
+ int nid;
5853
+
5854
+ cma_reserve_called = true;
5855
+
5856
+ if (!hugetlb_cma_size)
5857
+ return;
5858
+
5859
+ if (hugetlb_cma_size < (PAGE_SIZE << order)) {
5860
+ pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
5861
+ (PAGE_SIZE << order) / SZ_1M);
5862
+ return;
5863
+ }
5864
+
5865
+ /*
5866
+ * If 3 GB area is requested on a machine with 4 numa nodes,
5867
+ * let's allocate 1 GB on first three nodes and ignore the last one.
5868
+ */
5869
+ per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
5870
+ pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
5871
+ hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
5872
+
5873
+ reserved = 0;
5874
+ for_each_node_state(nid, N_ONLINE) {
5875
+ int res;
5876
+ char name[CMA_MAX_NAME];
5877
+
5878
+ size = min(per_node, hugetlb_cma_size - reserved);
5879
+ size = round_up(size, PAGE_SIZE << order);
5880
+
5881
+ snprintf(name, sizeof(name), "hugetlb%d", nid);
5882
+ res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order,
5883
+ 0, false, name,
5884
+ &hugetlb_cma[nid], nid);
5885
+ if (res) {
5886
+ pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
5887
+ res, nid);
5888
+ continue;
5889
+ }
5890
+
5891
+ reserved += size;
5892
+ pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
5893
+ size / SZ_1M, nid);
5894
+
5895
+ if (reserved >= hugetlb_cma_size)
5896
+ break;
5897
+ }
5898
+}
5899
+
5900
+void __init hugetlb_cma_check(void)
5901
+{
5902
+ if (!hugetlb_cma_size || cma_reserve_called)
5903
+ return;
5904
+
5905
+ pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
5906
+}
5907
+
5908
+#endif /* CONFIG_CMA */
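/*
 * Usage note (editorial, not part of the patch): the reservation above
 * is driven by the "hugetlb_cma=" early parameter, e.g.
 *
 *	hugetlb_cma=3G
 *
 * With 1 GiB gigantic pages (PAGE_SIZE << order == 1 GiB) on a 4-node
 * machine, per_node = DIV_ROUND_UP(3G, 4) = 768 MiB, which round_up()
 * raises to 1 GiB per node, so nodes 0-2 each get a 1 GiB CMA area and
 * the loop stops once reserved >= 3 GiB, matching the example in the
 * comment inside hugetlb_cma_reserve().
 */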