.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
---|
1 | 2 | /* |
---|
2 | 3 | * linux/mm/filemap.c |
---|
3 | 4 | * |
---|
.. | .. |
---|
24 | 25 | #include <linux/pagemap.h> |
---|
25 | 26 | #include <linux/file.h> |
---|
26 | 27 | #include <linux/uio.h> |
---|
| 28 | +#include <linux/error-injection.h> |
---|
27 | 29 | #include <linux/hash.h> |
---|
28 | 30 | #include <linux/writeback.h> |
---|
29 | 31 | #include <linux/backing-dev.h> |
---|
.. | .. |
---|
38 | 40 | #include <linux/rmap.h> |
---|
39 | 41 | #include <linux/delayacct.h> |
---|
40 | 42 | #include <linux/psi.h> |
---|
| 43 | +#include <linux/ramfs.h> |
---|
| 44 | +#include <linux/page_idle.h> |
---|
| 45 | +#include <asm/pgalloc.h> |
---|
| 46 | +#include <asm/tlbflush.h> |
---|
41 | 47 | #include "internal.h" |
---|
42 | 48 | |
---|
43 | 49 | #define CREATE_TRACE_POINTS |
---|
44 | 50 | #include <trace/events/filemap.h> |
---|
| 51 | + |
---|
| 52 | +#undef CREATE_TRACE_POINTS |
---|
| 53 | +#include <trace/hooks/mm.h> |
---|
45 | 54 | |
---|
46 | 55 | /* |
---|
47 | 56 | * FIXME: remove all knowledge of the buffer layer from the core VM |
---|
.. | .. |
---|
73 | 82 | * ->i_mutex |
---|
74 | 83 | * ->i_mmap_rwsem (truncate->unmap_mapping_range) |
---|
75 | 84 | * |
---|
76 | | - * ->mmap_sem |
---|
| 85 | + * ->mmap_lock |
---|
77 | 86 | * ->i_mmap_rwsem |
---|
78 | 87 | * ->page_table_lock or pte_lock (various, mainly in memory.c) |
---|
79 | 88 | * ->i_pages lock (arch-dependent flush_dcache_mmap_lock) |
---|
80 | 89 | * |
---|
81 | | - * ->mmap_sem |
---|
| 90 | + * ->mmap_lock |
---|
82 | 91 | * ->lock_page (access_process_vm) |
---|
83 | 92 | * |
---|
84 | 93 | * ->i_mutex (generic_perform_write) |
---|
85 | | - * ->mmap_sem (fault_in_pages_readable->do_page_fault) |
---|
| 94 | + * ->mmap_lock (fault_in_pages_readable->do_page_fault) |
---|
86 | 95 | * |
---|
87 | 96 | * bdi->wb.list_lock |
---|
88 | 97 | * sb_lock (fs/fs-writeback.c) |
---|
.. | .. |
---|
98 | 107 | * ->swap_lock (try_to_unmap_one) |
---|
99 | 108 | * ->private_lock (try_to_unmap_one) |
---|
100 | 109 | * ->i_pages lock (try_to_unmap_one) |
---|
101 | | - * ->zone_lru_lock(zone) (follow_page->mark_page_accessed) |
---|
102 | | - * ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page) |
---|
| 110 | + * ->pgdat->lru_lock (follow_page->mark_page_accessed) |
---|
| 111 | + * ->pgdat->lru_lock (check_pte_range->isolate_lru_page) |
---|
103 | 112 | * ->private_lock (page_remove_rmap->set_page_dirty) |
---|
104 | 113 | * ->i_pages lock (page_remove_rmap->set_page_dirty) |
---|
105 | 114 | * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) |
---|
.. | .. |
---|
113 | 122 | * ->tasklist_lock (memory_failure, collect_procs_ao) |
---|
114 | 123 | */ |
---|
115 | 124 | |
---|
116 | | -static int page_cache_tree_insert(struct address_space *mapping, |
---|
117 | | - struct page *page, void **shadowp) |
---|
118 | | -{ |
---|
119 | | - struct radix_tree_node *node; |
---|
120 | | - void **slot; |
---|
121 | | - int error; |
---|
122 | | - |
---|
123 | | - error = __radix_tree_create(&mapping->i_pages, page->index, 0, |
---|
124 | | - &node, &slot); |
---|
125 | | - if (error) |
---|
126 | | - return error; |
---|
127 | | - if (*slot) { |
---|
128 | | - void *p; |
---|
129 | | - |
---|
130 | | - p = radix_tree_deref_slot_protected(slot, |
---|
131 | | - &mapping->i_pages.xa_lock); |
---|
132 | | - if (!radix_tree_exceptional_entry(p)) |
---|
133 | | - return -EEXIST; |
---|
134 | | - |
---|
135 | | - mapping->nrexceptional--; |
---|
136 | | - if (shadowp) |
---|
137 | | - *shadowp = p; |
---|
138 | | - } |
---|
139 | | - __radix_tree_replace(&mapping->i_pages, node, slot, page, |
---|
140 | | - workingset_lookup_update(mapping)); |
---|
141 | | - mapping->nrpages++; |
---|
142 | | - return 0; |
---|
143 | | -} |
---|
144 | | - |
---|
145 | | -static void page_cache_tree_delete(struct address_space *mapping, |
---|
| 125 | +static void page_cache_delete(struct address_space *mapping, |
---|
146 | 126 | struct page *page, void *shadow) |
---|
147 | 127 | { |
---|
148 | | - int i, nr; |
---|
| 128 | + XA_STATE(xas, &mapping->i_pages, page->index); |
---|
| 129 | + unsigned int nr = 1; |
---|
149 | 130 | |
---|
150 | | - /* hugetlb pages are represented by one entry in the radix tree */ |
---|
151 | | - nr = PageHuge(page) ? 1 : hpage_nr_pages(page); |
---|
| 131 | + mapping_set_update(&xas, mapping); |
---|
| 132 | + |
---|
| 133 | + /* hugetlb pages are represented by a single entry in the xarray */ |
---|
| 134 | + if (!PageHuge(page)) { |
---|
| 135 | + xas_set_order(&xas, page->index, compound_order(page)); |
---|
| 136 | + nr = compound_nr(page); |
---|
| 137 | + } |
---|
152 | 138 | |
---|
153 | 139 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
---|
154 | 140 | VM_BUG_ON_PAGE(PageTail(page), page); |
---|
155 | 141 | VM_BUG_ON_PAGE(nr != 1 && shadow, page); |
---|
156 | 142 | |
---|
157 | | - for (i = 0; i < nr; i++) { |
---|
158 | | - struct radix_tree_node *node; |
---|
159 | | - void **slot; |
---|
160 | | - |
---|
161 | | - __radix_tree_lookup(&mapping->i_pages, page->index + i, |
---|
162 | | - &node, &slot); |
---|
163 | | - |
---|
164 | | - VM_BUG_ON_PAGE(!node && nr != 1, page); |
---|
165 | | - |
---|
166 | | - radix_tree_clear_tags(&mapping->i_pages, node, slot); |
---|
167 | | - __radix_tree_replace(&mapping->i_pages, node, slot, shadow, |
---|
168 | | - workingset_lookup_update(mapping)); |
---|
169 | | - } |
---|
| 143 | + xas_store(&xas, shadow); |
---|
| 144 | + xas_init_marks(&xas); |
---|
170 | 145 | |
---|
171 | 146 | page->mapping = NULL; |
---|
172 | 147 | /* Leave page->index set: truncation lookup relies upon it */ |
---|
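Here the radix-tree walk over every sub-page is gone: page_cache_delete() sets the xa_state to the page's compound order, so a single xas_store() replaces the whole range with the shadow entry and xas_init_marks() clears any tags. The core of that pattern, pulled out as a standalone sketch (hypothetical cache_delete() helper; the patch itself runs with the i_pages lock already held):

```c
/* Sketch only: replace a (possibly multi-order) entry with a shadow value. */
static void cache_delete(struct xarray *xa, unsigned long index,
			 unsigned int order, void *shadow)
{
	XA_STATE(xas, xa, index);

	xas_set_order(&xas, index, order);	/* entry spans 1 << order indices */
	xas_lock_irq(&xas);
	xas_store(&xas, shadow);		/* one store covers the whole range */
	xas_init_marks(&xas);			/* drop dirty/writeback marks */
	xas_unlock_irq(&xas);
}
```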
.. | .. |
---|
194 | 169 | * invalidate any existing cleancache entries. We can't leave |
---|
195 | 170 | * stale data around in the cleancache once our page is gone |
---|
196 | 171 | */ |
---|
197 | | - if (PageUptodate(page) && PageMappedToDisk(page)) { |
---|
198 | | - count_vm_event(PGPGOUTCLEAN); |
---|
| 172 | + if (PageUptodate(page) && PageMappedToDisk(page)) |
---|
199 | 173 | cleancache_put_page(page); |
---|
200 | | - } else { |
---|
| 174 | + else |
---|
201 | 175 | cleancache_invalidate_page(mapping, page); |
---|
202 | | - } |
---|
203 | 176 | |
---|
204 | 177 | VM_BUG_ON_PAGE(PageTail(page), page); |
---|
205 | 178 | VM_BUG_ON_PAGE(page_mapped(page), page); |
---|
.. | .. |
---|
230 | 203 | if (PageHuge(page)) |
---|
231 | 204 | return; |
---|
232 | 205 | |
---|
233 | | - nr = hpage_nr_pages(page); |
---|
| 206 | + nr = thp_nr_pages(page); |
---|
234 | 207 | |
---|
235 | | - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); |
---|
| 208 | + __mod_lruvec_page_state(page, NR_FILE_PAGES, -nr); |
---|
236 | 209 | if (PageSwapBacked(page)) { |
---|
237 | | - __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); |
---|
| 210 | + __mod_lruvec_page_state(page, NR_SHMEM, -nr); |
---|
238 | 211 | if (PageTransHuge(page)) |
---|
239 | 212 | __dec_node_page_state(page, NR_SHMEM_THPS); |
---|
240 | | - } else { |
---|
241 | | - VM_BUG_ON_PAGE(PageTransHuge(page), page); |
---|
| 213 | + } else if (PageTransHuge(page)) { |
---|
| 214 | + __dec_node_page_state(page, NR_FILE_THPS); |
---|
| 215 | + filemap_nr_thps_dec(mapping); |
---|
242 | 216 | } |
---|
243 | 217 | |
---|
244 | 218 | /* |
---|
.. | .. |
---|
267 | 241 | trace_mm_filemap_delete_from_page_cache(page); |
---|
268 | 242 | |
---|
269 | 243 | unaccount_page_cache_page(mapping, page); |
---|
270 | | - page_cache_tree_delete(mapping, page, shadow); |
---|
| 244 | + page_cache_delete(mapping, page, shadow); |
---|
271 | 245 | } |
---|
272 | 246 | |
---|
273 | 247 | static void page_cache_free_page(struct address_space *mapping, |
---|
.. | .. |
---|
280 | 254 | freepage(page); |
---|
281 | 255 | |
---|
282 | 256 | if (PageTransHuge(page) && !PageHuge(page)) { |
---|
283 | | - page_ref_sub(page, HPAGE_PMD_NR); |
---|
| 257 | + page_ref_sub(page, thp_nr_pages(page)); |
---|
284 | 258 | VM_BUG_ON_PAGE(page_count(page) <= 0, page); |
---|
285 | 259 | } else { |
---|
286 | 260 | put_page(page); |
---|
.. | .. |
---|
310 | 284 | EXPORT_SYMBOL(delete_from_page_cache); |
---|
311 | 285 | |
---|
312 | 286 | /* |
---|
313 | | - * page_cache_tree_delete_batch - delete several pages from page cache |
---|
| 287 | + * page_cache_delete_batch - delete several pages from page cache |
---|
314 | 288 | * @mapping: the mapping to which pages belong |
---|
315 | 289 | * @pvec: pagevec with pages to delete |
---|
316 | 290 | * |
---|
317 | 291 | * The function walks over mapping->i_pages and removes pages passed in @pvec |
---|
318 | | - * from the mapping. The function expects @pvec to be sorted by page index. |
---|
| 292 | + * from the mapping. The function expects @pvec to be sorted by page index |
---|
| 293 | + * and is optimised for it to be dense. |
---|
319 | 294 | * It tolerates holes in @pvec (mapping entries at those indices are not |
---|
320 | 295 | * modified). The function expects only THP head pages to be present in the |
---|
321 | | - * @pvec and takes care to delete all corresponding tail pages from the |
---|
322 | | - * mapping as well. |
---|
| 296 | + * @pvec. |
---|
323 | 297 | * |
---|
324 | 298 | * The function expects the i_pages lock to be held. |
---|
325 | 299 | */ |
---|
326 | | -static void |
---|
327 | | -page_cache_tree_delete_batch(struct address_space *mapping, |
---|
| 300 | +static void page_cache_delete_batch(struct address_space *mapping, |
---|
328 | 301 | struct pagevec *pvec) |
---|
329 | 302 | { |
---|
330 | | - struct radix_tree_iter iter; |
---|
331 | | - void **slot; |
---|
| 303 | + XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index); |
---|
332 | 304 | int total_pages = 0; |
---|
333 | | - int i = 0, tail_pages = 0; |
---|
| 305 | + int i = 0; |
---|
334 | 306 | struct page *page; |
---|
335 | | - pgoff_t start; |
---|
336 | 307 | |
---|
337 | | - start = pvec->pages[0]->index; |
---|
338 | | - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { |
---|
339 | | - if (i >= pagevec_count(pvec) && !tail_pages) |
---|
| 308 | + mapping_set_update(&xas, mapping); |
---|
| 309 | + xas_for_each(&xas, page, ULONG_MAX) { |
---|
| 310 | + if (i >= pagevec_count(pvec)) |
---|
340 | 311 | break; |
---|
341 | | - page = radix_tree_deref_slot_protected(slot, |
---|
342 | | - &mapping->i_pages.xa_lock); |
---|
343 | | - if (radix_tree_exceptional_entry(page)) |
---|
| 312 | + |
---|
| 313 | + /* A swap/dax/shadow entry got inserted? Skip it. */ |
---|
| 314 | + if (xa_is_value(page)) |
---|
344 | 315 | continue; |
---|
345 | | - if (!tail_pages) { |
---|
346 | | - /* |
---|
347 | | - * Some page got inserted in our range? Skip it. We |
---|
348 | | - * have our pages locked so they are protected from |
---|
349 | | - * being removed. |
---|
350 | | - */ |
---|
351 | | - if (page != pvec->pages[i]) |
---|
352 | | - continue; |
---|
353 | | - WARN_ON_ONCE(!PageLocked(page)); |
---|
354 | | - if (PageTransHuge(page) && !PageHuge(page)) |
---|
355 | | - tail_pages = HPAGE_PMD_NR - 1; |
---|
356 | | - page->mapping = NULL; |
---|
357 | | - /* |
---|
358 | | - * Leave page->index set: truncation lookup relies |
---|
359 | | - * upon it |
---|
360 | | - */ |
---|
361 | | - i++; |
---|
362 | | - } else { |
---|
363 | | - tail_pages--; |
---|
| 316 | + /* |
---|
| 317 | + * A page got inserted in our range? Skip it. We have our |
---|
| 318 | + * pages locked so they are protected from being removed. |
---|
| 319 | + * If we see a page whose index is higher than ours, it |
---|
| 320 | + * means our page has been removed, which shouldn't be |
---|
| 321 | + * possible because we're holding the PageLock. |
---|
| 322 | + */ |
---|
| 323 | + if (page != pvec->pages[i]) { |
---|
| 324 | + VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index, |
---|
| 325 | + page); |
---|
| 326 | + continue; |
---|
364 | 327 | } |
---|
365 | | - radix_tree_clear_tags(&mapping->i_pages, iter.node, slot); |
---|
366 | | - __radix_tree_replace(&mapping->i_pages, iter.node, slot, NULL, |
---|
367 | | - workingset_lookup_update(mapping)); |
---|
| 328 | + |
---|
| 329 | + WARN_ON_ONCE(!PageLocked(page)); |
---|
| 330 | + |
---|
| 331 | + if (page->index == xas.xa_index) |
---|
| 332 | + page->mapping = NULL; |
---|
| 333 | + /* Leave page->index set: truncation lookup relies on it */ |
---|
| 334 | + |
---|
| 335 | + /* |
---|
| 336 | + * Move to the next page in the vector if this is a regular |
---|
| 337 | + * page or the index is of the last sub-page of this compound |
---|
| 338 | + * page. |
---|
| 339 | + */ |
---|
| 340 | + if (page->index + compound_nr(page) - 1 == xas.xa_index) |
---|
| 341 | + i++; |
---|
| 342 | + xas_store(&xas, NULL); |
---|
368 | 343 | total_pages++; |
---|
369 | 344 | } |
---|
370 | 345 | mapping->nrpages -= total_pages; |
---|
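The batch variant follows the same conversion: xas_for_each() walks the populated indices, value entries are skipped, and xas_store(&xas, NULL) erases each slot, so the old tail_pages bookkeeping disappears. The bare iteration pattern as a sketch (hypothetical helper; unlike the patch, it takes the lock itself):

```c
/* Sketch: clear every present entry up to @max, leaving value entries alone. */
static unsigned int cache_clear_range(struct xarray *xa, unsigned long first,
				      unsigned long max)
{
	XA_STATE(xas, xa, first);
	unsigned int cleared = 0;
	void *entry;

	xas_lock_irq(&xas);
	xas_for_each(&xas, entry, max) {
		if (xa_is_value(entry))		/* swap/DAX/shadow entry: skip */
			continue;
		xas_store(&xas, NULL);		/* erase at the current index */
		cleared++;
	}
	xas_unlock_irq(&xas);
	return cleared;
}
```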
.. | .. |
---|
385 | 360 | |
---|
386 | 361 | unaccount_page_cache_page(mapping, pvec->pages[i]); |
---|
387 | 362 | } |
---|
388 | | - page_cache_tree_delete_batch(mapping, pvec); |
---|
| 363 | + page_cache_delete_batch(mapping, pvec); |
---|
389 | 364 | xa_unlock_irqrestore(&mapping->i_pages, flags); |
---|
390 | 365 | |
---|
391 | 366 | for (i = 0; i < pagevec_count(pvec); i++) |
---|
.. | .. |
---|
430 | 405 | * opposed to a regular memory cleansing writeback. The difference between |
---|
431 | 406 | * these two operations is that if a dirty page/buffer is encountered, it must |
---|
432 | 407 | * be waited upon, and not just skipped over. |
---|
| 408 | + * |
---|
| 409 | + * Return: %0 on success, negative error code otherwise. |
---|
433 | 410 | */ |
---|
434 | 411 | int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, |
---|
435 | 412 | loff_t end, int sync_mode) |
---|
.. | .. |
---|
442 | 419 | .range_end = end, |
---|
443 | 420 | }; |
---|
444 | 421 | |
---|
445 | | - if (!mapping_cap_writeback_dirty(mapping) || |
---|
| 422 | + if (!mapping_can_writeback(mapping) || |
---|
446 | 423 | !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) |
---|
447 | 424 | return 0; |
---|
448 | 425 | |
---|
.. | .. |
---|
477 | 454 | * |
---|
478 | 455 | * This is a mostly non-blocking flush. Not suitable for data-integrity |
---|
479 | 456 | * purposes - I/O may not be started against all dirty pages. |
---|
| 457 | + * |
---|
| 458 | + * Return: %0 on success, negative error code otherwise. |
---|
480 | 459 | */ |
---|
481 | 460 | int filemap_flush(struct address_space *mapping) |
---|
482 | 461 | { |
---|
.. | .. |
---|
492 | 471 | * |
---|
493 | 472 | * Find at least one page in the range supplied, usually used to check if |
---|
494 | 473 | * direct writing in this range will trigger a writeback. |
---|
| 474 | + * |
---|
| 475 | + * Return: %true if at least one page exists in the specified range, |
---|
| 476 | + * %false otherwise. |
---|
495 | 477 | */ |
---|
496 | 478 | bool filemap_range_has_page(struct address_space *mapping, |
---|
497 | 479 | loff_t start_byte, loff_t end_byte) |
---|
498 | 480 | { |
---|
499 | | - pgoff_t index = start_byte >> PAGE_SHIFT; |
---|
500 | | - pgoff_t end = end_byte >> PAGE_SHIFT; |
---|
501 | 481 | struct page *page; |
---|
| 482 | + XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); |
---|
| 483 | + pgoff_t max = end_byte >> PAGE_SHIFT; |
---|
502 | 484 | |
---|
503 | 485 | if (end_byte < start_byte) |
---|
504 | 486 | return false; |
---|
505 | 487 | |
---|
506 | | - if (mapping->nrpages == 0) |
---|
507 | | - return false; |
---|
| 488 | + rcu_read_lock(); |
---|
| 489 | + for (;;) { |
---|
| 490 | + page = xas_find(&xas, max); |
---|
| 491 | + if (xas_retry(&xas, page)) |
---|
| 492 | + continue; |
---|
| 493 | + /* Shadow entries don't count */ |
---|
| 494 | + if (xa_is_value(page)) |
---|
| 495 | + continue; |
---|
| 496 | + /* |
---|
| 497 | + * We don't need to try to pin this page; we're about to |
---|
| 498 | + * release the RCU lock anyway. It is enough to know that |
---|
| 499 | + * there was a page here recently. |
---|
| 500 | + */ |
---|
| 501 | + break; |
---|
| 502 | + } |
---|
| 503 | + rcu_read_unlock(); |
---|
508 | 504 | |
---|
509 | | - if (!find_get_pages_range(mapping, &index, end, 1, &page)) |
---|
510 | | - return false; |
---|
511 | | - put_page(page); |
---|
512 | | - return true; |
---|
| 505 | + return page != NULL; |
---|
513 | 506 | } |
---|
514 | 507 | EXPORT_SYMBOL(filemap_range_has_page); |
---|
515 | 508 | |
---|
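filemap_range_has_page() no longer takes a page reference through find_get_pages_range(); it only needs to know that a page was present a moment ago, so an RCU-protected xas_find() probe is enough. The same probe pattern in isolation (hypothetical helper name):

```c
/* Sketch: report whether any non-value entry exists in [index, max]. */
static bool cache_range_populated(struct xarray *xa, unsigned long index,
				  unsigned long max)
{
	XA_STATE(xas, xa, index);
	void *entry;

	rcu_read_lock();
	for (;;) {
		entry = xas_find(&xas, max);
		if (xas_retry(&xas, entry))	/* raced with a node restructure */
			continue;
		if (xa_is_value(entry))		/* shadow/swap entries don't count */
			continue;
		break;				/* NULL (nothing found) or a real page */
	}
	rcu_read_unlock();

	return entry != NULL;
}
```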
.. | .. |
---|
557 | 550 | * Since the error status of the address space is cleared by this function, |
---|
558 | 551 | * callers are responsible for checking the return value and handling and/or |
---|
559 | 552 | * reporting the error. |
---|
| 553 | + * |
---|
| 554 | + * Return: error status of the address space. |
---|
560 | 555 | */ |
---|
561 | 556 | int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, |
---|
562 | 557 | loff_t end_byte) |
---|
.. | .. |
---|
601 | 596 | * Since the error status of the file is advanced by this function, |
---|
602 | 597 | * callers are responsible for checking the return value and handling and/or |
---|
603 | 598 | * reporting the error. |
---|
| 599 | + * |
---|
| 600 | + * Return: error status of the address space vs. the file->f_wb_err cursor. |
---|
604 | 601 | */ |
---|
605 | 602 | int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte) |
---|
606 | 603 | { |
---|
.. | .. |
---|
622 | 619 | * Use this function if callers don't handle errors themselves. Expected |
---|
623 | 620 | * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), |
---|
624 | 621 | * fsfreeze(8) |
---|
| 622 | + * |
---|
| 623 | + * Return: error status of the address space. |
---|
625 | 624 | */ |
---|
626 | 625 | int filemap_fdatawait_keep_errors(struct address_space *mapping) |
---|
627 | 626 | { |
---|
.. | .. |
---|
630 | 629 | } |
---|
631 | 630 | EXPORT_SYMBOL(filemap_fdatawait_keep_errors); |
---|
632 | 631 | |
---|
| 632 | +/* Returns true if writeback might be needed or already in progress. */ |
---|
633 | 633 | static bool mapping_needs_writeback(struct address_space *mapping) |
---|
634 | 634 | { |
---|
635 | | - return (!dax_mapping(mapping) && mapping->nrpages) || |
---|
636 | | - (dax_mapping(mapping) && mapping->nrexceptional); |
---|
637 | | -} |
---|
| 635 | + if (dax_mapping(mapping)) |
---|
| 636 | + return mapping->nrexceptional; |
---|
638 | 637 | |
---|
639 | | -int filemap_write_and_wait(struct address_space *mapping) |
---|
640 | | -{ |
---|
641 | | - int err = 0; |
---|
642 | | - |
---|
643 | | - if (mapping_needs_writeback(mapping)) { |
---|
644 | | - err = filemap_fdatawrite(mapping); |
---|
645 | | - /* |
---|
646 | | - * Even if the above returned error, the pages may be |
---|
647 | | - * written partially (e.g. -ENOSPC), so we wait for it. |
---|
648 | | - * But the -EIO is special case, it may indicate the worst |
---|
649 | | - * thing (e.g. bug) happened, so we avoid waiting for it. |
---|
650 | | - */ |
---|
651 | | - if (err != -EIO) { |
---|
652 | | - int err2 = filemap_fdatawait(mapping); |
---|
653 | | - if (!err) |
---|
654 | | - err = err2; |
---|
655 | | - } else { |
---|
656 | | - /* Clear any previously stored errors */ |
---|
657 | | - filemap_check_errors(mapping); |
---|
658 | | - } |
---|
659 | | - } else { |
---|
660 | | - err = filemap_check_errors(mapping); |
---|
661 | | - } |
---|
662 | | - return err; |
---|
| 638 | + return mapping->nrpages; |
---|
663 | 639 | } |
---|
664 | | -EXPORT_SYMBOL(filemap_write_and_wait); |
---|
665 | 640 | |
---|
666 | 641 | /** |
---|
667 | 642 | * filemap_write_and_wait_range - write out & wait on a file range |
---|
.. | .. |
---|
673 | 648 | * |
---|
674 | 649 | * Note that @lend is inclusive (describes the last byte to be written) so |
---|
675 | 650 | * that this function can be used to write to the very end-of-file (end = -1). |
---|
| 651 | + * |
---|
| 652 | + * Return: error status of the address space. |
---|
676 | 653 | */ |
---|
677 | 654 | int filemap_write_and_wait_range(struct address_space *mapping, |
---|
678 | 655 | loff_t lstart, loff_t lend) |
---|
.. | .. |
---|
682 | 659 | if (mapping_needs_writeback(mapping)) { |
---|
683 | 660 | err = __filemap_fdatawrite_range(mapping, lstart, lend, |
---|
684 | 661 | WB_SYNC_ALL); |
---|
685 | | - /* See comment of filemap_write_and_wait() */ |
---|
| 662 | + /* |
---|
| 663 | + * Even if the above returned error, the pages may be |
---|
| 664 | + * written partially (e.g. -ENOSPC), so we wait for it. |
---|
| 665 | + * But the -EIO is special case, it may indicate the worst |
---|
| 666 | + * thing (e.g. bug) happened, so we avoid waiting for it. |
---|
| 667 | + */ |
---|
686 | 668 | if (err != -EIO) { |
---|
687 | 669 | int err2 = filemap_fdatawait_range(mapping, |
---|
688 | 670 | lstart, lend); |
---|
.. | .. |
---|
728 | 710 | * While we handle mapping->wb_err with atomic operations, the f_wb_err |
---|
729 | 711 | * value is protected by the f_lock since we must ensure that it reflects |
---|
730 | 712 | * the latest value swapped in for this file descriptor. |
---|
| 713 | + * |
---|
| 714 | + * Return: %0 on success, negative error code otherwise. |
---|
731 | 715 | */ |
---|
732 | 716 | int file_check_and_advance_wb_err(struct file *file) |
---|
733 | 717 | { |
---|
.. | .. |
---|
770 | 754 | * |
---|
771 | 755 | * After writing out and waiting on the data, we check and advance the |
---|
772 | 756 | * f_wb_err cursor to the latest value, and return any errors detected there. |
---|
| 757 | + * |
---|
| 758 | + * Return: %0 on success, negative error code otherwise. |
---|
773 | 759 | */ |
---|
774 | 760 | int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) |
---|
775 | 761 | { |
---|
.. | .. |
---|
802 | 788 | * locked. This function does not add the new page to the LRU, the |
---|
803 | 789 | * caller must do that. |
---|
804 | 790 | * |
---|
805 | | - * The remove + add is atomic. The only way this function can fail is |
---|
806 | | - * memory allocation failure. |
---|
| 791 | + * The remove + add is atomic. This function cannot fail. |
---|
| 792 | + * |
---|
| 793 | + * Return: %0 |
---|
807 | 794 | */ |
---|
808 | 795 | int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) |
---|
809 | 796 | { |
---|
810 | | - int error; |
---|
| 797 | + struct address_space *mapping = old->mapping; |
---|
| 798 | + void (*freepage)(struct page *) = mapping->a_ops->freepage; |
---|
| 799 | + pgoff_t offset = old->index; |
---|
| 800 | + XA_STATE(xas, &mapping->i_pages, offset); |
---|
| 801 | + unsigned long flags; |
---|
811 | 802 | |
---|
812 | 803 | VM_BUG_ON_PAGE(!PageLocked(old), old); |
---|
813 | 804 | VM_BUG_ON_PAGE(!PageLocked(new), new); |
---|
814 | 805 | VM_BUG_ON_PAGE(new->mapping, new); |
---|
815 | 806 | |
---|
816 | | - error = radix_tree_preload(gfp_mask & GFP_RECLAIM_MASK); |
---|
817 | | - if (!error) { |
---|
818 | | - struct address_space *mapping = old->mapping; |
---|
819 | | - void (*freepage)(struct page *); |
---|
820 | | - unsigned long flags; |
---|
| 807 | + get_page(new); |
---|
| 808 | + new->mapping = mapping; |
---|
| 809 | + new->index = offset; |
---|
821 | 810 | |
---|
822 | | - pgoff_t offset = old->index; |
---|
823 | | - freepage = mapping->a_ops->freepage; |
---|
| 811 | + mem_cgroup_migrate(old, new); |
---|
824 | 812 | |
---|
825 | | - get_page(new); |
---|
826 | | - new->mapping = mapping; |
---|
827 | | - new->index = offset; |
---|
| 813 | + xas_lock_irqsave(&xas, flags); |
---|
| 814 | + xas_store(&xas, new); |
---|
828 | 815 | |
---|
829 | | - xa_lock_irqsave(&mapping->i_pages, flags); |
---|
830 | | - __delete_from_page_cache(old, NULL); |
---|
831 | | - error = page_cache_tree_insert(mapping, new, NULL); |
---|
832 | | - BUG_ON(error); |
---|
| 816 | + old->mapping = NULL; |
---|
| 817 | + /* hugetlb pages do not participate in page cache accounting. */ |
---|
| 818 | + if (!PageHuge(old)) |
---|
| 819 | + __dec_lruvec_page_state(old, NR_FILE_PAGES); |
---|
| 820 | + if (!PageHuge(new)) |
---|
| 821 | + __inc_lruvec_page_state(new, NR_FILE_PAGES); |
---|
| 822 | + if (PageSwapBacked(old)) |
---|
| 823 | + __dec_lruvec_page_state(old, NR_SHMEM); |
---|
| 824 | + if (PageSwapBacked(new)) |
---|
| 825 | + __inc_lruvec_page_state(new, NR_SHMEM); |
---|
| 826 | + xas_unlock_irqrestore(&xas, flags); |
---|
| 827 | + if (freepage) |
---|
| 828 | + freepage(old); |
---|
| 829 | + put_page(old); |
---|
833 | 830 | |
---|
834 | | - /* |
---|
835 | | - * hugetlb pages do not participate in page cache accounting. |
---|
836 | | - */ |
---|
837 | | - if (!PageHuge(new)) |
---|
838 | | - __inc_node_page_state(new, NR_FILE_PAGES); |
---|
839 | | - if (PageSwapBacked(new)) |
---|
840 | | - __inc_node_page_state(new, NR_SHMEM); |
---|
841 | | - xa_unlock_irqrestore(&mapping->i_pages, flags); |
---|
842 | | - mem_cgroup_migrate(old, new); |
---|
843 | | - radix_tree_preload_end(); |
---|
844 | | - if (freepage) |
---|
845 | | - freepage(old); |
---|
846 | | - put_page(old); |
---|
847 | | - } |
---|
848 | | - |
---|
849 | | - return error; |
---|
| 831 | + return 0; |
---|
850 | 832 | } |
---|
851 | 833 | EXPORT_SYMBOL_GPL(replace_page_cache_page); |
---|
852 | 834 | |
---|
853 | | -static int __add_to_page_cache_locked(struct page *page, |
---|
854 | | - struct address_space *mapping, |
---|
855 | | - pgoff_t offset, gfp_t gfp_mask, |
---|
856 | | - void **shadowp) |
---|
| 835 | +noinline int __add_to_page_cache_locked(struct page *page, |
---|
| 836 | + struct address_space *mapping, |
---|
| 837 | + pgoff_t offset, gfp_t gfp, |
---|
| 838 | + void **shadowp) |
---|
857 | 839 | { |
---|
| 840 | + XA_STATE(xas, &mapping->i_pages, offset); |
---|
858 | 841 | int huge = PageHuge(page); |
---|
859 | | - struct mem_cgroup *memcg; |
---|
860 | 842 | int error; |
---|
| 843 | + bool charged = false; |
---|
861 | 844 | |
---|
862 | 845 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
---|
863 | 846 | VM_BUG_ON_PAGE(PageSwapBacked(page), page); |
---|
864 | | - |
---|
865 | | - if (!huge) { |
---|
866 | | - error = mem_cgroup_try_charge(page, current->mm, |
---|
867 | | - gfp_mask, &memcg, false); |
---|
868 | | - if (error) |
---|
869 | | - return error; |
---|
870 | | - } |
---|
871 | | - |
---|
872 | | - error = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK); |
---|
873 | | - if (error) { |
---|
874 | | - if (!huge) |
---|
875 | | - mem_cgroup_cancel_charge(page, memcg, false); |
---|
876 | | - return error; |
---|
877 | | - } |
---|
| 847 | + mapping_set_update(&xas, mapping); |
---|
878 | 848 | |
---|
879 | 849 | get_page(page); |
---|
880 | 850 | page->mapping = mapping; |
---|
881 | 851 | page->index = offset; |
---|
882 | 852 | |
---|
883 | | - xa_lock_irq(&mapping->i_pages); |
---|
884 | | - error = page_cache_tree_insert(mapping, page, shadowp); |
---|
885 | | - radix_tree_preload_end(); |
---|
886 | | - if (unlikely(error)) |
---|
887 | | - goto err_insert; |
---|
| 853 | + if (!huge) { |
---|
| 854 | + error = mem_cgroup_charge(page, current->mm, gfp); |
---|
| 855 | + if (error) |
---|
| 856 | + goto error; |
---|
| 857 | + charged = true; |
---|
| 858 | + } |
---|
888 | 859 | |
---|
889 | | - /* hugetlb pages do not participate in page cache accounting. */ |
---|
890 | | - if (!huge) |
---|
891 | | - __inc_node_page_state(page, NR_FILE_PAGES); |
---|
892 | | - xa_unlock_irq(&mapping->i_pages); |
---|
893 | | - if (!huge) |
---|
894 | | - mem_cgroup_commit_charge(page, memcg, false, false); |
---|
| 860 | + gfp &= GFP_RECLAIM_MASK; |
---|
| 861 | + |
---|
| 862 | + do { |
---|
| 863 | + unsigned int order = xa_get_order(xas.xa, xas.xa_index); |
---|
| 864 | + void *entry, *old = NULL; |
---|
| 865 | + |
---|
| 866 | + if (order > thp_order(page)) |
---|
| 867 | + xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index), |
---|
| 868 | + order, gfp); |
---|
| 869 | + xas_lock_irq(&xas); |
---|
| 870 | + xas_for_each_conflict(&xas, entry) { |
---|
| 871 | + old = entry; |
---|
| 872 | + if (!xa_is_value(entry)) { |
---|
| 873 | + xas_set_err(&xas, -EEXIST); |
---|
| 874 | + goto unlock; |
---|
| 875 | + } |
---|
| 876 | + } |
---|
| 877 | + |
---|
| 878 | + if (old) { |
---|
| 879 | + if (shadowp) |
---|
| 880 | + *shadowp = old; |
---|
| 881 | + /* entry may have been split before we acquired lock */ |
---|
| 882 | + order = xa_get_order(xas.xa, xas.xa_index); |
---|
| 883 | + if (order > thp_order(page)) { |
---|
| 884 | + xas_split(&xas, old, order); |
---|
| 885 | + xas_reset(&xas); |
---|
| 886 | + } |
---|
| 887 | + } |
---|
| 888 | + |
---|
| 889 | + xas_store(&xas, page); |
---|
| 890 | + if (xas_error(&xas)) |
---|
| 891 | + goto unlock; |
---|
| 892 | + |
---|
| 893 | + if (old) |
---|
| 894 | + mapping->nrexceptional--; |
---|
| 895 | + mapping->nrpages++; |
---|
| 896 | + |
---|
| 897 | + /* hugetlb pages do not participate in page cache accounting */ |
---|
| 898 | + if (!huge) |
---|
| 899 | + __inc_lruvec_page_state(page, NR_FILE_PAGES); |
---|
| 900 | +unlock: |
---|
| 901 | + xas_unlock_irq(&xas); |
---|
| 902 | + } while (xas_nomem(&xas, gfp)); |
---|
| 903 | + |
---|
| 904 | + if (xas_error(&xas)) { |
---|
| 905 | + error = xas_error(&xas); |
---|
| 906 | + if (charged) |
---|
| 907 | + mem_cgroup_uncharge(page); |
---|
| 908 | + goto error; |
---|
| 909 | + } |
---|
| 910 | + |
---|
895 | 911 | trace_mm_filemap_add_to_page_cache(page); |
---|
896 | 912 | return 0; |
---|
897 | | -err_insert: |
---|
| 913 | +error: |
---|
898 | 914 | page->mapping = NULL; |
---|
899 | 915 | /* Leave page->index set: truncation relies upon it */ |
---|
900 | | - xa_unlock_irq(&mapping->i_pages); |
---|
901 | | - if (!huge) |
---|
902 | | - mem_cgroup_cancel_charge(page, memcg, false); |
---|
903 | 916 | put_page(page); |
---|
904 | 917 | return error; |
---|
905 | 918 | } |
---|
| 919 | +ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO); |
---|
906 | 920 | |
---|
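__add_to_page_cache_locked() also loses the radix_tree_maybe_preload() dance in favour of the XArray's allocate-and-retry idiom: try the store under the lock, and if the XArray ran out of memory, drop the lock, let xas_nomem() allocate with the caller's gfp mask, and retry. The bare skeleton of that idiom (a sketch with a hypothetical cache_insert(); the real function additionally handles conflicts, THP splitting and memcg charging):

```c
/* Sketch: insert @item at @index, allocating xarray nodes outside the lock. */
static int cache_insert(struct xarray *xa, unsigned long index,
			void *item, gfp_t gfp)
{
	XA_STATE(xas, xa, index);

	do {
		xas_lock_irq(&xas);
		if (xas_load(&xas))		/* something is already there */
			xas_set_err(&xas, -EEXIST);
		else
			xas_store(&xas, item);	/* may set -ENOMEM on the xa_state */
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));		/* allocate and retry if needed */

	return xas_error(&xas);
}
```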
907 | 921 | /** |
---|
908 | 922 | * add_to_page_cache_locked - add a locked page to the pagecache |
---|
.. | .. |
---|
913 | 927 | * |
---|
914 | 928 | * This function is used to add a page to the pagecache. It must be locked. |
---|
915 | 929 | * This function does not add the page to the LRU. The caller must do that. |
---|
| 930 | + * |
---|
| 931 | + * Return: %0 on success, negative error code otherwise. |
---|
916 | 932 | */ |
---|
917 | 933 | int add_to_page_cache_locked(struct page *page, struct address_space *mapping, |
---|
918 | 934 | pgoff_t offset, gfp_t gfp_mask) |
---|
.. | .. |
---|
1001 | 1017 | page_writeback_init(); |
---|
1002 | 1018 | } |
---|
1003 | 1019 | |
---|
1004 | | -/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */ |
---|
1005 | | -struct wait_page_key { |
---|
1006 | | - struct page *page; |
---|
1007 | | - int bit_nr; |
---|
1008 | | - int page_match; |
---|
1009 | | -}; |
---|
1010 | | - |
---|
1011 | | -struct wait_page_queue { |
---|
1012 | | - struct page *page; |
---|
1013 | | - int bit_nr; |
---|
1014 | | - wait_queue_entry_t wait; |
---|
1015 | | -}; |
---|
1016 | | - |
---|
| 1020 | +/* |
---|
| 1021 | + * The page wait code treats the "wait->flags" somewhat unusually, because |
---|
| 1022 | + * we have multiple different kinds of waits, not just the usual "exclusive" |
---|
| 1023 | + * one. |
---|
| 1024 | + * |
---|
| 1025 | + * We have: |
---|
| 1026 | + * |
---|
| 1027 | + * (a) no special bits set: |
---|
| 1028 | + * |
---|
| 1029 | + * We're just waiting for the bit to be released, and when a waker |
---|
| 1030 | + * calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up, |
---|
| 1031 | + * and remove it from the wait queue. |
---|
| 1032 | + * |
---|
| 1033 | + * Simple and straightforward. |
---|
| 1034 | + * |
---|
| 1035 | + * (b) WQ_FLAG_EXCLUSIVE: |
---|
| 1036 | + * |
---|
| 1037 | + * The waiter is waiting to get the lock, and only one waiter should |
---|
| 1038 | + * be woken up to avoid any thundering herd behavior. We'll set the |
---|
| 1039 | + * WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue. |
---|
| 1040 | + * |
---|
| 1041 | + * This is the traditional exclusive wait. |
---|
| 1042 | + * |
---|
| 1043 | + * (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM: |
---|
| 1044 | + * |
---|
| 1045 | + * The waiter is waiting to get the bit, and additionally wants the |
---|
| 1046 | + * lock to be transferred to it for fair lock behavior. If the lock |
---|
| 1047 | + * cannot be taken, we stop walking the wait queue without waking |
---|
| 1048 | + * the waiter. |
---|
| 1049 | + * |
---|
| 1050 | + * This is the "fair lock handoff" case, and in addition to setting |
---|
| 1051 | + * WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see |
---|
| 1052 | + * that it now has the lock. |
---|
| 1053 | + */ |
---|
1017 | 1054 | static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) |
---|
1018 | 1055 | { |
---|
| 1056 | + unsigned int flags; |
---|
1019 | 1057 | struct wait_page_key *key = arg; |
---|
1020 | 1058 | struct wait_page_queue *wait_page |
---|
1021 | 1059 | = container_of(wait, struct wait_page_queue, wait); |
---|
1022 | 1060 | |
---|
1023 | | - if (wait_page->page != key->page) |
---|
1024 | | - return 0; |
---|
1025 | | - key->page_match = 1; |
---|
1026 | | - |
---|
1027 | | - if (wait_page->bit_nr != key->bit_nr) |
---|
| 1061 | + if (!wake_page_match(wait_page, key)) |
---|
1028 | 1062 | return 0; |
---|
1029 | 1063 | |
---|
1030 | | - /* Stop walking if it's locked */ |
---|
1031 | | - if (test_bit(key->bit_nr, &key->page->flags)) |
---|
1032 | | - return -1; |
---|
| 1064 | + /* |
---|
| 1065 | + * If it's a lock handoff wait, we get the bit for it, and |
---|
| 1066 | + * stop walking (and do not wake it up) if we can't. |
---|
| 1067 | + */ |
---|
| 1068 | + flags = wait->flags; |
---|
| 1069 | + if (flags & WQ_FLAG_EXCLUSIVE) { |
---|
| 1070 | + if (test_bit(key->bit_nr, &key->page->flags)) |
---|
| 1071 | + return -1; |
---|
| 1072 | + if (flags & WQ_FLAG_CUSTOM) { |
---|
| 1073 | + if (test_and_set_bit(key->bit_nr, &key->page->flags)) |
---|
| 1074 | + return -1; |
---|
| 1075 | + flags |= WQ_FLAG_DONE; |
---|
| 1076 | + } |
---|
| 1077 | + } |
---|
1033 | 1078 | |
---|
1034 | | - return autoremove_wake_function(wait, mode, sync, key); |
---|
| 1079 | + /* |
---|
| 1080 | + * We are holding the wait-queue lock, but the waiter that |
---|
| 1081 | + * is waiting for this will be checking the flags without |
---|
| 1082 | + * any locking. |
---|
| 1083 | + * |
---|
| 1084 | + * So update the flags atomically, and wake up the waiter |
---|
| 1085 | + * afterwards to avoid any races. This store-release pairs |
---|
| 1086 | + * with the load-acquire in wait_on_page_bit_common(). |
---|
| 1087 | + */ |
---|
| 1088 | + smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN); |
---|
| 1089 | + wake_up_state(wait->private, mode); |
---|
| 1090 | + |
---|
| 1091 | + /* |
---|
| 1092 | + * Ok, we have successfully done what we're waiting for, |
---|
| 1093 | + * and we can unconditionally remove the wait entry. |
---|
| 1094 | + * |
---|
| 1095 | + * Note that this pairs with the "finish_wait()" in the |
---|
| 1096 | + * waiter, and has to be the absolute last thing we do. |
---|
| 1097 | + * After this list_del_init(&wait->entry) the wait entry |
---|
| 1098 | + * might be de-allocated and the process might even have |
---|
| 1099 | + * exited. |
---|
| 1100 | + */ |
---|
| 1101 | + list_del_init_careful(&wait->entry); |
---|
| 1102 | + return (flags & WQ_FLAG_EXCLUSIVE) != 0; |
---|
1035 | 1103 | } |
---|
1036 | 1104 | |
---|
1037 | 1105 | static void wake_up_page_bit(struct page *page, int bit_nr) |
---|
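The handshake described above depends on ordering rather than on the waiter taking the waitqueue lock: the waker publishes the final flags with a store-release before waking the task, and the waiter polls them with a load-acquire. Stripped of the surrounding queue handling, the pairing looks roughly like this (hypothetical helpers, with flagsp standing in for &wait->flags):

```c
/* Waker side: runs under the waitqueue lock, as in wake_page_function(). */
static void publish_woken(unsigned long *flagsp, unsigned long flags)
{
	/* Everything written before this is visible once WOKEN is observed. */
	smp_store_release(flagsp, flags | WQ_FLAG_WOKEN);
}

/* Waiter side: checked locklessly in the sleep loop. */
static bool saw_wakeup(unsigned long *flagsp)
{
	/* Pairs with the store-release above. */
	return smp_load_acquire(flagsp) & WQ_FLAG_WOKEN;
}
```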
.. | .. |
---|
1095 | 1163 | wake_up_page_bit(page, bit); |
---|
1096 | 1164 | } |
---|
1097 | 1165 | |
---|
1098 | | -static inline __sched int wait_on_page_bit_common(wait_queue_head_t *q, |
---|
1099 | | - struct page *page, int bit_nr, int state, bool lock) |
---|
| 1166 | +/* |
---|
| 1167 | + * A choice of three behaviors for wait_on_page_bit_common(): |
---|
| 1168 | + */ |
---|
| 1169 | +enum behavior { |
---|
| 1170 | + EXCLUSIVE, /* Hold ref to page and take the bit when woken, like |
---|
| 1171 | + * __lock_page() waiting on then setting PG_locked. |
---|
| 1172 | + */ |
---|
| 1173 | + SHARED, /* Hold ref to page and check the bit when woken, like |
---|
| 1174 | + * wait_on_page_writeback() waiting on PG_writeback. |
---|
| 1175 | + */ |
---|
| 1176 | + DROP, /* Drop ref to page before wait, no check when woken, |
---|
| 1177 | + * like put_and_wait_on_page_locked() on PG_locked. |
---|
| 1178 | + */ |
---|
| 1179 | +}; |
---|
| 1180 | + |
---|
| 1181 | +/* |
---|
| 1182 | + * Attempt to check (or get) the page bit, and mark us done |
---|
| 1183 | + * if successful. |
---|
| 1184 | + */ |
---|
| 1185 | +static inline bool trylock_page_bit_common(struct page *page, int bit_nr, |
---|
| 1186 | + struct wait_queue_entry *wait) |
---|
1100 | 1187 | { |
---|
| 1188 | + if (wait->flags & WQ_FLAG_EXCLUSIVE) { |
---|
| 1189 | + if (test_and_set_bit(bit_nr, &page->flags)) |
---|
| 1190 | + return false; |
---|
| 1191 | + } else if (test_bit(bit_nr, &page->flags)) |
---|
| 1192 | + return false; |
---|
| 1193 | + |
---|
| 1194 | + wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE; |
---|
| 1195 | + return true; |
---|
| 1196 | +} |
---|
| 1197 | + |
---|
| 1198 | +/* How many times do we accept lock stealing from under a waiter? */ |
---|
| 1199 | +int sysctl_page_lock_unfairness = 5; |
---|
| 1200 | + |
---|
| 1201 | +static inline __sched int wait_on_page_bit_common(wait_queue_head_t *q, |
---|
| 1202 | + struct page *page, int bit_nr, int state, enum behavior behavior) |
---|
| 1203 | +{ |
---|
| 1204 | + int unfairness = sysctl_page_lock_unfairness; |
---|
1101 | 1205 | struct wait_page_queue wait_page; |
---|
1102 | 1206 | wait_queue_entry_t *wait = &wait_page.wait; |
---|
1103 | 1207 | bool thrashing = false; |
---|
| 1208 | + bool delayacct = false; |
---|
1104 | 1209 | unsigned long pflags; |
---|
1105 | | - int ret = 0; |
---|
1106 | 1210 | |
---|
1107 | 1211 | if (bit_nr == PG_locked && |
---|
1108 | 1212 | !PageUptodate(page) && PageWorkingset(page)) { |
---|
1109 | | - if (!PageSwapBacked(page)) |
---|
| 1213 | + if (!PageSwapBacked(page)) { |
---|
1110 | 1214 | delayacct_thrashing_start(); |
---|
| 1215 | + delayacct = true; |
---|
| 1216 | + } |
---|
1111 | 1217 | psi_memstall_enter(&pflags); |
---|
1112 | 1218 | thrashing = true; |
---|
1113 | 1219 | } |
---|
1114 | 1220 | |
---|
1115 | 1221 | init_wait(wait); |
---|
1116 | | - wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0; |
---|
1117 | 1222 | wait->func = wake_page_function; |
---|
1118 | 1223 | wait_page.page = page; |
---|
1119 | 1224 | wait_page.bit_nr = bit_nr; |
---|
1120 | 1225 | |
---|
1121 | | - for (;;) { |
---|
1122 | | - spin_lock_irq(&q->lock); |
---|
| 1226 | +repeat: |
---|
| 1227 | + wait->flags = 0; |
---|
| 1228 | + if (behavior == EXCLUSIVE) { |
---|
| 1229 | + wait->flags = WQ_FLAG_EXCLUSIVE; |
---|
| 1230 | + if (--unfairness < 0) |
---|
| 1231 | + wait->flags |= WQ_FLAG_CUSTOM; |
---|
| 1232 | + } |
---|
1123 | 1233 | |
---|
1124 | | - if (likely(list_empty(&wait->entry))) { |
---|
1125 | | - __add_wait_queue_entry_tail(q, wait); |
---|
1126 | | - SetPageWaiters(page); |
---|
1127 | | - } |
---|
| 1234 | + /* |
---|
| 1235 | + * Do one last check whether we can get the |
---|
| 1236 | + * page bit synchronously. |
---|
| 1237 | + * |
---|
| 1238 | + * Do the SetPageWaiters() marking before that |
---|
| 1239 | + * to let any waker we _just_ missed know they |
---|
| 1240 | + * need to wake us up (otherwise they'll never |
---|
| 1241 | + * even go to the slow case that looks at the |
---|
| 1242 | + * page queue), and add ourselves to the wait |
---|
| 1243 | + * queue if we need to sleep. |
---|
| 1244 | + * |
---|
| 1245 | + * This part needs to be done under the queue |
---|
| 1246 | + * lock to avoid races. |
---|
| 1247 | + */ |
---|
| 1248 | + spin_lock_irq(&q->lock); |
---|
| 1249 | + SetPageWaiters(page); |
---|
| 1250 | + if (!trylock_page_bit_common(page, bit_nr, wait)) |
---|
| 1251 | + __add_wait_queue_entry_tail(q, wait); |
---|
| 1252 | + spin_unlock_irq(&q->lock); |
---|
| 1253 | + |
---|
| 1254 | + /* |
---|
| 1255 | + * From now on, all the logic will be based on |
---|
| 1256 | + * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to |
---|
| 1257 | + * see whether the page bit testing has already |
---|
| 1258 | + * been done by the wake function. |
---|
| 1259 | + * |
---|
| 1260 | + * We can drop our reference to the page. |
---|
| 1261 | + */ |
---|
| 1262 | + if (behavior == DROP) |
---|
| 1263 | + put_page(page); |
---|
| 1264 | + |
---|
| 1265 | + /* |
---|
| 1266 | + * Note that until the "finish_wait()", or until |
---|
| 1267 | + * we see the WQ_FLAG_WOKEN flag, we need to |
---|
| 1268 | + * be very careful with the 'wait->flags', because |
---|
| 1269 | + * we may race with a waker that sets them. |
---|
| 1270 | + */ |
---|
| 1271 | + for (;;) { |
---|
| 1272 | + unsigned int flags; |
---|
1128 | 1273 | |
---|
1129 | 1274 | set_current_state(state); |
---|
1130 | 1275 | |
---|
1131 | | - spin_unlock_irq(&q->lock); |
---|
| 1276 | + /* Loop until we've been woken or interrupted */ |
---|
| 1277 | + flags = smp_load_acquire(&wait->flags); |
---|
| 1278 | + if (!(flags & WQ_FLAG_WOKEN)) { |
---|
| 1279 | + if (signal_pending_state(state, current)) |
---|
| 1280 | + break; |
---|
1132 | 1281 | |
---|
1133 | | - if (likely(test_bit(bit_nr, &page->flags))) { |
---|
1134 | 1282 | io_schedule(); |
---|
| 1283 | + continue; |
---|
1135 | 1284 | } |
---|
1136 | 1285 | |
---|
1137 | | - if (lock) { |
---|
1138 | | - if (!test_and_set_bit_lock(bit_nr, &page->flags)) |
---|
1139 | | - break; |
---|
1140 | | - } else { |
---|
1141 | | - if (!test_bit(bit_nr, &page->flags)) |
---|
1142 | | - break; |
---|
1143 | | - } |
---|
1144 | | - |
---|
1145 | | - if (unlikely(signal_pending_state(state, current))) { |
---|
1146 | | - ret = -EINTR; |
---|
| 1286 | + /* If we were non-exclusive, we're done */ |
---|
| 1287 | + if (behavior != EXCLUSIVE) |
---|
1147 | 1288 | break; |
---|
1148 | | - } |
---|
| 1289 | + |
---|
| 1290 | + /* If the waker got the lock for us, we're done */ |
---|
| 1291 | + if (flags & WQ_FLAG_DONE) |
---|
| 1292 | + break; |
---|
| 1293 | + |
---|
| 1294 | + /* |
---|
| 1295 | + * Otherwise, if we're getting the lock, we need to |
---|
| 1296 | + * try to get it ourselves. |
---|
| 1297 | + * |
---|
| 1298 | + * And if that fails, we'll have to retry this all. |
---|
| 1299 | + */ |
---|
| 1300 | + if (unlikely(test_and_set_bit(bit_nr, &page->flags))) |
---|
| 1301 | + goto repeat; |
---|
| 1302 | + |
---|
| 1303 | + wait->flags |= WQ_FLAG_DONE; |
---|
| 1304 | + break; |
---|
1149 | 1305 | } |
---|
1150 | 1306 | |
---|
| 1307 | + /* |
---|
| 1308 | + * If a signal happened, this 'finish_wait()' may remove the last |
---|
| 1309 | + * waiter from the wait-queues, but the PageWaiters bit will remain |
---|
| 1310 | + * set. That's ok. The next wakeup will take care of it, and trying |
---|
| 1311 | + * to do it here would be difficult and prone to races. |
---|
| 1312 | + */ |
---|
1151 | 1313 | finish_wait(q, wait); |
---|
1152 | 1314 | |
---|
1153 | 1315 | if (thrashing) { |
---|
1154 | | - if (!PageSwapBacked(page)) |
---|
| 1316 | + if (delayacct) |
---|
1155 | 1317 | delayacct_thrashing_end(); |
---|
1156 | 1318 | psi_memstall_leave(&pflags); |
---|
1157 | 1319 | } |
---|
1158 | 1320 | |
---|
1159 | 1321 | /* |
---|
1160 | | - * A signal could leave PageWaiters set. Clearing it here if |
---|
1161 | | - * !waitqueue_active would be possible (by open-coding finish_wait), |
---|
1162 | | - * but still fail to catch it in the case of wait hash collision. We |
---|
1163 | | - * already can fail to clear wait hash collision cases, so don't |
---|
1164 | | - * bother with signals either. |
---|
| 1322 | + * NOTE! The wait->flags weren't stable until we've done the |
---|
| 1323 | + * 'finish_wait()', and we could have exited the loop above due |
---|
| 1324 | + * to a signal, and had a wakeup event happen after the signal |
---|
| 1325 | + * test but before the 'finish_wait()'. |
---|
| 1326 | + * |
---|
| 1327 | + * So only after the finish_wait() can we reliably determine |
---|
| 1328 | + * if we got woken up or not, so we can now figure out the final |
---|
| 1329 | + * return value based on that state without races. |
---|
| 1330 | + * |
---|
| 1331 | + * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive |
---|
| 1332 | + * waiter, but an exclusive one requires WQ_FLAG_DONE. |
---|
1165 | 1333 | */ |
---|
| 1334 | + if (behavior == EXCLUSIVE) |
---|
| 1335 | + return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR; |
---|
1166 | 1336 | |
---|
1167 | | - return ret; |
---|
| 1337 | + return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR; |
---|
1168 | 1338 | } |
---|
1169 | 1339 | |
---|
1170 | | -void __sched wait_on_page_bit(struct page *page, int bit_nr) |
---|
| 1340 | +__sched void wait_on_page_bit(struct page *page, int bit_nr) |
---|
1171 | 1341 | { |
---|
1172 | 1342 | wait_queue_head_t *q = page_waitqueue(page); |
---|
1173 | | - wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, false); |
---|
| 1343 | + wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, SHARED); |
---|
1174 | 1344 | } |
---|
1175 | 1345 | EXPORT_SYMBOL(wait_on_page_bit); |
---|
1176 | 1346 | |
---|
1177 | | -int __sched wait_on_page_bit_killable(struct page *page, int bit_nr) |
---|
| 1347 | +__sched int wait_on_page_bit_killable(struct page *page, int bit_nr) |
---|
1178 | 1348 | { |
---|
1179 | 1349 | wait_queue_head_t *q = page_waitqueue(page); |
---|
1180 | | - return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false); |
---|
| 1350 | + return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, SHARED); |
---|
1181 | 1351 | } |
---|
1182 | 1352 | EXPORT_SYMBOL(wait_on_page_bit_killable); |
---|
| 1353 | + |
---|
| 1354 | +static int __wait_on_page_locked_async(struct page *page, |
---|
| 1355 | + struct wait_page_queue *wait, bool set) |
---|
| 1356 | +{ |
---|
| 1357 | + struct wait_queue_head *q = page_waitqueue(page); |
---|
| 1358 | + int ret = 0; |
---|
| 1359 | + |
---|
| 1360 | + wait->page = page; |
---|
| 1361 | + wait->bit_nr = PG_locked; |
---|
| 1362 | + |
---|
| 1363 | + spin_lock_irq(&q->lock); |
---|
| 1364 | + __add_wait_queue_entry_tail(q, &wait->wait); |
---|
| 1365 | + SetPageWaiters(page); |
---|
| 1366 | + if (set) |
---|
| 1367 | + ret = !trylock_page(page); |
---|
| 1368 | + else |
---|
| 1369 | + ret = PageLocked(page); |
---|
| 1370 | + /* |
---|
| 1371 | + * If we were successful now, we know we're still on the |
---|
| 1372 | + * waitqueue as we're still under the lock. This means it's |
---|
| 1373 | + * safe to remove and return success, we know the callback |
---|
| 1374 | + * isn't going to trigger. |
---|
| 1375 | + */ |
---|
| 1376 | + if (!ret) |
---|
| 1377 | + __remove_wait_queue(q, &wait->wait); |
---|
| 1378 | + else |
---|
| 1379 | + ret = -EIOCBQUEUED; |
---|
| 1380 | + spin_unlock_irq(&q->lock); |
---|
| 1381 | + return ret; |
---|
| 1382 | +} |
---|
| 1383 | + |
---|
| 1384 | +static int wait_on_page_locked_async(struct page *page, |
---|
| 1385 | + struct wait_page_queue *wait) |
---|
| 1386 | +{ |
---|
| 1387 | + if (!PageLocked(page)) |
---|
| 1388 | + return 0; |
---|
| 1389 | + return __wait_on_page_locked_async(compound_head(page), wait, false); |
---|
| 1390 | +} |
---|
| 1391 | + |
---|
| 1392 | +/** |
---|
| 1393 | + * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked |
---|
| 1394 | + * @page: The page to wait for. |
---|
| 1395 | + * |
---|
| 1396 | + * The caller should hold a reference on @page. They expect the page to |
---|
| 1397 | + * become unlocked relatively soon, but do not wish to hold up migration |
---|
| 1398 | + * (for example) by holding the reference while waiting for the page to |
---|
| 1399 | + * come unlocked. After this function returns, the caller should not |
---|
| 1400 | + * dereference @page. |
---|
| 1401 | + */ |
---|
| 1402 | +void put_and_wait_on_page_locked(struct page *page) |
---|
| 1403 | +{ |
---|
| 1404 | + wait_queue_head_t *q; |
---|
| 1405 | + |
---|
| 1406 | + page = compound_head(page); |
---|
| 1407 | + q = page_waitqueue(page); |
---|
| 1408 | + wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP); |
---|
| 1409 | +} |
---|
1183 | 1410 | |
---|
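The three behavior values map onto the helpers in this hunk: EXCLUSIVE backs __lock_page() and __lock_page_killable(), SHARED backs wait_on_page_bit(), and DROP backs the new put_and_wait_on_page_locked(). A hypothetical caller of the DROP variant looks like this (not taken from this patch):

```c
/*
 * Hypothetical caller: we hold one reference on @page and want to wait
 * for it to be unlocked without pinning it across the sleep.
 */
static void example_wait_for_unlock(struct page *page)
{
	put_and_wait_on_page_locked(page);	/* drops our reference, then waits */
	/* @page must not be dereferenced after this point. */
}
```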
1184 | 1411 | /** |
---|
1185 | 1412 | * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue |
---|
.. | .. |
---|
1211 | 1438 | * instead. |
---|
1212 | 1439 | * |
---|
1213 | 1440 | * The read of PG_waiters has to be after (or concurrently with) PG_locked |
---|
1214 | | - * being cleared, but a memory barrier should be unneccssary since it is |
---|
| 1441 | + * being cleared, but a memory barrier should be unnecessary since it is |
---|
1215 | 1442 | * in the same byte as PG_locked. |
---|
1216 | 1443 | */ |
---|
1217 | 1444 | static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem) |
---|
.. | .. |
---|
1227 | 1454 | * unlock_page - unlock a locked page |
---|
1228 | 1455 | * @page: the page |
---|
1229 | 1456 | * |
---|
1230 | | - * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). |
---|
| 1457 | + * Unlocks the page and wakes up sleepers in wait_on_page_locked(). |
---|
1231 | 1458 | * Also wakes sleepers in wait_on_page_writeback() because the wakeup |
---|
1232 | 1459 | * mechanism between PageLocked pages and PageWriteback pages is shared. |
---|
1233 | 1460 | * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. |
---|
.. | .. |
---|
1266 | 1493 | rotate_reclaimable_page(page); |
---|
1267 | 1494 | } |
---|
1268 | 1495 | |
---|
| 1496 | + /* |
---|
| 1497 | + * Writeback does not hold a page reference of its own, relying |
---|
| 1498 | + * on truncation to wait for the clearing of PG_writeback. |
---|
| 1499 | + * But here we must make sure that the page is not freed and |
---|
| 1500 | + * reused before the wake_up_page(). |
---|
| 1501 | + */ |
---|
| 1502 | + get_page(page); |
---|
1269 | 1503 | if (!test_clear_page_writeback(page)) |
---|
1270 | 1504 | BUG(); |
---|
1271 | 1505 | |
---|
1272 | 1506 | smp_mb__after_atomic(); |
---|
1273 | 1507 | wake_up_page(page, PG_writeback); |
---|
| 1508 | + put_page(page); |
---|
1274 | 1509 | } |
---|
1275 | 1510 | EXPORT_SYMBOL(end_page_writeback); |
---|
1276 | 1511 | |
---|
.. | .. |
---|
1306 | 1541 | * __lock_page - get a lock on the page, assuming we need to sleep to get it |
---|
1307 | 1542 | * @__page: the page to lock |
---|
1308 | 1543 | */ |
---|
1309 | | -void __sched __lock_page(struct page *__page) |
---|
| 1544 | +__sched void __lock_page(struct page *__page) |
---|
1310 | 1545 | { |
---|
1311 | 1546 | struct page *page = compound_head(__page); |
---|
1312 | 1547 | wait_queue_head_t *q = page_waitqueue(page); |
---|
1313 | | - wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, true); |
---|
| 1548 | + wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, |
---|
| 1549 | + EXCLUSIVE); |
---|
1314 | 1550 | } |
---|
1315 | 1551 | EXPORT_SYMBOL(__lock_page); |
---|
1316 | 1552 | |
---|
1317 | | -int __sched __lock_page_killable(struct page *__page) |
---|
| 1553 | +__sched int __lock_page_killable(struct page *__page) |
---|
1318 | 1554 | { |
---|
1319 | 1555 | struct page *page = compound_head(__page); |
---|
1320 | 1556 | wait_queue_head_t *q = page_waitqueue(page); |
---|
1321 | | - return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, true); |
---|
| 1557 | + return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, |
---|
| 1558 | + EXCLUSIVE); |
---|
1322 | 1559 | } |
---|
1323 | 1560 | EXPORT_SYMBOL_GPL(__lock_page_killable); |
---|
1324 | 1561 | |
---|
| 1562 | +__sched int __lock_page_async(struct page *page, struct wait_page_queue *wait) |
---|
| 1563 | +{ |
---|
| 1564 | + return __wait_on_page_locked_async(page, wait, true); |
---|
| 1565 | +} |
---|
| 1566 | + |
---|
1325 | 1567 | /* |
---|
1326 | 1568 | * Return values: |
---|
1327 | | - * 1 - page is locked; mmap_sem is still held. |
---|
| 1569 | + * 1 - page is locked; mmap_lock is still held. |
---|
1328 | 1570 | * 0 - page is not locked. |
---|
1329 | | - * mmap_sem has been released (up_read()), unless flags had both |
---|
| 1571 | + * mmap_lock has been released (mmap_read_unlock()), unless flags had both |
---|
1330 | 1572 | * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in |
---|
1331 | | - * which case mmap_sem is still held. |
---|
| 1573 | + * which case mmap_lock is still held. |
---|
1332 | 1574 | * |
---|
1333 | 1575 | * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1 |
---|
1334 | | - * with the page locked and the mmap_sem unperturbed. |
---|
| 1576 | + * with the page locked and the mmap_lock unperturbed. |
---|
1335 | 1577 | */ |
---|
1336 | | -int __sched __lock_page_or_retry(struct page *page, struct mm_struct *mm, |
---|
| 1578 | +__sched int __lock_page_or_retry(struct page *page, struct mm_struct *mm, |
---|
1337 | 1579 | unsigned int flags) |
---|
1338 | 1580 | { |
---|
1339 | | - if (flags & FAULT_FLAG_ALLOW_RETRY) { |
---|
| 1581 | + if (fault_flag_allow_retry_first(flags)) { |
---|
1340 | 1582 | /* |
---|
1341 | | - * CAUTION! In this case, mmap_sem is not released |
---|
| 1583 | + * CAUTION! In this case, mmap_lock is not released |
---|
1342 | 1584 | * even though return 0. |
---|
1343 | 1585 | */ |
---|
1344 | 1586 | if (flags & FAULT_FLAG_RETRY_NOWAIT) |
---|
1345 | 1587 | return 0; |
---|
1346 | 1588 | |
---|
1347 | | - up_read(&mm->mmap_sem); |
---|
| 1589 | + mmap_read_unlock(mm); |
---|
1348 | 1590 | if (flags & FAULT_FLAG_KILLABLE) |
---|
1349 | 1591 | wait_on_page_locked_killable(page); |
---|
1350 | 1592 | else |
---|
.. | .. |
---|
1356 | 1598 | |
---|
1357 | 1599 | ret = __lock_page_killable(page); |
---|
1358 | 1600 | if (ret) { |
---|
1359 | | - up_read(&mm->mmap_sem); |
---|
| 1601 | + mmap_read_unlock(mm); |
---|
1360 | 1602 | return 0; |
---|
1361 | 1603 | } |
---|
1362 | 1604 | } else |
---|
.. | .. |
---|
1366 | 1608 | } |
---|
1367 | 1609 | |
---|
1368 | 1610 | /** |
---|
1369 | | - * page_cache_next_hole - find the next hole (not-present entry) |
---|
1370 | | - * @mapping: mapping |
---|
1371 | | - * @index: index |
---|
1372 | | - * @max_scan: maximum range to search |
---|
| 1611 | + * page_cache_next_miss() - Find the next gap in the page cache. |
---|
| 1612 | + * @mapping: Mapping. |
---|
| 1613 | + * @index: Index. |
---|
| 1614 | + * @max_scan: Maximum range to search. |
---|
1373 | 1615 | * |
---|
1374 | | - * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the |
---|
1375 | | - * lowest indexed hole. |
---|
| 1616 | + * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the |
---|
| 1617 | + * gap with the lowest index. |
---|
1376 | 1618 | * |
---|
1377 | | - * Returns: the index of the hole if found, otherwise returns an index |
---|
1378 | | - * outside of the set specified (in which case 'return - index >= |
---|
1379 | | - * max_scan' will be true). In rare cases of index wrap-around, 0 will |
---|
1380 | | - * be returned. |
---|
| 1619 | + * This function may be called under the rcu_read_lock. However, this will |
---|
| 1620 | + * not atomically search a snapshot of the cache at a single point in time. |
---|
| 1621 | + * For example, if a gap is created at index 5, then subsequently a gap is |
---|
| 1622 | + * created at index 10, page_cache_next_miss covering both indices may |
---|
| 1623 | + * return 10 if called under the rcu_read_lock. |
---|
1381 | 1624 | * |
---|
1382 | | - * page_cache_next_hole may be called under rcu_read_lock. However, |
---|
1383 | | - * like radix_tree_gang_lookup, this will not atomically search a |
---|
1384 | | - * snapshot of the tree at a single point in time. For example, if a |
---|
1385 | | - * hole is created at index 5, then subsequently a hole is created at |
---|
1386 | | - * index 10, page_cache_next_hole covering both indexes may return 10 |
---|
1387 | | - * if called under rcu_read_lock. |
---|
| 1625 | + * Return: The index of the gap if found, otherwise an index outside the |
---|
| 1626 | + * range specified (in which case 'return - index >= max_scan' will be true). |
---|
| 1627 | + * In the rare case of index wrap-around, 0 will be returned. |
---|
1388 | 1628 | */ |
---|
1389 | | -pgoff_t page_cache_next_hole(struct address_space *mapping, |
---|
| 1629 | +pgoff_t page_cache_next_miss(struct address_space *mapping, |
---|
1390 | 1630 | pgoff_t index, unsigned long max_scan) |
---|
1391 | 1631 | { |
---|
1392 | | - unsigned long i; |
---|
| 1632 | + XA_STATE(xas, &mapping->i_pages, index); |
---|
1393 | 1633 | |
---|
1394 | | - for (i = 0; i < max_scan; i++) { |
---|
1395 | | - struct page *page; |
---|
1396 | | - |
---|
1397 | | - page = radix_tree_lookup(&mapping->i_pages, index); |
---|
1398 | | - if (!page || radix_tree_exceptional_entry(page)) |
---|
| 1634 | + while (max_scan--) { |
---|
| 1635 | + void *entry = xas_next(&xas); |
---|
| 1636 | + if (!entry || xa_is_value(entry)) |
---|
1399 | 1637 | break; |
---|
1400 | | - index++; |
---|
1401 | | - if (index == 0) |
---|
| 1638 | + if (xas.xa_index == 0) |
---|
1402 | 1639 | break; |
---|
1403 | 1640 | } |
---|
1404 | 1641 | |
---|
1405 | | - return index; |
---|
| 1642 | + return xas.xa_index; |
---|
1406 | 1643 | } |
---|
1407 | | -EXPORT_SYMBOL(page_cache_next_hole); |
---|
| 1644 | +EXPORT_SYMBOL(page_cache_next_miss); |
---|
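For illustration only (an editor's sketch, not part of this diff): one way a caller could use page_cache_next_miss() to measure how many pages from @index onward are already cached. The helper name count_present_run() and its parameters are hypothetical.

#include <linux/pagemap.h>

/* Hypothetical helper: length of the run of cached pages at @index, capped at @max. */
static unsigned long count_present_run(struct address_space *mapping,
				       pgoff_t index, unsigned long max)
{
	pgoff_t gap = page_cache_next_miss(mapping, index, max);

	/*
	 * Per the contract above, 'gap - index >= max' means no gap was
	 * found inside the window; the rare wrap-around case is ignored
	 * in this sketch.
	 */
	if (gap - index >= max)
		return max;
	return gap - index;
}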
1408 | 1645 | |
---|
1409 | 1646 | /** |
---|
1410 | | - * page_cache_prev_hole - find the prev hole (not-present entry) |
---|
1411 | | - * @mapping: mapping |
---|
1412 | | - * @index: index |
---|
1413 | | - * @max_scan: maximum range to search |
---|
| 1647 | + * page_cache_prev_miss() - Find the previous gap in the page cache. |
---|
| 1648 | + * @mapping: Mapping. |
---|
| 1649 | + * @index: Index. |
---|
| 1650 | + * @max_scan: Maximum range to search. |
---|
1414 | 1651 | * |
---|
1415 | | - * Search backwards in the range [max(index-max_scan+1, 0), index] for |
---|
1416 | | - * the first hole. |
---|
| 1652 | + * Search the range [max(index - max_scan + 1, 0), index] for the |
---|
| 1653 | + * gap with the highest index. |
---|
1417 | 1654 | * |
---|
1418 | | - * Returns: the index of the hole if found, otherwise returns an index |
---|
1419 | | - * outside of the set specified (in which case 'index - return >= |
---|
1420 | | - * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX |
---|
1421 | | - * will be returned. |
---|
| 1655 | + * This function may be called under the rcu_read_lock. However, this will |
---|
| 1656 | + * not atomically search a snapshot of the cache at a single point in time. |
---|
| 1657 | + * For example, if a gap is created at index 10, then subsequently a gap is |
---|
| 1658 | + * created at index 5, page_cache_prev_miss() covering both indices may |
---|
| 1659 | + * return 5 if called under the rcu_read_lock. |
---|
1422 | 1660 | * |
---|
1423 | | - * page_cache_prev_hole may be called under rcu_read_lock. However, |
---|
1424 | | - * like radix_tree_gang_lookup, this will not atomically search a |
---|
1425 | | - * snapshot of the tree at a single point in time. For example, if a |
---|
1426 | | - * hole is created at index 10, then subsequently a hole is created at |
---|
1427 | | - * index 5, page_cache_prev_hole covering both indexes may return 5 if |
---|
1428 | | - * called under rcu_read_lock. |
---|
| 1661 | + * Return: The index of the gap if found, otherwise an index outside the |
---|
| 1662 | + * range specified (in which case 'index - return >= max_scan' will be true). |
---|
| 1663 | + * In the rare case of wrap-around, ULONG_MAX will be returned. |
---|
1429 | 1664 | */ |
---|
1430 | | -pgoff_t page_cache_prev_hole(struct address_space *mapping, |
---|
| 1665 | +pgoff_t page_cache_prev_miss(struct address_space *mapping, |
---|
1431 | 1666 | pgoff_t index, unsigned long max_scan) |
---|
1432 | 1667 | { |
---|
1433 | | - unsigned long i; |
---|
| 1668 | + XA_STATE(xas, &mapping->i_pages, index); |
---|
1434 | 1669 | |
---|
1435 | | - for (i = 0; i < max_scan; i++) { |
---|
1436 | | - struct page *page; |
---|
1437 | | - |
---|
1438 | | - page = radix_tree_lookup(&mapping->i_pages, index); |
---|
1439 | | - if (!page || radix_tree_exceptional_entry(page)) |
---|
| 1670 | + while (max_scan--) { |
---|
| 1671 | + void *entry = xas_prev(&xas); |
---|
| 1672 | + if (!entry || xa_is_value(entry)) |
---|
1440 | 1673 | break; |
---|
1441 | | - index--; |
---|
1442 | | - if (index == ULONG_MAX) |
---|
| 1674 | + if (xas.xa_index == ULONG_MAX) |
---|
1443 | 1675 | break; |
---|
1444 | 1676 | } |
---|
1445 | 1677 | |
---|
1446 | | - return index; |
---|
| 1678 | + return xas.xa_index; |
---|
1447 | 1679 | } |
---|
1448 | | -EXPORT_SYMBOL(page_cache_prev_hole); |
---|
| 1680 | +EXPORT_SYMBOL(page_cache_prev_miss); |
---|
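Likewise a sketch (not part of this diff), closely modelled on count_history_pages() in mm/readahead.c, showing the backwards variant: how many consecutive pages immediately before @index are already cached.

#include <linux/pagemap.h>

static pgoff_t cached_history(struct address_space *mapping,
			      pgoff_t index, unsigned long max)
{
	pgoff_t head;

	rcu_read_lock();
	head = page_cache_prev_miss(mapping, index - 1, max);
	rcu_read_unlock();

	/* Number of pages in [head + 1, index - 1] that are present. */
	return index - 1 - head;
}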
1449 | 1681 | |
---|
1450 | 1682 | /** |
---|
1451 | 1683 | * find_get_entry - find and get a page cache entry |
---|
1452 | 1684 | * @mapping: the address_space to search |
---|
1453 | | - * @offset: the page cache index |
---|
| 1685 | + * @index: The page cache index. |
---|
1454 | 1686 | * |
---|
1455 | 1687 | * Looks up the page cache slot at @mapping & @index. If there is a |
---|
1456 | | - * page cache page, it is returned with an increased refcount. |
---|
| 1688 | + * page cache page, the head page is returned with an increased refcount. |
---|
1457 | 1689 | * |
---|
1458 | 1690 | * If the slot holds a shadow entry of a previously evicted page, or a |
---|
1459 | 1691 | * swap entry from shmem/tmpfs, it is returned. |
---|
1460 | 1692 | * |
---|
1461 | | - * Otherwise, %NULL is returned. |
---|
| 1693 | + * Return: The head page or shadow entry, %NULL if nothing is found. |
---|
1462 | 1694 | */ |
---|
1463 | | -struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) |
---|
| 1695 | +struct page *find_get_entry(struct address_space *mapping, pgoff_t index) |
---|
1464 | 1696 | { |
---|
1465 | | - void **pagep; |
---|
1466 | | - struct page *head, *page; |
---|
| 1697 | + XA_STATE(xas, &mapping->i_pages, index); |
---|
| 1698 | + struct page *page; |
---|
1467 | 1699 | |
---|
1468 | 1700 | rcu_read_lock(); |
---|
1469 | 1701 | repeat: |
---|
1470 | | - page = NULL; |
---|
1471 | | - pagep = radix_tree_lookup_slot(&mapping->i_pages, offset); |
---|
1472 | | - if (pagep) { |
---|
1473 | | - page = radix_tree_deref_slot(pagep); |
---|
1474 | | - if (unlikely(!page)) |
---|
1475 | | - goto out; |
---|
1476 | | - if (radix_tree_exception(page)) { |
---|
1477 | | - if (radix_tree_deref_retry(page)) |
---|
1478 | | - goto repeat; |
---|
1479 | | - /* |
---|
1480 | | - * A shadow entry of a recently evicted page, |
---|
1481 | | - * or a swap entry from shmem/tmpfs. Return |
---|
1482 | | - * it without attempting to raise page count. |
---|
1483 | | - */ |
---|
1484 | | - goto out; |
---|
1485 | | - } |
---|
| 1702 | + xas_reset(&xas); |
---|
| 1703 | + page = xas_load(&xas); |
---|
| 1704 | + if (xas_retry(&xas, page)) |
---|
| 1705 | + goto repeat; |
---|
| 1706 | + /* |
---|
| 1707 | + * A shadow entry of a recently evicted page, or a swap entry from |
---|
| 1708 | + * shmem/tmpfs. Return it without attempting to raise page count. |
---|
| 1709 | + */ |
---|
| 1710 | + if (!page || xa_is_value(page)) |
---|
| 1711 | + goto out; |
---|
1486 | 1712 | |
---|
1487 | | - head = compound_head(page); |
---|
1488 | | - if (!page_cache_get_speculative(head)) |
---|
1489 | | - goto repeat; |
---|
| 1713 | + if (!page_cache_get_speculative(page)) |
---|
| 1714 | + goto repeat; |
---|
1490 | 1715 | |
---|
1491 | | - /* The page was split under us? */ |
---|
1492 | | - if (compound_head(page) != head) { |
---|
1493 | | - put_page(head); |
---|
1494 | | - goto repeat; |
---|
1495 | | - } |
---|
1496 | | - |
---|
1497 | | - /* |
---|
1498 | | - * Has the page moved? |
---|
1499 | | - * This is part of the lockless pagecache protocol. See |
---|
1500 | | - * include/linux/pagemap.h for details. |
---|
1501 | | - */ |
---|
1502 | | - if (unlikely(page != *pagep)) { |
---|
1503 | | - put_page(head); |
---|
1504 | | - goto repeat; |
---|
1505 | | - } |
---|
| 1716 | + /* |
---|
| 1717 | + * Has the page moved or been split? |
---|
| 1718 | + * This is part of the lockless pagecache protocol. See |
---|
| 1719 | + * include/linux/pagemap.h for details. |
---|
| 1720 | + */ |
---|
| 1721 | + if (unlikely(page != xas_reload(&xas))) { |
---|
| 1722 | + put_page(page); |
---|
| 1723 | + goto repeat; |
---|
1506 | 1724 | } |
---|
1507 | 1725 | out: |
---|
1508 | 1726 | rcu_read_unlock(); |
---|
1509 | 1727 | |
---|
1510 | 1728 | return page; |
---|
1511 | 1729 | } |
---|
1512 | | -EXPORT_SYMBOL(find_get_entry); |
---|
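As an aside (editor's sketch, not from this change): find_get_entry() may return a value entry, so callers filter with xa_is_value() before treating the result as a page. page_is_resident() below is a hypothetical in-kernel caller.

#include <linux/pagemap.h>

static bool page_is_resident(struct address_space *mapping, pgoff_t index)
{
	struct page *page = find_get_entry(mapping, index);

	/* Shadow/swap entries carry no refcount and are not pages. */
	if (!page || xa_is_value(page))
		return false;

	/* A real (head) page was returned with a reference held. */
	put_page(page);
	return true;
}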
1513 | 1730 | |
---|
1514 | 1731 | /** |
---|
1515 | | - * find_lock_entry - locate, pin and lock a page cache entry |
---|
1516 | | - * @mapping: the address_space to search |
---|
1517 | | - * @offset: the page cache index |
---|
| 1732 | + * find_lock_entry - Locate and lock a page cache entry. |
---|
| 1733 | + * @mapping: The address_space to search. |
---|
| 1734 | + * @index: The page cache index. |
---|
1518 | 1735 | * |
---|
1519 | | - * Looks up the page cache slot at @mapping & @offset. If there is a |
---|
1520 | | - * page cache page, it is returned locked and with an increased |
---|
1521 | | - * refcount. |
---|
| 1736 | + * Looks up the page at @mapping & @index. If there is a page in the |
---|
| 1737 | + * cache, the head page is returned locked and with an increased refcount. |
---|
1522 | 1738 | * |
---|
1523 | 1739 | * If the slot holds a shadow entry of a previously evicted page, or a |
---|
1524 | 1740 | * swap entry from shmem/tmpfs, it is returned. |
---|
1525 | 1741 | * |
---|
1526 | | - * Otherwise, %NULL is returned. |
---|
1527 | | - * |
---|
1528 | | - * find_lock_entry() may sleep. |
---|
| 1742 | + * Context: May sleep. |
---|
| 1743 | + * Return: The head page or shadow entry, %NULL if nothing is found. |
---|
1529 | 1744 | */ |
---|
1530 | | -struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) |
---|
| 1745 | +struct page *find_lock_entry(struct address_space *mapping, pgoff_t index) |
---|
1531 | 1746 | { |
---|
1532 | 1747 | struct page *page; |
---|
1533 | 1748 | |
---|
1534 | 1749 | repeat: |
---|
1535 | | - page = find_get_entry(mapping, offset); |
---|
1536 | | - if (page && !radix_tree_exception(page)) { |
---|
| 1750 | + page = find_get_entry(mapping, index); |
---|
| 1751 | + if (page && !xa_is_value(page)) { |
---|
1537 | 1752 | lock_page(page); |
---|
1538 | 1753 | /* Has the page been truncated? */ |
---|
1539 | | - if (unlikely(page_mapping(page) != mapping)) { |
---|
| 1754 | + if (unlikely(page->mapping != mapping)) { |
---|
1540 | 1755 | unlock_page(page); |
---|
1541 | 1756 | put_page(page); |
---|
1542 | 1757 | goto repeat; |
---|
1543 | 1758 | } |
---|
1544 | | - VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); |
---|
| 1759 | + VM_BUG_ON_PAGE(!thp_contains(page, index), page); |
---|
1545 | 1760 | } |
---|
1546 | 1761 | return page; |
---|
1547 | 1762 | } |
---|
1548 | | -EXPORT_SYMBOL(find_lock_entry); |
---|
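A matching sketch for the locked variant (hypothetical helper, not part of this change): the page comes back locked and referenced, so both unlock_page() and put_page() are owed on the success path.

#include <linux/mm.h>
#include <linux/pagemap.h>

static void dirty_cached_page(struct address_space *mapping, pgoff_t index)
{
	struct page *page = find_lock_entry(mapping, index);

	if (!page || xa_is_value(page))
		return;		/* absent, or only a shadow/swap entry */

	set_page_dirty(page);
	unlock_page(page);
	put_page(page);
}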
1549 | 1763 | |
---|
1550 | 1764 | /** |
---|
1551 | | - * pagecache_get_page - find and get a page reference |
---|
1552 | | - * @mapping: the address_space to search |
---|
1553 | | - * @offset: the page index |
---|
1554 | | - * @fgp_flags: PCG flags |
---|
1555 | | - * @gfp_mask: gfp mask to use for the page cache data page allocation |
---|
| 1765 | + * pagecache_get_page - Find and get a reference to a page. |
---|
| 1766 | + * @mapping: The address_space to search. |
---|
| 1767 | + * @index: The page index. |
---|
| 1768 | + * @fgp_flags: %FGP flags modify how the page is returned. |
---|
| 1769 | + * @gfp_mask: Memory allocation flags to use if %FGP_CREAT is specified. |
---|
1556 | 1770 | * |
---|
1557 | | - * Looks up the page cache slot at @mapping & @offset. |
---|
| 1771 | + * Looks up the page cache entry at @mapping & @index. |
---|
1558 | 1772 | * |
---|
1559 | | - * PCG flags modify how the page is returned. |
---|
| 1773 | + * @fgp_flags can be zero or more of these flags: |
---|
1560 | 1774 | * |
---|
1561 | | - * @fgp_flags can be: |
---|
| 1775 | + * * %FGP_ACCESSED - The page will be marked accessed. |
---|
| 1776 | + * * %FGP_LOCK - The page is returned locked. |
---|
| 1777 | + * * %FGP_HEAD - If the page is present and a THP, return the head page |
---|
| 1778 | + * rather than the exact page specified by the index. |
---|
| 1779 | + * * %FGP_CREAT - If no page is present then a new page is allocated using |
---|
| 1780 | + * @gfp_mask and added to the page cache and the VM's LRU list. |
---|
| 1781 | + * The page is returned locked and with an increased refcount. |
---|
| 1782 | + * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the |
---|
| 1783 | + * page is already in cache. If the page was allocated, unlock it before |
---|
| 1784 | + * returning so the caller can do the same dance. |
---|
| 1785 | + * * %FGP_WRITE - The page will be written |
---|
| 1786 | + * * %FGP_NOFS - __GFP_FS will get cleared in gfp mask |
---|
| 1787 | + * * %FGP_NOWAIT - Don't get blocked by page lock |
---|
1562 | 1788 | * |
---|
1563 | | - * - FGP_ACCESSED: the page will be marked accessed |
---|
1564 | | - * - FGP_LOCK: Page is return locked |
---|
1565 | | - * - FGP_CREAT: If page is not present then a new page is allocated using |
---|
1566 | | - * @gfp_mask and added to the page cache and the VM's LRU |
---|
1567 | | - * list. The page is returned locked and with an increased |
---|
1568 | | - * refcount. |
---|
1569 | | - * - FGP_FOR_MMAP: Similar to FGP_CREAT, only we want to allow the caller to do |
---|
1570 | | - * its own locking dance if the page is already in cache, or unlock the page |
---|
1571 | | - * before returning if we had to add the page to pagecache. |
---|
1572 | | - * |
---|
1573 | | - * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even |
---|
1574 | | - * if the GFP flags specified for FGP_CREAT are atomic. |
---|
| 1789 | + * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even |
---|
| 1790 | + * if the %GFP flags specified for %FGP_CREAT are atomic. |
---|
1575 | 1791 | * |
---|
1576 | 1792 | * If there is a page cache page, it is returned with an increased refcount. |
---|
| 1793 | + * |
---|
| 1794 | + * Return: The found page or %NULL otherwise. |
---|
1577 | 1795 | */ |
---|
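To make the flag combinations documented above concrete, here is an editor's sketch of a write-path lookup, loosely modelled on grab_cache_page_write_begin(); get_page_for_write() and its caller are hypothetical.

#include <linux/pagemap.h>

static struct page *get_page_for_write(struct address_space *mapping,
				       loff_t pos)
{
	pgoff_t index = pos >> PAGE_SHIFT;
	int fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_ACCESSED;

	/* Returns the page locked and referenced, or NULL on allocation failure. */
	return pagecache_get_page(mapping, index, fgp,
				  mapping_gfp_mask(mapping));
}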
1578 | | -struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, |
---|
1579 | | - int fgp_flags, gfp_t gfp_mask) |
---|
| 1796 | +struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, |
---|
| 1797 | + int fgp_flags, gfp_t gfp_mask) |
---|
1580 | 1798 | { |
---|
1581 | 1799 | struct page *page; |
---|
1582 | 1800 | |
---|
1583 | 1801 | repeat: |
---|
1584 | | - page = find_get_entry(mapping, offset); |
---|
1585 | | - if (radix_tree_exceptional_entry(page)) |
---|
| 1802 | + page = find_get_entry(mapping, index); |
---|
| 1803 | + if (xa_is_value(page)) |
---|
1586 | 1804 | page = NULL; |
---|
| 1805 | + |
---|
| 1806 | + trace_android_vh_pagecache_get_page(mapping, index, fgp_flags, |
---|
| 1807 | + gfp_mask, page); |
---|
1587 | 1808 | if (!page) |
---|
1588 | 1809 | goto no_page; |
---|
1589 | 1810 | |
---|
.. | .. |
---|
1603 | 1824 | put_page(page); |
---|
1604 | 1825 | goto repeat; |
---|
1605 | 1826 | } |
---|
1606 | | - VM_BUG_ON_PAGE(page->index != offset, page); |
---|
| 1827 | + VM_BUG_ON_PAGE(!thp_contains(page, index), page); |
---|
1607 | 1828 | } |
---|
1608 | 1829 | |
---|
1609 | | - if (page && (fgp_flags & FGP_ACCESSED)) |
---|
| 1830 | + if (fgp_flags & FGP_ACCESSED) |
---|
1610 | 1831 | mark_page_accessed(page); |
---|
| 1832 | + else if (fgp_flags & FGP_WRITE) { |
---|
| 1833 | + /* Clear idle flag for buffer write */ |
---|
| 1834 | + if (page_is_idle(page)) |
---|
| 1835 | + clear_page_idle(page); |
---|
| 1836 | + } |
---|
| 1837 | + if (!(fgp_flags & FGP_HEAD)) |
---|
| 1838 | + page = find_subpage(page, index); |
---|
1611 | 1839 | |
---|
1612 | 1840 | no_page: |
---|
1613 | 1841 | if (!page && (fgp_flags & FGP_CREAT)) { |
---|
1614 | 1842 | int err; |
---|
1615 | | - if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping)) |
---|
| 1843 | + if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping)) |
---|
1616 | 1844 | gfp_mask |= __GFP_WRITE; |
---|
1617 | 1845 | if (fgp_flags & FGP_NOFS) |
---|
1618 | 1846 | gfp_mask &= ~__GFP_FS; |
---|
.. | .. |
---|
1628 | 1856 | if (fgp_flags & FGP_ACCESSED) |
---|
1629 | 1857 | __SetPageReferenced(page); |
---|
1630 | 1858 | |
---|
1631 | | - err = add_to_page_cache_lru(page, mapping, offset, gfp_mask); |
---|
| 1859 | + err = add_to_page_cache_lru(page, mapping, index, gfp_mask); |
---|
1632 | 1860 | if (unlikely(err)) { |
---|
1633 | 1861 | put_page(page); |
---|
1634 | 1862 | page = NULL; |
---|
.. | .. |
---|
1668 | 1896 | * Any shadow entries of evicted pages, or swap entries from |
---|
1669 | 1897 | * shmem/tmpfs, are included in the returned array. |
---|
1670 | 1898 | * |
---|
1671 | | - * find_get_entries() returns the number of pages and shadow entries |
---|
1672 | | - * which were found. |
---|
| 1899 | + * If it finds a Transparent Huge Page, head or tail, find_get_entries() |
---|
| 1900 | + * stops at that page: the caller is likely to have a better way to handle |
---|
| 1901 | + * the compound page as a whole, and then skip its extent, than repeatedly |
---|
| 1902 | + * calling find_get_entries() to return all its tails. |
---|
| 1903 | + * |
---|
| 1904 | + * Return: the number of pages and shadow entries which were found. |
---|
1673 | 1905 | */ |
---|
1674 | 1906 | unsigned find_get_entries(struct address_space *mapping, |
---|
1675 | 1907 | pgoff_t start, unsigned int nr_entries, |
---|
1676 | 1908 | struct page **entries, pgoff_t *indices) |
---|
1677 | 1909 | { |
---|
1678 | | - void **slot; |
---|
| 1910 | + XA_STATE(xas, &mapping->i_pages, start); |
---|
| 1911 | + struct page *page; |
---|
1679 | 1912 | unsigned int ret = 0; |
---|
1680 | | - struct radix_tree_iter iter; |
---|
1681 | 1913 | |
---|
1682 | 1914 | if (!nr_entries) |
---|
1683 | 1915 | return 0; |
---|
1684 | 1916 | |
---|
1685 | 1917 | rcu_read_lock(); |
---|
1686 | | - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { |
---|
1687 | | - struct page *head, *page; |
---|
1688 | | -repeat: |
---|
1689 | | - page = radix_tree_deref_slot(slot); |
---|
1690 | | - if (unlikely(!page)) |
---|
| 1918 | + xas_for_each(&xas, page, ULONG_MAX) { |
---|
| 1919 | + if (xas_retry(&xas, page)) |
---|
1691 | 1920 | continue; |
---|
1692 | | - if (radix_tree_exception(page)) { |
---|
1693 | | - if (radix_tree_deref_retry(page)) { |
---|
1694 | | - slot = radix_tree_iter_retry(&iter); |
---|
1695 | | - continue; |
---|
1696 | | - } |
---|
1697 | | - /* |
---|
1698 | | - * A shadow entry of a recently evicted page, a swap |
---|
1699 | | - * entry from shmem/tmpfs or a DAX entry. Return it |
---|
1700 | | - * without attempting to raise page count. |
---|
1701 | | - */ |
---|
| 1921 | + /* |
---|
| 1922 | + * A shadow entry of a recently evicted page, a swap |
---|
| 1923 | + * entry from shmem/tmpfs or a DAX entry. Return it |
---|
| 1924 | + * without attempting to raise page count. |
---|
| 1925 | + */ |
---|
| 1926 | + if (xa_is_value(page)) |
---|
1702 | 1927 | goto export; |
---|
1703 | | - } |
---|
1704 | 1928 | |
---|
1705 | | - head = compound_head(page); |
---|
1706 | | - if (!page_cache_get_speculative(head)) |
---|
1707 | | - goto repeat; |
---|
| 1929 | + if (!page_cache_get_speculative(page)) |
---|
| 1930 | + goto retry; |
---|
1708 | 1931 | |
---|
1709 | | - /* The page was split under us? */ |
---|
1710 | | - if (compound_head(page) != head) { |
---|
1711 | | - put_page(head); |
---|
1712 | | - goto repeat; |
---|
1713 | | - } |
---|
| 1932 | + /* Has the page moved or been split? */ |
---|
| 1933 | + if (unlikely(page != xas_reload(&xas))) |
---|
| 1934 | + goto put_page; |
---|
1714 | 1935 | |
---|
1715 | | - /* Has the page moved? */ |
---|
1716 | | - if (unlikely(page != *slot)) { |
---|
1717 | | - put_page(head); |
---|
1718 | | - goto repeat; |
---|
| 1936 | + /* |
---|
| 1937 | + * Terminate early on finding a THP, to allow the caller to |
---|
| 1938 | + * handle it all at once; but continue if this is hugetlbfs. |
---|
| 1939 | + */ |
---|
| 1940 | + if (PageTransHuge(page) && !PageHuge(page)) { |
---|
| 1941 | + page = find_subpage(page, xas.xa_index); |
---|
| 1942 | + nr_entries = ret + 1; |
---|
1719 | 1943 | } |
---|
1720 | 1944 | export: |
---|
1721 | | - indices[ret] = iter.index; |
---|
| 1945 | + indices[ret] = xas.xa_index; |
---|
1722 | 1946 | entries[ret] = page; |
---|
1723 | 1947 | if (++ret == nr_entries) |
---|
1724 | 1948 | break; |
---|
| 1949 | + continue; |
---|
| 1950 | +put_page: |
---|
| 1951 | + put_page(page); |
---|
| 1952 | +retry: |
---|
| 1953 | + xas_reset(&xas); |
---|
1725 | 1954 | } |
---|
1726 | 1955 | rcu_read_unlock(); |
---|
1727 | 1956 | return ret; |
---|
.. | .. |
---|
1744 | 1973 | * indexes. There may be holes in the indices due to not-present pages. |
---|
1745 | 1974 | * We also update @start to index the next page for the traversal. |
---|
1746 | 1975 | * |
---|
1747 | | - * find_get_pages_range() returns the number of pages which were found. If this |
---|
1748 | | - * number is smaller than @nr_pages, the end of specified range has been |
---|
| 1976 | + * Return: the number of pages which were found. If this number is |
---|
| 1977 | + * smaller than @nr_pages, the end of specified range has been |
---|
1749 | 1978 | * reached. |
---|
1750 | 1979 | */ |
---|
1751 | 1980 | unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, |
---|
1752 | 1981 | pgoff_t end, unsigned int nr_pages, |
---|
1753 | 1982 | struct page **pages) |
---|
1754 | 1983 | { |
---|
1755 | | - struct radix_tree_iter iter; |
---|
1756 | | - void **slot; |
---|
| 1984 | + XA_STATE(xas, &mapping->i_pages, *start); |
---|
| 1985 | + struct page *page; |
---|
1757 | 1986 | unsigned ret = 0; |
---|
1758 | 1987 | |
---|
1759 | 1988 | if (unlikely(!nr_pages)) |
---|
1760 | 1989 | return 0; |
---|
1761 | 1990 | |
---|
1762 | 1991 | rcu_read_lock(); |
---|
1763 | | - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, *start) { |
---|
1764 | | - struct page *head, *page; |
---|
1765 | | - |
---|
1766 | | - if (iter.index > end) |
---|
1767 | | - break; |
---|
1768 | | -repeat: |
---|
1769 | | - page = radix_tree_deref_slot(slot); |
---|
1770 | | - if (unlikely(!page)) |
---|
| 1992 | + xas_for_each(&xas, page, end) { |
---|
| 1993 | + if (xas_retry(&xas, page)) |
---|
| 1994 | + continue; |
---|
| 1995 | + /* Skip over shadow, swap and DAX entries */ |
---|
| 1996 | + if (xa_is_value(page)) |
---|
1771 | 1997 | continue; |
---|
1772 | 1998 | |
---|
1773 | | - if (radix_tree_exception(page)) { |
---|
1774 | | - if (radix_tree_deref_retry(page)) { |
---|
1775 | | - slot = radix_tree_iter_retry(&iter); |
---|
1776 | | - continue; |
---|
1777 | | - } |
---|
1778 | | - /* |
---|
1779 | | - * A shadow entry of a recently evicted page, |
---|
1780 | | - * or a swap entry from shmem/tmpfs. Skip |
---|
1781 | | - * over it. |
---|
1782 | | - */ |
---|
1783 | | - continue; |
---|
1784 | | - } |
---|
| 1999 | + if (!page_cache_get_speculative(page)) |
---|
| 2000 | + goto retry; |
---|
1785 | 2001 | |
---|
1786 | | - head = compound_head(page); |
---|
1787 | | - if (!page_cache_get_speculative(head)) |
---|
1788 | | - goto repeat; |
---|
| 2002 | + /* Has the page moved or been split? */ |
---|
| 2003 | + if (unlikely(page != xas_reload(&xas))) |
---|
| 2004 | + goto put_page; |
---|
1789 | 2005 | |
---|
1790 | | - /* The page was split under us? */ |
---|
1791 | | - if (compound_head(page) != head) { |
---|
1792 | | - put_page(head); |
---|
1793 | | - goto repeat; |
---|
1794 | | - } |
---|
1795 | | - |
---|
1796 | | - /* Has the page moved? */ |
---|
1797 | | - if (unlikely(page != *slot)) { |
---|
1798 | | - put_page(head); |
---|
1799 | | - goto repeat; |
---|
1800 | | - } |
---|
1801 | | - |
---|
1802 | | - pages[ret] = page; |
---|
| 2006 | + pages[ret] = find_subpage(page, xas.xa_index); |
---|
1803 | 2007 | if (++ret == nr_pages) { |
---|
1804 | | - *start = pages[ret - 1]->index + 1; |
---|
| 2008 | + *start = xas.xa_index + 1; |
---|
1805 | 2009 | goto out; |
---|
1806 | 2010 | } |
---|
| 2011 | + continue; |
---|
| 2012 | +put_page: |
---|
| 2013 | + put_page(page); |
---|
| 2014 | +retry: |
---|
| 2015 | + xas_reset(&xas); |
---|
1807 | 2016 | } |
---|
1808 | 2017 | |
---|
1809 | 2018 | /* |
---|
1810 | 2019 | * We come here when there is no page beyond @end. We take care to not |
---|
1811 | 2020 | * overflow the index @start as it confuses some of the callers. This |
---|
1812 | | - * breaks the iteration when there is page at index -1 but that is |
---|
| 2021 | + * breaks the iteration when there is a page at index -1 but that is |
---|
1813 | 2022 | * already broken anyway. |
---|
1814 | 2023 | */ |
---|
1815 | 2024 | if (end == (pgoff_t)-1) |
---|
.. | .. |
---|
1832 | 2041 | * find_get_pages_contig() works exactly like find_get_pages(), except |
---|
1833 | 2042 | * that the returned number of pages are guaranteed to be contiguous. |
---|
1834 | 2043 | * |
---|
1835 | | - * find_get_pages_contig() returns the number of pages which were found. |
---|
| 2044 | + * Return: the number of pages which were found. |
---|
1836 | 2045 | */ |
---|
1837 | 2046 | unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, |
---|
1838 | 2047 | unsigned int nr_pages, struct page **pages) |
---|
1839 | 2048 | { |
---|
1840 | | - struct radix_tree_iter iter; |
---|
1841 | | - void **slot; |
---|
| 2049 | + XA_STATE(xas, &mapping->i_pages, index); |
---|
| 2050 | + struct page *page; |
---|
1842 | 2051 | unsigned int ret = 0; |
---|
1843 | 2052 | |
---|
1844 | 2053 | if (unlikely(!nr_pages)) |
---|
1845 | 2054 | return 0; |
---|
1846 | 2055 | |
---|
1847 | 2056 | rcu_read_lock(); |
---|
1848 | | - radix_tree_for_each_contig(slot, &mapping->i_pages, &iter, index) { |
---|
1849 | | - struct page *head, *page; |
---|
1850 | | -repeat: |
---|
1851 | | - page = radix_tree_deref_slot(slot); |
---|
1852 | | - /* The hole, there no reason to continue */ |
---|
1853 | | - if (unlikely(!page)) |
---|
1854 | | - break; |
---|
1855 | | - |
---|
1856 | | - if (radix_tree_exception(page)) { |
---|
1857 | | - if (radix_tree_deref_retry(page)) { |
---|
1858 | | - slot = radix_tree_iter_retry(&iter); |
---|
1859 | | - continue; |
---|
1860 | | - } |
---|
1861 | | - /* |
---|
1862 | | - * A shadow entry of a recently evicted page, |
---|
1863 | | - * or a swap entry from shmem/tmpfs. Stop |
---|
1864 | | - * looking for contiguous pages. |
---|
1865 | | - */ |
---|
1866 | | - break; |
---|
1867 | | - } |
---|
1868 | | - |
---|
1869 | | - head = compound_head(page); |
---|
1870 | | - if (!page_cache_get_speculative(head)) |
---|
1871 | | - goto repeat; |
---|
1872 | | - |
---|
1873 | | - /* The page was split under us? */ |
---|
1874 | | - if (compound_head(page) != head) { |
---|
1875 | | - put_page(head); |
---|
1876 | | - goto repeat; |
---|
1877 | | - } |
---|
1878 | | - |
---|
1879 | | - /* Has the page moved? */ |
---|
1880 | | - if (unlikely(page != *slot)) { |
---|
1881 | | - put_page(head); |
---|
1882 | | - goto repeat; |
---|
1883 | | - } |
---|
1884 | | - |
---|
| 2057 | + for (page = xas_load(&xas); page; page = xas_next(&xas)) { |
---|
| 2058 | + if (xas_retry(&xas, page)) |
---|
| 2059 | + continue; |
---|
1885 | 2060 | /* |
---|
1886 | | - * must check mapping and index after taking the ref. |
---|
1887 | | - * otherwise we can get both false positives and false |
---|
1888 | | - * negatives, which is just confusing to the caller. |
---|
| 2061 | + * If the entry has been swapped out, we can stop looking. |
---|
| 2062 | + * No current caller is looking for DAX entries. |
---|
1889 | 2063 | */ |
---|
1890 | | - if (page->mapping == NULL || page_to_pgoff(page) != iter.index) { |
---|
1891 | | - put_page(page); |
---|
| 2064 | + if (xa_is_value(page)) |
---|
1892 | 2065 | break; |
---|
1893 | | - } |
---|
1894 | 2066 | |
---|
1895 | | - pages[ret] = page; |
---|
| 2067 | + if (!page_cache_get_speculative(page)) |
---|
| 2068 | + goto retry; |
---|
| 2069 | + |
---|
| 2070 | + /* Has the page moved or been split? */ |
---|
| 2071 | + if (unlikely(page != xas_reload(&xas))) |
---|
| 2072 | + goto put_page; |
---|
| 2073 | + |
---|
| 2074 | + pages[ret] = find_subpage(page, xas.xa_index); |
---|
1896 | 2075 | if (++ret == nr_pages) |
---|
1897 | 2076 | break; |
---|
| 2077 | + continue; |
---|
| 2078 | +put_page: |
---|
| 2079 | + put_page(page); |
---|
| 2080 | +retry: |
---|
| 2081 | + xas_reset(&xas); |
---|
1898 | 2082 | } |
---|
1899 | 2083 | rcu_read_unlock(); |
---|
1900 | 2084 | return ret; |
---|
.. | .. |
---|
1912 | 2096 | * |
---|
1913 | 2097 | * Like find_get_pages, except we only return pages which are tagged with |
---|
1914 | 2098 | * @tag. We update @index to index the next page for the traversal. |
---|
| 2099 | + * |
---|
| 2100 | + * Return: the number of pages which were found. |
---|
1915 | 2101 | */ |
---|
1916 | 2102 | unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, |
---|
1917 | | - pgoff_t end, int tag, unsigned int nr_pages, |
---|
| 2103 | + pgoff_t end, xa_mark_t tag, unsigned int nr_pages, |
---|
1918 | 2104 | struct page **pages) |
---|
1919 | 2105 | { |
---|
1920 | | - struct radix_tree_iter iter; |
---|
1921 | | - void **slot; |
---|
| 2106 | + XA_STATE(xas, &mapping->i_pages, *index); |
---|
| 2107 | + struct page *page; |
---|
1922 | 2108 | unsigned ret = 0; |
---|
1923 | 2109 | |
---|
1924 | 2110 | if (unlikely(!nr_pages)) |
---|
1925 | 2111 | return 0; |
---|
1926 | 2112 | |
---|
1927 | 2113 | rcu_read_lock(); |
---|
1928 | | - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, *index, tag) { |
---|
1929 | | - struct page *head, *page; |
---|
1930 | | - |
---|
1931 | | - if (iter.index > end) |
---|
1932 | | - break; |
---|
1933 | | -repeat: |
---|
1934 | | - page = radix_tree_deref_slot(slot); |
---|
1935 | | - if (unlikely(!page)) |
---|
| 2114 | + xas_for_each_marked(&xas, page, end, tag) { |
---|
| 2115 | + if (xas_retry(&xas, page)) |
---|
| 2116 | + continue; |
---|
| 2117 | + /* |
---|
| 2118 | + * Shadow entries should never be tagged, but this iteration |
---|
| 2119 | + * is lockless so there is a window for page reclaim to evict |
---|
| 2120 | + * a page we saw tagged. Skip over it. |
---|
| 2121 | + */ |
---|
| 2122 | + if (xa_is_value(page)) |
---|
1936 | 2123 | continue; |
---|
1937 | 2124 | |
---|
1938 | | - if (radix_tree_exception(page)) { |
---|
1939 | | - if (radix_tree_deref_retry(page)) { |
---|
1940 | | - slot = radix_tree_iter_retry(&iter); |
---|
1941 | | - continue; |
---|
1942 | | - } |
---|
1943 | | - /* |
---|
1944 | | - * A shadow entry of a recently evicted page. |
---|
1945 | | - * |
---|
1946 | | - * Those entries should never be tagged, but |
---|
1947 | | - * this tree walk is lockless and the tags are |
---|
1948 | | - * looked up in bulk, one radix tree node at a |
---|
1949 | | - * time, so there is a sizable window for page |
---|
1950 | | - * reclaim to evict a page we saw tagged. |
---|
1951 | | - * |
---|
1952 | | - * Skip over it. |
---|
1953 | | - */ |
---|
1954 | | - continue; |
---|
1955 | | - } |
---|
| 2125 | + if (!page_cache_get_speculative(page)) |
---|
| 2126 | + goto retry; |
---|
1956 | 2127 | |
---|
1957 | | - head = compound_head(page); |
---|
1958 | | - if (!page_cache_get_speculative(head)) |
---|
1959 | | - goto repeat; |
---|
| 2128 | + /* Has the page moved or been split? */ |
---|
| 2129 | + if (unlikely(page != xas_reload(&xas))) |
---|
| 2130 | + goto put_page; |
---|
1960 | 2131 | |
---|
1961 | | - /* The page was split under us? */ |
---|
1962 | | - if (compound_head(page) != head) { |
---|
1963 | | - put_page(head); |
---|
1964 | | - goto repeat; |
---|
1965 | | - } |
---|
1966 | | - |
---|
1967 | | - /* Has the page moved? */ |
---|
1968 | | - if (unlikely(page != *slot)) { |
---|
1969 | | - put_page(head); |
---|
1970 | | - goto repeat; |
---|
1971 | | - } |
---|
1972 | | - |
---|
1973 | | - pages[ret] = page; |
---|
| 2132 | + pages[ret] = find_subpage(page, xas.xa_index); |
---|
1974 | 2133 | if (++ret == nr_pages) { |
---|
1975 | | - *index = pages[ret - 1]->index + 1; |
---|
| 2134 | + *index = xas.xa_index + 1; |
---|
1976 | 2135 | goto out; |
---|
1977 | 2136 | } |
---|
| 2137 | + continue; |
---|
| 2138 | +put_page: |
---|
| 2139 | + put_page(page); |
---|
| 2140 | +retry: |
---|
| 2141 | + xas_reset(&xas); |
---|
1978 | 2142 | } |
---|
1979 | 2143 | |
---|
1980 | 2144 | /* |
---|
1981 | | - * We come here when we got at @end. We take care to not overflow the |
---|
| 2145 | + * We come here when we got to @end. We take care to not overflow the |
---|
1982 | 2146 | * index @index as it confuses some of the callers. This breaks the |
---|
1983 | | - * iteration when there is page at index -1 but that is already broken |
---|
1984 | | - * anyway. |
---|
| 2147 | + * iteration when there is a page at index -1 but that is already |
---|
| 2148 | + * broken anyway. |
---|
1985 | 2149 | */ |
---|
1986 | 2150 | if (end == (pgoff_t)-1) |
---|
1987 | 2151 | *index = (pgoff_t)-1; |
---|
.. | .. |
---|
1993 | 2157 | return ret; |
---|
1994 | 2158 | } |
---|
1995 | 2159 | EXPORT_SYMBOL(find_get_pages_range_tag); |
---|
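For context, a sketch of the typical tagged-lookup loop (in the spirit of write_cache_pages()); walk_dirty_pages() and its process_page() callback are hypothetical stand-ins.

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched.h>

static void walk_dirty_pages(struct address_space *mapping,
			     pgoff_t start, pgoff_t end,
			     void (*process_page)(struct page *))
{
	struct page *pages[PAGEVEC_SIZE];
	unsigned int i, nr;

	while ((nr = find_get_pages_range_tag(mapping, &start, end,
					      PAGECACHE_TAG_DIRTY,
					      PAGEVEC_SIZE, pages))) {
		for (i = 0; i < nr; i++) {
			process_page(pages[i]);	/* e.g. lock + writepage */
			put_page(pages[i]);	/* drop the lookup reference */
		}
		cond_resched();
	}
}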
1996 | | - |
---|
1997 | | -/** |
---|
1998 | | - * find_get_entries_tag - find and return entries that match @tag |
---|
1999 | | - * @mapping: the address_space to search |
---|
2000 | | - * @start: the starting page cache index |
---|
2001 | | - * @tag: the tag index |
---|
2002 | | - * @nr_entries: the maximum number of entries |
---|
2003 | | - * @entries: where the resulting entries are placed |
---|
2004 | | - * @indices: the cache indices corresponding to the entries in @entries |
---|
2005 | | - * |
---|
2006 | | - * Like find_get_entries, except we only return entries which are tagged with |
---|
2007 | | - * @tag. |
---|
2008 | | - */ |
---|
2009 | | -unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, |
---|
2010 | | - int tag, unsigned int nr_entries, |
---|
2011 | | - struct page **entries, pgoff_t *indices) |
---|
2012 | | -{ |
---|
2013 | | - void **slot; |
---|
2014 | | - unsigned int ret = 0; |
---|
2015 | | - struct radix_tree_iter iter; |
---|
2016 | | - |
---|
2017 | | - if (!nr_entries) |
---|
2018 | | - return 0; |
---|
2019 | | - |
---|
2020 | | - rcu_read_lock(); |
---|
2021 | | - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, tag) { |
---|
2022 | | - struct page *head, *page; |
---|
2023 | | -repeat: |
---|
2024 | | - page = radix_tree_deref_slot(slot); |
---|
2025 | | - if (unlikely(!page)) |
---|
2026 | | - continue; |
---|
2027 | | - if (radix_tree_exception(page)) { |
---|
2028 | | - if (radix_tree_deref_retry(page)) { |
---|
2029 | | - slot = radix_tree_iter_retry(&iter); |
---|
2030 | | - continue; |
---|
2031 | | - } |
---|
2032 | | - |
---|
2033 | | - /* |
---|
2034 | | - * A shadow entry of a recently evicted page, a swap |
---|
2035 | | - * entry from shmem/tmpfs or a DAX entry. Return it |
---|
2036 | | - * without attempting to raise page count. |
---|
2037 | | - */ |
---|
2038 | | - goto export; |
---|
2039 | | - } |
---|
2040 | | - |
---|
2041 | | - head = compound_head(page); |
---|
2042 | | - if (!page_cache_get_speculative(head)) |
---|
2043 | | - goto repeat; |
---|
2044 | | - |
---|
2045 | | - /* The page was split under us? */ |
---|
2046 | | - if (compound_head(page) != head) { |
---|
2047 | | - put_page(head); |
---|
2048 | | - goto repeat; |
---|
2049 | | - } |
---|
2050 | | - |
---|
2051 | | - /* Has the page moved? */ |
---|
2052 | | - if (unlikely(page != *slot)) { |
---|
2053 | | - put_page(head); |
---|
2054 | | - goto repeat; |
---|
2055 | | - } |
---|
2056 | | -export: |
---|
2057 | | - indices[ret] = iter.index; |
---|
2058 | | - entries[ret] = page; |
---|
2059 | | - if (++ret == nr_entries) |
---|
2060 | | - break; |
---|
2061 | | - } |
---|
2062 | | - rcu_read_unlock(); |
---|
2063 | | - return ret; |
---|
2064 | | -} |
---|
2065 | | -EXPORT_SYMBOL(find_get_entries_tag); |
---|
2066 | 2160 | |
---|
2067 | 2161 | /* |
---|
2068 | 2162 | * CD/DVDs are error prone. When a medium error occurs, the driver may fail |
---|
.. | .. |
---|
2079 | 2173 | * |
---|
2080 | 2174 | * It is going insane. Fix it by quickly scaling down the readahead size. |
---|
2081 | 2175 | */ |
---|
2082 | | -static void shrink_readahead_size_eio(struct file *filp, |
---|
2083 | | - struct file_ra_state *ra) |
---|
| 2176 | +static void shrink_readahead_size_eio(struct file_ra_state *ra) |
---|
2084 | 2177 | { |
---|
2085 | 2178 | ra->ra_pages /= 4; |
---|
2086 | 2179 | } |
---|
.. | .. |
---|
2096 | 2189 | * |
---|
2097 | 2190 | * This is really ugly. But the goto's actually try to clarify some |
---|
2098 | 2191 | * of the logic when it comes to error handling etc. |
---|
| 2192 | + * |
---|
| 2193 | + * Return: |
---|
| 2194 | + * * total number of bytes copied, including those that were already @written |
---|
| 2195 | + * * negative error code if nothing was copied |
---|
2099 | 2196 | */ |
---|
2100 | | -static ssize_t generic_file_buffered_read(struct kiocb *iocb, |
---|
| 2197 | +ssize_t generic_file_buffered_read(struct kiocb *iocb, |
---|
2101 | 2198 | struct iov_iter *iter, ssize_t written) |
---|
2102 | 2199 | { |
---|
2103 | 2200 | struct file *filp = iocb->ki_filp; |
---|
.. | .. |
---|
2114 | 2211 | |
---|
2115 | 2212 | if (unlikely(*ppos >= inode->i_sb->s_maxbytes)) |
---|
2116 | 2213 | return 0; |
---|
| 2214 | + if (unlikely(!iov_iter_count(iter))) |
---|
| 2215 | + return 0; |
---|
| 2216 | + |
---|
2117 | 2217 | iov_iter_truncate(iter, inode->i_sb->s_maxbytes); |
---|
2118 | 2218 | |
---|
2119 | 2219 | index = *ppos >> PAGE_SHIFT; |
---|
.. | .. |
---|
2121 | 2221 | prev_offset = ra->prev_pos & (PAGE_SIZE-1); |
---|
2122 | 2222 | last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; |
---|
2123 | 2223 | offset = *ppos & ~PAGE_MASK; |
---|
| 2224 | + |
---|
| 2225 | + /* |
---|
| 2226 | + * If we've already successfully copied some data, then we |
---|
| 2227 | + * can no longer safely return -EIOCBQUEUED. Hence mark |
---|
| 2228 | + * an async read NOWAIT at that point. |
---|
| 2229 | + */ |
---|
| 2230 | + if (written && (iocb->ki_flags & IOCB_WAITQ)) |
---|
| 2231 | + iocb->ki_flags |= IOCB_NOWAIT; |
---|
2124 | 2232 | |
---|
2125 | 2233 | for (;;) { |
---|
2126 | 2234 | struct page *page; |
---|
.. | .. |
---|
2137 | 2245 | |
---|
2138 | 2246 | page = find_get_page(mapping, index); |
---|
2139 | 2247 | if (!page) { |
---|
2140 | | - if (iocb->ki_flags & IOCB_NOWAIT) |
---|
| 2248 | + if (iocb->ki_flags & IOCB_NOIO) |
---|
2141 | 2249 | goto would_block; |
---|
2142 | 2250 | page_cache_sync_readahead(mapping, |
---|
2143 | 2251 | ra, filp, |
---|
.. | .. |
---|
2147 | 2255 | goto no_cached_page; |
---|
2148 | 2256 | } |
---|
2149 | 2257 | if (PageReadahead(page)) { |
---|
| 2258 | + if (iocb->ki_flags & IOCB_NOIO) { |
---|
| 2259 | + put_page(page); |
---|
| 2260 | + goto out; |
---|
| 2261 | + } |
---|
2150 | 2262 | page_cache_async_readahead(mapping, |
---|
2151 | 2263 | ra, filp, page, |
---|
2152 | 2264 | index, last_index - index); |
---|
2153 | 2265 | } |
---|
2154 | 2266 | if (!PageUptodate(page)) { |
---|
2155 | | - if (iocb->ki_flags & IOCB_NOWAIT) { |
---|
2156 | | - put_page(page); |
---|
2157 | | - goto would_block; |
---|
2158 | | - } |
---|
2159 | | - |
---|
2160 | 2267 | /* |
---|
2161 | 2268 | * See comment in do_read_cache_page on why |
---|
2162 | 2269 | * wait_on_page_locked is used to avoid unnecessarily |
---|
2163 | 2270 | * serialisations and why it's safe. |
---|
2164 | 2271 | */ |
---|
2165 | | - error = wait_on_page_locked_killable(page); |
---|
| 2272 | + if (iocb->ki_flags & IOCB_WAITQ) { |
---|
| 2273 | + if (written) { |
---|
| 2274 | + put_page(page); |
---|
| 2275 | + goto out; |
---|
| 2276 | + } |
---|
| 2277 | + error = wait_on_page_locked_async(page, |
---|
| 2278 | + iocb->ki_waitq); |
---|
| 2279 | + } else { |
---|
| 2280 | + if (iocb->ki_flags & IOCB_NOWAIT) { |
---|
| 2281 | + put_page(page); |
---|
| 2282 | + goto would_block; |
---|
| 2283 | + } |
---|
| 2284 | + error = wait_on_page_locked_killable(page); |
---|
| 2285 | + } |
---|
2166 | 2286 | if (unlikely(error)) |
---|
2167 | 2287 | goto readpage_error; |
---|
2168 | 2288 | if (PageUptodate(page)) |
---|
.. | .. |
---|
2172 | 2292 | !mapping->a_ops->is_partially_uptodate) |
---|
2173 | 2293 | goto page_not_up_to_date; |
---|
2174 | 2294 | /* pipes can't handle partially uptodate pages */ |
---|
2175 | | - if (unlikely(iter->type & ITER_PIPE)) |
---|
| 2295 | + if (unlikely(iov_iter_is_pipe(iter))) |
---|
2176 | 2296 | goto page_not_up_to_date; |
---|
2177 | 2297 | if (!trylock_page(page)) |
---|
2178 | 2298 | goto page_not_up_to_date; |
---|
.. | .. |
---|
2250 | 2370 | |
---|
2251 | 2371 | page_not_up_to_date: |
---|
2252 | 2372 | /* Get exclusive access to the page ... */ |
---|
2253 | | - error = lock_page_killable(page); |
---|
| 2373 | + if (iocb->ki_flags & IOCB_WAITQ) { |
---|
| 2374 | + if (written) { |
---|
| 2375 | + put_page(page); |
---|
| 2376 | + goto out; |
---|
| 2377 | + } |
---|
| 2378 | + error = lock_page_async(page, iocb->ki_waitq); |
---|
| 2379 | + } else { |
---|
| 2380 | + error = lock_page_killable(page); |
---|
| 2381 | + } |
---|
2254 | 2382 | if (unlikely(error)) |
---|
2255 | 2383 | goto readpage_error; |
---|
2256 | 2384 | |
---|
.. | .. |
---|
2269 | 2397 | } |
---|
2270 | 2398 | |
---|
2271 | 2399 | readpage: |
---|
| 2400 | + if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) { |
---|
| 2401 | + unlock_page(page); |
---|
| 2402 | + put_page(page); |
---|
| 2403 | + goto would_block; |
---|
| 2404 | + } |
---|
2272 | 2405 | /* |
---|
2273 | 2406 | * A previous I/O error may have been due to temporary |
---|
2274 | 2407 | * failures, eg. multipath errors. |
---|
.. | .. |
---|
2288 | 2421 | } |
---|
2289 | 2422 | |
---|
2290 | 2423 | if (!PageUptodate(page)) { |
---|
2291 | | - error = lock_page_killable(page); |
---|
| 2424 | + if (iocb->ki_flags & IOCB_WAITQ) { |
---|
| 2425 | + if (written) { |
---|
| 2426 | + put_page(page); |
---|
| 2427 | + goto out; |
---|
| 2428 | + } |
---|
| 2429 | + error = lock_page_async(page, iocb->ki_waitq); |
---|
| 2430 | + } else { |
---|
| 2431 | + error = lock_page_killable(page); |
---|
| 2432 | + } |
---|
| 2433 | + |
---|
2292 | 2434 | if (unlikely(error)) |
---|
2293 | 2435 | goto readpage_error; |
---|
2294 | 2436 | if (!PageUptodate(page)) { |
---|
.. | .. |
---|
2301 | 2443 | goto find_page; |
---|
2302 | 2444 | } |
---|
2303 | 2445 | unlock_page(page); |
---|
2304 | | - shrink_readahead_size_eio(filp, ra); |
---|
| 2446 | + shrink_readahead_size_eio(ra); |
---|
2305 | 2447 | error = -EIO; |
---|
2306 | 2448 | goto readpage_error; |
---|
2307 | 2449 | } |
---|
.. | .. |
---|
2349 | 2491 | file_accessed(filp); |
---|
2350 | 2492 | return written ? written : error; |
---|
2351 | 2493 | } |
---|
| 2494 | +EXPORT_SYMBOL_GPL(generic_file_buffered_read); |
---|
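The new EXPORT_SYMBOL_GPL suggests out-of-file callers; as a hypothetical sketch, a filesystem ->read_iter() could fall back to the generic buffered path after its own fast path (myfs_read_iter() is made up for illustration).

#include <linux/fs.h>
#include <linux/uio.h>

static ssize_t myfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	ssize_t ret = 0;

	/* ... filesystem-specific direct-I/O or fast path elided ... */

	if (iov_iter_count(to))
		ret = generic_file_buffered_read(iocb, to, ret);
	return ret;
}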
2352 | 2495 | |
---|
2353 | 2496 | /** |
---|
2354 | 2497 | * generic_file_read_iter - generic filesystem read routine |
---|
.. | .. |
---|
2357 | 2500 | * |
---|
2358 | 2501 | * This is the "read_iter()" routine for all filesystems |
---|
2359 | 2502 | * that can use the page cache directly. |
---|
| 2503 | + * |
---|
| 2504 | + * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall |
---|
| 2505 | + * be returned when no data can be read without waiting for I/O requests |
---|
| 2506 | + * to complete; it doesn't prevent readahead. |
---|
| 2507 | + * |
---|
| 2508 | + * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O |
---|
| 2509 | + * requests shall be made for the read or for readahead. When no data |
---|
| 2510 | + * can be read, -EAGAIN shall be returned. When readahead would be |
---|
| 2511 | + * triggered, a partial, possibly empty read shall be returned. |
---|
| 2512 | + * |
---|
| 2513 | + * Return: |
---|
| 2514 | + * * number of bytes copied, even for partial reads |
---|
| 2515 | + * * negative error code (or 0 if IOCB_NOIO) if nothing was read |
---|
2360 | 2516 | */ |
---|
2361 | 2517 | ssize_t |
---|
2362 | 2518 | generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) |
---|
.. | .. |
---|
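To illustrate the IOCB_NOIO contract documented above (editor's sketch, hypothetical helper): probe the page cache without starting any I/O, and let the caller fall back to a sleeping path on a short read or -EAGAIN.

#include <linux/fs.h>
#include <linux/uio.h>

static ssize_t read_cached_only(struct file *file, struct iov_iter *to,
				loff_t pos)
{
	struct kiocb kiocb;

	init_sync_kiocb(&kiocb, file);
	kiocb.ki_pos = pos;
	kiocb.ki_flags |= IOCB_NOIO;	/* no readahead, no readpage */

	return generic_file_read_iter(&kiocb, to);
}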
2417 | 2573 | |
---|
2418 | 2574 | #ifdef CONFIG_MMU |
---|
2419 | 2575 | #define MMAP_LOTSAMISS (100) |
---|
2420 | | -static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, |
---|
2421 | | - struct file *fpin) |
---|
2422 | | -{ |
---|
2423 | | - int flags = vmf->flags; |
---|
2424 | | - |
---|
2425 | | - if (fpin) |
---|
2426 | | - return fpin; |
---|
2427 | | - |
---|
2428 | | - /* |
---|
2429 | | - * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or |
---|
2430 | | - * anything, so we only pin the file and drop the mmap_sem if only |
---|
2431 | | - * FAULT_FLAG_ALLOW_RETRY is set. |
---|
2432 | | - */ |
---|
2433 | | - if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) == |
---|
2434 | | - FAULT_FLAG_ALLOW_RETRY) { |
---|
2435 | | - fpin = get_file(vmf->vma->vm_file); |
---|
2436 | | - up_read(&vmf->vma->vm_mm->mmap_sem); |
---|
2437 | | - } |
---|
2438 | | - return fpin; |
---|
2439 | | -} |
---|
2440 | | - |
---|
2441 | 2576 | /* |
---|
2442 | | - * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem |
---|
| 2577 | + * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock |
---|
2443 | 2578 | * @vmf - the vm_fault for this fault. |
---|
2444 | 2579 | * @page - the page to lock. |
---|
2445 | 2580 | * @fpin - the pointer to the file we may pin (or is already pinned). |
---|
2446 | 2581 | * |
---|
2447 | | - * This works similar to lock_page_or_retry in that it can drop the mmap_sem. |
---|
| 2582 | + * This works similarly to lock_page_or_retry in that it can drop the mmap_lock. |
---|
2448 | 2583 | * It differs in that it actually returns the page locked if it returns 1 and 0 |
---|
2449 | | - * if it couldn't lock the page. If we did have to drop the mmap_sem then fpin |
---|
| 2584 | + * if it couldn't lock the page. If we did have to drop the mmap_lock then fpin |
---|
2450 | 2585 | * will point to the pinned file and needs to be fput()'ed at a later point. |
---|
2451 | 2586 | */ |
---|
2452 | 2587 | static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page, |
---|
.. | .. |
---|
2457 | 2592 | |
---|
2458 | 2593 | /* |
---|
2459 | 2594 | * NOTE! This will make us return with VM_FAULT_RETRY, but with |
---|
2460 | | - * the mmap_sem still held. That's how FAULT_FLAG_RETRY_NOWAIT |
---|
| 2595 | + * the mmap_lock still held. That's how FAULT_FLAG_RETRY_NOWAIT |
---|
2461 | 2596 | * is supposed to work. We have way too many special cases.. |
---|
2462 | 2597 | */ |
---|
2463 | 2598 | if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) |
---|
.. | .. |
---|
2467 | 2602 | if (vmf->flags & FAULT_FLAG_KILLABLE) { |
---|
2468 | 2603 | if (__lock_page_killable(page)) { |
---|
2469 | 2604 | /* |
---|
2470 | | - * We didn't have the right flags to drop the mmap_sem, |
---|
| 2605 | + * We didn't have the right flags to drop the mmap_lock, |
---|
2471 | 2606 | * but all fault_handlers only check for fatal signals |
---|
2472 | 2607 | * if we return VM_FAULT_RETRY, so we need to drop the |
---|
2473 | | - * mmap_sem here and return 0 if we don't have a fpin. |
---|
| 2608 | + * mmap_lock here and return 0 if we don't have a fpin. |
---|
2474 | 2609 | */ |
---|
2475 | 2610 | if (*fpin == NULL) |
---|
2476 | | - up_read(&vmf->vma->vm_mm->mmap_sem); |
---|
| 2611 | + mmap_read_unlock(vmf->vma->vm_mm); |
---|
2477 | 2612 | return 0; |
---|
2478 | 2613 | } |
---|
2479 | 2614 | } else |
---|
.. | .. |
---|
2494 | 2629 | struct file *file = vmf->vma->vm_file; |
---|
2495 | 2630 | struct file_ra_state *ra = &file->f_ra; |
---|
2496 | 2631 | struct address_space *mapping = file->f_mapping; |
---|
| 2632 | + DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff); |
---|
2497 | 2633 | struct file *fpin = NULL; |
---|
2498 | | - pgoff_t offset = vmf->pgoff; |
---|
| 2634 | + unsigned int mmap_miss; |
---|
2499 | 2635 | |
---|
2500 | 2636 | /* If we don't want any read-ahead, don't bother */ |
---|
2501 | 2637 | if (vmf->vma->vm_flags & VM_RAND_READ) |
---|
.. | .. |
---|
2505 | 2641 | |
---|
2506 | 2642 | if (vmf->vma->vm_flags & VM_SEQ_READ) { |
---|
2507 | 2643 | fpin = maybe_unlock_mmap_for_io(vmf, fpin); |
---|
2508 | | - page_cache_sync_readahead(mapping, ra, file, offset, |
---|
2509 | | - ra->ra_pages); |
---|
| 2644 | + page_cache_sync_ra(&ractl, ra, ra->ra_pages); |
---|
2510 | 2645 | return fpin; |
---|
2511 | 2646 | } |
---|
2512 | 2647 | |
---|
2513 | 2648 | /* Avoid banging the cache line if not needed */ |
---|
2514 | | - if (ra->mmap_miss < MMAP_LOTSAMISS * 10) |
---|
2515 | | - ra->mmap_miss++; |
---|
| 2649 | + mmap_miss = READ_ONCE(ra->mmap_miss); |
---|
| 2650 | + if (mmap_miss < MMAP_LOTSAMISS * 10) |
---|
| 2651 | + WRITE_ONCE(ra->mmap_miss, ++mmap_miss); |
---|
2516 | 2652 | |
---|
2517 | 2653 | /* |
---|
2518 | 2654 | * Do we miss much more than hit in this file? If so, |
---|
2519 | 2655 | * stop bothering with read-ahead. It will only hurt. |
---|
2520 | 2656 | */ |
---|
2521 | | - if (ra->mmap_miss > MMAP_LOTSAMISS) |
---|
| 2657 | + if (mmap_miss > MMAP_LOTSAMISS) |
---|
2522 | 2658 | return fpin; |
---|
2523 | 2659 | |
---|
2524 | 2660 | /* |
---|
2525 | 2661 | * mmap read-around |
---|
2526 | 2662 | */ |
---|
2527 | 2663 | fpin = maybe_unlock_mmap_for_io(vmf, fpin); |
---|
2528 | | - ra->start = max_t(long, 0, offset - ra->ra_pages / 2); |
---|
| 2664 | + ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); |
---|
2529 | 2665 | ra->size = ra->ra_pages; |
---|
2530 | 2666 | ra->async_size = ra->ra_pages / 4; |
---|
2531 | | - ra_submit(ra, mapping, file); |
---|
| 2667 | + trace_android_vh_tune_mmap_readaround(ra->ra_pages, vmf->pgoff, |
---|
| 2668 | + &ra->start, &ra->size, &ra->async_size); |
---|
| 2669 | + ractl._index = ra->start; |
---|
| 2670 | + do_page_cache_ra(&ractl, ra->size, ra->async_size); |
---|
2532 | 2671 | return fpin; |
---|
2533 | 2672 | } |
---|
2534 | 2673 | |
---|
2535 | 2674 | /* |
---|
2536 | 2675 | * Asynchronous readahead happens when we find the page and PG_readahead, |
---|
2537 | 2676 | * so we want to possibly extend the readahead further. We return the file that |
---|
2538 | | - * was pinned if we have to drop the mmap_sem in order to do IO. |
---|
| 2677 | + * was pinned if we have to drop the mmap_lock in order to do IO. |
---|
2539 | 2678 | */ |
---|
2540 | 2679 | static struct file *do_async_mmap_readahead(struct vm_fault *vmf, |
---|
2541 | 2680 | struct page *page) |
---|
.. | .. |
---|
2544 | 2683 | struct file_ra_state *ra = &file->f_ra; |
---|
2545 | 2684 | struct address_space *mapping = file->f_mapping; |
---|
2546 | 2685 | struct file *fpin = NULL; |
---|
| 2686 | + unsigned int mmap_miss; |
---|
2547 | 2687 | pgoff_t offset = vmf->pgoff; |
---|
2548 | 2688 | |
---|
2549 | 2689 | /* If we don't want any read-ahead, don't bother */ |
---|
2550 | 2690 | if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages) |
---|
2551 | 2691 | return fpin; |
---|
2552 | | - if (ra->mmap_miss > 0) |
---|
2553 | | - ra->mmap_miss--; |
---|
| 2692 | + mmap_miss = READ_ONCE(ra->mmap_miss); |
---|
| 2693 | + if (mmap_miss) |
---|
| 2694 | + WRITE_ONCE(ra->mmap_miss, --mmap_miss); |
---|
2554 | 2695 | if (PageReadahead(page)) { |
---|
2555 | 2696 | fpin = maybe_unlock_mmap_for_io(vmf, fpin); |
---|
2556 | 2697 | page_cache_async_readahead(mapping, ra, file, |
---|
.. | .. |
---|
2570 | 2711 | * it in the page cache, and handles the special cases reasonably without |
---|
2571 | 2712 | * having a lot of duplicated code. |
---|
2572 | 2713 | * |
---|
2573 | | - * vma->vm_mm->mmap_sem must be held on entry. |
---|
| 2714 | + * If FAULT_FLAG_SPECULATIVE is set, this function runs with elevated vma |
---|
| 2715 | + * refcount and with mmap lock not held. |
---|
| 2716 | + * Otherwise, vma->vm_mm->mmap_lock must be held on entry. |
---|
2574 | 2717 | * |
---|
2575 | | - * If our return value has VM_FAULT_RETRY set, it's because |
---|
2576 | | - * lock_page_or_retry() returned 0. |
---|
2577 | | - * The mmap_sem has usually been released in this case. |
---|
2578 | | - * See __lock_page_or_retry() for the exception. |
---|
| 2718 | + * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock |
---|
| 2719 | + * may be dropped before doing I/O or by lock_page_maybe_drop_mmap(). |
---|
2579 | 2720 | * |
---|
2580 | | - * If our return value does not have VM_FAULT_RETRY set, the mmap_sem |
---|
| 2721 | + * If our return value does not have VM_FAULT_RETRY set, the mmap_lock |
---|
2581 | 2722 | * has not been released. |
---|
2582 | 2723 | * |
---|
2583 | 2724 | * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. |
---|
| 2725 | + * |
---|
| 2726 | + * Return: bitwise-OR of %VM_FAULT_ codes. |
---|
2584 | 2727 | */ |
---|
2585 | 2728 | vm_fault_t filemap_fault(struct vm_fault *vmf) |
---|
2586 | 2729 | { |
---|
.. | .. |
---|
2592 | 2735 | struct inode *inode = mapping->host; |
---|
2593 | 2736 | pgoff_t offset = vmf->pgoff; |
---|
2594 | 2737 | pgoff_t max_off; |
---|
2595 | | - struct page *page; |
---|
| 2738 | + struct page *page = NULL; |
---|
2596 | 2739 | vm_fault_t ret = 0; |
---|
| 2740 | + bool retry = false; |
---|
| 2741 | + |
---|
| 2742 | + if (vmf->flags & FAULT_FLAG_SPECULATIVE) { |
---|
| 2743 | + page = find_get_page(mapping, offset); |
---|
| 2744 | + if (unlikely(!page)) |
---|
| 2745 | + return VM_FAULT_RETRY; |
---|
| 2746 | + |
---|
| 2747 | + if (unlikely(PageReadahead(page))) |
---|
| 2748 | + goto page_put; |
---|
| 2749 | + |
---|
| 2750 | + if (!trylock_page(page)) |
---|
| 2751 | + goto page_put; |
---|
| 2752 | + |
---|
| 2753 | + if (unlikely(compound_head(page)->mapping != mapping)) |
---|
| 2754 | + goto page_unlock; |
---|
| 2755 | + VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); |
---|
| 2756 | + if (unlikely(!PageUptodate(page))) |
---|
| 2757 | + goto page_unlock; |
---|
| 2758 | + |
---|
| 2759 | + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); |
---|
| 2760 | + if (unlikely(offset >= max_off)) |
---|
| 2761 | + goto page_unlock; |
---|
| 2762 | + |
---|
| 2763 | + /* |
---|
| 2764 | + * Update readahead mmap_miss statistic. |
---|
| 2765 | + * |
---|
| 2766 | + * Note that we are not sure if finish_fault() will |
---|
| 2767 | + * manage to complete the transaction. If it fails, |
---|
| 2768 | + * we'll come back to filemap_fault() non-speculative |
---|
| 2769 | + * case which will update mmap_miss a second time. |
---|
| 2770 | + * This is not ideal, we would prefer to guarantee the |
---|
| 2771 | + * update will happen exactly once. |
---|
| 2772 | + */ |
---|
| 2773 | + if (!(vmf->vma->vm_flags & VM_RAND_READ) && ra->ra_pages) { |
---|
| 2774 | + unsigned int mmap_miss = READ_ONCE(ra->mmap_miss); |
---|
| 2775 | + if (mmap_miss) |
---|
| 2776 | + WRITE_ONCE(ra->mmap_miss, --mmap_miss); |
---|
| 2777 | + } |
---|
| 2778 | + |
---|
| 2779 | + vmf->page = page; |
---|
| 2780 | + return VM_FAULT_LOCKED; |
---|
| 2781 | +page_unlock: |
---|
| 2782 | + unlock_page(page); |
---|
| 2783 | +page_put: |
---|
| 2784 | + put_page(page); |
---|
| 2785 | + return VM_FAULT_RETRY; |
---|
| 2786 | + } |
---|
2597 | 2787 | |
---|
2598 | 2788 | max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); |
---|
2599 | 2789 | if (unlikely(offset >= max_off)) |
---|
2600 | 2790 | return VM_FAULT_SIGBUS; |
---|
| 2791 | + |
---|
| 2792 | + trace_android_vh_filemap_fault_get_page(vmf, &page, &retry); |
---|
| 2793 | + if (unlikely(retry)) |
---|
| 2794 | + goto out_retry; |
---|
| 2795 | + if (unlikely(page)) |
---|
| 2796 | + goto page_ok; |
---|
2601 | 2797 | |
---|
2602 | 2798 | /* |
---|
2603 | 2799 | * Do we have something in the page cache already? |
---|
.. | .. |
---|
2630 | 2826 | goto out_retry; |
---|
2631 | 2827 | |
---|
2632 | 2828 | /* Did it get truncated? */ |
---|
2633 | | - if (unlikely(page->mapping != mapping)) { |
---|
| 2829 | + if (unlikely(compound_head(page)->mapping != mapping)) { |
---|
2634 | 2830 | unlock_page(page); |
---|
2635 | 2831 | put_page(page); |
---|
2636 | 2832 | goto retry_find; |
---|
2637 | 2833 | } |
---|
2638 | | - VM_BUG_ON_PAGE(page->index != offset, page); |
---|
| 2834 | + VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); |
---|
2639 | 2835 | |
---|
2640 | 2836 | /* |
---|
2641 | 2837 | * We have a locked page in the page cache, now we need to check |
---|
.. | .. |
---|
2645 | 2841 | goto page_not_uptodate; |
---|
2646 | 2842 | |
---|
2647 | 2843 | /* |
---|
2648 | | - * We've made it this far and we had to drop our mmap_sem, now is the |
---|
| 2844 | + * We've made it this far and we had to drop our mmap_lock, now is the |
---|
2649 | 2845 | * time to return to the upper layer and have it re-find the vma and |
---|
2650 | 2846 | * redo the fault. |
---|
2651 | 2847 | */ |
---|
.. | .. |
---|
2654 | 2850 | goto out_retry; |
---|
2655 | 2851 | } |
---|
2656 | 2852 | |
---|
| 2853 | +page_ok: |
---|
2657 | 2854 | /* |
---|
2658 | 2855 | * Found the page and have a reference on it. |
---|
2659 | 2856 | * We must recheck i_size under page lock. |
---|
.. | .. |
---|
2690 | 2887 | if (!error || error == AOP_TRUNCATED_PAGE) |
---|
2691 | 2888 | goto retry_find; |
---|
2692 | 2889 | |
---|
2693 | | - /* Things didn't work out. Return zero to tell the mm layer so. */ |
---|
2694 | | - shrink_readahead_size_eio(file, ra); |
---|
| 2890 | + shrink_readahead_size_eio(ra); |
---|
2695 | 2891 | return VM_FAULT_SIGBUS; |
---|
2696 | 2892 | |
---|
2697 | 2893 | out_retry: |
---|
2698 | 2894 | /* |
---|
2699 | | - * We dropped the mmap_sem, we need to return to the fault handler to |
---|
| 2895 | + * We dropped the mmap_lock, so we need to return to the fault handler to |
---|
2700 | 2896 | * re-find the vma and come back and find our hopefully still populated |
---|
2701 | 2897 | * page. |
---|
2702 | 2898 | */ |
---|
2703 | | - if (page) |
---|
| 2899 | + if (page) { |
---|
| 2900 | + trace_android_vh_filemap_fault_cache_page(vmf, page); |
---|
2704 | 2901 | put_page(page); |
---|
| 2902 | + } |
---|
2705 | 2903 | if (fpin) |
---|
2706 | 2904 | fput(fpin); |
---|
2707 | 2905 | return ret | VM_FAULT_RETRY; |
---|
2708 | 2906 | } |
---|
2709 | 2907 | EXPORT_SYMBOL(filemap_fault); |
---|
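Both the new speculative fast path and the regular path in filemap_fault() above rely on the same revalidation idea: once the page is locked, recheck that it still belongs to the mapping, is uptodate, and lies below i_size before using it. Below is a condensed, illustrative sketch of just that pattern; it is not part of this patch, the helper name is hypothetical, and the caller is assumed to already hold the page lock.

```c
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/*
 * Illustrative only: revalidate a locked page-cache page.
 * Returns true if @page still belongs to @mapping, is uptodate and
 * lies below i_size; callers must already hold the page lock.
 */
static bool revalidate_locked_page(struct address_space *mapping,
				   struct page *page, pgoff_t offset)
{
	pgoff_t max_off;

	if (compound_head(page)->mapping != mapping)	/* truncated or reclaimed */
		return false;
	if (!PageUptodate(page))		/* read failed or still in flight */
		return false;
	max_off = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
	return offset < max_off;		/* may have been truncated past EOF */
}
```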
2710 | 2908 | |
---|
2711 | | -void filemap_map_pages(struct vm_fault *vmf, |
---|
2712 | | - pgoff_t start_pgoff, pgoff_t end_pgoff) |
---|
| 2909 | +static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page) |
---|
2713 | 2910 | { |
---|
2714 | | - struct radix_tree_iter iter; |
---|
2715 | | - void **slot; |
---|
2716 | | - struct file *file = vmf->vma->vm_file; |
---|
2717 | | - struct address_space *mapping = file->f_mapping; |
---|
2718 | | - pgoff_t last_pgoff = start_pgoff; |
---|
| 2911 | + struct mm_struct *mm = vmf->vma->vm_mm; |
---|
| 2912 | + |
---|
| 2913 | + /* Huge page is mapped? No need to proceed. */ |
---|
| 2914 | + if (pmd_trans_huge(*vmf->pmd)) { |
---|
| 2915 | + unlock_page(page); |
---|
| 2916 | + put_page(page); |
---|
| 2917 | + return true; |
---|
| 2918 | + } |
---|
| 2919 | + |
---|
| 2920 | + if (pmd_none(*vmf->pmd) && PageTransHuge(page)) { |
---|
| 2921 | + vm_fault_t ret = do_set_pmd(vmf, page); |
---|
| 2922 | + if (!ret) { |
---|
| 2923 | + /* The page is mapped successfully, reference consumed. */ |
---|
| 2924 | + unlock_page(page); |
---|
| 2925 | + return true; |
---|
| 2926 | + } |
---|
| 2927 | + } |
---|
| 2928 | + |
---|
| 2929 | + if (pmd_none(*vmf->pmd)) { |
---|
| 2930 | + vmf->ptl = pmd_lock(mm, vmf->pmd); |
---|
| 2931 | + if (likely(pmd_none(*vmf->pmd))) { |
---|
| 2932 | + mm_inc_nr_ptes(mm); |
---|
| 2933 | + pmd_populate(mm, vmf->pmd, vmf->prealloc_pte); |
---|
| 2934 | + vmf->prealloc_pte = NULL; |
---|
| 2935 | + } |
---|
| 2936 | + spin_unlock(vmf->ptl); |
---|
| 2937 | + } |
---|
| 2938 | + |
---|
| 2939 | + /* See comment in handle_pte_fault() */ |
---|
| 2940 | + if (pmd_devmap_trans_unstable(vmf->pmd)) { |
---|
| 2941 | + unlock_page(page); |
---|
| 2942 | + put_page(page); |
---|
| 2943 | + return true; |
---|
| 2944 | + } |
---|
| 2945 | + |
---|
| 2946 | + return false; |
---|
| 2947 | +} |
---|
| 2948 | + |
---|
| 2949 | +static struct page *next_uptodate_page(struct page *page, |
---|
| 2950 | + struct address_space *mapping, |
---|
| 2951 | + struct xa_state *xas, pgoff_t end_pgoff) |
---|
| 2952 | +{ |
---|
2719 | 2953 | unsigned long max_idx; |
---|
2720 | | - struct page *head, *page; |
---|
2721 | 2954 | |
---|
2722 | | - rcu_read_lock(); |
---|
2723 | | - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start_pgoff) { |
---|
2724 | | - if (iter.index > end_pgoff) |
---|
2725 | | - break; |
---|
2726 | | -repeat: |
---|
2727 | | - page = radix_tree_deref_slot(slot); |
---|
2728 | | - if (unlikely(!page)) |
---|
2729 | | - goto next; |
---|
2730 | | - if (radix_tree_exception(page)) { |
---|
2731 | | - if (radix_tree_deref_retry(page)) { |
---|
2732 | | - slot = radix_tree_iter_retry(&iter); |
---|
2733 | | - continue; |
---|
2734 | | - } |
---|
2735 | | - goto next; |
---|
2736 | | - } |
---|
2737 | | - |
---|
2738 | | - head = compound_head(page); |
---|
2739 | | - if (!page_cache_get_speculative(head)) |
---|
2740 | | - goto repeat; |
---|
2741 | | - |
---|
2742 | | - /* The page was split under us? */ |
---|
2743 | | - if (compound_head(page) != head) { |
---|
2744 | | - put_page(head); |
---|
2745 | | - goto repeat; |
---|
2746 | | - } |
---|
2747 | | - |
---|
2748 | | - /* Has the page moved? */ |
---|
2749 | | - if (unlikely(page != *slot)) { |
---|
2750 | | - put_page(head); |
---|
2751 | | - goto repeat; |
---|
2752 | | - } |
---|
2753 | | - |
---|
2754 | | - if (!PageUptodate(page) || |
---|
2755 | | - PageReadahead(page) || |
---|
2756 | | - PageHWPoison(page)) |
---|
| 2955 | + do { |
---|
| 2956 | + if (!page) |
---|
| 2957 | + return NULL; |
---|
| 2958 | + if (xas_retry(xas, page)) |
---|
| 2959 | + continue; |
---|
| 2960 | + if (xa_is_value(page)) |
---|
| 2961 | + continue; |
---|
| 2962 | + if (PageLocked(page)) |
---|
| 2963 | + continue; |
---|
| 2964 | + if (!page_cache_get_speculative(page)) |
---|
| 2965 | + continue; |
---|
| 2966 | + /* Has the page moved or been split? */ |
---|
| 2967 | + if (unlikely(page != xas_reload(xas))) |
---|
| 2968 | + goto skip; |
---|
| 2969 | + if (!PageUptodate(page) || PageReadahead(page)) |
---|
| 2970 | + goto skip; |
---|
| 2971 | + if (PageHWPoison(page)) |
---|
2757 | 2972 | goto skip; |
---|
2758 | 2973 | if (!trylock_page(page)) |
---|
2759 | 2974 | goto skip; |
---|
2760 | | - |
---|
2761 | | - if (page->mapping != mapping || !PageUptodate(page)) |
---|
| 2975 | + if (page->mapping != mapping) |
---|
2762 | 2976 | goto unlock; |
---|
2763 | | - |
---|
| 2977 | + if (!PageUptodate(page)) |
---|
| 2978 | + goto unlock; |
---|
2764 | 2979 | max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); |
---|
2765 | | - if (page->index >= max_idx) |
---|
| 2980 | + if (xas->xa_index >= max_idx) |
---|
2766 | 2981 | goto unlock; |
---|
2767 | | - |
---|
2768 | | - if (file->f_ra.mmap_miss > 0) |
---|
2769 | | - file->f_ra.mmap_miss--; |
---|
2770 | | - |
---|
2771 | | - vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT; |
---|
2772 | | - if (vmf->pte) |
---|
2773 | | - vmf->pte += iter.index - last_pgoff; |
---|
2774 | | - last_pgoff = iter.index; |
---|
2775 | | - if (alloc_set_pte(vmf, NULL, page)) |
---|
2776 | | - goto unlock; |
---|
2777 | | - unlock_page(page); |
---|
2778 | | - goto next; |
---|
| 2982 | + return page; |
---|
2779 | 2983 | unlock: |
---|
2780 | 2984 | unlock_page(page); |
---|
2781 | 2985 | skip: |
---|
2782 | 2986 | put_page(page); |
---|
2783 | | -next: |
---|
2784 | | - /* Huge page is mapped? No need to proceed. */ |
---|
2785 | | - if (pmd_trans_huge(*vmf->pmd)) |
---|
2786 | | - break; |
---|
2787 | | - if (iter.index == end_pgoff) |
---|
2788 | | - break; |
---|
| 2987 | + } while ((page = xas_next_entry(xas, end_pgoff)) != NULL); |
---|
| 2988 | + |
---|
| 2989 | + return NULL; |
---|
| 2990 | +} |
---|
| 2991 | + |
---|
| 2992 | +static inline struct page *first_map_page(struct address_space *mapping, |
---|
| 2993 | + struct xa_state *xas, |
---|
| 2994 | + pgoff_t end_pgoff) |
---|
| 2995 | +{ |
---|
| 2996 | + return next_uptodate_page(xas_find(xas, end_pgoff), |
---|
| 2997 | + mapping, xas, end_pgoff); |
---|
| 2998 | +} |
---|
| 2999 | + |
---|
| 3000 | +static inline struct page *next_map_page(struct address_space *mapping, |
---|
| 3001 | + struct xa_state *xas, |
---|
| 3002 | + pgoff_t end_pgoff) |
---|
| 3003 | +{ |
---|
| 3004 | + return next_uptodate_page(xas_next_entry(xas, end_pgoff), |
---|
| 3005 | + mapping, xas, end_pgoff); |
---|
| 3006 | +} |
---|
| 3007 | + |
---|
| 3008 | +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT |
---|
| 3009 | +bool filemap_allow_speculation(void) |
---|
| 3010 | +{ |
---|
| 3011 | + return true; |
---|
| 3012 | +} |
---|
| 3013 | +EXPORT_SYMBOL_GPL(filemap_allow_speculation); |
---|
| 3014 | +#endif |
---|
| 3015 | + |
---|
| 3016 | +vm_fault_t filemap_map_pages(struct vm_fault *vmf, |
---|
| 3017 | + pgoff_t start_pgoff, pgoff_t end_pgoff) |
---|
| 3018 | +{ |
---|
| 3019 | + struct vm_area_struct *vma = vmf->vma; |
---|
| 3020 | + struct file *file = vma->vm_file; |
---|
| 3021 | + struct address_space *mapping = file->f_mapping; |
---|
| 3022 | + pgoff_t last_pgoff = start_pgoff; |
---|
| 3023 | + unsigned long addr; |
---|
| 3024 | + XA_STATE(xas, &mapping->i_pages, start_pgoff); |
---|
| 3025 | + struct page *head, *page; |
---|
| 3026 | + unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); |
---|
| 3027 | + vm_fault_t ret = (vmf->flags & FAULT_FLAG_SPECULATIVE) ? |
---|
| 3028 | + VM_FAULT_RETRY : 0; |
---|
| 3029 | + |
---|
| 3030 | + rcu_read_lock(); |
---|
| 3031 | + head = first_map_page(mapping, &xas, end_pgoff); |
---|
| 3032 | + if (!head) |
---|
| 3033 | + goto out; |
---|
| 3034 | + |
---|
| 3035 | + if (!(vmf->flags & FAULT_FLAG_SPECULATIVE) && |
---|
| 3036 | + filemap_map_pmd(vmf, head)) { |
---|
| 3037 | + ret = VM_FAULT_NOPAGE; |
---|
| 3038 | + goto out; |
---|
2789 | 3039 | } |
---|
| 3040 | + |
---|
| 3041 | + addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); |
---|
| 3042 | + if (!pte_map_lock_addr(vmf, addr)) { |
---|
| 3043 | + unlock_page(head); |
---|
| 3044 | + put_page(head); |
---|
| 3045 | + goto out; |
---|
| 3046 | + } |
---|
| 3047 | + |
---|
| 3048 | + do { |
---|
| 3049 | + page = find_subpage(head, xas.xa_index); |
---|
| 3050 | + if (PageHWPoison(page)) |
---|
| 3051 | + goto unlock; |
---|
| 3052 | + |
---|
| 3053 | + if (mmap_miss > 0) |
---|
| 3054 | + mmap_miss--; |
---|
| 3055 | + |
---|
| 3056 | + addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT; |
---|
| 3057 | + vmf->pte += xas.xa_index - last_pgoff; |
---|
| 3058 | + last_pgoff = xas.xa_index; |
---|
| 3059 | + |
---|
| 3060 | + if (!pte_none(*vmf->pte)) |
---|
| 3061 | + goto unlock; |
---|
| 3062 | + |
---|
| 3063 | + /* We're about to handle the fault */ |
---|
| 3064 | + if (vmf->address == addr) |
---|
| 3065 | + ret = VM_FAULT_NOPAGE; |
---|
| 3066 | + |
---|
| 3067 | + do_set_pte(vmf, page, addr); |
---|
| 3068 | + /* no need to invalidate: a not-present page won't be cached */ |
---|
| 3069 | + update_mmu_cache(vma, addr, vmf->pte); |
---|
| 3070 | + unlock_page(head); |
---|
| 3071 | + continue; |
---|
| 3072 | +unlock: |
---|
| 3073 | + unlock_page(head); |
---|
| 3074 | + put_page(head); |
---|
| 3075 | + } while ((head = next_map_page(mapping, &xas, end_pgoff)) != NULL); |
---|
| 3076 | + pte_unmap_unlock(vmf->pte, vmf->ptl); |
---|
| 3077 | +out: |
---|
2790 | 3078 | rcu_read_unlock(); |
---|
| 3079 | + WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss); |
---|
| 3080 | + return ret; |
---|
2791 | 3081 | } |
---|
2792 | 3082 | EXPORT_SYMBOL(filemap_map_pages); |
---|
2793 | 3083 | |
---|
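first_map_page()/next_map_page() above open-code an XArray walk with xas_find()/xas_next_entry() so they can also take a reference and trylock each page. For readers less familiar with that API, here is a minimal, self-contained sketch of the underlying lookup pattern; it is illustrative only, not part of this patch, and the function name and arguments are hypothetical.

```c
#include <linux/rcupdate.h>
#include <linux/xarray.h>

/* Visit every present (non-value) entry of @xa with index in [start, end]. */
static void walk_xarray_range(struct xarray *xa, unsigned long start,
			      unsigned long end)
{
	XA_STATE(xas, xa, start);
	void *entry;

	rcu_read_lock();
	xas_for_each(&xas, entry, end) {
		if (xas_retry(&xas, entry))	/* raced with a node change, retry */
			continue;
		if (xa_is_value(entry))		/* shadow/value entry, not a pointer */
			continue;
		/* entry is a real pointer; xas.xa_index is its index */
	}
	rcu_read_unlock();
}
```

The page-cache helpers above additionally take a speculative reference (page_cache_get_speculative()) and re-check the slot with xas_reload(), since only RCU protection is held and the page could otherwise be freed or moved under them.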
.. | .. |
---|
2821 | 3111 | .fault = filemap_fault, |
---|
2822 | 3112 | .map_pages = filemap_map_pages, |
---|
2823 | 3113 | .page_mkwrite = filemap_page_mkwrite, |
---|
| 3114 | +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT |
---|
| 3115 | + .allow_speculation = filemap_allow_speculation, |
---|
| 3116 | +#endif |
---|
2824 | 3117 | }; |
---|
2825 | 3118 | |
---|
2826 | 3119 | /* This is used for a general mmap of a disk file */ |
---|
.. | .. |
---|
2846 | 3139 | return generic_file_mmap(file, vma); |
---|
2847 | 3140 | } |
---|
2848 | 3141 | #else |
---|
2849 | | -int filemap_page_mkwrite(struct vm_fault *vmf) |
---|
| 3142 | +vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) |
---|
2850 | 3143 | { |
---|
2851 | | - return -ENOSYS; |
---|
| 3144 | + return VM_FAULT_SIGBUS; |
---|
2852 | 3145 | } |
---|
2853 | 3146 | int generic_file_mmap(struct file * file, struct vm_area_struct * vma) |
---|
2854 | 3147 | { |
---|
.. | .. |
---|
2895 | 3188 | put_page(page); |
---|
2896 | 3189 | if (err == -EEXIST) |
---|
2897 | 3190 | goto repeat; |
---|
2898 | | - /* Presumably ENOMEM for radix tree node */ |
---|
| 3191 | + /* Presumably ENOMEM for xarray node */ |
---|
2899 | 3192 | return ERR_PTR(err); |
---|
2900 | 3193 | } |
---|
2901 | 3194 | |
---|
.. | .. |
---|
2919 | 3212 | goto out; |
---|
2920 | 3213 | |
---|
2921 | 3214 | /* |
---|
2922 | | - * Page is not up to date and may be locked due one of the following |
---|
| 3215 | + * Page is not up to date and may be locked due to one of the following |
---|
2923 | 3216 | * case a: Page is being filled and the page lock is held |
---|
2924 | 3217 | * case b: Read/write error clearing the page uptodate status |
---|
2925 | 3218 | * case c: Truncation in progress (page locked) |
---|
.. | .. |
---|
2928 | 3221 | * Case a, the page will be up to date when the page is unlocked. |
---|
2929 | 3222 | * There is no need to serialise on the page lock here as the page |
---|
2930 | 3223 | * is pinned so the lock gives no additional protection. Even if the |
---|
2931 | | - * the page is truncated, the data is still valid if PageUptodate as |
---|
| 3224 | + * page is truncated, the data is still valid if PageUptodate as |
---|
2932 | 3225 | * it is simply a read vs truncate race. |
---|
2933 | 3226 | * Case b, the page will not be up to date |
---|
2934 | 3227 | * Case c, the page may be truncated but in itself, the data may still |
---|
.. | .. |
---|
2994 | 3287 | * not set, try to fill the page and wait for it to become unlocked. |
---|
2995 | 3288 | * |
---|
2996 | 3289 | * If the page does not get brought uptodate, return -EIO. |
---|
| 3290 | + * |
---|
| 3291 | + * Return: up to date page on success, ERR_PTR() on failure. |
---|
2997 | 3292 | */ |
---|
2998 | 3293 | struct page *read_cache_page(struct address_space *mapping, |
---|
2999 | 3294 | pgoff_t index, |
---|
3000 | 3295 | int (*filler)(void *, struct page *), |
---|
3001 | 3296 | void *data) |
---|
3002 | 3297 | { |
---|
3003 | | - return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); |
---|
| 3298 | + return do_read_cache_page(mapping, index, filler, data, |
---|
| 3299 | + mapping_gfp_mask(mapping)); |
---|
3004 | 3300 | } |
---|
3005 | 3301 | EXPORT_SYMBOL(read_cache_page); |
---|
3006 | 3302 | |
---|
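As a usage illustration of the ERR_PTR() convention documented above (not part of this patch; the caller, filler and data are placeholders), a filesystem might read a single page through the cache like this:

```c
#include <linux/err.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/* Hypothetical caller: read page @index of @mapping and consume it. */
static int read_one_page(struct address_space *mapping, pgoff_t index,
			 int (*filler)(void *, struct page *), void *data)
{
	struct page *page;

	page = read_cache_page(mapping, index, filler, data);
	if (IS_ERR(page))
		return PTR_ERR(page);	/* e.g. -EIO if it never became uptodate */

	/* The page is uptodate and we hold a reference on it. */
	/* ... map it with kmap(), use the data, kunmap() ... */

	put_page(page);			/* drop the reference taken above */
	return 0;
}
```

read_cache_page_gfp() below follows the same convention but takes a caller-supplied gfp mask and, with a NULL filler, falls back to the mapping's own readpage routine.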
.. | .. |
---|
3014 | 3310 | * any new page allocations done using the specified allocation flags. |
---|
3015 | 3311 | * |
---|
3016 | 3312 | * If the page does not get brought uptodate, return -EIO. |
---|
| 3313 | + * |
---|
| 3314 | + * Return: up to date page on success, ERR_PTR() on failure. |
---|
3017 | 3315 | */ |
---|
3018 | 3316 | struct page *read_cache_page_gfp(struct address_space *mapping, |
---|
3019 | 3317 | pgoff_t index, |
---|
.. | .. |
---|
3022 | 3320 | return do_read_cache_page(mapping, index, NULL, NULL, gfp); |
---|
3023 | 3321 | } |
---|
3024 | 3322 | EXPORT_SYMBOL(read_cache_page_gfp); |
---|
3025 | | - |
---|
3026 | | -/* |
---|
3027 | | - * Performs necessary checks before doing a write |
---|
3028 | | - * |
---|
3029 | | - * Can adjust writing position or amount of bytes to write. |
---|
3030 | | - * Returns appropriate error code that caller should return or |
---|
3031 | | - * zero in case that write should be allowed. |
---|
3032 | | - */ |
---|
3033 | | -inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) |
---|
3034 | | -{ |
---|
3035 | | - struct file *file = iocb->ki_filp; |
---|
3036 | | - struct inode *inode = file->f_mapping->host; |
---|
3037 | | - unsigned long limit = rlimit(RLIMIT_FSIZE); |
---|
3038 | | - loff_t pos; |
---|
3039 | | - |
---|
3040 | | - if (IS_SWAPFILE(inode)) |
---|
3041 | | - return -ETXTBSY; |
---|
3042 | | - |
---|
3043 | | - if (!iov_iter_count(from)) |
---|
3044 | | - return 0; |
---|
3045 | | - |
---|
3046 | | - /* FIXME: this is for backwards compatibility with 2.4 */ |
---|
3047 | | - if (iocb->ki_flags & IOCB_APPEND) |
---|
3048 | | - iocb->ki_pos = i_size_read(inode); |
---|
3049 | | - |
---|
3050 | | - pos = iocb->ki_pos; |
---|
3051 | | - |
---|
3052 | | - if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) |
---|
3053 | | - return -EINVAL; |
---|
3054 | | - |
---|
3055 | | - if (limit != RLIM_INFINITY) { |
---|
3056 | | - if (iocb->ki_pos >= limit) { |
---|
3057 | | - send_sig(SIGXFSZ, current, 0); |
---|
3058 | | - return -EFBIG; |
---|
3059 | | - } |
---|
3060 | | - iov_iter_truncate(from, limit - (unsigned long)pos); |
---|
3061 | | - } |
---|
3062 | | - |
---|
3063 | | - /* |
---|
3064 | | - * LFS rule |
---|
3065 | | - */ |
---|
3066 | | - if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS && |
---|
3067 | | - !(file->f_flags & O_LARGEFILE))) { |
---|
3068 | | - if (pos >= MAX_NON_LFS) |
---|
3069 | | - return -EFBIG; |
---|
3070 | | - iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos); |
---|
3071 | | - } |
---|
3072 | | - |
---|
3073 | | - /* |
---|
3074 | | - * Are we about to exceed the fs block limit ? |
---|
3075 | | - * |
---|
3076 | | - * If we have written data it becomes a short write. If we have |
---|
3077 | | - * exceeded without writing data we send a signal and return EFBIG. |
---|
3078 | | - * Linus frestrict idea will clean these up nicely.. |
---|
3079 | | - */ |
---|
3080 | | - if (unlikely(pos >= inode->i_sb->s_maxbytes)) |
---|
3081 | | - return -EFBIG; |
---|
3082 | | - |
---|
3083 | | - iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos); |
---|
3084 | | - return iov_iter_count(from); |
---|
3085 | | -} |
---|
3086 | | -EXPORT_SYMBOL(generic_write_checks); |
---|
3087 | 3323 | |
---|
3088 | 3324 | int pagecache_write_begin(struct file *file, struct address_space *mapping, |
---|
3089 | 3325 | loff_t pos, unsigned len, unsigned flags, |
---|
.. | .. |
---|
3106 | 3342 | } |
---|
3107 | 3343 | EXPORT_SYMBOL(pagecache_write_end); |
---|
3108 | 3344 | |
---|
| 3345 | +/* |
---|
| 3346 | + * Warn about a page cache invalidation failure during a direct I/O write. |
---|
| 3347 | + */ |
---|
| 3348 | +void dio_warn_stale_pagecache(struct file *filp) |
---|
| 3349 | +{ |
---|
| 3350 | + static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); |
---|
| 3351 | + char pathname[128]; |
---|
| 3352 | + struct inode *inode = file_inode(filp); |
---|
| 3353 | + char *path; |
---|
| 3354 | + |
---|
| 3355 | + errseq_set(&inode->i_mapping->wb_err, -EIO); |
---|
| 3356 | + if (__ratelimit(&_rs)) { |
---|
| 3357 | + path = file_path(filp, pathname, sizeof(pathname)); |
---|
| 3358 | + if (IS_ERR(path)) |
---|
| 3359 | + path = "(unknown)"; |
---|
| 3360 | + pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n"); |
---|
| 3361 | + pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, |
---|
| 3362 | + current->comm); |
---|
| 3363 | + } |
---|
| 3364 | +} |
---|
| 3365 | + |
---|
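dio_warn_stale_pagecache() above uses the kernel's ratelimit helpers so the warning fires at most once per interval. A minimal sketch of that pattern in isolation (illustrative only; the helper name and message are hypothetical):

```c
#include <linux/jiffies.h>
#include <linux/printk.h>
#include <linux/ratelimit.h>

/* Hypothetical helper: report a noisy condition at most once per hour. */
static void warn_rarely(void)
{
	static DEFINE_RATELIMIT_STATE(rs, 3600 * HZ, 1);	/* interval, burst */

	if (__ratelimit(&rs))
		pr_warn("noisy condition hit; further reports suppressed for an hour\n");
}
```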
3109 | 3366 | ssize_t |
---|
3110 | 3367 | generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) |
---|
3111 | 3368 | { |
---|
.. | .. |
---|
3123 | 3380 | if (iocb->ki_flags & IOCB_NOWAIT) { |
---|
3124 | 3381 | /* If there are pages to writeback, return */ |
---|
3125 | 3382 | if (filemap_range_has_page(inode->i_mapping, pos, |
---|
3126 | | - pos + iov_iter_count(from))) |
---|
| 3383 | + pos + write_len - 1)) |
---|
3127 | 3384 | return -EAGAIN; |
---|
3128 | 3385 | } else { |
---|
3129 | 3386 | written = filemap_write_and_wait_range(mapping, pos, |
---|
.. | .. |
---|
3163 | 3420 | * Most of the time we do not need this since dio_complete() will do |
---|
3164 | 3421 | * the invalidation for us. However there are some file systems that |
---|
3165 | 3422 | * do not end up with dio_complete() being called, so let's not break |
---|
3166 | | - * them by removing it completely |
---|
| 3423 | + * them by removing it completely. |
---|
| 3424 | + * |
---|
| 3425 | + * A notable example is blkdev_direct_IO(). |
---|
| 3426 | + * |
---|
| 3427 | + * Skip invalidation for async writes or if mapping has no pages. |
---|
3167 | 3428 | */ |
---|
3168 | | - if (mapping->nrpages) |
---|
3169 | | - invalidate_inode_pages2_range(mapping, |
---|
3170 | | - pos >> PAGE_SHIFT, end); |
---|
| 3429 | + if (written > 0 && mapping->nrpages && |
---|
| 3430 | + invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end)) |
---|
| 3431 | + dio_warn_stale_pagecache(file); |
---|
3171 | 3432 | |
---|
3172 | 3433 | if (written > 0) { |
---|
3173 | 3434 | pos += written; |
---|
.. | .. |
---|
3220 | 3481 | unsigned long offset; /* Offset into pagecache page */ |
---|
3221 | 3482 | unsigned long bytes; /* Bytes to write to page */ |
---|
3222 | 3483 | size_t copied; /* Bytes copied from user */ |
---|
3223 | | - void *fsdata; |
---|
| 3484 | + void *fsdata = NULL; |
---|
3224 | 3485 | |
---|
3225 | 3486 | offset = (pos & (PAGE_SIZE - 1)); |
---|
3226 | 3487 | bytes = min_t(unsigned long, PAGE_SIZE - offset, |
---|
.. | .. |
---|
3306 | 3567 | * This function does *not* take care of syncing data in case of O_SYNC write. |
---|
3307 | 3568 | * A caller has to handle it. This is mainly due to the fact that we want to |
---|
3308 | 3569 | * avoid syncing under i_mutex. |
---|
| 3570 | + * |
---|
| 3571 | + * Return: |
---|
| 3572 | + * * number of bytes written, even for truncated writes |
---|
| 3573 | + * * negative error code if no data has been written at all |
---|
3309 | 3574 | */ |
---|
3310 | 3575 | ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) |
---|
3311 | 3576 | { |
---|
.. | .. |
---|
3390 | 3655 | * This is a wrapper around __generic_file_write_iter() to be used by most |
---|
3391 | 3656 | * filesystems. It takes care of syncing the file in case of O_SYNC file |
---|
3392 | 3657 | * and acquires i_mutex as needed. |
---|
| 3658 | + * Return: |
---|
| 3659 | + * * negative error code if no data has been written at all or |
---|
| 3660 | + * vfs_fsync_range() failed for a synchronous write |
---|
| 3661 | + * * number of bytes written, even for truncated writes |
---|
3393 | 3662 | */ |
---|
3394 | 3663 | ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) |
---|
3395 | 3664 | { |
---|
.. | .. |
---|
3416 | 3685 | * @gfp_mask: memory allocation flags (and I/O mode) |
---|
3417 | 3686 | * |
---|
3418 | 3687 | * The address_space is to try to release any data against the page |
---|
3419 | | - * (presumably at page->private). If the release was successful, return '1'. |
---|
3420 | | - * Otherwise return zero. |
---|
| 3688 | + * (presumably at page->private). |
---|
3421 | 3689 | * |
---|
3422 | 3690 | * This may also be called if PG_fscache is set on a page, indicating that the |
---|
3423 | 3691 | * page is known to the local caching routines. |
---|
.. | .. |
---|
3425 | 3693 | * The @gfp_mask argument specifies whether I/O may be performed to release |
---|
3426 | 3694 | * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS). |
---|
3427 | 3695 | * |
---|
| 3696 | + * Return: %1 if the release was successful, otherwise return zero. |
---|
3428 | 3697 | */ |
---|
3429 | 3698 | int try_to_release_page(struct page *page, gfp_t gfp_mask) |
---|
3430 | 3699 | { |
---|