2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/mm/page-writeback.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
 * mm/page-writeback.c
 *
@@ -40,6 +41,9 @@
 #include <trace/events/writeback.h>
 
 #include "internal.h"
+
+#undef CREATE_TRACE_POINTS
+#include <trace/hooks/mm.h>
 
 /*
 * Sleep at most 200ms at a time in balance_dirty_pages().
@@ -256,7 +260,7 @@
 * requiring writeback.
 *
 * This number of dirtyable pages is the base value of which the
- * user-configurable dirty ratio is the effictive number of pages that
+ * user-configurable dirty ratio is the effective number of pages that
 * are allowed to be actually dirtied. Per individual zone, or
 * globally by using the sum of dirtyable pages over all zones.
 *
@@ -270,7 +274,7 @@
 * node_dirtyable_memory - number of dirtyable pages in a node
 * @pgdat: the node
 *
- * Returns the node's number of pages potentially available for dirty
+ * Return: the node's number of pages potentially available for dirty
 * page cache. This is the base value for the per-node dirty limits.
 */
 static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
@@ -355,7 +359,7 @@
 /**
 * global_dirtyable_memory - number of globally dirtyable pages
 *
- * Returns the global number of pages potentially available for dirty
+ * Return: the global number of pages potentially available for dirty
 * page cache. This is the base value for the global dirty limits.
 */
 static unsigned long global_dirtyable_memory(void)
@@ -386,8 +390,7 @@
 * Calculate @dtc->thresh and ->bg_thresh considering
 * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller
 * must ensure that @dtc->avail is set before calling this function. The
- * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
- * real-time tasks.
+ * dirty limits will be lifted by 1/4 for real-time tasks.
 */
 static void domain_dirty_limits(struct dirty_throttle_control *dtc)
 {
@@ -435,7 +438,7 @@
 if (bg_thresh >= thresh)
 bg_thresh = thresh / 2;
 tsk = current;
- if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
+ if (rt_task(tsk)) {
 bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
 thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
 }
@@ -470,7 +473,7 @@
 * node_dirty_limit - maximum number of dirty pages allowed in a node
 * @pgdat: the node
 *
- * Returns the maximum number of dirty pages allowed in a node, based
+ * Return: the maximum number of dirty pages allowed in a node, based
 * on the node's dirtyable memory.
 */
 static unsigned long node_dirty_limit(struct pglist_data *pgdat)
@@ -485,7 +488,7 @@
 else
 dirty = vm_dirty_ratio * node_memory / 100;
 
- if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
+ if (rt_task(tsk))
 dirty += dirty / 4;
 
 return dirty;
@@ -495,7 +498,7 @@
 * node_dirty_ok - tells whether a node is within its dirty limits
 * @pgdat: the node to check
 *
- * Returns %true when the dirty pages in @pgdat are within the node's
+ * Return: %true when the dirty pages in @pgdat are within the node's
 * dirty limit, %false if the limit is exceeded.
 */
 bool node_dirty_ok(struct pglist_data *pgdat)
@@ -504,15 +507,13 @@
 unsigned long nr_pages = 0;
 
 nr_pages += node_page_state(pgdat, NR_FILE_DIRTY);
- nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS);
 nr_pages += node_page_state(pgdat, NR_WRITEBACK);
 
 return nr_pages <= limit;
 }
 
 int dirty_background_ratio_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
 {
 int ret;
 
@@ -523,8 +524,7 @@
 }
 
 int dirty_background_bytes_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
 {
 int ret;
 
@@ -534,9 +534,8 @@
 return ret;
 }
 
-int dirty_ratio_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
 {
 int old_ratio = vm_dirty_ratio;
 int ret;
@@ -550,8 +549,7 @@
 }
 
 int dirty_bytes_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
 {
 unsigned long old_bytes = vm_dirty_bytes;
 int ret;
@@ -743,9 +741,6 @@
 * __wb_calc_thresh - @wb's share of dirty throttling threshold
 * @dtc: dirty_throttle_context of interest
 *
- * Returns @wb's dirty limit in pages. The term "dirty" in the context of
- * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
- *
 * Note that balance_dirty_pages() will only seriously take it as a hard limit
 * when sleeping max_pause per page is not enough to keep the dirty pages under
 * control. For example, when the device is completely stalled due to some error
@@ -759,13 +754,16 @@
 *
 * The wb's share of dirty limit will be adapting to its throughput and
 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
+ *
+ * Return: @wb's dirty limit in pages. The term "dirty" in the context of
+ * dirty balancing includes all PG_dirty and PG_writeback pages.
 */
 static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
 {
 struct wb_domain *dom = dtc_dom(dtc);
 unsigned long thresh = dtc->thresh;
 u64 wb_thresh;
- long numerator, denominator;
+ unsigned long numerator, denominator;
 unsigned long wb_min_ratio, wb_max_ratio;
 
 /*
@@ -776,7 +774,7 @@
 
 wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
 wb_thresh *= numerator;
- do_div(wb_thresh, denominator);
+ wb_thresh = div64_ul(wb_thresh, denominator);
 
 wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);
 
@@ -1101,7 +1099,7 @@
 bw = written - min(written, wb->written_stamp);
 bw *= HZ;
 if (unlikely(elapsed > period)) {
- do_div(bw, elapsed);
+ bw = div64_ul(bw, elapsed);
 avg = bw;
 goto out;
 }
@@ -1566,7 +1564,7 @@
 struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
 &mdtc_stor : NULL;
 struct dirty_throttle_control *sdtc;
- unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */
+ unsigned long nr_reclaimable; /* = file_dirty */
 long period;
 long pause;
 long max_pause;
@@ -1586,14 +1584,7 @@
 unsigned long m_thresh = 0;
 unsigned long m_bg_thresh = 0;
 
- /*
- * Unstable writes are a feature of certain networked
- * filesystems (i.e. NFS) in which data may have been
- * written to the server's write cache, but has not yet
- * been flushed to permanent storage.
- */
- nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) +
- global_node_page_state(NR_UNSTABLE_NFS);
+ nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
 gdtc->avail = global_dirtyable_memory();
 gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
 
@@ -1637,6 +1628,9 @@
 }
 }
 
+ trace_android_vh_mm_dirty_limits(gdtc, strictlimit, dirty, bg_thresh,
+ nr_reclaimable, pages_dirtied);
+
 /*
 * Throttle it only when the background writeback cannot
 * catch-up. This avoids (excessively) small writeouts
@@ -1652,8 +1646,12 @@
 if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
 (!mdtc ||
 m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
- unsigned long intv = dirty_poll_interval(dirty, thresh);
- unsigned long m_intv = ULONG_MAX;
+ unsigned long intv;
+ unsigned long m_intv;
+
+free_running:
+ intv = dirty_poll_interval(dirty, thresh);
+ m_intv = ULONG_MAX;
 
 current->dirty_paused_when = now;
 current->nr_dirtied = 0;
@@ -1666,12 +1664,25 @@
 if (unlikely(!writeback_in_progress(wb)))
 wb_start_background_writeback(wb);
 
+ mem_cgroup_flush_foreign(wb);
+
 /*
 * Calculate global domain's pos_ratio and select the
 * global dtc by default.
 */
- if (!strictlimit)
+ if (!strictlimit) {
 wb_dirty_limits(gdtc);
+
+ if ((current->flags & PF_LOCAL_THROTTLE) &&
+ gdtc->wb_dirty <
+ dirty_freerun_ceiling(gdtc->wb_thresh,
+ gdtc->wb_bg_thresh))
+ /*
+ * LOCAL_THROTTLE tasks must not be throttled
+ * when below the per-wb freerun ceiling.
+ */
+ goto free_running;
+ }
 
 dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
 ((gdtc->dirty > gdtc->thresh) || strictlimit);
@@ -1686,9 +1697,20 @@
 * both global and memcg domains. Choose the one
 * w/ lower pos_ratio.
 */
- if (!strictlimit)
+ if (!strictlimit) {
 wb_dirty_limits(mdtc);
 
+ if ((current->flags & PF_LOCAL_THROTTLE) &&
+ mdtc->wb_dirty <
+ dirty_freerun_ceiling(mdtc->wb_thresh,
+ mdtc->wb_bg_thresh))
+ /*
+ * LOCAL_THROTTLE tasks must not be
+ * throttled when below the per-wb
+ * freerun ceiling.
+ */
+ goto free_running;
+ }
 dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
 ((mdtc->dirty > mdtc->thresh) || strictlimit);
 
@@ -1866,7 +1888,7 @@
 int ratelimit;
 int *p;
 
- if (!bdi_cap_account_dirty(bdi))
+ if (!(bdi->capabilities & BDI_CAP_WRITEBACK))
 return;
 
 if (inode_cgwb_enabled(inode))
@@ -1918,7 +1940,9 @@
 * @wb: bdi_writeback of interest
 *
 * Determines whether background writeback should keep writing @wb or it's
- * clean enough. Returns %true if writeback should continue.
+ * clean enough.
+ *
+ * Return: %true if writeback should continue.
 */
 bool wb_over_bg_thresh(struct bdi_writeback *wb)
 {
@@ -1927,21 +1951,27 @@
 struct dirty_throttle_control * const gdtc = &gdtc_stor;
 struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
 &mdtc_stor : NULL;
+ unsigned long reclaimable;
+ unsigned long thresh;
 
 /*
 * Similar to balance_dirty_pages() but ignores pages being written
 * as we're trying to decide whether to put more under writeback.
 */
 gdtc->avail = global_dirtyable_memory();
- gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) +
- global_node_page_state(NR_UNSTABLE_NFS);
+ gdtc->dirty = global_node_page_state(NR_FILE_DIRTY);
 domain_dirty_limits(gdtc);
 
 if (gdtc->dirty > gdtc->bg_thresh)
 return true;
 
- if (wb_stat(wb, WB_RECLAIMABLE) >
- wb_calc_thresh(gdtc->wb, gdtc->bg_thresh))
+ thresh = wb_calc_thresh(gdtc->wb, gdtc->bg_thresh);
+ if (thresh < 2 * wb_stat_error())
+ reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
+ else
+ reclaimable = wb_stat(wb, WB_RECLAIMABLE);
+
+ if (reclaimable > thresh)
 return true;
 
 if (mdtc) {
@@ -1955,8 +1985,13 @@
 if (mdtc->dirty > mdtc->bg_thresh)
 return true;
 
- if (wb_stat(wb, WB_RECLAIMABLE) >
- wb_calc_thresh(mdtc->wb, mdtc->bg_thresh))
+ thresh = wb_calc_thresh(mdtc->wb, mdtc->bg_thresh);
+ if (thresh < 2 * wb_stat_error())
+ reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
+ else
+ reclaimable = wb_stat(wb, WB_RECLAIMABLE);
+
+ if (reclaimable > thresh)
 return true;
 }
 
@@ -1967,7 +2002,7 @@
 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
 */
 int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
 {
 unsigned int old_interval = dirty_writeback_interval;
 int ret;
@@ -2059,13 +2094,11 @@
 * Called early on to tune the page writeback dirty limits.
 *
 * We used to scale dirty pages according to how total memory
- * related to pages that could be allocated for buffers (by
- * comparing nr_free_buffer_pages() to vm_total_pages.
+ * related to pages that could be allocated for buffers.
 *
 * However, that was when we used "dirty_ratio" to scale with
 * all memory, and we don't do that any more. "dirty_ratio"
- * is now applied to total non-HIGHPAGE memory (by subtracting
- * totalhigh_pages from vm_total_pages), and as such we can't
+ * is now applied to total non-HIGHPAGE memory, and as such we can't
 * get into the old insane situation any more where we had
 * large amounts of dirty pages compared to a small amount of
 * non-HIGHMEM memory.
@@ -2097,34 +2130,25 @@
 * dirty pages in the file (thus it is important for this function to be quick
 * so that it can tag pages faster than a dirtying process can create them).
 */
-/*
- * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce the i_pages lock
- * latency.
- */
 void tag_pages_for_writeback(struct address_space *mapping,
 pgoff_t start, pgoff_t end)
 {
-#define WRITEBACK_TAG_BATCH 4096
- unsigned long tagged = 0;
- struct radix_tree_iter iter;
- void **slot;
+ XA_STATE(xas, &mapping->i_pages, start);
+ unsigned int tagged = 0;
+ void *page;
 
- xa_lock_irq(&mapping->i_pages);
- radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start,
- PAGECACHE_TAG_DIRTY) {
- if (iter.index > end)
- break;
- radix_tree_iter_tag_set(&mapping->i_pages, &iter,
- PAGECACHE_TAG_TOWRITE);
- tagged++;
- if ((tagged % WRITEBACK_TAG_BATCH) != 0)
+ xas_lock_irq(&xas);
+ xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) {
+ xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE);
+ if (++tagged % XA_CHECK_SCHED)
 continue;
- slot = radix_tree_iter_resume(slot, &iter);
- xa_unlock_irq(&mapping->i_pages);
+
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
 cond_resched();
- xa_lock_irq(&mapping->i_pages);
+ xas_lock_irq(&xas);
 }
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
 }
 EXPORT_SYMBOL(tag_pages_for_writeback);
 
@@ -2156,6 +2180,8 @@
 * lock/page writeback access order inversion - we should only ever lock
 * multiple pages in ascending page->index order, and looping back to the start
 * of the file violates that rule and causes deadlocks.
+ *
+ * Return: %0 on success, negative error code otherwise
 */
 int write_cache_pages(struct address_space *mapping,
 struct writeback_control *wbc, writepage_t writepage,
@@ -2166,17 +2192,15 @@
 int error;
 struct pagevec pvec;
 int nr_pages;
- pgoff_t uninitialized_var(writeback_index);
 pgoff_t index;
 pgoff_t end; /* Inclusive */
 pgoff_t done_index;
 int range_whole = 0;
- int tag;
+ xa_mark_t tag;
 
 pagevec_init(&pvec);
 if (wbc->range_cyclic) {
- writeback_index = mapping->writeback_index; /* prev offset */
- index = writeback_index;
+ index = mapping->writeback_index; /* prev offset */
 end = -1;
 } else {
 index = wbc->range_start >> PAGE_SHIFT;
@@ -2184,12 +2208,12 @@
 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 range_whole = 1;
 }
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag = PAGECACHE_TAG_TOWRITE;
- else
- tag = PAGECACHE_TAG_DIRTY;
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
 tag_pages_for_writeback(mapping, index, end);
+ tag = PAGECACHE_TAG_TOWRITE;
+ } else {
+ tag = PAGECACHE_TAG_DIRTY;
+ }
 done_index = index;
 while (!done && (index <= end)) {
 int i;
@@ -2314,6 +2338,8 @@
 *
 * This is a library function, which implements the writepages()
 * address_space_operation.
+ *
+ * Return: %0 on success, negative error code otherwise
 */
 int generic_writepages(struct address_space *mapping,
 struct writeback_control *wbc)
@@ -2360,6 +2386,8 @@
 *
 * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
 * function returns.
+ *
+ * Return: %0 on success, negative error code otherwise
 */
 int write_one_page(struct page *page)
 {
@@ -2413,7 +2441,7 @@
 
 trace_writeback_dirty_page(page, mapping);
 
- if (mapping_cap_account_dirty(mapping)) {
+ if (mapping_can_writeback(mapping)) {
 struct bdi_writeback *wb;
 
 inode_attach_wb(inode, page);
@@ -2427,9 +2455,10 @@
 task_io_account_write(PAGE_SIZE);
 current->nr_dirtied++;
 this_cpu_inc(bdp_ratelimits);
+
+ mem_cgroup_track_foreign_dirty(page, wb);
 }
 }
-EXPORT_SYMBOL(account_page_dirtied);
 
 /*
 * Helper function for deaccounting dirty page without writeback.
@@ -2439,7 +2468,7 @@
 void account_page_cleaned(struct page *page, struct address_space *mapping,
 struct bdi_writeback *wb)
 {
- if (mapping_cap_account_dirty(mapping)) {
+ if (mapping_can_writeback(mapping)) {
 dec_lruvec_page_state(page, NR_FILE_DIRTY);
 dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
 dec_wb_stat(wb, WB_RECLAIMABLE);
@@ -2449,7 +2478,7 @@
 
 /*
 * For address_spaces which do not use buffers. Just tag the page as dirty in
- * its radix tree.
+ * the xarray.
 *
 * This is also used when a single buffer is being dirtied: we want to set the
 * page dirty in that case, but not all the buffers. This is a "bottom-up"
@@ -2475,7 +2504,7 @@
 BUG_ON(page_mapping(page) != mapping);
 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
 account_page_dirtied(page, mapping);
- radix_tree_tag_set(&mapping->i_pages, page_index(page),
+ __xa_set_mark(&mapping->i_pages, page_index(page),
 PAGECACHE_TAG_DIRTY);
 xa_unlock_irqrestore(&mapping->i_pages, flags);
 unlock_page_memcg(page);
@@ -2502,7 +2531,7 @@
 {
 struct address_space *mapping = page->mapping;
 
- if (mapping && mapping_cap_account_dirty(mapping)) {
+ if (mapping && mapping_can_writeback(mapping)) {
 struct inode *inode = mapping->host;
 struct bdi_writeback *wb;
 struct wb_lock_cookie cookie = {};
@@ -2614,7 +2643,7 @@
 {
 struct address_space *mapping = page_mapping(page);
 
- if (mapping_cap_account_dirty(mapping)) {
+ if (mapping_can_writeback(mapping)) {
 struct inode *inode = mapping->host;
 struct bdi_writeback *wb;
 struct wb_lock_cookie cookie = {};
@@ -2638,13 +2667,13 @@
 * Returns true if the page was previously dirty.
 *
 * This is for preparing to put the page under writeout. We leave the page
- * tagged as dirty in the radix tree so that a concurrent write-for-sync
+ * tagged as dirty in the xarray so that a concurrent write-for-sync
 * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage
 * implementation will run either set_page_writeback() or set_page_dirty(),
- * at which stage we bring the page's dirty flag and radix-tree dirty tag
+ * at which stage we bring the page's dirty flag and xarray dirty tag
 * back into sync.
 *
- * This incoherency between the page's dirty flag and radix-tree tag is
+ * This incoherency between the page's dirty flag and xarray tag is
 * unfortunate, but it only exists while the page is locked.
 */
 int clear_page_dirty_for_io(struct page *page)
@@ -2652,9 +2681,9 @@
 struct address_space *mapping = page_mapping(page);
 int ret = 0;
 
- BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
 
- if (mapping && mapping_cap_account_dirty(mapping)) {
+ if (mapping && mapping_can_writeback(mapping)) {
 struct inode *inode = mapping->host;
 struct bdi_writeback *wb;
 struct wb_lock_cookie cookie = {};
@@ -2725,9 +2754,9 @@
 xa_lock_irqsave(&mapping->i_pages, flags);
 ret = TestClearPageWriteback(page);
 if (ret) {
- radix_tree_tag_clear(&mapping->i_pages, page_index(page),
+ __xa_clear_mark(&mapping->i_pages, page_index(page),
 PAGECACHE_TAG_WRITEBACK);
- if (bdi_cap_account_writeback(bdi)) {
+ if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
 struct bdi_writeback *wb = inode_to_wb(inode);
 
 dec_wb_stat(wb, WB_WRITEBACK);
@@ -2743,12 +2772,6 @@
 } else {
 ret = TestClearPageWriteback(page);
 }
- /*
- * NOTE: Page might be free now! Writeback doesn't hold a page
- * reference on its own, it relies on truncation to wait for
- * the clearing of PG_writeback. The below can only access
- * page state that is static across allocation cycles.
- */
 if (ret) {
 dec_lruvec_state(lruvec, NR_WRITEBACK);
 dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
@@ -2761,15 +2784,17 @@
 int __test_set_page_writeback(struct page *page, bool keep_write)
 {
 struct address_space *mapping = page_mapping(page);
- int ret;
+ int ret, access_ret;
 
 lock_page_memcg(page);
 if (mapping && mapping_use_writeback_tags(mapping)) {
+ XA_STATE(xas, &mapping->i_pages, page_index(page));
 struct inode *inode = mapping->host;
 struct backing_dev_info *bdi = inode_to_bdi(inode);
 unsigned long flags;
 
- xa_lock_irqsave(&mapping->i_pages, flags);
+ xas_lock_irqsave(&xas, flags);
+ xas_load(&xas);
 ret = TestSetPageWriteback(page);
 if (!ret) {
 bool on_wblist;
@@ -2777,9 +2802,8 @@
 on_wblist = mapping_tagged(mapping,
 PAGECACHE_TAG_WRITEBACK);
 
- radix_tree_tag_set(&mapping->i_pages, page_index(page),
- PAGECACHE_TAG_WRITEBACK);
- if (bdi_cap_account_writeback(bdi))
+ xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
+ if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT)
 inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
 
 /*
@@ -2791,12 +2815,10 @@
 sb_mark_inode_writeback(mapping->host);
 }
 if (!PageDirty(page))
- radix_tree_tag_clear(&mapping->i_pages, page_index(page),
- PAGECACHE_TAG_DIRTY);
+ xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
 if (!keep_write)
- radix_tree_tag_clear(&mapping->i_pages, page_index(page),
- PAGECACHE_TAG_TOWRITE);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
+ xas_unlock_irqrestore(&xas, flags);
 } else {
 ret = TestSetPageWriteback(page);
 }
@@ -2805,20 +2827,29 @@
 inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
 }
 unlock_page_memcg(page);
+ access_ret = arch_make_page_accessible(page);
+ /*
+ * If writeback has been triggered on a page that cannot be made
+ * accessible, it is too late to recover here.
+ */
+ VM_BUG_ON_PAGE(access_ret != 0, page);
+
 return ret;
 
 }
 EXPORT_SYMBOL(__test_set_page_writeback);
 
 /*
- * Return true if any of the pages in the mapping are marked with the
- * passed tag.
+ * Wait for a page to complete writeback
 */
-int mapping_tagged(struct address_space *mapping, int tag)
+void wait_on_page_writeback(struct page *page)
 {
- return radix_tree_tagged(&mapping->i_pages, tag);
+ while (PageWriteback(page)) {
+ trace_wait_on_page_writeback(page, page_mapping(page));
+ wait_on_page_bit(page, PG_writeback);
+ }
 }
-EXPORT_SYMBOL(mapping_tagged);
+EXPORT_SYMBOL_GPL(wait_on_page_writeback);
 
 /**
 * wait_for_stable_page() - wait for writeback to finish, if necessary.
@@ -2830,7 +2861,8 @@
 */
 void wait_for_stable_page(struct page *page)
 {
- if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host)))
+ page = thp_head(page);
+ if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES)
 wait_on_page_writeback(page);
 }
 EXPORT_SYMBOL_GPL(wait_for_stable_page);