2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/mm/page-writeback.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
 * mm/page-writeback.c
 *
@@ -40,6 +41,9 @@
 #include <trace/events/writeback.h>
 
 #include "internal.h"
+
+#undef CREATE_TRACE_POINTS
+#include <trace/hooks/mm.h>
 
 /*
 * Sleep at most 200ms at a time in balance_dirty_pages().
@@ -256,7 +260,7 @@
 * requiring writeback.
 *
 * This number of dirtyable pages is the base value of which the
- * user-configurable dirty ratio is the effictive number of pages that
+ * user-configurable dirty ratio is the effective number of pages that
 * are allowed to be actually dirtied. Per individual zone, or
 * globally by using the sum of dirtyable pages over all zones.
 *
@@ -270,7 +274,7 @@
 * node_dirtyable_memory - number of dirtyable pages in a node
 * @pgdat: the node
 *
- * Returns the node's number of pages potentially available for dirty
+ * Return: the node's number of pages potentially available for dirty
 * page cache. This is the base value for the per-node dirty limits.
 */
 static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
@@ -355,7 +359,7 @@
 /**
 * global_dirtyable_memory - number of globally dirtyable pages
 *
- * Returns the global number of pages potentially available for dirty
+ * Return: the global number of pages potentially available for dirty
 * page cache. This is the base value for the global dirty limits.
 */
 static unsigned long global_dirtyable_memory(void)
@@ -386,8 +390,7 @@
 * Calculate @dtc->thresh and ->bg_thresh considering
 * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller
 * must ensure that @dtc->avail is set before calling this function. The
- * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
- * real-time tasks.
+ * dirty limits will be lifted by 1/4 for real-time tasks.
 */
 static void domain_dirty_limits(struct dirty_throttle_control *dtc)
 {
@@ -435,7 +438,7 @@
 if (bg_thresh >= thresh)
 bg_thresh = thresh / 2;
 tsk = current;
- if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
+ if (rt_task(tsk)) {
 bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
 thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
 }
@@ -470,7 +473,7 @@
 * node_dirty_limit - maximum number of dirty pages allowed in a node
 * @pgdat: the node
 *
- * Returns the maximum number of dirty pages allowed in a node, based
+ * Return: the maximum number of dirty pages allowed in a node, based
 * on the node's dirtyable memory.
 */
 static unsigned long node_dirty_limit(struct pglist_data *pgdat)
@@ -485,7 +488,7 @@
 else
 dirty = vm_dirty_ratio * node_memory / 100;
 
- if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
+ if (rt_task(tsk))
 dirty += dirty / 4;
 
 return dirty;
@@ -495,7 +498,7 @@
 * node_dirty_ok - tells whether a node is within its dirty limits
 * @pgdat: the node to check
 *
- * Returns %true when the dirty pages in @pgdat are within the node's
+ * Return: %true when the dirty pages in @pgdat are within the node's
 * dirty limit, %false if the limit is exceeded.
 */
 bool node_dirty_ok(struct pglist_data *pgdat)
@@ -504,15 +507,13 @@
 unsigned long nr_pages = 0;
 
 nr_pages += node_page_state(pgdat, NR_FILE_DIRTY);
- nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS);
 nr_pages += node_page_state(pgdat, NR_WRITEBACK);
 
 return nr_pages <= limit;
 }
 
 int dirty_background_ratio_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
 {
 int ret;
 
@@ -523,8 +524,7 @@
 }
 
 int dirty_background_bytes_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
 {
 int ret;
 
@@ -534,9 +534,8 @@
 return ret;
 }
 
-int dirty_ratio_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
 {
 int old_ratio = vm_dirty_ratio;
 int ret;
@@ -550,8 +549,7 @@
 }
 
 int dirty_bytes_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
 {
 unsigned long old_bytes = vm_dirty_bytes;
 int ret;
@@ -743,9 +741,6 @@
 * __wb_calc_thresh - @wb's share of dirty throttling threshold
 * @dtc: dirty_throttle_context of interest
 *
- * Returns @wb's dirty limit in pages. The term "dirty" in the context of
- * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
- *
 * Note that balance_dirty_pages() will only seriously take it as a hard limit
 * when sleeping max_pause per page is not enough to keep the dirty pages under
 * control. For example, when the device is completely stalled due to some error
@@ -759,13 +754,16 @@
 *
 * The wb's share of dirty limit will be adapting to its throughput and
 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
+ *
+ * Return: @wb's dirty limit in pages. The term "dirty" in the context of
+ * dirty balancing includes all PG_dirty and PG_writeback pages.
 */
 static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
 {
 struct wb_domain *dom = dtc_dom(dtc);
 unsigned long thresh = dtc->thresh;
 u64 wb_thresh;
- long numerator, denominator;
+ unsigned long numerator, denominator;
 unsigned long wb_min_ratio, wb_max_ratio;
 
 /*
@@ -776,7 +774,7 @@
 
 wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
 wb_thresh *= numerator;
- do_div(wb_thresh, denominator);
+ wb_thresh = div64_ul(wb_thresh, denominator);
 
 wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);
 
@@ -1101,7 +1099,7 @@
 bw = written - min(written, wb->written_stamp);
 bw *= HZ;
 if (unlikely(elapsed > period)) {
- do_div(bw, elapsed);
+ bw = div64_ul(bw, elapsed);
 avg = bw;
 goto out;
 }
@@ -1566,7 +1564,7 @@
 struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
 &mdtc_stor : NULL;
 struct dirty_throttle_control *sdtc;
- unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */
+ unsigned long nr_reclaimable; /* = file_dirty */
 long period;
 long pause;
 long max_pause;
@@ -1586,14 +1584,7 @@
 unsigned long m_thresh = 0;
 unsigned long m_bg_thresh = 0;
 
- /*
- * Unstable writes are a feature of certain networked
- * filesystems (i.e. NFS) in which data may have been
- * written to the server's write cache, but has not yet
- * been flushed to permanent storage.
- */
- nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) +
- global_node_page_state(NR_UNSTABLE_NFS);
+ nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
 gdtc->avail = global_dirtyable_memory();
 gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
 
@@ -1637,6 +1628,9 @@
 }
 }
 
+ trace_android_vh_mm_dirty_limits(gdtc, strictlimit, dirty, bg_thresh,
+ nr_reclaimable, pages_dirtied);
+
 /*
 * Throttle it only when the background writeback cannot
 * catch-up. This avoids (excessively) small writeouts
@@ -1652,8 +1646,12 @@
 if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
 (!mdtc ||
 m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
- unsigned long intv = dirty_poll_interval(dirty, thresh);
- unsigned long m_intv = ULONG_MAX;
+ unsigned long intv;
+ unsigned long m_intv;
+
+free_running:
+ intv = dirty_poll_interval(dirty, thresh);
+ m_intv = ULONG_MAX;
 
 current->dirty_paused_when = now;
 current->nr_dirtied = 0;
@@ -1666,12 +1664,25 @@
 if (unlikely(!writeback_in_progress(wb)))
 wb_start_background_writeback(wb);
 
+ mem_cgroup_flush_foreign(wb);
+
 /*
 * Calculate global domain's pos_ratio and select the
 * global dtc by default.
 */
- if (!strictlimit)
+ if (!strictlimit) {
 wb_dirty_limits(gdtc);
+
+ if ((current->flags & PF_LOCAL_THROTTLE) &&
+ gdtc->wb_dirty <
+ dirty_freerun_ceiling(gdtc->wb_thresh,
+ gdtc->wb_bg_thresh))
+ /*
+ * LOCAL_THROTTLE tasks must not be throttled
+ * when below the per-wb freerun ceiling.
+ */
+ goto free_running;
+ }
 
 dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
 ((gdtc->dirty > gdtc->thresh) || strictlimit);
@@ -1686,9 +1697,20 @@
 * both global and memcg domains. Choose the one
 * w/ lower pos_ratio.
 */
- if (!strictlimit)
+ if (!strictlimit) {
 wb_dirty_limits(mdtc);
 
+ if ((current->flags & PF_LOCAL_THROTTLE) &&
+ mdtc->wb_dirty <
+ dirty_freerun_ceiling(mdtc->wb_thresh,
+ mdtc->wb_bg_thresh))
+ /*
+ * LOCAL_THROTTLE tasks must not be
+ * throttled when below the per-wb
+ * freerun ceiling.
+ */
+ goto free_running;
+ }
 dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
 ((mdtc->dirty > mdtc->thresh) || strictlimit);
 
@@ -1866,7 +1888,7 @@
 int ratelimit;
 int *p;
 
- if (!bdi_cap_account_dirty(bdi))
+ if (!(bdi->capabilities & BDI_CAP_WRITEBACK))
 return;
 
 if (inode_cgwb_enabled(inode))
@@ -1918,7 +1940,9 @@
 * @wb: bdi_writeback of interest
 *
 * Determines whether background writeback should keep writing @wb or it's
- * clean enough. Returns %true if writeback should continue.
+ * clean enough.
+ *
+ * Return: %true if writeback should continue.
 */
 bool wb_over_bg_thresh(struct bdi_writeback *wb)
 {
@@ -1927,21 +1951,27 @@
 struct dirty_throttle_control * const gdtc = &gdtc_stor;
 struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
 &mdtc_stor : NULL;
+ unsigned long reclaimable;
+ unsigned long thresh;
 
 /*
 * Similar to balance_dirty_pages() but ignores pages being written
 * as we're trying to decide whether to put more under writeback.
 */
 gdtc->avail = global_dirtyable_memory();
- gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) +
- global_node_page_state(NR_UNSTABLE_NFS);
+ gdtc->dirty = global_node_page_state(NR_FILE_DIRTY);
 domain_dirty_limits(gdtc);
 
 if (gdtc->dirty > gdtc->bg_thresh)
 return true;
 
- if (wb_stat(wb, WB_RECLAIMABLE) >
- wb_calc_thresh(gdtc->wb, gdtc->bg_thresh))
+ thresh = wb_calc_thresh(gdtc->wb, gdtc->bg_thresh);
+ if (thresh < 2 * wb_stat_error())
+ reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
+ else
+ reclaimable = wb_stat(wb, WB_RECLAIMABLE);
+
+ if (reclaimable > thresh)
 return true;
 
 if (mdtc) {
@@ -1955,8 +1985,13 @@
 if (mdtc->dirty > mdtc->bg_thresh)
 return true;
 
- if (wb_stat(wb, WB_RECLAIMABLE) >
- wb_calc_thresh(mdtc->wb, mdtc->bg_thresh))
+ thresh = wb_calc_thresh(mdtc->wb, mdtc->bg_thresh);
+ if (thresh < 2 * wb_stat_error())
+ reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
+ else
+ reclaimable = wb_stat(wb, WB_RECLAIMABLE);
+
+ if (reclaimable > thresh)
 return true;
 }
 
@@ -1967,7 +2002,7 @@
 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
 */
 int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
 {
 unsigned int old_interval = dirty_writeback_interval;
 int ret;
@@ -2059,13 +2094,11 @@
 * Called early on to tune the page writeback dirty limits.
 *
 * We used to scale dirty pages according to how total memory
- * related to pages that could be allocated for buffers (by
- * comparing nr_free_buffer_pages() to vm_total_pages.
+ * related to pages that could be allocated for buffers.
 *
 * However, that was when we used "dirty_ratio" to scale with
 * all memory, and we don't do that any more. "dirty_ratio"
- * is now applied to total non-HIGHPAGE memory (by subtracting
- * totalhigh_pages from vm_total_pages), and as such we can't
+ * is now applied to total non-HIGHPAGE memory, and as such we can't
 * get into the old insane situation any more where we had
 * large amounts of dirty pages compared to a small amount of
 * non-HIGHMEM memory.
@@ -2097,34 +2130,25 @@
 * dirty pages in the file (thus it is important for this function to be quick
 * so that it can tag pages faster than a dirtying process can create them).
 */
-/*
- * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce the i_pages lock
- * latency.
- */
 void tag_pages_for_writeback(struct address_space *mapping,
 pgoff_t start, pgoff_t end)
 {
-#define WRITEBACK_TAG_BATCH 4096
- unsigned long tagged = 0;
- struct radix_tree_iter iter;
- void **slot;
+ XA_STATE(xas, &mapping->i_pages, start);
+ unsigned int tagged = 0;
+ void *page;
 
- xa_lock_irq(&mapping->i_pages);
- radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start,
- PAGECACHE_TAG_DIRTY) {
- if (iter.index > end)
- break;
- radix_tree_iter_tag_set(&mapping->i_pages, &iter,
- PAGECACHE_TAG_TOWRITE);
- tagged++;
- if ((tagged % WRITEBACK_TAG_BATCH) != 0)
+ xas_lock_irq(&xas);
+ xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) {
+ xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE);
+ if (++tagged % XA_CHECK_SCHED)
 continue;
- slot = radix_tree_iter_resume(slot, &iter);
- xa_unlock_irq(&mapping->i_pages);
+
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
 cond_resched();
- xa_lock_irq(&mapping->i_pages);
+ xas_lock_irq(&xas);
 }
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
 }
 EXPORT_SYMBOL(tag_pages_for_writeback);
 
@@ -2156,6 +2180,8 @@
 * lock/page writeback access order inversion - we should only ever lock
 * multiple pages in ascending page->index order, and looping back to the start
 * of the file violates that rule and causes deadlocks.
+ *
+ * Return: %0 on success, negative error code otherwise
 */
 int write_cache_pages(struct address_space *mapping,
 struct writeback_control *wbc, writepage_t writepage,
@@ -2166,17 +2192,15 @@
 int error;
 struct pagevec pvec;
 int nr_pages;
- pgoff_t uninitialized_var(writeback_index);
 pgoff_t index;
 pgoff_t end; /* Inclusive */
 pgoff_t done_index;
 int range_whole = 0;
- int tag;
+ xa_mark_t tag;
 
 pagevec_init(&pvec);
 if (wbc->range_cyclic) {
- writeback_index = mapping->writeback_index; /* prev offset */
- index = writeback_index;
+ index = mapping->writeback_index; /* prev offset */
 end = -1;
 } else {
 index = wbc->range_start >> PAGE_SHIFT;
@@ -2184,12 +2208,12 @@
 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 range_whole = 1;
 }
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag = PAGECACHE_TAG_TOWRITE;
- else
- tag = PAGECACHE_TAG_DIRTY;
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
 tag_pages_for_writeback(mapping, index, end);
+ tag = PAGECACHE_TAG_TOWRITE;
+ } else {
+ tag = PAGECACHE_TAG_DIRTY;
+ }
 done_index = index;
 while (!done && (index <= end)) {
 int i;
@@ -2314,6 +2338,8 @@
 *
 * This is a library function, which implements the writepages()
 * address_space_operation.
+ *
+ * Return: %0 on success, negative error code otherwise
 */
 int generic_writepages(struct address_space *mapping,
 struct writeback_control *wbc)
@@ -2360,6 +2386,8 @@
 *
 * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
 * function returns.
+ *
+ * Return: %0 on success, negative error code otherwise
 */
 int write_one_page(struct page *page)
 {
@@ -2413,7 +2441,7 @@
 
 trace_writeback_dirty_page(page, mapping);
 
- if (mapping_cap_account_dirty(mapping)) {
+ if (mapping_can_writeback(mapping)) {
 struct bdi_writeback *wb;
 
 inode_attach_wb(inode, page);
@@ -2427,9 +2455,10 @@
 task_io_account_write(PAGE_SIZE);
 current->nr_dirtied++;
 this_cpu_inc(bdp_ratelimits);
+
+ mem_cgroup_track_foreign_dirty(page, wb);
 }
 }
-EXPORT_SYMBOL(account_page_dirtied);
 
 /*
 * Helper function for deaccounting dirty page without writeback.
@@ -2439,7 +2468,7 @@
 void account_page_cleaned(struct page *page, struct address_space *mapping,
 struct bdi_writeback *wb)
 {
- if (mapping_cap_account_dirty(mapping)) {
+ if (mapping_can_writeback(mapping)) {
 dec_lruvec_page_state(page, NR_FILE_DIRTY);
 dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
 dec_wb_stat(wb, WB_RECLAIMABLE);
@@ -2449,7 +2478,7 @@
 
 /*
 * For address_spaces which do not use buffers. Just tag the page as dirty in
- * its radix tree.
+ * the xarray.
 *
 * This is also used when a single buffer is being dirtied: we want to set the
 * page dirty in that case, but not all the buffers. This is a "bottom-up"
@@ -2475,7 +2504,7 @@
 BUG_ON(page_mapping(page) != mapping);
 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
 account_page_dirtied(page, mapping);
- radix_tree_tag_set(&mapping->i_pages, page_index(page),
+ __xa_set_mark(&mapping->i_pages, page_index(page),
 PAGECACHE_TAG_DIRTY);
 xa_unlock_irqrestore(&mapping->i_pages, flags);
 unlock_page_memcg(page);
@@ -2502,7 +2531,7 @@
 {
 struct address_space *mapping = page->mapping;
 
- if (mapping && mapping_cap_account_dirty(mapping)) {
+ if (mapping && mapping_can_writeback(mapping)) {
 struct inode *inode = mapping->host;
 struct bdi_writeback *wb;
 struct wb_lock_cookie cookie = {};
@@ -2614,7 +2643,7 @@
 {
 struct address_space *mapping = page_mapping(page);
 
- if (mapping_cap_account_dirty(mapping)) {
+ if (mapping_can_writeback(mapping)) {
 struct inode *inode = mapping->host;
 struct bdi_writeback *wb;
 struct wb_lock_cookie cookie = {};
@@ -2638,13 +2667,13 @@
 * Returns true if the page was previously dirty.
 *
 * This is for preparing to put the page under writeout. We leave the page
- * tagged as dirty in the radix tree so that a concurrent write-for-sync
+ * tagged as dirty in the xarray so that a concurrent write-for-sync
 * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage
 * implementation will run either set_page_writeback() or set_page_dirty(),
- * at which stage we bring the page's dirty flag and radix-tree dirty tag
+ * at which stage we bring the page's dirty flag and xarray dirty tag
 * back into sync.
 *
- * This incoherency between the page's dirty flag and radix-tree tag is
+ * This incoherency between the page's dirty flag and xarray tag is
 * unfortunate, but it only exists while the page is locked.
 */
 int clear_page_dirty_for_io(struct page *page)
@@ -2652,9 +2681,9 @@
 struct address_space *mapping = page_mapping(page);
 int ret = 0;
 
- BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
 
- if (mapping && mapping_cap_account_dirty(mapping)) {
+ if (mapping && mapping_can_writeback(mapping)) {
 struct inode *inode = mapping->host;
 struct bdi_writeback *wb;
 struct wb_lock_cookie cookie = {};
@@ -2725,9 +2754,9 @@
 xa_lock_irqsave(&mapping->i_pages, flags);
 ret = TestClearPageWriteback(page);
 if (ret) {
- radix_tree_tag_clear(&mapping->i_pages, page_index(page),
+ __xa_clear_mark(&mapping->i_pages, page_index(page),
 PAGECACHE_TAG_WRITEBACK);
- if (bdi_cap_account_writeback(bdi)) {
+ if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
 struct bdi_writeback *wb = inode_to_wb(inode);
 
 dec_wb_stat(wb, WB_WRITEBACK);
@@ -2743,12 +2772,6 @@
 } else {
 ret = TestClearPageWriteback(page);
 }
- /*
- * NOTE: Page might be free now! Writeback doesn't hold a page
- * reference on its own, it relies on truncation to wait for
- * the clearing of PG_writeback. The below can only access
- * page state that is static across allocation cycles.
- */
 if (ret) {
 dec_lruvec_state(lruvec, NR_WRITEBACK);
 dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
@@ -2761,15 +2784,17 @@
 int __test_set_page_writeback(struct page *page, bool keep_write)
 {
 struct address_space *mapping = page_mapping(page);
- int ret;
+ int ret, access_ret;
 
 lock_page_memcg(page);
 if (mapping && mapping_use_writeback_tags(mapping)) {
+ XA_STATE(xas, &mapping->i_pages, page_index(page));
 struct inode *inode = mapping->host;
 struct backing_dev_info *bdi = inode_to_bdi(inode);
 unsigned long flags;
 
- xa_lock_irqsave(&mapping->i_pages, flags);
+ xas_lock_irqsave(&xas, flags);
+ xas_load(&xas);
 ret = TestSetPageWriteback(page);
 if (!ret) {
 bool on_wblist;
@@ -2777,9 +2802,8 @@
 on_wblist = mapping_tagged(mapping,
 PAGECACHE_TAG_WRITEBACK);
 
- radix_tree_tag_set(&mapping->i_pages, page_index(page),
- PAGECACHE_TAG_WRITEBACK);
- if (bdi_cap_account_writeback(bdi))
+ xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
+ if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT)
 inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
 
 /*
@@ -2791,12 +2815,10 @@
 sb_mark_inode_writeback(mapping->host);
 }
 if (!PageDirty(page))
- radix_tree_tag_clear(&mapping->i_pages, page_index(page),
- PAGECACHE_TAG_DIRTY);
+ xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
 if (!keep_write)
- radix_tree_tag_clear(&mapping->i_pages, page_index(page),
- PAGECACHE_TAG_TOWRITE);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
+ xas_unlock_irqrestore(&xas, flags);
 } else {
 ret = TestSetPageWriteback(page);
 }
@@ -2805,20 +2827,29 @@
 inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
 }
 unlock_page_memcg(page);
+ access_ret = arch_make_page_accessible(page);
+ /*
+ * If writeback has been triggered on a page that cannot be made
+ * accessible, it is too late to recover here.
+ */
+ VM_BUG_ON_PAGE(access_ret != 0, page);
+
 return ret;
 
 }
 EXPORT_SYMBOL(__test_set_page_writeback);
 
 /*
- * Return true if any of the pages in the mapping are marked with the
- * passed tag.
+ * Wait for a page to complete writeback
 */
-int mapping_tagged(struct address_space *mapping, int tag)
+void wait_on_page_writeback(struct page *page)
 {
- return radix_tree_tagged(&mapping->i_pages, tag);
+ while (PageWriteback(page)) {
+ trace_wait_on_page_writeback(page, page_mapping(page));
+ wait_on_page_bit(page, PG_writeback);
+ }
 }
-EXPORT_SYMBOL(mapping_tagged);
+EXPORT_SYMBOL_GPL(wait_on_page_writeback);
 
 /**
 * wait_for_stable_page() - wait for writeback to finish, if necessary.
@@ -2830,7 +2861,8 @@
 */
 void wait_for_stable_page(struct page *page)
 {
- if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host)))
+ page = thp_head(page);
+ if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES)
 wait_on_page_writeback(page);
 }
 EXPORT_SYMBOL_GPL(wait_for_stable_page);