.. | ..
| 1 | +// SPDX-License-Identifier: GPL-2.0-only
1 | 2 | /*
2 | 3 |  * mm/page-writeback.c
3 | 4 |  *
.. | ..
40 | 41 | #include <trace/events/writeback.h>
41 | 42 |
42 | 43 | #include "internal.h"
| 44 | +
| 45 | +#undef CREATE_TRACE_POINTS
| 46 | +#include <trace/hooks/mm.h>
43 | 47 |
44 | 48 | /*
45 | 49 |  * Sleep at most 200ms at a time in balance_dirty_pages().
.. | ..
256 | 260 |  * requiring writeback.
257 | 261 |  *
258 | 262 |  * This number of dirtyable pages is the base value of which the
259 | | - * user-configurable dirty ratio is the effictive number of pages that
| 263 | + * user-configurable dirty ratio is the effective number of pages that
260 | 264 |  * are allowed to be actually dirtied. Per individual zone, or
261 | 265 |  * globally by using the sum of dirtyable pages over all zones.
262 | 266 |  *
.. | ..
270 | 274 |  * node_dirtyable_memory - number of dirtyable pages in a node
271 | 275 |  * @pgdat: the node
272 | 276 |  *
273 | | - * Returns the node's number of pages potentially available for dirty
| 277 | + * Return: the node's number of pages potentially available for dirty
274 | 278 |  * page cache. This is the base value for the per-node dirty limits.
275 | 279 |  */
276 | 280 | static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
.. | ..
355 | 359 | /**
356 | 360 |  * global_dirtyable_memory - number of globally dirtyable pages
357 | 361 |  *
358 | | - * Returns the global number of pages potentially available for dirty
| 362 | + * Return: the global number of pages potentially available for dirty
359 | 363 |  * page cache. This is the base value for the global dirty limits.
360 | 364 |  */
361 | 365 | static unsigned long global_dirtyable_memory(void)
.. | ..
386 | 390 |  * Calculate @dtc->thresh and ->bg_thresh considering
387 | 391 |  * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller
388 | 392 |  * must ensure that @dtc->avail is set before calling this function. The
389 | | - * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
390 | | - * real-time tasks.
| 393 | + * dirty limits will be lifted by 1/4 for real-time tasks.
391 | 394 |  */
392 | 395 | static void domain_dirty_limits(struct dirty_throttle_control *dtc)
393 | 396 | {
.. | ..
435 | 438 |         if (bg_thresh >= thresh)
436 | 439 |                 bg_thresh = thresh / 2;
437 | 440 |         tsk = current;
438 | | -        if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
| 441 | +        if (rt_task(tsk)) {
439 | 442 |                 bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
440 | 443 |                 thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
441 | 444 |         }
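
For a concrete feel for the 1/4 lift applied above, a worked sketch with
made-up numbers (illustrative only, not values from this patch): with a
threshold of 1000 pages and a global dirty_limit of 3200 pages, a real-time
task's threshold becomes 1000 + 1000/4 + 3200/32 = 1350 pages.

        /* Illustrative only: the boost an rt task receives. */
        static unsigned long boosted_thresh(unsigned long thresh,
                                            unsigned long dirty_limit)
        {
                return thresh + thresh / 4 + dirty_limit / 32;
        }

        /* boosted_thresh(1000, 3200) == 1000 + 250 + 100 == 1350 pages */
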
.. | ..
470 | 473 |  * node_dirty_limit - maximum number of dirty pages allowed in a node
471 | 474 |  * @pgdat: the node
472 | 475 |  *
473 | | - * Returns the maximum number of dirty pages allowed in a node, based
| 476 | + * Return: the maximum number of dirty pages allowed in a node, based
474 | 477 |  * on the node's dirtyable memory.
475 | 478 |  */
476 | 479 | static unsigned long node_dirty_limit(struct pglist_data *pgdat)
.. | ..
485 | 488 |         else
486 | 489 |                 dirty = vm_dirty_ratio * node_memory / 100;
487 | 490 |
488 | | -        if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
| 491 | +        if (rt_task(tsk))
489 | 492 |                 dirty += dirty / 4;
490 | 493 |
491 | 494 |         return dirty;
.. | ..
495 | 498 |  * node_dirty_ok - tells whether a node is within its dirty limits
496 | 499 |  * @pgdat: the node to check
497 | 500 |  *
498 | | - * Returns %true when the dirty pages in @pgdat are within the node's
| 501 | + * Return: %true when the dirty pages in @pgdat are within the node's
499 | 502 |  * dirty limit, %false if the limit is exceeded.
500 | 503 |  */
501 | 504 | bool node_dirty_ok(struct pglist_data *pgdat)
.. | ..
504 | 507 |         unsigned long nr_pages = 0;
505 | 508 |
506 | 509 |         nr_pages += node_page_state(pgdat, NR_FILE_DIRTY);
507 | | -        nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS);
508 | 510 |         nr_pages += node_page_state(pgdat, NR_WRITEBACK);
509 | 511 |
510 | 512 |         return nr_pages <= limit;
511 | 513 | }
512 | 514 |
513 | 515 | int dirty_background_ratio_handler(struct ctl_table *table, int write,
514 | | -                void __user *buffer, size_t *lenp,
515 | | -                loff_t *ppos)
| 516 | +                void *buffer, size_t *lenp, loff_t *ppos)
516 | 517 | {
517 | 518 |         int ret;
518 | 519 |
.. | ..
523 | 524 | }
524 | 525 |
525 | 526 | int dirty_background_bytes_handler(struct ctl_table *table, int write,
526 | | -                void __user *buffer, size_t *lenp,
527 | | -                loff_t *ppos)
| 527 | +                void *buffer, size_t *lenp, loff_t *ppos)
528 | 528 | {
529 | 529 |         int ret;
530 | 530 |
.. | ..
534 | 534 |         return ret;
535 | 535 | }
536 | 536 |
537 | | -int dirty_ratio_handler(struct ctl_table *table, int write,
538 | | -                void __user *buffer, size_t *lenp,
539 | | -                loff_t *ppos)
| 537 | +int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer,
| 538 | +                size_t *lenp, loff_t *ppos)
540 | 539 | {
541 | 540 |         int old_ratio = vm_dirty_ratio;
542 | 541 |         int ret;
.. | ..
550 | 549 | }
551 | 550 |
552 | 551 | int dirty_bytes_handler(struct ctl_table *table, int write,
553 | | -                void __user *buffer, size_t *lenp,
554 | | -                loff_t *ppos)
| 552 | +                void *buffer, size_t *lenp, loff_t *ppos)
555 | 553 | {
556 | 554 |         unsigned long old_bytes = vm_dirty_bytes;
557 | 555 |         int ret;
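
The "void __user *buffer" to "void *buffer" change in these handlers follows
the sysctl core now copying data between user space and a kernel buffer
before invoking the handler, so handlers only ever see kernel pointers. A
minimal sketch of a handler in the new style (the variable and handler names
here are illustrative, not from this patch):

        #include <linux/sysctl.h>

        static int example_ratio;

        static int example_ratio_handler(struct ctl_table *table, int write,
                        void *buffer, size_t *lenp, loff_t *ppos)
        {
                int ret;

                /* buffer is a kernel pointer; the proc helper parses it */
                ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
                if (ret == 0 && write)
                        pr_info("example_ratio is now %d\n", example_ratio);
                return ret;
        }
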
.. | ..
743 | 741 |  * __wb_calc_thresh - @wb's share of dirty throttling threshold
744 | 742 |  * @dtc: dirty_throttle_context of interest
745 | 743 |  *
746 | | - * Returns @wb's dirty limit in pages. The term "dirty" in the context of
747 | | - * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
748 | | - *
749 | 744 |  * Note that balance_dirty_pages() will only seriously take it as a hard limit
750 | 745 |  * when sleeping max_pause per page is not enough to keep the dirty pages under
751 | 746 |  * control. For example, when the device is completely stalled due to some error
.. | ..
759 | 754 |  *
760 | 755 |  * The wb's share of dirty limit will be adapting to its throughput and
761 | 756 |  * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
| 757 | + *
| 758 | + * Return: @wb's dirty limit in pages. The term "dirty" in the context of
| 759 | + * dirty balancing includes all PG_dirty and PG_writeback pages.
762 | 760 |  */
763 | 761 | static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
764 | 762 | {
765 | 763 |         struct wb_domain *dom = dtc_dom(dtc);
766 | 764 |         unsigned long thresh = dtc->thresh;
767 | 765 |         u64 wb_thresh;
768 | | -        long numerator, denominator;
| 766 | +        unsigned long numerator, denominator;
769 | 767 |         unsigned long wb_min_ratio, wb_max_ratio;
770 | 768 |
771 | 769 |         /*
.. | ..
776 | 774 |
777 | 775 |         wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
778 | 776 |         wb_thresh *= numerator;
779 | | -        do_div(wb_thresh, denominator);
| 777 | +        wb_thresh = div64_ul(wb_thresh, denominator);
780 | 778 |
781 | 779 |         wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);
782 | 780 |
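
The do_div() to div64_ul() switch matches the numerator/denominator widening
above: do_div() divides a u64 in place but truncates its divisor to 32 bits,
which would silently drop bits of a full unsigned long denominator, while
div64_ul() divides a u64 by an unsigned long and returns the quotient. A
small standalone sketch of the two semantics:

        #include <linux/math64.h>

        static u64 divide_example(u64 n, unsigned long d)
        {
                u32 rem;

                rem = do_div(n, 7);     /* n = n / 7 in place; returns n % 7; divisor is u32 */
                (void)rem;
                n = div64_ul(n, d);     /* quotient of a u64 by a full unsigned long */
                return n;
        }
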
.. | ..
1101 | 1099 |         bw = written - min(written, wb->written_stamp);
1102 | 1100 |         bw *= HZ;
1103 | 1101 |         if (unlikely(elapsed > period)) {
1104 | | -                do_div(bw, elapsed);
| 1102 | +                bw = div64_ul(bw, elapsed);
1105 | 1103 |                 avg = bw;
1106 | 1104 |                 goto out;
1107 | 1105 |         }
.. | ..
1566 | 1564 |         struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
1567 | 1565 |                                                      &mdtc_stor : NULL;
1568 | 1566 |         struct dirty_throttle_control *sdtc;
1569 | | -        unsigned long nr_reclaimable;   /* = file_dirty + unstable_nfs */
| 1567 | +        unsigned long nr_reclaimable;   /* = file_dirty */
1570 | 1568 |         long period;
1571 | 1569 |         long pause;
1572 | 1570 |         long max_pause;
.. | ..
1586 | 1584 |                 unsigned long m_thresh = 0;
1587 | 1585 |                 unsigned long m_bg_thresh = 0;
1588 | 1586 |
1589 | | -                /*
1590 | | -                 * Unstable writes are a feature of certain networked
1591 | | -                 * filesystems (i.e. NFS) in which data may have been
1592 | | -                 * written to the server's write cache, but has not yet
1593 | | -                 * been flushed to permanent storage.
1594 | | -                 */
1595 | | -                nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) +
1596 | | -                                 global_node_page_state(NR_UNSTABLE_NFS);
| 1587 | +                nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
1597 | 1588 |                 gdtc->avail = global_dirtyable_memory();
1598 | 1589 |                 gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
1599 | 1590 |
.. | ..
1637 | 1628 |                         }
1638 | 1629 |                 }
1639 | 1630 |
| 1631 | +                trace_android_vh_mm_dirty_limits(gdtc, strictlimit, dirty, bg_thresh,
| 1632 | +                                nr_reclaimable, pages_dirtied);
| 1633 | +
1640 | 1634 |                 /*
1641 | 1635 |                  * Throttle it only when the background writeback cannot
1642 | 1636 |                  * catch-up. This avoids (excessively) small writeouts
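
trace_android_vh_mm_dirty_limits() is an Android vendor hook, declared in the
trace/hooks/mm.h header included near the top of this file; a vendor module
attaches a handler at runtime to observe or tune the computed limits. A
hedged sketch of the usual declaration and registration pattern in the
Android common kernel (the exact TP_PROTO of this hook is an assumption
inferred from the call site above):

        /* include/trace/hooks/mm.h (sketch; prototype assumed from the call site): */
        DECLARE_HOOK(android_vh_mm_dirty_limits,
                TP_PROTO(struct dirty_throttle_control *gdtc, bool strictlimit,
                         unsigned long dirty, unsigned long bg_thresh,
                         unsigned long nr_reclaimable, unsigned long pages_dirtied),
                TP_ARGS(gdtc, strictlimit, dirty, bg_thresh, nr_reclaimable,
                        pages_dirtied));

        /* In a vendor module: vendor hook handlers take a void *data first. */
        static void my_dirty_limits_handler(void *data,
                        struct dirty_throttle_control *gdtc, bool strictlimit,
                        unsigned long dirty, unsigned long bg_thresh,
                        unsigned long nr_reclaimable, unsigned long pages_dirtied)
        {
                /* observe or adjust the throttling inputs */
        }

        /* register_trace_android_vh_mm_dirty_limits(my_dirty_limits_handler, NULL); */
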
.. | ..
1652 | 1646 |                 if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
1653 | 1647 |                     (!mdtc ||
1654 | 1648 |                      m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
1655 | | -                        unsigned long intv = dirty_poll_interval(dirty, thresh);
1656 | | -                        unsigned long m_intv = ULONG_MAX;
| 1649 | +                        unsigned long intv;
| 1650 | +                        unsigned long m_intv;
| 1651 | +
| 1652 | +free_running:
| 1653 | +                        intv = dirty_poll_interval(dirty, thresh);
| 1654 | +                        m_intv = ULONG_MAX;
1657 | 1655 |
1658 | 1656 |                         current->dirty_paused_when = now;
1659 | 1657 |                         current->nr_dirtied = 0;
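
For reference, the freerun ceiling tested above is simply the midpoint of the
background and hard thresholds; below it a dirtier runs entirely unthrottled.
Its definition elsewhere in this file (upstream, unchanged by this patch):

        static unsigned long dirty_freerun_ceiling(unsigned long thresh,
                                                   unsigned long bg_thresh)
        {
                return (thresh + bg_thresh) / 2;
        }
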
.. | ..
1666 | 1664 |                 if (unlikely(!writeback_in_progress(wb)))
1667 | 1665 |                         wb_start_background_writeback(wb);
1668 | 1666 |
| 1667 | +                mem_cgroup_flush_foreign(wb);
| 1668 | +
1669 | 1669 |                 /*
1670 | 1670 |                  * Calculate global domain's pos_ratio and select the
1671 | 1671 |                  * global dtc by default.
1672 | 1672 |                  */
1673 | | -                if (!strictlimit)
| 1673 | +                if (!strictlimit) {
1674 | 1674 |                         wb_dirty_limits(gdtc);
| 1675 | +
| 1676 | +                        if ((current->flags & PF_LOCAL_THROTTLE) &&
| 1677 | +                            gdtc->wb_dirty <
| 1678 | +                            dirty_freerun_ceiling(gdtc->wb_thresh,
| 1679 | +                                                  gdtc->wb_bg_thresh))
| 1680 | +                                /*
| 1681 | +                                 * LOCAL_THROTTLE tasks must not be throttled
| 1682 | +                                 * when below the per-wb freerun ceiling.
| 1683 | +                                 */
| 1684 | +                                goto free_running;
| 1685 | +                }
1675 | 1686 |
1676 | 1687 |                 dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
1677 | 1688 |                         ((gdtc->dirty > gdtc->thresh) || strictlimit);
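
PF_LOCAL_THROTTLE is the replacement for the old PF_LESS_THROTTLE: rather
than a blanket 1/4 lift of the global limits, a flagged task is still subject
to global throttling but is never throttled below its own writeback device's
freerun ceiling, so a flusher (loop worker, nfsd) cannot deadlock behind the
very pages it is writing out. A sketch of how a kernel thread opts in,
modeled on what the loop driver and nfsd do in the same series (the
surrounding thread function is illustrative):

        #include <linux/kthread.h>
        #include <linux/sched.h>

        static int my_flusher_thread(void *data)
        {
                /* exempt this task from per-wb throttling below the freerun ceiling */
                current->flags |= PF_LOCAL_THROTTLE;

                while (!kthread_should_stop()) {
                        /* ... write dirty pages to the backing device ... */
                }
                return 0;
        }
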
.. | ..
1686 | 1697 |                          * both global and memcg domains. Choose the one
1687 | 1698 |                          * w/ lower pos_ratio.
1688 | 1699 |                          */
1689 | | -                        if (!strictlimit)
| 1700 | +                        if (!strictlimit) {
1690 | 1701 |                                 wb_dirty_limits(mdtc);
1691 | 1702 |
| 1703 | +                                if ((current->flags & PF_LOCAL_THROTTLE) &&
| 1704 | +                                    mdtc->wb_dirty <
| 1705 | +                                    dirty_freerun_ceiling(mdtc->wb_thresh,
| 1706 | +                                                          mdtc->wb_bg_thresh))
| 1707 | +                                        /*
| 1708 | +                                         * LOCAL_THROTTLE tasks must not be
| 1709 | +                                         * throttled when below the per-wb
| 1710 | +                                         * freerun ceiling.
| 1711 | +                                         */
| 1712 | +                                        goto free_running;
| 1713 | +                        }
1692 | 1714 |                         dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
1693 | 1715 |                                 ((mdtc->dirty > mdtc->thresh) || strictlimit);
1694 | 1716 |
.. | ..
1866 | 1888 |         int ratelimit;
1867 | 1889 |         int *p;
1868 | 1890 |
1869 | | -        if (!bdi_cap_account_dirty(bdi))
| 1891 | +        if (!(bdi->capabilities & BDI_CAP_WRITEBACK))
1870 | 1892 |                 return;
1871 | 1893 |
1872 | 1894 |         if (inode_cgwb_enabled(inode))
.. | ..
1918 | 1940 |  * @wb: bdi_writeback of interest
1919 | 1941 |  *
1920 | 1942 |  * Determines whether background writeback should keep writing @wb or it's
1921 | | - * clean enough. Returns %true if writeback should continue.
| 1943 | + * clean enough.
| 1944 | + *
| 1945 | + * Return: %true if writeback should continue.
1922 | 1946 |  */
1923 | 1947 | bool wb_over_bg_thresh(struct bdi_writeback *wb)
1924 | 1948 | {
.. | ..
1927 | 1951 |         struct dirty_throttle_control * const gdtc = &gdtc_stor;
1928 | 1952 |         struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
1929 | 1953 |                                                      &mdtc_stor : NULL;
| 1954 | +        unsigned long reclaimable;
| 1955 | +        unsigned long thresh;
1930 | 1956 |
1931 | 1957 |         /*
1932 | 1958 |          * Similar to balance_dirty_pages() but ignores pages being written
1933 | 1959 |          * as we're trying to decide whether to put more under writeback.
1934 | 1960 |          */
1935 | 1961 |         gdtc->avail = global_dirtyable_memory();
1936 | | -        gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) +
1937 | | -                      global_node_page_state(NR_UNSTABLE_NFS);
| 1962 | +        gdtc->dirty = global_node_page_state(NR_FILE_DIRTY);
1938 | 1963 |         domain_dirty_limits(gdtc);
1939 | 1964 |
1940 | 1965 |         if (gdtc->dirty > gdtc->bg_thresh)
1941 | 1966 |                 return true;
1942 | 1967 |
1943 | | -        if (wb_stat(wb, WB_RECLAIMABLE) >
1944 | | -            wb_calc_thresh(gdtc->wb, gdtc->bg_thresh))
| 1968 | +        thresh = wb_calc_thresh(gdtc->wb, gdtc->bg_thresh);
| 1969 | +        if (thresh < 2 * wb_stat_error())
| 1970 | +                reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
| 1971 | +        else
| 1972 | +                reclaimable = wb_stat(wb, WB_RECLAIMABLE);
| 1973 | +
| 1974 | +        if (reclaimable > thresh)
1945 | 1975 |                 return true;
1946 | 1976 |
1947 | 1977 |         if (mdtc) {
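
wb_stat() reads a per-cpu counter without folding in the per-cpu deltas, so
it can be off by roughly the batch size times the number of CPUs;
wb_stat_sum() folds the deltas for an exact but more expensive answer. The
branch above pays for exactness only when the threshold is small enough that
the error could flip the comparison. A sketch of the underlying helpers,
assuming the usual percpu_counter-based implementation in
include/linux/backing-dev.h (approximate, for orientation only):

        static inline s64 wb_stat(struct bdi_writeback *wb,
                                  enum wb_stat_item item)
        {
                return percpu_counter_read_positive(&wb->stat[item]); /* fast, fuzzy */
        }

        static inline s64 wb_stat_sum(struct bdi_writeback *wb,
                                      enum wb_stat_item item)
        {
                return percpu_counter_sum_positive(&wb->stat[item]);  /* exact, slow */
        }
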
.. | ..
1955 | 1985 |                 if (mdtc->dirty > mdtc->bg_thresh)
1956 | 1986 |                         return true;
1957 | 1987 |
1958 | | -                if (wb_stat(wb, WB_RECLAIMABLE) >
1959 | | -                    wb_calc_thresh(mdtc->wb, mdtc->bg_thresh))
| 1988 | +                thresh = wb_calc_thresh(mdtc->wb, mdtc->bg_thresh);
| 1989 | +                if (thresh < 2 * wb_stat_error())
| 1990 | +                        reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
| 1991 | +                else
| 1992 | +                        reclaimable = wb_stat(wb, WB_RECLAIMABLE);
| 1993 | +
| 1994 | +                if (reclaimable > thresh)
1960 | 1995 |                         return true;
1961 | 1996 |         }
1962 | 1997 |
.. | ..
1967 | 2002 |  * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
1968 | 2003 |  */
1969 | 2004 | int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
1970 | | -                void __user *buffer, size_t *length, loff_t *ppos)
| 2005 | +                void *buffer, size_t *length, loff_t *ppos)
1971 | 2006 | {
1972 | 2007 |         unsigned int old_interval = dirty_writeback_interval;
1973 | 2008 |         int ret;
.. | ..
2059 | 2094 |  * Called early on to tune the page writeback dirty limits.
2060 | 2095 |  *
2061 | 2096 |  * We used to scale dirty pages according to how total memory
2062 | | - * related to pages that could be allocated for buffers (by
2063 | | - * comparing nr_free_buffer_pages() to vm_total_pages.
| 2097 | + * related to pages that could be allocated for buffers.
2064 | 2098 |  *
2065 | 2099 |  * However, that was when we used "dirty_ratio" to scale with
2066 | 2100 |  * all memory, and we don't do that any more. "dirty_ratio"
2067 | | - * is now applied to total non-HIGHPAGE memory (by subtracting
2068 | | - * totalhigh_pages from vm_total_pages), and as such we can't
| 2101 | + * is now applied to total non-HIGHPAGE memory, and as such we can't
2069 | 2102 |  * get into the old insane situation any more where we had
2070 | 2103 |  * large amounts of dirty pages compared to a small amount of
2071 | 2104 |  * non-HIGHMEM memory.
.. | ..
2097 | 2130 |  * dirty pages in the file (thus it is important for this function to be quick
2098 | 2131 |  * so that it can tag pages faster than a dirtying process can create them).
2099 | 2132 |  */
2100 | | -/*
2101 | | - * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce the i_pages lock
2102 | | - * latency.
2103 | | - */
2104 | 2133 | void tag_pages_for_writeback(struct address_space *mapping,
2105 | 2134 |                              pgoff_t start, pgoff_t end)
2106 | 2135 | {
2107 | | -#define WRITEBACK_TAG_BATCH 4096
2108 | | -        unsigned long tagged = 0;
2109 | | -        struct radix_tree_iter iter;
2110 | | -        void **slot;
| 2136 | +        XA_STATE(xas, &mapping->i_pages, start);
| 2137 | +        unsigned int tagged = 0;
| 2138 | +        void *page;
2111 | 2139 |
2112 | | -        xa_lock_irq(&mapping->i_pages);
2113 | | -        radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start,
2114 | | -                        PAGECACHE_TAG_DIRTY) {
2115 | | -                if (iter.index > end)
2116 | | -                        break;
2117 | | -                radix_tree_iter_tag_set(&mapping->i_pages, &iter,
2118 | | -                                PAGECACHE_TAG_TOWRITE);
2119 | | -                tagged++;
2120 | | -                if ((tagged % WRITEBACK_TAG_BATCH) != 0)
| 2140 | +        xas_lock_irq(&xas);
| 2141 | +        xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) {
| 2142 | +                xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE);
| 2143 | +                if (++tagged % XA_CHECK_SCHED)
2121 | 2144 |                         continue;
2122 | | -                slot = radix_tree_iter_resume(slot, &iter);
2123 | | -                xa_unlock_irq(&mapping->i_pages);
| 2145 | +
| 2146 | +                xas_pause(&xas);
| 2147 | +                xas_unlock_irq(&xas);
2124 | 2148 |                 cond_resched();
2125 | | -                xa_lock_irq(&mapping->i_pages);
| 2149 | +                xas_lock_irq(&xas);
2126 | 2150 |         }
2127 | | -        xa_unlock_irq(&mapping->i_pages);
| 2151 | +        xas_unlock_irq(&xas);
2128 | 2152 | }
2129 | 2153 | EXPORT_SYMBOL(tag_pages_for_writeback);
2130 | 2154 |
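
The conversion drops the local WRITEBACK_TAG_BATCH in favor of the XArray's
own XA_CHECK_SCHED batch constant, and xas_pause() replaces
radix_tree_iter_resume() as the way to leave the iteration restartable across
the unlock. The same walk shape works for any marked-entry scan; a minimal
generic sketch (the xarray and mark here are illustrative, not from this
file):

        #include <linux/xarray.h>
        #include <linux/sched.h>

        /* Walk all entries marked XA_MARK_0, dropping the lock periodically. */
        static void scan_marked(struct xarray *xa, unsigned long first,
                                unsigned long last)
        {
                XA_STATE(xas, xa, first);
                void *entry;
                unsigned int seen = 0;

                xas_lock(&xas);
                xas_for_each_marked(&xas, entry, last, XA_MARK_0) {
                        /* ... process entry ... */
                        if (++seen % XA_CHECK_SCHED)
                                continue;
                        xas_pause(&xas);        /* make the walk state safe to drop */
                        xas_unlock(&xas);
                        cond_resched();
                        xas_lock(&xas);
                }
                xas_unlock(&xas);
        }
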
.. | ..
2156 | 2180 |  * lock/page writeback access order inversion - we should only ever lock
2157 | 2181 |  * multiple pages in ascending page->index order, and looping back to the start
2158 | 2182 |  * of the file violates that rule and causes deadlocks.
| 2183 | + *
| 2184 | + * Return: %0 on success, negative error code otherwise
2159 | 2185 |  */
2160 | 2186 | int write_cache_pages(struct address_space *mapping,
2161 | 2187 |                       struct writeback_control *wbc, writepage_t writepage,
.. | ..
2166 | 2192 |         int error;
2167 | 2193 |         struct pagevec pvec;
2168 | 2194 |         int nr_pages;
2169 | | -        pgoff_t uninitialized_var(writeback_index);
2170 | 2195 |         pgoff_t index;
2171 | 2196 |         pgoff_t end;            /* Inclusive */
2172 | 2197 |         pgoff_t done_index;
2173 | 2198 |         int range_whole = 0;
2174 | | -        int tag;
| 2199 | +        xa_mark_t tag;
2175 | 2200 |
2176 | 2201 |         pagevec_init(&pvec);
2177 | 2202 |         if (wbc->range_cyclic) {
2178 | | -                writeback_index = mapping->writeback_index; /* prev offset */
2179 | | -                index = writeback_index;
| 2203 | +                index = mapping->writeback_index; /* prev offset */
2180 | 2204 |                 end = -1;
2181 | 2205 |         } else {
2182 | 2206 |                 index = wbc->range_start >> PAGE_SHIFT;
.. | ..
2184 | 2208 |                 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2185 | 2209 |                         range_whole = 1;
2186 | 2210 |         }
2187 | | -        if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2188 | | -                tag = PAGECACHE_TAG_TOWRITE;
2189 | | -        else
2190 | | -                tag = PAGECACHE_TAG_DIRTY;
2191 | | -        if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
| 2211 | +        if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
2192 | 2212 |                 tag_pages_for_writeback(mapping, index, end);
| 2213 | +                tag = PAGECACHE_TAG_TOWRITE;
| 2214 | +        } else {
| 2215 | +                tag = PAGECACHE_TAG_DIRTY;
| 2216 | +        }
2193 | 2217 |         done_index = index;
2194 | 2218 |         while (!done && (index <= end)) {
2195 | 2219 |                 int i;
.. | ..
2314 | 2338 |  *
2315 | 2339 |  * This is a library function, which implements the writepages()
2316 | 2340 |  * address_space_operation.
| 2341 | + *
| 2342 | + * Return: %0 on success, negative error code otherwise
2317 | 2343 |  */
2318 | 2344 | int generic_writepages(struct address_space *mapping,
2319 | 2345 |                        struct writeback_control *wbc)
.. | ..
2360 | 2386 |  *
2361 | 2387 |  * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
2362 | 2388 |  * function returns.
| 2389 | + *
| 2390 | + * Return: %0 on success, negative error code otherwise
2363 | 2391 |  */
2364 | 2392 | int write_one_page(struct page *page)
2365 | 2393 | {
.. | ..
2413 | 2441 |
2414 | 2442 |         trace_writeback_dirty_page(page, mapping);
2415 | 2443 |
2416 | | -        if (mapping_cap_account_dirty(mapping)) {
| 2444 | +        if (mapping_can_writeback(mapping)) {
2417 | 2445 |                 struct bdi_writeback *wb;
2418 | 2446 |
2419 | 2447 |                 inode_attach_wb(inode, page);
.. | ..
2427 | 2455 |                 task_io_account_write(PAGE_SIZE);
2428 | 2456 |                 current->nr_dirtied++;
2429 | 2457 |                 this_cpu_inc(bdp_ratelimits);
| 2458 | +
| 2459 | +                mem_cgroup_track_foreign_dirty(page, wb);
2430 | 2460 |         }
2431 | 2461 | }
2432 | | -EXPORT_SYMBOL(account_page_dirtied);
2433 | 2462 |
2434 | 2463 | /*
2435 | 2464 |  * Helper function for deaccounting dirty page without writeback.
.. | ..
2439 | 2468 | void account_page_cleaned(struct page *page, struct address_space *mapping,
2440 | 2469 |                           struct bdi_writeback *wb)
2441 | 2470 | {
2442 | | -        if (mapping_cap_account_dirty(mapping)) {
| 2471 | +        if (mapping_can_writeback(mapping)) {
2443 | 2472 |                 dec_lruvec_page_state(page, NR_FILE_DIRTY);
2444 | 2473 |                 dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
2445 | 2474 |                 dec_wb_stat(wb, WB_RECLAIMABLE);
.. | ..
2449 | 2478 |
2450 | 2479 | /*
2451 | 2480 |  * For address_spaces which do not use buffers. Just tag the page as dirty in
2452 | | - * its radix tree.
| 2481 | + * the xarray.
2453 | 2482 |  *
2454 | 2483 |  * This is also used when a single buffer is being dirtied: we want to set the
2455 | 2484 |  * page dirty in that case, but not all the buffers. This is a "bottom-up"
.. | ..
2475 | 2504 |                 BUG_ON(page_mapping(page) != mapping);
2476 | 2505 |                 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
2477 | 2506 |                 account_page_dirtied(page, mapping);
2478 | | -                radix_tree_tag_set(&mapping->i_pages, page_index(page),
| 2507 | +                __xa_set_mark(&mapping->i_pages, page_index(page),
2479 | 2508 |                                    PAGECACHE_TAG_DIRTY);
2480 | 2509 |                 xa_unlock_irqrestore(&mapping->i_pages, flags);
2481 | 2510 |                 unlock_page_memcg(page);
.. | ..
2502 | 2531 | {
2503 | 2532 |         struct address_space *mapping = page->mapping;
2504 | 2533 |
2505 | | -        if (mapping && mapping_cap_account_dirty(mapping)) {
| 2534 | +        if (mapping && mapping_can_writeback(mapping)) {
2506 | 2535 |                 struct inode *inode = mapping->host;
2507 | 2536 |                 struct bdi_writeback *wb;
2508 | 2537 |                 struct wb_lock_cookie cookie = {};
.. | ..
2614 | 2643 | {
2615 | 2644 |         struct address_space *mapping = page_mapping(page);
2616 | 2645 |
2617 | | -        if (mapping_cap_account_dirty(mapping)) {
| 2646 | +        if (mapping_can_writeback(mapping)) {
2618 | 2647 |                 struct inode *inode = mapping->host;
2619 | 2648 |                 struct bdi_writeback *wb;
2620 | 2649 |                 struct wb_lock_cookie cookie = {};
.. | ..
2638 | 2667 |  * Returns true if the page was previously dirty.
2639 | 2668 |  *
2640 | 2669 |  * This is for preparing to put the page under writeout. We leave the page
2641 | | - * tagged as dirty in the radix tree so that a concurrent write-for-sync
| 2670 | + * tagged as dirty in the xarray so that a concurrent write-for-sync
2642 | 2671 |  * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage
2643 | 2672 |  * implementation will run either set_page_writeback() or set_page_dirty(),
2644 | | - * at which stage we bring the page's dirty flag and radix-tree dirty tag
| 2673 | + * at which stage we bring the page's dirty flag and xarray dirty tag
2645 | 2674 |  * back into sync.
2646 | 2675 |  *
2647 | | - * This incoherency between the page's dirty flag and radix-tree tag is
| 2676 | + * This incoherency between the page's dirty flag and xarray tag is
2648 | 2677 |  * unfortunate, but it only exists while the page is locked.
2649 | 2678 |  */
2650 | 2679 | int clear_page_dirty_for_io(struct page *page)
.. | ..
2652 | 2681 |         struct address_space *mapping = page_mapping(page);
2653 | 2682 |         int ret = 0;
2654 | 2683 |
2655 | | -        BUG_ON(!PageLocked(page));
| 2684 | +        VM_BUG_ON_PAGE(!PageLocked(page), page);
2656 | 2685 |
2657 | | -        if (mapping && mapping_cap_account_dirty(mapping)) {
| 2686 | +        if (mapping && mapping_can_writeback(mapping)) {
2658 | 2687 |                 struct inode *inode = mapping->host;
2659 | 2688 |                 struct bdi_writeback *wb;
2660 | 2689 |                 struct wb_lock_cookie cookie = {};
.. | ..
2725 | 2754 |                 xa_lock_irqsave(&mapping->i_pages, flags);
2726 | 2755 |                 ret = TestClearPageWriteback(page);
2727 | 2756 |                 if (ret) {
2728 | | -                        radix_tree_tag_clear(&mapping->i_pages, page_index(page),
| 2757 | +                        __xa_clear_mark(&mapping->i_pages, page_index(page),
2729 | 2758 |                                                 PAGECACHE_TAG_WRITEBACK);
2730 | | -                        if (bdi_cap_account_writeback(bdi)) {
| 2759 | +                        if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
2731 | 2760 |                                 struct bdi_writeback *wb = inode_to_wb(inode);
2732 | 2761 |
2733 | 2762 |                                 dec_wb_stat(wb, WB_WRITEBACK);
.. | ..
2743 | 2772 |         } else {
2744 | 2773 |                 ret = TestClearPageWriteback(page);
2745 | 2774 |         }
2746 | | -        /*
2747 | | -         * NOTE: Page might be free now! Writeback doesn't hold a page
2748 | | -         * reference on its own, it relies on truncation to wait for
2749 | | -         * the clearing of PG_writeback. The below can only access
2750 | | -         * page state that is static across allocation cycles.
2751 | | -         */
2752 | 2775 |         if (ret) {
2753 | 2776 |                 dec_lruvec_state(lruvec, NR_WRITEBACK);
2754 | 2777 |                 dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
.. | ..
2761 | 2784 | int __test_set_page_writeback(struct page *page, bool keep_write)
2762 | 2785 | {
2763 | 2786 |         struct address_space *mapping = page_mapping(page);
2764 | | -        int ret;
| 2787 | +        int ret, access_ret;
2765 | 2788 |
2766 | 2789 |         lock_page_memcg(page);
2767 | 2790 |         if (mapping && mapping_use_writeback_tags(mapping)) {
| 2791 | +                XA_STATE(xas, &mapping->i_pages, page_index(page));
2768 | 2792 |                 struct inode *inode = mapping->host;
2769 | 2793 |                 struct backing_dev_info *bdi = inode_to_bdi(inode);
2770 | 2794 |                 unsigned long flags;
2771 | 2795 |
2772 | | -                xa_lock_irqsave(&mapping->i_pages, flags);
| 2796 | +                xas_lock_irqsave(&xas, flags);
| 2797 | +                xas_load(&xas);
2773 | 2798 |                 ret = TestSetPageWriteback(page);
2774 | 2799 |                 if (!ret) {
2775 | 2800 |                         bool on_wblist;
.. | ..
2777 | 2802 |                         on_wblist = mapping_tagged(mapping,
2778 | 2803 |                                                    PAGECACHE_TAG_WRITEBACK);
2779 | 2804 |
2780 | | -                        radix_tree_tag_set(&mapping->i_pages, page_index(page),
2781 | | -                                                PAGECACHE_TAG_WRITEBACK);
2782 | | -                        if (bdi_cap_account_writeback(bdi))
| 2805 | +                        xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
| 2806 | +                        if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT)
2783 | 2807 |                                 inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
2784 | 2808 |
2785 | 2809 |                         /*
.. | ..
2791 | 2815 |                                 sb_mark_inode_writeback(mapping->host);
2792 | 2816 |                 }
2793 | 2817 |                 if (!PageDirty(page))
2794 | | -                        radix_tree_tag_clear(&mapping->i_pages, page_index(page),
2795 | | -                                                PAGECACHE_TAG_DIRTY);
| 2818 | +                        xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
2796 | 2819 |                 if (!keep_write)
2797 | | -                        radix_tree_tag_clear(&mapping->i_pages, page_index(page),
2798 | | -                                                PAGECACHE_TAG_TOWRITE);
2799 | | -                xa_unlock_irqrestore(&mapping->i_pages, flags);
| 2820 | +                        xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
| 2821 | +                xas_unlock_irqrestore(&xas, flags);
2800 | 2822 |         } else {
2801 | 2823 |                 ret = TestSetPageWriteback(page);
2802 | 2824 |         }
.. | ..
2805 | 2827 |                 inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
2806 | 2828 |         }
2807 | 2829 |         unlock_page_memcg(page);
| 2830 | +        access_ret = arch_make_page_accessible(page);
| 2831 | +        /*
| 2832 | +         * If writeback has been triggered on a page that cannot be made
| 2833 | +         * accessible, it is too late to recover here.
| 2834 | +         */
| 2835 | +        VM_BUG_ON_PAGE(access_ret != 0, page);
| 2836 | +
2808 | 2837 |         return ret;
2809 | 2838 |
2810 | 2839 | }
2811 | 2840 | EXPORT_SYMBOL(__test_set_page_writeback);
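
arch_make_page_accessible() gives an architecture a hook to make the page
readable to the host before I/O is started; it was introduced for s390
protected virtualization, where guest pages must be exported before the
kernel may touch them. On every other architecture it is a no-op; the generic
fallback in include/linux/mm.h looks roughly like this (a sketch from
memory, not code from this file):

        #ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
        static inline int arch_make_page_accessible(struct page *page)
        {
                return 0;       /* nothing to do unless the arch overrides it */
        }
        #endif
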
2812 | 2841 |
2813 | 2842 | /*
2814 | | - * Return true if any of the pages in the mapping are marked with the
2815 | | - * passed tag.
| 2843 | + * Wait for a page to complete writeback
2816 | 2844 |  */
2817 | | -int mapping_tagged(struct address_space *mapping, int tag)
| 2845 | +void wait_on_page_writeback(struct page *page)
2818 | 2846 | {
2819 | | -        return radix_tree_tagged(&mapping->i_pages, tag);
| 2847 | +        while (PageWriteback(page)) {
| 2848 | +                trace_wait_on_page_writeback(page, page_mapping(page));
| 2849 | +                wait_on_page_bit(page, PG_writeback);
| 2850 | +        }
2820 | 2851 | }
2821 | | -EXPORT_SYMBOL(mapping_tagged);
| 2852 | +EXPORT_SYMBOL_GPL(wait_on_page_writeback);
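
The wait is a loop because the page can be redirtied and put under writeback
again between the writeback bit clearing and the waiter waking. Callers
normally hold the page lock so the state cannot change underneath them once
the wait returns; a typical caller sequence (illustrative, not from this
patch):

        #include <linux/pagemap.h>

        static void make_page_stable(struct page *page)
        {
                lock_page(page);
                wait_on_page_writeback(page);   /* no writeback in flight now */
                /* ... truncate, invalidate, or redirty the page ... */
                unlock_page(page);
        }
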
2822 | 2853 |
2823 | 2854 | /**
2824 | 2855 |  * wait_for_stable_page() - wait for writeback to finish, if necessary.
.. | ..
2830 | 2861 |  */
2831 | 2862 | void wait_for_stable_page(struct page *page)
2832 | 2863 | {
2833 | | -        if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host)))
| 2864 | +        page = thp_head(page);
| 2865 | +        if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES)
2834 | 2866 |                 wait_on_page_writeback(page);
2835 | 2867 | }
2836 | 2868 | EXPORT_SYMBOL_GPL(wait_for_stable_page);
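
The stable-writes requirement moves from a bdi capability to a per-superblock
flag, so this hot-path check no longer has to reach the bdi through
page->mapping->host. The flag is derived from the block queue when the
superblock is set up; the VFS wiring from the same series looks roughly like
this (a sketch, not code from this file):

        /* fs/super.c, when the superblock is bound to its block device: */
        if (blk_queue_stable_writes(bdev_get_queue(sb->s_bdev)))
                sb->s_iflags |= SB_I_STABLE_WRITES;
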