| .. | .. |
|---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
|---|
| 1 | 2 | /* |
|---|
| 2 | 3 | * mm/page-writeback.c |
|---|
| 3 | 4 | * |
|---|
| .. | .. |
|---|
| 40 | 41 | #include <trace/events/writeback.h> |
|---|
| 41 | 42 | |
|---|
| 42 | 43 | #include "internal.h" |
|---|
| 44 | + |
|---|
| 45 | +#undef CREATE_TRACE_POINTS |
|---|
| 46 | +#include <trace/hooks/mm.h> |
|---|
| 43 | 47 | |
|---|
| 44 | 48 | /* |
|---|
| 45 | 49 | * Sleep at most 200ms at a time in balance_dirty_pages(). |
|---|
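The two added lines above hook this file into Android's vendor-hook mechanism: `trace/hooks/mm.h` declares restricted tracepoints ("vendor hooks") that OEM modules can attach to, and the `#undef CREATE_TRACE_POINTS` guarantees the include only *declares* the hook here rather than instantiating it. A hedged sketch of the declaration side — the exact `TP_PROTO` is an assumption inferred from the `trace_android_vh_mm_dirty_limits()` call site later in this patch, not quoted from the header:

```c
/*
 * Hedged sketch of a vendor-hook declaration as it would appear in
 * Android's include/trace/hooks/mm.h. The prototype below is inferred
 * from the call site in balance_dirty_pages(), not copied verbatim.
 */
DECLARE_HOOK(android_vh_mm_dirty_limits,
	TP_PROTO(struct dirty_throttle_control *gdtc, bool strictlimit,
		 unsigned long dirty, unsigned long bg_thresh,
		 unsigned long nr_reclaimable, unsigned long pages_dirtied),
	TP_ARGS(gdtc, strictlimit, dirty, bg_thresh,
		nr_reclaimable, pages_dirtied));
```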
| .. | .. |
|---|
| 256 | 260 | * requiring writeback. |
|---|
| 257 | 261 | * |
|---|
| 258 | 262 | * This number of dirtyable pages is the base value of which the |
|---|
| 259 | | - * user-configurable dirty ratio is the effictive number of pages that |
|---|
| 263 | + * user-configurable dirty ratio is the effective number of pages that |
|---|
| 260 | 264 | * are allowed to be actually dirtied. Per individual zone, or |
|---|
| 261 | 265 | * globally by using the sum of dirtyable pages over all zones. |
|---|
| 262 | 266 | * |
|---|
| .. | .. |
|---|
| 270 | 274 | * node_dirtyable_memory - number of dirtyable pages in a node |
|---|
| 271 | 275 | * @pgdat: the node |
|---|
| 272 | 276 | * |
|---|
| 273 | | - * Returns the node's number of pages potentially available for dirty |
|---|
| 277 | + * Return: the node's number of pages potentially available for dirty |
|---|
| 274 | 278 | * page cache. This is the base value for the per-node dirty limits. |
|---|
| 275 | 279 | */ |
|---|
| 276 | 280 | static unsigned long node_dirtyable_memory(struct pglist_data *pgdat) |
|---|
| .. | .. |
|---|
| 355 | 359 | /** |
|---|
| 356 | 360 | * global_dirtyable_memory - number of globally dirtyable pages |
|---|
| 357 | 361 | * |
|---|
| 358 | | - * Returns the global number of pages potentially available for dirty |
|---|
| 362 | + * Return: the global number of pages potentially available for dirty |
|---|
| 359 | 363 | * page cache. This is the base value for the global dirty limits. |
|---|
| 360 | 364 | */ |
|---|
| 361 | 365 | static unsigned long global_dirtyable_memory(void) |
|---|
| .. | .. |
|---|
| 386 | 390 | * Calculate @dtc->thresh and ->bg_thresh considering |
|---|
| 387 | 391 | * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller |
|---|
| 388 | 392 | * must ensure that @dtc->avail is set before calling this function. The |
|---|
| 389 | | - * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and |
|---|
| 390 | | - * real-time tasks. |
|---|
| 393 | + * dirty limits will be lifted by 1/4 for real-time tasks. |
|---|
| 391 | 394 | */ |
|---|
| 392 | 395 | static void domain_dirty_limits(struct dirty_throttle_control *dtc) |
|---|
| 393 | 396 | { |
|---|
| .. | .. |
|---|
| 435 | 438 | if (bg_thresh >= thresh) |
|---|
| 436 | 439 | bg_thresh = thresh / 2; |
|---|
| 437 | 440 | tsk = current; |
|---|
| 438 | | - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { |
|---|
| 441 | + if (rt_task(tsk)) { |
|---|
| 439 | 442 | bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; |
|---|
| 440 | 443 | thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; |
|---|
| 441 | 444 | } |
|---|
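With `PF_LESS_THROTTLE` retired (its successor `PF_LOCAL_THROTTLE` is handled further down, in `balance_dirty_pages()`), only real-time tasks keep the blanket lift here. To make the arithmetic concrete — illustrative numbers, not from the patch — with `thresh = 200000` pages and `global_wb_domain.dirty_limit = 190000`, an rt task ends up with `200000 + 50000 + 5937 = 255937` pages:

```c
/*
 * Illustrative helper mirroring the rt-task lift applied above:
 * +25% of the threshold plus ~3% (1/32) of the domain dirty limit.
 */
static unsigned long rt_lifted_thresh(unsigned long thresh,
				      unsigned long dirty_limit)
{
	return thresh + thresh / 4 + dirty_limit / 32;
}
```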
| .. | .. |
|---|
| 470 | 473 | * node_dirty_limit - maximum number of dirty pages allowed in a node |
|---|
| 471 | 474 | * @pgdat: the node |
|---|
| 472 | 475 | * |
|---|
| 473 | | - * Returns the maximum number of dirty pages allowed in a node, based |
|---|
| 476 | + * Return: the maximum number of dirty pages allowed in a node, based |
|---|
| 474 | 477 | * on the node's dirtyable memory. |
|---|
| 475 | 478 | */ |
|---|
| 476 | 479 | static unsigned long node_dirty_limit(struct pglist_data *pgdat) |
|---|
| .. | .. |
|---|
| 485 | 488 | else |
|---|
| 486 | 489 | dirty = vm_dirty_ratio * node_memory / 100; |
|---|
| 487 | 490 | |
|---|
| 488 | | - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) |
|---|
| 491 | + if (rt_task(tsk)) |
|---|
| 489 | 492 | dirty += dirty / 4; |
|---|
| 490 | 493 | |
|---|
| 491 | 494 | return dirty; |
|---|
| .. | .. |
|---|
| 495 | 498 | * node_dirty_ok - tells whether a node is within its dirty limits |
|---|
| 496 | 499 | * @pgdat: the node to check |
|---|
| 497 | 500 | * |
|---|
| 498 | | - * Returns %true when the dirty pages in @pgdat are within the node's |
|---|
| 501 | + * Return: %true when the dirty pages in @pgdat are within the node's |
|---|
| 499 | 502 | * dirty limit, %false if the limit is exceeded. |
|---|
| 500 | 503 | */ |
|---|
| 501 | 504 | bool node_dirty_ok(struct pglist_data *pgdat) |
|---|
| .. | .. |
|---|
| 504 | 507 | unsigned long nr_pages = 0; |
|---|
| 505 | 508 | |
|---|
| 506 | 509 | nr_pages += node_page_state(pgdat, NR_FILE_DIRTY); |
|---|
| 507 | | - nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS); |
|---|
| 508 | 510 | nr_pages += node_page_state(pgdat, NR_WRITEBACK); |
|---|
| 509 | 511 | |
|---|
| 510 | 512 | return nr_pages <= limit; |
|---|
| 511 | 513 | } |
|---|
| 512 | 514 | |
|---|
| 513 | 515 | int dirty_background_ratio_handler(struct ctl_table *table, int write, |
|---|
| 514 | | - void __user *buffer, size_t *lenp, |
|---|
| 515 | | - loff_t *ppos) |
|---|
| 516 | + void *buffer, size_t *lenp, loff_t *ppos) |
|---|
| 516 | 517 | { |
|---|
| 517 | 518 | int ret; |
|---|
| 518 | 519 | |
|---|
| .. | .. |
|---|
| 523 | 524 | } |
|---|
| 524 | 525 | |
|---|
| 525 | 526 | int dirty_background_bytes_handler(struct ctl_table *table, int write, |
|---|
| 526 | | - void __user *buffer, size_t *lenp, |
|---|
| 527 | | - loff_t *ppos) |
|---|
| 527 | + void *buffer, size_t *lenp, loff_t *ppos) |
|---|
| 528 | 528 | { |
|---|
| 529 | 529 | int ret; |
|---|
| 530 | 530 | |
|---|
| .. | .. |
|---|
| 534 | 534 | return ret; |
|---|
| 535 | 535 | } |
|---|
| 536 | 536 | |
|---|
| 537 | | -int dirty_ratio_handler(struct ctl_table *table, int write, |
|---|
| 538 | | - void __user *buffer, size_t *lenp, |
|---|
| 539 | | - loff_t *ppos) |
|---|
| 537 | +int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, |
|---|
| 538 | + size_t *lenp, loff_t *ppos) |
|---|
| 540 | 539 | { |
|---|
| 541 | 540 | int old_ratio = vm_dirty_ratio; |
|---|
| 542 | 541 | int ret; |
|---|
| .. | .. |
|---|
| 550 | 549 | } |
|---|
| 551 | 550 | |
|---|
| 552 | 551 | int dirty_bytes_handler(struct ctl_table *table, int write, |
|---|
| 553 | | - void __user *buffer, size_t *lenp, |
|---|
| 554 | | - loff_t *ppos) |
|---|
| 552 | + void *buffer, size_t *lenp, loff_t *ppos) |
|---|
| 555 | 553 | { |
|---|
| 556 | 554 | unsigned long old_bytes = vm_dirty_bytes; |
|---|
| 557 | 555 | int ret; |
|---|
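These signature changes follow the sysctl core rework in which handlers stopped receiving `void __user *`: the core now copies the user buffer into kernel memory first, so every handler takes a plain kernel pointer. A minimal handler against the new convention — `my_ratio` and the handler itself are hypothetical, shown only to illustrate the calling convention:

```c
static int my_ratio;

/* Hypothetical knob demonstrating the kernel-pointer sysctl signature. */
static int my_ratio_handler(struct ctl_table *table, int write,
			    void *buffer, size_t *lenp, loff_t *ppos)
{
	int old_ratio = my_ratio;
	int ret;

	/* proc_dointvec_minmax() now also takes a kernel pointer. */
	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write && my_ratio != old_ratio)
		pr_debug("my_ratio: %d -> %d\n", old_ratio, my_ratio);
	return ret;
}
```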
| .. | .. |
|---|
| 743 | 741 | * __wb_calc_thresh - @wb's share of dirty throttling threshold |
|---|
| 744 | 742 | * @dtc: dirty_throttle_context of interest |
|---|
| 745 | 743 | * |
|---|
| 746 | | - * Returns @wb's dirty limit in pages. The term "dirty" in the context of |
|---|
| 747 | | - * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. |
|---|
| 748 | | - * |
|---|
| 749 | 744 | * Note that balance_dirty_pages() will only seriously take it as a hard limit |
|---|
| 750 | 745 | * when sleeping max_pause per page is not enough to keep the dirty pages under |
|---|
| 751 | 746 | * control. For example, when the device is completely stalled due to some error |
|---|
| .. | .. |
|---|
| 759 | 754 | * |
|---|
| 760 | 755 | * The wb's share of dirty limit will be adapting to its throughput and |
|---|
| 761 | 756 | * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. |
|---|
| 757 | + * |
|---|
| 758 | + * Return: @wb's dirty limit in pages. The term "dirty" in the context of |
|---|
| 759 | + * dirty balancing includes all PG_dirty and PG_writeback pages. |
|---|
| 762 | 760 | */ |
|---|
| 763 | 761 | static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) |
|---|
| 764 | 762 | { |
|---|
| 765 | 763 | struct wb_domain *dom = dtc_dom(dtc); |
|---|
| 766 | 764 | unsigned long thresh = dtc->thresh; |
|---|
| 767 | 765 | u64 wb_thresh; |
|---|
| 768 | | - long numerator, denominator; |
|---|
| 766 | + unsigned long numerator, denominator; |
|---|
| 769 | 767 | unsigned long wb_min_ratio, wb_max_ratio; |
|---|
| 770 | 768 | |
|---|
| 771 | 769 | /* |
|---|
| .. | .. |
|---|
| 776 | 774 | |
|---|
| 777 | 775 | wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100; |
|---|
| 778 | 776 | wb_thresh *= numerator; |
|---|
| 779 | | - do_div(wb_thresh, denominator); |
|---|
| 777 | + wb_thresh = div64_ul(wb_thresh, denominator); |
|---|
| 780 | 778 | |
|---|
| 781 | 779 | wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio); |
|---|
| 782 | 780 | |
|---|
| .. | .. |
|---|
| 1101 | 1099 | bw = written - min(written, wb->written_stamp); |
|---|
| 1102 | 1100 | bw *= HZ; |
|---|
| 1103 | 1101 | if (unlikely(elapsed > period)) { |
|---|
| 1104 | | - do_div(bw, elapsed); |
|---|
| 1102 | + bw = div64_ul(bw, elapsed); |
|---|
| 1105 | 1103 | avg = bw; |
|---|
| 1106 | 1104 | goto out; |
|---|
| 1107 | 1105 | } |
|---|
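Both conversions above replace `do_div()`, whose divisor is truncated to 32 bits and which divides in place, with `div64_ul()`, which accepts a full `unsigned long` divisor (matching the new type of `denominator`) and returns the quotient. The difference in miniature:

```c
#include <linux/math64.h>

/*
 * do_div(bw, elapsed) would truncate @elapsed to 32 bits on 64-bit
 * kernels; div64_ul() divides by the full unsigned long value.
 */
static u64 scale_bandwidth(u64 written_bytes, unsigned long elapsed)
{
	return div64_ul(written_bytes * HZ, elapsed);
}
```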
| .. | .. |
|---|
| 1566 | 1564 | struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? |
|---|
| 1567 | 1565 | &mdtc_stor : NULL; |
|---|
| 1568 | 1566 | struct dirty_throttle_control *sdtc; |
|---|
| 1569 | | - unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ |
|---|
| 1567 | + unsigned long nr_reclaimable; /* = file_dirty */ |
|---|
| 1570 | 1568 | long period; |
|---|
| 1571 | 1569 | long pause; |
|---|
| 1572 | 1570 | long max_pause; |
|---|
| .. | .. |
|---|
| 1586 | 1584 | unsigned long m_thresh = 0; |
|---|
| 1587 | 1585 | unsigned long m_bg_thresh = 0; |
|---|
| 1588 | 1586 | |
|---|
| 1589 | | - /* |
|---|
| 1590 | | - * Unstable writes are a feature of certain networked |
|---|
| 1591 | | - * filesystems (i.e. NFS) in which data may have been |
|---|
| 1592 | | - * written to the server's write cache, but has not yet |
|---|
| 1593 | | - * been flushed to permanent storage. |
|---|
| 1594 | | - */ |
|---|
| 1595 | | - nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) + |
|---|
| 1596 | | - global_node_page_state(NR_UNSTABLE_NFS); |
|---|
| 1587 | + nr_reclaimable = global_node_page_state(NR_FILE_DIRTY); |
|---|
| 1597 | 1588 | gdtc->avail = global_dirtyable_memory(); |
|---|
| 1598 | 1589 | gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK); |
|---|
| 1599 | 1590 | |
|---|
| .. | .. |
|---|
| 1637 | 1628 | } |
|---|
| 1638 | 1629 | } |
|---|
| 1639 | 1630 | |
|---|
| 1631 | + trace_android_vh_mm_dirty_limits(gdtc, strictlimit, dirty, bg_thresh, |
|---|
| 1632 | + nr_reclaimable, pages_dirtied); |
|---|
| 1633 | + |
|---|
| 1640 | 1634 | /* |
|---|
| 1641 | 1635 | * Throttle it only when the background writeback cannot |
|---|
| 1642 | 1636 | * catch-up. This avoids (excessively) small writeouts |
|---|
| .. | .. |
|---|
| 1652 | 1646 | if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) && |
|---|
| 1653 | 1647 | (!mdtc || |
|---|
| 1654 | 1648 | m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) { |
|---|
| 1655 | | - unsigned long intv = dirty_poll_interval(dirty, thresh); |
|---|
| 1656 | | - unsigned long m_intv = ULONG_MAX; |
|---|
| 1649 | + unsigned long intv; |
|---|
| 1650 | + unsigned long m_intv; |
|---|
| 1651 | + |
|---|
| 1652 | +free_running: |
|---|
| 1653 | + intv = dirty_poll_interval(dirty, thresh); |
|---|
| 1654 | + m_intv = ULONG_MAX; |
|---|
| 1657 | 1655 | |
|---|
| 1658 | 1656 | current->dirty_paused_when = now; |
|---|
| 1659 | 1657 | current->nr_dirtied = 0; |
|---|
| .. | .. |
|---|
| 1666 | 1664 | if (unlikely(!writeback_in_progress(wb))) |
|---|
| 1667 | 1665 | wb_start_background_writeback(wb); |
|---|
| 1668 | 1666 | |
|---|
| 1667 | + mem_cgroup_flush_foreign(wb); |
|---|
| 1668 | + |
|---|
| 1669 | 1669 | /* |
|---|
| 1670 | 1670 | * Calculate global domain's pos_ratio and select the |
|---|
| 1671 | 1671 | * global dtc by default. |
|---|
| 1672 | 1672 | */ |
|---|
| 1673 | | - if (!strictlimit) |
|---|
| 1673 | + if (!strictlimit) { |
|---|
| 1674 | 1674 | wb_dirty_limits(gdtc); |
|---|
| 1675 | + |
|---|
| 1676 | + if ((current->flags & PF_LOCAL_THROTTLE) && |
|---|
| 1677 | + gdtc->wb_dirty < |
|---|
| 1678 | + dirty_freerun_ceiling(gdtc->wb_thresh, |
|---|
| 1679 | + gdtc->wb_bg_thresh)) |
|---|
| 1680 | + /* |
|---|
| 1681 | + * LOCAL_THROTTLE tasks must not be throttled |
|---|
| 1682 | + * when below the per-wb freerun ceiling. |
|---|
| 1683 | + */ |
|---|
| 1684 | + goto free_running; |
|---|
| 1685 | + } |
|---|
| 1675 | 1686 | |
|---|
| 1676 | 1687 | dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && |
|---|
| 1677 | 1688 | ((gdtc->dirty > gdtc->thresh) || strictlimit); |
|---|
| .. | .. |
|---|
| 1686 | 1697 | * both global and memcg domains. Choose the one |
|---|
| 1687 | 1698 | * w/ lower pos_ratio. |
|---|
| 1688 | 1699 | */ |
|---|
| 1689 | | - if (!strictlimit) |
|---|
| 1700 | + if (!strictlimit) { |
|---|
| 1690 | 1701 | wb_dirty_limits(mdtc); |
|---|
| 1691 | 1702 | |
|---|
| 1703 | + if ((current->flags & PF_LOCAL_THROTTLE) && |
|---|
| 1704 | + mdtc->wb_dirty < |
|---|
| 1705 | + dirty_freerun_ceiling(mdtc->wb_thresh, |
|---|
| 1706 | + mdtc->wb_bg_thresh)) |
|---|
| 1707 | + /* |
|---|
| 1708 | + * LOCAL_THROTTLE tasks must not be |
|---|
| 1709 | + * throttled when below the per-wb |
|---|
| 1710 | + * freerun ceiling. |
|---|
| 1711 | + */ |
|---|
| 1712 | + goto free_running; |
|---|
| 1713 | + } |
|---|
| 1692 | 1714 | dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) && |
|---|
| 1693 | 1715 | ((mdtc->dirty > mdtc->thresh) || strictlimit); |
|---|
| 1694 | 1716 | |
|---|
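The `PF_LOCAL_THROTTLE` checks added in both branches implement the replacement for `PF_LESS_THROTTLE`: a flagged task (nfsd, the loop worker) that is itself cleaning pages for one device is never throttled against the global limits while its target wb sits below its own freerun ceiling, since stalling it could deadlock the very writeback that would relieve the pressure. Opting in is a single flag on the task; a hedged sketch of a caller, modeled on what drivers such as the loop device do:

```c
/* Sketch: a writeback-helper kthread opting in to per-wb-only throttling. */
static int my_backing_worker(void *data)
{
	current->flags |= PF_LOCAL_THROTTLE;

	while (!kthread_should_stop()) {
		/* ...issue writeback to the backing file here... */
		cond_resched();
	}
	return 0;
}
```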
| .. | .. |
|---|
| 1866 | 1888 | int ratelimit; |
|---|
| 1867 | 1889 | int *p; |
|---|
| 1868 | 1890 | |
|---|
| 1869 | | - if (!bdi_cap_account_dirty(bdi)) |
|---|
| 1891 | + if (!(bdi->capabilities & BDI_CAP_WRITEBACK)) |
|---|
| 1870 | 1892 | return; |
|---|
| 1871 | 1893 | |
|---|
| 1872 | 1894 | if (inode_cgwb_enabled(inode)) |
|---|
| .. | .. |
|---|
| 1918 | 1940 | * @wb: bdi_writeback of interest |
|---|
| 1919 | 1941 | * |
|---|
| 1920 | 1942 | * Determines whether background writeback should keep writing @wb or it's |
|---|
| 1921 | | - * clean enough. Returns %true if writeback should continue. |
|---|
| 1943 | + * clean enough. |
|---|
| 1944 | + * |
|---|
| 1945 | + * Return: %true if writeback should continue. |
|---|
| 1922 | 1946 | */ |
|---|
| 1923 | 1947 | bool wb_over_bg_thresh(struct bdi_writeback *wb) |
|---|
| 1924 | 1948 | { |
|---|
| .. | .. |
|---|
| 1927 | 1951 | struct dirty_throttle_control * const gdtc = &gdtc_stor; |
|---|
| 1928 | 1952 | struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? |
|---|
| 1929 | 1953 | &mdtc_stor : NULL; |
|---|
| 1954 | + unsigned long reclaimable; |
|---|
| 1955 | + unsigned long thresh; |
|---|
| 1930 | 1956 | |
|---|
| 1931 | 1957 | /* |
|---|
| 1932 | 1958 | * Similar to balance_dirty_pages() but ignores pages being written |
|---|
| 1933 | 1959 | * as we're trying to decide whether to put more under writeback. |
|---|
| 1934 | 1960 | */ |
|---|
| 1935 | 1961 | gdtc->avail = global_dirtyable_memory(); |
|---|
| 1936 | | - gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) + |
|---|
| 1937 | | - global_node_page_state(NR_UNSTABLE_NFS); |
|---|
| 1962 | + gdtc->dirty = global_node_page_state(NR_FILE_DIRTY); |
|---|
| 1938 | 1963 | domain_dirty_limits(gdtc); |
|---|
| 1939 | 1964 | |
|---|
| 1940 | 1965 | if (gdtc->dirty > gdtc->bg_thresh) |
|---|
| 1941 | 1966 | return true; |
|---|
| 1942 | 1967 | |
|---|
| 1943 | | - if (wb_stat(wb, WB_RECLAIMABLE) > |
|---|
| 1944 | | - wb_calc_thresh(gdtc->wb, gdtc->bg_thresh)) |
|---|
| 1968 | + thresh = wb_calc_thresh(gdtc->wb, gdtc->bg_thresh); |
|---|
| 1969 | + if (thresh < 2 * wb_stat_error()) |
|---|
| 1970 | + reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); |
|---|
| 1971 | + else |
|---|
| 1972 | + reclaimable = wb_stat(wb, WB_RECLAIMABLE); |
|---|
| 1973 | + |
|---|
| 1974 | + if (reclaimable > thresh) |
|---|
| 1945 | 1975 | return true; |
|---|
| 1946 | 1976 | |
|---|
| 1947 | 1977 | if (mdtc) { |
|---|
| .. | .. |
|---|
| 1955 | 1985 | if (mdtc->dirty > mdtc->bg_thresh) |
|---|
| 1956 | 1986 | return true; |
|---|
| 1957 | 1987 | |
|---|
| 1958 | | - if (wb_stat(wb, WB_RECLAIMABLE) > |
|---|
| 1959 | | - wb_calc_thresh(mdtc->wb, mdtc->bg_thresh)) |
|---|
| 1988 | + thresh = wb_calc_thresh(mdtc->wb, mdtc->bg_thresh); |
|---|
| 1989 | + if (thresh < 2 * wb_stat_error()) |
|---|
| 1990 | + reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); |
|---|
| 1991 | + else |
|---|
| 1992 | + reclaimable = wb_stat(wb, WB_RECLAIMABLE); |
|---|
| 1993 | + |
|---|
| 1994 | + if (reclaimable > thresh) |
|---|
| 1960 | 1995 | return true; |
|---|
| 1961 | 1996 | } |
|---|
| 1962 | 1997 | |
|---|
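`wb_stat()` is a cheap per-cpu counter read that can be off by up to `wb_stat_error()`, which is harmless against large thresholds but can exceed the threshold outright for a small writeback domain (e.g. a tight memcg), making `wb_over_bg_thresh()` give the wrong answer. The added branches therefore pay for the exact `wb_stat_sum()` only when the threshold is within twice the error margin — the same guard, in isolation:

```c
/*
 * Prefer the cheap approximate read; fall back to the exact sum only
 * when the approximation error could be significant vs. the threshold.
 */
static unsigned long wb_reclaimable(struct bdi_writeback *wb,
				    unsigned long thresh)
{
	if (thresh < 2 * wb_stat_error())
		return wb_stat_sum(wb, WB_RECLAIMABLE);
	return wb_stat(wb, WB_RECLAIMABLE);
}
```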
| .. | .. |
|---|
| 1967 | 2002 | * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs |
|---|
| 1968 | 2003 | */ |
|---|
| 1969 | 2004 | int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, |
|---|
| 1970 | | - void __user *buffer, size_t *length, loff_t *ppos) |
|---|
| 2005 | + void *buffer, size_t *length, loff_t *ppos) |
|---|
| 1971 | 2006 | { |
|---|
| 1972 | 2007 | unsigned int old_interval = dirty_writeback_interval; |
|---|
| 1973 | 2008 | int ret; |
|---|
| .. | .. |
|---|
| 2059 | 2094 | * Called early on to tune the page writeback dirty limits. |
|---|
| 2060 | 2095 | * |
|---|
| 2061 | 2096 | * We used to scale dirty pages according to how total memory |
|---|
| 2062 | | - * related to pages that could be allocated for buffers (by |
|---|
| 2063 | | - * comparing nr_free_buffer_pages() to vm_total_pages. |
|---|
| 2097 | + * related to pages that could be allocated for buffers. |
|---|
| 2064 | 2098 | * |
|---|
| 2065 | 2099 | * However, that was when we used "dirty_ratio" to scale with |
|---|
| 2066 | 2100 | * all memory, and we don't do that any more. "dirty_ratio" |
|---|
| 2067 | | - * is now applied to total non-HIGHPAGE memory (by subtracting |
|---|
| 2068 | | - * totalhigh_pages from vm_total_pages), and as such we can't |
|---|
| 2101 | + * is now applied to total non-HIGHPAGE memory, and as such we can't |
|---|
| 2069 | 2102 | * get into the old insane situation any more where we had |
|---|
| 2070 | 2103 | * large amounts of dirty pages compared to a small amount of |
|---|
| 2071 | 2104 | * non-HIGHMEM memory. |
|---|
| .. | .. |
|---|
| 2097 | 2130 | * dirty pages in the file (thus it is important for this function to be quick |
|---|
| 2098 | 2131 | * so that it can tag pages faster than a dirtying process can create them). |
|---|
| 2099 | 2132 | */ |
|---|
| 2100 | | -/* |
|---|
| 2101 | | - * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce the i_pages lock |
|---|
| 2102 | | - * latency. |
|---|
| 2103 | | - */ |
|---|
| 2104 | 2133 | void tag_pages_for_writeback(struct address_space *mapping, |
|---|
| 2105 | 2134 | pgoff_t start, pgoff_t end) |
|---|
| 2106 | 2135 | { |
|---|
| 2107 | | -#define WRITEBACK_TAG_BATCH 4096 |
|---|
| 2108 | | - unsigned long tagged = 0; |
|---|
| 2109 | | - struct radix_tree_iter iter; |
|---|
| 2110 | | - void **slot; |
|---|
| 2136 | + XA_STATE(xas, &mapping->i_pages, start); |
|---|
| 2137 | + unsigned int tagged = 0; |
|---|
| 2138 | + void *page; |
|---|
| 2111 | 2139 | |
|---|
| 2112 | | - xa_lock_irq(&mapping->i_pages); |
|---|
| 2113 | | - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, |
|---|
| 2114 | | - PAGECACHE_TAG_DIRTY) { |
|---|
| 2115 | | - if (iter.index > end) |
|---|
| 2116 | | - break; |
|---|
| 2117 | | - radix_tree_iter_tag_set(&mapping->i_pages, &iter, |
|---|
| 2118 | | - PAGECACHE_TAG_TOWRITE); |
|---|
| 2119 | | - tagged++; |
|---|
| 2120 | | - if ((tagged % WRITEBACK_TAG_BATCH) != 0) |
|---|
| 2140 | + xas_lock_irq(&xas); |
|---|
| 2141 | + xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) { |
|---|
| 2142 | + xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); |
|---|
| 2143 | + if (++tagged % XA_CHECK_SCHED) |
|---|
| 2121 | 2144 | continue; |
|---|
| 2122 | | - slot = radix_tree_iter_resume(slot, &iter); |
|---|
| 2123 | | - xa_unlock_irq(&mapping->i_pages); |
|---|
| 2145 | + |
|---|
| 2146 | + xas_pause(&xas); |
|---|
| 2147 | + xas_unlock_irq(&xas); |
|---|
| 2124 | 2148 | cond_resched(); |
|---|
| 2125 | | - xa_lock_irq(&mapping->i_pages); |
|---|
| 2149 | + xas_lock_irq(&xas); |
|---|
| 2126 | 2150 | } |
|---|
| 2127 | | - xa_unlock_irq(&mapping->i_pages); |
|---|
| 2151 | + xas_unlock_irq(&xas); |
|---|
| 2128 | 2152 | } |
|---|
| 2129 | 2153 | EXPORT_SYMBOL(tag_pages_for_writeback); |
|---|
| 2130 | 2154 | |
|---|
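The rewrite above swaps the radix-tree walk and its hand-rolled `WRITEBACK_TAG_BATCH` for the standard XArray idiom: `xas_for_each_marked()` visits only entries carrying the mark, and every `XA_CHECK_SCHED` entries the walk parks its state with `xas_pause()`, drops the lock, and reschedules. The skeleton generalizes to any marked walk (hedged sketch; the index bounds and per-entry work are placeholders):

```c
/* Skeleton of a latency-bounded XArray walk over marked entries. */
XA_STATE(xas, &mapping->i_pages, first_index);
void *entry;
unsigned int done = 0;

xas_lock_irq(&xas);
xas_for_each_marked(&xas, entry, last_index, PAGECACHE_TAG_DIRTY) {
	/* ...act on @entry... */
	if (++done % XA_CHECK_SCHED)
		continue;
	xas_pause(&xas);	/* make the iterator safe to drop the lock */
	xas_unlock_irq(&xas);
	cond_resched();
	xas_lock_irq(&xas);
}
xas_unlock_irq(&xas);
```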
| .. | .. |
|---|
| 2156 | 2180 | * lock/page writeback access order inversion - we should only ever lock |
|---|
| 2157 | 2181 | * multiple pages in ascending page->index order, and looping back to the start |
|---|
| 2158 | 2182 | * of the file violates that rule and causes deadlocks. |
|---|
| 2183 | + * |
|---|
| 2184 | + * Return: %0 on success, negative error code otherwise |
|---|
| 2159 | 2185 | */ |
|---|
| 2160 | 2186 | int write_cache_pages(struct address_space *mapping, |
|---|
| 2161 | 2187 | struct writeback_control *wbc, writepage_t writepage, |
|---|
| .. | .. |
|---|
| 2166 | 2192 | int error; |
|---|
| 2167 | 2193 | struct pagevec pvec; |
|---|
| 2168 | 2194 | int nr_pages; |
|---|
| 2169 | | - pgoff_t uninitialized_var(writeback_index); |
|---|
| 2170 | 2195 | pgoff_t index; |
|---|
| 2171 | 2196 | pgoff_t end; /* Inclusive */ |
|---|
| 2172 | 2197 | pgoff_t done_index; |
|---|
| 2173 | 2198 | int range_whole = 0; |
|---|
| 2174 | | - int tag; |
|---|
| 2199 | + xa_mark_t tag; |
|---|
| 2175 | 2200 | |
|---|
| 2176 | 2201 | pagevec_init(&pvec); |
|---|
| 2177 | 2202 | if (wbc->range_cyclic) { |
|---|
| 2178 | | - writeback_index = mapping->writeback_index; /* prev offset */ |
|---|
| 2179 | | - index = writeback_index; |
|---|
| 2203 | + index = mapping->writeback_index; /* prev offset */ |
|---|
| 2180 | 2204 | end = -1; |
|---|
| 2181 | 2205 | } else { |
|---|
| 2182 | 2206 | index = wbc->range_start >> PAGE_SHIFT; |
|---|
| .. | .. |
|---|
| 2184 | 2208 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) |
|---|
| 2185 | 2209 | range_whole = 1; |
|---|
| 2186 | 2210 | } |
|---|
| 2187 | | - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
|---|
| 2188 | | - tag = PAGECACHE_TAG_TOWRITE; |
|---|
| 2189 | | - else |
|---|
| 2190 | | - tag = PAGECACHE_TAG_DIRTY; |
|---|
| 2191 | | - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
|---|
| 2211 | + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) { |
|---|
| 2192 | 2212 | tag_pages_for_writeback(mapping, index, end); |
|---|
| 2213 | + tag = PAGECACHE_TAG_TOWRITE; |
|---|
| 2214 | + } else { |
|---|
| 2215 | + tag = PAGECACHE_TAG_DIRTY; |
|---|
| 2216 | + } |
|---|
| 2193 | 2217 | done_index = index; |
|---|
| 2194 | 2218 | while (!done && (index <= end)) { |
|---|
| 2195 | 2219 | int i; |
|---|
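For context on how this function is consumed: `write_cache_pages()` is the library routine behind most `->writepages` implementations, invoking `writepage` once per dirty (or TOWRITE-tagged, for data-integrity syncs) locked page. A hypothetical `myfs` caller — names are illustrative, not from this file:

```c
/* Hypothetical filesystem wiring write_cache_pages() into ->writepages. */
static int myfs_writepage_cb(struct page *page,
			     struct writeback_control *wbc, void *data)
{
	/* ...map blocks and submit I/O for the locked @page... */
	unlock_page(page);	/* the callback must unlock the page */
	return 0;
}

static int myfs_writepages(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	return write_cache_pages(mapping, wbc, myfs_writepage_cb, mapping);
}
```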
| .. | .. |
|---|
| 2314 | 2338 | * |
|---|
| 2315 | 2339 | * This is a library function, which implements the writepages() |
|---|
| 2316 | 2340 | * address_space_operation. |
|---|
| 2341 | + * |
|---|
| 2342 | + * Return: %0 on success, negative error code otherwise |
|---|
| 2317 | 2343 | */ |
|---|
| 2318 | 2344 | int generic_writepages(struct address_space *mapping, |
|---|
| 2319 | 2345 | struct writeback_control *wbc) |
|---|
| .. | .. |
|---|
| 2360 | 2386 | * |
|---|
| 2361 | 2387 | * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this |
|---|
| 2362 | 2388 | * function returns. |
|---|
| 2389 | + * |
|---|
| 2390 | + * Return: %0 on success, negative error code otherwise |
|---|
| 2363 | 2391 | */ |
|---|
| 2364 | 2392 | int write_one_page(struct page *page) |
|---|
| 2365 | 2393 | { |
|---|
| .. | .. |
|---|
| 2413 | 2441 | |
|---|
| 2414 | 2442 | trace_writeback_dirty_page(page, mapping); |
|---|
| 2415 | 2443 | |
|---|
| 2416 | | - if (mapping_cap_account_dirty(mapping)) { |
|---|
| 2444 | + if (mapping_can_writeback(mapping)) { |
|---|
| 2417 | 2445 | struct bdi_writeback *wb; |
|---|
| 2418 | 2446 | |
|---|
| 2419 | 2447 | inode_attach_wb(inode, page); |
|---|
| .. | .. |
|---|
| 2427 | 2455 | task_io_account_write(PAGE_SIZE); |
|---|
| 2428 | 2456 | current->nr_dirtied++; |
|---|
| 2429 | 2457 | this_cpu_inc(bdp_ratelimits); |
|---|
| 2458 | + |
|---|
| 2459 | + mem_cgroup_track_foreign_dirty(page, wb); |
|---|
| 2430 | 2460 | } |
|---|
| 2431 | 2461 | } |
|---|
| 2432 | | -EXPORT_SYMBOL(account_page_dirtied); |
|---|
| 2433 | 2462 | |
|---|
| 2434 | 2463 | /* |
|---|
| 2435 | 2464 | * Helper function for deaccounting dirty page without writeback. |
|---|
| .. | .. |
|---|
| 2439 | 2468 | void account_page_cleaned(struct page *page, struct address_space *mapping, |
|---|
| 2440 | 2469 | struct bdi_writeback *wb) |
|---|
| 2441 | 2470 | { |
|---|
| 2442 | | - if (mapping_cap_account_dirty(mapping)) { |
|---|
| 2471 | + if (mapping_can_writeback(mapping)) { |
|---|
| 2443 | 2472 | dec_lruvec_page_state(page, NR_FILE_DIRTY); |
|---|
| 2444 | 2473 | dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); |
|---|
| 2445 | 2474 | dec_wb_stat(wb, WB_RECLAIMABLE); |
|---|
| .. | .. |
|---|
| 2449 | 2478 | |
|---|
| 2450 | 2479 | /* |
|---|
| 2451 | 2480 | * For address_spaces which do not use buffers. Just tag the page as dirty in |
|---|
| 2452 | | - * its radix tree. |
|---|
| 2481 | + * the xarray. |
|---|
| 2453 | 2482 | * |
|---|
| 2454 | 2483 | * This is also used when a single buffer is being dirtied: we want to set the |
|---|
| 2455 | 2484 | * page dirty in that case, but not all the buffers. This is a "bottom-up" |
|---|
| .. | .. |
|---|
| 2475 | 2504 | BUG_ON(page_mapping(page) != mapping); |
|---|
| 2476 | 2505 | WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); |
|---|
| 2477 | 2506 | account_page_dirtied(page, mapping); |
|---|
| 2478 | | - radix_tree_tag_set(&mapping->i_pages, page_index(page), |
|---|
| 2507 | + __xa_set_mark(&mapping->i_pages, page_index(page), |
|---|
| 2479 | 2508 | PAGECACHE_TAG_DIRTY); |
|---|
| 2480 | 2509 | xa_unlock_irqrestore(&mapping->i_pages, flags); |
|---|
| 2481 | 2510 | unlock_page_memcg(page); |
|---|
| .. | .. |
|---|
| 2502 | 2531 | { |
|---|
| 2503 | 2532 | struct address_space *mapping = page->mapping; |
|---|
| 2504 | 2533 | |
|---|
| 2505 | | - if (mapping && mapping_cap_account_dirty(mapping)) { |
|---|
| 2534 | + if (mapping && mapping_can_writeback(mapping)) { |
|---|
| 2506 | 2535 | struct inode *inode = mapping->host; |
|---|
| 2507 | 2536 | struct bdi_writeback *wb; |
|---|
| 2508 | 2537 | struct wb_lock_cookie cookie = {}; |
|---|
| .. | .. |
|---|
| 2614 | 2643 | { |
|---|
| 2615 | 2644 | struct address_space *mapping = page_mapping(page); |
|---|
| 2616 | 2645 | |
|---|
| 2617 | | - if (mapping_cap_account_dirty(mapping)) { |
|---|
| 2646 | + if (mapping_can_writeback(mapping)) { |
|---|
| 2618 | 2647 | struct inode *inode = mapping->host; |
|---|
| 2619 | 2648 | struct bdi_writeback *wb; |
|---|
| 2620 | 2649 | struct wb_lock_cookie cookie = {}; |
|---|
| .. | .. |
|---|
| 2638 | 2667 | * Returns true if the page was previously dirty. |
|---|
| 2639 | 2668 | * |
|---|
| 2640 | 2669 | * This is for preparing to put the page under writeout. We leave the page |
|---|
| 2641 | | - * tagged as dirty in the radix tree so that a concurrent write-for-sync |
|---|
| 2670 | + * tagged as dirty in the xarray so that a concurrent write-for-sync |
|---|
| 2642 | 2671 | * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage |
|---|
| 2643 | 2672 | * implementation will run either set_page_writeback() or set_page_dirty(), |
|---|
| 2644 | | - * at which stage we bring the page's dirty flag and radix-tree dirty tag |
|---|
| 2673 | + * at which stage we bring the page's dirty flag and xarray dirty tag |
|---|
| 2645 | 2674 | * back into sync. |
|---|
| 2646 | 2675 | * |
|---|
| 2647 | | - * This incoherency between the page's dirty flag and radix-tree tag is |
|---|
| 2676 | + * This incoherency between the page's dirty flag and xarray tag is |
|---|
| 2648 | 2677 | * unfortunate, but it only exists while the page is locked. |
|---|
| 2649 | 2678 | */ |
|---|
| 2650 | 2679 | int clear_page_dirty_for_io(struct page *page) |
|---|
| .. | .. |
|---|
| 2652 | 2681 | struct address_space *mapping = page_mapping(page); |
|---|
| 2653 | 2682 | int ret = 0; |
|---|
| 2654 | 2683 | |
|---|
| 2655 | | - BUG_ON(!PageLocked(page)); |
|---|
| 2684 | + VM_BUG_ON_PAGE(!PageLocked(page), page); |
|---|
| 2656 | 2685 | |
|---|
| 2657 | | - if (mapping && mapping_cap_account_dirty(mapping)) { |
|---|
| 2686 | + if (mapping && mapping_can_writeback(mapping)) { |
|---|
| 2658 | 2687 | struct inode *inode = mapping->host; |
|---|
| 2659 | 2688 | struct bdi_writeback *wb; |
|---|
| 2660 | 2689 | struct wb_lock_cookie cookie = {}; |
|---|
| .. | .. |
|---|
| 2725 | 2754 | xa_lock_irqsave(&mapping->i_pages, flags); |
|---|
| 2726 | 2755 | ret = TestClearPageWriteback(page); |
|---|
| 2727 | 2756 | if (ret) { |
|---|
| 2728 | | - radix_tree_tag_clear(&mapping->i_pages, page_index(page), |
|---|
| 2757 | + __xa_clear_mark(&mapping->i_pages, page_index(page), |
|---|
| 2729 | 2758 | PAGECACHE_TAG_WRITEBACK); |
|---|
| 2730 | | - if (bdi_cap_account_writeback(bdi)) { |
|---|
| 2759 | + if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { |
|---|
| 2731 | 2760 | struct bdi_writeback *wb = inode_to_wb(inode); |
|---|
| 2732 | 2761 | |
|---|
| 2733 | 2762 | dec_wb_stat(wb, WB_WRITEBACK); |
|---|
| .. | .. |
|---|
| 2743 | 2772 | } else { |
|---|
| 2744 | 2773 | ret = TestClearPageWriteback(page); |
|---|
| 2745 | 2774 | } |
|---|
| 2746 | | - /* |
|---|
| 2747 | | - * NOTE: Page might be free now! Writeback doesn't hold a page |
|---|
| 2748 | | - * reference on its own, it relies on truncation to wait for |
|---|
| 2749 | | - * the clearing of PG_writeback. The below can only access |
|---|
| 2750 | | - * page state that is static across allocation cycles. |
|---|
| 2751 | | - */ |
|---|
| 2752 | 2775 | if (ret) { |
|---|
| 2753 | 2776 | dec_lruvec_state(lruvec, NR_WRITEBACK); |
|---|
| 2754 | 2777 | dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); |
|---|
| .. | .. |
|---|
| 2761 | 2784 | int __test_set_page_writeback(struct page *page, bool keep_write) |
|---|
| 2762 | 2785 | { |
|---|
| 2763 | 2786 | struct address_space *mapping = page_mapping(page); |
|---|
| 2764 | | - int ret; |
|---|
| 2787 | + int ret, access_ret; |
|---|
| 2765 | 2788 | |
|---|
| 2766 | 2789 | lock_page_memcg(page); |
|---|
| 2767 | 2790 | if (mapping && mapping_use_writeback_tags(mapping)) { |
|---|
| 2791 | + XA_STATE(xas, &mapping->i_pages, page_index(page)); |
|---|
| 2768 | 2792 | struct inode *inode = mapping->host; |
|---|
| 2769 | 2793 | struct backing_dev_info *bdi = inode_to_bdi(inode); |
|---|
| 2770 | 2794 | unsigned long flags; |
|---|
| 2771 | 2795 | |
|---|
| 2772 | | - xa_lock_irqsave(&mapping->i_pages, flags); |
|---|
| 2796 | + xas_lock_irqsave(&xas, flags); |
|---|
| 2797 | + xas_load(&xas); |
|---|
| 2773 | 2798 | ret = TestSetPageWriteback(page); |
|---|
| 2774 | 2799 | if (!ret) { |
|---|
| 2775 | 2800 | bool on_wblist; |
|---|
| .. | .. |
|---|
| 2777 | 2802 | on_wblist = mapping_tagged(mapping, |
|---|
| 2778 | 2803 | PAGECACHE_TAG_WRITEBACK); |
|---|
| 2779 | 2804 | |
|---|
| 2780 | | - radix_tree_tag_set(&mapping->i_pages, page_index(page), |
|---|
| 2781 | | - PAGECACHE_TAG_WRITEBACK); |
|---|
| 2782 | | - if (bdi_cap_account_writeback(bdi)) |
|---|
| 2805 | + xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); |
|---|
| 2806 | + if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) |
|---|
| 2783 | 2807 | inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); |
|---|
| 2784 | 2808 | |
|---|
| 2785 | 2809 | /* |
|---|
| .. | .. |
|---|
| 2791 | 2815 | sb_mark_inode_writeback(mapping->host); |
|---|
| 2792 | 2816 | } |
|---|
| 2793 | 2817 | if (!PageDirty(page)) |
|---|
| 2794 | | - radix_tree_tag_clear(&mapping->i_pages, page_index(page), |
|---|
| 2795 | | - PAGECACHE_TAG_DIRTY); |
|---|
| 2818 | + xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); |
|---|
| 2796 | 2819 | if (!keep_write) |
|---|
| 2797 | | - radix_tree_tag_clear(&mapping->i_pages, page_index(page), |
|---|
| 2798 | | - PAGECACHE_TAG_TOWRITE); |
|---|
| 2799 | | - xa_unlock_irqrestore(&mapping->i_pages, flags); |
|---|
| 2820 | + xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); |
|---|
| 2821 | + xas_unlock_irqrestore(&xas, flags); |
|---|
| 2800 | 2822 | } else { |
|---|
| 2801 | 2823 | ret = TestSetPageWriteback(page); |
|---|
| 2802 | 2824 | } |
|---|
| .. | .. |
|---|
| 2805 | 2827 | inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); |
|---|
| 2806 | 2828 | } |
|---|
| 2807 | 2829 | unlock_page_memcg(page); |
|---|
| 2830 | + access_ret = arch_make_page_accessible(page); |
|---|
| 2831 | + /* |
|---|
| 2832 | + * If writeback has been triggered on a page that cannot be made |
|---|
| 2833 | + * accessible, it is too late to recover here. |
|---|
| 2834 | + */ |
|---|
| 2835 | + VM_BUG_ON_PAGE(access_ret != 0, page); |
|---|
| 2836 | + |
|---|
| 2808 | 2837 | return ret; |
|---|
| 2809 | 2838 | |
|---|
| 2810 | 2839 | } |
|---|
| 2811 | 2840 | EXPORT_SYMBOL(__test_set_page_writeback); |
|---|
| 2812 | 2841 | |
|---|
| 2813 | 2842 | /* |
|---|
| 2814 | | - * Return true if any of the pages in the mapping are marked with the |
|---|
| 2815 | | - * passed tag. |
|---|
| 2843 | + * Wait for a page to complete writeback |
|---|
| 2816 | 2844 | */ |
|---|
| 2817 | | -int mapping_tagged(struct address_space *mapping, int tag) |
|---|
| 2845 | +void wait_on_page_writeback(struct page *page) |
|---|
| 2818 | 2846 | { |
|---|
| 2819 | | - return radix_tree_tagged(&mapping->i_pages, tag); |
|---|
| 2847 | + while (PageWriteback(page)) { |
|---|
| 2848 | + trace_wait_on_page_writeback(page, page_mapping(page)); |
|---|
| 2849 | + wait_on_page_bit(page, PG_writeback); |
|---|
| 2850 | + } |
|---|
| 2820 | 2851 | } |
|---|
| 2821 | | -EXPORT_SYMBOL(mapping_tagged); |
|---|
| 2852 | +EXPORT_SYMBOL_GPL(wait_on_page_writeback); |
|---|
| 2822 | 2853 | |
|---|
| 2823 | 2854 | /** |
|---|
| 2824 | 2855 | * wait_for_stable_page() - wait for writeback to finish, if necessary. |
|---|
| .. | .. |
|---|
| 2830 | 2861 | */ |
|---|
| 2831 | 2862 | void wait_for_stable_page(struct page *page) |
|---|
| 2832 | 2863 | { |
|---|
| 2833 | | - if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host))) |
|---|
| 2864 | + page = thp_head(page); |
|---|
| 2865 | + if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES) |
|---|
| 2834 | 2866 | wait_on_page_writeback(page); |
|---|
| 2835 | 2867 | } |
|---|
| 2836 | 2868 | EXPORT_SYMBOL_GPL(wait_for_stable_page); |
|---|
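`wait_for_stable_page()` now keys off a superblock flag instead of the old `bdi_cap_stable_pages_required()`: a filesystem whose integrity machinery (checksums, RAID parity, DIF/DIX) must not see a page change while it is under writeback opts in at mount time, and `thp_head()` makes the check safe when a tail page is passed in. The opt-in is one line during superblock setup; hedged sketch with a hypothetical `myfs`:

```c
/* Hypothetical fill_super fragment opting in to stable page writes. */
static int myfs_fill_super(struct super_block *sb, void *data, int silent)
{
	sb->s_iflags |= SB_I_STABLE_WRITES;	/* pages stay stable under writeback */
	/* ...remaining superblock setup... */
	return 0;
}
```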