| .. | .. |
|---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
|---|
| 1 | 2 | /* |
|---|
| 2 | 3 | * linux/mm/swap.c |
|---|
| 3 | 4 | * |
|---|
| .. | .. |
|---|
| 7 | 8 | /* |
|---|
| 8 | 9 | * This file contains the default values for the operation of the |
|---|
| 9 | 10 | * Linux VM subsystem. Fine-tuning documentation can be found in |
|---|
| 10 | | - * Documentation/sysctl/vm.txt. |
|---|
| 11 | + * Documentation/admin-guide/sysctl/vm.rst. |
|---|
| 11 | 12 | * Started 18.12.91 |
|---|
| 12 | 13 | * Swap aging added 23.2.95, Stephen Tweedie. |
|---|
| 13 | 14 | * Buffermem limits added 12.3.98, Rik van Riel. |
|---|
| .. | .. |
|---|
| 29 | 30 | #include <linux/cpu.h> |
|---|
| 30 | 31 | #include <linux/notifier.h> |
|---|
| 31 | 32 | #include <linux/backing-dev.h> |
|---|
| 32 | | -#include <linux/memremap.h> |
|---|
| 33 | 33 | #include <linux/memcontrol.h> |
|---|
| 34 | 34 | #include <linux/gfp.h> |
|---|
| 35 | 35 | #include <linux/uio.h> |
|---|
| 36 | | -#include <linux/locallock.h> |
|---|
| 37 | 36 | #include <linux/hugetlb.h> |
|---|
| 38 | 37 | #include <linux/page_idle.h> |
|---|
| 38 | +#include <linux/local_lock.h> |
|---|
| 39 | +#include <linux/buffer_head.h> |
|---|
| 39 | 40 | |
|---|
| 40 | 41 | #include "internal.h" |
|---|
| 41 | 42 | |
|---|
| .. | .. |
|---|
| 45 | 46 | /* How many pages do we try to swap or page in/out together? */ |
|---|
| 46 | 47 | int page_cluster; |
|---|
| 47 | 48 | |
|---|
| 48 | | -static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); |
|---|
| 49 | | -static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); |
|---|
| 50 | | -static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); |
|---|
| 51 | | -static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs); |
|---|
| 49 | +/* Protecting only lru_rotate.pvec which requires disabling interrupts */ |
|---|
| 50 | +struct lru_rotate { |
|---|
| 51 | + local_lock_t lock; |
|---|
| 52 | + struct pagevec pvec; |
|---|
| 53 | +}; |
|---|
| 54 | +static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = { |
|---|
| 55 | + .lock = INIT_LOCAL_LOCK(lock), |
|---|
| 56 | +}; |
|---|
| 57 | + |
|---|
| 58 | +/* |
|---|
| 59 | + * The following struct pagevec are grouped together because they are protected |
|---|
| 60 | + * by disabling preemption (and interrupts remain enabled). |
|---|
| 61 | + */ |
|---|
| 62 | +struct lru_pvecs { |
|---|
| 63 | + local_lock_t lock; |
|---|
| 64 | + struct pagevec lru_add; |
|---|
| 65 | + struct pagevec lru_deactivate_file; |
|---|
| 66 | + struct pagevec lru_deactivate; |
|---|
| 67 | + struct pagevec lru_lazyfree; |
|---|
| 68 | + struct pagevec lru_lazyfree_movetail; |
|---|
| 52 | 69 | #ifdef CONFIG_SMP |
|---|
| 53 | | -static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); |
|---|
| 70 | + struct pagevec activate_page; |
|---|
| 54 | 71 | #endif |
|---|
| 55 | | -static DEFINE_LOCAL_IRQ_LOCK(rotate_lock); |
|---|
| 56 | | -DEFINE_LOCAL_IRQ_LOCK(swapvec_lock); |
|---|
| 72 | +}; |
|---|
| 73 | +static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = { |
|---|
| 74 | + .lock = INIT_LOCAL_LOCK(lock), |
|---|
| 75 | +}; |
|---|
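
The per-CPU caches above are now guarded by two explicit local locks: lru_rotate.lock has to disable interrupts because its pagevec is also filled from end-of-writeback context, while lru_pvecs.lock only needs to disable preemption. As a rough, illustrative sketch (not part of the patch), the access pattern every producer in this file follows looks like the snippet below; example_queue_page() is a hypothetical name, and pagevec_add_and_need_flush() is the helper introduced further down.

```c
/*
 * Illustrative sketch only; the real callers are lru_cache_add(),
 * activate_page(), deactivate_file_page(), etc. further down.
 */
static void example_queue_page(struct page *page)
{
	struct pagevec *pvec;

	get_page(page);
	local_lock(&lru_pvecs.lock);		/* disables preemption only */
	pvec = this_cpu_ptr(&lru_pvecs.lru_add);
	if (pagevec_add_and_need_flush(pvec, page))
		__pagevec_lru_add(pvec);	/* drain when full, compound or disabled */
	local_unlock(&lru_pvecs.lock);
}
```

lru_rotate.pvec uses the same shape but with local_lock_irqsave()/local_unlock_irqrestore(), as rotate_reclaimable_page() below shows.
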
| 57 | 76 | |
|---|
| 58 | 77 | /* |
|---|
| 59 | 78 | * This path almost never happens for VM activity - pages are normally |
|---|
| .. | .. |
|---|
| 62 | 81 | static void __page_cache_release(struct page *page) |
|---|
| 63 | 82 | { |
|---|
| 64 | 83 | if (PageLRU(page)) { |
|---|
| 65 | | - struct zone *zone = page_zone(page); |
|---|
| 84 | + pg_data_t *pgdat = page_pgdat(page); |
|---|
| 66 | 85 | struct lruvec *lruvec; |
|---|
| 67 | 86 | unsigned long flags; |
|---|
| 68 | 87 | |
|---|
| 69 | | - spin_lock_irqsave(zone_lru_lock(zone), flags); |
|---|
| 70 | | - lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); |
|---|
| 88 | + spin_lock_irqsave(&pgdat->lru_lock, flags); |
|---|
| 89 | + lruvec = mem_cgroup_page_lruvec(page, pgdat); |
|---|
| 71 | 90 | VM_BUG_ON_PAGE(!PageLRU(page), page); |
|---|
| 72 | 91 | __ClearPageLRU(page); |
|---|
| 73 | 92 | del_page_from_lru_list(page, lruvec, page_off_lru(page)); |
|---|
| 74 | | - spin_unlock_irqrestore(zone_lru_lock(zone), flags); |
|---|
| 93 | + spin_unlock_irqrestore(&pgdat->lru_lock, flags); |
|---|
| 75 | 94 | } |
|---|
| 76 | 95 | __ClearPageWaiters(page); |
|---|
| 77 | | - mem_cgroup_uncharge(page); |
|---|
| 78 | 96 | } |
|---|
| 79 | 97 | |
|---|
| 80 | 98 | static void __put_single_page(struct page *page) |
|---|
| 81 | 99 | { |
|---|
| 82 | 100 | __page_cache_release(page); |
|---|
| 101 | + mem_cgroup_uncharge(page); |
|---|
| 83 | 102 | free_unref_page(page); |
|---|
| 84 | 103 | } |
|---|
| 85 | 104 | |
|---|
| 86 | 105 | static void __put_compound_page(struct page *page) |
|---|
| 87 | 106 | { |
|---|
| 88 | | - compound_page_dtor *dtor; |
|---|
| 89 | | - |
|---|
| 90 | 107 | /* |
|---|
| 91 | 108 | * __page_cache_release() is supposed to be called for thp, not for |
|---|
| 92 | 109 | * hugetlb. This is because hugetlb page does never have PageLRU set |
|---|
| .. | .. |
|---|
| 95 | 112 | */ |
|---|
| 96 | 113 | if (!PageHuge(page)) |
|---|
| 97 | 114 | __page_cache_release(page); |
|---|
| 98 | | - dtor = get_compound_page_dtor(page); |
|---|
| 99 | | - (*dtor)(page); |
|---|
| 115 | + destroy_compound_page(page); |
|---|
| 100 | 116 | } |
|---|
| 101 | 117 | |
|---|
| 102 | 118 | void __put_page(struct page *page) |
|---|
| .. | .. |
|---|
| 130 | 146 | while (!list_empty(pages)) { |
|---|
| 131 | 147 | struct page *victim; |
|---|
| 132 | 148 | |
|---|
| 133 | | - victim = list_entry(pages->prev, struct page, lru); |
|---|
| 149 | + victim = lru_to_page(pages); |
|---|
| 134 | 150 | list_del(&victim->lru); |
|---|
| 135 | 151 | put_page(victim); |
|---|
| 136 | 152 | } |
|---|
| .. | .. |
|---|
| 227 | 243 | del_page_from_lru_list(page, lruvec, page_lru(page)); |
|---|
| 228 | 244 | ClearPageActive(page); |
|---|
| 229 | 245 | add_page_to_lru_list_tail(page, lruvec, page_lru(page)); |
|---|
| 230 | | - (*pgmoved)++; |
|---|
| 246 | + (*pgmoved) += thp_nr_pages(page); |
|---|
| 231 | 247 | } |
|---|
| 232 | 248 | } |
|---|
| 233 | 249 | |
|---|
| .. | .. |
|---|
| 243 | 259 | __count_vm_events(PGROTATED, pgmoved); |
|---|
| 244 | 260 | } |
|---|
| 245 | 261 | |
|---|
| 262 | +/* return true if pagevec needs to drain */ |
|---|
| 263 | +static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page) |
|---|
| 264 | +{ |
|---|
| 265 | + bool ret = false; |
|---|
| 266 | + |
|---|
| 267 | + if (!pagevec_add(pvec, page) || PageCompound(page) || |
|---|
| 268 | + lru_cache_disabled()) |
|---|
| 269 | + ret = true; |
|---|
| 270 | + |
|---|
| 271 | + return ret; |
|---|
| 272 | +} |
|---|
| 273 | + |
|---|
| 246 | 274 | /* |
|---|
| 247 | 275 | * Writeback is about to end against a page which has been marked for immediate |
|---|
| 248 | 276 | * reclaim. If it still appears to be reclaimable, move it to the tail of the |
|---|
| .. | .. |
|---|
| 256 | 284 | unsigned long flags; |
|---|
| 257 | 285 | |
|---|
| 258 | 286 | get_page(page); |
|---|
| 259 | | - local_lock_irqsave(rotate_lock, flags); |
|---|
| 260 | | - pvec = this_cpu_ptr(&lru_rotate_pvecs); |
|---|
| 261 | | - if (!pagevec_add(pvec, page) || PageCompound(page)) |
|---|
| 287 | + local_lock_irqsave(&lru_rotate.lock, flags); |
|---|
| 288 | + pvec = this_cpu_ptr(&lru_rotate.pvec); |
|---|
| 289 | + if (pagevec_add_and_need_flush(pvec, page)) |
|---|
| 262 | 290 | pagevec_move_tail(pvec); |
|---|
| 263 | | - local_unlock_irqrestore(rotate_lock, flags); |
|---|
| 291 | + local_unlock_irqrestore(&lru_rotate.lock, flags); |
|---|
| 264 | 292 | } |
|---|
| 265 | 293 | } |
|---|
| 266 | 294 | |
|---|
| 267 | | -static void update_page_reclaim_stat(struct lruvec *lruvec, |
|---|
| 268 | | - int file, int rotated) |
|---|
| 295 | +void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) |
|---|
| 269 | 296 | { |
|---|
| 270 | | - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; |
|---|
| 297 | + do { |
|---|
| 298 | + unsigned long lrusize; |
|---|
| 271 | 299 | |
|---|
| 272 | | - reclaim_stat->recent_scanned[file]++; |
|---|
| 273 | | - if (rotated) |
|---|
| 274 | | - reclaim_stat->recent_rotated[file]++; |
|---|
| 300 | + /* Record cost event */ |
|---|
| 301 | + if (file) |
|---|
| 302 | + lruvec->file_cost += nr_pages; |
|---|
| 303 | + else |
|---|
| 304 | + lruvec->anon_cost += nr_pages; |
|---|
| 305 | + |
|---|
| 306 | + /* |
|---|
| 307 | + * Decay previous events |
|---|
| 308 | + * |
|---|
| 309 | + * Because workloads change over time (and to avoid |
|---|
| 310 | + * overflow) we keep these statistics as a floating |
|---|
| 311 | + * average, which ends up weighing recent refaults |
|---|
| 312 | + * more than old ones. |
|---|
| 313 | + */ |
|---|
| 314 | + lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) + |
|---|
| 315 | + lruvec_page_state(lruvec, NR_ACTIVE_ANON) + |
|---|
| 316 | + lruvec_page_state(lruvec, NR_INACTIVE_FILE) + |
|---|
| 317 | + lruvec_page_state(lruvec, NR_ACTIVE_FILE); |
|---|
| 318 | + |
|---|
| 319 | + if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) { |
|---|
| 320 | + lruvec->file_cost /= 2; |
|---|
| 321 | + lruvec->anon_cost /= 2; |
|---|
| 322 | + } |
|---|
| 323 | + } while ((lruvec = parent_lruvec(lruvec))); |
|---|
| 324 | +} |
|---|
| 325 | + |
|---|
| 326 | +void lru_note_cost_page(struct page *page) |
|---|
| 327 | +{ |
|---|
| 328 | + lru_note_cost(mem_cgroup_page_lruvec(page, page_pgdat(page)), |
|---|
| 329 | + page_is_file_lru(page), thp_nr_pages(page)); |
|---|
| 275 | 330 | } |
|---|
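
The halving step above turns file_cost/anon_cost into an exponentially decaying sum: once the combined cost crosses a quarter of the LRU size, both buckets are cut in half, so each past event's contribution shrinks geometrically as new cost accrues. A tiny user-space model (not kernel code; the 4000-page LRU size and 300-page events are made-up numbers) shows the effect:

```c
#include <stdio.h>

int main(void)
{
	unsigned long file_cost = 0, anon_cost = 0;
	const unsigned long lrusize = 4000;	/* assumed LRU size, in pages */

	for (int i = 0; i < 8; i++) {
		file_cost += 300;		/* one 300-page file cost event */
		if (file_cost + anon_cost > lrusize / 4) {
			file_cost /= 2;		/* decay previous events */
			anon_cost /= 2;
		}
		printf("event %d: file_cost=%lu\n", i, file_cost);
	}
	return 0;
}
```

The running cost saturates around lrusize/4 instead of growing without bound, so recent reclaim cost dominates the anon/file balancing decision, which is the "floating average" behaviour the comment describes.
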
| 276 | 331 | |
|---|
| 277 | 332 | static void __activate_page(struct page *page, struct lruvec *lruvec, |
|---|
| 278 | 333 | void *arg) |
|---|
| 279 | 334 | { |
|---|
| 280 | 335 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { |
|---|
| 281 | | - int file = page_is_file_cache(page); |
|---|
| 282 | 336 | int lru = page_lru_base_type(page); |
|---|
| 337 | + int nr_pages = thp_nr_pages(page); |
|---|
| 283 | 338 | |
|---|
| 284 | 339 | del_page_from_lru_list(page, lruvec, lru); |
|---|
| 285 | 340 | SetPageActive(page); |
|---|
| .. | .. |
|---|
| 287 | 342 | add_page_to_lru_list(page, lruvec, lru); |
|---|
| 288 | 343 | trace_mm_lru_activate(page); |
|---|
| 289 | 344 | |
|---|
| 290 | | - __count_vm_event(PGACTIVATE); |
|---|
| 291 | | - update_page_reclaim_stat(lruvec, file, 1); |
|---|
| 345 | + __count_vm_events(PGACTIVATE, nr_pages); |
|---|
| 346 | + __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, |
|---|
| 347 | + nr_pages); |
|---|
| 292 | 348 | } |
|---|
| 293 | 349 | } |
|---|
| 294 | 350 | |
|---|
| 295 | 351 | #ifdef CONFIG_SMP |
|---|
| 296 | 352 | static void activate_page_drain(int cpu) |
|---|
| 297 | 353 | { |
|---|
| 298 | | - struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu); |
|---|
| 354 | + struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu); |
|---|
| 299 | 355 | |
|---|
| 300 | 356 | if (pagevec_count(pvec)) |
|---|
| 301 | 357 | pagevec_lru_move_fn(pvec, __activate_page, NULL); |
|---|
| .. | .. |
|---|
| 303 | 359 | |
|---|
| 304 | 360 | static bool need_activate_page_drain(int cpu) |
|---|
| 305 | 361 | { |
|---|
| 306 | | - return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; |
|---|
| 362 | + return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0; |
|---|
| 307 | 363 | } |
|---|
| 308 | 364 | |
|---|
| 309 | | -void activate_page(struct page *page) |
|---|
| 365 | +static void activate_page(struct page *page) |
|---|
| 310 | 366 | { |
|---|
| 311 | 367 | page = compound_head(page); |
|---|
| 312 | 368 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { |
|---|
| 313 | | - struct pagevec *pvec = &get_locked_var(swapvec_lock, |
|---|
| 314 | | - activate_page_pvecs); |
|---|
| 369 | + struct pagevec *pvec; |
|---|
| 315 | 370 | |
|---|
| 371 | + local_lock(&lru_pvecs.lock); |
|---|
| 372 | + pvec = this_cpu_ptr(&lru_pvecs.activate_page); |
|---|
| 316 | 373 | get_page(page); |
|---|
| 317 | | - if (!pagevec_add(pvec, page) || PageCompound(page)) |
|---|
| 374 | + if (pagevec_add_and_need_flush(pvec, page)) |
|---|
| 318 | 375 | pagevec_lru_move_fn(pvec, __activate_page, NULL); |
|---|
| 319 | | - put_locked_var(swapvec_lock, activate_page_pvecs); |
|---|
| 376 | + local_unlock(&lru_pvecs.lock); |
|---|
| 320 | 377 | } |
|---|
| 321 | 378 | } |
|---|
| 322 | 379 | |
|---|
| .. | .. |
|---|
| 325 | 382 | { |
|---|
| 326 | 383 | } |
|---|
| 327 | 384 | |
|---|
| 328 | | -void activate_page(struct page *page) |
|---|
| 385 | +static void activate_page(struct page *page) |
|---|
| 329 | 386 | { |
|---|
| 330 | | - struct zone *zone = page_zone(page); |
|---|
| 387 | + pg_data_t *pgdat = page_pgdat(page); |
|---|
| 331 | 388 | |
|---|
| 332 | 389 | page = compound_head(page); |
|---|
| 333 | | - spin_lock_irq(zone_lru_lock(zone)); |
|---|
| 334 | | - __activate_page(page, mem_cgroup_page_lruvec(page, zone->zone_pgdat), NULL); |
|---|
| 335 | | - spin_unlock_irq(zone_lru_lock(zone)); |
|---|
| 390 | + spin_lock_irq(&pgdat->lru_lock); |
|---|
| 391 | + __activate_page(page, mem_cgroup_page_lruvec(page, pgdat), NULL); |
|---|
| 392 | + spin_unlock_irq(&pgdat->lru_lock); |
|---|
| 336 | 393 | } |
|---|
| 337 | 394 | #endif |
|---|
| 338 | 395 | |
|---|
| 339 | 396 | static void __lru_cache_activate_page(struct page *page) |
|---|
| 340 | 397 | { |
|---|
| 341 | | - struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec); |
|---|
| 398 | + struct pagevec *pvec; |
|---|
| 342 | 399 | int i; |
|---|
| 400 | + |
|---|
| 401 | + local_lock(&lru_pvecs.lock); |
|---|
| 402 | + pvec = this_cpu_ptr(&lru_pvecs.lru_add); |
|---|
| 343 | 403 | |
|---|
| 344 | 404 | /* |
|---|
| 345 | 405 | * Search backwards on the optimistic assumption that the page being |
|---|
| .. | .. |
|---|
| 360 | 420 | } |
|---|
| 361 | 421 | } |
|---|
| 362 | 422 | |
|---|
| 363 | | - put_locked_var(swapvec_lock, lru_add_pvec); |
|---|
| 423 | + local_unlock(&lru_pvecs.lock); |
|---|
| 364 | 424 | } |
|---|
| 365 | 425 | |
|---|
| 366 | 426 | /* |
|---|
| .. | .. |
|---|
| 376 | 436 | void mark_page_accessed(struct page *page) |
|---|
| 377 | 437 | { |
|---|
| 378 | 438 | page = compound_head(page); |
|---|
| 379 | | - if (!PageActive(page) && !PageUnevictable(page) && |
|---|
| 380 | | - PageReferenced(page)) { |
|---|
| 381 | 439 | |
|---|
| 440 | + trace_android_vh_mark_page_accessed(page); |
|---|
| 441 | + if (!PageReferenced(page)) { |
|---|
| 442 | + SetPageReferenced(page); |
|---|
| 443 | + } else if (PageUnevictable(page)) { |
|---|
| 444 | + /* |
|---|
| 445 | + * Unevictable pages are on the "LRU_UNEVICTABLE" list. But, |
|---|
| 446 | + * this list is never rotated or maintained, so marking an |
|---|
| 447 | + * unevictable page accessed has no effect. |
|---|
| 448 | + */ |
|---|
| 449 | + } else if (!PageActive(page)) { |
|---|
| 382 | 450 | /* |
|---|
| 383 | 451 | * If the page is on the LRU, queue it for activation via |
|---|
| 384 | | - * activate_page_pvecs. Otherwise, assume the page is on a |
|---|
| 452 | + * lru_pvecs.activate_page. Otherwise, assume the page is on a |
|---|
| 385 | 453 | * pagevec, mark it active and it'll be moved to the active |
|---|
| 386 | 454 | * LRU on the next drain. |
|---|
| 387 | 455 | */ |
|---|
| .. | .. |
|---|
| 390 | 458 | else |
|---|
| 391 | 459 | __lru_cache_activate_page(page); |
|---|
| 392 | 460 | ClearPageReferenced(page); |
|---|
| 393 | | - if (page_is_file_cache(page)) |
|---|
| 394 | | - workingset_activation(page); |
|---|
| 395 | | - } else if (!PageReferenced(page)) { |
|---|
| 396 | | - SetPageReferenced(page); |
|---|
| 461 | + workingset_activation(page); |
|---|
| 397 | 462 | } |
|---|
| 398 | 463 | if (page_is_idle(page)) |
|---|
| 399 | 464 | clear_page_idle(page); |
|---|
| 400 | 465 | } |
|---|
| 401 | 466 | EXPORT_SYMBOL(mark_page_accessed); |
|---|
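
After this restructuring, mark_page_accessed() implements the classic two-touch promotion: the first access only sets PG_referenced, a second access on an evictable, not-yet-active LRU page activates it (directly or via lru_pvecs.activate_page) and records a workingset activation, and unevictable pages are left alone. A deliberately simplified user-space model of just that state machine (hypothetical toy_page struct; pagevec batching and the idle-bit handling are omitted):

```c
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for the page flags involved; illustration only. */
struct toy_page { bool referenced, active, unevictable; };

static void toy_mark_accessed(struct toy_page *p)
{
	if (!p->referenced) {
		p->referenced = true;		/* first touch */
	} else if (p->unevictable) {
		/* never rotated or maintained: nothing to do */
	} else if (!p->active) {
		p->active = true;		/* second touch promotes */
		p->referenced = false;
	}
}

int main(void)
{
	struct toy_page p = { false, false, false };

	toy_mark_accessed(&p);
	toy_mark_accessed(&p);
	printf("active=%d referenced=%d\n", p.active, p.referenced);	/* 1 0 */
	return 0;
}
```
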
| 402 | | - |
|---|
| 403 | | -static void __lru_cache_add(struct page *page) |
|---|
| 404 | | -{ |
|---|
| 405 | | - struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec); |
|---|
| 406 | | - |
|---|
| 407 | | - get_page(page); |
|---|
| 408 | | - if (!pagevec_add(pvec, page) || PageCompound(page)) |
|---|
| 409 | | - __pagevec_lru_add(pvec); |
|---|
| 410 | | - put_locked_var(swapvec_lock, lru_add_pvec); |
|---|
| 411 | | -} |
|---|
| 412 | | - |
|---|
| 413 | | -/** |
|---|
| 414 | | - * lru_cache_add_anon - add a page to the page lists |
|---|
| 415 | | - * @page: the page to add |
|---|
| 416 | | - */ |
|---|
| 417 | | -void lru_cache_add_anon(struct page *page) |
|---|
| 418 | | -{ |
|---|
| 419 | | - if (PageActive(page)) |
|---|
| 420 | | - ClearPageActive(page); |
|---|
| 421 | | - __lru_cache_add(page); |
|---|
| 422 | | -} |
|---|
| 423 | | - |
|---|
| 424 | | -void lru_cache_add_file(struct page *page) |
|---|
| 425 | | -{ |
|---|
| 426 | | - if (PageActive(page)) |
|---|
| 427 | | - ClearPageActive(page); |
|---|
| 428 | | - __lru_cache_add(page); |
|---|
| 429 | | -} |
|---|
| 430 | | -EXPORT_SYMBOL(lru_cache_add_file); |
|---|
| 431 | 467 | |
|---|
| 432 | 468 | /** |
|---|
| 433 | 469 | * lru_cache_add - add a page to a page list |
|---|
| .. | .. |
|---|
| 440 | 476 | */ |
|---|
| 441 | 477 | void lru_cache_add(struct page *page) |
|---|
| 442 | 478 | { |
|---|
| 479 | + struct pagevec *pvec; |
|---|
| 480 | + |
|---|
| 443 | 481 | VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); |
|---|
| 444 | 482 | VM_BUG_ON_PAGE(PageLRU(page), page); |
|---|
| 445 | | - __lru_cache_add(page); |
|---|
| 483 | + |
|---|
| 484 | + get_page(page); |
|---|
| 485 | + local_lock(&lru_pvecs.lock); |
|---|
| 486 | + pvec = this_cpu_ptr(&lru_pvecs.lru_add); |
|---|
| 487 | + if (pagevec_add_and_need_flush(pvec, page)) |
|---|
| 488 | + __pagevec_lru_add(pvec); |
|---|
| 489 | + local_unlock(&lru_pvecs.lock); |
|---|
| 446 | 490 | } |
|---|
| 491 | +EXPORT_SYMBOL(lru_cache_add); |
|---|
| 447 | 492 | |
|---|
| 448 | 493 | /** |
|---|
| 449 | | - * lru_cache_add_active_or_unevictable |
|---|
| 494 | + * lru_cache_add_inactive_or_unevictable |
|---|
| 450 | 495 | * @page: the page to be added to LRU |
|---|
| 451 | 496 | * @vma: vma in which page is mapped for determining reclaimability |
|---|
| 452 | 497 | * |
|---|
| 453 | | - * Place @page on the active or unevictable LRU list, depending on its |
|---|
| 454 | | - * evictability. Note that if the page is not evictable, it goes |
|---|
| 455 | | - * directly back onto it's zone's unevictable list, it does NOT use a |
|---|
| 456 | | - * per cpu pagevec. |
|---|
| 498 | + * Place @page on the inactive or unevictable LRU list, depending on its |
|---|
| 499 | + * evictability. |
|---|
| 457 | 500 | */ |
|---|
| 458 | | -void lru_cache_add_active_or_unevictable(struct page *page, |
|---|
| 459 | | - struct vm_area_struct *vma) |
|---|
| 501 | +void __lru_cache_add_inactive_or_unevictable(struct page *page, |
|---|
| 502 | + unsigned long vma_flags) |
|---|
| 460 | 503 | { |
|---|
| 504 | + bool unevictable; |
|---|
| 505 | + |
|---|
| 461 | 506 | VM_BUG_ON_PAGE(PageLRU(page), page); |
|---|
| 462 | 507 | |
|---|
| 463 | | - if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) |
|---|
| 464 | | - SetPageActive(page); |
|---|
| 465 | | - else if (!TestSetPageMlocked(page)) { |
|---|
| 508 | + unevictable = (vma_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED; |
|---|
| 509 | + if (unlikely(unevictable) && !TestSetPageMlocked(page)) { |
|---|
| 510 | + int nr_pages = thp_nr_pages(page); |
|---|
| 466 | 511 | /* |
|---|
| 467 | 512 | * We use the irq-unsafe __mod_zone_page_stat because this |
|---|
| 468 | 513 | * counter is not modified from interrupt context, and the pte |
|---|
| 469 | 514 | * lock is held(spinlock), which implies preemption disabled. |
|---|
| 470 | 515 | */ |
|---|
| 471 | | - __mod_zone_page_state(page_zone(page), NR_MLOCK, |
|---|
| 472 | | - hpage_nr_pages(page)); |
|---|
| 473 | | - count_vm_event(UNEVICTABLE_PGMLOCKED); |
|---|
| 516 | + __mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); |
|---|
| 517 | + count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); |
|---|
| 474 | 518 | } |
|---|
| 475 | 519 | lru_cache_add(page); |
|---|
| 476 | 520 | } |
|---|
| .. | .. |
|---|
| 499 | 543 | static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, |
|---|
| 500 | 544 | void *arg) |
|---|
| 501 | 545 | { |
|---|
| 502 | | - int lru, file; |
|---|
| 546 | + int lru; |
|---|
| 503 | 547 | bool active; |
|---|
| 548 | + int nr_pages = thp_nr_pages(page); |
|---|
| 504 | 549 | |
|---|
| 505 | 550 | if (!PageLRU(page)) |
|---|
| 506 | 551 | return; |
|---|
| .. | .. |
|---|
| 513 | 558 | return; |
|---|
| 514 | 559 | |
|---|
| 515 | 560 | active = PageActive(page); |
|---|
| 516 | | - file = page_is_file_cache(page); |
|---|
| 517 | 561 | lru = page_lru_base_type(page); |
|---|
| 518 | 562 | |
|---|
| 519 | 563 | del_page_from_lru_list(page, lruvec, lru + active); |
|---|
| 520 | 564 | ClearPageActive(page); |
|---|
| 521 | 565 | ClearPageReferenced(page); |
|---|
| 522 | | - add_page_to_lru_list(page, lruvec, lru); |
|---|
| 523 | 566 | |
|---|
| 524 | 567 | if (PageWriteback(page) || PageDirty(page)) { |
|---|
| 525 | 568 | /* |
|---|
| .. | .. |
|---|
| 527 | 570 | * It can make readahead confusing. But race window |
|---|
| 528 | 571 | * is _really_ small and it's non-critical problem. |
|---|
| 529 | 572 | */ |
|---|
| 573 | + add_page_to_lru_list(page, lruvec, lru); |
|---|
| 530 | 574 | SetPageReclaim(page); |
|---|
| 531 | 575 | } else { |
|---|
| 532 | 576 | /* |
|---|
| 533 | 577 | * The page's writeback ended while it was sitting in a pagevec, |
|---|
| 534 | 578 | * so we move the page to the tail of the inactive list. |
|---|
| 535 | 579 | */ |
|---|
| 536 | | - list_move_tail(&page->lru, &lruvec->lists[lru]); |
|---|
| 537 | | - __count_vm_event(PGROTATED); |
|---|
| 580 | + add_page_to_lru_list_tail(page, lruvec, lru); |
|---|
| 581 | + __count_vm_events(PGROTATED, nr_pages); |
|---|
| 538 | 582 | } |
|---|
| 539 | 583 | |
|---|
| 540 | | - if (active) |
|---|
| 541 | | - __count_vm_event(PGDEACTIVATE); |
|---|
| 542 | | - update_page_reclaim_stat(lruvec, file, 0); |
|---|
| 584 | + if (active) { |
|---|
| 585 | + __count_vm_events(PGDEACTIVATE, nr_pages); |
|---|
| 586 | + __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, |
|---|
| 587 | + nr_pages); |
|---|
| 588 | + } |
|---|
| 543 | 589 | } |
|---|
| 544 | 590 | |
|---|
| 591 | +static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, |
|---|
| 592 | + void *arg) |
|---|
| 593 | +{ |
|---|
| 594 | + if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { |
|---|
| 595 | + int lru = page_lru_base_type(page); |
|---|
| 596 | + int nr_pages = thp_nr_pages(page); |
|---|
| 597 | + |
|---|
| 598 | + del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); |
|---|
| 599 | + ClearPageActive(page); |
|---|
| 600 | + ClearPageReferenced(page); |
|---|
| 601 | + add_page_to_lru_list(page, lruvec, lru); |
|---|
| 602 | + |
|---|
| 603 | + __count_vm_events(PGDEACTIVATE, nr_pages); |
|---|
| 604 | + __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, |
|---|
| 605 | + nr_pages); |
|---|
| 606 | + } |
|---|
| 607 | +} |
|---|
| 545 | 608 | |
|---|
| 546 | 609 | static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, |
|---|
| 547 | 610 | void *arg) |
|---|
| .. | .. |
|---|
| 549 | 612 | if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && |
|---|
| 550 | 613 | !PageSwapCache(page) && !PageUnevictable(page)) { |
|---|
| 551 | 614 | bool active = PageActive(page); |
|---|
| 615 | + int nr_pages = thp_nr_pages(page); |
|---|
| 552 | 616 | |
|---|
| 553 | 617 | del_page_from_lru_list(page, lruvec, |
|---|
| 554 | 618 | LRU_INACTIVE_ANON + active); |
|---|
| 555 | 619 | ClearPageActive(page); |
|---|
| 556 | 620 | ClearPageReferenced(page); |
|---|
| 557 | 621 | /* |
|---|
| 558 | | - * lazyfree pages are clean anonymous pages. They have |
|---|
| 559 | | - * SwapBacked flag cleared to distinguish normal anonymous |
|---|
| 560 | | - * pages |
|---|
| 622 | + * Lazyfree pages are clean anonymous pages. They have |
|---|
| 623 | + * PG_swapbacked flag cleared, to distinguish them from normal |
|---|
| 624 | + * anonymous pages |
|---|
| 561 | 625 | */ |
|---|
| 562 | 626 | ClearPageSwapBacked(page); |
|---|
| 563 | 627 | add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE); |
|---|
| 564 | 628 | |
|---|
| 565 | | - __count_vm_events(PGLAZYFREE, hpage_nr_pages(page)); |
|---|
| 566 | | - count_memcg_page_event(page, PGLAZYFREE); |
|---|
| 567 | | - update_page_reclaim_stat(lruvec, 1, 0); |
|---|
| 629 | + __count_vm_events(PGLAZYFREE, nr_pages); |
|---|
| 630 | + __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, |
|---|
| 631 | + nr_pages); |
|---|
| 632 | + } |
|---|
| 633 | +} |
|---|
| 634 | + |
|---|
| 635 | +static void lru_lazyfree_movetail_fn(struct page *page, struct lruvec *lruvec, |
|---|
| 636 | + void *arg) |
|---|
| 637 | +{ |
|---|
| 638 | + bool *add_to_tail = (bool *)arg; |
|---|
| 639 | + |
|---|
| 640 | + if (PageLRU(page) && !PageUnevictable(page) && PageSwapBacked(page) && |
|---|
| 641 | + !PageSwapCache(page)) { |
|---|
| 642 | + bool active = PageActive(page); |
|---|
| 643 | + |
|---|
| 644 | + del_page_from_lru_list(page, lruvec, |
|---|
| 645 | + LRU_INACTIVE_ANON + active); |
|---|
| 646 | + ClearPageActive(page); |
|---|
| 647 | + ClearPageReferenced(page); |
|---|
| 648 | + if (add_to_tail && *add_to_tail) |
|---|
| 649 | + add_page_to_lru_list_tail(page, lruvec, LRU_INACTIVE_FILE); |
|---|
| 650 | + else |
|---|
| 651 | + add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE); |
|---|
| 568 | 652 | } |
|---|
| 569 | 653 | } |
|---|
| 570 | 654 | |
|---|
| .. | .. |
|---|
| 575 | 659 | */ |
|---|
| 576 | 660 | void lru_add_drain_cpu(int cpu) |
|---|
| 577 | 661 | { |
|---|
| 578 | | - struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu); |
|---|
| 662 | + struct pagevec *pvec = &per_cpu(lru_pvecs.lru_add, cpu); |
|---|
| 579 | 663 | |
|---|
| 580 | 664 | if (pagevec_count(pvec)) |
|---|
| 581 | 665 | __pagevec_lru_add(pvec); |
|---|
| 582 | 666 | |
|---|
| 583 | | - pvec = &per_cpu(lru_rotate_pvecs, cpu); |
|---|
| 584 | | - if (pagevec_count(pvec)) { |
|---|
| 667 | + pvec = &per_cpu(lru_rotate.pvec, cpu); |
|---|
| 668 | + /* Disabling interrupts below acts as a compiler barrier. */ |
|---|
| 669 | + if (data_race(pagevec_count(pvec))) { |
|---|
| 585 | 670 | unsigned long flags; |
|---|
| 586 | 671 | |
|---|
| 587 | 672 | /* No harm done if a racing interrupt already did this */ |
|---|
| 588 | | -#ifdef CONFIG_PREEMPT_RT_BASE |
|---|
| 589 | | - local_lock_irqsave_on(rotate_lock, flags, cpu); |
|---|
| 673 | + local_lock_irqsave(&lru_rotate.lock, flags); |
|---|
| 590 | 674 | pagevec_move_tail(pvec); |
|---|
| 591 | | - local_unlock_irqrestore_on(rotate_lock, flags, cpu); |
|---|
| 592 | | -#else |
|---|
| 593 | | - local_lock_irqsave(rotate_lock, flags); |
|---|
| 594 | | - pagevec_move_tail(pvec); |
|---|
| 595 | | - local_unlock_irqrestore(rotate_lock, flags); |
|---|
| 596 | | -#endif |
|---|
| 675 | + local_unlock_irqrestore(&lru_rotate.lock, flags); |
|---|
| 597 | 676 | } |
|---|
| 598 | 677 | |
|---|
| 599 | | - pvec = &per_cpu(lru_deactivate_file_pvecs, cpu); |
|---|
| 678 | + pvec = &per_cpu(lru_pvecs.lru_deactivate_file, cpu); |
|---|
| 600 | 679 | if (pagevec_count(pvec)) |
|---|
| 601 | 680 | pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); |
|---|
| 602 | 681 | |
|---|
| 603 | | - pvec = &per_cpu(lru_lazyfree_pvecs, cpu); |
|---|
| 682 | + pvec = &per_cpu(lru_pvecs.lru_deactivate, cpu); |
|---|
| 683 | + if (pagevec_count(pvec)) |
|---|
| 684 | + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); |
|---|
| 685 | + |
|---|
| 686 | + pvec = &per_cpu(lru_pvecs.lru_lazyfree, cpu); |
|---|
| 604 | 687 | if (pagevec_count(pvec)) |
|---|
| 605 | 688 | pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); |
|---|
| 689 | + |
|---|
| 690 | + pvec = &per_cpu(lru_pvecs.lru_lazyfree_movetail, cpu); |
|---|
| 691 | + if (pagevec_count(pvec)) |
|---|
| 692 | + pagevec_lru_move_fn(pvec, lru_lazyfree_movetail_fn, NULL); |
|---|
| 606 | 693 | |
|---|
| 607 | 694 | activate_page_drain(cpu); |
|---|
| 608 | 695 | } |
|---|
| .. | .. |
|---|
| 625 | 712 | return; |
|---|
| 626 | 713 | |
|---|
| 627 | 714 | if (likely(get_page_unless_zero(page))) { |
|---|
| 628 | | - struct pagevec *pvec = &get_locked_var(swapvec_lock, |
|---|
| 629 | | - lru_deactivate_file_pvecs); |
|---|
| 715 | + struct pagevec *pvec; |
|---|
| 630 | 716 | |
|---|
| 631 | | - if (!pagevec_add(pvec, page) || PageCompound(page)) |
|---|
| 717 | + local_lock(&lru_pvecs.lock); |
|---|
| 718 | + pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file); |
|---|
| 719 | + |
|---|
| 720 | + if (pagevec_add_and_need_flush(pvec, page)) |
|---|
| 632 | 721 | pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); |
|---|
| 633 | | - put_locked_var(swapvec_lock, lru_deactivate_file_pvecs); |
|---|
| 722 | + local_unlock(&lru_pvecs.lock); |
|---|
| 723 | + } |
|---|
| 724 | +} |
|---|
| 725 | + |
|---|
| 726 | +/* |
|---|
| 727 | + * deactivate_page - deactivate a page |
|---|
| 728 | + * @page: page to deactivate |
|---|
| 729 | + * |
|---|
| 730 | + * deactivate_page() moves @page to the inactive list if @page was on the active |
|---|
| 731 | + * list and was not an unevictable page. This is done to accelerate the reclaim |
|---|
| 732 | + * of @page. |
|---|
| 733 | + */ |
|---|
| 734 | +void deactivate_page(struct page *page) |
|---|
| 735 | +{ |
|---|
| 736 | + if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { |
|---|
| 737 | + struct pagevec *pvec; |
|---|
| 738 | + |
|---|
| 739 | + local_lock(&lru_pvecs.lock); |
|---|
| 740 | + pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate); |
|---|
| 741 | + get_page(page); |
|---|
| 742 | + if (pagevec_add_and_need_flush(pvec, page)) |
|---|
| 743 | + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); |
|---|
| 744 | + local_unlock(&lru_pvecs.lock); |
|---|
| 634 | 745 | } |
|---|
| 635 | 746 | } |
|---|
| 636 | 747 | |
|---|
| .. | .. |
|---|
| 645 | 756 | { |
|---|
| 646 | 757 | if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && |
|---|
| 647 | 758 | !PageSwapCache(page) && !PageUnevictable(page)) { |
|---|
| 648 | | - struct pagevec *pvec = &get_locked_var(swapvec_lock, |
|---|
| 649 | | - lru_lazyfree_pvecs); |
|---|
| 759 | + struct pagevec *pvec; |
|---|
| 650 | 760 | |
|---|
| 761 | + local_lock(&lru_pvecs.lock); |
|---|
| 762 | + pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree); |
|---|
| 651 | 763 | get_page(page); |
|---|
| 652 | | - if (!pagevec_add(pvec, page) || PageCompound(page)) |
|---|
| 764 | + if (pagevec_add_and_need_flush(pvec, page)) |
|---|
| 653 | 765 | pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); |
|---|
| 654 | | - put_locked_var(swapvec_lock, lru_lazyfree_pvecs); |
|---|
| 766 | + local_unlock(&lru_pvecs.lock); |
|---|
| 767 | + } |
|---|
| 768 | +} |
|---|
| 769 | + |
|---|
| 770 | +/** |
|---|
| 771 | + * mark_page_lazyfree_movetail - make a swapbacked page lazyfree |
|---|
| 772 | + * @page: page to deactivate |
|---|
| 773 | + * |
|---|
| 774 | + * mark_page_lazyfree_movetail() moves @page to the tail of the inactive file list. |
|---|
| 775 | + * This is done to accelerate the reclaim of @page. |
|---|
| 776 | + */ |
|---|
| 777 | +void mark_page_lazyfree_movetail(struct page *page, bool tail) |
|---|
| 778 | +{ |
|---|
| 779 | + if (PageLRU(page) && !PageUnevictable(page) && PageSwapBacked(page) && |
|---|
| 780 | + !PageSwapCache(page)) { |
|---|
| 781 | + struct pagevec *pvec; |
|---|
| 782 | + |
|---|
| 783 | + local_lock(&lru_pvecs.lock); |
|---|
| 784 | + pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree_movetail); |
|---|
| 785 | + get_page(page); |
|---|
| 786 | + if (pagevec_add_and_need_flush(pvec, page)) |
|---|
| 787 | + pagevec_lru_move_fn(pvec, |
|---|
| 788 | + lru_lazyfree_movetail_fn, &tail); |
|---|
| 789 | + local_unlock(&lru_pvecs.lock); |
|---|
| 655 | 790 | } |
|---|
| 656 | 791 | } |
|---|
| 657 | 792 | |
|---|
| 658 | 793 | void lru_add_drain(void) |
|---|
| 659 | 794 | { |
|---|
| 660 | | - lru_add_drain_cpu(local_lock_cpu(swapvec_lock)); |
|---|
| 661 | | - local_unlock_cpu(swapvec_lock); |
|---|
| 795 | + local_lock(&lru_pvecs.lock); |
|---|
| 796 | + lru_add_drain_cpu(smp_processor_id()); |
|---|
| 797 | + local_unlock(&lru_pvecs.lock); |
|---|
| 798 | +} |
|---|
| 799 | + |
|---|
| 800 | +/* |
|---|
| 801 | + * It's called from per-cpu workqueue context in SMP case so |
|---|
| 802 | + * lru_add_drain_cpu and invalidate_bh_lrus_cpu should run on |
|---|
| 803 | + * the same cpu. It shouldn't be a problem in !SMP case since |
|---|
| 804 | + * the core is only one and the locks will disable preemption. |
|---|
| 805 | + */ |
|---|
| 806 | +static void lru_add_and_bh_lrus_drain(void) |
|---|
| 807 | +{ |
|---|
| 808 | + local_lock(&lru_pvecs.lock); |
|---|
| 809 | + lru_add_drain_cpu(smp_processor_id()); |
|---|
| 810 | + local_unlock(&lru_pvecs.lock); |
|---|
| 811 | + invalidate_bh_lrus_cpu(); |
|---|
| 812 | +} |
|---|
| 813 | + |
|---|
| 814 | +void lru_add_drain_cpu_zone(struct zone *zone) |
|---|
| 815 | +{ |
|---|
| 816 | + local_lock(&lru_pvecs.lock); |
|---|
| 817 | + lru_add_drain_cpu(smp_processor_id()); |
|---|
| 818 | + drain_local_pages(zone); |
|---|
| 819 | + local_unlock(&lru_pvecs.lock); |
|---|
| 662 | 820 | } |
|---|
| 663 | 821 | |
|---|
| 664 | 822 | #ifdef CONFIG_SMP |
|---|
| 665 | | - |
|---|
| 666 | | -#ifdef CONFIG_PREEMPT_RT_BASE |
|---|
| 667 | | -static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work) |
|---|
| 668 | | -{ |
|---|
| 669 | | - local_lock_on(swapvec_lock, cpu); |
|---|
| 670 | | - lru_add_drain_cpu(cpu); |
|---|
| 671 | | - local_unlock_on(swapvec_lock, cpu); |
|---|
| 672 | | -} |
|---|
| 673 | | - |
|---|
| 674 | | -#else |
|---|
| 675 | 823 | |
|---|
| 676 | 824 | static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); |
|---|
| 677 | 825 | |
|---|
| 678 | 826 | static void lru_add_drain_per_cpu(struct work_struct *dummy) |
|---|
| 679 | 827 | { |
|---|
| 680 | | - lru_add_drain(); |
|---|
| 828 | + lru_add_and_bh_lrus_drain(); |
|---|
| 681 | 829 | } |
|---|
| 682 | | - |
|---|
| 683 | | -static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work) |
|---|
| 684 | | -{ |
|---|
| 685 | | - struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); |
|---|
| 686 | | - |
|---|
| 687 | | - INIT_WORK(work, lru_add_drain_per_cpu); |
|---|
| 688 | | - queue_work_on(cpu, mm_percpu_wq, work); |
|---|
| 689 | | - cpumask_set_cpu(cpu, has_work); |
|---|
| 690 | | -} |
|---|
| 691 | | -#endif |
|---|
| 692 | 830 | |
|---|
| 693 | 831 | /* |
|---|
| 694 | 832 | * Doesn't need any cpu hotplug locking because we do rely on per-cpu |
|---|
| .. | .. |
|---|
| 697 | 835 | * Calling this function with cpu hotplug locks held can actually lead |
|---|
| 698 | 836 | * to obscure indirect dependencies via WQ context. |
|---|
| 699 | 837 | */ |
|---|
| 700 | | -void lru_add_drain_all(void) |
|---|
| 838 | +inline void __lru_add_drain_all(bool force_all_cpus) |
|---|
| 701 | 839 | { |
|---|
| 702 | | - static DEFINE_MUTEX(lock); |
|---|
| 840 | + /* |
|---|
| 841 | + * lru_drain_gen - Global pages generation number |
|---|
| 842 | + * |
|---|
| 843 | + * (A) Definition: global lru_drain_gen = x implies that all generations |
|---|
| 844 | + * 0 < n <= x are already *scheduled* for draining. |
|---|
| 845 | + * |
|---|
| 846 | + * This is an optimization for the highly-contended use case where a |
|---|
| 847 | + * user space workload keeps constantly generating a flow of pages for |
|---|
| 848 | + * each CPU. |
|---|
| 849 | + */ |
|---|
| 850 | + static unsigned int lru_drain_gen; |
|---|
| 703 | 851 | static struct cpumask has_work; |
|---|
| 704 | | - int cpu; |
|---|
| 852 | + static DEFINE_MUTEX(lock); |
|---|
| 853 | + unsigned cpu, this_gen; |
|---|
| 705 | 854 | |
|---|
| 706 | 855 | /* |
|---|
| 707 | 856 | * Make sure nobody triggers this path before mm_percpu_wq is fully |
|---|
| .. | .. |
|---|
| 710 | 859 | if (WARN_ON(!mm_percpu_wq)) |
|---|
| 711 | 860 | return; |
|---|
| 712 | 861 | |
|---|
| 862 | + /* |
|---|
| 863 | + * Guarantee pagevec counter stores visible by this CPU are visible to |
|---|
| 864 | + * other CPUs before loading the current drain generation. |
|---|
| 865 | + */ |
|---|
| 866 | + smp_mb(); |
|---|
| 867 | + |
|---|
| 868 | + /* |
|---|
| 869 | + * (B) Locally cache global LRU draining generation number |
|---|
| 870 | + * |
|---|
| 871 | + * The read barrier ensures that the counter is loaded before the mutex |
|---|
| 872 | + * is taken. It pairs with smp_mb() inside the mutex critical section |
|---|
| 873 | + * at (D). |
|---|
| 874 | + */ |
|---|
| 875 | + this_gen = smp_load_acquire(&lru_drain_gen); |
|---|
| 876 | + |
|---|
| 713 | 877 | mutex_lock(&lock); |
|---|
| 878 | + |
|---|
| 879 | + /* |
|---|
| 880 | + * (C) Exit the draining operation if a newer generation, from another |
|---|
| 881 | + * lru_add_drain_all(), was already scheduled for draining. Check (A). |
|---|
| 882 | + */ |
|---|
| 883 | + if (unlikely(this_gen != lru_drain_gen && !force_all_cpus)) |
|---|
| 884 | + goto done; |
|---|
| 885 | + |
|---|
| 886 | + /* |
|---|
| 887 | + * (D) Increment global generation number |
|---|
| 888 | + * |
|---|
| 889 | + * Pairs with smp_load_acquire() at (B), outside of the critical |
|---|
| 890 | + * section. Use a full memory barrier to guarantee that the new global |
|---|
| 891 | + * drain generation number is stored before loading pagevec counters. |
|---|
| 892 | + * |
|---|
| 893 | + * This pairing must be done here, before the for_each_online_cpu loop |
|---|
| 894 | + * below which drains the page vectors. |
|---|
| 895 | + * |
|---|
| 896 | + * Let x, y, and z represent some system CPU numbers, where x < y < z. |
|---|
| 897 | + * Assume CPU #z is in the middle of the for_each_online_cpu loop |
|---|
| 898 | + * below and has already reached CPU #y's per-cpu data. CPU #x comes |
|---|
| 899 | + * along, adds some pages to its per-cpu vectors, then calls |
|---|
| 900 | + * lru_add_drain_all(). |
|---|
| 901 | + * |
|---|
| 902 | + * If the paired barrier is done at any later step, e.g. after the |
|---|
| 903 | + * loop, CPU #x will just exit at (C) and miss flushing out all of its |
|---|
| 904 | + * added pages. |
|---|
| 905 | + */ |
|---|
| 906 | + WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1); |
|---|
| 907 | + smp_mb(); |
|---|
| 908 | + |
|---|
| 714 | 909 | cpumask_clear(&has_work); |
|---|
| 715 | | - |
|---|
| 716 | 910 | for_each_online_cpu(cpu) { |
|---|
| 911 | + struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); |
|---|
| 717 | 912 | |
|---|
| 718 | | - if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || |
|---|
| 719 | | - pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || |
|---|
| 720 | | - pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || |
|---|
| 721 | | - pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) || |
|---|
| 722 | | - need_activate_page_drain(cpu)) |
|---|
| 723 | | - remote_lru_add_drain(cpu, &has_work); |
|---|
| 913 | + if (force_all_cpus || |
|---|
| 914 | + pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) || |
|---|
| 915 | + data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) || |
|---|
| 916 | + pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) || |
|---|
| 917 | + pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || |
|---|
| 918 | + pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) || |
|---|
| 919 | + pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree_movetail, cpu)) || |
|---|
| 920 | + need_activate_page_drain(cpu) || |
|---|
| 921 | + has_bh_in_lru(cpu, NULL)) { |
|---|
| 922 | + INIT_WORK(work, lru_add_drain_per_cpu); |
|---|
| 923 | + queue_work_on(cpu, mm_percpu_wq, work); |
|---|
| 924 | + __cpumask_set_cpu(cpu, &has_work); |
|---|
| 925 | + } |
|---|
| 724 | 926 | } |
|---|
| 725 | 927 | |
|---|
| 726 | | -#ifndef CONFIG_PREEMPT_RT_BASE |
|---|
| 727 | 928 | for_each_cpu(cpu, &has_work) |
|---|
| 728 | 929 | flush_work(&per_cpu(lru_add_drain_work, cpu)); |
|---|
| 729 | | -#endif |
|---|
| 730 | 930 | |
|---|
| 931 | +done: |
|---|
| 731 | 932 | mutex_unlock(&lock); |
|---|
| 933 | +} |
|---|
| 934 | + |
|---|
| 935 | +void lru_add_drain_all(void) |
|---|
| 936 | +{ |
|---|
| 937 | + __lru_add_drain_all(false); |
|---|
| 732 | 938 | } |
|---|
| 733 | 939 | #else |
|---|
| 734 | 940 | void lru_add_drain_all(void) |
|---|
| 735 | 941 | { |
|---|
| 736 | 942 | lru_add_drain(); |
|---|
| 737 | 943 | } |
|---|
| 944 | +#endif /* CONFIG_SMP */ |
|---|
| 945 | + |
|---|
| 946 | +static atomic_t lru_disable_count = ATOMIC_INIT(0); |
|---|
| 947 | + |
|---|
| 948 | +bool lru_cache_disabled(void) |
|---|
| 949 | +{ |
|---|
| 950 | + return atomic_read(&lru_disable_count) != 0; |
|---|
| 951 | +} |
|---|
| 952 | + |
|---|
| 953 | +void lru_cache_enable(void) |
|---|
| 954 | +{ |
|---|
| 955 | + atomic_dec(&lru_disable_count); |
|---|
| 956 | +} |
|---|
| 957 | +EXPORT_SYMBOL_GPL(lru_cache_enable); |
|---|
| 958 | + |
|---|
| 959 | +/* |
|---|
| 960 | + * lru_cache_disable() needs to be called before we start compiling |
|---|
| 961 | + * a list of pages to be migrated using isolate_lru_page(). |
|---|
| 962 | + * It drains pages on LRU cache and then disable on all cpus until |
|---|
| 963 | + * lru_cache_enable is called. |
|---|
| 964 | + * |
|---|
| 965 | + * Must be paired with a call to lru_cache_enable(). |
|---|
| 966 | + */ |
|---|
| 967 | +void lru_cache_disable(void) |
|---|
| 968 | +{ |
|---|
| 969 | + /* |
|---|
| 970 | + * If someone has already disabled the LRU cache, just return after |
|---|
| 971 | + * incrementing the lru_disable_count. |
|---|
| 972 | + */ |
|---|
| 973 | + if (atomic_inc_not_zero(&lru_disable_count)) |
|---|
| 974 | + return; |
|---|
| 975 | +#ifdef CONFIG_SMP |
|---|
| 976 | + /* |
|---|
| 977 | + * lru_add_drain_all in the force mode will schedule draining on |
|---|
| 978 | + * all online CPUs so any calls of lru_cache_disabled wrapped by |
|---|
| 979 | + * local_lock or preemption disabled would be ordered by that. |
|---|
| 980 | + * The atomic operation doesn't need to have stronger ordering |
|---|
| 981 | + * requirements because that is enforeced by the scheduling |
|---|
| 982 | + * guarantees. |
|---|
| 983 | + */ |
|---|
| 984 | + __lru_add_drain_all(true); |
|---|
| 985 | +#else |
|---|
| 986 | + lru_add_and_bh_lrus_drain(); |
|---|
| 738 | 987 | #endif |
|---|
| 988 | + atomic_inc(&lru_disable_count); |
|---|
| 989 | +} |
|---|
| 990 | +EXPORT_SYMBOL_GPL(lru_cache_disable); |
|---|
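
lru_cache_disable() and lru_cache_enable() nest via lru_disable_count, and the comment above requires them to be paired around the window in which pages are isolated for migration. A schematic caller might look like the sketch below; example_isolate_for_migration() is a hypothetical name, and the real users are the page-migration paths the comment refers to.

```c
/*
 * Hypothetical caller sketch; error handling and the actual migration
 * call are elided.
 */
static void example_isolate_for_migration(struct page *page)
{
	LIST_HEAD(pages);

	lru_cache_disable();		/* drain pagevecs and stop refilling them */
	if (!isolate_lru_page(page))	/* page can no longer hide in a pagevec */
		list_add(&page->lru, &pages);

	/* ... migrate the pages on the list here ... */

	lru_cache_enable();		/* must pair with lru_cache_disable() */
}
```
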
| 739 | 991 | |
|---|
| 740 | 992 | /** |
|---|
| 741 | 993 | * release_pages - batched put_page() |
|---|
| .. | .. |
|---|
| 751 | 1003 | LIST_HEAD(pages_to_free); |
|---|
| 752 | 1004 | struct pglist_data *locked_pgdat = NULL; |
|---|
| 753 | 1005 | struct lruvec *lruvec; |
|---|
| 754 | | - unsigned long uninitialized_var(flags); |
|---|
| 755 | | - unsigned int uninitialized_var(lock_batch); |
|---|
| 1006 | + unsigned long flags; |
|---|
| 1007 | + unsigned int lock_batch; |
|---|
| 756 | 1008 | |
|---|
| 757 | 1009 | for (i = 0; i < nr; i++) { |
|---|
| 758 | 1010 | struct page *page = pages[i]; |
|---|
| .. | .. |
|---|
| 767 | 1019 | locked_pgdat = NULL; |
|---|
| 768 | 1020 | } |
|---|
| 769 | 1021 | |
|---|
| 1022 | + page = compound_head(page); |
|---|
| 770 | 1023 | if (is_huge_zero_page(page)) |
|---|
| 771 | 1024 | continue; |
|---|
| 772 | 1025 | |
|---|
| .. | .. |
|---|
| 778 | 1031 | } |
|---|
| 779 | 1032 | /* |
|---|
| 780 | 1033 | * ZONE_DEVICE pages that return 'false' from |
|---|
| 781 | | - * put_devmap_managed_page() do not require special |
|---|
| 1034 | + * page_is_devmap_managed() do not require special |
|---|
| 782 | 1035 | * processing, and instead, expect a call to |
|---|
| 783 | 1036 | * put_page_testzero(). |
|---|
| 784 | 1037 | */ |
|---|
| 785 | | - if (put_devmap_managed_page(page)) |
|---|
| 1038 | + if (page_is_devmap_managed(page)) { |
|---|
| 1039 | + put_devmap_managed_page(page); |
|---|
| 786 | 1040 | continue; |
|---|
| 1041 | + } |
|---|
| 787 | 1042 | } |
|---|
| 788 | 1043 | |
|---|
| 789 | | - page = compound_head(page); |
|---|
| 790 | 1044 | if (!put_page_testzero(page)) |
|---|
| 791 | 1045 | continue; |
|---|
| 792 | 1046 | |
|---|
| .. | .. |
|---|
| 817 | 1071 | del_page_from_lru_list(page, lruvec, page_off_lru(page)); |
|---|
| 818 | 1072 | } |
|---|
| 819 | 1073 | |
|---|
| 820 | | - /* Clear Active bit in case of parallel mark_page_accessed */ |
|---|
| 821 | | - __ClearPageActive(page); |
|---|
| 822 | 1074 | __ClearPageWaiters(page); |
|---|
| 823 | 1075 | |
|---|
| 824 | 1076 | list_add(&page->lru, &pages_to_free); |
|---|
| .. | .. |
|---|
| 857 | 1109 | void lru_add_page_tail(struct page *page, struct page *page_tail, |
|---|
| 858 | 1110 | struct lruvec *lruvec, struct list_head *list) |
|---|
| 859 | 1111 | { |
|---|
| 860 | | - const int file = 0; |
|---|
| 861 | | - |
|---|
| 862 | 1112 | VM_BUG_ON_PAGE(!PageHead(page), page); |
|---|
| 863 | 1113 | VM_BUG_ON_PAGE(PageCompound(page_tail), page); |
|---|
| 864 | 1114 | VM_BUG_ON_PAGE(PageLRU(page_tail), page); |
|---|
| 865 | | - VM_BUG_ON(NR_CPUS != 1 && |
|---|
| 866 | | - !spin_is_locked(&lruvec_pgdat(lruvec)->lru_lock)); |
|---|
| 1115 | + lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock); |
|---|
| 867 | 1116 | |
|---|
| 868 | 1117 | if (!list) |
|---|
| 869 | 1118 | SetPageLRU(page_tail); |
|---|
| .. | .. |
|---|
| 875 | 1124 | get_page(page_tail); |
|---|
| 876 | 1125 | list_add_tail(&page_tail->lru, list); |
|---|
| 877 | 1126 | } else { |
|---|
| 878 | | - struct list_head *list_head; |
|---|
| 879 | 1127 | /* |
|---|
| 880 | 1128 | * Head page has not yet been counted, as an hpage, |
|---|
| 881 | 1129 | * so we must account for each subpage individually. |
|---|
| 882 | 1130 | * |
|---|
| 883 | | - * Use the standard add function to put page_tail on the list, |
|---|
| 884 | | - * but then correct its position so they all end up in order. |
|---|
| 1131 | + * Put page_tail on the list at the correct position |
|---|
| 1132 | + * so they all end up in order. |
|---|
| 885 | 1133 | */ |
|---|
| 886 | | - add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail)); |
|---|
| 887 | | - list_head = page_tail->lru.prev; |
|---|
| 888 | | - list_move_tail(&page_tail->lru, list_head); |
|---|
| 1134 | + add_page_to_lru_list_tail(page_tail, lruvec, |
|---|
| 1135 | + page_lru(page_tail)); |
|---|
| 889 | 1136 | } |
|---|
| 890 | | - |
|---|
| 891 | | - if (!PageUnevictable(page)) |
|---|
| 892 | | - update_page_reclaim_stat(lruvec, file, PageActive(page_tail)); |
|---|
| 893 | 1137 | } |
|---|
| 894 | 1138 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
|---|
| 895 | 1139 | |
|---|
| .. | .. |
|---|
| 898 | 1142 | { |
|---|
| 899 | 1143 | enum lru_list lru; |
|---|
| 900 | 1144 | int was_unevictable = TestClearPageUnevictable(page); |
|---|
| 1145 | + int nr_pages = thp_nr_pages(page); |
|---|
| 901 | 1146 | |
|---|
| 902 | 1147 | VM_BUG_ON_PAGE(PageLRU(page), page); |
|---|
| 903 | 1148 | |
|---|
| 904 | | - SetPageLRU(page); |
|---|
| 905 | 1149 | /* |
|---|
| 906 | 1150 | * Page becomes evictable in two ways: |
|---|
| 907 | | - * 1) Within LRU lock [munlock_vma_pages() and __munlock_pagevec()]. |
|---|
| 1151 | + * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()]. |
|---|
| 908 | 1152 | * 2) Before acquiring LRU lock to put the page to correct LRU and then |
|---|
| 909 | 1153 | * a) do PageLRU check with lock [check_move_unevictable_pages] |
|---|
| 910 | 1154 | * b) do PageLRU check before lock [clear_page_mlock] |
|---|
| .. | .. |
|---|
| 928 | 1172 | * looking at the same page) and the evictable page will be stranded |
|---|
| 929 | 1173 | * in an unevictable LRU. |
|---|
| 930 | 1174 | */ |
|---|
| 931 | | - smp_mb(); |
|---|
| 1175 | + SetPageLRU(page); |
|---|
| 1176 | + smp_mb__after_atomic(); |
|---|
| 932 | 1177 | |
|---|
| 933 | 1178 | if (page_evictable(page)) { |
|---|
| 934 | 1179 | lru = page_lru(page); |
|---|
| 935 | | - update_page_reclaim_stat(lruvec, page_is_file_cache(page), |
|---|
| 936 | | - PageActive(page)); |
|---|
| 937 | 1180 | if (was_unevictable) |
|---|
| 938 | | - count_vm_event(UNEVICTABLE_PGRESCUED); |
|---|
| 1181 | + __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); |
|---|
| 939 | 1182 | } else { |
|---|
| 940 | 1183 | lru = LRU_UNEVICTABLE; |
|---|
| 941 | 1184 | ClearPageActive(page); |
|---|
| 942 | 1185 | SetPageUnevictable(page); |
|---|
| 943 | 1186 | if (!was_unevictable) |
|---|
| 944 | | - count_vm_event(UNEVICTABLE_PGCULLED); |
|---|
| 1187 | + __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); |
|---|
| 945 | 1188 | } |
|---|
| 946 | 1189 | |
|---|
| 947 | 1190 | add_page_to_lru_list(page, lruvec, lru); |
|---|
| .. | .. |
|---|
| 956 | 1199 | { |
|---|
| 957 | 1200 | pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL); |
|---|
| 958 | 1201 | } |
|---|
| 959 | | -EXPORT_SYMBOL(__pagevec_lru_add); |
|---|
| 960 | 1202 | |
|---|
| 961 | 1203 | /** |
|---|
| 962 | 1204 | * pagevec_lookup_entries - gang pagecache lookup |
|---|
| .. | .. |
|---|
| 974 | 1216 | * The search returns a group of mapping-contiguous entries with |
|---|
| 975 | 1217 | * ascending indexes. There may be holes in the indices due to |
|---|
| 976 | 1218 | * not-present entries. |
|---|
| 1219 | + * |
|---|
| 1220 | + * Only one subpage of a Transparent Huge Page is returned in one call: |
|---|
| 1221 | + * allowing truncate_inode_pages_range() to evict the whole THP without |
|---|
| 1222 | + * cycling through a pagevec of extra references. |
|---|
| 977 | 1223 | * |
|---|
| 978 | 1224 | * pagevec_lookup_entries() returns the number of entries which were |
|---|
| 979 | 1225 | * found. |
|---|
| .. | .. |
|---|
| 1003 | 1249 | |
|---|
| 1004 | 1250 | for (i = 0, j = 0; i < pagevec_count(pvec); i++) { |
|---|
| 1005 | 1251 | struct page *page = pvec->pages[i]; |
|---|
| 1006 | | - if (!radix_tree_exceptional_entry(page)) |
|---|
| 1252 | + if (!xa_is_value(page)) |
|---|
| 1007 | 1253 | pvec->pages[j++] = page; |
|---|
| 1008 | 1254 | } |
|---|
| 1009 | 1255 | pvec->nr = j; |
|---|
| .. | .. |
|---|
| 1040 | 1286 | |
|---|
| 1041 | 1287 | unsigned pagevec_lookup_range_tag(struct pagevec *pvec, |
|---|
| 1042 | 1288 | struct address_space *mapping, pgoff_t *index, pgoff_t end, |
|---|
| 1043 | | - int tag) |
|---|
| 1289 | + xa_mark_t tag) |
|---|
| 1044 | 1290 | { |
|---|
| 1045 | 1291 | pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, |
|---|
| 1046 | 1292 | PAGEVEC_SIZE, pvec->pages); |
|---|
| .. | .. |
|---|
| 1050 | 1296 | |
|---|
| 1051 | 1297 | unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, |
|---|
| 1052 | 1298 | struct address_space *mapping, pgoff_t *index, pgoff_t end, |
|---|
| 1053 | | - int tag, unsigned max_pages) |
|---|
| 1299 | + xa_mark_t tag, unsigned max_pages) |
|---|
| 1054 | 1300 | { |
|---|
| 1055 | 1301 | pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, |
|---|
| 1056 | 1302 | min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages); |
|---|
| .. | .. |
|---|
| 1062 | 1308 | */ |
|---|
| 1063 | 1309 | void __init swap_setup(void) |
|---|
| 1064 | 1310 | { |
|---|
| 1065 | | - unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); |
|---|
| 1311 | + unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); |
|---|
| 1066 | 1312 | |
|---|
| 1067 | 1313 | /* Use a smaller cluster for small-memory machines */ |
|---|
| 1068 | 1314 | if (megs < 16) |
|---|
| .. | .. |
|---|
| 1074 | 1320 | * _really_ don't want to cluster much more |
|---|
| 1075 | 1321 | */ |
|---|
| 1076 | 1322 | } |
|---|
| 1323 | + |
|---|
| 1324 | +#ifdef CONFIG_DEV_PAGEMAP_OPS |
|---|
| 1325 | +void put_devmap_managed_page(struct page *page) |
|---|
| 1326 | +{ |
|---|
| 1327 | + int count; |
|---|
| 1328 | + |
|---|
| 1329 | + if (WARN_ON_ONCE(!page_is_devmap_managed(page))) |
|---|
| 1330 | + return; |
|---|
| 1331 | + |
|---|
| 1332 | + count = page_ref_dec_return(page); |
|---|
| 1333 | + |
|---|
| 1334 | + /* |
|---|
| 1335 | + * devmap page refcounts are 1-based, rather than 0-based: if |
|---|
| 1336 | + * refcount is 1, then the page is free and the refcount is |
|---|
| 1337 | + * stable because nobody holds a reference on the page. |
|---|
| 1338 | + */ |
|---|
| 1339 | + if (count == 1) |
|---|
| 1340 | + free_devmap_managed_page(page); |
|---|
| 1341 | + else if (!count) |
|---|
| 1342 | + __put_page(page); |
|---|
| 1343 | +} |
|---|
| 1344 | +EXPORT_SYMBOL(put_devmap_managed_page); |
|---|
| 1345 | +#endif |
|---|