| .. | .. |
| 6 | 6 | */ |
| 7 | 7 | |
| 8 | 8 | #include <linux/memcontrol.h> |
| | 9 | +#include <linux/mm_inline.h> |
| 9 | 10 | #include <linux/writeback.h> |
| 10 | 11 | #include <linux/shmem_fs.h> |
| 11 | 12 | #include <linux/pagemap.h> |
| .. | .. |
| 156 | 157 | * |
| 157 | 158 | * Implementation |
| 158 | 159 | * |
| 159 | | - * For each node's file LRU lists, a counter for inactive evictions |
| 160 | | - * and activations is maintained (node->inactive_age). |
| | 160 | + * For each node's LRU lists, a counter for inactive evictions and |
| | 161 | + * activations is maintained (node->nonresident_age). |
| 161 | 162 | * |
| 162 | 163 | * On eviction, a snapshot of this counter (along with some bits to |
| 163 | | - * identify the node) is stored in the now empty page cache radix tree |
| | 164 | + * identify the node) is stored in the now empty page cache |
| 164 | 165 | * slot of the evicted page. This is called a shadow entry. |
| 165 | 166 | * |
| 166 | 167 | * On cache misses for which there are shadow entries, an eligible |
| 167 | 168 | * refault distance will immediately activate the refaulting page. |
| 168 | 169 | */ |
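The comment above compresses the whole mechanism into a few sentences. As a reading aid, here is a minimal userspace sketch of the bookkeeping it describes; the names and the single global counter are illustrative only and do not correspond to kernel code:

```c
#include <stdbool.h>

/* Illustrative stand-in for one lruvec's aging counter. */
static unsigned long nonresident_age;

/* Eviction: snapshot the counter; the snapshot becomes the shadow entry. */
static unsigned long note_eviction(void)
{
	return nonresident_age;
}

/* Refault: the distance is how much aging happened since the eviction. */
static bool eligible_refault(unsigned long shadow, unsigned long workingset_size)
{
	unsigned long refault_distance = nonresident_age - shadow; /* wrap-safe */

	return refault_distance <= workingset_size; /* could have stayed resident */
}
```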
| 169 | 170 | |
| 170 | | -#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ |
| | 171 | +#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \ |
| 171 | 172 | 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT) |
| 172 | 173 | #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) |
| 173 | 174 | |
| 174 | 175 | /* |
| 175 | 176 | * Eviction timestamps need to be able to cover the full range of |
| 176 | | - * actionable refaults. However, bits are tight in the radix tree |
| | 177 | + * actionable refaults. However, bits are tight in the xarray |
| 177 | 178 | * entry, and after storing the identifier for the lruvec there might |
| 178 | 179 | * not be enough left to represent every single actionable refault. In |
| 179 | 180 | * that case, we have to sacrifice granularity for distance, and group |
| .. | .. |
| 185 | 186 | bool workingset) |
| 186 | 187 | { |
| 187 | 188 | eviction >>= bucket_order; |
| | 189 | + eviction &= EVICTION_MASK; |
| 188 | 190 | eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; |
| 189 | 191 | eviction = (eviction << NODES_SHIFT) | pgdat->node_id; |
| 190 | 192 | eviction = (eviction << 1) | workingset; |
| 191 | | - eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); |
| 192 | 193 | |
| 193 | | - return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); |
| | 194 | + return xa_mk_value(eviction); |
| 194 | 195 | } |
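For orientation, the bit budget works out as follows on a typical 64-bit configuration (BITS_PER_XA_VALUE is BITS_PER_LONG - 1; NODES_SHIFT and MEM_CGROUP_ID_SHIFT are config-dependent and assumed here to be 6 and 16): EVICTION_SHIFT = 1 + 1 + 6 + 16 = 24, leaving 40 bits of eviction timestamp. The sketch below is a self-contained userspace round-trip of the same packing with those assumed constants; it is illustrative only, not the kernel code, and omits the value tag bit that xa_mk_value() would add:

```c
#include <assert.h>
#include <stdbool.h>

#define NODES_SHIFT		6	/* assumed for illustration */
#define MEM_CGROUP_ID_SHIFT	16	/* assumed for illustration */
#define EVICTION_SHIFT		(1 + 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
#define EVICTION_MASK		(~0UL >> EVICTION_SHIFT)

static unsigned long pack(int memcgid, int nid, unsigned long eviction, bool ws)
{
	eviction &= EVICTION_MASK;
	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
	eviction = (eviction << NODES_SHIFT) | nid;
	return (eviction << 1) | ws;
}

static void unpack(unsigned long entry, int *memcgid, int *nid,
		   unsigned long *eviction, bool *ws)
{
	*ws = entry & 1;
	entry >>= 1;
	*nid = entry & ((1UL << NODES_SHIFT) - 1);
	entry >>= NODES_SHIFT;
	*memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
	entry >>= MEM_CGROUP_ID_SHIFT;
	*eviction = entry;
}

int main(void)
{
	int memcgid, nid;
	unsigned long eviction;
	bool ws;

	unpack(pack(1234, 3, 0xdeadUL, true), &memcgid, &nid, &eviction, &ws);
	assert(memcgid == 1234 && nid == 3 && eviction == 0xdead && ws);
	return 0;
}
```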
| 195 | 196 | |
| 196 | 197 | static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, |
| 197 | 198 | unsigned long *evictionp, bool *workingsetp) |
| 198 | 199 | { |
| 199 | | - unsigned long entry = (unsigned long)shadow; |
| | 200 | + unsigned long entry = xa_to_value(shadow); |
| 200 | 201 | int memcgid, nid; |
| 201 | 202 | bool workingset; |
| 202 | 203 | |
| 203 | | - entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; |
| 204 | 204 | workingset = entry & 1; |
| 205 | 205 | entry >>= 1; |
| 206 | 206 | nid = entry & ((1UL << NODES_SHIFT) - 1); |
| .. | .. |
| 215 | 215 | } |
| 216 | 216 | |
| 217 | 217 | /** |
| | 218 | + * workingset_age_nonresident - age non-resident entries as LRU ages |
| | 219 | + * @lruvec: the lruvec that was aged |
| | 220 | + * @nr_pages: the number of pages to count |
| | 221 | + * |
| | 222 | + * As in-memory pages are aged, non-resident pages need to be aged as |
| | 223 | + * well, in order for the refault distances later on to be comparable |
| | 224 | + * to the in-memory dimensions. This function allows reclaim and LRU |
| | 225 | + * operations to drive the non-resident aging along in parallel. |
| | 226 | + */ |
| | 227 | +void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages) |
| | 228 | +{ |
| | 229 | + /* |
| | 230 | + * Reclaiming a cgroup means reclaiming all its children in a |
| | 231 | + * round-robin fashion. That means that each cgroup has an LRU |
| | 232 | + * order that is composed of the LRU orders of its child |
| | 233 | + * cgroups; and every page has an LRU position not just in the |
| | 234 | + * cgroup that owns it, but in all of that group's ancestors. |
| | 235 | + * |
| | 236 | + * So when the physical inactive list of a leaf cgroup ages, |
| | 237 | + * the virtual inactive lists of all its parents, including |
| | 238 | + * the root cgroup's, age as well. |
| | 239 | + */ |
| | 240 | + do { |
| | 241 | + atomic_long_add(nr_pages, &lruvec->nonresident_age); |
| | 242 | + } while ((lruvec = parent_lruvec(lruvec))); |
| | 243 | +} |
| | 244 | + |
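The parent walk above is what keeps refault distances comparable across the cgroup hierarchy: aging a leaf lruvec also ages every ancestor up to the root. A toy model of that effect, using a hypothetical simplified struct rather than the kernel's lruvec:

```c
#include <assert.h>
#include <stddef.h>

/* Toy stand-in for a lruvec with hierarchical aging (illustrative only). */
struct toy_lruvec {
	unsigned long nonresident_age;
	struct toy_lruvec *parent;
};

static void age_nonresident(struct toy_lruvec *lruvec, unsigned long nr_pages)
{
	do {
		lruvec->nonresident_age += nr_pages;
	} while ((lruvec = lruvec->parent));
}

int main(void)
{
	struct toy_lruvec root = { 0, NULL };
	struct toy_lruvec leaf = { 0, &root };

	age_nonresident(&leaf, 32);		/* aging 32 pages in the leaf... */
	assert(leaf.nonresident_age == 32);
	assert(root.nonresident_age == 32);	/* ...ages the root as well */
	return 0;
}
```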
| | 245 | +/** |
| 218 | 246 | * workingset_eviction - note the eviction of a page from memory |
| 219 | | - * @mapping: address space the page was backing |
| | 247 | + * @target_memcg: the cgroup that is causing the reclaim |
| 220 | 248 | * @page: the page being evicted |
| 221 | 249 | * |
| 222 | | - * Returns a shadow entry to be stored in @mapping->i_pages in place |
| | 250 | + * Returns a shadow entry to be stored in @page->mapping->i_pages in place |
| 223 | 251 | * of the evicted @page so that a later refault can be detected. |
| 224 | 252 | */ |
| 225 | | -void *workingset_eviction(struct address_space *mapping, struct page *page) |
| | 253 | +void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) |
| 226 | 254 | { |
| 227 | 255 | struct pglist_data *pgdat = page_pgdat(page); |
| 228 | | - struct mem_cgroup *memcg = page_memcg(page); |
| 229 | | - int memcgid = mem_cgroup_id(memcg); |
| 230 | 256 | unsigned long eviction; |
| 231 | 257 | struct lruvec *lruvec; |
| | 258 | + int memcgid; |
| 232 | 259 | |
| 233 | 260 | /* Page is fully exclusive and pins page->mem_cgroup */ |
| 234 | 261 | VM_BUG_ON_PAGE(PageLRU(page), page); |
| 235 | 262 | VM_BUG_ON_PAGE(page_count(page), page); |
| 236 | 263 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
| 237 | 264 | |
| 238 | | - lruvec = mem_cgroup_lruvec(pgdat, memcg); |
| 239 | | - eviction = atomic_long_inc_return(&lruvec->inactive_age); |
| | 265 | + lruvec = mem_cgroup_lruvec(target_memcg, pgdat); |
| | 266 | + workingset_age_nonresident(lruvec, thp_nr_pages(page)); |
| | 267 | + /* XXX: target_memcg can be NULL, go through lruvec */ |
| | 268 | + memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); |
| | 269 | + eviction = atomic_long_read(&lruvec->nonresident_age); |
| 240 | 270 | return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); |
| 241 | 271 | } |
| 242 | 272 | |
| .. | .. |
| 246 | 276 | * @shadow: shadow entry of the evicted page |
| 247 | 277 | * |
| 248 | 278 | * Calculates and evaluates the refault distance of the previously |
| 249 | | - * evicted page in the context of the node it was allocated in. |
| | 279 | + * evicted page in the context of the node and the memcg whose memory |
| | 280 | + * pressure caused the eviction. |
| 250 | 281 | */ |
| 251 | 282 | void workingset_refault(struct page *page, void *shadow) |
| 252 | 283 | { |
| | 284 | + bool file = page_is_file_lru(page); |
| | 285 | + struct mem_cgroup *eviction_memcg; |
| | 286 | + struct lruvec *eviction_lruvec; |
| 253 | 287 | unsigned long refault_distance; |
| | 288 | + unsigned long workingset_size; |
| 254 | 289 | struct pglist_data *pgdat; |
| 255 | | - unsigned long active_file; |
| 256 | 290 | struct mem_cgroup *memcg; |
| 257 | 291 | unsigned long eviction; |
| 258 | 292 | struct lruvec *lruvec; |
| .. | .. |
| 279 | 313 | * would be better if the root_mem_cgroup existed in all |
| 280 | 314 | * configurations instead. |
| 281 | 315 | */ |
| 282 | | - memcg = mem_cgroup_from_id(memcgid); |
| 283 | | - if (!mem_cgroup_disabled() && !memcg) |
| | 316 | + eviction_memcg = mem_cgroup_from_id(memcgid); |
| | 317 | + if (!mem_cgroup_disabled() && !eviction_memcg) |
| 284 | 318 | goto out; |
| 285 | | - lruvec = mem_cgroup_lruvec(pgdat, memcg); |
| 286 | | - refault = atomic_long_read(&lruvec->inactive_age); |
| 287 | | - active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); |
| | 319 | + eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); |
| | 320 | + refault = atomic_long_read(&eviction_lruvec->nonresident_age); |
| 288 | 321 | |
| 289 | 322 | /* |
| 290 | 323 | * Calculate the refault distance |
| 291 | 324 | * |
| 292 | 325 | * The unsigned subtraction here gives an accurate distance |
| 293 | | - * across inactive_age overflows in most cases. There is a |
| | 326 | + * across nonresident_age overflows in most cases. There is a |
| 294 | 327 | * special case: usually, shadow entries have a short lifetime |
| 295 | 328 | * and are either refaulted or reclaimed along with the inode |
| 296 | 329 | * before they get too old. But it is not impossible for the |
| 297 | | - * inactive_age to lap a shadow entry in the field, which can |
| 298 | | - * then result in a false small refault distance, leading to a |
| 299 | | - * false activation should this old entry actually refault |
| 300 | | - * again. However, earlier kernels used to deactivate |
| | 330 | + * nonresident_age to lap a shadow entry in the field, which |
| | 331 | + * can then result in a false small refault distance, leading |
| | 332 | + * to a false activation should this old entry actually |
| | 333 | + * refault again. However, earlier kernels used to deactivate |
| 301 | 334 | * unconditionally with *every* reclaim invocation for the |
| 302 | 335 | * longest time, so the occasional inappropriate activation |
| 303 | 336 | * leading to pressure on the active list is not a problem. |
| 304 | 337 | */ |
| 305 | 338 | refault_distance = (refault - eviction) & EVICTION_MASK; |
| 306 | 339 | |
| 307 | | - inc_lruvec_state(lruvec, WORKINGSET_REFAULT); |
| | 340 | + /* |
| | 341 | + * The activation decision for this page is made at the level |
| | 342 | + * where the eviction occurred, as that is where the LRU order |
| | 343 | + * during page reclaim is being determined. |
| | 344 | + * |
| | 345 | + * However, the cgroup that will own the page is the one that |
| | 346 | + * is actually experiencing the refault event. |
| | 347 | + */ |
| | 348 | + memcg = page_memcg(page); |
| | 349 | + lruvec = mem_cgroup_lruvec(memcg, pgdat); |
| | 350 | + |
| | 351 | + inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file); |
| 308 | 352 | |
| 309 | 353 | /* |
| 310 | 354 | * Compare the distance to the existing workingset size. We |
| 311 | | - * don't act on pages that couldn't stay resident even if all |
| 312 | | - * the memory was available to the page cache. |
| | 355 | + * don't activate pages that couldn't stay resident even if |
| | 356 | + * all the memory was available to the workingset. Whether |
| | 357 | + * workingset competition needs to consider anon or not depends |
| | 358 | + * on having swap. |
| 313 | 359 | */ |
| 314 | | - if (refault_distance > active_file) |
| | 360 | + workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE); |
| | 361 | + if (!file) { |
| | 362 | + workingset_size += lruvec_page_state(eviction_lruvec, |
| | 363 | + NR_INACTIVE_FILE); |
| | 364 | + } |
| | 365 | + if (mem_cgroup_get_nr_swap_pages(memcg) > 0) { |
| | 366 | + workingset_size += lruvec_page_state(eviction_lruvec, |
| | 367 | + NR_ACTIVE_ANON); |
| | 368 | + if (file) { |
| | 369 | + workingset_size += lruvec_page_state(eviction_lruvec, |
| | 370 | + NR_INACTIVE_ANON); |
| | 371 | + } |
| | 372 | + } |
| | 373 | + if (refault_distance > workingset_size) |
| 315 | 374 | goto out; |
| 316 | 375 | |
| 317 | 376 | SetPageActive(page); |
| 318 | | - atomic_long_inc(&lruvec->inactive_age); |
| 319 | | - inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); |
| | 377 | + workingset_age_nonresident(lruvec, thp_nr_pages(page)); |
| | 378 | + inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file); |
| 320 | 379 | |
| 321 | 380 | /* Page was active prior to eviction */ |
| 322 | 381 | if (workingset) { |
| 323 | 382 | SetPageWorkingset(page); |
| 324 | | - inc_lruvec_state(lruvec, WORKINGSET_RESTORE); |
| | 383 | + /* XXX: Move to lru_cache_add() when it supports new vs putback */ |
| | 384 | + spin_lock_irq(&page_pgdat(page)->lru_lock); |
| | 385 | + lru_note_cost_page(page); |
| | 386 | + spin_unlock_irq(&page_pgdat(page)->lru_lock); |
| | 387 | + inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file); |
| 325 | 388 | } |
| 326 | 389 | out: |
| 327 | 390 | rcu_read_unlock(); |
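The hunk above replaces the old "compare against the active file list" test with a workingset size that depends on the refaulting page's type and on swap availability. A condensed restatement of that rule, as an illustrative helper rather than kernel code:

```c
#include <stdbool.h>

/*
 * Illustrative restatement of the activation test above: a refaulting
 * page is activated when its refault distance fits within the memory
 * it actually competes with. Its own inactive list is excluded (it
 * competes with those pages directly), and anon is only counted when
 * swap is available to reclaim it.
 */
static bool refault_within_workingset(bool file, bool have_swap,
				      unsigned long refault_distance,
				      unsigned long active_file,
				      unsigned long inactive_file,
				      unsigned long active_anon,
				      unsigned long inactive_anon)
{
	unsigned long workingset_size = active_file;

	if (!file)
		workingset_size += inactive_file;
	if (have_swap) {
		workingset_size += active_anon;
		if (file)
			workingset_size += inactive_anon;
	}
	return refault_distance <= workingset_size;
}
```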
| .. | .. |
| 347 | 410 | memcg = page_memcg_rcu(page); |
| 348 | 411 | if (!mem_cgroup_disabled() && !memcg) |
| 349 | 412 | goto out; |
| 350 | | - lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); |
| 351 | | - atomic_long_inc(&lruvec->inactive_age); |
| | 413 | + lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); |
| | 414 | + workingset_age_nonresident(lruvec, thp_nr_pages(page)); |
| 352 | 415 | out: |
| 353 | 416 | rcu_read_unlock(); |
| 354 | 417 | } |
| .. | .. |
| 367 | 430 | |
| 368 | 431 | static struct list_lru shadow_nodes; |
| 369 | 432 | |
| 370 | | -void workingset_update_node(struct radix_tree_node *node) |
| | 433 | +void workingset_update_node(struct xa_node *node) |
| 371 | 434 | { |
| 372 | 435 | /* |
| 373 | 436 | * Track non-empty nodes that contain only shadow entries; |
| .. | .. |
| 377 | 440 | * already where they should be. The list_empty() test is safe |
| 378 | 441 | * as node->private_list is protected by the i_pages lock. |
| 379 | 442 | */ |
| 380 | | - if (node->count && node->count == node->exceptional) { |
| 381 | | - if (list_empty(&node->private_list)) |
| | 443 | + VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */ |
| | 444 | + |
| | 445 | + if (node->count && node->count == node->nr_values) { |
| | 446 | + if (list_empty(&node->private_list)) { |
| 382 | 447 | list_lru_add(&shadow_nodes, &node->private_list); |
| | 448 | + __inc_lruvec_slab_state(node, WORKINGSET_NODES); |
| | 449 | + } |
| 383 | 450 | } else { |
| 384 | | - if (!list_empty(&node->private_list)) |
| | 451 | + if (!list_empty(&node->private_list)) { |
| 385 | 452 | list_lru_del(&shadow_nodes, &node->private_list); |
| | 453 | + __dec_lruvec_slab_state(node, WORKINGSET_NODES); |
| | 454 | + } |
| 386 | 455 | } |
| 387 | 456 | } |
| 388 | 457 | |
| .. | .. |
| 391 | 460 | { |
| 392 | 461 | unsigned long max_nodes; |
| 393 | 462 | unsigned long nodes; |
| 394 | | - unsigned long cache; |
| | 463 | + unsigned long pages; |
| 395 | 464 | |
| 396 | 465 | nodes = list_lru_shrink_count(&shadow_nodes, sc); |
| 397 | 466 | |
| 398 | 467 | /* |
| 399 | | - * Approximate a reasonable limit for the radix tree nodes |
| | 468 | + * Approximate a reasonable limit for the nodes |
| 400 | 469 | * containing shadow entries. We don't need to keep more |
| 401 | 470 | * shadow entries than possible pages on the active list, |
| 402 | 471 | * since refault distances bigger than that are dismissed. |
| .. | .. |
| 411 | 480 | * worst-case density of 1/8th. Below that, not all eligible |
| 412 | 481 | * refaults can be detected anymore. |
| 413 | 482 | * |
| 414 | | - * On 64-bit with 7 radix_tree_nodes per page and 64 slots |
| | 483 | + * On 64-bit with 7 xa_nodes per page and 64 slots |
| 415 | 484 | * each, this will reclaim shadow entries when they consume |
| 416 | 485 | * ~1.8% of available memory: |
| 417 | 486 | * |
| 418 | | - * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE |
| | 487 | + * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE |
| 419 | 488 | */ |
| | 489 | +#ifdef CONFIG_MEMCG |
| 420 | 490 | if (sc->memcg) { |
| 421 | | - cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, |
| 422 | | - LRU_ALL_FILE); |
| 423 | | - } else { |
| 424 | | - cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + |
| 425 | | - node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); |
| 426 | | - } |
| 427 | | - max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3); |
| | 491 | + struct lruvec *lruvec; |
| | 492 | + int i; |
| | 493 | + |
| | 494 | + lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); |
| | 495 | + for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) |
| | 496 | + pages += lruvec_page_state_local(lruvec, |
| | 497 | + NR_LRU_BASE + i); |
| | 498 | + pages += lruvec_page_state_local( |
| | 499 | + lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT; |
| | 500 | + pages += lruvec_page_state_local( |
| | 501 | + lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT; |
| | 502 | + } else |
| | 503 | +#endif |
| | 504 | + pages = node_present_pages(sc->nid); |
| | 505 | + |
| | 506 | + max_nodes = pages >> (XA_CHUNK_SHIFT - 3); |
| 428 | 507 | |
| 429 | 508 | if (!nodes) |
| 430 | 509 | return SHRINK_EMPTY; |
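For scale, a worked example of the limit computed above (assuming 4 KiB pages, XA_CHUNK_SHIFT = 6, and the roughly 7 xa_nodes per page mentioned in the comment): with 1 GiB of pages on the node or cgroup, pages = 262144, so max_nodes = 262144 >> 3 = 32768 shadow nodes. At about 7 nodes per page those nodes occupy around 4681 pages, roughly 18 MiB, which is the ~1.8% of available memory the comment refers to.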
| .. | .. |
| 437 | 516 | static enum lru_status shadow_lru_isolate(struct list_head *item, |
| 438 | 517 | struct list_lru_one *lru, |
| 439 | 518 | spinlock_t *lru_lock, |
| 440 | | - void *arg) |
| | 519 | + void *arg) __must_hold(lru_lock) |
| 441 | 520 | { |
| | 521 | + struct xa_node *node = container_of(item, struct xa_node, private_list); |
| 442 | 522 | struct address_space *mapping; |
| 443 | | - struct radix_tree_node *node; |
| 444 | | - unsigned int i; |
| 445 | 523 | int ret; |
| 446 | 524 | |
| 447 | 525 | /* |
| 448 | | - * Page cache insertions and deletions synchroneously maintain |
| | 526 | + * Page cache insertions and deletions synchronously maintain |
| 449 | 527 | * the shadow node LRU under the i_pages lock and the |
| 450 | 528 | * lru_lock. Because the page cache tree is emptied before |
| 451 | 529 | * the inode can be destroyed, holding the lru_lock pins any |
| 452 | | - * address_space that has radix tree nodes on the LRU. |
| | 530 | + * address_space that has nodes on the LRU. |
| 453 | 531 | * |
| 454 | 532 | * We can then safely transition to the i_pages lock to |
| 455 | 533 | * pin only the address_space of the particular node we want |
| 456 | 534 | * to reclaim, take the node off-LRU, and drop the lru_lock. |
| 457 | 535 | */ |
| 458 | 536 | |
| 459 | | - node = container_of(item, struct radix_tree_node, private_list); |
| 460 | | - mapping = container_of(node->root, struct address_space, i_pages); |
| | 537 | + mapping = container_of(node->array, struct address_space, i_pages); |
| 461 | 538 | |
| 462 | 539 | /* Coming from the list, invert the lock order */ |
| 463 | 540 | if (!xa_trylock(&mapping->i_pages)) { |
| .. | .. |
| 467 | 544 | } |
| 468 | 545 | |
| 469 | 546 | list_lru_isolate(lru, item); |
| | 547 | + __dec_lruvec_slab_state(node, WORKINGSET_NODES); |
| | 548 | + |
| 470 | 549 | spin_unlock(lru_lock); |
| 471 | 550 | |
| 472 | 551 | /* |
| .. | .. |
| 474 | 553 | * no pages, so we expect to be able to remove them all and |
| 475 | 554 | * delete and free the empty node afterwards. |
| 476 | 555 | */ |
| 477 | | - if (WARN_ON_ONCE(!node->exceptional)) |
| | 556 | + if (WARN_ON_ONCE(!node->nr_values)) |
| 478 | 557 | goto out_invalid; |
| 479 | | - if (WARN_ON_ONCE(node->count != node->exceptional)) |
| | 558 | + if (WARN_ON_ONCE(node->count != node->nr_values)) |
| 480 | 559 | goto out_invalid; |
| 481 | | - for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { |
| 482 | | - if (node->slots[i]) { |
| 483 | | - if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i]))) |
| 484 | | - goto out_invalid; |
| 485 | | - if (WARN_ON_ONCE(!node->exceptional)) |
| 486 | | - goto out_invalid; |
| 487 | | - if (WARN_ON_ONCE(!mapping->nrexceptional)) |
| 488 | | - goto out_invalid; |
| 489 | | - node->slots[i] = NULL; |
| 490 | | - node->exceptional--; |
| 491 | | - node->count--; |
| 492 | | - mapping->nrexceptional--; |
| 493 | | - } |
| 494 | | - } |
| 495 | | - if (WARN_ON_ONCE(node->exceptional)) |
| 496 | | - goto out_invalid; |
| 497 | | - inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); |
| 498 | | - __radix_tree_delete_node(&mapping->i_pages, node, |
| 499 | | - workingset_lookup_update(mapping)); |
| | 560 | + mapping->nrexceptional -= node->nr_values; |
| | 561 | + xa_delete_node(node, workingset_update_node); |
| | 562 | + __inc_lruvec_slab_state(node, WORKINGSET_NODERECLAIM); |
| 500 | 563 | |
| 501 | 564 | out_invalid: |
| 502 | 565 | xa_unlock_irq(&mapping->i_pages); |
| .. | .. |
| 518 | 581 | static struct shrinker workingset_shadow_shrinker = { |
| 519 | 582 | .count_objects = count_shadow_nodes, |
| 520 | 583 | .scan_objects = scan_shadow_nodes, |
| 521 | | - .seeks = DEFAULT_SEEKS, |
| | 584 | + .seeks = 0, /* ->count reports only fully expendable nodes */ |
| 522 | 585 | .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, |
| 523 | 586 | }; |
| 524 | 587 | |
| .. | .. |
| 543 | 606 | * double the initial memory by using totalram_pages as-is. |
| 544 | 607 | */ |
| 545 | 608 | timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; |
| 546 | | - max_order = fls_long(totalram_pages - 1); |
| | 609 | + max_order = fls_long(totalram_pages() - 1); |
| 547 | 610 | if (max_order > timestamp_bits) |
| 548 | 611 | bucket_order = max_order - timestamp_bits; |
| 549 | 612 | pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", |
|---|