.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
---|
1 | 2 | /* |
---|
2 | 3 | * linux/mm/filemap.c |
---|
3 | 4 | * |
---|
.. | .. |
---|
24 | 25 | #include <linux/pagemap.h> |
---|
25 | 26 | #include <linux/file.h> |
---|
26 | 27 | #include <linux/uio.h> |
---|
| 28 | +#include <linux/error-injection.h> |
---|
27 | 29 | #include <linux/hash.h> |
---|
28 | 30 | #include <linux/writeback.h> |
---|
29 | 31 | #include <linux/backing-dev.h> |
---|
.. | .. |
---|
38 | 40 | #include <linux/rmap.h> |
---|
39 | 41 | #include <linux/delayacct.h> |
---|
40 | 42 | #include <linux/psi.h> |
---|
| 43 | +#include <linux/ramfs.h> |
---|
| 44 | +#include <linux/page_idle.h> |
---|
| 45 | +#include <asm/pgalloc.h> |
---|
| 46 | +#include <asm/tlbflush.h> |
---|
41 | 47 | #include "internal.h" |
---|
42 | 48 | |
---|
43 | 49 | #define CREATE_TRACE_POINTS |
---|
44 | 50 | #include <trace/events/filemap.h> |
---|
| 51 | + |
---|
| 52 | +#undef CREATE_TRACE_POINTS |
---|
| 53 | +#include <trace/hooks/mm.h> |
---|
45 | 54 | |
---|
46 | 55 | /* |
---|
47 | 56 | * FIXME: remove all knowledge of the buffer layer from the core VM |
---|
.. | .. |
---|
73 | 82 | * ->i_mutex |
---|
74 | 83 | * ->i_mmap_rwsem (truncate->unmap_mapping_range) |
---|
75 | 84 | * |
---|
76 | | - * ->mmap_sem |
---|
| 85 | + * ->mmap_lock |
---|
77 | 86 | * ->i_mmap_rwsem |
---|
78 | 87 | * ->page_table_lock or pte_lock (various, mainly in memory.c) |
---|
79 | 88 | * ->i_pages lock (arch-dependent flush_dcache_mmap_lock) |
---|
80 | 89 | * |
---|
81 | | - * ->mmap_sem |
---|
| 90 | + * ->mmap_lock |
---|
82 | 91 | * ->lock_page (access_process_vm) |
---|
83 | 92 | * |
---|
84 | 93 | * ->i_mutex (generic_perform_write) |
---|
85 | | - * ->mmap_sem (fault_in_pages_readable->do_page_fault) |
---|
| 94 | + * ->mmap_lock (fault_in_pages_readable->do_page_fault) |
---|
86 | 95 | * |
---|
87 | 96 | * bdi->wb.list_lock |
---|
88 | 97 | * sb_lock (fs/fs-writeback.c) |
---|
.. | .. |
---|
98 | 107 | * ->swap_lock (try_to_unmap_one) |
---|
99 | 108 | * ->private_lock (try_to_unmap_one) |
---|
100 | 109 | * ->i_pages lock (try_to_unmap_one) |
---|
101 | | - * ->zone_lru_lock(zone) (follow_page->mark_page_accessed) |
---|
102 | | - * ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page) |
---|
| 110 | + * ->pgdat->lru_lock (follow_page->mark_page_accessed) |
---|
| 111 | + * ->pgdat->lru_lock (check_pte_range->isolate_lru_page) |
---|
103 | 112 | * ->private_lock (page_remove_rmap->set_page_dirty) |
---|
104 | 113 | * ->i_pages lock (page_remove_rmap->set_page_dirty) |
---|
105 | 114 | * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) |
---|
.. | .. |
---|
113 | 122 | * ->tasklist_lock (memory_failure, collect_procs_ao) |
---|
114 | 123 | */ |
---|
115 | 124 | |
---|
116 | | -static int page_cache_tree_insert(struct address_space *mapping, |
---|
117 | | - struct page *page, void **shadowp) |
---|
118 | | -{ |
---|
119 | | - struct radix_tree_node *node; |
---|
120 | | - void **slot; |
---|
121 | | - int error; |
---|
122 | | - |
---|
123 | | - error = __radix_tree_create(&mapping->i_pages, page->index, 0, |
---|
124 | | - &node, &slot); |
---|
125 | | - if (error) |
---|
126 | | - return error; |
---|
127 | | - if (*slot) { |
---|
128 | | - void *p; |
---|
129 | | - |
---|
130 | | - p = radix_tree_deref_slot_protected(slot, |
---|
131 | | - &mapping->i_pages.xa_lock); |
---|
132 | | - if (!radix_tree_exceptional_entry(p)) |
---|
133 | | - return -EEXIST; |
---|
134 | | - |
---|
135 | | - mapping->nrexceptional--; |
---|
136 | | - if (shadowp) |
---|
137 | | - *shadowp = p; |
---|
138 | | - } |
---|
139 | | - __radix_tree_replace(&mapping->i_pages, node, slot, page, |
---|
140 | | - workingset_lookup_update(mapping)); |
---|
141 | | - mapping->nrpages++; |
---|
142 | | - return 0; |
---|
143 | | -} |
---|
144 | | - |
---|
145 | | -static void page_cache_tree_delete(struct address_space *mapping, |
---|
| 125 | +static void page_cache_delete(struct address_space *mapping, |
---|
146 | 126 | struct page *page, void *shadow) |
---|
147 | 127 | { |
---|
148 | | - int i, nr; |
---|
| 128 | + XA_STATE(xas, &mapping->i_pages, page->index); |
---|
| 129 | + unsigned int nr = 1; |
---|
149 | 130 | |
---|
150 | | - /* hugetlb pages are represented by one entry in the radix tree */ |
---|
151 | | - nr = PageHuge(page) ? 1 : hpage_nr_pages(page); |
---|
| 131 | + mapping_set_update(&xas, mapping); |
---|
| 132 | + |
---|
| 133 | + /* hugetlb pages are represented by a single entry in the xarray */ |
---|
| 134 | + if (!PageHuge(page)) { |
---|
| 135 | + xas_set_order(&xas, page->index, compound_order(page)); |
---|
| 136 | + nr = compound_nr(page); |
---|
| 137 | + } |
---|
152 | 138 | |
---|
153 | 139 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
---|
154 | 140 | VM_BUG_ON_PAGE(PageTail(page), page); |
---|
155 | 141 | VM_BUG_ON_PAGE(nr != 1 && shadow, page); |
---|
156 | 142 | |
---|
157 | | - for (i = 0; i < nr; i++) { |
---|
158 | | - struct radix_tree_node *node; |
---|
159 | | - void **slot; |
---|
160 | | - |
---|
161 | | - __radix_tree_lookup(&mapping->i_pages, page->index + i, |
---|
162 | | - &node, &slot); |
---|
163 | | - |
---|
164 | | - VM_BUG_ON_PAGE(!node && nr != 1, page); |
---|
165 | | - |
---|
166 | | - radix_tree_clear_tags(&mapping->i_pages, node, slot); |
---|
167 | | - __radix_tree_replace(&mapping->i_pages, node, slot, shadow, |
---|
168 | | - workingset_lookup_update(mapping)); |
---|
169 | | - } |
---|
| 143 | + xas_store(&xas, shadow); |
---|
| 144 | + xas_init_marks(&xas); |
---|
170 | 145 | |
---|
171 | 146 | page->mapping = NULL; |
---|
172 | 147 | /* Leave page->index set: truncation lookup relies upon it */ |
---|
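Here the radix-tree walk over every sub-page is gone: page_cache_delete() sets the xa_state to the page's compound order, so a single xas_store() replaces the whole range with the shadow entry and xas_init_marks() clears any tags. The core of that pattern, pulled out as a standalone sketch (hypothetical cache_delete() helper; the patch itself runs with the i_pages lock already held):

```c
/* Sketch only: replace a (possibly multi-order) entry with a shadow value. */
static void cache_delete(struct xarray *xa, unsigned long index,
			 unsigned int order, void *shadow)
{
	XA_STATE(xas, xa, index);

	xas_set_order(&xas, index, order);	/* entry spans 1 << order indices */
	xas_lock_irq(&xas);
	xas_store(&xas, shadow);		/* one store covers the whole range */
	xas_init_marks(&xas);			/* drop dirty/writeback marks */
	xas_unlock_irq(&xas);
}
```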
.. | .. |
---|
194 | 169 | * invalidate any existing cleancache entries. We can't leave |
---|
195 | 170 | * stale data around in the cleancache once our page is gone |
---|
196 | 171 | */ |
---|
197 | | - if (PageUptodate(page) && PageMappedToDisk(page)) { |
---|
198 | | - count_vm_event(PGPGOUTCLEAN); |
---|
| 172 | + if (PageUptodate(page) && PageMappedToDisk(page)) |
---|
199 | 173 | cleancache_put_page(page); |
---|
200 | | - } else { |
---|
| 174 | + else |
---|
201 | 175 | cleancache_invalidate_page(mapping, page); |
---|
202 | | - } |
---|
203 | 176 | |
---|
204 | 177 | VM_BUG_ON_PAGE(PageTail(page), page); |
---|
205 | 178 | VM_BUG_ON_PAGE(page_mapped(page), page); |
---|
.. | .. |
---|
230 | 203 | if (PageHuge(page)) |
---|
231 | 204 | return; |
---|
232 | 205 | |
---|
233 | | - nr = hpage_nr_pages(page); |
---|
| 206 | + nr = thp_nr_pages(page); |
---|
234 | 207 | |
---|
235 | | - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); |
---|
| 208 | + __mod_lruvec_page_state(page, NR_FILE_PAGES, -nr); |
---|
236 | 209 | if (PageSwapBacked(page)) { |
---|
237 | | - __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); |
---|
| 210 | + __mod_lruvec_page_state(page, NR_SHMEM, -nr); |
---|
238 | 211 | if (PageTransHuge(page)) |
---|
239 | 212 | __dec_node_page_state(page, NR_SHMEM_THPS); |
---|
240 | | - } else { |
---|
241 | | - VM_BUG_ON_PAGE(PageTransHuge(page), page); |
---|
| 213 | + } else if (PageTransHuge(page)) { |
---|
| 214 | + __dec_node_page_state(page, NR_FILE_THPS); |
---|
| 215 | + filemap_nr_thps_dec(mapping); |
---|
242 | 216 | } |
---|
243 | 217 | |
---|
244 | 218 | /* |
---|
.. | .. |
---|
267 | 241 | trace_mm_filemap_delete_from_page_cache(page); |
---|
268 | 242 | |
---|
269 | 243 | unaccount_page_cache_page(mapping, page); |
---|
270 | | - page_cache_tree_delete(mapping, page, shadow); |
---|
| 244 | + page_cache_delete(mapping, page, shadow); |
---|
271 | 245 | } |
---|
272 | 246 | |
---|
273 | 247 | static void page_cache_free_page(struct address_space *mapping, |
---|
.. | .. |
---|
280 | 254 | freepage(page); |
---|
281 | 255 | |
---|
282 | 256 | if (PageTransHuge(page) && !PageHuge(page)) { |
---|
283 | | - page_ref_sub(page, HPAGE_PMD_NR); |
---|
| 257 | + page_ref_sub(page, thp_nr_pages(page)); |
---|
284 | 258 | VM_BUG_ON_PAGE(page_count(page) <= 0, page); |
---|
285 | 259 | } else { |
---|
286 | 260 | put_page(page); |
---|
.. | .. |
---|
310 | 284 | EXPORT_SYMBOL(delete_from_page_cache); |
---|
311 | 285 | |
---|
312 | 286 | /* |
---|
313 | | - * page_cache_tree_delete_batch - delete several pages from page cache |
---|
| 287 | + * page_cache_delete_batch - delete several pages from page cache |
---|
314 | 288 | * @mapping: the mapping to which pages belong |
---|
315 | 289 | * @pvec: pagevec with pages to delete |
---|
316 | 290 | * |
---|
317 | 291 | * The function walks over mapping->i_pages and removes pages passed in @pvec |
---|
318 | | - * from the mapping. The function expects @pvec to be sorted by page index. |
---|
| 292 | + * from the mapping. The function expects @pvec to be sorted by page index |
---|
| 293 | + * and is optimised for it to be dense. |
---|
319 | 294 | * It tolerates holes in @pvec (mapping entries at those indices are not |
---|
320 | 295 | * modified). The function expects only THP head pages to be present in the |
---|
321 | | - * @pvec and takes care to delete all corresponding tail pages from the |
---|
322 | | - * mapping as well. |
---|
| 296 | + * @pvec. |
---|
323 | 297 | * |
---|
324 | 298 | * The function expects the i_pages lock to be held. |
---|
325 | 299 | */ |
---|
326 | | -static void |
---|
327 | | -page_cache_tree_delete_batch(struct address_space *mapping, |
---|
| 300 | +static void page_cache_delete_batch(struct address_space *mapping, |
---|
328 | 301 | struct pagevec *pvec) |
---|
329 | 302 | { |
---|
330 | | - struct radix_tree_iter iter; |
---|
331 | | - void **slot; |
---|
| 303 | + XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index); |
---|
332 | 304 | int total_pages = 0; |
---|
333 | | - int i = 0, tail_pages = 0; |
---|
| 305 | + int i = 0; |
---|
334 | 306 | struct page *page; |
---|
335 | | - pgoff_t start; |
---|
336 | 307 | |
---|
337 | | - start = pvec->pages[0]->index; |
---|
338 | | - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { |
---|
339 | | - if (i >= pagevec_count(pvec) && !tail_pages) |
---|
| 308 | + mapping_set_update(&xas, mapping); |
---|
| 309 | + xas_for_each(&xas, page, ULONG_MAX) { |
---|
| 310 | + if (i >= pagevec_count(pvec)) |
---|
340 | 311 | break; |
---|
341 | | - page = radix_tree_deref_slot_protected(slot, |
---|
342 | | - &mapping->i_pages.xa_lock); |
---|
343 | | - if (radix_tree_exceptional_entry(page)) |
---|
| 312 | + |
---|
| 313 | + /* A swap/dax/shadow entry got inserted? Skip it. */ |
---|
| 314 | + if (xa_is_value(page)) |
---|
344 | 315 | continue; |
---|
345 | | - if (!tail_pages) { |
---|
346 | | - /* |
---|
347 | | - * Some page got inserted in our range? Skip it. We |
---|
348 | | - * have our pages locked so they are protected from |
---|
349 | | - * being removed. |
---|
350 | | - */ |
---|
351 | | - if (page != pvec->pages[i]) |
---|
352 | | - continue; |
---|
353 | | - WARN_ON_ONCE(!PageLocked(page)); |
---|
354 | | - if (PageTransHuge(page) && !PageHuge(page)) |
---|
355 | | - tail_pages = HPAGE_PMD_NR - 1; |
---|
356 | | - page->mapping = NULL; |
---|
357 | | - /* |
---|
358 | | - * Leave page->index set: truncation lookup relies |
---|
359 | | - * upon it |
---|
360 | | - */ |
---|
361 | | - i++; |
---|
362 | | - } else { |
---|
363 | | - tail_pages--; |
---|
| 316 | + /* |
---|
| 317 | + * A page got inserted in our range? Skip it. We have our |
---|
| 318 | + * pages locked so they are protected from being removed. |
---|
| 319 | + * If we see a page whose index is higher than ours, it |
---|
| 320 | + * means our page has been removed, which shouldn't be |
---|
| 321 | + * possible because we're holding the PageLock. |
---|
| 322 | + */ |
---|
| 323 | + if (page != pvec->pages[i]) { |
---|
| 324 | + VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index, |
---|
| 325 | + page); |
---|
| 326 | + continue; |
---|
364 | 327 | } |
---|
365 | | - radix_tree_clear_tags(&mapping->i_pages, iter.node, slot); |
---|
366 | | - __radix_tree_replace(&mapping->i_pages, iter.node, slot, NULL, |
---|
367 | | - workingset_lookup_update(mapping)); |
---|
| 328 | + |
---|
| 329 | + WARN_ON_ONCE(!PageLocked(page)); |
---|
| 330 | + |
---|
| 331 | + if (page->index == xas.xa_index) |
---|
| 332 | + page->mapping = NULL; |
---|
| 333 | + /* Leave page->index set: truncation lookup relies on it */ |
---|
| 334 | + |
---|
| 335 | + /* |
---|
| 336 | + * Move to the next page in the vector if this is a regular |
---|
| 337 | + * page or the index is of the last sub-page of this compound |
---|
| 338 | + * page. |
---|
| 339 | + */ |
---|
| 340 | + if (page->index + compound_nr(page) - 1 == xas.xa_index) |
---|
| 341 | + i++; |
---|
| 342 | + xas_store(&xas, NULL); |
---|
368 | 343 | total_pages++; |
---|
369 | 344 | } |
---|
370 | 345 | mapping->nrpages -= total_pages; |
---|
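The batch variant follows the same conversion: xas_for_each() walks the populated indices, value entries are skipped, and xas_store(&xas, NULL) erases each slot, so the old tail_pages bookkeeping disappears. The bare iteration pattern as a sketch (hypothetical helper; unlike the patch, it takes the lock itself):

```c
/* Sketch: clear every present entry up to @max, leaving value entries alone. */
static unsigned int cache_clear_range(struct xarray *xa, unsigned long first,
				      unsigned long max)
{
	XA_STATE(xas, xa, first);
	unsigned int cleared = 0;
	void *entry;

	xas_lock_irq(&xas);
	xas_for_each(&xas, entry, max) {
		if (xa_is_value(entry))		/* swap/DAX/shadow entry: skip */
			continue;
		xas_store(&xas, NULL);		/* erase at the current index */
		cleared++;
	}
	xas_unlock_irq(&xas);
	return cleared;
}
```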
.. | .. |
---|
385 | 360 | |
---|
386 | 361 | unaccount_page_cache_page(mapping, pvec->pages[i]); |
---|
387 | 362 | } |
---|
388 | | - page_cache_tree_delete_batch(mapping, pvec); |
---|
| 363 | + page_cache_delete_batch(mapping, pvec); |
---|
389 | 364 | xa_unlock_irqrestore(&mapping->i_pages, flags); |
---|
390 | 365 | |
---|
391 | 366 | for (i = 0; i < pagevec_count(pvec); i++) |
---|
.. | .. |
---|
430 | 405 | * opposed to a regular memory cleansing writeback. The difference between |
---|
431 | 406 | * these two operations is that if a dirty page/buffer is encountered, it must |
---|
432 | 407 | * be waited upon, and not just skipped over. |
---|
| 408 | + * |
---|
| 409 | + * Return: %0 on success, negative error code otherwise. |
---|
433 | 410 | */ |
---|
434 | 411 | int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, |
---|
435 | 412 | loff_t end, int sync_mode) |
---|
.. | .. |
---|
442 | 419 | .range_end = end, |
---|
443 | 420 | }; |
---|
444 | 421 | |
---|
445 | | - if (!mapping_cap_writeback_dirty(mapping) || |
---|
| 422 | + if (!mapping_can_writeback(mapping) || |
---|
446 | 423 | !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) |
---|
447 | 424 | return 0; |
---|
448 | 425 | |
---|
.. | .. |
---|
477 | 454 | * |
---|
478 | 455 | * This is a mostly non-blocking flush. Not suitable for data-integrity |
---|
479 | 456 | * purposes - I/O may not be started against all dirty pages. |
---|
| 457 | + * |
---|
| 458 | + * Return: %0 on success, negative error code otherwise. |
---|
480 | 459 | */ |
---|
481 | 460 | int filemap_flush(struct address_space *mapping) |
---|
482 | 461 | { |
---|
.. | .. |
---|
492 | 471 | * |
---|
493 | 472 | * Find at least one page in the range supplied, usually used to check if |
---|
494 | 473 | * direct writing in this range will trigger a writeback. |
---|
| 474 | + * |
---|
| 475 | + * Return: %true if at least one page exists in the specified range, |
---|
| 476 | + * %false otherwise. |
---|
495 | 477 | */ |
---|
496 | 478 | bool filemap_range_has_page(struct address_space *mapping, |
---|
497 | 479 | loff_t start_byte, loff_t end_byte) |
---|
498 | 480 | { |
---|
499 | | - pgoff_t index = start_byte >> PAGE_SHIFT; |
---|
500 | | - pgoff_t end = end_byte >> PAGE_SHIFT; |
---|
501 | 481 | struct page *page; |
---|
| 482 | + XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); |
---|
| 483 | + pgoff_t max = end_byte >> PAGE_SHIFT; |
---|
502 | 484 | |
---|
503 | 485 | if (end_byte < start_byte) |
---|
504 | 486 | return false; |
---|
505 | 487 | |
---|
506 | | - if (mapping->nrpages == 0) |
---|
507 | | - return false; |
---|
| 488 | + rcu_read_lock(); |
---|
| 489 | + for (;;) { |
---|
| 490 | + page = xas_find(&xas, max); |
---|
| 491 | + if (xas_retry(&xas, page)) |
---|
| 492 | + continue; |
---|
| 493 | + /* Shadow entries don't count */ |
---|
| 494 | + if (xa_is_value(page)) |
---|
| 495 | + continue; |
---|
| 496 | + /* |
---|
| 497 | + * We don't need to try to pin this page; we're about to |
---|
| 498 | + * release the RCU lock anyway. It is enough to know that |
---|
| 499 | + * there was a page here recently. |
---|
| 500 | + */ |
---|
| 501 | + break; |
---|
| 502 | + } |
---|
| 503 | + rcu_read_unlock(); |
---|
508 | 504 | |
---|
509 | | - if (!find_get_pages_range(mapping, &index, end, 1, &page)) |
---|
510 | | - return false; |
---|
511 | | - put_page(page); |
---|
512 | | - return true; |
---|
| 505 | + return page != NULL; |
---|
513 | 506 | } |
---|
514 | 507 | EXPORT_SYMBOL(filemap_range_has_page); |
---|
515 | 508 | |
---|
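filemap_range_has_page() no longer takes a page reference through find_get_pages_range(); it only needs to know that a page was present a moment ago, so an RCU-protected xas_find() probe is enough. The same probe pattern in isolation (hypothetical helper name):

```c
/* Sketch: report whether any non-value entry exists in [index, max]. */
static bool cache_range_populated(struct xarray *xa, unsigned long index,
				  unsigned long max)
{
	XA_STATE(xas, xa, index);
	void *entry;

	rcu_read_lock();
	for (;;) {
		entry = xas_find(&xas, max);
		if (xas_retry(&xas, entry))	/* raced with a node restructure */
			continue;
		if (xa_is_value(entry))		/* shadow/swap entries don't count */
			continue;
		break;				/* NULL (nothing found) or a real page */
	}
	rcu_read_unlock();

	return entry != NULL;
}
```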
.. | .. |
---|
557 | 550 | * Since the error status of the address space is cleared by this function, |
---|
558 | 551 | * callers are responsible for checking the return value and handling and/or |
---|
559 | 552 | * reporting the error. |
---|
| 553 | + * |
---|
| 554 | + * Return: error status of the address space. |
---|
560 | 555 | */ |
---|
561 | 556 | int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, |
---|
562 | 557 | loff_t end_byte) |
---|
.. | .. |
---|
601 | 596 | * Since the error status of the file is advanced by this function, |
---|
602 | 597 | * callers are responsible for checking the return value and handling and/or |
---|
603 | 598 | * reporting the error. |
---|
| 599 | + * |
---|
| 600 | + * Return: error status of the address space vs. the file->f_wb_err cursor. |
---|
604 | 601 | */ |
---|
605 | 602 | int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte) |
---|
606 | 603 | { |
---|
.. | .. |
---|
622 | 619 | * Use this function if callers don't handle errors themselves. Expected |
---|
623 | 620 | * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), |
---|
624 | 621 | * fsfreeze(8) |
---|
| 622 | + * |
---|
| 623 | + * Return: error status of the address space. |
---|
625 | 624 | */ |
---|
626 | 625 | int filemap_fdatawait_keep_errors(struct address_space *mapping) |
---|
627 | 626 | { |
---|
.. | .. |
---|
630 | 629 | } |
---|
631 | 630 | EXPORT_SYMBOL(filemap_fdatawait_keep_errors); |
---|
632 | 631 | |
---|
| 632 | +/* Returns true if writeback might be needed or already in progress. */ |
---|
633 | 633 | static bool mapping_needs_writeback(struct address_space *mapping) |
---|
634 | 634 | { |
---|
635 | | - return (!dax_mapping(mapping) && mapping->nrpages) || |
---|
636 | | - (dax_mapping(mapping) && mapping->nrexceptional); |
---|
637 | | -} |
---|
| 635 | + if (dax_mapping(mapping)) |
---|
| 636 | + return mapping->nrexceptional; |
---|
638 | 637 | |
---|
639 | | -int filemap_write_and_wait(struct address_space *mapping) |
---|
640 | | -{ |
---|
641 | | - int err = 0; |
---|
642 | | - |
---|
643 | | - if (mapping_needs_writeback(mapping)) { |
---|
644 | | - err = filemap_fdatawrite(mapping); |
---|
645 | | - /* |
---|
646 | | - * Even if the above returned error, the pages may be |
---|
647 | | - * written partially (e.g. -ENOSPC), so we wait for it. |
---|
648 | | - * But the -EIO is special case, it may indicate the worst |
---|
649 | | - * thing (e.g. bug) happened, so we avoid waiting for it. |
---|
650 | | - */ |
---|
651 | | - if (err != -EIO) { |
---|
652 | | - int err2 = filemap_fdatawait(mapping); |
---|
653 | | - if (!err) |
---|
654 | | - err = err2; |
---|
655 | | - } else { |
---|
656 | | - /* Clear any previously stored errors */ |
---|
657 | | - filemap_check_errors(mapping); |
---|
658 | | - } |
---|
659 | | - } else { |
---|
660 | | - err = filemap_check_errors(mapping); |
---|
661 | | - } |
---|
662 | | - return err; |
---|
| 638 | + return mapping->nrpages; |
---|
663 | 639 | } |
---|
664 | | -EXPORT_SYMBOL(filemap_write_and_wait); |
---|
665 | 640 | |
---|
666 | 641 | /** |
---|
667 | 642 | * filemap_write_and_wait_range - write out & wait on a file range |
---|
.. | .. |
---|
673 | 648 | * |
---|
674 | 649 | * Note that @lend is inclusive (describes the last byte to be written) so |
---|
675 | 650 | * that this function can be used to write to the very end-of-file (end = -1). |
---|
| 651 | + * |
---|
| 652 | + * Return: error status of the address space. |
---|
676 | 653 | */ |
---|
677 | 654 | int filemap_write_and_wait_range(struct address_space *mapping, |
---|
678 | 655 | loff_t lstart, loff_t lend) |
---|
.. | .. |
---|
682 | 659 | if (mapping_needs_writeback(mapping)) { |
---|
683 | 660 | err = __filemap_fdatawrite_range(mapping, lstart, lend, |
---|
684 | 661 | WB_SYNC_ALL); |
---|
685 | | - /* See comment of filemap_write_and_wait() */ |
---|
| 662 | + /* |
---|
| 663 | + * Even if the above returned error, the pages may be |
---|
| 664 | + * written partially (e.g. -ENOSPC), so we wait for it. |
---|
| 665 | + * But the -EIO is special case, it may indicate the worst |
---|
| 666 | + * thing (e.g. bug) happened, so we avoid waiting for it. |
---|
| 667 | + */ |
---|
686 | 668 | if (err != -EIO) { |
---|
687 | 669 | int err2 = filemap_fdatawait_range(mapping, |
---|
688 | 670 | lstart, lend); |
---|
.. | .. |
---|
728 | 710 | * While we handle mapping->wb_err with atomic operations, the f_wb_err |
---|
729 | 711 | * value is protected by the f_lock since we must ensure that it reflects |
---|
730 | 712 | * the latest value swapped in for this file descriptor. |
---|
| 713 | + * |
---|
| 714 | + * Return: %0 on success, negative error code otherwise. |
---|
731 | 715 | */ |
---|
732 | 716 | int file_check_and_advance_wb_err(struct file *file) |
---|
733 | 717 | { |
---|
.. | .. |
---|
770 | 754 | * |
---|
771 | 755 | * After writing out and waiting on the data, we check and advance the |
---|
772 | 756 | * f_wb_err cursor to the latest value, and return any errors detected there. |
---|
| 757 | + * |
---|
| 758 | + * Return: %0 on success, negative error code otherwise. |
---|
773 | 759 | */ |
---|
774 | 760 | int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) |
---|
775 | 761 | { |
---|
.. | .. |
---|
802 | 788 | * locked. This function does not add the new page to the LRU, the |
---|
803 | 789 | * caller must do that. |
---|
804 | 790 | * |
---|
805 | | - * The remove + add is atomic. The only way this function can fail is |
---|
806 | | - * memory allocation failure. |
---|
| 791 | + * The remove + add is atomic. This function cannot fail. |
---|
| 792 | + * |
---|
| 793 | + * Return: %0 |
---|
807 | 794 | */ |
---|
808 | 795 | int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) |
---|
809 | 796 | { |
---|
810 | | - int error; |
---|
| 797 | + struct address_space *mapping = old->mapping; |
---|
| 798 | + void (*freepage)(struct page *) = mapping->a_ops->freepage; |
---|
| 799 | + pgoff_t offset = old->index; |
---|
| 800 | + XA_STATE(xas, &mapping->i_pages, offset); |
---|
| 801 | + unsigned long flags; |
---|
811 | 802 | |
---|
812 | 803 | VM_BUG_ON_PAGE(!PageLocked(old), old); |
---|
813 | 804 | VM_BUG_ON_PAGE(!PageLocked(new), new); |
---|
814 | 805 | VM_BUG_ON_PAGE(new->mapping, new); |
---|
815 | 806 | |
---|
816 | | - error = radix_tree_preload(gfp_mask & GFP_RECLAIM_MASK); |
---|
817 | | - if (!error) { |
---|
818 | | - struct address_space *mapping = old->mapping; |
---|
819 | | - void (*freepage)(struct page *); |
---|
820 | | - unsigned long flags; |
---|
| 807 | + get_page(new); |
---|
| 808 | + new->mapping = mapping; |
---|
| 809 | + new->index = offset; |
---|
821 | 810 | |
---|
822 | | - pgoff_t offset = old->index; |
---|
823 | | - freepage = mapping->a_ops->freepage; |
---|
| 811 | + mem_cgroup_migrate(old, new); |
---|
824 | 812 | |
---|
825 | | - get_page(new); |
---|
826 | | - new->mapping = mapping; |
---|
827 | | - new->index = offset; |
---|
| 813 | + xas_lock_irqsave(&xas, flags); |
---|
| 814 | + xas_store(&xas, new); |
---|
828 | 815 | |
---|
829 | | - xa_lock_irqsave(&mapping->i_pages, flags); |
---|
830 | | - __delete_from_page_cache(old, NULL); |
---|
831 | | - error = page_cache_tree_insert(mapping, new, NULL); |
---|
832 | | - BUG_ON(error); |
---|
| 816 | + old->mapping = NULL; |
---|
| 817 | + /* hugetlb pages do not participate in page cache accounting. */ |
---|
| 818 | + if (!PageHuge(old)) |
---|
| 819 | + __dec_lruvec_page_state(old, NR_FILE_PAGES); |
---|
| 820 | + if (!PageHuge(new)) |
---|
| 821 | + __inc_lruvec_page_state(new, NR_FILE_PAGES); |
---|
| 822 | + if (PageSwapBacked(old)) |
---|
| 823 | + __dec_lruvec_page_state(old, NR_SHMEM); |
---|
| 824 | + if (PageSwapBacked(new)) |
---|
| 825 | + __inc_lruvec_page_state(new, NR_SHMEM); |
---|
| 826 | + xas_unlock_irqrestore(&xas, flags); |
---|
| 827 | + if (freepage) |
---|
| 828 | + freepage(old); |
---|
| 829 | + put_page(old); |
---|
833 | 830 | |
---|
834 | | - /* |
---|
835 | | - * hugetlb pages do not participate in page cache accounting. |
---|
836 | | - */ |
---|
837 | | - if (!PageHuge(new)) |
---|
838 | | - __inc_node_page_state(new, NR_FILE_PAGES); |
---|
839 | | - if (PageSwapBacked(new)) |
---|
840 | | - __inc_node_page_state(new, NR_SHMEM); |
---|
841 | | - xa_unlock_irqrestore(&mapping->i_pages, flags); |
---|
842 | | - mem_cgroup_migrate(old, new); |
---|
843 | | - radix_tree_preload_end(); |
---|
844 | | - if (freepage) |
---|
845 | | - freepage(old); |
---|
846 | | - put_page(old); |
---|
847 | | - } |
---|
848 | | - |
---|
849 | | - return error; |
---|
| 831 | + return 0; |
---|
850 | 832 | } |
---|
851 | 833 | EXPORT_SYMBOL_GPL(replace_page_cache_page); |
---|
852 | 834 | |
---|
853 | | -static int __add_to_page_cache_locked(struct page *page, |
---|
854 | | - struct address_space *mapping, |
---|
855 | | - pgoff_t offset, gfp_t gfp_mask, |
---|
856 | | - void **shadowp) |
---|
| 835 | +noinline int __add_to_page_cache_locked(struct page *page, |
---|
| 836 | + struct address_space *mapping, |
---|
| 837 | + pgoff_t offset, gfp_t gfp, |
---|
| 838 | + void **shadowp) |
---|
857 | 839 | { |
---|
| 840 | + XA_STATE(xas, &mapping->i_pages, offset); |
---|
858 | 841 | int huge = PageHuge(page); |
---|
859 | | - struct mem_cgroup *memcg; |
---|
860 | 842 | int error; |
---|
| 843 | + bool charged = false; |
---|
861 | 844 | |
---|
862 | 845 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
---|
863 | 846 | VM_BUG_ON_PAGE(PageSwapBacked(page), page); |
---|
864 | | - |
---|
865 | | - if (!huge) { |
---|
866 | | - error = mem_cgroup_try_charge(page, current->mm, |
---|
867 | | - gfp_mask, &memcg, false); |
---|
868 | | - if (error) |
---|
869 | | - return error; |
---|
870 | | - } |
---|
871 | | - |
---|
872 | | - error = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK); |
---|
873 | | - if (error) { |
---|
874 | | - if (!huge) |
---|
875 | | - mem_cgroup_cancel_charge(page, memcg, false); |
---|
876 | | - return error; |
---|
877 | | - } |
---|
| 847 | + mapping_set_update(&xas, mapping); |
---|
878 | 848 | |
---|
879 | 849 | get_page(page); |
---|
880 | 850 | page->mapping = mapping; |
---|
881 | 851 | page->index = offset; |
---|
882 | 852 | |
---|
883 | | - xa_lock_irq(&mapping->i_pages); |
---|
884 | | - error = page_cache_tree_insert(mapping, page, shadowp); |
---|
885 | | - radix_tree_preload_end(); |
---|
886 | | - if (unlikely(error)) |
---|
887 | | - goto err_insert; |
---|
| 853 | + if (!huge) { |
---|
| 854 | + error = mem_cgroup_charge(page, current->mm, gfp); |
---|
| 855 | + if (error) |
---|
| 856 | + goto error; |
---|
| 857 | + charged = true; |
---|
| 858 | + } |
---|
888 | 859 | |
---|
889 | | - /* hugetlb pages do not participate in page cache accounting. */ |
---|
890 | | - if (!huge) |
---|
891 | | - __inc_node_page_state(page, NR_FILE_PAGES); |
---|
892 | | - xa_unlock_irq(&mapping->i_pages); |
---|
893 | | - if (!huge) |
---|
894 | | - mem_cgroup_commit_charge(page, memcg, false, false); |
---|
| 860 | + gfp &= GFP_RECLAIM_MASK; |
---|
| 861 | + |
---|
| 862 | + do { |
---|
| 863 | + unsigned int order = xa_get_order(xas.xa, xas.xa_index); |
---|
| 864 | + void *entry, *old = NULL; |
---|
| 865 | + |
---|
| 866 | + if (order > thp_order(page)) |
---|
| 867 | + xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index), |
---|
| 868 | + order, gfp); |
---|
| 869 | + xas_lock_irq(&xas); |
---|
| 870 | + xas_for_each_conflict(&xas, entry) { |
---|
| 871 | + old = entry; |
---|
| 872 | + if (!xa_is_value(entry)) { |
---|
| 873 | + xas_set_err(&xas, -EEXIST); |
---|
| 874 | + goto unlock; |
---|
| 875 | + } |
---|
| 876 | + } |
---|
| 877 | + |
---|
| 878 | + if (old) { |
---|
| 879 | + if (shadowp) |
---|
| 880 | + *shadowp = old; |
---|
| 881 | + /* entry may have been split before we acquired lock */ |
---|
| 882 | + order = xa_get_order(xas.xa, xas.xa_index); |
---|
| 883 | + if (order > thp_order(page)) { |
---|
| 884 | + xas_split(&xas, old, order); |
---|
| 885 | + xas_reset(&xas); |
---|
| 886 | + } |
---|
| 887 | + } |
---|
| 888 | + |
---|
| 889 | + xas_store(&xas, page); |
---|
| 890 | + if (xas_error(&xas)) |
---|
| 891 | + goto unlock; |
---|
| 892 | + |
---|
| 893 | + if (old) |
---|
| 894 | + mapping->nrexceptional--; |
---|
| 895 | + mapping->nrpages++; |
---|
| 896 | + |
---|
| 897 | + /* hugetlb pages do not participate in page cache accounting */ |
---|
| 898 | + if (!huge) |
---|
| 899 | + __inc_lruvec_page_state(page, NR_FILE_PAGES); |
---|
| 900 | +unlock: |
---|
| 901 | + xas_unlock_irq(&xas); |
---|
| 902 | + } while (xas_nomem(&xas, gfp)); |
---|
| 903 | + |
---|
| 904 | + if (xas_error(&xas)) { |
---|
| 905 | + error = xas_error(&xas); |
---|
| 906 | + if (charged) |
---|
| 907 | + mem_cgroup_uncharge(page); |
---|
| 908 | + goto error; |
---|
| 909 | + } |
---|
| 910 | + |
---|
895 | 911 | trace_mm_filemap_add_to_page_cache(page); |
---|
896 | 912 | return 0; |
---|
897 | | -err_insert: |
---|
| 913 | +error: |
---|
898 | 914 | page->mapping = NULL; |
---|
899 | 915 | /* Leave page->index set: truncation relies upon it */ |
---|
900 | | - xa_unlock_irq(&mapping->i_pages); |
---|
901 | | - if (!huge) |
---|
902 | | - mem_cgroup_cancel_charge(page, memcg, false); |
---|
903 | 916 | put_page(page); |
---|
904 | 917 | return error; |
---|
905 | 918 | } |
---|
| 919 | +ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO); |
---|
906 | 920 | |
---|
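__add_to_page_cache_locked() also loses the radix_tree_maybe_preload() dance in favour of the XArray's allocate-and-retry idiom: try the store under the lock, and if the XArray ran out of memory, drop the lock, let xas_nomem() allocate with the caller's gfp mask, and retry. The bare skeleton of that idiom (a sketch with a hypothetical cache_insert(); the real function additionally handles conflicts, THP splitting and memcg charging):

```c
/* Sketch: insert @item at @index, allocating xarray nodes outside the lock. */
static int cache_insert(struct xarray *xa, unsigned long index,
			void *item, gfp_t gfp)
{
	XA_STATE(xas, xa, index);

	do {
		xas_lock_irq(&xas);
		if (xas_load(&xas))		/* something is already there */
			xas_set_err(&xas, -EEXIST);
		else
			xas_store(&xas, item);	/* may set -ENOMEM on the xa_state */
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));		/* allocate and retry if needed */

	return xas_error(&xas);
}
```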
907 | 921 | /** |
---|
908 | 922 | * add_to_page_cache_locked - add a locked page to the pagecache |
---|
.. | .. |
---|
913 | 927 | * |
---|
914 | 928 | * This function is used to add a page to the pagecache. It must be locked. |
---|
915 | 929 | * This function does not add the page to the LRU. The caller must do that. |
---|
| 930 | + * |
---|
| 931 | + * Return: %0 on success, negative error code otherwise. |
---|
916 | 932 | */ |
---|
917 | 933 | int add_to_page_cache_locked(struct page *page, struct address_space *mapping, |
---|
918 | 934 | pgoff_t offset, gfp_t gfp_mask) |
---|
.. | .. |
---|
1001 | 1017 | page_writeback_init(); |
---|
1002 | 1018 | } |
---|
1003 | 1019 | |
---|
1004 | | -/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */ |
---|
1005 | | -struct wait_page_key { |
---|
1006 | | - struct page *page; |
---|
1007 | | - int bit_nr; |
---|
1008 | | - int page_match; |
---|
1009 | | -}; |
---|
1010 | | - |
---|
1011 | | -struct wait_page_queue { |
---|
1012 | | - struct page *page; |
---|
1013 | | - int bit_nr; |
---|
1014 | | - wait_queue_entry_t wait; |
---|
1015 | | -}; |
---|
1016 | | - |
---|
| 1020 | +/* |
---|
| 1021 | + * The page wait code treats the "wait->flags" somewhat unusually, because |
---|
| 1022 | + * we have multiple different kinds of waits, not just the usual "exclusive" |
---|
| 1023 | + * one. |
---|
| 1024 | + * |
---|
| 1025 | + * We have: |
---|
| 1026 | + * |
---|
| 1027 | + * (a) no special bits set: |
---|
| 1028 | + * |
---|
| 1029 | + * We're just waiting for the bit to be released, and when a waker |
---|
| 1030 | + * calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up, |
---|
| 1031 | + * and remove it from the wait queue. |
---|
| 1032 | + * |
---|
| 1033 | + * Simple and straightforward. |
---|
| 1034 | + * |
---|
| 1035 | + * (b) WQ_FLAG_EXCLUSIVE: |
---|
| 1036 | + * |
---|
| 1037 | + * The waiter is waiting to get the lock, and only one waiter should |
---|
| 1038 | + * be woken up to avoid any thundering herd behavior. We'll set the |
---|
| 1039 | + * WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue. |
---|
| 1040 | + * |
---|
| 1041 | + * This is the traditional exclusive wait. |
---|
| 1042 | + * |
---|
| 1043 | + * (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM: |
---|
| 1044 | + * |
---|
| 1045 | + * The waiter is waiting to get the bit, and additionally wants the |
---|
| 1046 | + * lock to be transferred to it for fair lock behavior. If the lock |
---|
| 1047 | + * cannot be taken, we stop walking the wait queue without waking |
---|
| 1048 | + * the waiter. |
---|
| 1049 | + * |
---|
| 1050 | + * This is the "fair lock handoff" case, and in addition to setting |
---|
| 1051 | + * WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see |
---|
| 1052 | + * that it now has the lock. |
---|
| 1053 | + */ |
---|
1017 | 1054 | static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) |
---|
1018 | 1055 | { |
---|
| 1056 | + unsigned int flags; |
---|
1019 | 1057 | struct wait_page_key *key = arg; |
---|
1020 | 1058 | struct wait_page_queue *wait_page |
---|
1021 | 1059 | = container_of(wait, struct wait_page_queue, wait); |
---|
1022 | 1060 | |
---|
1023 | | - if (wait_page->page != key->page) |
---|
1024 | | - return 0; |
---|
1025 | | - key->page_match = 1; |
---|
1026 | | - |
---|
1027 | | - if (wait_page->bit_nr != key->bit_nr) |
---|
| 1061 | + if (!wake_page_match(wait_page, key)) |
---|
1028 | 1062 | return 0; |
---|
1029 | 1063 | |
---|
1030 | | - /* Stop walking if it's locked */ |
---|
1031 | | - if (test_bit(key->bit_nr, &key->page->flags)) |
---|
1032 | | - return -1; |
---|
| 1064 | + /* |
---|
| 1065 | + * If it's a lock handoff wait, we get the bit for it, and |
---|
| 1066 | + * stop walking (and do not wake it up) if we can't. |
---|
| 1067 | + */ |
---|
| 1068 | + flags = wait->flags; |
---|
| 1069 | + if (flags & WQ_FLAG_EXCLUSIVE) { |
---|
| 1070 | + if (test_bit(key->bit_nr, &key->page->flags)) |
---|
| 1071 | + return -1; |
---|
| 1072 | + if (flags & WQ_FLAG_CUSTOM) { |
---|
| 1073 | + if (test_and_set_bit(key->bit_nr, &key->page->flags)) |
---|
| 1074 | + return -1; |
---|
| 1075 | + flags |= WQ_FLAG_DONE; |
---|
| 1076 | + } |
---|
| 1077 | + } |
---|
1033 | 1078 | |
---|
1034 | | - return autoremove_wake_function(wait, mode, sync, key); |
---|
| 1079 | + /* |
---|
| 1080 | + * We are holding the wait-queue lock, but the waiter that |
---|
| 1081 | + * is waiting for this will be checking the flags without |
---|
| 1082 | + * any locking. |
---|
| 1083 | + * |
---|
| 1084 | + * So update the flags atomically, and wake up the waiter |
---|
| 1085 | + * afterwards to avoid any races. This store-release pairs |
---|
| 1086 | + * with the load-acquire in wait_on_page_bit_common(). |
---|
| 1087 | + */ |
---|
| 1088 | + smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN); |
---|
| 1089 | + wake_up_state(wait->private, mode); |
---|
| 1090 | + |
---|
| 1091 | + /* |
---|
| 1092 | + * Ok, we have successfully done what we're waiting for, |
---|
| 1093 | + * and we can unconditionally remove the wait entry. |
---|
| 1094 | + * |
---|
| 1095 | + * Note that this pairs with the "finish_wait()" in the |
---|
| 1096 | + * waiter, and has to be the absolute last thing we do. |
---|
| 1097 | + * After this list_del_init(&wait->entry) the wait entry |
---|
| 1098 | + * might be de-allocated and the process might even have |
---|
| 1099 | + * exited. |
---|
| 1100 | + */ |
---|
| 1101 | + list_del_init_careful(&wait->entry); |
---|
| 1102 | + return (flags & WQ_FLAG_EXCLUSIVE) != 0; |
---|
1035 | 1103 | } |
---|
1036 | 1104 | |
---|
1037 | 1105 | static void wake_up_page_bit(struct page *page, int bit_nr) |
---|
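The handshake described above depends on ordering rather than on the waiter taking the waitqueue lock: the waker publishes the final flags with a store-release before waking the task, and the waiter polls them with a load-acquire. Stripped of the surrounding queue handling, the pairing looks roughly like this (hypothetical helpers, with flagsp standing in for &wait->flags):

```c
/* Waker side: runs under the waitqueue lock, as in wake_page_function(). */
static void publish_woken(unsigned long *flagsp, unsigned long flags)
{
	/* Everything written before this is visible once WOKEN is observed. */
	smp_store_release(flagsp, flags | WQ_FLAG_WOKEN);
}

/* Waiter side: checked locklessly in the sleep loop. */
static bool saw_wakeup(unsigned long *flagsp)
{
	/* Pairs with the store-release above. */
	return smp_load_acquire(flagsp) & WQ_FLAG_WOKEN;
}
```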
.. | .. |
---|
1095 | 1163 | wake_up_page_bit(page, bit); |
---|
1096 | 1164 | } |
---|
1097 | 1165 | |
---|
1098 | | -static inline __sched int wait_on_page_bit_common(wait_queue_head_t *q, |
---|
1099 | | - struct page *page, int bit_nr, int state, bool lock) |
---|
| 1166 | +/* |
---|
| 1167 | + * A choice of three behaviors for wait_on_page_bit_common(): |
---|
| 1168 | + */ |
---|
| 1169 | +enum behavior { |
---|
| 1170 | + EXCLUSIVE, /* Hold ref to page and take the bit when woken, like |
---|
| 1171 | + * __lock_page() waiting on then setting PG_locked. |
---|
| 1172 | + */ |
---|
| 1173 | + SHARED, /* Hold ref to page and check the bit when woken, like |
---|
| 1174 | + * wait_on_page_writeback() waiting on PG_writeback. |
---|
| 1175 | + */ |
---|
| 1176 | + DROP, /* Drop ref to page before wait, no check when woken, |
---|
| 1177 | + * like put_and_wait_on_page_locked() on PG_locked. |
---|
| 1178 | + */ |
---|
| 1179 | +}; |
---|
| 1180 | + |
---|
| 1181 | +/* |
---|
| 1182 | + * Attempt to check (or get) the page bit, and mark us done |
---|
| 1183 | + * if successful. |
---|
| 1184 | + */ |
---|
| 1185 | +static inline bool trylock_page_bit_common(struct page *page, int bit_nr, |
---|
| 1186 | + struct wait_queue_entry *wait) |
---|
1100 | 1187 | { |
---|
| 1188 | + if (wait->flags & WQ_FLAG_EXCLUSIVE) { |
---|
| 1189 | + if (test_and_set_bit(bit_nr, &page->flags)) |
---|
| 1190 | + return false; |
---|
| 1191 | + } else if (test_bit(bit_nr, &page->flags)) |
---|
| 1192 | + return false; |
---|
| 1193 | + |
---|
| 1194 | + wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE; |
---|
| 1195 | + return true; |
---|
| 1196 | +} |
---|
| 1197 | + |
---|
| 1198 | +/* How many times do we accept lock stealing from under a waiter? */ |
---|
| 1199 | +int sysctl_page_lock_unfairness = 5; |
---|
| 1200 | + |
---|
| 1201 | +static inline __sched int wait_on_page_bit_common(wait_queue_head_t *q, |
---|
| 1202 | + struct page *page, int bit_nr, int state, enum behavior behavior) |
---|
| 1203 | +{ |
---|
| 1204 | + int unfairness = sysctl_page_lock_unfairness; |
---|
1101 | 1205 | struct wait_page_queue wait_page; |
---|
1102 | 1206 | wait_queue_entry_t *wait = &wait_page.wait; |
---|
1103 | 1207 | bool thrashing = false; |
---|
| 1208 | + bool delayacct = false; |
---|
1104 | 1209 | unsigned long pflags; |
---|
1105 | | - int ret = 0; |
---|
1106 | 1210 | |
---|
1107 | 1211 | if (bit_nr == PG_locked && |
---|
1108 | 1212 | !PageUptodate(page) && PageWorkingset(page)) { |
---|
1109 | | - if (!PageSwapBacked(page)) |
---|
| 1213 | + if (!PageSwapBacked(page)) { |
---|
1110 | 1214 | delayacct_thrashing_start(); |
---|
| 1215 | + delayacct = true; |
---|
| 1216 | + } |
---|
1111 | 1217 | psi_memstall_enter(&pflags); |
---|
1112 | 1218 | thrashing = true; |
---|
1113 | 1219 | } |
---|
1114 | 1220 | |
---|
1115 | 1221 | init_wait(wait); |
---|
1116 | | - wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0; |
---|
1117 | 1222 | wait->func = wake_page_function; |
---|
1118 | 1223 | wait_page.page = page; |
---|
1119 | 1224 | wait_page.bit_nr = bit_nr; |
---|
1120 | 1225 | |
---|
1121 | | - for (;;) { |
---|
1122 | | - spin_lock_irq(&q->lock); |
---|
| 1226 | +repeat: |
---|
| 1227 | + wait->flags = 0; |
---|
| 1228 | + if (behavior == EXCLUSIVE) { |
---|
| 1229 | + wait->flags = WQ_FLAG_EXCLUSIVE; |
---|
| 1230 | + if (--unfairness < 0) |
---|
| 1231 | + wait->flags |= WQ_FLAG_CUSTOM; |
---|
| 1232 | + } |
---|
1123 | 1233 | |
---|
1124 | | - if (likely(list_empty(&wait->entry))) { |
---|
1125 | | - __add_wait_queue_entry_tail(q, wait); |
---|
1126 | | - SetPageWaiters(page); |
---|
1127 | | - } |
---|
| 1234 | + /* |
---|
| 1235 | + * Do one last check whether we can get the |
---|
| 1236 | + * page bit synchronously. |
---|
| 1237 | + * |
---|
| 1238 | + * Do the SetPageWaiters() marking before that |
---|
| 1239 | + * to let any waker we _just_ missed know they |
---|
| 1240 | + * need to wake us up (otherwise they'll never |
---|
| 1241 | + * even go to the slow case that looks at the |
---|
| 1242 | + * page queue), and add ourselves to the wait |
---|
| 1243 | + * queue if we need to sleep. |
---|
| 1244 | + * |
---|
| 1245 | + * This part needs to be done under the queue |
---|
| 1246 | + * lock to avoid races. |
---|
| 1247 | + */ |
---|
| 1248 | + spin_lock_irq(&q->lock); |
---|
| 1249 | + SetPageWaiters(page); |
---|
| 1250 | + if (!trylock_page_bit_common(page, bit_nr, wait)) |
---|
| 1251 | + __add_wait_queue_entry_tail(q, wait); |
---|
| 1252 | + spin_unlock_irq(&q->lock); |
---|
| 1253 | + |
---|
| 1254 | + /* |
---|
| 1255 | + * From now on, all the logic will be based on |
---|
| 1256 | + * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to |
---|
| 1257 | + * see whether the page bit testing has already |
---|
| 1258 | + * been done by the wake function. |
---|
| 1259 | + * |
---|
| 1260 | + * We can drop our reference to the page. |
---|
| 1261 | + */ |
---|
| 1262 | + if (behavior == DROP) |
---|
| 1263 | + put_page(page); |
---|
| 1264 | + |
---|
| 1265 | + /* |
---|
| 1266 | + * Note that until the "finish_wait()", or until |
---|
| 1267 | + * we see the WQ_FLAG_WOKEN flag, we need to |
---|
| 1268 | + * be very careful with the 'wait->flags', because |
---|
| 1269 | + * we may race with a waker that sets them. |
---|
| 1270 | + */ |
---|
| 1271 | + for (;;) { |
---|
| 1272 | + unsigned int flags; |
---|
1128 | 1273 | |
---|
1129 | 1274 | set_current_state(state); |
---|
1130 | 1275 | |
---|
1131 | | - spin_unlock_irq(&q->lock); |
---|
| 1276 | + /* Loop until we've been woken or interrupted */ |
---|
| 1277 | + flags = smp_load_acquire(&wait->flags); |
---|
| 1278 | + if (!(flags & WQ_FLAG_WOKEN)) { |
---|
| 1279 | + if (signal_pending_state(state, current)) |
---|
| 1280 | + break; |
---|
1132 | 1281 | |
---|
1133 | | - if (likely(test_bit(bit_nr, &page->flags))) { |
---|
1134 | 1282 | io_schedule(); |
---|
| 1283 | + continue; |
---|
1135 | 1284 | } |
---|
1136 | 1285 | |
---|
1137 | | - if (lock) { |
---|
1138 | | - if (!test_and_set_bit_lock(bit_nr, &page->flags)) |
---|
1139 | | - break; |
---|
1140 | | - } else { |
---|
1141 | | - if (!test_bit(bit_nr, &page->flags)) |
---|
1142 | | - break; |
---|
1143 | | - } |
---|
1144 | | - |
---|
1145 | | - if (unlikely(signal_pending_state(state, current))) { |
---|
1146 | | - ret = -EINTR; |
---|
| 1286 | + /* If we were non-exclusive, we're done */ |
---|
| 1287 | + if (behavior != EXCLUSIVE) |
---|
1147 | 1288 | break; |
---|
1148 | | - } |
---|
| 1289 | + |
---|
| 1290 | + /* If the waker got the lock for us, we're done */ |
---|
| 1291 | + if (flags & WQ_FLAG_DONE) |
---|
| 1292 | + break; |
---|
| 1293 | + |
---|
| 1294 | + /* |
---|
| 1295 | + * Otherwise, if we're getting the lock, we need to |
---|
| 1296 | + * try to get it ourselves. |
---|
| 1297 | + * |
---|
| 1298 | + * And if that fails, we'll have to retry this all. |
---|
| 1299 | + */ |
---|
| 1300 | + if (unlikely(test_and_set_bit(bit_nr, &page->flags))) |
---|
| 1301 | + goto repeat; |
---|
| 1302 | + |
---|
| 1303 | + wait->flags |= WQ_FLAG_DONE; |
---|
| 1304 | + break; |
---|
1149 | 1305 | } |
---|
1150 | 1306 | |
---|
| 1307 | + /* |
---|
| 1308 | + * If a signal happened, this 'finish_wait()' may remove the last |
---|
| 1309 | + * waiter from the wait-queues, but the PageWaiters bit will remain |
---|
| 1310 | + * set. That's ok. The next wakeup will take care of it, and trying |
---|
| 1311 | + * to do it here would be difficult and prone to races. |
---|
| 1312 | + */ |
---|
1151 | 1313 | finish_wait(q, wait); |
---|
1152 | 1314 | |
---|
1153 | 1315 | if (thrashing) { |
---|
1154 | | - if (!PageSwapBacked(page)) |
---|
| 1316 | + if (delayacct) |
---|
1155 | 1317 | delayacct_thrashing_end(); |
---|
1156 | 1318 | psi_memstall_leave(&pflags); |
---|
1157 | 1319 | } |
---|
1158 | 1320 | |
---|
1159 | 1321 | /* |
---|
1160 | | - * A signal could leave PageWaiters set. Clearing it here if |
---|
1161 | | - * !waitqueue_active would be possible (by open-coding finish_wait), |
---|
1162 | | - * but still fail to catch it in the case of wait hash collision. We |
---|
1163 | | - * already can fail to clear wait hash collision cases, so don't |
---|
1164 | | - * bother with signals either. |
---|
| 1322 | + * NOTE! The wait->flags weren't stable until we've done the |
---|
| 1323 | + * 'finish_wait()', and we could have exited the loop above due |
---|
| 1324 | + * to a signal, and had a wakeup event happen after the signal |
---|
| 1325 | + * test but before the 'finish_wait()'. |
---|
| 1326 | + * |
---|
| 1327 | + * So only after the finish_wait() can we reliably determine |
---|
| 1328 | + * if we got woken up or not, so we can now figure out the final |
---|
| 1329 | + * return value based on that state without races. |
---|
| 1330 | + * |
---|
| 1331 | + * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive |
---|
| 1332 | + * waiter, but an exclusive one requires WQ_FLAG_DONE. |
---|
1165 | 1333 | */ |
---|
| 1334 | + if (behavior == EXCLUSIVE) |
---|
| 1335 | + return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR; |
---|
1166 | 1336 | |
---|
1167 | | - return ret; |
---|
| 1337 | + return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR; |
---|
1168 | 1338 | } |
---|
1169 | 1339 | |
---|
1170 | | -void __sched wait_on_page_bit(struct page *page, int bit_nr) |
---|
| 1340 | +__sched void wait_on_page_bit(struct page *page, int bit_nr) |
---|
1171 | 1341 | { |
---|
1172 | 1342 | wait_queue_head_t *q = page_waitqueue(page); |
---|
1173 | | - wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, false); |
---|
| 1343 | + wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, SHARED); |
---|
1174 | 1344 | } |
---|
1175 | 1345 | EXPORT_SYMBOL(wait_on_page_bit); |
---|
1176 | 1346 | |
---|
1177 | | -int __sched wait_on_page_bit_killable(struct page *page, int bit_nr) |
---|
| 1347 | +__sched int wait_on_page_bit_killable(struct page *page, int bit_nr) |
---|
1178 | 1348 | { |
---|
1179 | 1349 | wait_queue_head_t *q = page_waitqueue(page); |
---|
1180 | | - return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false); |
---|
| 1350 | + return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, SHARED); |
---|
1181 | 1351 | } |
---|
1182 | 1352 | EXPORT_SYMBOL(wait_on_page_bit_killable); |
---|
| 1353 | + |
---|
| 1354 | +static int __wait_on_page_locked_async(struct page *page, |
---|
| 1355 | + struct wait_page_queue *wait, bool set) |
---|
| 1356 | +{ |
---|
| 1357 | + struct wait_queue_head *q = page_waitqueue(page); |
---|
| 1358 | + int ret = 0; |
---|
| 1359 | + |
---|
| 1360 | + wait->page = page; |
---|
| 1361 | + wait->bit_nr = PG_locked; |
---|
| 1362 | + |
---|
| 1363 | + spin_lock_irq(&q->lock); |
---|
| 1364 | + __add_wait_queue_entry_tail(q, &wait->wait); |
---|
| 1365 | + SetPageWaiters(page); |
---|
| 1366 | + if (set) |
---|
| 1367 | + ret = !trylock_page(page); |
---|
| 1368 | + else |
---|
| 1369 | + ret = PageLocked(page); |
---|
| 1370 | + /* |
---|
| 1371 | + * If we were successful now, we know we're still on the |
---|
| 1372 | + * waitqueue as we're still under the lock. This means it's |
---|
| 1373 | + * safe to remove and return success, we know the callback |
---|
| 1374 | + * isn't going to trigger. |
---|
| 1375 | + */ |
---|
| 1376 | + if (!ret) |
---|
| 1377 | + __remove_wait_queue(q, &wait->wait); |
---|
| 1378 | + else |
---|
| 1379 | + ret = -EIOCBQUEUED; |
---|
| 1380 | + spin_unlock_irq(&q->lock); |
---|
| 1381 | + return ret; |
---|
| 1382 | +} |
---|
| 1383 | + |
---|
| 1384 | +static int wait_on_page_locked_async(struct page *page, |
---|
| 1385 | + struct wait_page_queue *wait) |
---|
| 1386 | +{ |
---|
| 1387 | + if (!PageLocked(page)) |
---|
| 1388 | + return 0; |
---|
| 1389 | + return __wait_on_page_locked_async(compound_head(page), wait, false); |
---|
| 1390 | +} |
---|
| 1391 | + |
---|
| 1392 | +/** |
---|
| 1393 | + * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked |
---|
| 1394 | + * @page: The page to wait for. |
---|
| 1395 | + * |
---|
| 1396 | + * The caller should hold a reference on @page. They expect the page to |
---|
| 1397 | + * become unlocked relatively soon, but do not wish to hold up migration |
---|
| 1398 | + * (for example) by holding the reference while waiting for the page to |
---|
| 1399 | + * come unlocked. After this function returns, the caller should not |
---|
| 1400 | + * dereference @page. |
---|
| 1401 | + */ |
---|
| 1402 | +void put_and_wait_on_page_locked(struct page *page) |
---|
| 1403 | +{ |
---|
| 1404 | + wait_queue_head_t *q; |
---|
| 1405 | + |
---|
| 1406 | + page = compound_head(page); |
---|
| 1407 | + q = page_waitqueue(page); |
---|
| 1408 | + wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP); |
---|
| 1409 | +} |
---|
1183 | 1410 | |
---|
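The three behavior values map onto the helpers in this hunk: EXCLUSIVE backs __lock_page() and __lock_page_killable(), SHARED backs wait_on_page_bit(), and DROP backs the new put_and_wait_on_page_locked(). A hypothetical caller of the DROP variant looks like this (not taken from this patch):

```c
/*
 * Hypothetical caller: we hold one reference on @page and want to wait
 * for it to be unlocked without pinning it across the sleep.
 */
static void example_wait_for_unlock(struct page *page)
{
	put_and_wait_on_page_locked(page);	/* drops our reference, then waits */
	/* @page must not be dereferenced after this point. */
}
```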
1184 | 1411 | /** |
---|
1185 | 1412 | * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue |
---|
.. | .. |
---|
1211 | 1438 | * instead. |
---|
1212 | 1439 | * |
---|
1213 | 1440 | * The read of PG_waiters has to be after (or concurrently with) PG_locked |
---|
1214 | | - * being cleared, but a memory barrier should be unneccssary since it is |
---|
| 1441 | + * being cleared, but a memory barrier should be unnecessary since it is |
---|
1215 | 1442 | * in the same byte as PG_locked. |
---|
1216 | 1443 | */ |
---|
1217 | 1444 | static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem) |
---|
.. | .. |
---|
1227 | 1454 | * unlock_page - unlock a locked page |
---|
1228 | 1455 | * @page: the page |
---|
1229 | 1456 | * |
---|
1230 | | - * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). |
---|
| 1457 | + * Unlocks the page and wakes up sleepers in wait_on_page_locked(). |
---|
1231 | 1458 | * Also wakes sleepers in wait_on_page_writeback() because the wakeup |
---|
1232 | 1459 | * mechanism between PageLocked pages and PageWriteback pages is shared. |
---|
1233 | 1460 | * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. |
---|
.. | .. |
---|
1266 | 1493 | rotate_reclaimable_page(page); |
---|
1267 | 1494 | } |
---|
1268 | 1495 | |
---|
| 1496 | + /* |
---|
| 1497 | + * Writeback does not hold a page reference of its own, relying |
---|
| 1498 | + * on truncation to wait for the clearing of PG_writeback. |
---|
| 1499 | + * But here we must make sure that the page is not freed and |
---|
| 1500 | + * reused before the wake_up_page(). |
---|
| 1501 | + */ |
---|
| 1502 | + get_page(page); |
---|
1269 | 1503 | if (!test_clear_page_writeback(page)) |
---|
1270 | 1504 | BUG(); |
---|
1271 | 1505 | |
---|
1272 | 1506 | smp_mb__after_atomic(); |
---|
1273 | 1507 | wake_up_page(page, PG_writeback); |
---|
| 1508 | + put_page(page); |
---|
1274 | 1509 | } |
---|
1275 | 1510 | EXPORT_SYMBOL(end_page_writeback); |
---|
1276 | 1511 | |
---|
.. | .. |
---|
1306 | 1541 | * __lock_page - get a lock on the page, assuming we need to sleep to get it |
---|
1307 | 1542 | * @__page: the page to lock |
---|
1308 | 1543 | */ |
---|
1309 | | -void __sched __lock_page(struct page *__page) |
---|
| 1544 | +__sched void __lock_page(struct page *__page) |
---|
1310 | 1545 | { |
---|
1311 | 1546 | struct page *page = compound_head(__page); |
---|
1312 | 1547 | wait_queue_head_t *q = page_waitqueue(page); |
---|
1313 | | - wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, true); |
---|
| 1548 | + wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, |
---|
| 1549 | + EXCLUSIVE); |
---|
1314 | 1550 | } |
---|
1315 | 1551 | EXPORT_SYMBOL(__lock_page); |
---|
1316 | 1552 | |
---|
1317 | | -int __sched __lock_page_killable(struct page *__page) |
---|
| 1553 | +__sched int __lock_page_killable(struct page *__page) |
---|
1318 | 1554 | { |
---|
1319 | 1555 | struct page *page = compound_head(__page); |
---|
1320 | 1556 | wait_queue_head_t *q = page_waitqueue(page); |
---|
1321 | | - return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, true); |
---|
| 1557 | + return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, |
---|
| 1558 | + EXCLUSIVE); |
---|
1322 | 1559 | } |
---|
1323 | 1560 | EXPORT_SYMBOL_GPL(__lock_page_killable); |
---|
1324 | 1561 | |
---|
| 1562 | +__sched int __lock_page_async(struct page *page, struct wait_page_queue *wait) |
---|
| 1563 | +{ |
---|
| 1564 | + return __wait_on_page_locked_async(page, wait, true); |
---|
| 1565 | +} |
---|
| 1566 | + |
---|
1325 | 1567 | /* |
---|
1326 | 1568 | * Return values: |
---|
1327 | | - * 1 - page is locked; mmap_sem is still held. |
---|
| 1569 | + * 1 - page is locked; mmap_lock is still held. |
---|
1328 | 1570 | * 0 - page is not locked. |
---|
1329 | | - * mmap_sem has been released (up_read()), unless flags had both |
---|
| 1571 | + * mmap_lock has been released (mmap_read_unlock()), unless flags had both |
---|
1330 | 1572 | * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in |
---|
1331 | | - * which case mmap_sem is still held. |
---|
| 1573 | + * which case mmap_lock is still held. |
---|
1332 | 1574 | * |
---|
1333 | 1575 | * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1 |
---|
1334 | | - * with the page locked and the mmap_sem unperturbed. |
---|
| 1576 | + * with the page locked and the mmap_lock unperturbed. |
---|
1335 | 1577 | */ |
---|
1336 | | -int __sched __lock_page_or_retry(struct page *page, struct mm_struct *mm, |
---|
| 1578 | +__sched int __lock_page_or_retry(struct page *page, struct mm_struct *mm, |
---|
1337 | 1579 | unsigned int flags) |
---|
1338 | 1580 | { |
---|
1339 | | - if (flags & FAULT_FLAG_ALLOW_RETRY) { |
---|
| 1581 | + if (fault_flag_allow_retry_first(flags)) { |
---|
1340 | 1582 | /* |
---|
1341 | | - * CAUTION! In this case, mmap_sem is not released |
---|
| 1583 | + * CAUTION! In this case, mmap_lock is not released |
---|
1342 | 1584 | * even though return 0. |
---|
1343 | 1585 | */ |
---|
1344 | 1586 | if (flags & FAULT_FLAG_RETRY_NOWAIT) |
---|
1345 | 1587 | return 0; |
---|
1346 | 1588 | |
---|
1347 | | - up_read(&mm->mmap_sem); |
---|
| 1589 | + mmap_read_unlock(mm); |
---|
1348 | 1590 | if (flags & FAULT_FLAG_KILLABLE) |
---|
1349 | 1591 | wait_on_page_locked_killable(page); |
---|
1350 | 1592 | else |
---|
.. | .. |
---|
1356 | 1598 | |
---|
1357 | 1599 | ret = __lock_page_killable(page); |
---|
1358 | 1600 | if (ret) { |
---|
1359 | | - up_read(&mm->mmap_sem); |
---|
| 1601 | + mmap_read_unlock(mm); |
---|
1360 | 1602 | return 0; |
---|
1361 | 1603 | } |
---|
1362 | 1604 | } else |
---|
.. | .. |
---|
1366 | 1608 | } |
---|
1367 | 1609 | |
---|
1368 | 1610 | /** |
---|
1369 | | - * page_cache_next_hole - find the next hole (not-present entry) |
---|
1370 | | - * @mapping: mapping |
---|
1371 | | - * @index: index |
---|
1372 | | - * @max_scan: maximum range to search |
---|
| 1611 | + * page_cache_next_miss() - Find the next gap in the page cache. |
---|
| 1612 | + * @mapping: Mapping. |
---|
| 1613 | + * @index: Index. |
---|
| 1614 | + * @max_scan: Maximum range to search. |
---|
1373 | 1615 | * |
---|
1374 | | - * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the |
---|
1375 | | - * lowest indexed hole. |
---|
| 1616 | + * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the |
---|
| 1617 | + * gap with the lowest index. |
---|
1376 | 1618 | * |
---|
1377 | | - * Returns: the index of the hole if found, otherwise returns an index |
---|
1378 | | - * outside of the set specified (in which case 'return - index >= |
---|
1379 | | - * max_scan' will be true). In rare cases of index wrap-around, 0 will |
---|
1380 | | - * be returned. |
---|
| 1619 | + * This function may be called under the rcu_read_lock. However, this will |
---|
| 1620 | + * not atomically search a snapshot of the cache at a single point in time. |
---|
| 1621 | + * For example, if a gap is created at index 5, then subsequently a gap is |
---|
| 1622 | + * created at index 10, page_cache_next_miss covering both indices may |
---|
| 1623 | + * return 10 if called under the rcu_read_lock. |
---|
1381 | 1624 | * |
---|
1382 | | - * page_cache_next_hole may be called under rcu_read_lock. However, |
---|
1383 | | - * like radix_tree_gang_lookup, this will not atomically search a |
---|
1384 | | - * snapshot of the tree at a single point in time. For example, if a |
---|
1385 | | - * hole is created at index 5, then subsequently a hole is created at |
---|
1386 | | - * index 10, page_cache_next_hole covering both indexes may return 10 |
---|
1387 | | - * if called under rcu_read_lock. |
---|
| 1625 | + * Return: The index of the gap if found, otherwise an index outside the |
---|
| 1626 | + * range specified (in which case 'return - index >= max_scan' will be true). |
---|
| 1627 | + * In the rare case of index wrap-around, 0 will be returned. |
---|
1388 | 1628 | */ |
---|
1389 | | -pgoff_t page_cache_next_hole(struct address_space *mapping, |
---|
| 1629 | +pgoff_t page_cache_next_miss(struct address_space *mapping, |
---|
1390 | 1630 | pgoff_t index, unsigned long max_scan) |
---|
1391 | 1631 | { |
---|
1392 | | - unsigned long i; |
---|
| 1632 | + XA_STATE(xas, &mapping->i_pages, index); |
---|
1393 | 1633 | |
---|
1394 | | - for (i = 0; i < max_scan; i++) { |
---|
1395 | | - struct page *page; |
---|
1396 | | - |
---|
1397 | | - page = radix_tree_lookup(&mapping->i_pages, index); |
---|
1398 | | - if (!page || radix_tree_exceptional_entry(page)) |
---|
| 1634 | + while (max_scan--) { |
---|
| 1635 | + void *entry = xas_next(&xas); |
---|
| 1636 | + if (!entry || xa_is_value(entry)) |
---|
1399 | 1637 | break; |
---|
1400 | | - index++; |
---|
1401 | | - if (index == 0) |
---|
| 1638 | + if (xas.xa_index == 0) |
---|
1402 | 1639 | break; |
---|
1403 | 1640 | } |
---|
1404 | 1641 | |
---|
1405 | | - return index; |
---|
| 1642 | + return xas.xa_index; |
---|
1406 | 1643 | } |
---|
1407 | | -EXPORT_SYMBOL(page_cache_next_hole); |
---|
| 1644 | +EXPORT_SYMBOL(page_cache_next_miss); |
---|
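For illustration only (an editor's sketch, not part of this diff): one way a caller could use page_cache_next_miss() to measure how many pages from @index onward are already cached. The helper name count_present_run() and its parameters are hypothetical.

#include <linux/pagemap.h>

/* Hypothetical helper: length of the run of cached pages at @index, capped at @max. */
static unsigned long count_present_run(struct address_space *mapping,
				       pgoff_t index, unsigned long max)
{
	pgoff_t gap = page_cache_next_miss(mapping, index, max);

	/*
	 * Per the contract above, 'gap - index >= max' means no gap was
	 * found inside the window; the rare wrap-around case is ignored
	 * in this sketch.
	 */
	if (gap - index >= max)
		return max;
	return gap - index;
}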
1408 | 1645 | |
---|
1409 | 1646 | /** |
---|
1410 | | - * page_cache_prev_hole - find the prev hole (not-present entry) |
---|
1411 | | - * @mapping: mapping |
---|
1412 | | - * @index: index |
---|
1413 | | - * @max_scan: maximum range to search |
---|
| 1647 | + * page_cache_prev_miss() - Find the previous gap in the page cache. |
---|
| 1648 | + * @mapping: Mapping. |
---|
| 1649 | + * @index: Index. |
---|
| 1650 | + * @max_scan: Maximum range to search. |
---|
1414 | 1651 | * |
---|
1415 | | - * Search backwards in the range [max(index-max_scan+1, 0), index] for |
---|
1416 | | - * the first hole. |
---|
| 1652 | + * Search the range [max(index - max_scan + 1, 0), index] for the |
---|
| 1653 | + * gap with the highest index. |
---|
1417 | 1654 | * |
---|
1418 | | - * Returns: the index of the hole if found, otherwise returns an index |
---|
1419 | | - * outside of the set specified (in which case 'index - return >= |
---|
1420 | | - * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX |
---|
1421 | | - * will be returned. |
---|
| 1655 | + * This function may be called under the rcu_read_lock. However, this will |
---|
| 1656 | + * not atomically search a snapshot of the cache at a single point in time. |
---|
| 1657 | + * For example, if a gap is created at index 10, then subsequently a gap is |
---|
| 1658 | + * created at index 5, page_cache_prev_miss() covering both indices may |
---|
| 1659 | + * return 5 if called under the rcu_read_lock. |
---|
1422 | 1660 | * |
---|
1423 | | - * page_cache_prev_hole may be called under rcu_read_lock. However, |
---|
1424 | | - * like radix_tree_gang_lookup, this will not atomically search a |
---|
1425 | | - * snapshot of the tree at a single point in time. For example, if a |
---|
1426 | | - * hole is created at index 10, then subsequently a hole is created at |
---|
1427 | | - * index 5, page_cache_prev_hole covering both indexes may return 5 if |
---|
1428 | | - * called under rcu_read_lock. |
---|
| 1661 | + * Return: The index of the gap if found, otherwise an index outside the |
---|
| 1662 | + * range specified (in which case 'index - return >= max_scan' will be true). |
---|
| 1663 | + * In the rare case of wrap-around, ULONG_MAX will be returned. |
---|
1429 | 1664 | */ |
---|
1430 | | -pgoff_t page_cache_prev_hole(struct address_space *mapping, |
---|
| 1665 | +pgoff_t page_cache_prev_miss(struct address_space *mapping, |
---|
1431 | 1666 | pgoff_t index, unsigned long max_scan) |
---|
1432 | 1667 | { |
---|
1433 | | - unsigned long i; |
---|
| 1668 | + XA_STATE(xas, &mapping->i_pages, index); |
---|
1434 | 1669 | |
---|
1435 | | - for (i = 0; i < max_scan; i++) { |
---|
1436 | | - struct page *page; |
---|
1437 | | - |
---|
1438 | | - page = radix_tree_lookup(&mapping->i_pages, index); |
---|
1439 | | - if (!page || radix_tree_exceptional_entry(page)) |
---|
| 1670 | + while (max_scan--) { |
---|
| 1671 | + void *entry = xas_prev(&xas); |
---|
| 1672 | + if (!entry || xa_is_value(entry)) |
---|
1440 | 1673 | break; |
---|
1441 | | - index--; |
---|
1442 | | - if (index == ULONG_MAX) |
---|
| 1674 | + if (xas.xa_index == ULONG_MAX) |
---|
1443 | 1675 | break; |
---|
1444 | 1676 | } |
---|
1445 | 1677 | |
---|
1446 | | - return index; |
---|
| 1678 | + return xas.xa_index; |
---|
1447 | 1679 | } |
---|
1448 | | -EXPORT_SYMBOL(page_cache_prev_hole); |
---|
| 1680 | +EXPORT_SYMBOL(page_cache_prev_miss); |
---|
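Likewise a sketch (not part of this diff), closely modelled on count_history_pages() in mm/readahead.c, showing the backwards variant: how many consecutive pages immediately before @index are already cached.

#include <linux/pagemap.h>

static pgoff_t cached_history(struct address_space *mapping,
			      pgoff_t index, unsigned long max)
{
	pgoff_t head;

	rcu_read_lock();
	head = page_cache_prev_miss(mapping, index - 1, max);
	rcu_read_unlock();

	/* Number of pages in [head + 1, index - 1] that are present. */
	return index - 1 - head;
}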
1449 | 1681 | |
---|
1450 | 1682 | /** |
---|
1451 | 1683 | * find_get_entry - find and get a page cache entry |
---|
1452 | 1684 | * @mapping: the address_space to search |
---|
1453 | | - * @offset: the page cache index |
---|
| 1685 | + * @index: The page cache index. |
---|
1454 | 1686 | * |
---|
1455 | 1687 | * Looks up the page cache slot at @mapping & @index. If there is a |
---|
1456 | | - * page cache page, it is returned with an increased refcount. |
---|
| 1688 | + * page cache page, the head page is returned with an increased refcount. |
---|
1457 | 1689 | * |
---|
1458 | 1690 | * If the slot holds a shadow entry of a previously evicted page, or a |
---|
1459 | 1691 | * swap entry from shmem/tmpfs, it is returned. |
---|
1460 | 1692 | * |
---|
1461 | | - * Otherwise, %NULL is returned. |
---|
| 1693 | + * Return: The head page or shadow entry, %NULL if nothing is found. |
---|
1462 | 1694 | */ |
---|
1463 | | -struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) |
---|
| 1695 | +struct page *find_get_entry(struct address_space *mapping, pgoff_t index) |
---|
1464 | 1696 | { |
---|
1465 | | - void **pagep; |
---|
1466 | | - struct page *head, *page; |
---|
| 1697 | + XA_STATE(xas, &mapping->i_pages, index); |
---|
| 1698 | + struct page *page; |
---|
1467 | 1699 | |
---|
1468 | 1700 | rcu_read_lock(); |
---|
1469 | 1701 | repeat: |
---|
1470 | | - page = NULL; |
---|
1471 | | - pagep = radix_tree_lookup_slot(&mapping->i_pages, offset); |
---|
1472 | | - if (pagep) { |
---|
1473 | | - page = radix_tree_deref_slot(pagep); |
---|
1474 | | - if (unlikely(!page)) |
---|
1475 | | - goto out; |
---|
1476 | | - if (radix_tree_exception(page)) { |
---|
1477 | | - if (radix_tree_deref_retry(page)) |
---|
1478 | | - goto repeat; |
---|
1479 | | - /* |
---|
1480 | | - * A shadow entry of a recently evicted page, |
---|
1481 | | - * or a swap entry from shmem/tmpfs. Return |
---|
1482 | | - * it without attempting to raise page count. |
---|
1483 | | - */ |
---|
1484 | | - goto out; |
---|
1485 | | - } |
---|
| 1702 | + xas_reset(&xas); |
---|
| 1703 | + page = xas_load(&xas); |
---|
| 1704 | + if (xas_retry(&xas, page)) |
---|
| 1705 | + goto repeat; |
---|
| 1706 | + /* |
---|
| 1707 | + * A shadow entry of a recently evicted page, or a swap entry from |
---|
| 1708 | + * shmem/tmpfs. Return it without attempting to raise page count. |
---|
| 1709 | + */ |
---|
| 1710 | + if (!page || xa_is_value(page)) |
---|
| 1711 | + goto out; |
---|
1486 | 1712 | |
---|
1487 | | - head = compound_head(page); |
---|
1488 | | - if (!page_cache_get_speculative(head)) |
---|
1489 | | - goto repeat; |
---|
| 1713 | + if (!page_cache_get_speculative(page)) |
---|
| 1714 | + goto repeat; |
---|
1490 | 1715 | |
---|
1491 | | - /* The page was split under us? */ |
---|
1492 | | - if (compound_head(page) != head) { |
---|
1493 | | - put_page(head); |
---|
1494 | | - goto repeat; |
---|
1495 | | - } |
---|
1496 | | - |
---|
1497 | | - /* |
---|
1498 | | - * Has the page moved? |
---|
1499 | | - * This is part of the lockless pagecache protocol. See |
---|
1500 | | - * include/linux/pagemap.h for details. |
---|
1501 | | - */ |
---|
1502 | | - if (unlikely(page != *pagep)) { |
---|
1503 | | - put_page(head); |
---|
1504 | | - goto repeat; |
---|
1505 | | - } |
---|
| 1716 | + /* |
---|
| 1717 | + * Has the page moved or been split? |
---|
| 1718 | + * This is part of the lockless pagecache protocol. See |
---|
| 1719 | + * include/linux/pagemap.h for details. |
---|
| 1720 | + */ |
---|
| 1721 | + if (unlikely(page != xas_reload(&xas))) { |
---|
| 1722 | + put_page(page); |
---|
| 1723 | + goto repeat; |
---|
1506 | 1724 | } |
---|
1507 | 1725 | out: |
---|
1508 | 1726 | rcu_read_unlock(); |
---|
1509 | 1727 | |
---|
1510 | 1728 | return page; |
---|
1511 | 1729 | } |
---|
1512 | | -EXPORT_SYMBOL(find_get_entry); |
---|
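As an aside (editor's sketch, not from this change): find_get_entry() may return a value entry, so callers filter with xa_is_value() before treating the result as a page. page_is_resident() below is a hypothetical in-kernel caller.

#include <linux/pagemap.h>

static bool page_is_resident(struct address_space *mapping, pgoff_t index)
{
	struct page *page = find_get_entry(mapping, index);

	/* Shadow/swap entries carry no refcount and are not pages. */
	if (!page || xa_is_value(page))
		return false;

	/* A real (head) page was returned with a reference held. */
	put_page(page);
	return true;
}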
1513 | 1730 | |
---|
1514 | 1731 | /** |
---|
1515 | | - * find_lock_entry - locate, pin and lock a page cache entry |
---|
1516 | | - * @mapping: the address_space to search |
---|
1517 | | - * @offset: the page cache index |
---|
| 1732 | + * find_lock_entry - Locate and lock a page cache entry. |
---|
| 1733 | + * @mapping: The address_space to search. |
---|
| 1734 | + * @index: The page cache index. |
---|
1518 | 1735 | * |
---|
1519 | | - * Looks up the page cache slot at @mapping & @offset. If there is a |
---|
1520 | | - * page cache page, it is returned locked and with an increased |
---|
1521 | | - * refcount. |
---|
| 1736 | + * Looks up the page at @mapping & @index. If there is a page in the |
---|
| 1737 | + * cache, the head page is returned locked and with an increased refcount. |
---|
1522 | 1738 | * |
---|
1523 | 1739 | * If the slot holds a shadow entry of a previously evicted page, or a |
---|
1524 | 1740 | * swap entry from shmem/tmpfs, it is returned. |
---|
1525 | 1741 | * |
---|
1526 | | - * Otherwise, %NULL is returned. |
---|
1527 | | - * |
---|
1528 | | - * find_lock_entry() may sleep. |
---|
| 1742 | + * Context: May sleep. |
---|
| 1743 | + * Return: The head page or shadow entry, %NULL if nothing is found. |
---|
1529 | 1744 | */ |
---|
1530 | | -struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) |
---|
| 1745 | +struct page *find_lock_entry(struct address_space *mapping, pgoff_t index) |
---|
1531 | 1746 | { |
---|
1532 | 1747 | struct page *page; |
---|
1533 | 1748 | |
---|
1534 | 1749 | repeat: |
---|
1535 | | - page = find_get_entry(mapping, offset); |
---|
1536 | | - if (page && !radix_tree_exception(page)) { |
---|
| 1750 | + page = find_get_entry(mapping, index); |
---|
| 1751 | + if (page && !xa_is_value(page)) { |
---|
1537 | 1752 | lock_page(page); |
---|
1538 | 1753 | /* Has the page been truncated? */ |
---|
1539 | | - if (unlikely(page_mapping(page) != mapping)) { |
---|
| 1754 | + if (unlikely(page->mapping != mapping)) { |
---|
1540 | 1755 | unlock_page(page); |
---|
1541 | 1756 | put_page(page); |
---|
1542 | 1757 | goto repeat; |
---|
1543 | 1758 | } |
---|
1544 | | - VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); |
---|
| 1759 | + VM_BUG_ON_PAGE(!thp_contains(page, index), page); |
---|
1545 | 1760 | } |
---|
1546 | 1761 | return page; |
---|
1547 | 1762 | } |
---|
1548 | | -EXPORT_SYMBOL(find_lock_entry); |
---|
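A matching sketch for the locked variant (hypothetical helper, not part of this change): the page comes back locked and referenced, so both unlock_page() and put_page() are owed on the success path.

#include <linux/mm.h>
#include <linux/pagemap.h>

static void dirty_cached_page(struct address_space *mapping, pgoff_t index)
{
	struct page *page = find_lock_entry(mapping, index);

	if (!page || xa_is_value(page))
		return;		/* absent, or only a shadow/swap entry */

	set_page_dirty(page);
	unlock_page(page);
	put_page(page);
}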
1549 | 1763 | |
---|
1550 | 1764 | /** |
---|
1551 | | - * pagecache_get_page - find and get a page reference |
---|
1552 | | - * @mapping: the address_space to search |
---|
1553 | | - * @offset: the page index |
---|
1554 | | - * @fgp_flags: PCG flags |
---|
1555 | | - * @gfp_mask: gfp mask to use for the page cache data page allocation |
---|
| 1765 | + * pagecache_get_page - Find and get a reference to a page. |
---|
| 1766 | + * @mapping: The address_space to search. |
---|
| 1767 | + * @index: The page index. |
---|
| 1768 | + * @fgp_flags: %FGP flags modify how the page is returned. |
---|
| 1769 | + * @gfp_mask: Memory allocation flags to use if %FGP_CREAT is specified. |
---|
1556 | 1770 | * |
---|
1557 | | - * Looks up the page cache slot at @mapping & @offset. |
---|
| 1771 | + * Looks up the page cache entry at @mapping & @index. |
---|
1558 | 1772 | * |
---|
1559 | | - * PCG flags modify how the page is returned. |
---|
| 1773 | + * @fgp_flags can be zero or more of these flags: |
---|
1560 | 1774 | * |
---|
1561 | | - * @fgp_flags can be: |
---|
| 1775 | + * * %FGP_ACCESSED - The page will be marked accessed. |
---|
| 1776 | + * * %FGP_LOCK - The page is returned locked. |
---|
| 1777 | + * * %FGP_HEAD - If the page is present and a THP, return the head page |
---|
| 1778 | + * rather than the exact page specified by the index. |
---|
| 1779 | + * * %FGP_CREAT - If no page is present then a new page is allocated using |
---|
| 1780 | + * @gfp_mask and added to the page cache and the VM's LRU list. |
---|
| 1781 | + * The page is returned locked and with an increased refcount. |
---|
| 1782 | + * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the |
---|
| 1783 | + * page is already in cache. If the page was allocated, unlock it before |
---|
| 1784 | + * returning so the caller can do the same dance. |
---|
| 1785 | + * * %FGP_WRITE - The page will be written |
---|
| 1786 | + * * %FGP_NOFS - __GFP_FS will get cleared in gfp mask |
---|
| 1787 | + * * %FGP_NOWAIT - Don't get blocked by page lock |
---|
1562 | 1788 | * |
---|
1563 | | - * - FGP_ACCESSED: the page will be marked accessed |
---|
1564 | | - * - FGP_LOCK: Page is return locked |
---|
1565 | | - * - FGP_CREAT: If page is not present then a new page is allocated using |
---|
1566 | | - * @gfp_mask and added to the page cache and the VM's LRU |
---|
1567 | | - * list. The page is returned locked and with an increased |
---|
1568 | | - * refcount. |
---|
1569 | | - * - FGP_FOR_MMAP: Similar to FGP_CREAT, only we want to allow the caller to do |
---|
1570 | | - * its own locking dance if the page is already in cache, or unlock the page |
---|
1571 | | - * before returning if we had to add the page to pagecache. |
---|
1572 | | - * |
---|
1573 | | - * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even |
---|
1574 | | - * if the GFP flags specified for FGP_CREAT are atomic. |
---|
| 1789 | + * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even |
---|
| 1790 | + * if the %GFP flags specified for %FGP_CREAT are atomic. |
---|
1575 | 1791 | * |
---|
1576 | 1792 | * If there is a page cache page, it is returned with an increased refcount. |
---|
| 1793 | + * |
---|
| 1794 | + * Return: The found page or %NULL otherwise. |
---|
1577 | 1795 | */ |
---|
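To make the flag combinations documented above concrete, here is an editor's sketch of a write-path lookup, loosely modelled on grab_cache_page_write_begin(); get_page_for_write() and its caller are hypothetical.

#include <linux/pagemap.h>

static struct page *get_page_for_write(struct address_space *mapping,
				       loff_t pos)
{
	pgoff_t index = pos >> PAGE_SHIFT;
	int fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_ACCESSED;

	/* Returns the page locked and referenced, or NULL on allocation failure. */
	return pagecache_get_page(mapping, index, fgp,
				  mapping_gfp_mask(mapping));
}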
1578 | | -struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, |
---|
1579 | | - int fgp_flags, gfp_t gfp_mask) |
---|
| 1796 | +struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, |
---|
| 1797 | + int fgp_flags, gfp_t gfp_mask) |
---|
1580 | 1798 | { |
---|
1581 | 1799 | struct page *page; |
---|
1582 | 1800 | |
---|
1583 | 1801 | repeat: |
---|
1584 | | - page = find_get_entry(mapping, offset); |
---|
1585 | | - if (radix_tree_exceptional_entry(page)) |
---|
| 1802 | + page = find_get_entry(mapping, index); |
---|
| 1803 | + if (xa_is_value(page)) |
---|
1586 | 1804 | page = NULL; |
---|
| 1805 | + |
---|
| 1806 | + trace_android_vh_pagecache_get_page(mapping, index, fgp_flags, |
---|
| 1807 | + gfp_mask, page); |
---|
1587 | 1808 | if (!page) |
---|
1588 | 1809 | goto no_page; |
---|
1589 | 1810 | |
---|
.. | .. |
---|
1603 | 1824 | put_page(page); |
---|
1604 | 1825 | goto repeat; |
---|
1605 | 1826 | } |
---|
1606 | | - VM_BUG_ON_PAGE(page->index != offset, page); |
---|
| 1827 | + VM_BUG_ON_PAGE(!thp_contains(page, index), page); |
---|
1607 | 1828 | } |
---|
1608 | 1829 | |
---|
1609 | | - if (page && (fgp_flags & FGP_ACCESSED)) |
---|
| 1830 | + if (fgp_flags & FGP_ACCESSED) |
---|
1610 | 1831 | mark_page_accessed(page); |
---|
| 1832 | + else if (fgp_flags & FGP_WRITE) { |
---|
| 1833 | + /* Clear idle flag for buffer write */ |
---|
| 1834 | + if (page_is_idle(page)) |
---|
| 1835 | + clear_page_idle(page); |
---|
| 1836 | + } |
---|
| 1837 | + if (!(fgp_flags & FGP_HEAD)) |
---|
| 1838 | + page = find_subpage(page, index); |
---|
1611 | 1839 | |
---|
1612 | 1840 | no_page: |
---|
1613 | 1841 | if (!page && (fgp_flags & FGP_CREAT)) { |
---|
1614 | 1842 | int err; |
---|
1615 | | - if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping)) |
---|
| 1843 | + if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping)) |
---|
1616 | 1844 | gfp_mask |= __GFP_WRITE; |
---|
1617 | 1845 | if (fgp_flags & FGP_NOFS) |
---|
1618 | 1846 | gfp_mask &= ~__GFP_FS; |
---|
.. | .. |
---|
1628 | 1856 | if (fgp_flags & FGP_ACCESSED) |
---|
1629 | 1857 | __SetPageReferenced(page); |
---|
1630 | 1858 | |
---|
1631 | | - err = add_to_page_cache_lru(page, mapping, offset, gfp_mask); |
---|
| 1859 | + err = add_to_page_cache_lru(page, mapping, index, gfp_mask); |
---|
1632 | 1860 | if (unlikely(err)) { |
---|
1633 | 1861 | put_page(page); |
---|
1634 | 1862 | page = NULL; |
---|
.. | .. |
---|
1668 | 1896 | * Any shadow entries of evicted pages, or swap entries from |
---|
1669 | 1897 | * shmem/tmpfs, are included in the returned array. |
---|
1670 | 1898 | * |
---|
1671 | | - * find_get_entries() returns the number of pages and shadow entries |
---|
1672 | | - * which were found. |
---|
| 1899 | + * If it finds a Transparent Huge Page, head or tail, find_get_entries() |
---|
| 1900 | + * stops at that page: the caller is likely to have a better way to handle |
---|
| 1901 | + * the compound page as a whole, and then skip its extent, than repeatedly |
---|
| 1902 | + * calling find_get_entries() to return all its tails. |
---|
| 1903 | + * |
---|
| 1904 | + * Return: the number of pages and shadow entries which were found. |
---|
1673 | 1905 | */ |
---|
1674 | 1906 | unsigned find_get_entries(struct address_space *mapping, |
---|
1675 | 1907 | pgoff_t start, unsigned int nr_entries, |
---|
1676 | 1908 | struct page **entries, pgoff_t *indices) |
---|
1677 | 1909 | { |
---|
1678 | | - void **slot; |
---|
| 1910 | + XA_STATE(xas, &mapping->i_pages, start); |
---|
| 1911 | + struct page *page; |
---|
1679 | 1912 | unsigned int ret = 0; |
---|
1680 | | - struct radix_tree_iter iter; |
---|
1681 | 1913 | |
---|
1682 | 1914 | if (!nr_entries) |
---|
1683 | 1915 | return 0; |
---|
1684 | 1916 | |
---|
1685 | 1917 | rcu_read_lock(); |
---|
1686 | | - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { |
---|
1687 | | - struct page *head, *page; |
---|
1688 | | -repeat: |
---|
1689 | | - page = radix_tree_deref_slot(slot); |
---|
1690 | | - if (unlikely(!page)) |
---|
| 1918 | + xas_for_each(&xas, page, ULONG_MAX) { |
---|
| 1919 | + if (xas_retry(&xas, page)) |
---|
1691 | 1920 | continue; |
---|
1692 | | - if (radix_tree_exception(page)) { |
---|
1693 | | - if (radix_tree_deref_retry(page)) { |
---|
1694 | | - slot = radix_tree_iter_retry(&iter); |
---|
1695 | | - continue; |
---|
1696 | | - } |
---|
1697 | | - /* |
---|
1698 | | - * A shadow entry of a recently evicted page, a swap |
---|
1699 | | - * entry from shmem/tmpfs or a DAX entry. Return it |
---|
1700 | | - * without attempting to raise page count. |
---|
1701 | | - */ |
---|
| 1921 | + /* |
---|
| 1922 | + * A shadow entry of a recently evicted page, a swap |
---|
| 1923 | + * entry from shmem/tmpfs or a DAX entry. Return it |
---|
| 1924 | + * without attempting to raise page count. |
---|
| 1925 | + */ |
---|
| 1926 | + if (xa_is_value(page)) |
---|
1702 | 1927 | goto export; |
---|
1703 | | - } |
---|
1704 | 1928 | |
---|
1705 | | - head = compound_head(page); |
---|
1706 | | - if (!page_cache_get_speculative(head)) |
---|
1707 | | - goto repeat; |
---|
| 1929 | + if (!page_cache_get_speculative(page)) |
---|
| 1930 | + goto retry; |
---|
1708 | 1931 | |
---|
1709 | | - /* The page was split under us? */ |
---|
1710 | | - if (compound_head(page) != head) { |
---|
1711 | | - put_page(head); |
---|
1712 | | - goto repeat; |
---|
1713 | | - } |
---|
| 1932 | + /* Has the page moved or been split? */ |
---|
| 1933 | + if (unlikely(page != xas_reload(&xas))) |
---|
| 1934 | + goto put_page; |
---|
1714 | 1935 | |
---|
1715 | | - /* Has the page moved? */ |
---|
1716 | | - if (unlikely(page != *slot)) { |
---|
1717 | | - put_page(head); |
---|
1718 | | - goto repeat; |
---|
| 1936 | + /* |
---|
| 1937 | + * Terminate early on finding a THP, to allow the caller to |
---|
| 1938 | + * handle it all at once; but continue if this is hugetlbfs. |
---|
| 1939 | + */ |
---|
| 1940 | + if (PageTransHuge(page) && !PageHuge(page)) { |
---|
| 1941 | + page = find_subpage(page, xas.xa_index); |
---|
| 1942 | + nr_entries = ret + 1; |
---|
1719 | 1943 | } |
---|
1720 | 1944 | export: |
---|
1721 | | - indices[ret] = iter.index; |
---|
| 1945 | + indices[ret] = xas.xa_index; |
---|
1722 | 1946 | entries[ret] = page; |
---|
1723 | 1947 | if (++ret == nr_entries) |
---|
1724 | 1948 | break; |
---|
| 1949 | + continue; |
---|
| 1950 | +put_page: |
---|
| 1951 | + put_page(page); |
---|
| 1952 | +retry: |
---|
| 1953 | + xas_reset(&xas); |
---|
1725 | 1954 | } |
---|
1726 | 1955 | rcu_read_unlock(); |
---|
1727 | 1956 | return ret; |
---|
.. | .. |
---|
1744 | 1973 | * indexes. There may be holes in the indices due to not-present pages. |
---|
1745 | 1974 | * We also update @start to index the next page for the traversal. |
---|
1746 | 1975 | * |
---|
1747 | | - * find_get_pages_range() returns the number of pages which were found. If this |
---|
1748 | | - * number is smaller than @nr_pages, the end of specified range has been |
---|
| 1976 | + * Return: the number of pages which were found. If this number is |
---|
| 1977 | + * smaller than @nr_pages, the end of specified range has been |
---|
1749 | 1978 | * reached. |
---|
1750 | 1979 | */ |
---|
1751 | 1980 | unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, |
---|
1752 | 1981 | pgoff_t end, unsigned int nr_pages, |
---|
1753 | 1982 | struct page **pages) |
---|
1754 | 1983 | { |
---|
1755 | | - struct radix_tree_iter iter; |
---|
1756 | | - void **slot; |
---|
| 1984 | + XA_STATE(xas, &mapping->i_pages, *start); |
---|
| 1985 | + struct page *page; |
---|
1757 | 1986 | unsigned ret = 0; |
---|
1758 | 1987 | |
---|
1759 | 1988 | if (unlikely(!nr_pages)) |
---|
1760 | 1989 | return 0; |
---|
1761 | 1990 | |
---|
1762 | 1991 | rcu_read_lock(); |
---|
1763 | | - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, *start) { |
---|
1764 | | - struct page *head, *page; |
---|
1765 | | - |
---|
1766 | | - if (iter.index > end) |
---|
1767 | | - break; |
---|
1768 | | -repeat: |
---|
1769 | | - page = radix_tree_deref_slot(slot); |
---|
1770 | | - if (unlikely(!page)) |
---|
| 1992 | + xas_for_each(&xas, page, end) { |
---|
| 1993 | + if (xas_retry(&xas, page)) |
---|
| 1994 | + continue; |
---|
| 1995 | + /* Skip over shadow, swap and DAX entries */ |
---|
| 1996 | + if (xa_is_value(page)) |
---|
1771 | 1997 | continue; |
---|
1772 | 1998 | |
---|
1773 | | - if (radix_tree_exception(page)) { |
---|
1774 | | - if (radix_tree_deref_retry(page)) { |
---|
1775 | | - slot = radix_tree_iter_retry(&iter); |
---|
1776 | | - continue; |
---|
1777 | | - } |
---|
1778 | | - /* |
---|
1779 | | - * A shadow entry of a recently evicted page, |
---|
1780 | | - * or a swap entry from shmem/tmpfs. Skip |
---|
1781 | | - * over it. |
---|
1782 | | - */ |
---|
1783 | | - continue; |
---|
1784 | | - } |
---|
| 1999 | + if (!page_cache_get_speculative(page)) |
---|
| 2000 | + goto retry; |
---|
1785 | 2001 | |
---|
1786 | | - head = compound_head(page); |
---|
1787 | | - if (!page_cache_get_speculative(head)) |
---|
1788 | | - goto repeat; |
---|
| 2002 | + /* Has the page moved or been split? */ |
---|
| 2003 | + if (unlikely(page != xas_reload(&xas))) |
---|
| 2004 | + goto put_page; |
---|
1789 | 2005 | |
---|
1790 | | - /* The page was split under us? */ |
---|
1791 | | - if (compound_head(page) != head) { |
---|
1792 | | - put_page(head); |
---|
1793 | | - goto repeat; |
---|
1794 | | - } |
---|
1795 | | - |
---|
1796 | | - /* Has the page moved? */ |
---|
1797 | | - if (unlikely(page != *slot)) { |
---|
1798 | | - put_page(head); |
---|
1799 | | - goto repeat; |
---|
1800 | | - } |
---|
1801 | | - |
---|
1802 | | - pages[ret] = page; |
---|
| 2006 | + pages[ret] = find_subpage(page, xas.xa_index); |
---|
1803 | 2007 | if (++ret == nr_pages) { |
---|
1804 | | - *start = pages[ret - 1]->index + 1; |
---|
| 2008 | + *start = xas.xa_index + 1; |
---|
1805 | 2009 | goto out; |
---|
1806 | 2010 | } |
---|
| 2011 | + continue; |
---|
| 2012 | +put_page: |
---|
| 2013 | + put_page(page); |
---|
| 2014 | +retry: |
---|
| 2015 | + xas_reset(&xas); |
---|
1807 | 2016 | } |
---|
1808 | 2017 | |
---|
1809 | 2018 | /* |
---|
1810 | 2019 | * We come here when there is no page beyond @end. We take care to not |
---|
1811 | 2020 | * overflow the index @start as it confuses some of the callers. This |
---|
1812 | | - * breaks the iteration when there is page at index -1 but that is |
---|
| 2021 | + * breaks the iteration when there is a page at index -1 but that is |
---|
1813 | 2022 | * already broken anyway. |
---|
1814 | 2023 | */ |
---|
1815 | 2024 | if (end == (pgoff_t)-1) |
---|
.. | .. |
---|
1832 | 2041 | * find_get_pages_contig() works exactly like find_get_pages(), except |
---|
1833 | 2042 | * that the returned number of pages are guaranteed to be contiguous. |
---|
1834 | 2043 | * |
---|
1835 | | - * find_get_pages_contig() returns the number of pages which were found. |
---|
| 2044 | + * Return: the number of pages which were found. |
---|
1836 | 2045 | */ |
---|
1837 | 2046 | unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, |
---|
1838 | 2047 | unsigned int nr_pages, struct page **pages) |
---|
1839 | 2048 | { |
---|
1840 | | - struct radix_tree_iter iter; |
---|
1841 | | - void **slot; |
---|
| 2049 | + XA_STATE(xas, &mapping->i_pages, index); |
---|
| 2050 | + struct page *page; |
---|
1842 | 2051 | unsigned int ret = 0; |
---|
1843 | 2052 | |
---|
1844 | 2053 | if (unlikely(!nr_pages)) |
---|
1845 | 2054 | return 0; |
---|
1846 | 2055 | |
---|
1847 | 2056 | rcu_read_lock(); |
---|
1848 | | - radix_tree_for_each_contig(slot, &mapping->i_pages, &iter, index) { |
---|
1849 | | - struct page *head, *page; |
---|
1850 | | -repeat: |
---|
1851 | | - page = radix_tree_deref_slot(slot); |
---|
1852 | | - /* The hole, there no reason to continue */ |
---|
1853 | | - if (unlikely(!page)) |
---|
1854 | | - break; |
---|
1855 | | - |
---|
1856 | | - if (radix_tree_exception(page)) { |
---|
1857 | | - if (radix_tree_deref_retry(page)) { |
---|
1858 | | - slot = radix_tree_iter_retry(&iter); |
---|
1859 | | - continue; |
---|
1860 | | - } |
---|
1861 | | - /* |
---|
1862 | | - * A shadow entry of a recently evicted page, |
---|
1863 | | - * or a swap entry from shmem/tmpfs. Stop |
---|
1864 | | - * looking for contiguous pages. |
---|
1865 | | - */ |
---|
1866 | | - break; |
---|
1867 | | - } |
---|
1868 | | - |
---|
1869 | | - head = compound_head(page); |
---|
1870 | | - if (!page_cache_get_speculative(head)) |
---|
1871 | | - goto repeat; |
---|
1872 | | - |
---|
1873 | | - /* The page was split under us? */ |
---|
1874 | | - if (compound_head(page) != head) { |
---|
1875 | | - put_page(head); |
---|
1876 | | - goto repeat; |
---|
1877 | | - } |
---|
1878 | | - |
---|
1879 | | - /* Has the page moved? */ |
---|
1880 | | - if (unlikely(page != *slot)) { |
---|
1881 | | - put_page(head); |
---|
1882 | | - goto repeat; |
---|
1883 | | - } |
---|
1884 | | - |
---|
| 2057 | + for (page = xas_load(&xas); page; page = xas_next(&xas)) { |
---|
| 2058 | + if (xas_retry(&xas, page)) |
---|
| 2059 | + continue; |
---|
1885 | 2060 | /* |
---|
1886 | | - * must check mapping and index after taking the ref. |
---|
1887 | | - * otherwise we can get both false positives and false |
---|
1888 | | - * negatives, which is just confusing to the caller. |
---|
| 2061 | + * If the entry has been swapped out, we can stop looking. |
---|
| 2062 | + * No current caller is looking for DAX entries. |
---|
1889 | 2063 | */ |
---|
1890 | | - if (page->mapping == NULL || page_to_pgoff(page) != iter.index) { |
---|
1891 | | - put_page(page); |
---|
| 2064 | + if (xa_is_value(page)) |
---|
1892 | 2065 | break; |
---|
1893 | | - } |
---|
1894 | 2066 | |
---|
1895 | | - pages[ret] = page; |
---|
| 2067 | + if (!page_cache_get_speculative(page)) |
---|
| 2068 | + goto retry; |
---|
| 2069 | + |
---|
| 2070 | + /* Has the page moved or been split? */ |
---|
| 2071 | + if (unlikely(page != xas_reload(&xas))) |
---|
| 2072 | + goto put_page; |
---|
| 2073 | + |
---|
| 2074 | + pages[ret] = find_subpage(page, xas.xa_index); |
---|
1896 | 2075 | if (++ret == nr_pages) |
---|
1897 | 2076 | break; |
---|
| 2077 | + continue; |
---|
| 2078 | +put_page: |
---|
| 2079 | + put_page(page); |
---|
| 2080 | +retry: |
---|
| 2081 | + xas_reset(&xas); |
---|
1898 | 2082 | } |
---|
1899 | 2083 | rcu_read_unlock(); |
---|
1900 | 2084 | return ret; |
---|
.. | .. |
---|
1912 | 2096 | * |
---|
1913 | 2097 | * Like find_get_pages, except we only return pages which are tagged with |
---|
1914 | 2098 | * @tag. We update @index to index the next page for the traversal. |
---|
| 2099 | + * |
---|
| 2100 | + * Return: the number of pages which were found. |
---|
1915 | 2101 | */ |
---|
1916 | 2102 | unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, |
---|
1917 | | - pgoff_t end, int tag, unsigned int nr_pages, |
---|
| 2103 | + pgoff_t end, xa_mark_t tag, unsigned int nr_pages, |
---|
1918 | 2104 | struct page **pages) |
---|
1919 | 2105 | { |
---|
1920 | | - struct radix_tree_iter iter; |
---|
1921 | | - void **slot; |
---|
| 2106 | + XA_STATE(xas, &mapping->i_pages, *index); |
---|
| 2107 | + struct page *page; |
---|
1922 | 2108 | unsigned ret = 0; |
---|
1923 | 2109 | |
---|
1924 | 2110 | if (unlikely(!nr_pages)) |
---|
1925 | 2111 | return 0; |
---|
1926 | 2112 | |
---|
1927 | 2113 | rcu_read_lock(); |
---|
1928 | | - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, *index, tag) { |
---|
1929 | | - struct page *head, *page; |
---|
1930 | | - |
---|
1931 | | - if (iter.index > end) |
---|
1932 | | - break; |
---|
1933 | | -repeat: |
---|
1934 | | - page = radix_tree_deref_slot(slot); |
---|
1935 | | - if (unlikely(!page)) |
---|
| 2114 | + xas_for_each_marked(&xas, page, end, tag) { |
---|
| 2115 | + if (xas_retry(&xas, page)) |
---|
| 2116 | + continue; |
---|
| 2117 | + /* |
---|
| 2118 | + * Shadow entries should never be tagged, but this iteration |
---|
| 2119 | + * is lockless so there is a window for page reclaim to evict |
---|
| 2120 | + * a page we saw tagged. Skip over it. |
---|
| 2121 | + */ |
---|
| 2122 | + if (xa_is_value(page)) |
---|
1936 | 2123 | continue; |
---|
1937 | 2124 | |
---|
1938 | | - if (radix_tree_exception(page)) { |
---|
1939 | | - if (radix_tree_deref_retry(page)) { |
---|
1940 | | - slot = radix_tree_iter_retry(&iter); |
---|
1941 | | - continue; |
---|
1942 | | - } |
---|
1943 | | - /* |
---|
1944 | | - * A shadow entry of a recently evicted page. |
---|
1945 | | - * |
---|
1946 | | - * Those entries should never be tagged, but |
---|
1947 | | - * this tree walk is lockless and the tags are |
---|
1948 | | - * looked up in bulk, one radix tree node at a |
---|
1949 | | - * time, so there is a sizable window for page |
---|
1950 | | - * reclaim to evict a page we saw tagged. |
---|
1951 | | - * |
---|
1952 | | - * Skip over it. |
---|
1953 | | - */ |
---|
1954 | | - continue; |
---|
1955 | | - } |
---|
| 2125 | + if (!page_cache_get_speculative(page)) |
---|
| 2126 | + goto retry; |
---|
1956 | 2127 | |
---|
1957 | | - head = compound_head(page); |
---|
1958 | | - if (!page_cache_get_speculative(head)) |
---|
1959 | | - goto repeat; |
---|
| 2128 | + /* Has the page moved or been split? */ |
---|
| 2129 | + if (unlikely(page != xas_reload(&xas))) |
---|
| 2130 | + goto put_page; |
---|
1960 | 2131 | |
---|
1961 | | - /* The page was split under us? */ |
---|
1962 | | - if (compound_head(page) != head) { |
---|
1963 | | - put_page(head); |
---|
1964 | | - goto repeat; |
---|
1965 | | - } |
---|
1966 | | - |
---|
1967 | | - /* Has the page moved? */ |
---|
1968 | | - if (unlikely(page != *slot)) { |
---|
1969 | | - put_page(head); |
---|
1970 | | - goto repeat; |
---|
1971 | | - } |
---|
1972 | | - |
---|
1973 | | - pages[ret] = page; |
---|
| 2132 | + pages[ret] = find_subpage(page, xas.xa_index); |
---|
1974 | 2133 | if (++ret == nr_pages) { |
---|
1975 | | - *index = pages[ret - 1]->index + 1; |
---|
| 2134 | + *index = xas.xa_index + 1; |
---|
1976 | 2135 | goto out; |
---|
1977 | 2136 | } |
---|
| 2137 | + continue; |
---|
| 2138 | +put_page: |
---|
| 2139 | + put_page(page); |
---|
| 2140 | +retry: |
---|
| 2141 | + xas_reset(&xas); |
---|
1978 | 2142 | } |
---|
1979 | 2143 | |
---|
1980 | 2144 | /* |
---|
1981 | | - * We come here when we got at @end. We take care to not overflow the |
---|
| 2145 | + * We come here when we got to @end. We take care to not overflow the |
---|
1982 | 2146 | * index @index as it confuses some of the callers. This breaks the |
---|
1983 | | - * iteration when there is page at index -1 but that is already broken |
---|
1984 | | - * anyway. |
---|
| 2147 | + * iteration when there is a page at index -1 but that is already |
---|
| 2148 | + * broken anyway. |
---|
1985 | 2149 | */ |
---|
1986 | 2150 | if (end == (pgoff_t)-1) |
---|
1987 | 2151 | *index = (pgoff_t)-1; |
---|
.. | .. |
---|
1993 | 2157 | return ret; |
---|
1994 | 2158 | } |
---|
1995 | 2159 | EXPORT_SYMBOL(find_get_pages_range_tag); |
---|
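For context, a sketch of the typical tagged-lookup loop (in the spirit of write_cache_pages()); walk_dirty_pages() and its process_page() callback are hypothetical stand-ins.

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched.h>

static void walk_dirty_pages(struct address_space *mapping,
			     pgoff_t start, pgoff_t end,
			     void (*process_page)(struct page *))
{
	struct page *pages[PAGEVEC_SIZE];
	unsigned int i, nr;

	while ((nr = find_get_pages_range_tag(mapping, &start, end,
					      PAGECACHE_TAG_DIRTY,
					      PAGEVEC_SIZE, pages))) {
		for (i = 0; i < nr; i++) {
			process_page(pages[i]);	/* e.g. lock + writepage */
			put_page(pages[i]);	/* drop the lookup reference */
		}
		cond_resched();
	}
}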
1996 | | - |
---|
1997 | | -/** |
---|
1998 | | - * find_get_entries_tag - find and return entries that match @tag |
---|
1999 | | - * @mapping: the address_space to search |
---|
2000 | | - * @start: the starting page cache index |
---|
2001 | | - * @tag: the tag index |
---|
2002 | | - * @nr_entries: the maximum number of entries |
---|
2003 | | - * @entries: where the resulting entries are placed |
---|
2004 | | - * @indices: the cache indices corresponding to the entries in @entries |
---|
2005 | | - * |
---|
2006 | | - * Like find_get_entries, except we only return entries which are tagged with |
---|
2007 | | - * @tag. |
---|
2008 | | - */ |
---|
2009 | | -unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, |
---|
2010 | | - int tag, unsigned int nr_entries, |
---|
2011 | | - struct page **entries, pgoff_t *indices) |
---|
2012 | | -{ |
---|
2013 | | - void **slot; |
---|
2014 | | - unsigned int ret = 0; |
---|
2015 | | - struct radix_tree_iter iter; |
---|
2016 | | - |
---|
2017 | | - if (!nr_entries) |
---|
2018 | | - return 0; |
---|
2019 | | - |
---|
2020 | | - rcu_read_lock(); |
---|
2021 | | - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, tag) { |
---|
2022 | | - struct page *head, *page; |
---|
2023 | | -repeat: |
---|
2024 | | - page = radix_tree_deref_slot(slot); |
---|
2025 | | - if (unlikely(!page)) |
---|
2026 | | - continue; |
---|
2027 | | - if (radix_tree_exception(page)) { |
---|
2028 | | - if (radix_tree_deref_retry(page)) { |
---|
2029 | | - slot = radix_tree_iter_retry(&iter); |
---|
2030 | | - continue; |
---|
2031 | | - } |
---|
2032 | | - |
---|
2033 | | - /* |
---|
2034 | | - * A shadow entry of a recently evicted page, a swap |
---|
2035 | | - * entry from shmem/tmpfs or a DAX entry. Return it |
---|
2036 | | - * without attempting to raise page count. |
---|
2037 | | - */ |
---|
2038 | | - goto export; |
---|
2039 | | - } |
---|
2040 | | - |
---|
2041 | | - head = compound_head(page); |
---|
2042 | | - if (!page_cache_get_speculative(head)) |
---|
2043 | | - goto repeat; |
---|
2044 | | - |
---|
2045 | | - /* The page was split under us? */ |
---|
2046 | | - if (compound_head(page) != head) { |
---|
2047 | | - put_page(head); |
---|
2048 | | - goto repeat; |
---|
2049 | | - } |
---|
2050 | | - |
---|
2051 | | - /* Has the page moved? */ |
---|
2052 | | - if (unlikely(page != *slot)) { |
---|
2053 | | - put_page(head); |
---|
2054 | | - goto repeat; |
---|
2055 | | - } |
---|
2056 | | -export: |
---|
2057 | | - indices[ret] = iter.index; |
---|
2058 | | - entries[ret] = page; |
---|
2059 | | - if (++ret == nr_entries) |
---|
2060 | | - break; |
---|
2061 | | - } |
---|
2062 | | - rcu_read_unlock(); |
---|
2063 | | - return ret; |
---|
2064 | | -} |
---|
2065 | | -EXPORT_SYMBOL(find_get_entries_tag); |
---|
2066 | 2160 | |
---|
2067 | 2161 | /* |
---|
2068 | 2162 | * CD/DVDs are error prone. When a medium error occurs, the driver may fail |
---|
.. | .. |
---|
2079 | 2173 | * |
---|
2080 | 2174 | * It is going insane. Fix it by quickly scaling down the readahead size. |
---|
2081 | 2175 | */ |
---|
2082 | | -static void shrink_readahead_size_eio(struct file *filp, |
---|
2083 | | - struct file_ra_state *ra) |
---|
| 2176 | +static void shrink_readahead_size_eio(struct file_ra_state *ra) |
---|
2084 | 2177 | { |
---|
2085 | 2178 | ra->ra_pages /= 4; |
---|
2086 | 2179 | } |
---|
.. | .. |
---|
2096 | 2189 | * |
---|
2097 | 2190 | * This is really ugly. But the goto's actually try to clarify some |
---|
2098 | 2191 | * of the logic when it comes to error handling etc. |
---|
| 2192 | + * |
---|
| 2193 | + * Return: |
---|
| 2194 | + * * total number of bytes copied, including those that were already @written |
---|
| 2195 | + * * negative error code if nothing was copied |
---|
2099 | 2196 | */ |
---|
2100 | | -static ssize_t generic_file_buffered_read(struct kiocb *iocb, |
---|
| 2197 | +ssize_t generic_file_buffered_read(struct kiocb *iocb, |
---|
2101 | 2198 | struct iov_iter *iter, ssize_t written) |
---|
2102 | 2199 | { |
---|
2103 | 2200 | struct file *filp = iocb->ki_filp; |
---|
.. | .. |
---|
2114 | 2211 | |
---|
2115 | 2212 | if (unlikely(*ppos >= inode->i_sb->s_maxbytes)) |
---|
2116 | 2213 | return 0; |
---|
| 2214 | + if (unlikely(!iov_iter_count(iter))) |
---|
| 2215 | + return 0; |
---|
| 2216 | + |
---|
2117 | 2217 | iov_iter_truncate(iter, inode->i_sb->s_maxbytes); |
---|
2118 | 2218 | |
---|
2119 | 2219 | index = *ppos >> PAGE_SHIFT; |
---|
.. | .. |
---|
2121 | 2221 | prev_offset = ra->prev_pos & (PAGE_SIZE-1); |
---|
2122 | 2222 | last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; |
---|
2123 | 2223 | offset = *ppos & ~PAGE_MASK; |
---|
| 2224 | + |
---|
| 2225 | + /* |
---|
| 2226 | + * If we've already successfully copied some data, then we |
---|
| 2227 | + * can no longer safely return -EIOCBQUEUED. Hence mark |
---|
| 2228 | + * an async read NOWAIT at that point. |
---|
| 2229 | + */ |
---|
| 2230 | + if (written && (iocb->ki_flags & IOCB_WAITQ)) |
---|
| 2231 | + iocb->ki_flags |= IOCB_NOWAIT; |
---|
2124 | 2232 | |
---|
2125 | 2233 | for (;;) { |
---|
2126 | 2234 | struct page *page; |
---|
.. | .. |
---|
2137 | 2245 | |
---|
2138 | 2246 | page = find_get_page(mapping, index); |
---|
2139 | 2247 | if (!page) { |
---|
2140 | | - if (iocb->ki_flags & IOCB_NOWAIT) |
---|
| 2248 | + if (iocb->ki_flags & IOCB_NOIO) |
---|
2141 | 2249 | goto would_block; |
---|
2142 | 2250 | page_cache_sync_readahead(mapping, |
---|
2143 | 2251 | ra, filp, |
---|
.. | .. |
---|
2147 | 2255 | goto no_cached_page; |
---|
2148 | 2256 | } |
---|
2149 | 2257 | if (PageReadahead(page)) { |
---|
| 2258 | + if (iocb->ki_flags & IOCB_NOIO) { |
---|
| 2259 | + put_page(page); |
---|
| 2260 | + goto out; |
---|
| 2261 | + } |
---|
2150 | 2262 | page_cache_async_readahead(mapping, |
---|
2151 | 2263 | ra, filp, page, |
---|
2152 | 2264 | index, last_index - index); |
---|
2153 | 2265 | } |
---|
2154 | 2266 | if (!PageUptodate(page)) { |
---|
2155 | | - if (iocb->ki_flags & IOCB_NOWAIT) { |
---|
2156 | | - put_page(page); |
---|
2157 | | - goto would_block; |
---|
2158 | | - } |
---|
2159 | | - |
---|
2160 | 2267 | /* |
---|
2161 | 2268 | * See comment in do_read_cache_page on why |
---|
2162 | 2269 | * wait_on_page_locked is used to avoid unnecessarily |
---|
2163 | 2270 | * serialisations and why it's safe. |
---|
2164 | 2271 | */ |
---|
2165 | | - error = wait_on_page_locked_killable(page); |
---|
| 2272 | + if (iocb->ki_flags & IOCB_WAITQ) { |
---|
| 2273 | + if (written) { |
---|
| 2274 | + put_page(page); |
---|
| 2275 | + goto out; |
---|
| 2276 | + } |
---|
| 2277 | + error = wait_on_page_locked_async(page, |
---|
| 2278 | + iocb->ki_waitq); |
---|
| 2279 | + } else { |
---|
| 2280 | + if (iocb->ki_flags & IOCB_NOWAIT) { |
---|
| 2281 | + put_page(page); |
---|
| 2282 | + goto would_block; |
---|
| 2283 | + } |
---|
| 2284 | + error = wait_on_page_locked_killable(page); |
---|
| 2285 | + } |
---|
2166 | 2286 | if (unlikely(error)) |
---|
2167 | 2287 | goto readpage_error; |
---|
2168 | 2288 | if (PageUptodate(page)) |
---|
.. | .. |
---|
2172 | 2292 | !mapping->a_ops->is_partially_uptodate) |
---|
2173 | 2293 | goto page_not_up_to_date; |
---|
2174 | 2294 | /* pipes can't handle partially uptodate pages */ |
---|
2175 | | - if (unlikely(iter->type & ITER_PIPE)) |
---|
| 2295 | + if (unlikely(iov_iter_is_pipe(iter))) |
---|
2176 | 2296 | goto page_not_up_to_date; |
---|
2177 | 2297 | if (!trylock_page(page)) |
---|
2178 | 2298 | goto page_not_up_to_date; |
---|
.. | .. |
---|
2250 | 2370 | |
---|
2251 | 2371 | page_not_up_to_date: |
---|
2252 | 2372 | /* Get exclusive access to the page ... */ |
---|
2253 | | - error = lock_page_killable(page); |
---|
| 2373 | + if (iocb->ki_flags & IOCB_WAITQ) { |
---|
| 2374 | + if (written) { |
---|
| 2375 | + put_page(page); |
---|
| 2376 | + goto out; |
---|
| 2377 | + } |
---|
| 2378 | + error = lock_page_async(page, iocb->ki_waitq); |
---|
| 2379 | + } else { |
---|
| 2380 | + error = lock_page_killable(page); |
---|
| 2381 | + } |
---|
2254 | 2382 | if (unlikely(error)) |
---|
2255 | 2383 | goto readpage_error; |
---|
2256 | 2384 | |
---|
.. | .. |
---|
2269 | 2397 | } |
---|
2270 | 2398 | |
---|
2271 | 2399 | readpage: |
---|
| 2400 | + if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) { |
---|
| 2401 | + unlock_page(page); |
---|
| 2402 | + put_page(page); |
---|
| 2403 | + goto would_block; |
---|
| 2404 | + } |
---|
2272 | 2405 | /* |
---|
2273 | 2406 | * A previous I/O error may have been due to temporary |
---|
2274 | 2407 | * failures, eg. multipath errors. |
---|
.. | .. |
---|
2288 | 2421 | } |
---|
2289 | 2422 | |
---|
2290 | 2423 | if (!PageUptodate(page)) { |
---|
2291 | | - error = lock_page_killable(page); |
---|
| 2424 | + if (iocb->ki_flags & IOCB_WAITQ) { |
---|
| 2425 | + if (written) { |
---|
| 2426 | + put_page(page); |
---|
| 2427 | + goto out; |
---|
| 2428 | + } |
---|
| 2429 | + error = lock_page_async(page, iocb->ki_waitq); |
---|
| 2430 | + } else { |
---|
| 2431 | + error = lock_page_killable(page); |
---|
| 2432 | + } |
---|
| 2433 | + |
---|
2292 | 2434 | if (unlikely(error)) |
---|
2293 | 2435 | goto readpage_error; |
---|
2294 | 2436 | if (!PageUptodate(page)) { |
---|
.. | .. |
---|
2301 | 2443 | goto find_page; |
---|
2302 | 2444 | } |
---|
2303 | 2445 | unlock_page(page); |
---|
2304 | | - shrink_readahead_size_eio(filp, ra); |
---|
| 2446 | + shrink_readahead_size_eio(ra); |
---|
2305 | 2447 | error = -EIO; |
---|
2306 | 2448 | goto readpage_error; |
---|
2307 | 2449 | } |
---|
.. | .. |
---|
2349 | 2491 | file_accessed(filp); |
---|
2350 | 2492 | return written ? written : error; |
---|
2351 | 2493 | } |
---|
| 2494 | +EXPORT_SYMBOL_GPL(generic_file_buffered_read); |
---|
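The new EXPORT_SYMBOL_GPL suggests out-of-file callers; as a hypothetical sketch, a filesystem ->read_iter() could fall back to the generic buffered path after its own fast path (myfs_read_iter() is made up for illustration).

#include <linux/fs.h>
#include <linux/uio.h>

static ssize_t myfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	ssize_t ret = 0;

	/* ... filesystem-specific direct-I/O or fast path elided ... */

	if (iov_iter_count(to))
		ret = generic_file_buffered_read(iocb, to, ret);
	return ret;
}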
2352 | 2495 | |
---|
2353 | 2496 | /** |
---|
2354 | 2497 | * generic_file_read_iter - generic filesystem read routine |
---|
.. | .. |
---|
2357 | 2500 | * |
---|
2358 | 2501 | * This is the "read_iter()" routine for all filesystems |
---|
2359 | 2502 | * that can use the page cache directly. |
---|
| 2503 | + * |
---|
| 2504 | + * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall |
---|
| 2505 | + * be returned when no data can be read without waiting for I/O requests |
---|
| 2506 | + * to complete; it doesn't prevent readahead. |
---|
| 2507 | + * |
---|
| 2508 | + * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O |
---|
| 2509 | + * requests shall be made for the read or for readahead. When no data |
---|
| 2510 | + * can be read, -EAGAIN shall be returned. When readahead would be |
---|
| 2511 | + * triggered, a partial, possibly empty read shall be returned. |
---|
| 2512 | + * |
---|
| 2513 | + * Return: |
---|
| 2514 | + * * number of bytes copied, even for partial reads |
---|
| 2515 | + * * negative error code (or 0 if IOCB_NOIO) if nothing was read |
---|
2360 | 2516 | */ |
---|
2361 | 2517 | ssize_t |
---|
2362 | 2518 | generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) |
---|
.. | .. |
---|
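To illustrate the IOCB_NOIO contract documented above (editor's sketch, hypothetical helper): probe the page cache without starting any I/O, and let the caller fall back to a sleeping path on a short read or -EAGAIN.

#include <linux/fs.h>
#include <linux/uio.h>

static ssize_t read_cached_only(struct file *file, struct iov_iter *to,
				loff_t pos)
{
	struct kiocb kiocb;

	init_sync_kiocb(&kiocb, file);
	kiocb.ki_pos = pos;
	kiocb.ki_flags |= IOCB_NOIO;	/* no readahead, no readpage */

	return generic_file_read_iter(&kiocb, to);
}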
2417 | 2573 | |
---|
2418 | 2574 | #ifdef CONFIG_MMU |
---|
2419 | 2575 | #define MMAP_LOTSAMISS (100) |
---|
2420 | | -static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, |
---|
2421 | | - struct file *fpin) |
---|
2422 | | -{ |
---|
2423 | | - int flags = vmf->flags; |
---|
2424 | | - |
---|
2425 | | - if (fpin) |
---|
2426 | | - return fpin; |
---|
2427 | | - |
---|
2428 | | - /* |
---|
2429 | | - * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or |
---|
2430 | | - * anything, so we only pin the file and drop the mmap_sem if only |
---|
2431 | | - * FAULT_FLAG_ALLOW_RETRY is set. |
---|
2432 | | - */ |
---|
2433 | | - if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) == |
---|
2434 | | - FAULT_FLAG_ALLOW_RETRY) { |
---|
2435 | | - fpin = get_file(vmf->vma->vm_file); |
---|
2436 | | - up_read(&vmf->vma->vm_mm->mmap_sem); |
---|
2437 | | - } |
---|
2438 | | - return fpin; |
---|
2439 | | -} |
---|
2440 | | - |
---|
2441 | 2576 | /* |
---|
2442 | | - * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem |
---|
| 2577 | + * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock |
---|
2443 | 2578 | * @vmf - the vm_fault for this fault. |
---|
2444 | 2579 | * @page - the page to lock. |
---|
2445 | 2580 | * @fpin - the pointer to the file we may pin (or is already pinned). |
---|
2446 | 2581 | * |
---|
2447 | | - * This works similar to lock_page_or_retry in that it can drop the mmap_sem. |
---|
| 2582 | + * This works similarly to lock_page_or_retry in that it can drop the mmap_lock. |
---|
2448 | 2583 | * It differs in that it actually returns the page locked if it returns 1 and 0 |
---|
2449 | | - * if it couldn't lock the page. If we did have to drop the mmap_sem then fpin |
---|
| 2584 | + * if it couldn't lock the page. If we did have to drop the mmap_lock then fpin |
---|
2450 | 2585 | * will point to the pinned file and needs to be fput()'ed at a later point. |
---|
2451 | 2586 | */ |
---|
2452 | 2587 | static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page, |
---|
.. | .. |
---|
2457 | 2592 | |
---|
2458 | 2593 | /* |
---|
2459 | 2594 | * NOTE! This will make us return with VM_FAULT_RETRY, but with |
---|
2460 | | - * the mmap_sem still held. That's how FAULT_FLAG_RETRY_NOWAIT |
---|
| 2595 | + * the mmap_lock still held. That's how FAULT_FLAG_RETRY_NOWAIT |
---|
2461 | 2596 | * is supposed to work. We have way too many special cases.. |
---|
2462 | 2597 | */ |
---|
2463 | 2598 | if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) |
---|
.. | .. |
---|
2467 | 2602 | if (vmf->flags & FAULT_FLAG_KILLABLE) { |
---|
2468 | 2603 | if (__lock_page_killable(page)) { |
---|
2469 | 2604 | /* |
---|
2470 | | - * We didn't have the right flags to drop the mmap_sem, |
---|
| 2605 | + * We didn't have the right flags to drop the mmap_lock, |
---|
2471 | 2606 | * but all fault_handlers only check for fatal signals |
---|
2472 | 2607 | * if we return VM_FAULT_RETRY, so we need to drop the |
---|
2473 | | - * mmap_sem here and return 0 if we don't have a fpin. |
---|
| 2608 | + * mmap_lock here and return 0 if we don't have a fpin. |
---|
2474 | 2609 | */ |
---|
2475 | 2610 | if (*fpin == NULL) |
---|
2476 | | - up_read(&vmf->vma->vm_mm->mmap_sem); |
---|
| 2611 | + mmap_read_unlock(vmf->vma->vm_mm); |
---|
2477 | 2612 | return 0; |
---|
2478 | 2613 | } |
---|
2479 | 2614 | } else |
---|
.. | .. |
---|
2494 | 2629 | struct file *file = vmf->vma->vm_file; |
---|
2495 | 2630 | struct file_ra_state *ra = &file->f_ra; |
---|
2496 | 2631 | struct address_space *mapping = file->f_mapping; |
---|
| 2632 | + DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff); |
---|
2497 | 2633 | struct file *fpin = NULL; |
---|
2498 | | - pgoff_t offset = vmf->pgoff; |
---|
| 2634 | + unsigned int mmap_miss; |
---|
2499 | 2635 | |
---|
2500 | 2636 | /* If we don't want any read-ahead, don't bother */ |
---|
2501 | 2637 | if (vmf->vma->vm_flags & VM_RAND_READ) |
---|
.. | .. |
---|
2505 | 2641 | |
---|
2506 | 2642 | if (vmf->vma->vm_flags & VM_SEQ_READ) { |
---|
2507 | 2643 | fpin = maybe_unlock_mmap_for_io(vmf, fpin); |
---|
2508 | | - page_cache_sync_readahead(mapping, ra, file, offset, |
---|
2509 | | - ra->ra_pages); |
---|
| 2644 | + page_cache_sync_ra(&ractl, ra, ra->ra_pages); |
---|
2510 | 2645 | return fpin; |
---|
2511 | 2646 | } |
---|
2512 | 2647 | |
---|
2513 | 2648 | /* Avoid banging the cache line if not needed */ |
---|
2514 | | - if (ra->mmap_miss < MMAP_LOTSAMISS * 10) |
---|
2515 | | - ra->mmap_miss++; |
---|
| 2649 | + mmap_miss = READ_ONCE(ra->mmap_miss); |
---|
| 2650 | + if (mmap_miss < MMAP_LOTSAMISS * 10) |
---|
| 2651 | + WRITE_ONCE(ra->mmap_miss, ++mmap_miss); |
---|
2516 | 2652 | |
---|
2517 | 2653 | /* |
---|
2518 | 2654 | * Do we miss much more than hit in this file? If so, |
---|
2519 | 2655 | * stop bothering with read-ahead. It will only hurt. |
---|
2520 | 2656 | */ |
---|
2521 | | - if (ra->mmap_miss > MMAP_LOTSAMISS) |
---|
| 2657 | + if (mmap_miss > MMAP_LOTSAMISS) |
---|
2522 | 2658 | return fpin; |
---|
2523 | 2659 | |
---|
2524 | 2660 | /* |
---|
2525 | 2661 | * mmap read-around |
---|
2526 | 2662 | */ |
---|
2527 | 2663 | fpin = maybe_unlock_mmap_for_io(vmf, fpin); |
---|
2528 | | - ra->start = max_t(long, 0, offset - ra->ra_pages / 2); |
---|
| 2664 | + ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); |
---|
2529 | 2665 | ra->size = ra->ra_pages; |
---|
2530 | 2666 | ra->async_size = ra->ra_pages / 4; |
---|
2531 | | - ra_submit(ra, mapping, file); |
---|
| 2667 | + trace_android_vh_tune_mmap_readaround(ra->ra_pages, vmf->pgoff, |
---|
| 2668 | + &ra->start, &ra->size, &ra->async_size); |
---|
| 2669 | + ractl._index = ra->start; |
---|
| 2670 | + do_page_cache_ra(&ractl, ra->size, ra->async_size); |
---|
2532 | 2671 | return fpin; |
---|
2533 | 2672 | } |
---|
2534 | 2673 | |
---|
2535 | 2674 | /* |
---|
2536 | 2675 | * Asynchronous readahead happens when we find the page and PG_readahead, |
---|
2537 | 2676 | * so we want to possibly extend the readahead further. We return the file that |
---|
2538 | | - * was pinned if we have to drop the mmap_sem in order to do IO. |
---|
| 2677 | + * was pinned if we have to drop the mmap_lock in order to do IO. |
---|
2539 | 2678 | */ |
---|
2540 | 2679 | static struct file *do_async_mmap_readahead(struct vm_fault *vmf, |
---|
2541 | 2680 | struct page *page) |
---|
.. | .. |
---|
2544 | 2683 | struct file_ra_state *ra = &file->f_ra; |
---|
2545 | 2684 | struct address_space *mapping = file->f_mapping; |
---|
2546 | 2685 | struct file *fpin = NULL; |
---|
| 2686 | + unsigned int mmap_miss; |
---|
2547 | 2687 | pgoff_t offset = vmf->pgoff; |
---|
2548 | 2688 | |
---|
2549 | 2689 | /* If we don't want any read-ahead, don't bother */ |
---|
2550 | 2690 | if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages) |
---|
2551 | 2691 | return fpin; |
---|
2552 | | - if (ra->mmap_miss > 0) |
---|
2553 | | - ra->mmap_miss--; |
---|
| 2692 | + mmap_miss = READ_ONCE(ra->mmap_miss); |
---|
| 2693 | + if (mmap_miss) |
---|
| 2694 | + WRITE_ONCE(ra->mmap_miss, --mmap_miss); |
---|
2554 | 2695 | if (PageReadahead(page)) { |
---|
2555 | 2696 | fpin = maybe_unlock_mmap_for_io(vmf, fpin); |
---|
2556 | 2697 | page_cache_async_readahead(mapping, ra, file, |
---|
.. | .. |
---|
2570 | 2711 | * it in the page cache, and handles the special cases reasonably without |
---|
2571 | 2712 | * having a lot of duplicated code. |
---|
2572 | 2713 | * |
---|
2573 | | - * vma->vm_mm->mmap_sem must be held on entry. |
---|
| 2714 | + * If FAULT_FLAG_SPECULATIVE is set, this function runs with elevated vma |
---|
| 2715 | + * refcount and with mmap lock not held. |
---|
| 2716 | + * Otherwise, vma->vm_mm->mmap_lock must be held on entry. |
---|
2574 | 2717 | * |
---|
2575 | | - * If our return value has VM_FAULT_RETRY set, it's because |
---|
2576 | | - * lock_page_or_retry() returned 0. |
---|
2577 | | - * The mmap_sem has usually been released in this case. |
---|
2578 | | - * See __lock_page_or_retry() for the exception. |
---|
| 2718 | + * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock |
---|
| 2719 | + * may be dropped before doing I/O or by lock_page_maybe_drop_mmap(). |
---|
2579 | 2720 | * |
---|
2580 | | - * If our return value does not have VM_FAULT_RETRY set, the mmap_sem |
---|
| 2721 | + * If our return value does not have VM_FAULT_RETRY set, the mmap_lock |
---|
2581 | 2722 | * has not been released. |
---|
2582 | 2723 | * |
---|
2583 | 2724 | * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. |
---|
| 2725 | + * |
---|
| 2726 | + * Return: bitwise-OR of %VM_FAULT_ codes. |
---|
2584 | 2727 | */ |
---|
2585 | 2728 | vm_fault_t filemap_fault(struct vm_fault *vmf) |
---|
2586 | 2729 | { |
---|
.. | .. |
---|
2592 | 2735 | struct inode *inode = mapping->host; |
---|
2593 | 2736 | pgoff_t offset = vmf->pgoff; |
---|
2594 | 2737 | pgoff_t max_off; |
---|
2595 | | - struct page *page; |
---|
| 2738 | + struct page *page = NULL; |
---|
2596 | 2739 | vm_fault_t ret = 0; |
---|
| 2740 | + bool retry = false; |
---|
| 2741 | + |
---|
| 2742 | + if (vmf->flags & FAULT_FLAG_SPECULATIVE) { |
---|
| 2743 | + page = find_get_page(mapping, offset); |
---|
| 2744 | + if (unlikely(!page)) |
---|
| 2745 | + return VM_FAULT_RETRY; |
---|
| 2746 | + |
---|
| 2747 | + if (unlikely(PageReadahead(page))) |
---|
| 2748 | + goto page_put; |
---|
| 2749 | + |
---|
| 2750 | + if (!trylock_page(page)) |
---|
| 2751 | + goto page_put; |
---|
| 2752 | + |
---|
| 2753 | + if (unlikely(compound_head(page)->mapping != mapping)) |
---|
| 2754 | + goto page_unlock; |
---|
| 2755 | + VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); |
---|
| 2756 | + if (unlikely(!PageUptodate(page))) |
---|
| 2757 | + goto page_unlock; |
---|
| 2758 | + |
---|
| 2759 | + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); |
---|
| 2760 | + if (unlikely(offset >= max_off)) |
---|
| 2761 | + goto page_unlock; |
---|
| 2762 | + |
---|
| 2763 | + /* |
---|
| 2764 | + * Update readahead mmap_miss statistic. |
---|
| 2765 | + * |
---|
| 2766 | + * Note that we are not sure if finish_fault() will |
---|
| 2767 | + * manage to complete the transaction. If it fails, |
---|
| 2768 | + * we'll come back to filemap_fault() non-speculative |
---|
| 2769 | + * case which will update mmap_miss a second time. |
---|
| 2770 | + * This is not ideal, we would prefer to guarantee the |
---|
| 2771 | + * update will happen exactly once. |
---|
| 2772 | + */ |
---|
| 2773 | + if (!(vmf->vma->vm_flags & VM_RAND_READ) && ra->ra_pages) { |
---|
| 2774 | + unsigned int mmap_miss = READ_ONCE(ra->mmap_miss); |
---|
| 2775 | + if (mmap_miss) |
---|
| 2776 | + WRITE_ONCE(ra->mmap_miss, --mmap_miss); |
---|
| 2777 | + } |
---|
| 2778 | + |
---|
| 2779 | + vmf->page = page; |
---|
| 2780 | + return VM_FAULT_LOCKED; |
---|
| 2781 | +page_unlock: |
---|
| 2782 | + unlock_page(page); |
---|
| 2783 | +page_put: |
---|
| 2784 | + put_page(page); |
---|
| 2785 | + return VM_FAULT_RETRY; |
---|
| 2786 | + } |
---|
2597 | 2787 | |
---|
2598 | 2788 | max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); |
---|
2599 | 2789 | if (unlikely(offset >= max_off)) |
---|
2600 | 2790 | return VM_FAULT_SIGBUS; |
---|
| 2791 | + |
---|
| 2792 | + trace_android_vh_filemap_fault_get_page(vmf, &page, &retry); |
---|
| 2793 | + if (unlikely(retry)) |
---|
| 2794 | + goto out_retry; |
---|
| 2795 | + if (unlikely(page)) |
---|
| 2796 | + goto page_ok; |
---|
2601 | 2797 | |
---|
2602 | 2798 | /* |
---|
2603 | 2799 | * Do we have something in the page cache already? |
---|
.. | .. |
---|
2630 | 2826 | goto out_retry; |
---|
2631 | 2827 | |
---|
2632 | 2828 | /* Did it get truncated? */ |
---|
2633 | | - if (unlikely(page->mapping != mapping)) { |
---|
| 2829 | + if (unlikely(compound_head(page)->mapping != mapping)) { |
---|
2634 | 2830 | unlock_page(page); |
---|
2635 | 2831 | put_page(page); |
---|
2636 | 2832 | goto retry_find; |
---|
2637 | 2833 | } |
---|
2638 | | - VM_BUG_ON_PAGE(page->index != offset, page); |
---|
| 2834 | + VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); |
---|
2639 | 2835 | |
---|
2640 | 2836 | /* |
---|
2641 | 2837 | * We have a locked page in the page cache, now we need to check |
---|
.. | .. |
---|
2645 | 2841 | goto page_not_uptodate; |
---|
2646 | 2842 | |
---|
2647 | 2843 | /* |
---|
2648 | | - * We've made it this far and we had to drop our mmap_sem, now is the |
---|
| 2844 | + * We've made it this far and we had to drop our mmap_lock, now is the |
---|
2649 | 2845 | * time to return to the upper layer and have it re-find the vma and |
---|
2650 | 2846 | * redo the fault. |
---|
2651 | 2847 | */ |
---|
.. | .. |
---|
2654 | 2850 | goto out_retry; |
---|
2655 | 2851 | } |
---|
2656 | 2852 | |
---|
| 2853 | +page_ok: |
---|
2657 | 2854 | /* |
---|
2658 | 2855 | * Found the page and have a reference on it. |
---|
2659 | 2856 | * We must recheck i_size under page lock. |
---|
.. | .. |
---|
2690 | 2887 | if (!error || error == AOP_TRUNCATED_PAGE) |
---|
2691 | 2888 | goto retry_find; |
---|
2692 | 2889 | |
---|
2693 | | - /* Things didn't work out. Return zero to tell the mm layer so. */ |
---|
2694 | | - shrink_readahead_size_eio(file, ra); |
---|
| 2890 | + shrink_readahead_size_eio(ra); |
---|
2695 | 2891 | return VM_FAULT_SIGBUS; |
---|
2696 | 2892 | |
---|
2697 | 2893 | out_retry: |
---|
2698 | 2894 | /* |
---|
2699 | | - * We dropped the mmap_sem, we need to return to the fault handler to |
---|
| 2895 | + * We dropped the mmap_lock, so we need to return to the fault handler to |
---|
2700 | 2896 | * re-find the vma and come back and find our hopefully still populated |
---|
2701 | 2897 | * page. |
---|
2702 | 2898 | */ |
---|
2703 | | - if (page) |
---|
| 2899 | + if (page) { |
---|
| 2900 | + trace_android_vh_filemap_fault_cache_page(vmf, page); |
---|
2704 | 2901 | put_page(page); |
---|
| 2902 | + } |
---|
2705 | 2903 | if (fpin) |
---|
2706 | 2904 | fput(fpin); |
---|
2707 | 2905 | return ret | VM_FAULT_RETRY; |
---|
2708 | 2906 | } |
---|
2709 | 2907 | EXPORT_SYMBOL(filemap_fault); |
---|
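Both the new speculative fast path and the regular path in filemap_fault() above rely on the same revalidation idea: once the page is locked, recheck that it still belongs to the mapping, is uptodate, and lies below i_size before using it. Below is a condensed, illustrative sketch of just that pattern; it is not part of this patch, the helper name is hypothetical, and the caller is assumed to already hold the page lock.

```c
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/*
 * Illustrative only: revalidate a locked page-cache page.
 * Returns true if @page still belongs to @mapping, is uptodate and
 * lies below i_size; callers must already hold the page lock.
 */
static bool revalidate_locked_page(struct address_space *mapping,
				   struct page *page, pgoff_t offset)
{
	pgoff_t max_off;

	if (compound_head(page)->mapping != mapping)	/* truncated or reclaimed */
		return false;
	if (!PageUptodate(page))		/* read failed or still in flight */
		return false;
	max_off = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
	return offset < max_off;		/* may have been truncated past EOF */
}
```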
2710 | 2908 | |
---|
2711 | | -void filemap_map_pages(struct vm_fault *vmf, |
---|
2712 | | - pgoff_t start_pgoff, pgoff_t end_pgoff) |
---|
| 2909 | +static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page) |
---|
2713 | 2910 | { |
---|
2714 | | - struct radix_tree_iter iter; |
---|
2715 | | - void **slot; |
---|
2716 | | - struct file *file = vmf->vma->vm_file; |
---|
2717 | | - struct address_space *mapping = file->f_mapping; |
---|
2718 | | - pgoff_t last_pgoff = start_pgoff; |
---|
| 2911 | + struct mm_struct *mm = vmf->vma->vm_mm; |
---|
| 2912 | + |
---|
| 2913 | + /* Huge page is mapped? No need to proceed. */ |
---|
| 2914 | + if (pmd_trans_huge(*vmf->pmd)) { |
---|
| 2915 | + unlock_page(page); |
---|
| 2916 | + put_page(page); |
---|
| 2917 | + return true; |
---|
| 2918 | + } |
---|
| 2919 | + |
---|
| 2920 | + if (pmd_none(*vmf->pmd) && PageTransHuge(page)) { |
---|
| 2921 | + vm_fault_t ret = do_set_pmd(vmf, page); |
---|
| 2922 | + if (!ret) { |
---|
| 2923 | + /* The page is mapped successfully, reference consumed. */ |
---|
| 2924 | + unlock_page(page); |
---|
| 2925 | + return true; |
---|
| 2926 | + } |
---|
| 2927 | + } |
---|
| 2928 | + |
---|
| 2929 | + if (pmd_none(*vmf->pmd)) { |
---|
| 2930 | + vmf->ptl = pmd_lock(mm, vmf->pmd); |
---|
| 2931 | + if (likely(pmd_none(*vmf->pmd))) { |
---|
| 2932 | + mm_inc_nr_ptes(mm); |
---|
| 2933 | + pmd_populate(mm, vmf->pmd, vmf->prealloc_pte); |
---|
| 2934 | + vmf->prealloc_pte = NULL; |
---|
| 2935 | + } |
---|
| 2936 | + spin_unlock(vmf->ptl); |
---|
| 2937 | + } |
---|
| 2938 | + |
---|
| 2939 | + /* See comment in handle_pte_fault() */ |
---|
| 2940 | + if (pmd_devmap_trans_unstable(vmf->pmd)) { |
---|
| 2941 | + unlock_page(page); |
---|
| 2942 | + put_page(page); |
---|
| 2943 | + return true; |
---|
| 2944 | + } |
---|
| 2945 | + |
---|
| 2946 | + return false; |
---|
| 2947 | +} |
---|
| 2948 | + |
---|
| 2949 | +static struct page *next_uptodate_page(struct page *page, |
---|
| 2950 | + struct address_space *mapping, |
---|
| 2951 | + struct xa_state *xas, pgoff_t end_pgoff) |
---|
| 2952 | +{ |
---|
2719 | 2953 | unsigned long max_idx; |
---|
2720 | | - struct page *head, *page; |
---|
2721 | 2954 | |
---|
2722 | | - rcu_read_lock(); |
---|
2723 | | - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start_pgoff) { |
---|
2724 | | - if (iter.index > end_pgoff) |
---|
2725 | | - break; |
---|
2726 | | -repeat: |
---|
2727 | | - page = radix_tree_deref_slot(slot); |
---|
2728 | | - if (unlikely(!page)) |
---|
2729 | | - goto next; |
---|
2730 | | - if (radix_tree_exception(page)) { |
---|
2731 | | - if (radix_tree_deref_retry(page)) { |
---|
2732 | | - slot = radix_tree_iter_retry(&iter); |
---|
2733 | | - continue; |
---|
2734 | | - } |
---|
2735 | | - goto next; |
---|
2736 | | - } |
---|
2737 | | - |
---|
2738 | | - head = compound_head(page); |
---|
2739 | | - if (!page_cache_get_speculative(head)) |
---|
2740 | | - goto repeat; |
---|
2741 | | - |
---|
2742 | | - /* The page was split under us? */ |
---|
2743 | | - if (compound_head(page) != head) { |
---|
2744 | | - put_page(head); |
---|
2745 | | - goto repeat; |
---|
2746 | | - } |
---|
2747 | | - |
---|
2748 | | - /* Has the page moved? */ |
---|
2749 | | - if (unlikely(page != *slot)) { |
---|
2750 | | - put_page(head); |
---|
2751 | | - goto repeat; |
---|
2752 | | - } |
---|
2753 | | - |
---|
2754 | | - if (!PageUptodate(page) || |
---|
2755 | | - PageReadahead(page) || |
---|
2756 | | - PageHWPoison(page)) |
---|
| 2955 | + do { |
---|
| 2956 | + if (!page) |
---|
| 2957 | + return NULL; |
---|
| 2958 | + if (xas_retry(xas, page)) |
---|
| 2959 | + continue; |
---|
| 2960 | + if (xa_is_value(page)) |
---|
| 2961 | + continue; |
---|
| 2962 | + if (PageLocked(page)) |
---|
| 2963 | + continue; |
---|
| 2964 | + if (!page_cache_get_speculative(page)) |
---|
| 2965 | + continue; |
---|
| 2966 | + /* Has the page moved or been split? */ |
---|
| 2967 | + if (unlikely(page != xas_reload(xas))) |
---|
| 2968 | + goto skip; |
---|
| 2969 | + if (!PageUptodate(page) || PageReadahead(page)) |
---|
| 2970 | + goto skip; |
---|
| 2971 | + if (PageHWPoison(page)) |
---|
2757 | 2972 | goto skip; |
---|
2758 | 2973 | if (!trylock_page(page)) |
---|
2759 | 2974 | goto skip; |
---|
2760 | | - |
---|
2761 | | - if (page->mapping != mapping || !PageUptodate(page)) |
---|
| 2975 | + if (page->mapping != mapping) |
---|
2762 | 2976 | goto unlock; |
---|
2763 | | - |
---|
| 2977 | + if (!PageUptodate(page)) |
---|
| 2978 | + goto unlock; |
---|
2764 | 2979 | max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); |
---|
2765 | | - if (page->index >= max_idx) |
---|
| 2980 | + if (xas->xa_index >= max_idx) |
---|
2766 | 2981 | goto unlock; |
---|
2767 | | - |
---|
2768 | | - if (file->f_ra.mmap_miss > 0) |
---|
2769 | | - file->f_ra.mmap_miss--; |
---|
2770 | | - |
---|
2771 | | - vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT; |
---|
2772 | | - if (vmf->pte) |
---|
2773 | | - vmf->pte += iter.index - last_pgoff; |
---|
2774 | | - last_pgoff = iter.index; |
---|
2775 | | - if (alloc_set_pte(vmf, NULL, page)) |
---|
2776 | | - goto unlock; |
---|
2777 | | - unlock_page(page); |
---|
2778 | | - goto next; |
---|
| 2982 | + return page; |
---|
2779 | 2983 | unlock: |
---|
2780 | 2984 | unlock_page(page); |
---|
2781 | 2985 | skip: |
---|
2782 | 2986 | put_page(page); |
---|
2783 | | -next: |
---|
2784 | | - /* Huge page is mapped? No need to proceed. */ |
---|
2785 | | - if (pmd_trans_huge(*vmf->pmd)) |
---|
2786 | | - break; |
---|
2787 | | - if (iter.index == end_pgoff) |
---|
2788 | | - break; |
---|
| 2987 | + } while ((page = xas_next_entry(xas, end_pgoff)) != NULL); |
---|
| 2988 | + |
---|
| 2989 | + return NULL; |
---|
| 2990 | +} |
---|
| 2991 | + |
---|
| 2992 | +static inline struct page *first_map_page(struct address_space *mapping, |
---|
| 2993 | + struct xa_state *xas, |
---|
| 2994 | + pgoff_t end_pgoff) |
---|
| 2995 | +{ |
---|
| 2996 | + return next_uptodate_page(xas_find(xas, end_pgoff), |
---|
| 2997 | + mapping, xas, end_pgoff); |
---|
| 2998 | +} |
---|
| 2999 | + |
---|
| 3000 | +static inline struct page *next_map_page(struct address_space *mapping, |
---|
| 3001 | + struct xa_state *xas, |
---|
| 3002 | + pgoff_t end_pgoff) |
---|
| 3003 | +{ |
---|
| 3004 | + return next_uptodate_page(xas_next_entry(xas, end_pgoff), |
---|
| 3005 | + mapping, xas, end_pgoff); |
---|
| 3006 | +} |
---|
| 3007 | + |
---|
| 3008 | +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT |
---|
| 3009 | +bool filemap_allow_speculation(void) |
---|
| 3010 | +{ |
---|
| 3011 | + return true; |
---|
| 3012 | +} |
---|
| 3013 | +EXPORT_SYMBOL_GPL(filemap_allow_speculation); |
---|
| 3014 | +#endif |
---|
| 3015 | + |
---|
| 3016 | +vm_fault_t filemap_map_pages(struct vm_fault *vmf, |
---|
| 3017 | + pgoff_t start_pgoff, pgoff_t end_pgoff) |
---|
| 3018 | +{ |
---|
| 3019 | + struct vm_area_struct *vma = vmf->vma; |
---|
| 3020 | + struct file *file = vma->vm_file; |
---|
| 3021 | + struct address_space *mapping = file->f_mapping; |
---|
| 3022 | + pgoff_t last_pgoff = start_pgoff; |
---|
| 3023 | + unsigned long addr; |
---|
| 3024 | + XA_STATE(xas, &mapping->i_pages, start_pgoff); |
---|
| 3025 | + struct page *head, *page; |
---|
| 3026 | + unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); |
---|
| 3027 | + vm_fault_t ret = (vmf->flags & FAULT_FLAG_SPECULATIVE) ? |
---|
| 3028 | + VM_FAULT_RETRY : 0; |
---|
| 3029 | + |
---|
| 3030 | + rcu_read_lock(); |
---|
| 3031 | + head = first_map_page(mapping, &xas, end_pgoff); |
---|
| 3032 | + if (!head) |
---|
| 3033 | + goto out; |
---|
| 3034 | + |
---|
| 3035 | + if (!(vmf->flags & FAULT_FLAG_SPECULATIVE) && |
---|
| 3036 | + filemap_map_pmd(vmf, head)) { |
---|
| 3037 | + ret = VM_FAULT_NOPAGE; |
---|
| 3038 | + goto out; |
---|
2789 | 3039 | } |
---|
| 3040 | + |
---|
| 3041 | + addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); |
---|
| 3042 | + if (!pte_map_lock_addr(vmf, addr)) { |
---|
| 3043 | + unlock_page(head); |
---|
| 3044 | + put_page(head); |
---|
| 3045 | + goto out; |
---|
| 3046 | + } |
---|
| 3047 | + |
---|
| 3048 | + do { |
---|
| 3049 | + page = find_subpage(head, xas.xa_index); |
---|
| 3050 | + if (PageHWPoison(page)) |
---|
| 3051 | + goto unlock; |
---|
| 3052 | + |
---|
| 3053 | + if (mmap_miss > 0) |
---|
| 3054 | + mmap_miss--; |
---|
| 3055 | + |
---|
| 3056 | + addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT; |
---|
| 3057 | + vmf->pte += xas.xa_index - last_pgoff; |
---|
| 3058 | + last_pgoff = xas.xa_index; |
---|
| 3059 | + |
---|
| 3060 | + if (!pte_none(*vmf->pte)) |
---|
| 3061 | + goto unlock; |
---|
| 3062 | + |
---|
| 3063 | + /* We're about to handle the fault */ |
---|
| 3064 | + if (vmf->address == addr) |
---|
| 3065 | + ret = VM_FAULT_NOPAGE; |
---|
| 3066 | + |
---|
| 3067 | + do_set_pte(vmf, page, addr); |
---|
| 3068 | + /* no need to invalidate: a not-present page won't be cached */ |
---|
| 3069 | + update_mmu_cache(vma, addr, vmf->pte); |
---|
| 3070 | + unlock_page(head); |
---|
| 3071 | + continue; |
---|
| 3072 | +unlock: |
---|
| 3073 | + unlock_page(head); |
---|
| 3074 | + put_page(head); |
---|
| 3075 | + } while ((head = next_map_page(mapping, &xas, end_pgoff)) != NULL); |
---|
| 3076 | + pte_unmap_unlock(vmf->pte, vmf->ptl); |
---|
| 3077 | +out: |
---|
2790 | 3078 | rcu_read_unlock(); |
---|
| 3079 | + WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss); |
---|
| 3080 | + return ret; |
---|
2791 | 3081 | } |
---|
2792 | 3082 | EXPORT_SYMBOL(filemap_map_pages); |
---|
2793 | 3083 | |
---|
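first_map_page()/next_map_page() above open-code an XArray walk with xas_find()/xas_next_entry() so they can also take a reference and trylock each page. For readers less familiar with that API, here is a minimal, self-contained sketch of the underlying lookup pattern; it is illustrative only, not part of this patch, and the function name and arguments are hypothetical.

```c
#include <linux/rcupdate.h>
#include <linux/xarray.h>

/* Visit every present (non-value) entry of @xa with index in [start, end]. */
static void walk_xarray_range(struct xarray *xa, unsigned long start,
			      unsigned long end)
{
	XA_STATE(xas, xa, start);
	void *entry;

	rcu_read_lock();
	xas_for_each(&xas, entry, end) {
		if (xas_retry(&xas, entry))	/* raced with a node change, retry */
			continue;
		if (xa_is_value(entry))		/* shadow/value entry, not a pointer */
			continue;
		/* entry is a real pointer; xas.xa_index is its index */
	}
	rcu_read_unlock();
}
```

The page-cache helpers above additionally take a speculative reference (page_cache_get_speculative()) and re-check the slot with xas_reload(), since only RCU protection is held and the page could otherwise be freed or moved under them.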
.. | .. |
---|
2821 | 3111 | .fault = filemap_fault, |
---|
2822 | 3112 | .map_pages = filemap_map_pages, |
---|
2823 | 3113 | .page_mkwrite = filemap_page_mkwrite, |
---|
| 3114 | +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT |
---|
| 3115 | + .allow_speculation = filemap_allow_speculation, |
---|
| 3116 | +#endif |
---|
2824 | 3117 | }; |
---|
2825 | 3118 | |
---|
2826 | 3119 | /* This is used for a general mmap of a disk file */ |
---|
.. | .. |
---|
2846 | 3139 | return generic_file_mmap(file, vma); |
---|
2847 | 3140 | } |
---|
2848 | 3141 | #else |
---|
2849 | | -int filemap_page_mkwrite(struct vm_fault *vmf) |
---|
| 3142 | +vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) |
---|
2850 | 3143 | { |
---|
2851 | | - return -ENOSYS; |
---|
| 3144 | + return VM_FAULT_SIGBUS; |
---|
2852 | 3145 | } |
---|
2853 | 3146 | int generic_file_mmap(struct file * file, struct vm_area_struct * vma) |
---|
2854 | 3147 | { |
---|
.. | .. |
---|
2895 | 3188 | put_page(page); |
---|
2896 | 3189 | if (err == -EEXIST) |
---|
2897 | 3190 | goto repeat; |
---|
2898 | | - /* Presumably ENOMEM for radix tree node */ |
---|
| 3191 | + /* Presumably ENOMEM for xarray node */ |
---|
2899 | 3192 | return ERR_PTR(err); |
---|
2900 | 3193 | } |
---|
2901 | 3194 | |
---|
.. | .. |
---|
2919 | 3212 | goto out; |
---|
2920 | 3213 | |
---|
2921 | 3214 | /* |
---|
2922 | | - * Page is not up to date and may be locked due one of the following |
---|
| 3215 | + * Page is not up to date and may be locked due to one of the following |
---|
2923 | 3216 | * case a: Page is being filled and the page lock is held |
---|
2924 | 3217 | * case b: Read/write error clearing the page uptodate status |
---|
2925 | 3218 | * case c: Truncation in progress (page locked) |
---|
.. | .. |
---|
2928 | 3221 | * Case a, the page will be up to date when the page is unlocked. |
---|
2929 | 3222 | * There is no need to serialise on the page lock here as the page |
---|
2930 | 3223 | * is pinned so the lock gives no additional protection. Even if the |
---|
2931 | | - * the page is truncated, the data is still valid if PageUptodate as |
---|
| 3224 | + * page is truncated, the data is still valid if PageUptodate as |
---|
2932 | 3225 | * it is simply a read vs truncate race. |
---|
2933 | 3226 | * Case b, the page will not be up to date |
---|
2934 | 3227 | * Case c, the page may be truncated but in itself, the data may still |
---|
.. | .. |
---|
2994 | 3287 | * not set, try to fill the page and wait for it to become unlocked. |
---|
2995 | 3288 | * |
---|
2996 | 3289 | * If the page does not get brought uptodate, return -EIO. |
---|
| 3290 | + * |
---|
| 3291 | + * Return: up to date page on success, ERR_PTR() on failure. |
---|
2997 | 3292 | */ |
---|
2998 | 3293 | struct page *read_cache_page(struct address_space *mapping, |
---|
2999 | 3294 | pgoff_t index, |
---|
3000 | 3295 | int (*filler)(void *, struct page *), |
---|
3001 | 3296 | void *data) |
---|
3002 | 3297 | { |
---|
3003 | | - return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); |
---|
| 3298 | + return do_read_cache_page(mapping, index, filler, data, |
---|
| 3299 | + mapping_gfp_mask(mapping)); |
---|
3004 | 3300 | } |
---|
3005 | 3301 | EXPORT_SYMBOL(read_cache_page); |
---|
3006 | 3302 | |
---|
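As a usage illustration of the ERR_PTR() convention documented above (not part of this patch; the caller, filler and data are placeholders), a filesystem might read a single page through the cache like this:

```c
#include <linux/err.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/* Hypothetical caller: read page @index of @mapping and consume it. */
static int read_one_page(struct address_space *mapping, pgoff_t index,
			 int (*filler)(void *, struct page *), void *data)
{
	struct page *page;

	page = read_cache_page(mapping, index, filler, data);
	if (IS_ERR(page))
		return PTR_ERR(page);	/* e.g. -EIO if it never became uptodate */

	/* The page is uptodate and we hold a reference on it. */
	/* ... map it with kmap(), use the data, kunmap() ... */

	put_page(page);			/* drop the reference taken above */
	return 0;
}
```

read_cache_page_gfp() below follows the same convention but takes a caller-supplied gfp mask and, with a NULL filler, falls back to the mapping's own readpage routine.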
.. | .. |
---|
3014 | 3310 | * any new page allocations done using the specified allocation flags. |
---|
3015 | 3311 | * |
---|
3016 | 3312 | * If the page does not get brought uptodate, return -EIO. |
---|
| 3313 | + * |
---|
| 3314 | + * Return: up to date page on success, ERR_PTR() on failure. |
---|
3017 | 3315 | */ |
---|
3018 | 3316 | struct page *read_cache_page_gfp(struct address_space *mapping, |
---|
3019 | 3317 | pgoff_t index, |
---|
.. | .. |
---|
3022 | 3320 | return do_read_cache_page(mapping, index, NULL, NULL, gfp); |
---|
3023 | 3321 | } |
---|
3024 | 3322 | EXPORT_SYMBOL(read_cache_page_gfp); |
---|
3025 | | - |
---|
3026 | | -/* |
---|
3027 | | - * Performs necessary checks before doing a write |
---|
3028 | | - * |
---|
3029 | | - * Can adjust writing position or amount of bytes to write. |
---|
3030 | | - * Returns appropriate error code that caller should return or |
---|
3031 | | - * zero in case that write should be allowed. |
---|
3032 | | - */ |
---|
3033 | | -inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) |
---|
3034 | | -{ |
---|
3035 | | - struct file *file = iocb->ki_filp; |
---|
3036 | | - struct inode *inode = file->f_mapping->host; |
---|
3037 | | - unsigned long limit = rlimit(RLIMIT_FSIZE); |
---|
3038 | | - loff_t pos; |
---|
3039 | | - |
---|
3040 | | - if (IS_SWAPFILE(inode)) |
---|
3041 | | - return -ETXTBSY; |
---|
3042 | | - |
---|
3043 | | - if (!iov_iter_count(from)) |
---|
3044 | | - return 0; |
---|
3045 | | - |
---|
3046 | | - /* FIXME: this is for backwards compatibility with 2.4 */ |
---|
3047 | | - if (iocb->ki_flags & IOCB_APPEND) |
---|
3048 | | - iocb->ki_pos = i_size_read(inode); |
---|
3049 | | - |
---|
3050 | | - pos = iocb->ki_pos; |
---|
3051 | | - |
---|
3052 | | - if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) |
---|
3053 | | - return -EINVAL; |
---|
3054 | | - |
---|
3055 | | - if (limit != RLIM_INFINITY) { |
---|
3056 | | - if (iocb->ki_pos >= limit) { |
---|
3057 | | - send_sig(SIGXFSZ, current, 0); |
---|
3058 | | - return -EFBIG; |
---|
3059 | | - } |
---|
3060 | | - iov_iter_truncate(from, limit - (unsigned long)pos); |
---|
3061 | | - } |
---|
3062 | | - |
---|
3063 | | - /* |
---|
3064 | | - * LFS rule |
---|
3065 | | - */ |
---|
3066 | | - if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS && |
---|
3067 | | - !(file->f_flags & O_LARGEFILE))) { |
---|
3068 | | - if (pos >= MAX_NON_LFS) |
---|
3069 | | - return -EFBIG; |
---|
3070 | | - iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos); |
---|
3071 | | - } |
---|
3072 | | - |
---|
3073 | | - /* |
---|
3074 | | - * Are we about to exceed the fs block limit ? |
---|
3075 | | - * |
---|
3076 | | - * If we have written data it becomes a short write. If we have |
---|
3077 | | - * exceeded without writing data we send a signal and return EFBIG. |
---|
3078 | | - * Linus frestrict idea will clean these up nicely.. |
---|
3079 | | - */ |
---|
3080 | | - if (unlikely(pos >= inode->i_sb->s_maxbytes)) |
---|
3081 | | - return -EFBIG; |
---|
3082 | | - |
---|
3083 | | - iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos); |
---|
3084 | | - return iov_iter_count(from); |
---|
3085 | | -} |
---|
3086 | | -EXPORT_SYMBOL(generic_write_checks); |
---|
3087 | 3323 | |
---|
3088 | 3324 | int pagecache_write_begin(struct file *file, struct address_space *mapping, |
---|
3089 | 3325 | loff_t pos, unsigned len, unsigned flags, |
---|
.. | .. |
---|
3106 | 3342 | } |
---|
3107 | 3343 | EXPORT_SYMBOL(pagecache_write_end); |
---|
3108 | 3344 | |
---|
| 3345 | +/* |
---|
| 3346 | + * Warn about a page cache invalidation failure during a direct I/O write. |
---|
| 3347 | + */ |
---|
| 3348 | +void dio_warn_stale_pagecache(struct file *filp) |
---|
| 3349 | +{ |
---|
| 3350 | + static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); |
---|
| 3351 | + char pathname[128]; |
---|
| 3352 | + struct inode *inode = file_inode(filp); |
---|
| 3353 | + char *path; |
---|
| 3354 | + |
---|
| 3355 | + errseq_set(&inode->i_mapping->wb_err, -EIO); |
---|
| 3356 | + if (__ratelimit(&_rs)) { |
---|
| 3357 | + path = file_path(filp, pathname, sizeof(pathname)); |
---|
| 3358 | + if (IS_ERR(path)) |
---|
| 3359 | + path = "(unknown)"; |
---|
| 3360 | + pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n"); |
---|
| 3361 | + pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, |
---|
| 3362 | + current->comm); |
---|
| 3363 | + } |
---|
| 3364 | +} |
---|
| 3365 | + |
---|
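dio_warn_stale_pagecache() above uses the kernel's ratelimit helpers so the warning fires at most once per interval. A minimal sketch of that pattern in isolation (illustrative only; the helper name and message are hypothetical):

```c
#include <linux/jiffies.h>
#include <linux/printk.h>
#include <linux/ratelimit.h>

/* Hypothetical helper: report a noisy condition at most once per hour. */
static void warn_rarely(void)
{
	static DEFINE_RATELIMIT_STATE(rs, 3600 * HZ, 1);	/* interval, burst */

	if (__ratelimit(&rs))
		pr_warn("noisy condition hit; further reports suppressed for an hour\n");
}
```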
3109 | 3366 | ssize_t |
---|
3110 | 3367 | generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) |
---|
3111 | 3368 | { |
---|
.. | .. |
---|
3123 | 3380 | if (iocb->ki_flags & IOCB_NOWAIT) { |
---|
3124 | 3381 | /* If there are pages to writeback, return */ |
---|
3125 | 3382 | if (filemap_range_has_page(inode->i_mapping, pos, |
---|
3126 | | - pos + iov_iter_count(from))) |
---|
| 3383 | + pos + write_len - 1)) |
---|
3127 | 3384 | return -EAGAIN; |
---|
3128 | 3385 | } else { |
---|
3129 | 3386 | written = filemap_write_and_wait_range(mapping, pos, |
---|
.. | .. |
---|
3163 | 3420 | * Most of the time we do not need this since dio_complete() will do |
---|
3164 | 3421 | * the invalidation for us. However there are some file systems that |
---|
3165 | 3422 | * do not end up with dio_complete() being called, so let's not break |
---|
3166 | | - * them by removing it completely |
---|
| 3423 | + * them by removing it completely. |
---|
| 3424 | + * |
---|
| 3425 | + * A notable example is blkdev_direct_IO(). |
---|
| 3426 | + * |
---|
| 3427 | + * Skip invalidation for async writes or if mapping has no pages. |
---|
3167 | 3428 | */ |
---|
3168 | | - if (mapping->nrpages) |
---|
3169 | | - invalidate_inode_pages2_range(mapping, |
---|
3170 | | - pos >> PAGE_SHIFT, end); |
---|
| 3429 | + if (written > 0 && mapping->nrpages && |
---|
| 3430 | + invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end)) |
---|
| 3431 | + dio_warn_stale_pagecache(file); |
---|
3171 | 3432 | |
---|
3172 | 3433 | if (written > 0) { |
---|
3173 | 3434 | pos += written; |
---|
.. | .. |
---|
3220 | 3481 | unsigned long offset; /* Offset into pagecache page */ |
---|
3221 | 3482 | unsigned long bytes; /* Bytes to write to page */ |
---|
3222 | 3483 | size_t copied; /* Bytes copied from user */ |
---|
3223 | | - void *fsdata; |
---|
| 3484 | + void *fsdata = NULL; |
---|
3224 | 3485 | |
---|
3225 | 3486 | offset = (pos & (PAGE_SIZE - 1)); |
---|
3226 | 3487 | bytes = min_t(unsigned long, PAGE_SIZE - offset, |
---|
.. | .. |
---|
3306 | 3567 | * This function does *not* take care of syncing data in case of O_SYNC write. |
---|
3307 | 3568 | * A caller has to handle it. This is mainly due to the fact that we want to |
---|
3308 | 3569 | * avoid syncing under i_mutex. |
---|
| 3570 | + * |
---|
| 3571 | + * Return: |
---|
| 3572 | + * * number of bytes written, even for truncated writes |
---|
| 3573 | + * * negative error code if no data has been written at all |
---|
3309 | 3574 | */ |
---|
3310 | 3575 | ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) |
---|
3311 | 3576 | { |
---|
.. | .. |
---|
3390 | 3655 | * This is a wrapper around __generic_file_write_iter() to be used by most |
---|
3391 | 3656 | * filesystems. It takes care of syncing the file in case of O_SYNC file |
---|
3392 | 3657 | * and acquires i_mutex as needed. |
---|
| 3658 | + * Return: |
---|
| 3659 | + * * negative error code if no data has been written at all or |
---|
| 3660 | + * vfs_fsync_range() failed for a synchronous write |
---|
| 3661 | + * * number of bytes written, even for truncated writes |
---|
3393 | 3662 | */ |
---|
3394 | 3663 | ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) |
---|
3395 | 3664 | { |
---|
.. | .. |
---|
3416 | 3685 | * @gfp_mask: memory allocation flags (and I/O mode) |
---|
3417 | 3686 | * |
---|
3418 | 3687 | * The address_space is to try to release any data against the page |
---|
3419 | | - * (presumably at page->private). If the release was successful, return '1'. |
---|
3420 | | - * Otherwise return zero. |
---|
| 3688 | + * (presumably at page->private). |
---|
3421 | 3689 | * |
---|
3422 | 3690 | * This may also be called if PG_fscache is set on a page, indicating that the |
---|
3423 | 3691 | * page is known to the local caching routines. |
---|
.. | .. |
---|
3425 | 3693 | * The @gfp_mask argument specifies whether I/O may be performed to release |
---|
3426 | 3694 | * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS). |
---|
3427 | 3695 | * |
---|
| 3696 | + * Return: %1 if the release was successful, otherwise return zero. |
---|
3428 | 3697 | */ |
---|
3429 | 3698 | int try_to_release_page(struct page *page, gfp_t gfp_mask) |
---|
3430 | 3699 | { |
---|