@@ ... @@
 #include <uapi/linux/memfd.h>
 
 /*
- * We need a tag: a new tag would expand every radix_tree_node by 8 bytes,
+ * We need a tag: a new tag would expand every xa_node by 8 bytes,
  * so reuse a tag which we firmly believe is never set or cleared on tmpfs
  * or hugetlbfs because they are memory only filesystems.
  */
 #define MEMFD_TAG_PINNED        PAGECACHE_TAG_TOWRITE
 #define LAST_SCAN               4       /* about 150ms max */
 
-static void memfd_tag_pins(struct address_space *mapping)
+static void memfd_tag_pins(struct xa_state *xas)
 {
-	struct radix_tree_iter iter;
-	void __rcu **slot;
-	pgoff_t start;
 	struct page *page;
-	unsigned int tagged = 0;
+	int latency = 0;
+	int cache_count;
 
 	lru_add_drain();
-	start = 0;
 
-	xa_lock_irq(&mapping->i_pages);
-	radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
-		page = radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
-		if (!page || radix_tree_exception(page)) {
-			if (radix_tree_deref_retry(page)) {
-				slot = radix_tree_iter_retry(&iter);
-				continue;
-			}
-		} else if (page_count(page) - page_mapcount(page) > 1) {
-			radix_tree_tag_set(&mapping->i_pages, iter.index,
-					   MEMFD_TAG_PINNED);
-		}
+	xas_lock_irq(xas);
+	xas_for_each(xas, page, ULONG_MAX) {
+		cache_count = 1;
+		if (!xa_is_value(page) &&
+		    PageTransHuge(page) && !PageHuge(page))
+			cache_count = HPAGE_PMD_NR;
 
-		if (++tagged % 1024)
+		if (!xa_is_value(page) &&
+		    page_count(page) - total_mapcount(page) != cache_count)
+			xas_set_mark(xas, MEMFD_TAG_PINNED);
+		if (cache_count != 1)
+			xas_set(xas, page->index + cache_count);
+
+		latency += cache_count;
+		if (latency < XA_CHECK_SCHED)
 			continue;
+		latency = 0;
 
-		slot = radix_tree_iter_resume(slot, &iter);
-		xa_unlock_irq(&mapping->i_pages);
+		xas_pause(xas);
+		xas_unlock_irq(xas);
 		cond_resched();
-		xa_lock_irq(&mapping->i_pages);
+		xas_lock_irq(xas);
 	}
-	xa_unlock_irq(&mapping->i_pages);
+	xas_unlock_irq(xas);
 }
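For reference, the shape of the XArray walk that replaces the radix-tree loop above is the batch-and-reschedule idiom shown in the standalone sketch below. It is not code from this patch: walk_with_resched() and process_entry() are invented names, while XA_STATE(), xas_for_each(), xas_pause() and the XA_CHECK_SCHED batching constant are the real generic API from <linux/xarray.h>.

#include <linux/xarray.h>
#include <linux/sched.h>

/* Hypothetical per-entry work, standing in for mark set/clear. */
static void process_entry(void *entry);

static void walk_with_resched(struct xarray *xa)
{
	XA_STATE(xas, xa, 0);		/* iteration cursor starting at index 0 */
	void *entry;
	int latency = 0;

	xas_lock_irq(&xas);
	xas_for_each(&xas, entry, ULONG_MAX) {
		process_entry(entry);

		/* Every XA_CHECK_SCHED entries, give the scheduler a chance. */
		if (++latency < XA_CHECK_SCHED)
			continue;
		latency = 0;

		xas_pause(&xas);	/* park the cursor so it survives unlock */
		xas_unlock_irq(&xas);
		cond_resched();
		xas_lock_irq(&xas);
	}
	xas_unlock_irq(&xas);
}

xas_pause() is the important step: it records a safe restart point so the walk resumes correctly after the lock has been dropped and the tree possibly modified. Note the patch above additionally weights latency by cache_count, so a THP counts as HPAGE_PMD_NR entries toward the batch.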
 
 /*
@@ ... @@
  */
 static int memfd_wait_for_pins(struct address_space *mapping)
 {
-	struct radix_tree_iter iter;
-	void __rcu **slot;
-	pgoff_t start;
+	XA_STATE(xas, &mapping->i_pages, 0);
 	struct page *page;
 	int error, scan;
 
-	memfd_tag_pins(mapping);
+	memfd_tag_pins(&xas);
 
 	error = 0;
 	for (scan = 0; scan <= LAST_SCAN; scan++) {
-		if (!radix_tree_tagged(&mapping->i_pages, MEMFD_TAG_PINNED))
+		int latency = 0;
+		int cache_count;
+
+		if (!xas_marked(&xas, MEMFD_TAG_PINNED))
 			break;
 
 		if (!scan)
@@ ... @@
 		else if (schedule_timeout_killable((HZ << scan) / 200))
 			scan = LAST_SCAN;
 
-		start = 0;
-		rcu_read_lock();
-		radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter,
-					   start, MEMFD_TAG_PINNED) {
+		xas_set(&xas, 0);
+		xas_lock_irq(&xas);
+		xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
+			bool clear = true;
 
-			page = radix_tree_deref_slot(slot);
-			if (radix_tree_exception(page)) {
-				if (radix_tree_deref_retry(page)) {
-					slot = radix_tree_iter_retry(&iter);
-					continue;
-				}
+			cache_count = 1;
+			if (!xa_is_value(page) &&
+			    PageTransHuge(page) && !PageHuge(page))
+				cache_count = HPAGE_PMD_NR;
 
-				page = NULL;
-			}
-
-			if (page &&
-			    page_count(page) - page_mapcount(page) != 1) {
-				if (scan < LAST_SCAN)
-					goto continue_resched;
-
+			if (!xa_is_value(page) && cache_count !=
+			    page_count(page) - total_mapcount(page)) {
 				/*
 				 * On the last scan, we clean up all those tags
 				 * we inserted; but make a note that we still
 				 * found pages pinned.
 				 */
-				error = -EBUSY;
+				if (scan == LAST_SCAN)
+					error = -EBUSY;
+				else
+					clear = false;
 			}
+			if (clear)
+				xas_clear_mark(&xas, MEMFD_TAG_PINNED);
 
-			xa_lock_irq(&mapping->i_pages);
-			radix_tree_tag_clear(&mapping->i_pages,
-					     iter.index, MEMFD_TAG_PINNED);
-			xa_unlock_irq(&mapping->i_pages);
-continue_resched:
-			if (need_resched()) {
-				slot = radix_tree_iter_resume(slot, &iter);
-				cond_resched_rcu();
-			}
+			latency += cache_count;
+			if (latency < XA_CHECK_SCHED)
+				continue;
+			latency = 0;
+
+			xas_pause(&xas);
+			xas_unlock_irq(&xas);
+			cond_resched();
+			xas_lock_irq(&xas);
 		}
-		rcu_read_unlock();
+		xas_unlock_irq(&xas);
 	}
 
 	return error;
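The pin test that both loops now share can be read in isolation as the predicate below. This is a restatement sketch for clarity, not a helper that exists in the patch; the function name is invented.

#include <linux/mm.h>
#include <linux/huge_mm.h>

/*
 * Invented helper restating the check above: a page is "pinned" when
 * it holds references beyond those accounted for by the page cache
 * and by user page tables.
 */
static bool memfd_page_is_pinned(struct page *page)
{
	/*
	 * A transparent huge page (but not a hugetlbfs page) occupies
	 * HPAGE_PMD_NR page-cache slots, so it legitimately holds that
	 * many cache references instead of one.
	 */
	int cache_count = 1;

	if (PageTransHuge(page) && !PageHuge(page))
		cache_count = HPAGE_PMD_NR;

	/*
	 * Expected refcount: one per page-cache slot plus one per
	 * page-table mapping. Anything beyond that is an outside pin
	 * (e.g. get_user_pages()) that must block F_SEAL_WRITE.
	 */
	return page_count(page) - total_mapcount(page) != cache_count;
}

Using total_mapcount() instead of page_mapcount(), and comparing against cache_count rather than a fixed 1, is what makes the check correct for compound pages.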
@@ ... @@
 
 	if (flags & MFD_ALLOW_SEALING) {
 		file_seals = memfd_file_seals_ptr(file);
-		*file_seals &= ~F_SEAL_SEAL;
+		if (file_seals)
+			*file_seals &= ~F_SEAL_SEAL;
 	}
 
 	fd_install(fd, file);
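For context, the sealing path these changes back is driven from user space as in the minimal sketch below (error handling reduced to perror(); the memfd_create() wrapper needs glibc 2.27 or later).

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	/* MFD_ALLOW_SEALING is what clears F_SEAL_SEAL in the hunk above. */
	int fd = memfd_create("demo", MFD_CLOEXEC | MFD_ALLOW_SEALING);
	if (fd < 0) {
		perror("memfd_create");
		return 1;
	}

	if (write(fd, "hello", 5) != 5) {
		perror("write");
		return 1;
	}

	/*
	 * F_SEAL_WRITE is the seal that reaches memfd_wait_for_pins():
	 * adding it fails with EBUSY while any page of the file is still
	 * pinned, e.g. by a writable shared mapping.
	 */
	if (fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE) < 0)
		perror("F_ADD_SEALS");

	printf("seals: %#x\n", fcntl(fd, F_GET_SEALS));
	return 0;
}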