.. | ..
11 | 11 | #include <linux/syscalls.h>
12 | 12 | #include <linux/mempolicy.h>
13 | 13 | #include <linux/page-isolation.h>
| 14 | +#include <linux/page_idle.h>
14 | 15 | #include <linux/userfaultfd_k.h>
15 | 16 | #include <linux/hugetlb.h>
16 | 17 | #include <linux/falloc.h>
| 18 | +#include <linux/fadvise.h>
17 | 19 | #include <linux/sched.h>
| 20 | +#include <linux/sched/mm.h>
| 21 | +#include <linux/uio.h>
18 | 22 | #include <linux/ksm.h>
19 | 23 | #include <linux/fs.h>
20 | 24 | #include <linux/file.h>
21 | 25 | #include <linux/blkdev.h>
22 | 26 | #include <linux/backing-dev.h>
| 27 | +#include <linux/pagewalk.h>
23 | 28 | #include <linux/swap.h>
24 | 29 | #include <linux/swapops.h>
25 | 30 | #include <linux/shmem_fs.h>
26 | 31 | #include <linux/mmu_notifier.h>
| 32 | +#include <trace/hooks/mm.h>
27 | 33 |
28 | 34 | #include <asm/tlb.h>
29 | 35 |
30 | 36 | #include "internal.h"
31 | 37 |
| 38 | +struct madvise_walk_private {
| 39 | + struct mmu_gather *tlb;
| 40 | + bool pageout;
| 41 | + bool can_pageout_file;
| 42 | +};
| 43 | +
32 | 44 | /*
33 | 45 | * Any behaviour which results in changes to the vma->vm_flags needs to
34 | | - * take mmap_sem for writing. Others, which simply traverse vmas, need
| 46 | + * take mmap_lock for writing. Others, which simply traverse vmas, need
35 | 47 | * to only take it for reading.
36 | 48 | */
37 | 49 | static int madvise_need_mmap_write(int behavior)
.. | ..
40 | 52 | case MADV_REMOVE:
41 | 53 | case MADV_WILLNEED:
42 | 54 | case MADV_DONTNEED:
| 55 | + case MADV_COLD:
| 56 | + case MADV_PAGEOUT:
43 | 57 | case MADV_FREE:
44 | 58 | return 0;
45 | 59 | default:
.. | ..
105 | 119 | case MADV_MERGEABLE:
106 | 120 | case MADV_UNMERGEABLE:
107 | 121 | error = ksm_madvise(vma, start, end, behavior, &new_flags);
108 | | - if (error) {
109 | | - /*
110 | | - * madvise() returns EAGAIN if kernel resources, such as
111 | | - * slab, are temporarily unavailable.
112 | | - */
113 | | - if (error == -ENOMEM)
114 | | - error = -EAGAIN;
115 | | - goto out;
116 | | - }
| 122 | + if (error)
| 123 | + goto out_convert_errno;
117 | 124 | break;
118 | 125 | case MADV_HUGEPAGE:
119 | 126 | case MADV_NOHUGEPAGE:
120 | 127 | error = hugepage_madvise(vma, &new_flags, behavior);
121 | | - if (error) {
122 | | - /*
123 | | - * madvise() returns EAGAIN if kernel resources, such as
124 | | - * slab, are temporarily unavailable.
125 | | - */
126 | | - if (error == -ENOMEM)
127 | | - error = -EAGAIN;
128 | | - goto out;
129 | | - }
| 128 | + if (error)
| 129 | + goto out_convert_errno;
130 | 130 | break;
131 | 131 | }
132 | 132 |
.. | ..
152 | 152 | goto out;
153 | 153 | }
154 | 154 | error = __split_vma(mm, vma, start, 1);
155 | | - if (error) {
156 | | - /*
157 | | - * madvise() returns EAGAIN if kernel resources, such as
158 | | - * slab, are temporarily unavailable.
159 | | - */
160 | | - if (error == -ENOMEM)
161 | | - error = -EAGAIN;
162 | | - goto out;
163 | | - }
| 155 | + if (error)
| 156 | + goto out_convert_errno;
164 | 157 | }
165 | 158 |
166 | 159 | if (end != vma->vm_end) {
.. | ..
169 | 162 | goto out;
170 | 163 | }
171 | 164 | error = __split_vma(mm, vma, end, 0);
172 | | - if (error) {
173 | | - /*
174 | | - * madvise() returns EAGAIN if kernel resources, such as
175 | | - * slab, are temporarily unavailable.
176 | | - */
177 | | - if (error == -ENOMEM)
178 | | - error = -EAGAIN;
179 | | - goto out;
180 | | - }
| 165 | + if (error)
| 166 | + goto out_convert_errno;
181 | 167 | }
182 | 168 |
183 | 169 | success:
184 | 170 | /*
185 | | - * vm_flags is protected by the mmap_sem held in write mode.
| 171 | + * vm_flags is protected by the mmap_lock held in write mode.
186 | 172 | */
187 | | - vma->vm_flags = new_flags;
| 173 | + vm_write_begin(vma);
| 174 | + WRITE_ONCE(vma->vm_flags, new_flags);
| 175 | + vm_write_end(vma);
| 176 | +
| 177 | +out_convert_errno:
| 178 | + /*
| 179 | + * madvise() returns EAGAIN if kernel resources, such as
| 180 | + * slab, are temporarily unavailable.
| 181 | + */
| 182 | + if (error == -ENOMEM)
| 183 | + error = -EAGAIN;
188 | 184 | out:
189 | 185 | return error;
190 | 186 | }
.. | ..
225 | 221 | return 0;
226 | 222 | }
227 | 223 |
228 | | -static void force_swapin_readahead(struct vm_area_struct *vma,
229 | | - unsigned long start, unsigned long end)
230 | | -{
231 | | - struct mm_walk walk = {
232 | | - .mm = vma->vm_mm,
233 | | - .pmd_entry = swapin_walk_pmd_entry,
234 | | - .private = vma,
235 | | - };
236 | | -
237 | | - walk_page_range(start, end, &walk);
238 | | -
239 | | - lru_add_drain(); /* Push any new pages onto the LRU now */
240 | | -}
| 224 | +static const struct mm_walk_ops swapin_walk_ops = {
| 225 | + .pmd_entry = swapin_walk_pmd_entry,
| 226 | +};
241 | 227 |
242 | 228 | static void force_shm_swapin_readahead(struct vm_area_struct *vma,
243 | 229 | unsigned long start, unsigned long end,
244 | 230 | struct address_space *mapping)
245 | 231 | {
246 | | - pgoff_t index;
| 232 | + XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
| 233 | + pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
247 | 234 | struct page *page;
248 | | - swp_entry_t swap;
249 | 235 |
250 | | - for (; start < end; start += PAGE_SIZE) {
251 | | - index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
| 236 | + rcu_read_lock();
| 237 | + xas_for_each(&xas, page, end_index) {
| 238 | + swp_entry_t swap;
252 | 239 |
253 | | - page = find_get_entry(mapping, index);
254 | | - if (!radix_tree_exceptional_entry(page)) {
255 | | - if (page)
256 | | - put_page(page);
| 240 | + if (!xa_is_value(page))
257 | 241 | continue;
258 | | - }
| 242 | + xas_pause(&xas);
| 243 | + rcu_read_unlock();
| 244 | +
259 | 245 | swap = radix_to_swp_entry(page);
260 | 246 | page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
261 | 247 | NULL, 0, false);
262 | 248 | if (page)
263 | 249 | put_page(page);
| 250 | +
| 251 | + rcu_read_lock();
264 | 252 | }
| 253 | + rcu_read_unlock();
265 | 254 |
266 | 255 | lru_add_drain(); /* Push any new pages onto the LRU now */
267 | 256 | }
.. | ..
274 | 263 | struct vm_area_struct **prev,
275 | 264 | unsigned long start, unsigned long end)
276 | 265 | {
| 266 | + struct mm_struct *mm = vma->vm_mm;
277 | 267 | struct file *file = vma->vm_file;
| 268 | + loff_t offset;
278 | 269 |
279 | 270 | *prev = vma;
280 | 271 | #ifdef CONFIG_SWAP
281 | 272 | if (!file) {
282 | | - force_swapin_readahead(vma, start, end);
| 273 | + walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
| 274 | + lru_add_drain(); /* Push any new pages onto the LRU now */
283 | 275 | return 0;
284 | 276 | }
285 | 277 |
.. | ..
298 | 290 | return 0;
299 | 291 | }
300 | 292 |
301 | | - start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
302 | | - if (end > vma->vm_end)
303 | | - end = vma->vm_end;
304 | | - end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
| 293 | + /*
| 294 | + * Filesystem's fadvise may need to take various locks. We need to
| 295 | + * explicitly grab a reference because the vma (and hence the
| 296 | + * vma's reference to the file) can go away as soon as we drop
| 297 | + * mmap_lock.
| 298 | + */
| 299 | + *prev = NULL; /* tell sys_madvise we drop mmap_lock */
| 300 | + get_file(file);
| 301 | + offset = (loff_t)(start - vma->vm_start)
| 302 | + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
| 303 | + mmap_read_unlock(mm);
| 304 | + vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
| 305 | + fput(file);
| 306 | + mmap_read_lock(mm);
| 307 | + return 0;
| 308 | +}
305 | 309 |
306 | | - force_page_cache_readahead(file->f_mapping, file, start, end - start);
| 310 | +static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
| 311 | + unsigned long addr, unsigned long end,
| 312 | + struct mm_walk *walk)
| 313 | +{
| 314 | + struct madvise_walk_private *private = walk->private;
| 315 | + struct mmu_gather *tlb = private->tlb;
| 316 | + bool pageout = private->pageout;
| 317 | + bool pageout_anon_only = pageout && !private->can_pageout_file;
| 318 | + struct mm_struct *mm = tlb->mm;
| 319 | + struct vm_area_struct *vma = walk->vma;
| 320 | + pte_t *orig_pte, *pte, ptent;
| 321 | + spinlock_t *ptl;
| 322 | + struct page *page = NULL;
| 323 | + LIST_HEAD(page_list);
| 324 | + bool allow_shared = false;
| 325 | + bool abort_madvise = false;
| 326 | + bool skip = false;
| 327 | +
| 328 | + trace_android_vh_madvise_cold_or_pageout_abort(vma, &abort_madvise);
| 329 | + if (fatal_signal_pending(current) || abort_madvise)
| 330 | + return -EINTR;
| 331 | +
| 332 | + trace_android_vh_madvise_cold_or_pageout(vma, &allow_shared);
| 333 | +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
| 334 | + if (pmd_trans_huge(*pmd)) {
| 335 | + pmd_t orig_pmd;
| 336 | + unsigned long next = pmd_addr_end(addr, end);
| 337 | +
| 338 | + tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
| 339 | + ptl = pmd_trans_huge_lock(pmd, vma);
| 340 | + if (!ptl)
| 341 | + return 0;
| 342 | +
| 343 | + orig_pmd = *pmd;
| 344 | + if (is_huge_zero_pmd(orig_pmd))
| 345 | + goto huge_unlock;
| 346 | +
| 347 | + if (unlikely(!pmd_present(orig_pmd))) {
| 348 | + VM_BUG_ON(thp_migration_supported() &&
| 349 | + !is_pmd_migration_entry(orig_pmd));
| 350 | + goto huge_unlock;
| 351 | + }
| 352 | +
| 353 | + page = pmd_page(orig_pmd);
| 354 | +
| 355 | + /* Do not interfere with other mappings of this page */
| 356 | + if (page_mapcount(page) != 1)
| 357 | + goto huge_unlock;
| 358 | +
| 359 | + if (pageout_anon_only && !PageAnon(page))
| 360 | + goto huge_unlock;
| 361 | +
| 362 | + if (next - addr != HPAGE_PMD_SIZE) {
| 363 | + int err;
| 364 | +
| 365 | + get_page(page);
| 366 | + spin_unlock(ptl);
| 367 | + lock_page(page);
| 368 | + err = split_huge_page(page);
| 369 | + unlock_page(page);
| 370 | + put_page(page);
| 371 | + if (!err)
| 372 | + goto regular_page;
| 373 | + return 0;
| 374 | + }
| 375 | +
| 376 | + if (pmd_young(orig_pmd)) {
| 377 | + pmdp_invalidate(vma, addr, pmd);
| 378 | + orig_pmd = pmd_mkold(orig_pmd);
| 379 | +
| 380 | + set_pmd_at(mm, addr, pmd, orig_pmd);
| 381 | + tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
| 382 | + }
| 383 | +
| 384 | + ClearPageReferenced(page);
| 385 | + test_and_clear_page_young(page);
| 386 | + if (pageout) {
| 387 | + if (!isolate_lru_page(page)) {
| 388 | + if (PageUnevictable(page))
| 389 | + putback_lru_page(page);
| 390 | + else
| 391 | + list_add(&page->lru, &page_list);
| 392 | + }
| 393 | + } else
| 394 | + deactivate_page(page);
| 395 | +huge_unlock:
| 396 | + spin_unlock(ptl);
| 397 | + if (pageout)
| 398 | + reclaim_pages(&page_list);
| 399 | + return 0;
| 400 | + }
| 401 | +
| 402 | +regular_page:
| 403 | + if (pmd_trans_unstable(pmd))
| 404 | + return 0;
| 405 | +#endif
| 406 | + tlb_change_page_size(tlb, PAGE_SIZE);
| 407 | + orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
| 408 | + flush_tlb_batched_pending(mm);
| 409 | + arch_enter_lazy_mmu_mode();
| 410 | + for (; addr < end; pte++, addr += PAGE_SIZE) {
| 411 | + ptent = *pte;
| 412 | +
| 413 | + if (pte_none(ptent))
| 414 | + continue;
| 415 | +
| 416 | + if (!pte_present(ptent))
| 417 | + continue;
| 418 | +
| 419 | + page = vm_normal_page(vma, addr, ptent);
| 420 | + if (!page)
| 421 | + continue;
| 422 | +
| 423 | + trace_android_vh_should_end_madvise(mm, &skip, &pageout);
| 424 | + if (skip)
| 425 | + break;
| 426 | +
| 427 | + /*
| 428 | + * Creating a THP page is expensive so split it only if we
| 429 | + * are sure it's worth. Split it if we are only owner.
| 430 | + */
| 431 | + if (PageTransCompound(page)) {
| 432 | + if (page_mapcount(page) != 1)
| 433 | + break;
| 434 | + if (pageout_anon_only && !PageAnon(page))
| 435 | + break;
| 436 | + get_page(page);
| 437 | + if (!trylock_page(page)) {
| 438 | + put_page(page);
| 439 | + break;
| 440 | + }
| 441 | + pte_unmap_unlock(orig_pte, ptl);
| 442 | + if (split_huge_page(page)) {
| 443 | + unlock_page(page);
| 444 | + put_page(page);
| 445 | + pte_offset_map_lock(mm, pmd, addr, &ptl);
| 446 | + break;
| 447 | + }
| 448 | + unlock_page(page);
| 449 | + put_page(page);
| 450 | + pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
| 451 | + pte--;
| 452 | + addr -= PAGE_SIZE;
| 453 | + continue;
| 454 | + }
| 455 | +
| 456 | + /*
| 457 | + * Do not interfere with other mappings of this page and
| 458 | + * non-LRU page.
| 459 | + */
| 460 | + if (!allow_shared && (!PageLRU(page) || page_mapcount(page) != 1))
| 461 | + continue;
| 462 | +
| 463 | + if (pageout_anon_only && !PageAnon(page))
| 464 | + continue;
| 465 | +
| 466 | + VM_BUG_ON_PAGE(PageTransCompound(page), page);
| 467 | +
| 468 | + if (pte_young(ptent)) {
| 469 | + ptent = ptep_get_and_clear_full(mm, addr, pte,
| 470 | + tlb->fullmm);
| 471 | + ptent = pte_mkold(ptent);
| 472 | + set_pte_at(mm, addr, pte, ptent);
| 473 | + tlb_remove_tlb_entry(tlb, pte, addr);
| 474 | + }
| 475 | +
| 476 | + /*
| 477 | + * We are deactivating a page for accelerating reclaiming.
| 478 | + * VM couldn't reclaim the page unless we clear PG_young.
| 479 | + * As a side effect, it makes confuse idle-page tracking
| 480 | + * because they will miss recent referenced history.
| 481 | + */
| 482 | + ClearPageReferenced(page);
| 483 | + test_and_clear_page_young(page);
| 484 | + if (pageout) {
| 485 | + if (!isolate_lru_page(page)) {
| 486 | + if (PageUnevictable(page))
| 487 | + putback_lru_page(page);
| 488 | + else {
| 489 | + list_add(&page->lru, &page_list);
| 490 | + trace_android_vh_page_isolated_for_reclaim(mm, page);
| 491 | + }
| 492 | + }
| 493 | + } else
| 494 | + deactivate_page(page);
| 495 | + }
| 496 | +
| 497 | + arch_leave_lazy_mmu_mode();
| 498 | + pte_unmap_unlock(orig_pte, ptl);
| 499 | + if (pageout)
| 500 | + reclaim_pages(&page_list);
| 501 | + cond_resched();
| 502 | +
| 503 | + return 0;
| 504 | +}
| 505 | +
| 506 | +static const struct mm_walk_ops cold_walk_ops = {
| 507 | + .pmd_entry = madvise_cold_or_pageout_pte_range,
| 508 | +};
| 509 | +
| 510 | +static void madvise_cold_page_range(struct mmu_gather *tlb,
| 511 | + struct vm_area_struct *vma,
| 512 | + unsigned long addr, unsigned long end)
| 513 | +{
| 514 | + struct madvise_walk_private walk_private = {
| 515 | + .pageout = false,
| 516 | + .tlb = tlb,
| 517 | + };
| 518 | +
| 519 | + tlb_start_vma(tlb, vma);
| 520 | + walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
| 521 | + tlb_end_vma(tlb, vma);
| 522 | +}
| 523 | +
| 524 | +static long madvise_cold(struct vm_area_struct *vma,
| 525 | + struct vm_area_struct **prev,
| 526 | + unsigned long start_addr, unsigned long end_addr)
| 527 | +{
| 528 | + struct mm_struct *mm = vma->vm_mm;
| 529 | + struct mmu_gather tlb;
| 530 | +
| 531 | + *prev = vma;
| 532 | + if (!can_madv_lru_vma(vma))
| 533 | + return -EINVAL;
| 534 | +
| 535 | + lru_add_drain();
| 536 | + tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
| 537 | + madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
| 538 | + tlb_finish_mmu(&tlb, start_addr, end_addr);
| 539 | +
| 540 | + return 0;
| 541 | +}
| 542 | +
| 543 | +static void madvise_pageout_page_range(struct mmu_gather *tlb,
| 544 | + struct vm_area_struct *vma,
| 545 | + unsigned long addr, unsigned long end,
| 546 | + bool can_pageout_file)
| 547 | +{
| 548 | + struct madvise_walk_private walk_private = {
| 549 | + .pageout = true,
| 550 | + .tlb = tlb,
| 551 | + .can_pageout_file = can_pageout_file,
| 552 | + };
| 553 | +
| 554 | + tlb_start_vma(tlb, vma);
| 555 | + walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
| 556 | + tlb_end_vma(tlb, vma);
| 557 | +}
| 558 | +
| 559 | +static inline bool can_do_file_pageout(struct vm_area_struct *vma)
| 560 | +{
| 561 | + if (!vma->vm_file)
| 562 | + return false;
| 563 | + /*
| 564 | + * paging out pagecache only for non-anonymous mappings that correspond
| 565 | + * to the files the calling process could (if tried) open for writing;
| 566 | + * otherwise we'd be including shared non-exclusive mappings, which
| 567 | + * opens a side channel.
| 568 | + */
| 569 | + return inode_owner_or_capable(file_inode(vma->vm_file)) ||
| 570 | + inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
| 571 | +}
| 572 | +
| 573 | +static long madvise_pageout(struct vm_area_struct *vma,
| 574 | + struct vm_area_struct **prev,
| 575 | + unsigned long start_addr, unsigned long end_addr)
| 576 | +{
| 577 | + struct mm_struct *mm = vma->vm_mm;
| 578 | + struct mmu_gather tlb;
| 579 | + bool can_pageout_file;
| 580 | +
| 581 | + *prev = vma;
| 582 | + if (!can_madv_lru_vma(vma))
| 583 | + return -EINVAL;
| 584 | +
| 585 | + /*
| 586 | + * If the VMA belongs to a private file mapping, there can be private
| 587 | + * dirty pages which can be paged out if even this process is neither
| 588 | + * owner nor write capable of the file. Cache the file access check
| 589 | + * here and use it later during page walk.
| 590 | + */
| 591 | + can_pageout_file = can_do_file_pageout(vma);
| 592 | +
| 593 | + lru_add_drain();
| 594 | + tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
| 595 | + madvise_pageout_page_range(&tlb, vma, start_addr, end_addr, can_pageout_file);
| 596 | + tlb_finish_mmu(&tlb, start_addr, end_addr);
| 597 | +
307 | 598 | return 0;
308 | 599 | }
309 | 600 |
.. | ..
328 | 619 | if (pmd_trans_unstable(pmd))
329 | 620 | return 0;
330 | 621 |
331 | | - tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
| 622 | + tlb_change_page_size(tlb, PAGE_SIZE);
332 | 623 | orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
333 | 624 | flush_tlb_batched_pending(mm);
334 | 625 | arch_enter_lazy_mmu_mode();
.. | ..
354 | 645 | continue;
355 | 646 | }
356 | 647 |
357 | | - page = _vm_normal_page(vma, addr, ptent, true);
| 648 | + page = vm_normal_page(vma, addr, ptent);
358 | 649 | if (!page)
359 | 650 | continue;
360 | 651 |
.. | ..
440 | 731 | return 0;
441 | 732 | }
442 | 733 |
443 | | -static void madvise_free_page_range(struct mmu_gather *tlb,
444 | | - struct vm_area_struct *vma,
445 | | - unsigned long addr, unsigned long end)
446 | | -{
447 | | - struct mm_walk free_walk = {
448 | | - .pmd_entry = madvise_free_pte_range,
449 | | - .mm = vma->vm_mm,
450 | | - .private = tlb,
451 | | - };
452 | | -
453 | | - tlb_start_vma(tlb, vma);
454 | | - walk_page_range(addr, end, &free_walk);
455 | | - tlb_end_vma(tlb, vma);
456 | | -}
| 734 | +static const struct mm_walk_ops madvise_free_walk_ops = {
| 735 | + .pmd_entry = madvise_free_pte_range,
| 736 | +};
457 | 737 |
458 | 738 | static int madvise_free_single_vma(struct vm_area_struct *vma,
459 | 739 | unsigned long start_addr, unsigned long end_addr)
460 | 740 | {
461 | | - unsigned long start, end;
462 | 741 | struct mm_struct *mm = vma->vm_mm;
| 742 | + struct mmu_notifier_range range;
463 | 743 | struct mmu_gather tlb;
464 | 744 |
465 | 745 | /* MADV_FREE works for only anon vma at the moment */
466 | 746 | if (!vma_is_anonymous(vma))
467 | 747 | return -EINVAL;
468 | 748 |
469 | | - start = max(vma->vm_start, start_addr);
470 | | - if (start >= vma->vm_end)
| 749 | + range.start = max(vma->vm_start, start_addr);
| 750 | + if (range.start >= vma->vm_end)
471 | 751 | return -EINVAL;
472 | | - end = min(vma->vm_end, end_addr);
473 | | - if (end <= vma->vm_start)
| 752 | + range.end = min(vma->vm_end, end_addr);
| 753 | + if (range.end <= vma->vm_start)
474 | 754 | return -EINVAL;
| 755 | + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
| 756 | + range.start, range.end);
475 | 757 |
476 | 758 | lru_add_drain();
477 | | - tlb_gather_mmu(&tlb, mm, start, end);
| 759 | + tlb_gather_mmu(&tlb, mm, range.start, range.end);
478 | 760 | update_hiwater_rss(mm);
479 | 761 |
480 | | - mmu_notifier_invalidate_range_start(mm, start, end);
481 | | - madvise_free_page_range(&tlb, vma, start, end);
482 | | - mmu_notifier_invalidate_range_end(mm, start, end);
483 | | - tlb_finish_mmu(&tlb, start, end);
| 762 | + mmu_notifier_invalidate_range_start(&range);
| 763 | + tlb_start_vma(&tlb, vma);
| 764 | + walk_page_range(vma->vm_mm, range.start, range.end,
| 765 | + &madvise_free_walk_ops, &tlb);
| 766 | + tlb_end_vma(&tlb, vma);
| 767 | + mmu_notifier_invalidate_range_end(&range);
| 768 | + tlb_finish_mmu(&tlb, range.start, range.end);
484 | 769 |
485 | 770 | return 0;
486 | 771 | }
.. | ..
516 | 801 | unsigned long start, unsigned long end,
517 | 802 | int behavior)
518 | 803 | {
| 804 | + struct mm_struct *mm = vma->vm_mm;
| 805 | +
519 | 806 | *prev = vma;
520 | | - if (!can_madv_dontneed_vma(vma))
| 807 | + if (!can_madv_lru_vma(vma))
521 | 808 | return -EINVAL;
522 | 809 |
523 | 810 | if (!userfaultfd_remove(vma, start, end)) {
524 | | - *prev = NULL; /* mmap_sem has been dropped, prev is stale */
| 811 | + *prev = NULL; /* mmap_lock has been dropped, prev is stale */
525 | 812 |
526 | | - down_read(&current->mm->mmap_sem);
527 | | - vma = find_vma(current->mm, start);
| 813 | + mmap_read_lock(mm);
| 814 | + vma = find_vma(mm, start);
528 | 815 | if (!vma)
529 | 816 | return -ENOMEM;
530 | 817 | if (start < vma->vm_start) {
.. | ..
539 | 826 | */
540 | 827 | return -ENOMEM;
541 | 828 | }
542 | | - if (!can_madv_dontneed_vma(vma))
| 829 | + if (!can_madv_lru_vma(vma))
543 | 830 | return -EINVAL;
544 | 831 | if (end > vma->vm_end) {
545 | 832 | /*
546 | 833 | * Don't fail if end > vma->vm_end. If the old
547 | | - * vma was splitted while the mmap_sem was
| 834 | + * vma was splitted while the mmap_lock was
548 | 835 | * released the effect of the concurrent
549 | 836 | * operation may not cause madvise() to
550 | 837 | * have an undefined result. There may be an
.. | ..
578 | 865 | loff_t offset;
579 | 866 | int error;
580 | 867 | struct file *f;
| 868 | + struct mm_struct *mm = vma->vm_mm;
581 | 869 |
582 | | - *prev = NULL; /* tell sys_madvise we drop mmap_sem */
| 870 | + *prev = NULL; /* tell sys_madvise we drop mmap_lock */
583 | 871 |
584 | 872 | if (vma->vm_flags & VM_LOCKED)
585 | 873 | return -EINVAL;
.. | ..
600 | 888 | * Filesystem's fallocate may need to take i_mutex. We need to
601 | 889 | * explicitly grab a reference because the vma (and hence the
602 | 890 | * vma's reference to the file) can go away as soon as we drop
603 | | - * mmap_sem.
| 891 | + * mmap_lock.
604 | 892 | */
605 | 893 | get_file(f);
606 | 894 | if (userfaultfd_remove(vma, start, end)) {
607 | | - /* mmap_sem was not released by userfaultfd_remove() */
608 | | - up_read(&current->mm->mmap_sem);
| 895 | + /* mmap_lock was not released by userfaultfd_remove() */
| 896 | + mmap_read_unlock(mm);
609 | 897 | }
610 | 898 | error = vfs_fallocate(f,
611 | 899 | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
612 | 900 | offset, end - start);
613 | 901 | fput(f);
614 | | - down_read(&current->mm->mmap_sem);
| 902 | + mmap_read_lock(mm);
615 | 903 | return error;
616 | 904 | }
617 | 905 |
.. | ..
622 | 910 | static int madvise_inject_error(int behavior,
623 | 911 | unsigned long start, unsigned long end)
624 | 912 | {
625 | | - struct page *page;
626 | 913 | struct zone *zone;
627 | | - unsigned int order;
| 914 | + unsigned long size;
628 | 915 |
629 | 916 | if (!capable(CAP_SYS_ADMIN))
630 | 917 | return -EPERM;
631 | 918 |
632 | 919 |
633 | | - for (; start < end; start += PAGE_SIZE << order) {
| 920 | + for (; start < end; start += size) {
634 | 921 | unsigned long pfn;
| 922 | + struct page *page;
635 | 923 | int ret;
636 | 924 |
637 | 925 | ret = get_user_pages_fast(start, 1, 0, &page);
.. | ..
642 | 930 | /*
643 | 931 | * When soft offlining hugepages, after migrating the page
644 | 932 | * we dissolve it, therefore in the second loop "page" will
645 | | - * no longer be a compound page, and order will be 0.
| 933 | + * no longer be a compound page.
646 | 934 | */
647 | | - order = compound_order(compound_head(page));
648 | | -
649 | | - if (PageHWPoison(page)) {
650 | | - put_page(page);
651 | | - continue;
652 | | - }
| 935 | + size = page_size(compound_head(page));
653 | 936 |
654 | 937 | if (behavior == MADV_SOFT_OFFLINE) {
655 | 938 | pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
656 | | - pfn, start);
657 | | -
658 | | - ret = soft_offline_page(page, MF_COUNT_INCREASED);
659 | | - if (ret)
660 | | - return ret;
661 | | - continue;
| 939 | + pfn, start);
| 940 | + ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
| 941 | + } else {
| 942 | + pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
| 943 | + pfn, start);
| 944 | + ret = memory_failure(pfn, MF_COUNT_INCREASED);
662 | 945 | }
663 | 946 |
664 | | - pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
665 | | - pfn, start);
666 | | -
667 | | - /*
668 | | - * Drop the page reference taken by get_user_pages_fast(). In
669 | | - * the absence of MF_COUNT_INCREASED the memory_failure()
670 | | - * routine is responsible for pinning the page to prevent it
671 | | - * from being released back to the page allocator.
672 | | - */
673 | | - put_page(page);
674 | | - ret = memory_failure(pfn, 0);
675 | 947 | if (ret)
676 | 948 | return ret;
677 | 949 | }
.. | ..
693 | 965 | return madvise_remove(vma, prev, start, end);
694 | 966 | case MADV_WILLNEED:
695 | 967 | return madvise_willneed(vma, prev, start, end);
| 968 | + case MADV_COLD:
| 969 | + return madvise_cold(vma, prev, start, end);
| 970 | + case MADV_PAGEOUT:
| 971 | + return madvise_pageout(vma, prev, start, end);
696 | 972 | case MADV_FREE:
697 | 973 | case MADV_DONTNEED:
698 | 974 | return madvise_dontneed_free(vma, prev, start, end, behavior);
.. | ..
714 | 990 | case MADV_WILLNEED:
715 | 991 | case MADV_DONTNEED:
716 | 992 | case MADV_FREE:
| 993 | + case MADV_COLD:
| 994 | + case MADV_PAGEOUT:
717 | 995 | #ifdef CONFIG_KSM
718 | 996 | case MADV_MERGEABLE:
719 | 997 | case MADV_UNMERGEABLE:
.. | ..
732 | 1010 | #endif
733 | 1011 | return true;
734 | 1012 |
| 1013 | + default:
| 1014 | + return false;
| 1015 | + }
| 1016 | +}
| 1017 | +
| 1018 | +static bool
| 1019 | +process_madvise_behavior_valid(int behavior)
| 1020 | +{
| 1021 | + switch (behavior) {
| 1022 | + case MADV_COLD:
| 1023 | + case MADV_PAGEOUT:
| 1024 | + case MADV_WILLNEED:
| 1025 | + return true;
735 | 1026 | default:
736 | 1027 | return false;
737 | 1028 | }
.. | ..
784 | 1075 | * MADV_DONTDUMP - the application wants to prevent pages in the given range
785 | 1076 | * from being included in its core dump.
786 | 1077 | * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
| 1078 | + * MADV_COLD - the application is not expected to use this memory soon,
| 1079 | + * deactivate pages in this range so that they can be reclaimed
| 1080 | + * easily if memory pressure happens.
| 1081 | + * MADV_PAGEOUT - the application is not expected to use this memory soon,
| 1082 | + * page out the pages in this range immediately.
787 | 1083 | *
788 | 1084 | * return values:
789 | 1085 | * zero - success
.. | ..
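A minimal userspace sketch of the two new hints documented above (not part of the patch). It assumes the uapi values MADV_COLD (20) and MADV_PAGEOUT (21) from include/uapi/asm-generic/mman-common.h, defined as a fallback in case the installed <sys/mman.h> is older:

/* exercise MADV_COLD / MADV_PAGEOUT on an anonymous mapping */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_COLD
#define MADV_COLD	20	/* include/uapi/asm-generic/mman-common.h */
#endif
#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT	21
#endif

int main(void)
{
	size_t len = 64UL << 20;	/* 64MB of anonymous memory */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(buf, 0xaa, len);		/* fault the pages in */

	/* weaker hint: deactivate the pages, contents are preserved */
	if (madvise(buf, len, MADV_COLD))
		perror("madvise(MADV_COLD)");

	/* stronger hint: reclaim the pages right away (swap for anon) */
	if (madvise(buf, len, MADV_PAGEOUT))
		perror("madvise(MADV_PAGEOUT)");

	/* the data is still valid; touching it simply faults it back in */
	printf("buf[0] = 0x%02x\n", (unsigned char)buf[0]);
	munmap(buf, len);
	return 0;
}

Both hints are advisory: unlike MADV_DONTNEED or MADV_FREE they never discard data, they only change how soon the range is reclaimed.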
798 | 1094 | * -EBADF - map exists, but area maps something that isn't a file.
799 | 1095 | * -EAGAIN - a kernel resource was temporarily unavailable.
800 | 1096 | */
801 | | -SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
| 1097 | +int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
802 | 1098 | {
803 | 1099 | unsigned long end, tmp;
804 | 1100 | struct vm_area_struct *vma, *prev;
.. | ..
813 | 1109 | if (!madvise_behavior_valid(behavior))
814 | 1110 | return error;
815 | 1111 |
816 | | - if (start & ~PAGE_MASK)
| 1112 | + if (!PAGE_ALIGNED(start))
817 | 1113 | return error;
818 | | - len = (len_in + ~PAGE_MASK) & PAGE_MASK;
| 1114 | + len = PAGE_ALIGN(len_in);
819 | 1115 |
820 | 1116 | /* Check to see whether len was rounded up from small -ve to zero */
821 | 1117 | if (len_in && !len)
.. | ..
836 | 1132 |
837 | 1133 | write = madvise_need_mmap_write(behavior);
838 | 1134 | if (write) {
839 | | - if (down_write_killable(&current->mm->mmap_sem))
| 1135 | + if (mmap_write_lock_killable(mm))
840 | 1136 | return -EINTR;
841 | 1137 | } else {
842 | | - down_read(&current->mm->mmap_sem);
| 1138 | + mmap_read_lock(mm);
843 | 1139 | }
844 | 1140 |
845 | 1141 | /*
.. | ..
847 | 1143 | * ranges, just ignore them, but return -ENOMEM at the end.
848 | 1144 | * - different from the way of handling in mlock etc.
849 | 1145 | */
850 | | - vma = find_vma_prev(current->mm, start, &prev);
| 1146 | + vma = find_vma_prev(mm, start, &prev);
851 | 1147 | if (vma && start > vma->vm_start)
852 | 1148 | prev = vma;
853 | 1149 |
.. | ..
883 | 1179 | goto out;
884 | 1180 | if (prev)
885 | 1181 | vma = prev->vm_next;
886 | | - else /* madvise_remove dropped mmap_sem */
887 | | - vma = find_vma(current->mm, start);
| 1182 | + else /* madvise_remove dropped mmap_lock */
| 1183 | + vma = find_vma(mm, start);
888 | 1184 | }
889 | 1185 | out:
890 | 1186 | blk_finish_plug(&plug);
891 | 1187 | if (write)
892 | | - up_write(&current->mm->mmap_sem);
| 1188 | + mmap_write_unlock(mm);
893 | 1189 | else
894 | | - up_read(&current->mm->mmap_sem);
| 1190 | + mmap_read_unlock(mm);
895 | 1191 |
896 | 1192 | return error;
897 | 1193 | }
| 1194 | +
| 1195 | +SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
| 1196 | +{
| 1197 | + return do_madvise(current->mm, start, len_in, behavior);
| 1198 | +}
| 1199 | +
| 1200 | +SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
| 1201 | + size_t, vlen, int, behavior, unsigned int, flags)
| 1202 | +{
| 1203 | + ssize_t ret;
| 1204 | + struct iovec iovstack[UIO_FASTIOV], iovec;
| 1205 | + struct iovec *iov = iovstack;
| 1206 | + struct iov_iter iter;
| 1207 | + struct pid *pid;
| 1208 | + struct task_struct *task;
| 1209 | + struct mm_struct *mm;
| 1210 | + size_t total_len;
| 1211 | + unsigned int f_flags;
| 1212 | +
| 1213 | + if (flags != 0) {
| 1214 | + ret = -EINVAL;
| 1215 | + goto out;
| 1216 | + }
| 1217 | +
| 1218 | + ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
| 1219 | + if (ret < 0)
| 1220 | + goto out;
| 1221 | +
| 1222 | + pid = pidfd_get_pid(pidfd, &f_flags);
| 1223 | + if (IS_ERR(pid)) {
| 1224 | + ret = PTR_ERR(pid);
| 1225 | + goto free_iov;
| 1226 | + }
| 1227 | +
| 1228 | + task = get_pid_task(pid, PIDTYPE_PID);
| 1229 | + if (!task) {
| 1230 | + ret = -ESRCH;
| 1231 | + goto put_pid;
| 1232 | + }
| 1233 | +
| 1234 | + if (!process_madvise_behavior_valid(behavior)) {
| 1235 | + ret = -EINVAL;
| 1236 | + goto release_task;
| 1237 | + }
| 1238 | +
| 1239 | + /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
| 1240 | + mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
| 1241 | + if (IS_ERR_OR_NULL(mm)) {
| 1242 | + ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
| 1243 | + goto release_task;
| 1244 | + }
| 1245 | +
| 1246 | + /*
| 1247 | + * Require CAP_SYS_NICE for influencing process performance. Note that
| 1248 | + * only non-destructive hints are currently supported.
| 1249 | + */
| 1250 | + if (!capable(CAP_SYS_NICE)) {
| 1251 | + ret = -EPERM;
| 1252 | + goto release_mm;
| 1253 | + }
| 1254 | +
| 1255 | + total_len = iov_iter_count(&iter);
| 1256 | +
| 1257 | + while (iov_iter_count(&iter)) {
| 1258 | + iovec = iov_iter_iovec(&iter);
| 1259 | + ret = do_madvise(mm, (unsigned long)iovec.iov_base,
| 1260 | + iovec.iov_len, behavior);
| 1261 | + if (ret < 0)
| 1262 | + break;
| 1263 | + iov_iter_advance(&iter, iovec.iov_len);
| 1264 | + }
| 1265 | +
| 1266 | + ret = (total_len - iov_iter_count(&iter)) ? : ret;
| 1267 | +
| 1268 | +release_mm:
| 1269 | + mmput(mm);
| 1270 | +release_task:
| 1271 | + put_task_struct(task);
| 1272 | +put_pid:
| 1273 | + put_pid(pid);
| 1274 | +free_iov:
| 1275 | + kfree(iov);
| 1276 | +out:
| 1277 | + return ret;
| 1278 | +}
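A minimal userspace sketch of driving the new process_madvise() syscall on another task through a pidfd (not part of the patch). The syscall numbers below are assumptions taken from the generic uapi tables (__NR_pidfd_open 434, __NR_process_madvise 440), since a libc wrapper may not be available; as enforced above, the caller needs CAP_SYS_NICE plus PTRACE_MODE_READ on the target:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef __NR_pidfd_open
#define __NR_pidfd_open		434
#endif
#ifndef __NR_process_madvise
#define __NR_process_madvise	440
#endif
#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT		21
#endif

int main(int argc, char **argv)
{
	struct iovec iov;
	ssize_t ret;
	int pidfd;

	if (argc != 4) {
		fprintf(stderr, "usage: %s <pid> <addr> <len>\n", argv[0]);
		return 1;
	}

	/* one range in the target's address space, taken from the command line */
	iov.iov_base = (void *)strtoul(argv[2], NULL, 0);
	iov.iov_len  = strtoul(argv[3], NULL, 0);

	pidfd = syscall(__NR_pidfd_open, atoi(argv[1]), 0);
	if (pidfd < 0) {
		perror("pidfd_open");
		return 1;
	}

	/* pidfd, iovec array, vlen, behavior, flags (must be 0) */
	ret = syscall(__NR_process_madvise, pidfd, &iov, 1, MADV_PAGEOUT, 0);
	if (ret < 0)
		perror("process_madvise");
	else
		printf("advised %zd bytes\n", ret);

	close(pidfd);
	return 0;
}

On success the return value is the number of bytes advised, which may be less than the total iovec length if do_madvise() failed partway through, mirroring the (total_len - iov_iter_count()) ?: ret logic above.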