.. | .. |
---|
15 | 15 | #include <linux/atomic.h> |
---|
16 | 16 | #include <linux/debug_locks.h> |
---|
17 | 17 | #include <linux/mm_types.h> |
---|
| 18 | +#include <linux/mmap_lock.h> |
---|
18 | 19 | #include <linux/range.h> |
---|
19 | 20 | #include <linux/pfn.h> |
---|
20 | 21 | #include <linux/percpu-refcount.h> |
---|
.. | .. |
---|
23 | 24 | #include <linux/resource.h> |
---|
24 | 25 | #include <linux/page_ext.h> |
---|
25 | 26 | #include <linux/err.h> |
---|
| 27 | +#include <linux/page-flags.h> |
---|
26 | 28 | #include <linux/page_ref.h> |
---|
27 | 29 | #include <linux/memremap.h> |
---|
28 | 30 | #include <linux/overflow.h> |
---|
| 31 | +#include <linux/sizes.h> |
---|
| 32 | +#include <linux/sched.h> |
---|
| 33 | +#include <linux/pgtable.h> |
---|
| 34 | +#include <linux/kasan.h> |
---|
| 35 | +#include <linux/page_pinner.h> |
---|
29 | 36 | #include <linux/android_kabi.h> |
---|
30 | 37 | |
---|
31 | 38 | struct mempolicy; |
---|
.. | .. |
---|
35 | 42 | struct user_struct; |
---|
36 | 43 | struct writeback_control; |
---|
37 | 44 | struct bdi_writeback; |
---|
| 45 | +struct pt_regs; |
---|
| 46 | + |
---|
| 47 | +extern int sysctl_page_lock_unfairness; |
---|
38 | 48 | |
---|
39 | 49 | void init_mm_internals(void); |
---|
40 | 50 | |
---|
.. | .. |
---|
49 | 59 | static inline void set_max_mapnr(unsigned long limit) { } |
---|
50 | 60 | #endif |
---|
51 | 61 | |
---|
52 | | -extern unsigned long totalram_pages; |
---|
| 62 | +extern atomic_long_t _totalram_pages; |
---|
| 63 | +static inline unsigned long totalram_pages(void) |
---|
| 64 | +{ |
---|
| 65 | + return (unsigned long)atomic_long_read(&_totalram_pages); |
---|
| 66 | +} |
---|
| 67 | + |
---|
| 68 | +static inline void totalram_pages_inc(void) |
---|
| 69 | +{ |
---|
| 70 | + atomic_long_inc(&_totalram_pages); |
---|
| 71 | +} |
---|
| 72 | + |
---|
| 73 | +static inline void totalram_pages_dec(void) |
---|
| 74 | +{ |
---|
| 75 | + atomic_long_dec(&_totalram_pages); |
---|
| 76 | +} |
---|
| 77 | + |
---|
| 78 | +static inline void totalram_pages_add(long count) |
---|
| 79 | +{ |
---|
| 80 | + atomic_long_add(count, &_totalram_pages); |
---|
| 81 | +} |
---|
| 82 | + |
---|
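
The hunk above replaces the exported `totalram_pages` variable with an atomic counter plus accessor helpers, so readers call `totalram_pages()` and writers use the inc/dec/add helpers. A minimal, illustrative sketch of the new calling convention — the `example_*` helpers are invented for this note, only `totalram_pages()` and `totalram_pages_add()` come from the patch:

```c
/* Illustrative only: consuming the accessor-based API added above. */
#include <linux/mm.h>

static unsigned long example_low_mem_threshold(void)
{
	/* Read the counter through the helper instead of the old variable. */
	return totalram_pages() / 100;
}

static void example_balloon_adjust(long nr_pages)
{
	/* Hotplug/balloon-style code updates the counter atomically. */
	totalram_pages_add(nr_pages);
}
```
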
53 | 83 | extern void * high_memory; |
---|
54 | 84 | extern int page_cluster; |
---|
55 | 85 | |
---|
.. | .. |
---|
71 | 101 | #endif |
---|
72 | 102 | |
---|
73 | 103 | #include <asm/page.h> |
---|
74 | | -#include <asm/pgtable.h> |
---|
75 | 104 | #include <asm/processor.h> |
---|
76 | 105 | |
---|
77 | 106 | /* |
---|
.. | .. |
---|
87 | 116 | |
---|
88 | 117 | #ifndef __pa_symbol |
---|
89 | 118 | #define __pa_symbol(x) __pa(RELOC_HIDE((unsigned long)(x), 0)) |
---|
| 119 | +#endif |
---|
| 120 | + |
---|
| 121 | +#ifndef __va_function |
---|
| 122 | +#define __va_function(x) (x) |
---|
| 123 | +#endif |
---|
| 124 | + |
---|
| 125 | +#ifndef __pa_function |
---|
| 126 | +#define __pa_function(x) __pa_symbol(x) |
---|
90 | 127 | #endif |
---|
91 | 128 | |
---|
92 | 129 | #ifndef page_to_virt |
---|
.. | .. |
---|
110 | 147 | |
---|
111 | 148 | /* |
---|
112 | 149 | * On some architectures it is expensive to call memset() for small sizes. |
---|
113 | | - * Those architectures should provide their own implementation of "struct page" |
---|
114 | | - * zeroing by defining this macro in <asm/pgtable.h>. |
---|
| 150 | + * If an architecture decides to implement their own version of |
---|
| 151 | + * mm_zero_struct_page they should wrap the defines below in a #ifndef and |
---|
| 152 | + * define their own version of this macro in <asm/pgtable.h> |
---|
115 | 153 | */ |
---|
116 | | -#ifndef mm_zero_struct_page |
---|
| 154 | +#if BITS_PER_LONG == 64 |
---|
| 155 | +/* This function must be updated when the size of struct page grows above 80 |
---|
| 156 | + * or reduces below 56. The idea is that the compiler optimizes out the |
---|
| 157 | + * switch() statement and only leaves move/store instructions. Also, the |
---|
| 158 | + * compiler can combine write statements if they are both assignments and can |
---|
| 159 | + * be reordered; this can result in several of the writes here being dropped. |
---|
| 160 | + */ |
---|
| 161 | +#define mm_zero_struct_page(pp) __mm_zero_struct_page(pp) |
---|
| 162 | +static inline void __mm_zero_struct_page(struct page *page) |
---|
| 163 | +{ |
---|
| 164 | + unsigned long *_pp = (void *)page; |
---|
| 165 | + |
---|
| 166 | + /* Check that struct page is either 56, 64, 72, or 80 bytes */ |
---|
| 167 | + BUILD_BUG_ON(sizeof(struct page) & 7); |
---|
| 168 | + BUILD_BUG_ON(sizeof(struct page) < 56); |
---|
| 169 | + BUILD_BUG_ON(sizeof(struct page) > 80); |
---|
| 170 | + |
---|
| 171 | + switch (sizeof(struct page)) { |
---|
| 172 | + case 80: |
---|
| 173 | + _pp[9] = 0; |
---|
| 174 | + fallthrough; |
---|
| 175 | + case 72: |
---|
| 176 | + _pp[8] = 0; |
---|
| 177 | + fallthrough; |
---|
| 178 | + case 64: |
---|
| 179 | + _pp[7] = 0; |
---|
| 180 | + fallthrough; |
---|
| 181 | + case 56: |
---|
| 182 | + _pp[6] = 0; |
---|
| 183 | + _pp[5] = 0; |
---|
| 184 | + _pp[4] = 0; |
---|
| 185 | + _pp[3] = 0; |
---|
| 186 | + _pp[2] = 0; |
---|
| 187 | + _pp[1] = 0; |
---|
| 188 | + _pp[0] = 0; |
---|
| 189 | + } |
---|
| 190 | +} |
---|
| 191 | +#else |
---|
117 | 192 | #define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page))) |
---|
118 | 193 | #endif |
---|
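
For the common 64-byte `struct page`, the switch above collapses at compile time into eight word-sized stores (`_pp[7]` down to `_pp[0]`); the 72- and 80-byte configurations merely prepend one or two extra stores. A standalone sketch of the same fallthrough cascade, using a hypothetical 80-byte structure so every case stays in bounds:

```c
/*
 * Illustrative sketch of the fallthrough cascade used by
 * __mm_zero_struct_page(); struct fake_page is hypothetical.
 */
struct fake_page { unsigned long w[10]; };	/* 80 bytes on 64-bit */

static inline void zero_fake_page(struct fake_page *p)
{
	unsigned long *_pp = (unsigned long *)p;

	switch (sizeof(struct fake_page)) {	/* constant: dead cases vanish */
	case 80: _pp[9] = 0;	/* fall through */
	case 72: _pp[8] = 0;	/* fall through */
	case 64: _pp[7] = 0;	/* fall through */
	case 56:
		_pp[6] = 0; _pp[5] = 0; _pp[4] = 0; _pp[3] = 0;
		_pp[2] = 0; _pp[1] = 0; _pp[0] = 0;
	}
}
```
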
119 | 194 | |
---|
.. | .. |
---|
145 | 220 | extern int sysctl_overcommit_ratio; |
---|
146 | 221 | extern unsigned long sysctl_overcommit_kbytes; |
---|
147 | 222 | |
---|
148 | | -extern int overcommit_ratio_handler(struct ctl_table *, int, void __user *, |
---|
149 | | - size_t *, loff_t *); |
---|
150 | | -extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *, |
---|
151 | | - size_t *, loff_t *); |
---|
| 223 | +int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *, |
---|
| 224 | + loff_t *); |
---|
| 225 | +int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *, |
---|
| 226 | + loff_t *); |
---|
| 227 | +int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, |
---|
| 228 | + loff_t *); |
---|
152 | 229 | |
---|
153 | 230 | #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) |
---|
154 | 231 | |
---|
.. | .. |
---|
157 | 234 | |
---|
158 | 235 | /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ |
---|
159 | 236 | #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) |
---|
| 237 | + |
---|
| 238 | +#define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) |
---|
160 | 239 | |
---|
161 | 240 | /* |
---|
162 | 241 | * Linux kernel virtual memory manager primitives. |
---|
.. | .. |
---|
267 | 346 | #elif defined(CONFIG_SPARC64) |
---|
268 | 347 | # define VM_SPARC_ADI VM_ARCH_1 /* Uses ADI tag for access control */ |
---|
269 | 348 | # define VM_ARCH_CLEAR VM_SPARC_ADI |
---|
| 349 | +#elif defined(CONFIG_ARM64) |
---|
| 350 | +# define VM_ARM64_BTI VM_ARCH_1 /* BTI guarded page, a.k.a. GP bit */ |
---|
| 351 | +# define VM_ARCH_CLEAR VM_ARM64_BTI |
---|
270 | 352 | #elif !defined(CONFIG_MMU) |
---|
271 | 353 | # define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ |
---|
272 | 354 | #endif |
---|
273 | 355 | |
---|
274 | | -#if defined(CONFIG_X86_INTEL_MPX) |
---|
275 | | -/* MPX specific bounds table or bounds directory */ |
---|
276 | | -# define VM_MPX VM_HIGH_ARCH_4 |
---|
| 356 | +#if defined(CONFIG_ARM64_MTE) |
---|
| 357 | +# define VM_MTE VM_HIGH_ARCH_0 /* Use Tagged memory for access control */ |
---|
| 358 | +# define VM_MTE_ALLOWED VM_HIGH_ARCH_1 /* Tagged memory permitted */ |
---|
277 | 359 | #else |
---|
278 | | -# define VM_MPX VM_NONE |
---|
| 360 | +# define VM_MTE VM_NONE |
---|
| 361 | +# define VM_MTE_ALLOWED VM_NONE |
---|
279 | 362 | #endif |
---|
280 | 363 | |
---|
281 | 364 | #ifndef VM_GROWSUP |
---|
282 | 365 | # define VM_GROWSUP VM_NONE |
---|
283 | 366 | #endif |
---|
284 | 367 | |
---|
| 368 | +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR |
---|
| 369 | +# define VM_UFFD_MINOR_BIT 37 |
---|
| 370 | +# define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */ |
---|
| 371 | +#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ |
---|
| 372 | +# define VM_UFFD_MINOR VM_NONE |
---|
| 373 | +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ |
---|
| 374 | + |
---|
285 | 375 | /* Bits set in the VMA until the stack is in its final location */ |
---|
286 | 376 | #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) |
---|
| 377 | + |
---|
| 378 | +#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) |
---|
| 379 | + |
---|
| 380 | +/* Common data flag combinations */ |
---|
| 381 | +#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ |
---|
| 382 | + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) |
---|
| 383 | +#define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ |
---|
| 384 | + VM_MAYWRITE | VM_MAYEXEC) |
---|
| 385 | +#define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ |
---|
| 386 | + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) |
---|
| 387 | + |
---|
| 388 | +#ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ |
---|
| 389 | +#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC |
---|
| 390 | +#endif |
---|
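
The `VM_DATA_FLAGS_*` combinations let an architecture state its default data-mapping protection in one token instead of repeating the flag soup, with `TASK_EXEC` folding in `VM_EXEC` only for tasks whose personality sets `READ_IMPLIES_EXEC`. A hedged illustration of the arch-side choice — the macro names on the left are invented for the example:

```c
/* Illustrative arch-side defaults built from the new helpers. */
/* Honour READ_IMPLIES_EXEC for legacy binaries: */
#define EXAMPLE_ARCH_DATA_DEFAULT	VM_DATA_FLAGS_TSK_EXEC
/* Force strictly non-executable data segments: */
#define EXAMPLE_ARCH_DATA_NOEXEC	VM_DATA_FLAGS_NON_EXEC
```
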
287 | 391 | |
---|
288 | 392 | #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ |
---|
289 | 393 | #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS |
---|
.. | .. |
---|
297 | 401 | |
---|
298 | 402 | #define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) |
---|
299 | 403 | |
---|
| 404 | +/* VMA basic access permission flags */ |
---|
| 405 | +#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) |
---|
| 406 | + |
---|
| 407 | + |
---|
300 | 408 | /* |
---|
301 | 409 | * Special vmas that are non-mergable, non-mlock()able. |
---|
302 | | - * Note: mm/huge_memory.c VM_NO_THP depends on this definition. |
---|
303 | 410 | */ |
---|
304 | 411 | #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) |
---|
| 412 | + |
---|
| 413 | +/* This mask prevents VMA from being scanned with khugepaged */ |
---|
| 414 | +#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) |
---|
305 | 415 | |
---|
306 | 416 | /* This mask defines which mm->def_flags a process can inherit its parent */ |
---|
307 | 417 | #define VM_INIT_DEF_MASK VM_NOHUGEPAGE |
---|
.. | .. |
---|
321 | 431 | */ |
---|
322 | 432 | extern pgprot_t protection_map[16]; |
---|
323 | 433 | |
---|
324 | | -#define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */ |
---|
325 | | -#define FAULT_FLAG_MKWRITE 0x02 /* Fault was mkwrite of existing pte */ |
---|
326 | | -#define FAULT_FLAG_ALLOW_RETRY 0x04 /* Retry fault if blocking */ |
---|
327 | | -#define FAULT_FLAG_RETRY_NOWAIT 0x08 /* Don't drop mmap_sem and wait when retrying */ |
---|
328 | | -#define FAULT_FLAG_KILLABLE 0x10 /* The fault task is in SIGKILL killable region */ |
---|
329 | | -#define FAULT_FLAG_TRIED 0x20 /* Second try */ |
---|
330 | | -#define FAULT_FLAG_USER 0x40 /* The fault originated in userspace */ |
---|
331 | | -#define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */ |
---|
332 | | -#define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */ |
---|
| 434 | +/** |
---|
| 435 | + * Fault flag definitions. |
---|
| 436 | + * |
---|
| 437 | + * @FAULT_FLAG_WRITE: Fault was a write fault. |
---|
| 438 | + * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE. |
---|
| 439 | + * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked. |
---|
| 440 | + * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying. |
---|
| 441 | + * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region. |
---|
| 442 | + * @FAULT_FLAG_TRIED: The fault has been tried once. |
---|
| 443 | + * @FAULT_FLAG_USER: The fault originated in userspace. |
---|
| 444 | + * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. |
---|
| 445 | + * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. |
---|
| 446 | + * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. |
---|
| 447 | + * |
---|
| 448 | + * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify |
---|
| 449 | + * whether we would allow page faults to retry by specifying these two |
---|
| 450 | + * fault flags correctly. Currently there can be three legal combinations: |
---|
| 451 | + * |
---|
| 452 | + * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and |
---|
| 453 | + * this is the first try |
---|
| 454 | + * |
---|
| 455 | + * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and |
---|
| 456 | + * we've already tried at least once |
---|
| 457 | + * |
---|
| 458 | + * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry |
---|
| 459 | + * |
---|
| 460 | + * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never |
---|
| 461 | + * be used. Note that page faults can be allowed to retry for multiple times, |
---|
| 462 | + * in which case we'll have an initial fault with flags (a) then later on |
---|
| 463 | + * continuous faults with flags (b). We should always try to detect pending |
---|
| 464 | + * signals before a retry to make sure the continuous page faults can still be |
---|
| 465 | + * interrupted if necessary. |
---|
| 466 | + */ |
---|
| 467 | +#define FAULT_FLAG_WRITE 0x01 |
---|
| 468 | +#define FAULT_FLAG_MKWRITE 0x02 |
---|
| 469 | +#define FAULT_FLAG_ALLOW_RETRY 0x04 |
---|
| 470 | +#define FAULT_FLAG_RETRY_NOWAIT 0x08 |
---|
| 471 | +#define FAULT_FLAG_KILLABLE 0x10 |
---|
| 472 | +#define FAULT_FLAG_TRIED 0x20 |
---|
| 473 | +#define FAULT_FLAG_USER 0x40 |
---|
| 474 | +#define FAULT_FLAG_REMOTE 0x80 |
---|
| 475 | +#define FAULT_FLAG_INSTRUCTION 0x100 |
---|
| 476 | +#define FAULT_FLAG_INTERRUPTIBLE 0x200 |
---|
| 477 | +/* Speculative fault, not holding mmap_sem */ |
---|
| 478 | +#define FAULT_FLAG_SPECULATIVE 0x400 |
---|
| 479 | + |
---|
| 480 | +/* |
---|
| 481 | + * The default fault flags that should be used by most of the |
---|
| 482 | + * arch-specific page fault handlers. |
---|
| 483 | + */ |
---|
| 484 | +#define FAULT_FLAG_DEFAULT (FAULT_FLAG_ALLOW_RETRY | \ |
---|
| 485 | + FAULT_FLAG_KILLABLE | \ |
---|
| 486 | + FAULT_FLAG_INTERRUPTIBLE) |
---|
| 487 | + |
---|
| 488 | +/** |
---|
| 489 | + * fault_flag_allow_retry_first - check ALLOW_RETRY the first time |
---|
| 490 | + * |
---|
| 491 | + * This is mostly used for places where we want to try to avoid taking |
---|
| 492 | + * the mmap_lock for too long a time when waiting for another condition |
---|
| 493 | + * to change, in which case we can try to be polite to release the |
---|
| 494 | + * mmap_lock in the first round to avoid potential starvation of other |
---|
| 495 | + * processes that would also want the mmap_lock. |
---|
| 496 | + * |
---|
| 497 | + * Return: true if the page fault allows retry and this is the first |
---|
| 498 | + * attempt of the fault handling; false otherwise. |
---|
| 499 | + */ |
---|
| 500 | +static inline bool fault_flag_allow_retry_first(unsigned int flags) |
---|
| 501 | +{ |
---|
| 502 | + return (flags & FAULT_FLAG_ALLOW_RETRY) && |
---|
| 503 | + (!(flags & FAULT_FLAG_TRIED)); |
---|
| 504 | +} |
---|
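
`fault_flag_allow_retry_first()` is the helper fault paths use to decide whether they may drop the mmap_lock and ask the caller to retry instead of sleeping while holding it. A hedged sketch of the typical pattern, loosely modelled on the pagecache lock-or-retry logic — `example_lock_page_or_retry()` is not part of this patch:

```c
/*
 * Illustrative helper: drop mmap_lock and request a retry only on the
 * first attempt; later attempts simply sleep for the page lock.
 */
static vm_fault_t example_lock_page_or_retry(struct page *page,
					     struct vm_fault *vmf)
{
	if (trylock_page(page))
		return 0;

	if (fault_flag_allow_retry_first(vmf->flags) &&
	    !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
		/* Be polite on the first try: release mmap_lock and retry. */
		mmap_read_unlock(vmf->vma->vm_mm);
		return VM_FAULT_RETRY;
	}

	/* Otherwise sleep for the lock (real code also honours RETRY_NOWAIT). */
	lock_page(page);
	return 0;
}
```
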
333 | 505 | |
---|
334 | 506 | #define FAULT_FLAG_TRACE \ |
---|
335 | 507 | { FAULT_FLAG_WRITE, "WRITE" }, \ |
---|
.. | .. |
---|
340 | 512 | { FAULT_FLAG_TRIED, "TRIED" }, \ |
---|
341 | 513 | { FAULT_FLAG_USER, "USER" }, \ |
---|
342 | 514 | { FAULT_FLAG_REMOTE, "REMOTE" }, \ |
---|
343 | | - { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" } |
---|
| 515 | + { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ |
---|
| 516 | + { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" } |
---|
344 | 517 | |
---|
345 | 518 | /* |
---|
346 | | - * vm_fault is filled by the the pagefault handler and passed to the vma's |
---|
| 519 | + * vm_fault is filled by the pagefault handler and passed to the vma's |
---|
347 | 520 | * ->fault function. The vma's ->fault is responsible for returning a bitmask |
---|
348 | 521 | * of VM_FAULT_xxx flags that give details about how the fault was handled. |
---|
349 | 522 | * |
---|
.. | .. |
---|
353 | 526 | * pgoff should be used in favour of virtual_address, if possible. |
---|
354 | 527 | */ |
---|
355 | 528 | struct vm_fault { |
---|
356 | | - struct vm_area_struct *vma; /* Target VMA */ |
---|
357 | | - unsigned int flags; /* FAULT_FLAG_xxx flags */ |
---|
358 | | - gfp_t gfp_mask; /* gfp mask to be used for allocations */ |
---|
359 | | - pgoff_t pgoff; /* Logical page offset based on vma */ |
---|
360 | | - unsigned long address; /* Faulting virtual address */ |
---|
| 529 | +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT |
---|
| 530 | + unsigned int sequence; |
---|
| 531 | + pmd_t orig_pmd; /* value of PMD at the time of fault */ |
---|
| 532 | +#endif |
---|
| 533 | + const struct { |
---|
| 534 | + struct vm_area_struct *vma; /* Target VMA */ |
---|
| 535 | + gfp_t gfp_mask; /* gfp mask to be used for allocations */ |
---|
| 536 | + pgoff_t pgoff; /* Logical page offset based on vma */ |
---|
| 537 | + unsigned long address; /* Faulting virtual address */ |
---|
| 538 | + }; |
---|
| 539 | + unsigned int flags; /* FAULT_FLAG_xxx flags |
---|
| 540 | + * XXX: should really be 'const' */ |
---|
361 | 541 | pmd_t *pmd; /* Pointer to pmd entry matching |
---|
362 | 542 | * the 'address' */ |
---|
363 | 543 | pud_t *pud; /* Pointer to pud entry matching |
---|
.. | .. |
---|
366 | 546 | pte_t orig_pte; /* Value of PTE at the time of fault */ |
---|
367 | 547 | |
---|
368 | 548 | struct page *cow_page; /* Page handler may use for COW fault */ |
---|
369 | | - struct mem_cgroup *memcg; /* Cgroup cow_page belongs to */ |
---|
370 | 549 | struct page *page; /* ->fault handlers should return a |
---|
371 | 550 | * page here, unless VM_FAULT_NOPAGE |
---|
372 | 551 | * is set (which is also implied by |
---|
.. | .. |
---|
382 | 561 | * is not NULL, otherwise pmd. |
---|
383 | 562 | */ |
---|
384 | 563 | pgtable_t prealloc_pte; /* Pre-allocated pte page table. |
---|
385 | | - * vm_ops->map_pages() calls |
---|
386 | | - * alloc_set_pte() from atomic context. |
---|
| 564 | + * vm_ops->map_pages() sets up a page |
---|
| 565 | + * table from atomic context. |
---|
387 | 566 | * do_fault_around() pre-allocates |
---|
388 | 567 | * page table to avoid allocation from |
---|
389 | 568 | * atomic context. |
---|
390 | 569 | */ |
---|
| 570 | + /* |
---|
| 571 | + * These entries are required when handling speculative page fault. |
---|
| 572 | + * This way the page handling is done using consistent field values. |
---|
| 573 | + */ |
---|
| 574 | + unsigned long vma_flags; |
---|
| 575 | + pgprot_t vma_page_prot; |
---|
| 576 | + ANDROID_OEM_DATA_ARRAY(1, 2); |
---|
391 | 577 | }; |
---|
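
Making `vma`, `gfp_mask`, `pgoff` and `address` members of an anonymous `const` struct keeps the field names unchanged for readers, while any assignment after the structure is initialised fails at compile time; the speculative-fault path then works from the snapshot fields (`vma_flags`, `vma_page_prot`) rather than chasing a possibly stale VMA. A hedged, standalone illustration of the const-anonymous-struct trick — `struct example_fault` is invented for this note:

```c
/* Illustrative only: the read-only-after-init pattern used by struct vm_fault. */
struct example_fault {
	const struct {
		unsigned long address;	/* fixed for the lifetime of the fault */
	};
	unsigned int flags;		/* still writable by the handler */
};

static void example_use(struct example_fault *ef)
{
	unsigned long a = ef->address;	/* reads work through the same name */

	ef->flags |= 0x1;		/* writable member: OK */
	/* ef->address = 0; */		/* would not compile: member is const */
	(void)a;
}
```
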
392 | 578 | |
---|
393 | 579 | /* page entry size for vm->huge_fault() */ |
---|
.. | .. |
---|
410 | 596 | vm_fault_t (*fault)(struct vm_fault *vmf); |
---|
411 | 597 | vm_fault_t (*huge_fault)(struct vm_fault *vmf, |
---|
412 | 598 | enum page_entry_size pe_size); |
---|
413 | | - void (*map_pages)(struct vm_fault *vmf, |
---|
| 599 | + vm_fault_t (*map_pages)(struct vm_fault *vmf, |
---|
414 | 600 | pgoff_t start_pgoff, pgoff_t end_pgoff); |
---|
415 | 601 | unsigned long (*pagesize)(struct vm_area_struct * area); |
---|
416 | 602 | |
---|
.. | .. |
---|
447 | 633 | * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure |
---|
448 | 634 | * in mm/mempolicy.c will do this automatically. |
---|
449 | 635 | * get_policy() must NOT add a ref if the policy at (vma,addr) is not |
---|
450 | | - * marked as MPOL_SHARED. vma policies are protected by the mmap_sem. |
---|
| 636 | + * marked as MPOL_SHARED. vma policies are protected by the mmap_lock. |
---|
451 | 637 | * If no [shared/vma] mempolicy exists at the addr, get_policy() op |
---|
452 | 638 | * must return NULL--i.e., do not "fallback" to task or system default |
---|
453 | 639 | * policy. |
---|
.. | .. |
---|
463 | 649 | struct page *(*find_special_page)(struct vm_area_struct *vma, |
---|
464 | 650 | unsigned long addr); |
---|
465 | 651 | |
---|
| 652 | +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT |
---|
| 653 | + bool (*allow_speculation)(void); |
---|
| 654 | +#endif |
---|
| 655 | + |
---|
466 | 656 | ANDROID_KABI_RESERVE(1); |
---|
467 | 657 | ANDROID_KABI_RESERVE(2); |
---|
468 | 658 | ANDROID_KABI_RESERVE(3); |
---|
469 | 659 | ANDROID_KABI_RESERVE(4); |
---|
470 | 660 | }; |
---|
| 661 | + |
---|
| 662 | +static inline void INIT_VMA(struct vm_area_struct *vma) |
---|
| 663 | +{ |
---|
| 664 | + INIT_LIST_HEAD(&vma->anon_vma_chain); |
---|
| 665 | +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT |
---|
| 666 | + seqcount_init(&vma->vm_sequence); |
---|
| 667 | + atomic_set(&vma->vm_ref_count, 1); |
---|
| 668 | +#endif |
---|
| 669 | +} |
---|
471 | 670 | |
---|
472 | 671 | static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) |
---|
473 | 672 | { |
---|
.. | .. |
---|
476 | 675 | memset(vma, 0, sizeof(*vma)); |
---|
477 | 676 | vma->vm_mm = mm; |
---|
478 | 677 | vma->vm_ops = &dummy_vm_ops; |
---|
479 | | - INIT_LIST_HEAD(&vma->anon_vma_chain); |
---|
| 678 | + INIT_VMA(vma); |
---|
480 | 679 | } |
---|
481 | 680 | |
---|
482 | 681 | static inline void vma_set_anonymous(struct vm_area_struct *vma) |
---|
.. | .. |
---|
484 | 683 | vma->vm_ops = NULL; |
---|
485 | 684 | } |
---|
486 | 685 | |
---|
| 686 | +static inline bool vma_is_anonymous(struct vm_area_struct *vma) |
---|
| 687 | +{ |
---|
| 688 | + return !vma->vm_ops; |
---|
| 689 | +} |
---|
| 690 | + |
---|
| 691 | +static inline bool vma_is_temporary_stack(struct vm_area_struct *vma) |
---|
| 692 | +{ |
---|
| 693 | + int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); |
---|
| 694 | + |
---|
| 695 | + if (!maybe_stack) |
---|
| 696 | + return false; |
---|
| 697 | + |
---|
| 698 | + if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == |
---|
| 699 | + VM_STACK_INCOMPLETE_SETUP) |
---|
| 700 | + return true; |
---|
| 701 | + |
---|
| 702 | + return false; |
---|
| 703 | +} |
---|
| 704 | + |
---|
| 705 | +static inline bool vma_is_foreign(struct vm_area_struct *vma) |
---|
| 706 | +{ |
---|
| 707 | + if (!current->mm) |
---|
| 708 | + return true; |
---|
| 709 | + |
---|
| 710 | + if (current->mm != vma->vm_mm) |
---|
| 711 | + return true; |
---|
| 712 | + |
---|
| 713 | + return false; |
---|
| 714 | +} |
---|
| 715 | + |
---|
| 716 | +static inline bool vma_is_accessible(struct vm_area_struct *vma) |
---|
| 717 | +{ |
---|
| 718 | + return vma->vm_flags & VM_ACCESS_FLAGS; |
---|
| 719 | +} |
---|
| 720 | + |
---|
| 721 | +#ifdef CONFIG_SHMEM |
---|
| 722 | +/* |
---|
| 723 | + * The vma_is_shmem is not inline because it is used only by slow |
---|
| 724 | + * paths in userfault. |
---|
| 725 | + */ |
---|
| 726 | +bool vma_is_shmem(struct vm_area_struct *vma); |
---|
| 727 | +#else |
---|
| 728 | +static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } |
---|
| 729 | +#endif |
---|
| 730 | + |
---|
| 731 | +int vma_is_stack_for_current(struct vm_area_struct *vma); |
---|
| 732 | + |
---|
487 | 733 | /* flush_tlb_range() takes a vma, not a mm, and can care about flags */ |
---|
488 | 734 | #define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) } |
---|
489 | 735 | |
---|
490 | 736 | struct mmu_gather; |
---|
491 | 737 | struct inode; |
---|
492 | 738 | |
---|
493 | | -#define page_private(page) ((page)->private) |
---|
494 | | -#define set_page_private(page, v) ((page)->private = (v)) |
---|
495 | | - |
---|
496 | | -#if !defined(__HAVE_ARCH_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE) |
---|
497 | | -static inline int pmd_devmap(pmd_t pmd) |
---|
498 | | -{ |
---|
499 | | - return 0; |
---|
500 | | -} |
---|
501 | | -static inline int pud_devmap(pud_t pud) |
---|
502 | | -{ |
---|
503 | | - return 0; |
---|
504 | | -} |
---|
505 | | -static inline int pgd_devmap(pgd_t pgd) |
---|
506 | | -{ |
---|
507 | | - return 0; |
---|
508 | | -} |
---|
509 | | -#endif |
---|
510 | | - |
---|
511 | | -/* |
---|
512 | | - * FIXME: take this include out, include page-flags.h in |
---|
513 | | - * files which need it (119 of them) |
---|
514 | | - */ |
---|
515 | | -#include <linux/page-flags.h> |
---|
516 | 739 | #include <linux/huge_mm.h> |
---|
517 | 740 | |
---|
518 | 741 | /* |
---|
.. | .. |
---|
533 | 756 | */ |
---|
534 | 757 | static inline int put_page_testzero(struct page *page) |
---|
535 | 758 | { |
---|
| 759 | + int ret; |
---|
| 760 | + |
---|
536 | 761 | VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); |
---|
537 | | - return page_ref_dec_and_test(page); |
---|
| 762 | + ret = page_ref_dec_and_test(page); |
---|
| 763 | + page_pinner_put_page(page); |
---|
| 764 | + |
---|
| 765 | + return ret; |
---|
538 | 766 | } |
---|
539 | 767 | |
---|
540 | 768 | /* |
---|
.. | .. |
---|
569 | 797 | * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there |
---|
570 | 798 | * is no special casing required. |
---|
571 | 799 | */ |
---|
572 | | -static inline bool is_vmalloc_addr(const void *x) |
---|
573 | | -{ |
---|
574 | | -#ifdef CONFIG_MMU |
---|
575 | | - unsigned long addr = (unsigned long)x; |
---|
576 | 800 | |
---|
577 | | - return addr >= VMALLOC_START && addr < VMALLOC_END; |
---|
578 | | -#else |
---|
579 | | - return false; |
---|
| 801 | +#ifndef is_ioremap_addr |
---|
| 802 | +#define is_ioremap_addr(x) is_vmalloc_addr(x) |
---|
580 | 803 | #endif |
---|
581 | | -} |
---|
| 804 | + |
---|
582 | 805 | #ifdef CONFIG_MMU |
---|
| 806 | +extern bool is_vmalloc_addr(const void *x); |
---|
583 | 807 | extern int is_vmalloc_or_module_addr(const void *x); |
---|
584 | 808 | #else |
---|
| 809 | +static inline bool is_vmalloc_addr(const void *x) |
---|
| 810 | +{ |
---|
| 811 | + return false; |
---|
| 812 | +} |
---|
585 | 813 | static inline int is_vmalloc_or_module_addr(const void *x) |
---|
586 | 814 | { |
---|
587 | 815 | return 0; |
---|
.. | .. |
---|
617 | 845 | return kvmalloc_array(n, size, flags | __GFP_ZERO); |
---|
618 | 846 | } |
---|
619 | 847 | |
---|
| 848 | +extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, |
---|
| 849 | + gfp_t flags); |
---|
620 | 850 | extern void kvfree(const void *addr); |
---|
621 | 851 | extern void kvfree_sensitive(const void *addr, size_t len); |
---|
| 852 | + |
---|
| 853 | +static inline int head_compound_mapcount(struct page *head) |
---|
| 854 | +{ |
---|
| 855 | + return atomic_read(compound_mapcount_ptr(head)) + 1; |
---|
| 856 | +} |
---|
622 | 857 | |
---|
623 | 858 | /* |
---|
624 | 859 | * Mapcount of compound page as a whole, does not include mapped sub-pages. |
---|
.. | .. |
---|
629 | 864 | { |
---|
630 | 865 | VM_BUG_ON_PAGE(!PageCompound(page), page); |
---|
631 | 866 | page = compound_head(page); |
---|
632 | | - return atomic_read(compound_mapcount_ptr(page)) + 1; |
---|
| 867 | + return head_compound_mapcount(page); |
---|
633 | 868 | } |
---|
634 | 869 | |
---|
635 | 870 | /* |
---|
.. | .. |
---|
709 | 944 | #endif |
---|
710 | 945 | NR_COMPOUND_DTORS, |
---|
711 | 946 | }; |
---|
712 | | -extern compound_page_dtor * const compound_page_dtors[]; |
---|
| 947 | +extern compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS]; |
---|
713 | 948 | |
---|
714 | 949 | static inline void set_compound_page_dtor(struct page *page, |
---|
715 | 950 | enum compound_dtor_id compound_dtor) |
---|
.. | .. |
---|
718 | 953 | page[1].compound_dtor = compound_dtor; |
---|
719 | 954 | } |
---|
720 | 955 | |
---|
721 | | -static inline compound_page_dtor *get_compound_page_dtor(struct page *page) |
---|
| 956 | +static inline void destroy_compound_page(struct page *page) |
---|
722 | 957 | { |
---|
723 | 958 | VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page); |
---|
724 | | - return compound_page_dtors[page[1].compound_dtor]; |
---|
| 959 | + compound_page_dtors[page[1].compound_dtor](page); |
---|
725 | 960 | } |
---|
726 | 961 | |
---|
727 | 962 | static inline unsigned int compound_order(struct page *page) |
---|
.. | .. |
---|
731 | 966 | return page[1].compound_order; |
---|
732 | 967 | } |
---|
733 | 968 | |
---|
| 969 | +static inline bool hpage_pincount_available(struct page *page) |
---|
| 970 | +{ |
---|
| 971 | + /* |
---|
| 972 | + * Can the page->hpage_pinned_refcount field be used? That field is in |
---|
| 973 | + * the 3rd page of the compound page, so the smallest (2-page) compound |
---|
| 974 | + * pages cannot support it. |
---|
| 975 | + */ |
---|
| 976 | + page = compound_head(page); |
---|
| 977 | + return PageCompound(page) && compound_order(page) > 1; |
---|
| 978 | +} |
---|
| 979 | + |
---|
| 980 | +static inline int head_compound_pincount(struct page *head) |
---|
| 981 | +{ |
---|
| 982 | + return atomic_read(compound_pincount_ptr(head)); |
---|
| 983 | +} |
---|
| 984 | + |
---|
| 985 | +static inline int compound_pincount(struct page *page) |
---|
| 986 | +{ |
---|
| 987 | + VM_BUG_ON_PAGE(!hpage_pincount_available(page), page); |
---|
| 988 | + page = compound_head(page); |
---|
| 989 | + return head_compound_pincount(page); |
---|
| 990 | +} |
---|
| 991 | + |
---|
734 | 992 | static inline void set_compound_order(struct page *page, unsigned int order) |
---|
735 | 993 | { |
---|
736 | 994 | page[1].compound_order = order; |
---|
| 995 | + page[1].compound_nr = 1U << order; |
---|
| 996 | +} |
---|
| 997 | + |
---|
| 998 | +/* Returns the number of pages in this potentially compound page. */ |
---|
| 999 | +static inline unsigned long compound_nr(struct page *page) |
---|
| 1000 | +{ |
---|
| 1001 | + if (!PageHead(page)) |
---|
| 1002 | + return 1; |
---|
| 1003 | + return page[1].compound_nr; |
---|
737 | 1004 | } |
---|
738 | 1005 | |
---|
739 | 1006 | /* Returns the number of bytes in this potentially compound page. */ |
---|
740 | 1007 | static inline unsigned long page_size(struct page *page) |
---|
741 | 1008 | { |
---|
742 | 1009 | return PAGE_SIZE << compound_order(page); |
---|
| 1010 | +} |
---|
| 1011 | + |
---|
| 1012 | +/* Returns the number of bits needed for the number of bytes in a page */ |
---|
| 1013 | +static inline unsigned int page_shift(struct page *page) |
---|
| 1014 | +{ |
---|
| 1015 | + return PAGE_SHIFT + compound_order(page); |
---|
743 | 1016 | } |
---|
744 | 1017 | |
---|
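
`compound_nr()`, `page_size()` and `page_shift()` are three views of the same quantity, now cached in `page[1].compound_nr` by `set_compound_order()`. For a compound head page the identities below hold; the sketch is illustrative only and `example_compound_sizes()` is not kernel API:

```c
/* Illustrative invariants for a compound (PageHead) page. */
static void example_compound_sizes(struct page *head)
{
	unsigned int order = compound_order(head);

	WARN_ON(compound_nr(head) != (1UL << order));
	WARN_ON(page_size(head)   != (PAGE_SIZE << order));
	WARN_ON(page_shift(head)  != (PAGE_SHIFT + order));
}
```
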
745 | 1018 | void free_compound_page(struct page *page); |
---|
.. | .. |
---|
751 | 1024 | * pte_mkwrite. But get_user_pages can cause write faults for mappings |
---|
752 | 1025 | * that do not have writing enabled, when used by access_process_vm. |
---|
753 | 1026 | */ |
---|
754 | | -static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) |
---|
| 1027 | +static inline pte_t maybe_mkwrite(pte_t pte, unsigned long vma_flags) |
---|
755 | 1028 | { |
---|
756 | | - if (likely(vma->vm_flags & VM_WRITE)) |
---|
| 1029 | + if (likely(vma_flags & VM_WRITE)) |
---|
757 | 1030 | pte = pte_mkwrite(pte); |
---|
758 | 1031 | return pte; |
---|
759 | 1032 | } |
---|
760 | 1033 | |
---|
761 | | -vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, |
---|
762 | | - struct page *page); |
---|
| 1034 | +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page); |
---|
| 1035 | +void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr); |
---|
| 1036 | + |
---|
763 | 1037 | vm_fault_t finish_fault(struct vm_fault *vmf); |
---|
764 | 1038 | vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); |
---|
765 | 1039 | #endif |
---|
.. | .. |
---|
860 | 1134 | |
---|
861 | 1135 | #define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0)) |
---|
862 | 1136 | |
---|
863 | | -#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS |
---|
864 | | -#error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS |
---|
865 | | -#endif |
---|
866 | | - |
---|
867 | 1137 | #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) |
---|
868 | 1138 | #define NODES_MASK ((1UL << NODES_WIDTH) - 1) |
---|
869 | 1139 | #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) |
---|
.. | .. |
---|
873 | 1143 | |
---|
874 | 1144 | static inline enum zone_type page_zonenum(const struct page *page) |
---|
875 | 1145 | { |
---|
| 1146 | + ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT); |
---|
876 | 1147 | return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; |
---|
877 | 1148 | } |
---|
878 | 1149 | |
---|
.. | .. |
---|
881 | 1152 | { |
---|
882 | 1153 | return page_zonenum(page) == ZONE_DEVICE; |
---|
883 | 1154 | } |
---|
| 1155 | +extern void memmap_init_zone_device(struct zone *, unsigned long, |
---|
| 1156 | + unsigned long, struct dev_pagemap *); |
---|
884 | 1157 | #else |
---|
885 | 1158 | static inline bool is_zone_device_page(const struct page *page) |
---|
886 | 1159 | { |
---|
.. | .. |
---|
889 | 1162 | #endif |
---|
890 | 1163 | |
---|
891 | 1164 | #ifdef CONFIG_DEV_PAGEMAP_OPS |
---|
892 | | -void dev_pagemap_get_ops(void); |
---|
893 | | -void dev_pagemap_put_ops(void); |
---|
894 | | -void __put_devmap_managed_page(struct page *page); |
---|
| 1165 | +void free_devmap_managed_page(struct page *page); |
---|
895 | 1166 | DECLARE_STATIC_KEY_FALSE(devmap_managed_key); |
---|
896 | | -static inline bool put_devmap_managed_page(struct page *page) |
---|
| 1167 | + |
---|
| 1168 | +static inline bool page_is_devmap_managed(struct page *page) |
---|
897 | 1169 | { |
---|
898 | 1170 | if (!static_branch_unlikely(&devmap_managed_key)) |
---|
899 | 1171 | return false; |
---|
.. | .. |
---|
901 | 1173 | return false; |
---|
902 | 1174 | switch (page->pgmap->type) { |
---|
903 | 1175 | case MEMORY_DEVICE_PRIVATE: |
---|
904 | | - case MEMORY_DEVICE_PUBLIC: |
---|
905 | 1176 | case MEMORY_DEVICE_FS_DAX: |
---|
906 | | - __put_devmap_managed_page(page); |
---|
907 | 1177 | return true; |
---|
908 | 1178 | default: |
---|
909 | 1179 | break; |
---|
.. | .. |
---|
911 | 1181 | return false; |
---|
912 | 1182 | } |
---|
913 | 1183 | |
---|
| 1184 | +void put_devmap_managed_page(struct page *page); |
---|
| 1185 | + |
---|
| 1186 | +#else /* CONFIG_DEV_PAGEMAP_OPS */ |
---|
| 1187 | +static inline bool page_is_devmap_managed(struct page *page) |
---|
| 1188 | +{ |
---|
| 1189 | + return false; |
---|
| 1190 | +} |
---|
| 1191 | + |
---|
| 1192 | +static inline void put_devmap_managed_page(struct page *page) |
---|
| 1193 | +{ |
---|
| 1194 | +} |
---|
| 1195 | +#endif /* CONFIG_DEV_PAGEMAP_OPS */ |
---|
| 1196 | + |
---|
914 | 1197 | static inline bool is_device_private_page(const struct page *page) |
---|
915 | 1198 | { |
---|
916 | | - return is_zone_device_page(page) && |
---|
| 1199 | + return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && |
---|
| 1200 | + IS_ENABLED(CONFIG_DEVICE_PRIVATE) && |
---|
| 1201 | + is_zone_device_page(page) && |
---|
917 | 1202 | page->pgmap->type == MEMORY_DEVICE_PRIVATE; |
---|
918 | 1203 | } |
---|
919 | 1204 | |
---|
920 | | -static inline bool is_device_public_page(const struct page *page) |
---|
| 1205 | +static inline bool is_pci_p2pdma_page(const struct page *page) |
---|
921 | 1206 | { |
---|
922 | | - return is_zone_device_page(page) && |
---|
923 | | - page->pgmap->type == MEMORY_DEVICE_PUBLIC; |
---|
| 1207 | + return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && |
---|
| 1208 | + IS_ENABLED(CONFIG_PCI_P2PDMA) && |
---|
| 1209 | + is_zone_device_page(page) && |
---|
| 1210 | + page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; |
---|
924 | 1211 | } |
---|
925 | | - |
---|
926 | | -#else /* CONFIG_DEV_PAGEMAP_OPS */ |
---|
927 | | -static inline void dev_pagemap_get_ops(void) |
---|
928 | | -{ |
---|
929 | | -} |
---|
930 | | - |
---|
931 | | -static inline void dev_pagemap_put_ops(void) |
---|
932 | | -{ |
---|
933 | | -} |
---|
934 | | - |
---|
935 | | -static inline bool put_devmap_managed_page(struct page *page) |
---|
936 | | -{ |
---|
937 | | - return false; |
---|
938 | | -} |
---|
939 | | - |
---|
940 | | -static inline bool is_device_private_page(const struct page *page) |
---|
941 | | -{ |
---|
942 | | - return false; |
---|
943 | | -} |
---|
944 | | - |
---|
945 | | -static inline bool is_device_public_page(const struct page *page) |
---|
946 | | -{ |
---|
947 | | - return false; |
---|
948 | | -} |
---|
949 | | -#endif /* CONFIG_DEV_PAGEMAP_OPS */ |
---|
950 | 1212 | |
---|
951 | 1213 | /* 127: arbitrary random number, small enough to assemble well */ |
---|
952 | 1214 | #define page_ref_zero_or_close_to_overflow(page) \ |
---|
.. | .. |
---|
962 | 1224 | VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page); |
---|
963 | 1225 | page_ref_inc(page); |
---|
964 | 1226 | } |
---|
| 1227 | + |
---|
| 1228 | +bool __must_check try_grab_page(struct page *page, unsigned int flags); |
---|
965 | 1229 | |
---|
966 | 1230 | static inline __must_check bool try_get_page(struct page *page) |
---|
967 | 1231 | { |
---|
.. | .. |
---|
982 | 1246 | * need to inform the device driver through callback. See |
---|
983 | 1247 | * include/linux/memremap.h and HMM for details. |
---|
984 | 1248 | */ |
---|
985 | | - if (put_devmap_managed_page(page)) |
---|
| 1249 | + if (page_is_devmap_managed(page)) { |
---|
| 1250 | + put_devmap_managed_page(page); |
---|
986 | 1251 | return; |
---|
| 1252 | + } |
---|
987 | 1253 | |
---|
988 | 1254 | if (put_page_testzero(page)) |
---|
989 | 1255 | __put_page(page); |
---|
| 1256 | +} |
---|
| 1257 | + |
---|
| 1258 | +/* |
---|
| 1259 | + * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload |
---|
| 1260 | + * the page's refcount so that two separate items are tracked: the original page |
---|
| 1261 | + * reference count, and also a new count of how many pin_user_pages() calls were |
---|
| 1262 | + * made against the page. ("gup-pinned" is another term for the latter). |
---|
| 1263 | + * |
---|
| 1264 | + * With this scheme, pin_user_pages() becomes special: such pages are marked as |
---|
| 1265 | + * distinct from normal pages. As such, the unpin_user_page() call (and its |
---|
| 1266 | + * variants) must be used in order to release gup-pinned pages. |
---|
| 1267 | + * |
---|
| 1268 | + * Choice of value: |
---|
| 1269 | + * |
---|
| 1270 | + * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference |
---|
| 1271 | + * counts with respect to pin_user_pages() and unpin_user_page() becomes |
---|
| 1272 | + * simpler, due to the fact that adding an even power of two to the page |
---|
| 1273 | + * refcount has the effect of using only the upper N bits, for the code that |
---|
| 1274 | + * counts up using the bias value. This means that the lower bits are left for |
---|
| 1275 | + * the exclusive use of the original code that increments and decrements by one |
---|
| 1276 | + * (or at least, by much smaller values than the bias value). |
---|
| 1277 | + * |
---|
| 1278 | + * Of course, once the lower bits overflow into the upper bits (and this is |
---|
| 1279 | + * OK, because subtraction recovers the original values), then visual inspection |
---|
| 1280 | + * no longer suffices to directly view the separate counts. However, for normal |
---|
| 1281 | + * applications that don't have huge page reference counts, this won't be an |
---|
| 1282 | + * issue. |
---|
| 1283 | + * |
---|
| 1284 | + * Locking: the lockless algorithm described in page_cache_get_speculative() |
---|
| 1285 | + * and page_cache_gup_pin_speculative() provides safe operation for |
---|
| 1286 | + * get_user_pages and page_mkclean and other calls that race to set up page |
---|
| 1287 | + * table entries. |
---|
| 1288 | + */ |
---|
| 1289 | +#define GUP_PIN_COUNTING_BIAS (1U << 10) |
---|
| 1290 | + |
---|
| 1291 | +void put_user_page(struct page *page); |
---|
| 1292 | +void unpin_user_page(struct page *page); |
---|
| 1293 | +void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, |
---|
| 1294 | + bool make_dirty); |
---|
| 1295 | +void unpin_user_pages(struct page **pages, unsigned long npages); |
---|
| 1296 | + |
---|
| 1297 | +/** |
---|
| 1298 | + * page_maybe_dma_pinned() - report if a page is pinned for DMA. |
---|
| 1299 | + * |
---|
| 1300 | + * This function checks if a page has been pinned via a call to |
---|
| 1301 | + * pin_user_pages*(). |
---|
| 1302 | + * |
---|
| 1303 | + * For non-huge pages, the return value is partially fuzzy: false is not fuzzy, |
---|
| 1304 | + * because it means "definitely not pinned for DMA", but true means "probably |
---|
| 1305 | + * pinned for DMA, but possibly a false positive due to having at least |
---|
| 1306 | + * GUP_PIN_COUNTING_BIAS worth of normal page references". |
---|
| 1307 | + * |
---|
| 1308 | + * False positives are OK, because: a) it's unlikely for a page to get that many |
---|
| 1309 | + * refcounts, and b) all the callers of this routine are expected to be able to |
---|
| 1310 | + * deal gracefully with a false positive. |
---|
| 1311 | + * |
---|
| 1312 | + * For huge pages, the result will be exactly correct. That's because we have |
---|
| 1313 | + * more tracking data available: the 3rd struct page in the compound page is |
---|
| 1314 | + * used to track the pincount (instead using of the GUP_PIN_COUNTING_BIAS |
---|
| 1315 | + * scheme). |
---|
| 1316 | + * |
---|
| 1317 | + * For more information, please see Documentation/core-api/pin_user_pages.rst. |
---|
| 1318 | + * |
---|
| 1319 | + * @page: pointer to page to be queried. |
---|
| 1320 | + * @Return: True, if it is likely that the page has been "dma-pinned". |
---|
| 1321 | + * False, if the page is definitely not dma-pinned. |
---|
| 1322 | + */ |
---|
| 1323 | +static inline bool page_maybe_dma_pinned(struct page *page) |
---|
| 1324 | +{ |
---|
| 1325 | + if (hpage_pincount_available(page)) |
---|
| 1326 | + return compound_pincount(page) > 0; |
---|
| 1327 | + |
---|
| 1328 | + /* |
---|
| 1329 | + * page_ref_count() is signed. If that refcount overflows, then |
---|
| 1330 | + * page_ref_count() returns a negative value, and callers will avoid |
---|
| 1331 | + * further incrementing the refcount. |
---|
| 1332 | + * |
---|
| 1333 | + * Here, for that overflow case, use the signed bit to count a little |
---|
| 1334 | + * bit higher via unsigned math, and thus still get an accurate result. |
---|
| 1335 | + */ |
---|
| 1336 | + return ((unsigned int)page_ref_count(compound_head(page))) >= |
---|
| 1337 | + GUP_PIN_COUNTING_BIAS; |
---|
990 | 1338 | } |
---|
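
The pin/unpin API above is the DMA-oriented counterpart to `get_user_pages()`: each pin adds `GUP_PIN_COUNTING_BIAS` to the refcount (or bumps the dedicated hpage pincount for larger compound pages) and must be released with the `unpin_user_page*()` helpers rather than `put_page()`. A hedged driver-style sketch — `example_pin_for_dma()` is invented, and error/partial-pin handling is trimmed:

```c
/* Illustrative pin -> DMA -> unpin sequence. */
static int example_pin_for_dma(unsigned long uaddr, unsigned long nr_pages,
			       struct page **pages)
{
	long pinned = pin_user_pages(uaddr, nr_pages, FOLL_WRITE, pages, NULL);

	if (pinned < 0)
		return (int)pinned;

	/* ... program the device to DMA into the pinned pages ... */

	/* Release the pins; 'true' marks the pages dirty for writeback. */
	unpin_user_pages_dirty_lock(pages, pinned, true);
	return 0;
}
```
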
991 | 1339 | |
---|
992 | 1340 | #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) |
---|
.. | .. |
---|
1114 | 1462 | |
---|
1115 | 1463 | static inline bool cpupid_pid_unset(int cpupid) |
---|
1116 | 1464 | { |
---|
1117 | | - return 1; |
---|
| 1465 | + return true; |
---|
1118 | 1466 | } |
---|
1119 | 1467 | |
---|
1120 | 1468 | static inline void page_cpupid_reset_last(struct page *page) |
---|
.. | .. |
---|
1127 | 1475 | } |
---|
1128 | 1476 | #endif /* CONFIG_NUMA_BALANCING */ |
---|
1129 | 1477 | |
---|
1130 | | -#ifdef CONFIG_KASAN_SW_TAGS |
---|
| 1478 | +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) |
---|
| 1479 | + |
---|
| 1480 | +/* |
---|
| 1481 | + * KASAN per-page tags are stored xor'ed with 0xff. This allows to avoid |
---|
| 1482 | + * setting tags for all pages to native kernel tag value 0xff, as the default |
---|
| 1483 | + * value 0x00 maps to 0xff. |
---|
| 1484 | + */ |
---|
| 1485 | + |
---|
1131 | 1486 | static inline u8 page_kasan_tag(const struct page *page) |
---|
1132 | 1487 | { |
---|
1133 | | - return (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; |
---|
| 1488 | + u8 tag = 0xff; |
---|
| 1489 | + |
---|
| 1490 | + if (kasan_enabled()) { |
---|
| 1491 | + tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; |
---|
| 1492 | + tag ^= 0xff; |
---|
| 1493 | + } |
---|
| 1494 | + |
---|
| 1495 | + return tag; |
---|
1134 | 1496 | } |
---|
1135 | 1497 | |
---|
1136 | 1498 | static inline void page_kasan_tag_set(struct page *page, u8 tag) |
---|
1137 | 1499 | { |
---|
1138 | | - page->flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); |
---|
1139 | | - page->flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; |
---|
| 1500 | + if (kasan_enabled()) { |
---|
| 1501 | + tag ^= 0xff; |
---|
| 1502 | + page->flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); |
---|
| 1503 | + page->flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; |
---|
| 1504 | + } |
---|
1140 | 1505 | } |
---|
1141 | 1506 | |
---|
1142 | 1507 | static inline void page_kasan_tag_reset(struct page *page) |
---|
1143 | 1508 | { |
---|
1144 | | - page_kasan_tag_set(page, 0xff); |
---|
| 1509 | + if (kasan_enabled()) |
---|
| 1510 | + page_kasan_tag_set(page, 0xff); |
---|
1145 | 1511 | } |
---|
1146 | | -#else |
---|
| 1512 | + |
---|
| 1513 | +#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ |
---|
| 1514 | + |
---|
1147 | 1515 | static inline u8 page_kasan_tag(const struct page *page) |
---|
1148 | 1516 | { |
---|
1149 | 1517 | return 0xff; |
---|
.. | .. |
---|
1151 | 1519 | |
---|
1152 | 1520 | static inline void page_kasan_tag_set(struct page *page, u8 tag) { } |
---|
1153 | 1521 | static inline void page_kasan_tag_reset(struct page *page) { } |
---|
1154 | | -#endif |
---|
| 1522 | + |
---|
| 1523 | +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ |
---|
1155 | 1524 | |
---|
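
Storing the tag xor'ed with 0xff means the all-zero `page->flags` a fresh struct page starts with already decodes to the native kernel tag 0xff, so nothing has to walk every page just to seed the tag bits. A small worked round trip, assuming a tag-based KASAN mode is enabled (`kasan_enabled()` true); `example_tag_roundtrip()` is not kernel API:

```c
/* Illustrative encode/decode round trip of the per-page KASAN tag. */
static void example_tag_roundtrip(struct page *page, u8 tag)
{
	page_kasan_tag_set(page, tag);		/* stores tag ^ 0xff in page->flags */
	WARN_ON(page_kasan_tag(page) != tag);	/* decode xors with 0xff again */

	page_kasan_tag_reset(page);		/* native tag 0xff -> stored 0x00 */
	WARN_ON(page_kasan_tag(page) != 0xff);
}
```
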
1156 | 1525 | static inline struct zone *page_zone(const struct page *page) |
---|
1157 | 1526 | { |
---|
.. | .. |
---|
1319 | 1688 | } |
---|
1320 | 1689 | |
---|
1321 | 1690 | /* |
---|
1322 | | - * Different kinds of faults, as returned by handle_mm_fault(). |
---|
1323 | | - * Used to decide whether a process gets delivered SIGBUS or |
---|
1324 | | - * just gets major/minor fault counters bumped up. |
---|
1325 | | - */ |
---|
1326 | | - |
---|
1327 | | -#define VM_FAULT_OOM 0x0001 |
---|
1328 | | -#define VM_FAULT_SIGBUS 0x0002 |
---|
1329 | | -#define VM_FAULT_MAJOR 0x0004 |
---|
1330 | | -#define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */ |
---|
1331 | | -#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned small page */ |
---|
1332 | | -#define VM_FAULT_HWPOISON_LARGE 0x0020 /* Hit poisoned large page. Index encoded in upper bits */ |
---|
1333 | | -#define VM_FAULT_SIGSEGV 0x0040 |
---|
1334 | | - |
---|
1335 | | -#define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ |
---|
1336 | | -#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ |
---|
1337 | | -#define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ |
---|
1338 | | -#define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ |
---|
1339 | | -#define VM_FAULT_DONE_COW 0x1000 /* ->fault has fully handled COW */ |
---|
1340 | | -#define VM_FAULT_NEEDDSYNC 0x2000 /* ->fault did not modify page tables |
---|
1341 | | - * and needs fsync() to complete (for |
---|
1342 | | - * synchronous page faults in DAX) */ |
---|
1343 | | - |
---|
1344 | | -#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \ |
---|
1345 | | - VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \ |
---|
1346 | | - VM_FAULT_FALLBACK) |
---|
1347 | | - |
---|
1348 | | -#define VM_FAULT_RESULT_TRACE \ |
---|
1349 | | - { VM_FAULT_OOM, "OOM" }, \ |
---|
1350 | | - { VM_FAULT_SIGBUS, "SIGBUS" }, \ |
---|
1351 | | - { VM_FAULT_MAJOR, "MAJOR" }, \ |
---|
1352 | | - { VM_FAULT_WRITE, "WRITE" }, \ |
---|
1353 | | - { VM_FAULT_HWPOISON, "HWPOISON" }, \ |
---|
1354 | | - { VM_FAULT_HWPOISON_LARGE, "HWPOISON_LARGE" }, \ |
---|
1355 | | - { VM_FAULT_SIGSEGV, "SIGSEGV" }, \ |
---|
1356 | | - { VM_FAULT_NOPAGE, "NOPAGE" }, \ |
---|
1357 | | - { VM_FAULT_LOCKED, "LOCKED" }, \ |
---|
1358 | | - { VM_FAULT_RETRY, "RETRY" }, \ |
---|
1359 | | - { VM_FAULT_FALLBACK, "FALLBACK" }, \ |
---|
1360 | | - { VM_FAULT_DONE_COW, "DONE_COW" }, \ |
---|
1361 | | - { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" } |
---|
1362 | | - |
---|
1363 | | -/* Encode hstate index for a hwpoisoned large page */ |
---|
1364 | | -#define VM_FAULT_SET_HINDEX(x) ((x) << 12) |
---|
1365 | | -#define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf) |
---|
1366 | | - |
---|
1367 | | -/* |
---|
1368 | 1691 | * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. |
---|
1369 | 1692 | */ |
---|
1370 | 1693 | extern void pagefault_out_of_memory(void); |
---|
1371 | 1694 | |
---|
1372 | 1695 | #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) |
---|
| 1696 | +#define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1)) |
---|
1373 | 1697 | |
---|
1374 | 1698 | /* |
---|
1375 | 1699 | * Flags passed to show_mem() and show_free_areas() to suppress output in |
---|
.. | .. |
---|
1379 | 1703 | |
---|
1380 | 1704 | extern void show_free_areas(unsigned int flags, nodemask_t *nodemask); |
---|
1381 | 1705 | |
---|
| 1706 | +#ifdef CONFIG_MMU |
---|
1382 | 1707 | extern bool can_do_mlock(void); |
---|
| 1708 | +#else |
---|
| 1709 | +static inline bool can_do_mlock(void) { return false; } |
---|
| 1710 | +#endif |
---|
1383 | 1711 | extern int user_shm_lock(size_t, struct user_struct *); |
---|
1384 | 1712 | extern void user_shm_unlock(size_t, struct user_struct *); |
---|
1385 | 1713 | |
---|
.. | .. |
---|
1394 | 1722 | }; |
---|
1395 | 1723 | |
---|
1396 | 1724 | struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, |
---|
1397 | | - pte_t pte, bool with_public_device); |
---|
1398 | | -#define vm_normal_page(vma, addr, pte) _vm_normal_page(vma, addr, pte, false) |
---|
| 1725 | + pte_t pte, unsigned long vma_flags); |
---|
| 1726 | +static inline struct page *vm_normal_page(struct vm_area_struct *vma, |
---|
| 1727 | + unsigned long addr, pte_t pte) |
---|
| 1728 | +{ |
---|
| 1729 | + return _vm_normal_page(vma, addr, pte, vma->vm_flags); |
---|
| 1730 | +} |
---|
1399 | 1731 | |
---|
1400 | 1732 | struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, |
---|
1401 | 1733 | pmd_t pmd); |
---|
.. | .. |
---|
1407 | 1739 | void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, |
---|
1408 | 1740 | unsigned long start, unsigned long end); |
---|
1409 | 1741 | |
---|
1410 | | -/** |
---|
1411 | | - * mm_walk - callbacks for walk_page_range |
---|
1412 | | - * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry |
---|
1413 | | - * this handler should only handle pud_trans_huge() puds. |
---|
1414 | | - * the pmd_entry or pte_entry callbacks will be used for |
---|
1415 | | - * regular PUDs. |
---|
1416 | | - * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry |
---|
1417 | | - * this handler is required to be able to handle |
---|
1418 | | - * pmd_trans_huge() pmds. They may simply choose to |
---|
1419 | | - * split_huge_page() instead of handling it explicitly. |
---|
1420 | | - * @pte_entry: if set, called for each non-empty PTE (4th-level) entry |
---|
1421 | | - * @pte_hole: if set, called for each hole at all levels |
---|
1422 | | - * @hugetlb_entry: if set, called for each hugetlb entry |
---|
1423 | | - * @test_walk: caller specific callback function to determine whether |
---|
1424 | | - * we walk over the current vma or not. Returning 0 |
---|
1425 | | - * value means "do page table walk over the current vma," |
---|
1426 | | - * and a negative one means "abort current page table walk |
---|
1427 | | - * right now." 1 means "skip the current vma." |
---|
1428 | | - * @mm: mm_struct representing the target process of page table walk |
---|
1429 | | - * @vma: vma currently walked (NULL if walking outside vmas) |
---|
1430 | | - * @private: private data for callbacks' usage |
---|
1431 | | - * |
---|
1432 | | - * (see the comment on walk_page_range() for more details) |
---|
1433 | | - */ |
---|
1434 | | -struct mm_walk { |
---|
1435 | | - int (*pud_entry)(pud_t *pud, unsigned long addr, |
---|
1436 | | - unsigned long next, struct mm_walk *walk); |
---|
1437 | | - int (*pmd_entry)(pmd_t *pmd, unsigned long addr, |
---|
1438 | | - unsigned long next, struct mm_walk *walk); |
---|
1439 | | - int (*pte_entry)(pte_t *pte, unsigned long addr, |
---|
1440 | | - unsigned long next, struct mm_walk *walk); |
---|
1441 | | - int (*pte_hole)(unsigned long addr, unsigned long next, |
---|
1442 | | - struct mm_walk *walk); |
---|
1443 | | - int (*hugetlb_entry)(pte_t *pte, unsigned long hmask, |
---|
1444 | | - unsigned long addr, unsigned long next, |
---|
1445 | | - struct mm_walk *walk); |
---|
1446 | | - int (*test_walk)(unsigned long addr, unsigned long next, |
---|
1447 | | - struct mm_walk *walk); |
---|
1448 | | - struct mm_struct *mm; |
---|
1449 | | - struct vm_area_struct *vma; |
---|
1450 | | - void *private; |
---|
1451 | | -}; |
---|
| 1742 | +struct mmu_notifier_range; |
---|
1452 | 1743 | |
---|
1453 | | -int walk_page_range(unsigned long addr, unsigned long end, |
---|
1454 | | - struct mm_walk *walk); |
---|
1455 | | -int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk); |
---|
1456 | 1744 | void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, |
---|
1457 | 1745 | unsigned long end, unsigned long floor, unsigned long ceiling); |
---|
1458 | | -int copy_page_range(struct mm_struct *dst, struct mm_struct *src, |
---|
1459 | | - struct vm_area_struct *vma); |
---|
1460 | | -int follow_pte_pmd(struct mm_struct *mm, unsigned long address, |
---|
1461 | | - unsigned long *start, unsigned long *end, |
---|
1462 | | - pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); |
---|
| 1746 | +int |
---|
| 1747 | +copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); |
---|
| 1748 | +int follow_invalidate_pte(struct mm_struct *mm, unsigned long address, |
---|
| 1749 | + struct mmu_notifier_range *range, pte_t **ptepp, |
---|
| 1750 | + pmd_t **pmdpp, spinlock_t **ptlp); |
---|
| 1751 | +int follow_pte(struct mm_struct *mm, unsigned long address, |
---|
| 1752 | + pte_t **ptepp, spinlock_t **ptlp); |
---|
1463 | 1753 | int follow_pfn(struct vm_area_struct *vma, unsigned long address, |
---|
1464 | 1754 | unsigned long *pfn); |
---|
1465 | 1755 | int follow_phys(struct vm_area_struct *vma, unsigned long address, |
---|
1466 | 1756 | unsigned int flags, unsigned long *prot, resource_size_t *phys); |
---|
1467 | 1757 | int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, |
---|
1468 | 1758 | void *buf, int len, int write); |
---|
| 1759 | + |
---|
| 1760 | +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT |
---|
| 1761 | +static inline void vm_write_begin(struct vm_area_struct *vma) |
---|
| 1762 | +{ |
---|
| 1763 | + /* |
---|
| 1764 | + * Isolated vma might be freed without exclusive mmap_lock but |
---|
| 1765 | + * speculative page fault handler still needs to know it was changed. |
---|
| 1766 | + */ |
---|
| 1767 | + if (!RB_EMPTY_NODE(&vma->vm_rb)) |
---|
| 1768 | + mmap_assert_write_locked(vma->vm_mm); |
---|
| 1769 | + /* |
---|
| 1770 | + * The readers never spin, so preemption |
---|
| 1771 | + * disablement is not required. |
---|
| 1772 | + */ |
---|
| 1773 | + raw_write_seqcount_begin(&vma->vm_sequence); |
---|
| 1774 | +} |
---|
| 1775 | +static inline void vm_write_end(struct vm_area_struct *vma) |
---|
| 1776 | +{ |
---|
| 1777 | + raw_write_seqcount_end(&vma->vm_sequence); |
---|
| 1778 | +} |
---|
| 1779 | +#else |
---|
| 1780 | +static inline void vm_write_begin(struct vm_area_struct *vma) |
---|
| 1781 | +{ |
---|
| 1782 | +} |
---|
| 1783 | +static inline void vm_write_end(struct vm_area_struct *vma) |
---|
| 1784 | +{ |
---|
| 1785 | +} |
---|
| 1786 | +#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */ |
---|
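
Writers that change VMA fields the speculative fault handler reads must bracket the update with `vm_write_begin()`/`vm_write_end()`, so a concurrent speculative fault sees the sequence count move and falls back to the locked path. A hedged sketch of the expected writer-side pattern — the function and the particular field update are illustrative, not from this patch:

```c
/* Illustrative writer-side pattern under CONFIG_SPECULATIVE_PAGE_FAULT. */
static void example_update_vma_flags(struct vm_area_struct *vma,
				     unsigned long new_flags)
{
	mmap_assert_write_locked(vma->vm_mm);	/* caller holds mmap_lock for write */

	vm_write_begin(vma);
	vma->vm_flags = new_flags;	/* field consulted by the speculative path */
	vm_write_end(vma);
}
```
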
1469 | 1787 | |
---|
1470 | 1788 | extern void truncate_pagecache(struct inode *inode, loff_t new); |
---|
1471 | 1789 | extern void truncate_setsize(struct inode *inode, loff_t newsize); |
---|
.. | .. |
---|
1477 | 1795 | |
---|
1478 | 1796 | #ifdef CONFIG_MMU |
---|
1479 | 1797 | extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma, |
---|
1480 | | - unsigned long address, unsigned int flags); |
---|
1481 | | -extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, |
---|
| 1798 | + unsigned long address, unsigned int flags, |
---|
| 1799 | + struct pt_regs *regs); |
---|
| 1800 | +extern int fixup_user_fault(struct mm_struct *mm, |
---|
1482 | 1801 | unsigned long address, unsigned int fault_flags, |
---|
1483 | 1802 | bool *unlocked); |
---|
| 1803 | + |
---|
| 1804 | +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT |
---|
| 1805 | +extern vm_fault_t __handle_speculative_fault(struct mm_struct *mm, |
---|
| 1806 | + unsigned long address, |
---|
| 1807 | + unsigned int flags, |
---|
| 1808 | + struct vm_area_struct **vma, |
---|
| 1809 | + struct pt_regs *regs); |
---|
| 1810 | +static inline vm_fault_t handle_speculative_fault(struct mm_struct *mm, |
---|
| 1811 | + unsigned long address, |
---|
| 1812 | + unsigned int flags, |
---|
| 1813 | + struct vm_area_struct **vma, |
---|
| 1814 | + struct pt_regs *regs) |
---|
| 1815 | +{ |
---|
| 1816 | + /* |
---|
| 1817 | + * Try speculative page fault for multithreaded user space task only. |
---|
| 1818 | + */ |
---|
| 1819 | + if (!(flags & FAULT_FLAG_USER) || atomic_read(&mm->mm_users) == 1) { |
---|
| 1820 | + *vma = NULL; |
---|
| 1821 | + return VM_FAULT_RETRY; |
---|
| 1822 | + } |
---|
| 1823 | + return __handle_speculative_fault(mm, address, flags, vma, regs); |
---|
| 1824 | +} |
---|
| 1825 | +extern bool can_reuse_spf_vma(struct vm_area_struct *vma, |
---|
| 1826 | + unsigned long address); |
---|
| 1827 | +#else |
---|
| 1828 | +static inline vm_fault_t handle_speculative_fault(struct mm_struct *mm, |
---|
| 1829 | + unsigned long address, |
---|
| 1830 | + unsigned int flags, |
---|
| 1831 | + struct vm_area_struct **vma, |
---|
| 1832 | + struct pt_regs *regs) |
---|
| 1833 | +{ |
---|
| 1834 | + return VM_FAULT_RETRY; |
---|
| 1835 | +} |
---|
| 1836 | +static inline bool can_reuse_spf_vma(struct vm_area_struct *vma, |
---|
| 1837 | + unsigned long address) |
---|
| 1838 | +{ |
---|
| 1839 | + return false; |
---|
| 1840 | +} |
---|
| 1841 | +#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */ |
---|
| 1842 | + |
---|
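The expected caller pattern is sketched below under stated assumptions (the function name, the fallback details and the retry handling are illustrative, not taken from this diff): try the speculative path first and fall back to the mmap_lock-protected path on VM_FAULT_RETRY.

static vm_fault_t example_do_page_fault(struct mm_struct *mm,
					unsigned long address,
					unsigned int flags,
					struct pt_regs *regs)
{
	struct vm_area_struct *vma = NULL;
	vm_fault_t fault;

	/* Try the lockless speculative path first. */
	fault = handle_speculative_fault(mm, address, flags, &vma, regs);
	if (fault != VM_FAULT_RETRY)
		return fault;

	/* Fall back to the classic mmap_lock-protected path. */
	mmap_read_lock(mm);
	vma = find_vma(mm, address);
	if (!vma || vma->vm_start > address) {
		mmap_read_unlock(mm);
		return VM_FAULT_SIGSEGV;
	}
	fault = handle_mm_fault(vma, address, flags, regs);
	mmap_read_unlock(mm);
	return fault;
}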
1484 | 1843 | void unmap_mapping_page(struct page *page); |
---|
1485 | 1844 | void unmap_mapping_pages(struct address_space *mapping, |
---|
1486 | 1845 | pgoff_t start, pgoff_t nr, bool even_cows); |
---|
.. | .. |
---|
1488 | 1847 | loff_t const holebegin, loff_t const holelen, int even_cows); |
---|
1489 | 1848 | #else |
---|
1490 | 1849 | static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma, |
---|
1491 | | - unsigned long address, unsigned int flags) |
---|
| 1850 | + unsigned long address, unsigned int flags, |
---|
| 1851 | + struct pt_regs *regs) |
---|
1492 | 1852 | { |
---|
1493 | 1853 | /* should never happen if there's no MMU */ |
---|
1494 | 1854 | BUG(); |
---|
1495 | 1855 | return VM_FAULT_SIGBUS; |
---|
1496 | 1856 | } |
---|
1497 | | -static inline int fixup_user_fault(struct task_struct *tsk, |
---|
1498 | | - struct mm_struct *mm, unsigned long address, |
---|
| 1857 | +static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address, |
---|
1499 | 1858 | unsigned int fault_flags, bool *unlocked) |
---|
1500 | 1859 | { |
---|
1501 | 1860 | /* should never happen if there's no MMU */ |
---|
.. | .. |
---|
1522 | 1881 | extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, |
---|
1523 | 1882 | unsigned long addr, void *buf, int len, unsigned int gup_flags); |
---|
1524 | 1883 | |
---|
1525 | | -long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, |
---|
| 1884 | +long get_user_pages_remote(struct mm_struct *mm, |
---|
1526 | 1885 | unsigned long start, unsigned long nr_pages, |
---|
1527 | 1886 | unsigned int gup_flags, struct page **pages, |
---|
1528 | 1887 | struct vm_area_struct **vmas, int *locked); |
---|
| 1888 | +long pin_user_pages_remote(struct mm_struct *mm, |
---|
| 1889 | + unsigned long start, unsigned long nr_pages, |
---|
| 1890 | + unsigned int gup_flags, struct page **pages, |
---|
| 1891 | + struct vm_area_struct **vmas, int *locked); |
---|
1529 | 1892 | long get_user_pages(unsigned long start, unsigned long nr_pages, |
---|
1530 | 1893 | unsigned int gup_flags, struct page **pages, |
---|
1531 | 1894 | struct vm_area_struct **vmas); |
---|
| 1895 | +long pin_user_pages(unsigned long start, unsigned long nr_pages, |
---|
| 1896 | + unsigned int gup_flags, struct page **pages, |
---|
| 1897 | + struct vm_area_struct **vmas); |
---|
1532 | 1898 | long get_user_pages_locked(unsigned long start, unsigned long nr_pages, |
---|
| 1899 | + unsigned int gup_flags, struct page **pages, int *locked); |
---|
| 1900 | +long pin_user_pages_locked(unsigned long start, unsigned long nr_pages, |
---|
1533 | 1901 | unsigned int gup_flags, struct page **pages, int *locked); |
---|
1534 | 1902 | long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, |
---|
1535 | 1903 | struct page **pages, unsigned int gup_flags); |
---|
1536 | | -#ifdef CONFIG_FS_DAX |
---|
1537 | | -long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, |
---|
1538 | | - unsigned int gup_flags, struct page **pages, |
---|
1539 | | - struct vm_area_struct **vmas); |
---|
1540 | | -#else |
---|
1541 | | -static inline long get_user_pages_longterm(unsigned long start, |
---|
1542 | | - unsigned long nr_pages, unsigned int gup_flags, |
---|
1543 | | - struct page **pages, struct vm_area_struct **vmas) |
---|
1544 | | -{ |
---|
1545 | | - return get_user_pages(start, nr_pages, gup_flags, pages, vmas); |
---|
1546 | | -} |
---|
1547 | | -#endif /* CONFIG_FS_DAX */ |
---|
| 1904 | +long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, |
---|
| 1905 | + struct page **pages, unsigned int gup_flags); |
---|
1548 | 1906 | |
---|
1549 | | -int get_user_pages_fast(unsigned long start, int nr_pages, int write, |
---|
1550 | | - struct page **pages); |
---|
| 1907 | +int get_user_pages_fast(unsigned long start, int nr_pages, |
---|
| 1908 | + unsigned int gup_flags, struct page **pages); |
---|
| 1909 | +int pin_user_pages_fast(unsigned long start, int nr_pages, |
---|
| 1910 | + unsigned int gup_flags, struct page **pages); |
---|
| 1911 | + |
---|
| 1912 | +int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc); |
---|
| 1913 | +int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, |
---|
| 1914 | + struct task_struct *task, bool bypass_rlim); |
---|
1551 | 1915 | |
---|
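A short sketch of how the new locked-VM accounting helper is meant to be paired (the caller names are assumptions):

/* Hedged sketch: charge a long-term pin against RLIMIT_MEMLOCK, undo later. */
static int example_charge_pinned(struct mm_struct *mm, unsigned long nr_pages)
{
	return account_locked_vm(mm, nr_pages, true);	/* inc: charge */
}

static void example_uncharge_pinned(struct mm_struct *mm, unsigned long nr_pages)
{
	account_locked_vm(mm, nr_pages, false);		/* dec: uncharge */
}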
1552 | 1916 | /* Container for pinned pfns / pages */ |
---|
1553 | 1917 | struct frame_vector { |
---|
.. | .. |
---|
1555 | 1919 | unsigned int nr_frames; /* Number of frames stored in ptrs array */ |
---|
1556 | 1920 | bool got_ref; /* Did we pin pages by getting page ref? */ |
---|
1557 | 1921 | bool is_pfns; /* Does array contain pages or pfns? */ |
---|
1558 | | - void *ptrs[0]; /* Array of pinned pfns / pages. Use |
---|
| 1922 | + void *ptrs[]; /* Array of pinned pfns / pages. Use |
---|
1559 | 1923 | * pfns_vector_pages() or pfns_vector_pfns() |
---|
1560 | 1924 | * for access */ |
---|
1561 | 1925 | }; |
---|
.. | .. |
---|
1622 | 1986 | |
---|
1623 | 1987 | int get_cmdline(struct task_struct *task, char *buffer, int buflen); |
---|
1624 | 1988 | |
---|
1625 | | -static inline bool vma_is_anonymous(struct vm_area_struct *vma) |
---|
1626 | | -{ |
---|
1627 | | - return !vma->vm_ops; |
---|
1628 | | -} |
---|
1629 | | - |
---|
1630 | | -#ifdef CONFIG_SHMEM |
---|
1631 | | -/* |
---|
1632 | | - * The vma_is_shmem is not inline because it is used only by slow |
---|
1633 | | - * paths in userfault. |
---|
1634 | | - */ |
---|
1635 | | -bool vma_is_shmem(struct vm_area_struct *vma); |
---|
1636 | | -#else |
---|
1637 | | -static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } |
---|
1638 | | -#endif |
---|
1639 | | - |
---|
1640 | | -int vma_is_stack_for_current(struct vm_area_struct *vma); |
---|
1641 | | - |
---|
1642 | 1989 | extern unsigned long move_page_tables(struct vm_area_struct *vma, |
---|
1643 | 1990 | unsigned long old_addr, struct vm_area_struct *new_vma, |
---|
1644 | 1991 | unsigned long new_addr, unsigned long len, |
---|
1645 | 1992 | bool need_rmap_locks); |
---|
| 1993 | + |
---|
| 1994 | +/* |
---|
| 1995 | + * Flags used by change_protection(). For now we make it a bitmap so |
---|
| 1996 | + * that we can pass in multiple flags just like parameters. However, |
---|
| 1997 | + * for now, all callers use only one of the flags at a |
---|
| 1998 | + * time. |
---|
| 1999 | + */ |
---|
| 2000 | +/* Whether we should allow dirty bit accounting */ |
---|
| 2001 | +#define MM_CP_DIRTY_ACCT (1UL << 0) |
---|
| 2002 | +/* Whether this protection change is for NUMA hints */ |
---|
| 2003 | +#define MM_CP_PROT_NUMA (1UL << 1) |
---|
| 2004 | +/* Whether this change is for write protecting */ |
---|
| 2005 | +#define MM_CP_UFFD_WP (1UL << 2) /* do wp */ |
---|
| 2006 | +#define MM_CP_UFFD_WP_RESOLVE (1UL << 3) /* Resolve wp */ |
---|
| 2007 | +#define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ |
---|
| 2008 | + MM_CP_UFFD_WP_RESOLVE) |
---|
| 2009 | + |
---|
1646 | 2010 | extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, |
---|
1647 | 2011 | unsigned long end, pgprot_t newprot, |
---|
1648 | | - int dirty_accountable, int prot_numa); |
---|
| 2012 | + unsigned long cp_flags); |
---|
1649 | 2013 | extern int mprotect_fixup(struct vm_area_struct *vma, |
---|
1650 | 2014 | struct vm_area_struct **pprev, unsigned long start, |
---|
1651 | 2015 | unsigned long end, unsigned long newflags); |
---|
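A sketch of the new change_protection() calling convention (the caller and the chosen protection are illustrative; the MM_CP_* bitmap argument is the point):

/* Hedged sketch: write-protect a range on behalf of userfaultfd-wp. */
static unsigned long example_uffd_wp_range(struct vm_area_struct *vma,
					   unsigned long start,
					   unsigned long end)
{
	pgprot_t newprot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE);

	return change_protection(vma, start, end, newprot, MM_CP_UFFD_WP);
}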
.. | .. |
---|
1653 | 2017 | /* |
---|
1654 | 2018 | * doesn't attempt to fault and will return short. |
---|
1655 | 2019 | */ |
---|
1656 | | -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, |
---|
1657 | | - struct page **pages); |
---|
| 2020 | +int get_user_pages_fast_only(unsigned long start, int nr_pages, |
---|
| 2021 | + unsigned int gup_flags, struct page **pages); |
---|
| 2022 | +int pin_user_pages_fast_only(unsigned long start, int nr_pages, |
---|
| 2023 | + unsigned int gup_flags, struct page **pages); |
---|
| 2024 | + |
---|
| 2025 | +static inline bool get_user_page_fast_only(unsigned long addr, |
---|
| 2026 | + unsigned int gup_flags, struct page **pagep) |
---|
| 2027 | +{ |
---|
| 2028 | + return get_user_pages_fast_only(addr, 1, gup_flags, pagep) == 1; |
---|
| 2029 | +} |
---|
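For reference, a sketch of the single-page fast-only helper added above (the caller is an assumption):

/*
 * Hedged sketch: opportunistically grab one writable user page without
 * faulting or sleeping; returns NULL if the fast path cannot pin it.
 */
static struct page *example_try_grab_page(unsigned long uaddr)
{
	struct page *page;

	if (!get_user_page_fast_only(uaddr, FOLL_WRITE, &page))
		return NULL;
	return page;			/* release with put_page() */
}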
1658 | 2030 | /* |
---|
1659 | 2031 | * per-process(per-mm_struct) statistics. |
---|
1660 | 2032 | */ |
---|
.. | .. |
---|
1765 | 2137 | } |
---|
1766 | 2138 | #endif |
---|
1767 | 2139 | |
---|
1768 | | -#ifndef __HAVE_ARCH_PTE_DEVMAP |
---|
| 2140 | +#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL |
---|
| 2141 | +static inline int pte_special(pte_t pte) |
---|
| 2142 | +{ |
---|
| 2143 | + return 0; |
---|
| 2144 | +} |
---|
| 2145 | + |
---|
| 2146 | +static inline pte_t pte_mkspecial(pte_t pte) |
---|
| 2147 | +{ |
---|
| 2148 | + return pte; |
---|
| 2149 | +} |
---|
| 2150 | +#endif |
---|
| 2151 | + |
---|
| 2152 | +#ifndef CONFIG_ARCH_HAS_PTE_DEVMAP |
---|
1769 | 2153 | static inline int pte_devmap(pte_t pte) |
---|
1770 | 2154 | { |
---|
1771 | 2155 | return 0; |
---|
.. | .. |
---|
1881 | 2265 | static inline void mm_dec_nr_ptes(struct mm_struct *mm) {} |
---|
1882 | 2266 | #endif |
---|
1883 | 2267 | |
---|
1884 | | -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); |
---|
1885 | | -int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); |
---|
| 2268 | +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd); |
---|
| 2269 | +int __pte_alloc_kernel(pmd_t *pmd); |
---|
1886 | 2270 | |
---|
1887 | | -/* |
---|
1888 | | - * The following ifdef needed to get the 4level-fixup.h header to work. |
---|
1889 | | - * Remove it when 4level-fixup.h has been removed. |
---|
1890 | | - */ |
---|
1891 | | -#if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK) |
---|
| 2271 | +#if defined(CONFIG_MMU) |
---|
1892 | 2272 | |
---|
1893 | | -#ifndef __ARCH_HAS_5LEVEL_HACK |
---|
1894 | 2273 | static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd, |
---|
1895 | 2274 | unsigned long address) |
---|
1896 | 2275 | { |
---|
.. | .. |
---|
1904 | 2283 | return (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address)) ? |
---|
1905 | 2284 | NULL : pud_offset(p4d, address); |
---|
1906 | 2285 | } |
---|
1907 | | -#endif /* !__ARCH_HAS_5LEVEL_HACK */ |
---|
1908 | 2286 | |
---|
1909 | 2287 | static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) |
---|
1910 | 2288 | { |
---|
1911 | 2289 | return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? |
---|
1912 | 2290 | NULL: pmd_offset(pud, address); |
---|
1913 | 2291 | } |
---|
1914 | | -#endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ |
---|
| 2292 | +#endif /* CONFIG_MMU */ |
---|
1915 | 2293 | |
---|
1916 | 2294 | #if USE_SPLIT_PTE_PTLOCKS |
---|
1917 | 2295 | #if ALLOC_SPLIT_PTLOCKS |
---|
.. | .. |
---|
1964 | 2342 | return true; |
---|
1965 | 2343 | } |
---|
1966 | 2344 | |
---|
1967 | | -/* Reset page->mapping so free_pages_check won't complain. */ |
---|
1968 | | -static inline void pte_lock_deinit(struct page *page) |
---|
1969 | | -{ |
---|
1970 | | - page->mapping = NULL; |
---|
1971 | | - ptlock_free(page); |
---|
1972 | | -} |
---|
1973 | | - |
---|
1974 | 2345 | #else /* !USE_SPLIT_PTE_PTLOCKS */ |
---|
1975 | 2346 | /* |
---|
1976 | 2347 | * We use mm->page_table_lock to guard all pagetable pages of the mm. |
---|
.. | .. |
---|
1981 | 2352 | } |
---|
1982 | 2353 | static inline void ptlock_cache_init(void) {} |
---|
1983 | 2354 | static inline bool ptlock_init(struct page *page) { return true; } |
---|
1984 | | -static inline void pte_lock_deinit(struct page *page) {} |
---|
| 2355 | +static inline void ptlock_free(struct page *page) {} |
---|
1985 | 2356 | #endif /* USE_SPLIT_PTE_PTLOCKS */ |
---|
1986 | 2357 | |
---|
1987 | 2358 | static inline void pgtable_init(void) |
---|
.. | .. |
---|
1990 | 2361 | pgtable_cache_init(); |
---|
1991 | 2362 | } |
---|
1992 | 2363 | |
---|
1993 | | -static inline bool pgtable_page_ctor(struct page *page) |
---|
| 2364 | +static inline bool pgtable_pte_page_ctor(struct page *page) |
---|
1994 | 2365 | { |
---|
1995 | 2366 | if (!ptlock_init(page)) |
---|
1996 | 2367 | return false; |
---|
.. | .. |
---|
1999 | 2370 | return true; |
---|
2000 | 2371 | } |
---|
2001 | 2372 | |
---|
2002 | | -static inline void pgtable_page_dtor(struct page *page) |
---|
| 2373 | +static inline void pgtable_pte_page_dtor(struct page *page) |
---|
2003 | 2374 | { |
---|
2004 | | - pte_lock_deinit(page); |
---|
| 2375 | + ptlock_free(page); |
---|
2005 | 2376 | __ClearPageTable(page); |
---|
2006 | 2377 | dec_zone_page_state(page, NR_PAGETABLE); |
---|
2007 | 2378 | } |
---|
.. | .. |
---|
2020 | 2391 | pte_unmap(pte); \ |
---|
2021 | 2392 | } while (0) |
---|
2022 | 2393 | |
---|
2023 | | -#define pte_alloc(mm, pmd, address) \ |
---|
2024 | | - (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd, address)) |
---|
| 2394 | +#define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd)) |
---|
2025 | 2395 | |
---|
2026 | 2396 | #define pte_alloc_map(mm, pmd, address) \ |
---|
2027 | | - (pte_alloc(mm, pmd, address) ? NULL : pte_offset_map(pmd, address)) |
---|
| 2397 | + (pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address)) |
---|
2028 | 2398 | |
---|
2029 | 2399 | #define pte_alloc_map_lock(mm, pmd, address, ptlp) \ |
---|
2030 | | - (pte_alloc(mm, pmd, address) ? \ |
---|
| 2400 | + (pte_alloc(mm, pmd) ? \ |
---|
2031 | 2401 | NULL : pte_offset_map_lock(mm, pmd, address, ptlp)) |
---|
2032 | 2402 | |
---|
2033 | 2403 | #define pte_alloc_kernel(pmd, address) \ |
---|
2034 | | - ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ |
---|
| 2404 | + ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \ |
---|
2035 | 2405 | NULL: pte_offset_kernel(pmd, address)) |
---|
2036 | 2406 | |
---|
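A sketch of the updated pte_alloc_map_lock() convention, where allocation is now keyed on the PMD alone and the address only selects the PTE slot (the walk shown is illustrative):

static int example_touch_pte(struct mm_struct *mm, pmd_t *pmd,
			     unsigned long addr)
{
	spinlock_t *ptl;
	pte_t *pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);

	if (!pte)
		return -ENOMEM;
	/* ... inspect or install *pte while holding the PTE lock ... */
	pte_unmap_unlock(pte, ptl);
	return 0;
}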
2037 | 2407 | #if USE_SPLIT_PMD_PTLOCKS |
---|
.. | .. |
---|
2047 | 2417 | return ptlock_ptr(pmd_to_page(pmd)); |
---|
2048 | 2418 | } |
---|
2049 | 2419 | |
---|
2050 | | -static inline bool pgtable_pmd_page_ctor(struct page *page) |
---|
| 2420 | +static inline bool pmd_ptlock_init(struct page *page) |
---|
2051 | 2421 | { |
---|
2052 | 2422 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
---|
2053 | 2423 | page->pmd_huge_pte = NULL; |
---|
.. | .. |
---|
2055 | 2425 | return ptlock_init(page); |
---|
2056 | 2426 | } |
---|
2057 | 2427 | |
---|
2058 | | -static inline void pgtable_pmd_page_dtor(struct page *page) |
---|
| 2428 | +static inline void pmd_ptlock_free(struct page *page) |
---|
2059 | 2429 | { |
---|
2060 | 2430 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
---|
2061 | 2431 | VM_BUG_ON_PAGE(page->pmd_huge_pte, page); |
---|
.. | .. |
---|
2072 | 2442 | return &mm->page_table_lock; |
---|
2073 | 2443 | } |
---|
2074 | 2444 | |
---|
2075 | | -static inline bool pgtable_pmd_page_ctor(struct page *page) { return true; } |
---|
2076 | | -static inline void pgtable_pmd_page_dtor(struct page *page) {} |
---|
| 2445 | +static inline bool pmd_ptlock_init(struct page *page) { return true; } |
---|
| 2446 | +static inline void pmd_ptlock_free(struct page *page) {} |
---|
2077 | 2447 | |
---|
2078 | 2448 | #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) |
---|
2079 | 2449 | |
---|
.. | .. |
---|
2084 | 2454 | spinlock_t *ptl = pmd_lockptr(mm, pmd); |
---|
2085 | 2455 | spin_lock(ptl); |
---|
2086 | 2456 | return ptl; |
---|
| 2457 | +} |
---|
| 2458 | + |
---|
| 2459 | +static inline bool pgtable_pmd_page_ctor(struct page *page) |
---|
| 2460 | +{ |
---|
| 2461 | + if (!pmd_ptlock_init(page)) |
---|
| 2462 | + return false; |
---|
| 2463 | + __SetPageTable(page); |
---|
| 2464 | + inc_zone_page_state(page, NR_PAGETABLE); |
---|
| 2465 | + return true; |
---|
| 2466 | +} |
---|
| 2467 | + |
---|
| 2468 | +static inline void pgtable_pmd_page_dtor(struct page *page) |
---|
| 2469 | +{ |
---|
| 2470 | + pmd_ptlock_free(page); |
---|
| 2471 | + __ClearPageTable(page); |
---|
| 2472 | + dec_zone_page_state(page, NR_PAGETABLE); |
---|
2087 | 2473 | } |
---|
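A sketch of how an architecture's PMD-table allocation is expected to pair with the ctor/dtor above (the GFP flags and naming are assumptions):

static struct page *example_alloc_pmd_table(void)
{
	struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (!page)
		return NULL;
	if (!pgtable_pmd_page_ctor(page)) {	/* split ptlock init + accounting */
		__free_page(page);
		return NULL;
	}
	return page;		/* the free path calls pgtable_pmd_page_dtor() */
}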
2088 | 2474 | |
---|
2089 | 2475 | /* |
---|
.. | .. |
---|
2106 | 2492 | } |
---|
2107 | 2493 | |
---|
2108 | 2494 | extern void __init pagecache_init(void); |
---|
2109 | | -extern void free_area_init(unsigned long * zones_size); |
---|
2110 | | -extern void __init free_area_init_node(int nid, unsigned long * zones_size, |
---|
2111 | | - unsigned long zone_start_pfn, unsigned long *zholes_size); |
---|
| 2495 | +extern void __init free_area_init_memoryless_node(int nid); |
---|
2112 | 2496 | extern void free_initmem(void); |
---|
2113 | 2497 | |
---|
2114 | 2498 | /* |
---|
.. | .. |
---|
2118 | 2502 | * Return pages freed into the buddy system. |
---|
2119 | 2503 | */ |
---|
2120 | 2504 | extern unsigned long free_reserved_area(void *start, void *end, |
---|
2121 | | - int poison, char *s); |
---|
| 2505 | + int poison, const char *s); |
---|
2122 | 2506 | |
---|
2123 | 2507 | #ifdef CONFIG_HIGHMEM |
---|
2124 | 2508 | /* |
---|
.. | .. |
---|
2178 | 2562 | return phys_pages; |
---|
2179 | 2563 | } |
---|
2180 | 2564 | |
---|
2181 | | -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
---|
2182 | 2565 | /* |
---|
2183 | | - * With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its |
---|
2184 | | - * zones, allocate the backing mem_map and account for memory holes in a more |
---|
2185 | | - * architecture independent manner. This is a substitute for creating the |
---|
2186 | | - * zone_sizes[] and zholes_size[] arrays and passing them to |
---|
2187 | | - * free_area_init_node() |
---|
| 2566 | + * Using memblock node mappings, an architecture may initialise its |
---|
| 2567 | + * zones, allocate the backing mem_map and account for memory holes in an |
---|
| 2568 | + * architecture independent manner. |
---|
2188 | 2569 | * |
---|
2189 | 2570 | * An architecture is expected to register range of page frames backed by |
---|
2190 | 2571 | * physical memory with memblock_add[_node]() before calling |
---|
2191 | | - * free_area_init_nodes() passing in the PFN each zone ends at. At a basic |
---|
| 2572 | + * free_area_init() passing in the PFN each zone ends at. At a basic |
---|
2192 | 2573 | * usage, an architecture is expected to do something like |
---|
2193 | 2574 | * |
---|
2194 | 2575 | * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, |
---|
2195 | 2576 | * max_highmem_pfn}; |
---|
2196 | 2577 | * for_each_valid_physical_page_range() |
---|
2197 | 2578 | * memblock_add_node(base, size, nid) |
---|
2198 | | - * free_area_init_nodes(max_zone_pfns); |
---|
2199 | | - * |
---|
2200 | | - * free_bootmem_with_active_regions() calls free_bootmem_node() for each |
---|
2201 | | - * registered physical page range. Similarly |
---|
2202 | | - * sparse_memory_present_with_active_regions() calls memory_present() for |
---|
2203 | | - * each range when SPARSEMEM is enabled. |
---|
2204 | | - * |
---|
2205 | | - * See mm/page_alloc.c for more information on each function exposed by |
---|
2206 | | - * CONFIG_HAVE_MEMBLOCK_NODE_MAP. |
---|
| 2579 | + * free_area_init(max_zone_pfns); |
---|
2207 | 2580 | */ |
---|
2208 | | -extern void free_area_init_nodes(unsigned long *max_zone_pfn); |
---|
| 2581 | +void free_area_init(unsigned long *max_zone_pfn); |
---|
2209 | 2582 | unsigned long node_map_pfn_alignment(void); |
---|
2210 | 2583 | unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn, |
---|
2211 | 2584 | unsigned long end_pfn); |
---|
.. | .. |
---|
2214 | 2587 | extern void get_pfn_range_for_nid(unsigned int nid, |
---|
2215 | 2588 | unsigned long *start_pfn, unsigned long *end_pfn); |
---|
2216 | 2589 | extern unsigned long find_min_pfn_with_active_regions(void); |
---|
2217 | | -extern void free_bootmem_with_active_regions(int nid, |
---|
2218 | | - unsigned long max_low_pfn); |
---|
2219 | | -extern void sparse_memory_present_with_active_regions(int nid); |
---|
2220 | 2590 | |
---|
2221 | | -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
---|
2222 | | - |
---|
2223 | | -#if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \ |
---|
2224 | | - !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) |
---|
2225 | | -static inline int __early_pfn_to_nid(unsigned long pfn, |
---|
2226 | | - struct mminit_pfnnid_cache *state) |
---|
| 2591 | +#ifndef CONFIG_NEED_MULTIPLE_NODES |
---|
| 2592 | +static inline int early_pfn_to_nid(unsigned long pfn) |
---|
2227 | 2593 | { |
---|
2228 | 2594 | return 0; |
---|
2229 | 2595 | } |
---|
.. | .. |
---|
2235 | 2601 | struct mminit_pfnnid_cache *state); |
---|
2236 | 2602 | #endif |
---|
2237 | 2603 | |
---|
2238 | | -#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP) |
---|
2239 | | -void zero_resv_unavail(void); |
---|
2240 | | -#else |
---|
2241 | | -static inline void zero_resv_unavail(void) {} |
---|
2242 | | -#endif |
---|
2243 | | - |
---|
2244 | 2604 | extern void set_dma_reserve(unsigned long new_dma_reserve); |
---|
2245 | | -extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, |
---|
2246 | | - enum meminit_context, struct vmem_altmap *); |
---|
| 2605 | +extern void memmap_init_zone(unsigned long, int, unsigned long, |
---|
| 2606 | + unsigned long, unsigned long, enum meminit_context, |
---|
| 2607 | + struct vmem_altmap *, int migratetype); |
---|
2247 | 2608 | extern void setup_per_zone_wmarks(void); |
---|
2248 | 2609 | extern int __meminit init_per_zone_wmark_min(void); |
---|
2249 | 2610 | extern void mem_init(void); |
---|
.. | .. |
---|
2261 | 2622 | |
---|
2262 | 2623 | extern void setup_per_cpu_pageset(void); |
---|
2263 | 2624 | |
---|
2264 | | -extern void zone_pcp_update(struct zone *zone); |
---|
2265 | | -extern void zone_pcp_reset(struct zone *zone); |
---|
2266 | | - |
---|
2267 | 2625 | /* page_alloc.c */ |
---|
2268 | 2626 | extern int min_free_kbytes; |
---|
| 2627 | +extern int watermark_boost_factor; |
---|
2269 | 2628 | extern int watermark_scale_factor; |
---|
| 2629 | +extern bool arch_has_descending_max_zone_pfns(void); |
---|
2270 | 2630 | |
---|
2271 | 2631 | /* nommu.c */ |
---|
2272 | 2632 | extern atomic_long_t mmap_pages_allocated; |
---|
.. | .. |
---|
2310 | 2670 | extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); |
---|
2311 | 2671 | extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start, |
---|
2312 | 2672 | unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, |
---|
2313 | | - struct vm_area_struct *expand); |
---|
| 2673 | + struct vm_area_struct *expand, bool keep_locked); |
---|
2314 | 2674 | static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, |
---|
2315 | 2675 | unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) |
---|
2316 | 2676 | { |
---|
2317 | | - return __vma_adjust(vma, start, end, pgoff, insert, NULL); |
---|
| 2677 | + return __vma_adjust(vma, start, end, pgoff, insert, NULL, false); |
---|
2318 | 2678 | } |
---|
2319 | | -extern struct vm_area_struct *vma_merge(struct mm_struct *, |
---|
| 2679 | + |
---|
| 2680 | +extern struct vm_area_struct *__vma_merge(struct mm_struct *mm, |
---|
2320 | 2681 | struct vm_area_struct *prev, unsigned long addr, unsigned long end, |
---|
2321 | | - unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, |
---|
2322 | | - struct mempolicy *, struct vm_userfaultfd_ctx, const char __user *); |
---|
| 2682 | + unsigned long vm_flags, struct anon_vma *anon, struct file *file, |
---|
| 2683 | + pgoff_t pgoff, struct mempolicy *mpol, struct vm_userfaultfd_ctx uff, |
---|
| 2684 | + const char __user *user, bool keep_locked); |
---|
| 2685 | + |
---|
| 2686 | +static inline struct vm_area_struct *vma_merge(struct mm_struct *mm, |
---|
| 2687 | + struct vm_area_struct *prev, unsigned long addr, unsigned long end, |
---|
| 2688 | + unsigned long vm_flags, struct anon_vma *anon, struct file *file, |
---|
| 2689 | + pgoff_t off, struct mempolicy *pol, struct vm_userfaultfd_ctx uff, |
---|
| 2690 | + const char __user *user) |
---|
| 2691 | +{ |
---|
| 2692 | + return __vma_merge(mm, prev, addr, end, vm_flags, anon, file, off, |
---|
| 2693 | + pol, uff, user, false); |
---|
| 2694 | +} |
---|
| 2695 | + |
---|
2323 | 2696 | extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); |
---|
2324 | 2697 | extern int __split_vma(struct mm_struct *, struct vm_area_struct *, |
---|
2325 | 2698 | unsigned long addr, int new_below); |
---|
.. | .. |
---|
2369 | 2742 | unsigned long addr, unsigned long len, |
---|
2370 | 2743 | unsigned long flags, struct page **pages); |
---|
2371 | 2744 | |
---|
| 2745 | +unsigned long randomize_stack_top(unsigned long stack_top); |
---|
| 2746 | +unsigned long randomize_page(unsigned long start, unsigned long range); |
---|
| 2747 | + |
---|
2372 | 2748 | extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); |
---|
2373 | 2749 | |
---|
2374 | 2750 | extern unsigned long mmap_region(struct file *file, unsigned long addr, |
---|
.. | .. |
---|
2376 | 2752 | struct list_head *uf); |
---|
2377 | 2753 | extern unsigned long do_mmap(struct file *file, unsigned long addr, |
---|
2378 | 2754 | unsigned long len, unsigned long prot, unsigned long flags, |
---|
2379 | | - vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, |
---|
2380 | | - struct list_head *uf); |
---|
| 2755 | + unsigned long pgoff, unsigned long *populate, struct list_head *uf); |
---|
| 2756 | +extern int __do_munmap(struct mm_struct *, unsigned long, size_t, |
---|
| 2757 | + struct list_head *uf, bool downgrade); |
---|
2381 | 2758 | extern int do_munmap(struct mm_struct *, unsigned long, size_t, |
---|
2382 | 2759 | struct list_head *uf); |
---|
2383 | | - |
---|
2384 | | -static inline unsigned long |
---|
2385 | | -do_mmap_pgoff(struct file *file, unsigned long addr, |
---|
2386 | | - unsigned long len, unsigned long prot, unsigned long flags, |
---|
2387 | | - unsigned long pgoff, unsigned long *populate, |
---|
2388 | | - struct list_head *uf) |
---|
2389 | | -{ |
---|
2390 | | - return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate, uf); |
---|
2391 | | -} |
---|
| 2760 | +extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); |
---|
2392 | 2761 | |
---|
2393 | 2762 | #ifdef CONFIG_MMU |
---|
2394 | 2763 | extern int __mm_populate(unsigned long addr, unsigned long len, |
---|
.. | .. |
---|
2420 | 2789 | unsigned long align_offset; |
---|
2421 | 2790 | }; |
---|
2422 | 2791 | |
---|
2423 | | -extern unsigned long unmapped_area(struct vm_unmapped_area_info *info); |
---|
2424 | | -extern unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info); |
---|
2425 | | - |
---|
2426 | | -/* |
---|
2427 | | - * Search for an unmapped address range. |
---|
2428 | | - * |
---|
2429 | | - * We are looking for a range that: |
---|
2430 | | - * - does not intersect with any VMA; |
---|
2431 | | - * - is contained within the [low_limit, high_limit) interval; |
---|
2432 | | - * - is at least the desired size. |
---|
2433 | | - * - satisfies (begin_addr & align_mask) == (align_offset & align_mask) |
---|
2434 | | - */ |
---|
2435 | | -static inline unsigned long |
---|
2436 | | -vm_unmapped_area(struct vm_unmapped_area_info *info) |
---|
2437 | | -{ |
---|
2438 | | - if (info->flags & VM_UNMAPPED_AREA_TOPDOWN) |
---|
2439 | | - return unmapped_area_topdown(info); |
---|
2440 | | - else |
---|
2441 | | - return unmapped_area(info); |
---|
2442 | | -} |
---|
| 2792 | +extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info); |
---|
2443 | 2793 | |
---|
2444 | 2794 | /* truncate.c */ |
---|
2445 | 2795 | extern void truncate_inode_pages(struct address_space *, loff_t); |
---|
.. | .. |
---|
2449 | 2799 | |
---|
2450 | 2800 | /* generic vm_area_ops exported for stackable file systems */ |
---|
2451 | 2801 | extern vm_fault_t filemap_fault(struct vm_fault *vmf); |
---|
2452 | | -extern void filemap_map_pages(struct vm_fault *vmf, |
---|
| 2802 | +extern vm_fault_t filemap_map_pages(struct vm_fault *vmf, |
---|
2453 | 2803 | pgoff_t start_pgoff, pgoff_t end_pgoff); |
---|
2454 | 2804 | extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); |
---|
| 2805 | +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT |
---|
| 2806 | +extern bool filemap_allow_speculation(void); |
---|
| 2807 | +#endif |
---|
2455 | 2808 | |
---|
2456 | 2809 | /* mm/page-writeback.c */ |
---|
2457 | 2810 | int __must_check write_one_page(struct page *page); |
---|
2458 | 2811 | void task_dirty_inc(struct task_struct *tsk); |
---|
2459 | 2812 | |
---|
2460 | | -/* readahead.c */ |
---|
2461 | | -#define VM_MAX_READAHEAD 128 /* kbytes */ |
---|
2462 | | -#define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */ |
---|
2463 | | - |
---|
2464 | | -int force_page_cache_readahead(struct address_space *mapping, struct file *filp, |
---|
2465 | | - pgoff_t offset, unsigned long nr_to_read); |
---|
2466 | | - |
---|
2467 | | -void page_cache_sync_readahead(struct address_space *mapping, |
---|
2468 | | - struct file_ra_state *ra, |
---|
2469 | | - struct file *filp, |
---|
2470 | | - pgoff_t offset, |
---|
2471 | | - unsigned long size); |
---|
2472 | | - |
---|
2473 | | -void page_cache_async_readahead(struct address_space *mapping, |
---|
2474 | | - struct file_ra_state *ra, |
---|
2475 | | - struct file *filp, |
---|
2476 | | - struct page *pg, |
---|
2477 | | - pgoff_t offset, |
---|
2478 | | - unsigned long size); |
---|
2479 | | - |
---|
2480 | 2813 | extern unsigned long stack_guard_gap; |
---|
2481 | 2814 | /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ |
---|
2482 | 2815 | extern int expand_stack(struct vm_area_struct *vma, unsigned long address); |
---|
2483 | 2816 | |
---|
2484 | | -/* CONFIG_STACK_GROWSUP still needs to to grow downwards at some places */ |
---|
| 2817 | +/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ |
---|
2485 | 2818 | extern int expand_downwards(struct vm_area_struct *vma, |
---|
2486 | 2819 | unsigned long address); |
---|
2487 | 2820 | #if VM_GROWSUP |
---|
.. | .. |
---|
2576 | 2909 | int remap_pfn_range(struct vm_area_struct *, unsigned long addr, |
---|
2577 | 2910 | unsigned long pfn, unsigned long size, pgprot_t); |
---|
2578 | 2911 | int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); |
---|
2579 | | -int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, |
---|
| 2912 | +int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, |
---|
| 2913 | + struct page **pages, unsigned long *num); |
---|
| 2914 | +int vm_map_pages(struct vm_area_struct *vma, struct page **pages, |
---|
| 2915 | + unsigned long num); |
---|
| 2916 | +int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, |
---|
| 2917 | + unsigned long num); |
---|
| 2918 | +vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, |
---|
2580 | 2919 | unsigned long pfn); |
---|
2581 | | -int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, |
---|
| 2920 | +vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, |
---|
2582 | 2921 | unsigned long pfn, pgprot_t pgprot); |
---|
2583 | | -int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, |
---|
| 2922 | +vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, |
---|
2584 | 2923 | pfn_t pfn); |
---|
| 2924 | +vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr, |
---|
| 2925 | + pfn_t pfn, pgprot_t pgprot); |
---|
2585 | 2926 | vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, |
---|
2586 | 2927 | unsigned long addr, pfn_t pfn); |
---|
2587 | 2928 | int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); |
---|
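With the vmf_* variants a ->fault handler can return the result directly instead of translating an errno as the old vm_insert_pfn() callers had to; a sketch under stated assumptions (example_dev_pfn() is a hypothetical lookup, not a real API):

static vm_fault_t example_dev_fault(struct vm_fault *vmf)
{
	unsigned long pfn = example_dev_pfn(vmf->vma, vmf->address);

	return vmf_insert_pfn(vmf->vma, vmf->address, pfn);
}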
.. | .. |
---|
2590 | 2931 | unsigned long addr, struct page *page) |
---|
2591 | 2932 | { |
---|
2592 | 2933 | int err = vm_insert_page(vma, addr, page); |
---|
2593 | | - |
---|
2594 | | - if (err == -ENOMEM) |
---|
2595 | | - return VM_FAULT_OOM; |
---|
2596 | | - if (err < 0 && err != -EBUSY) |
---|
2597 | | - return VM_FAULT_SIGBUS; |
---|
2598 | | - |
---|
2599 | | - return VM_FAULT_NOPAGE; |
---|
2600 | | -} |
---|
2601 | | - |
---|
2602 | | -static inline vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, |
---|
2603 | | - unsigned long addr, pfn_t pfn) |
---|
2604 | | -{ |
---|
2605 | | - int err = vm_insert_mixed(vma, addr, pfn); |
---|
2606 | | - |
---|
2607 | | - if (err == -ENOMEM) |
---|
2608 | | - return VM_FAULT_OOM; |
---|
2609 | | - if (err < 0 && err != -EBUSY) |
---|
2610 | | - return VM_FAULT_SIGBUS; |
---|
2611 | | - |
---|
2612 | | - return VM_FAULT_NOPAGE; |
---|
2613 | | -} |
---|
2614 | | - |
---|
2615 | | -static inline vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, |
---|
2616 | | - unsigned long addr, unsigned long pfn) |
---|
2617 | | -{ |
---|
2618 | | - int err = vm_insert_pfn(vma, addr, pfn); |
---|
2619 | 2934 | |
---|
2620 | 2935 | if (err == -ENOMEM) |
---|
2621 | 2936 | return VM_FAULT_OOM; |
---|
.. | .. |
---|
2641 | 2956 | return VM_FAULT_SIGBUS; |
---|
2642 | 2957 | } |
---|
2643 | 2958 | |
---|
2644 | | -struct page *follow_page_mask(struct vm_area_struct *vma, |
---|
2645 | | - unsigned long address, unsigned int foll_flags, |
---|
2646 | | - unsigned int *page_mask); |
---|
2647 | | - |
---|
2648 | | -static inline struct page *follow_page(struct vm_area_struct *vma, |
---|
2649 | | - unsigned long address, unsigned int foll_flags) |
---|
2650 | | -{ |
---|
2651 | | - unsigned int unused_page_mask; |
---|
2652 | | - return follow_page_mask(vma, address, foll_flags, &unused_page_mask); |
---|
2653 | | -} |
---|
| 2959 | +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, |
---|
| 2960 | + unsigned int foll_flags); |
---|
2654 | 2961 | |
---|
2655 | 2962 | #define FOLL_WRITE 0x01 /* check pte is writable */ |
---|
2656 | 2963 | #define FOLL_TOUCH 0x02 /* mark page accessed */ |
---|
.. | .. |
---|
2669 | 2976 | #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ |
---|
2670 | 2977 | #define FOLL_COW 0x4000 /* internal GUP flag */ |
---|
2671 | 2978 | #define FOLL_ANON 0x8000 /* don't do file mappings */ |
---|
| 2979 | +#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ |
---|
| 2980 | +#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ |
---|
| 2981 | +#define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */ |
---|
| 2982 | +#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */ |
---|
| 2983 | + |
---|
| 2984 | +/* |
---|
| 2985 | + * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each |
---|
| 2986 | + * other. Here is what they mean, and how to use them: |
---|
| 2987 | + * |
---|
| 2988 | + * FOLL_LONGTERM indicates that the page will be held for an indefinite time |
---|
| 2989 | + * period _often_ under userspace control. This is in contrast to |
---|
| 2990 | + * iov_iter_get_pages(), whose usages are transient. |
---|
| 2991 | + * |
---|
| 2992 | + * FIXME: For pages which are part of a filesystem, mappings are subject to the |
---|
| 2993 | + * lifetime enforced by the filesystem and we need guarantees that longterm |
---|
| 2994 | + * users like RDMA and V4L2 only establish mappings which coordinate usage with |
---|
| 2995 | + * the filesystem. Ideas for this coordination include revoking the longterm |
---|
| 2996 | + * pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was |
---|
| 2998 | + * added after the problem with filesystems was found, FS DAX VMAs are |
---|
| 2998 | + * specifically failed. Filesystem pages are still subject to bugs and use of |
---|
| 2999 | + * FOLL_LONGTERM should be avoided on those pages. |
---|
| 3000 | + * |
---|
| 3001 | + * FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call. |
---|
| 3002 | + * Currently only get_user_pages() and get_user_pages_fast() support this flag |
---|
| 3003 | + * and calls to get_user_pages_[un]locked are specifically not allowed. This |
---|
| 3004 | + * is due to an incompatibility with the FS DAX check and |
---|
| 3005 | + * FAULT_FLAG_ALLOW_RETRY. |
---|
| 3006 | + * |
---|
| 3007 | + * In the CMA case: long term pins in a CMA region would unnecessarily fragment |
---|
| 3008 | + * that region. And so, CMA attempts to migrate the page before pinning, when |
---|
| 3009 | + * FOLL_LONGTERM is specified. |
---|
| 3010 | + * |
---|
| 3011 | + * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount, |
---|
| 3012 | + * but an additional pin counting system) will be invoked. This is intended for |
---|
| 3013 | + * anything that gets a page reference and then touches page data (for example, |
---|
| 3014 | + * Direct IO). This lets the filesystem know that some non-file-system entity is |
---|
| 3015 | + * potentially changing the pages' data. In contrast to FOLL_GET (whose pages |
---|
| 3016 | + * are released via put_page()), FOLL_PIN pages must be released, ultimately, by |
---|
| 3017 | + * a call to unpin_user_page(). |
---|
| 3018 | + * |
---|
| 3019 | + * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different |
---|
| 3020 | + * and separate refcounting mechanisms, however, and that means that each has |
---|
| 3021 | + * its own acquire and release mechanisms: |
---|
| 3022 | + * |
---|
| 3023 | + * FOLL_GET: get_user_pages*() to acquire, and put_page() to release. |
---|
| 3024 | + * |
---|
| 3025 | + * FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release. |
---|
| 3026 | + * |
---|
| 3027 | + * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call. |
---|
| 3028 | + * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based |
---|
| 3029 | + * calls applied to them, and that's perfectly OK. This is a constraint on the |
---|
| 3030 | + * callers, not on the pages.) |
---|
| 3031 | + * |
---|
| 3032 | + * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never |
---|
| 3033 | + * directly by the caller. That's in order to help avoid mismatches when |
---|
| 3034 | + * releasing pages: get_user_pages*() pages must be released via put_page(), |
---|
| 3035 | + * while pin_user_pages*() pages must be released via unpin_user_page(). |
---|
| 3036 | + * |
---|
| 3037 | + * Please see Documentation/core-api/pin_user_pages.rst for more information. |
---|
| 3038 | + */ |
---|
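A sketch of the FOLL_PIN pairing documented above (the buffer handling is illustrative): pages acquired through pin_user_pages*() are released with unpin_user_pages(), never put_page().

static int example_pin_user_buffer(unsigned long uaddr, int nr_pages,
				   struct page **pages)
{
	int pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE, pages);

	if (pinned <= 0)
		return pinned ? pinned : -EFAULT;

	/* ... direct-IO / DMA style access to the pinned pages ... */

	unpin_user_pages(pages, pinned);
	return 0;
}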
2672 | 3039 | |
---|
2673 | 3040 | static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) |
---|
2674 | 3041 | { |
---|
.. | .. |
---|
2681 | 3048 | return 0; |
---|
2682 | 3049 | } |
---|
2683 | 3050 | |
---|
2684 | | -typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, |
---|
2685 | | - void *data); |
---|
| 3051 | +typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data); |
---|
2686 | 3052 | extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, |
---|
2687 | 3053 | unsigned long size, pte_fn_t fn, void *data); |
---|
| 3054 | +extern int apply_to_existing_page_range(struct mm_struct *mm, |
---|
| 3055 | + unsigned long address, unsigned long size, |
---|
| 3056 | + pte_fn_t fn, void *data); |
---|
2688 | 3057 | |
---|
2689 | | - |
---|
| 3058 | +extern void init_mem_debugging_and_hardening(void); |
---|
2690 | 3059 | #ifdef CONFIG_PAGE_POISONING |
---|
2691 | | -extern bool page_poisoning_enabled(void); |
---|
2692 | | -extern void kernel_poison_pages(struct page *page, int numpages, int enable); |
---|
| 3060 | +extern void __kernel_poison_pages(struct page *page, int numpages); |
---|
| 3061 | +extern void __kernel_unpoison_pages(struct page *page, int numpages); |
---|
| 3062 | +extern bool _page_poisoning_enabled_early; |
---|
| 3063 | +DECLARE_STATIC_KEY_FALSE(_page_poisoning_enabled); |
---|
| 3064 | +static inline bool page_poisoning_enabled(void) |
---|
| 3065 | +{ |
---|
| 3066 | + return _page_poisoning_enabled_early; |
---|
| 3067 | +} |
---|
| 3068 | +/* |
---|
| 3069 | + * For use in fast paths after init_mem_debugging() has run, or when a |
---|
| 3070 | + * false negative result is not harmful when called too early. |
---|
| 3071 | + */ |
---|
| 3072 | +static inline bool page_poisoning_enabled_static(void) |
---|
| 3073 | +{ |
---|
| 3074 | + return static_branch_unlikely(&_page_poisoning_enabled); |
---|
| 3075 | +} |
---|
| 3076 | +static inline void kernel_poison_pages(struct page *page, int numpages) |
---|
| 3077 | +{ |
---|
| 3078 | + if (page_poisoning_enabled_static()) |
---|
| 3079 | + __kernel_poison_pages(page, numpages); |
---|
| 3080 | +} |
---|
| 3081 | +static inline void kernel_unpoison_pages(struct page *page, int numpages) |
---|
| 3082 | +{ |
---|
| 3083 | + if (page_poisoning_enabled_static()) |
---|
| 3084 | + __kernel_unpoison_pages(page, numpages); |
---|
| 3085 | +} |
---|
2693 | 3086 | #else |
---|
2694 | 3087 | static inline bool page_poisoning_enabled(void) { return false; } |
---|
2695 | | -static inline void kernel_poison_pages(struct page *page, int numpages, |
---|
2696 | | - int enable) { } |
---|
| 3088 | +static inline bool page_poisoning_enabled_static(void) { return false; } |
---|
| 3089 | +static inline void __kernel_poison_pages(struct page *page, int numpages) { } |
---|
| 3090 | +static inline void kernel_poison_pages(struct page *page, int numpages) { } |
---|
| 3091 | +static inline void kernel_unpoison_pages(struct page *page, int numpages) { } |
---|
2697 | 3092 | #endif |
---|
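A sketch of the intended allocator-side pairing (the hook names are assumptions): callers invoke the wrappers unconditionally, and the static key turns them into no-ops when poisoning is off.

static void example_prep_free_page(struct page *page, unsigned int order)
{
	kernel_poison_pages(page, 1 << order);		/* poison on free */
}

static void example_prep_new_page(struct page *page, unsigned int order)
{
	kernel_unpoison_pages(page, 1 << order);	/* verify + unpoison on alloc */
}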
2698 | 3093 | |
---|
2699 | | -#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON |
---|
2700 | | -DECLARE_STATIC_KEY_TRUE(init_on_alloc); |
---|
2701 | | -#else |
---|
2702 | 3094 | DECLARE_STATIC_KEY_FALSE(init_on_alloc); |
---|
2703 | | -#endif |
---|
2704 | 3095 | static inline bool want_init_on_alloc(gfp_t flags) |
---|
2705 | 3096 | { |
---|
2706 | | - if (static_branch_unlikely(&init_on_alloc) && |
---|
2707 | | - !page_poisoning_enabled()) |
---|
| 3097 | + if (static_branch_unlikely(&init_on_alloc)) |
---|
2708 | 3098 | return true; |
---|
2709 | 3099 | return flags & __GFP_ZERO; |
---|
2710 | 3100 | } |
---|
2711 | 3101 | |
---|
2712 | | -#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON |
---|
2713 | | -DECLARE_STATIC_KEY_TRUE(init_on_free); |
---|
2714 | | -#else |
---|
2715 | 3102 | DECLARE_STATIC_KEY_FALSE(init_on_free); |
---|
2716 | | -#endif |
---|
2717 | 3103 | static inline bool want_init_on_free(void) |
---|
2718 | 3104 | { |
---|
2719 | | - return static_branch_unlikely(&init_on_free) && |
---|
2720 | | - !page_poisoning_enabled(); |
---|
| 3105 | + return static_branch_unlikely(&init_on_free); |
---|
2721 | 3106 | } |
---|
2722 | 3107 | |
---|
2723 | | -#ifdef CONFIG_DEBUG_PAGEALLOC |
---|
2724 | | -extern bool _debug_pagealloc_enabled; |
---|
2725 | | -extern void __kernel_map_pages(struct page *page, int numpages, int enable); |
---|
| 3108 | +extern bool _debug_pagealloc_enabled_early; |
---|
| 3109 | +DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); |
---|
2726 | 3110 | |
---|
2727 | 3111 | static inline bool debug_pagealloc_enabled(void) |
---|
2728 | 3112 | { |
---|
2729 | | - return _debug_pagealloc_enabled; |
---|
| 3113 | + return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && |
---|
| 3114 | + _debug_pagealloc_enabled_early; |
---|
2730 | 3115 | } |
---|
2731 | 3116 | |
---|
| 3117 | +/* |
---|
| 3119 | + * For use in fast paths after init_debug_pagealloc() has run, or when |
---|
| 3120 | + * a false negative result is harmless if called too early. |
---|
| 3120 | + */ |
---|
| 3121 | +static inline bool debug_pagealloc_enabled_static(void) |
---|
| 3122 | +{ |
---|
| 3123 | + if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) |
---|
| 3124 | + return false; |
---|
| 3125 | + |
---|
| 3126 | + return static_branch_unlikely(&_debug_pagealloc_enabled); |
---|
| 3127 | +} |
---|
| 3128 | + |
---|
| 3129 | +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP) |
---|
| 3130 | +extern void __kernel_map_pages(struct page *page, int numpages, int enable); |
---|
| 3131 | + |
---|
| 3132 | +/* |
---|
| 3133 | + * When called in DEBUG_PAGEALLOC context, the call should most likely be |
---|
| 3134 | + * guarded by debug_pagealloc_enabled() or debug_pagealloc_enabled_static() |
---|
| 3135 | + */ |
---|
2732 | 3136 | static inline void |
---|
2733 | 3137 | kernel_map_pages(struct page *page, int numpages, int enable) |
---|
2734 | 3138 | { |
---|
2735 | | - if (!debug_pagealloc_enabled()) |
---|
2736 | | - return; |
---|
2737 | | - |
---|
2738 | 3139 | __kernel_map_pages(page, numpages, enable); |
---|
2739 | 3140 | } |
---|
| 3141 | + |
---|
| 3142 | +static inline void debug_pagealloc_map_pages(struct page *page, int numpages) |
---|
| 3143 | +{ |
---|
| 3144 | + if (debug_pagealloc_enabled_static()) |
---|
| 3145 | + __kernel_map_pages(page, numpages, 1); |
---|
| 3146 | +} |
---|
| 3147 | + |
---|
| 3148 | +static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) |
---|
| 3149 | +{ |
---|
| 3150 | + if (debug_pagealloc_enabled_static()) |
---|
| 3151 | + __kernel_map_pages(page, numpages, 0); |
---|
| 3152 | +} |
---|
| 3153 | + |
---|
2740 | 3154 | #ifdef CONFIG_HIBERNATION |
---|
2741 | 3155 | extern bool kernel_page_present(struct page *page); |
---|
2742 | 3156 | #endif /* CONFIG_HIBERNATION */ |
---|
2743 | | -#else /* CONFIG_DEBUG_PAGEALLOC */ |
---|
| 3157 | +#else /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */ |
---|
2744 | 3158 | static inline void |
---|
2745 | 3159 | kernel_map_pages(struct page *page, int numpages, int enable) {} |
---|
| 3160 | +static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} |
---|
| 3161 | +static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {} |
---|
2746 | 3162 | #ifdef CONFIG_HIBERNATION |
---|
2747 | 3163 | static inline bool kernel_page_present(struct page *page) { return true; } |
---|
2748 | 3164 | #endif /* CONFIG_HIBERNATION */ |
---|
2749 | | -static inline bool debug_pagealloc_enabled(void) |
---|
2750 | | -{ |
---|
2751 | | - return false; |
---|
2752 | | -} |
---|
2753 | | -#endif /* CONFIG_DEBUG_PAGEALLOC */ |
---|
| 3165 | +#endif /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */ |
---|
2754 | 3166 | |
---|
2755 | 3167 | #ifdef __HAVE_ARCH_GATE_AREA |
---|
2756 | 3168 | extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); |
---|
.. | .. |
---|
2772 | 3184 | |
---|
2773 | 3185 | #ifdef CONFIG_SYSCTL |
---|
2774 | 3186 | extern int sysctl_drop_caches; |
---|
2775 | | -int drop_caches_sysctl_handler(struct ctl_table *, int, |
---|
2776 | | - void __user *, size_t *, loff_t *); |
---|
| 3187 | +int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *, |
---|
| 3188 | + loff_t *); |
---|
2777 | 3189 | #endif |
---|
2778 | 3190 | |
---|
2779 | 3191 | void drop_slab(void); |
---|
.. | .. |
---|
2786 | 3198 | #endif |
---|
2787 | 3199 | |
---|
2788 | 3200 | const char * arch_vma_name(struct vm_area_struct *vma); |
---|
| 3201 | +#ifdef CONFIG_MMU |
---|
2789 | 3202 | void print_vma_addr(char *prefix, unsigned long rip); |
---|
| 3203 | +#else |
---|
| 3204 | +static inline void print_vma_addr(char *prefix, unsigned long rip) |
---|
| 3205 | +{ |
---|
| 3206 | +} |
---|
| 3207 | +#endif |
---|
2790 | 3208 | |
---|
2791 | 3209 | void *sparse_buffer_alloc(unsigned long size); |
---|
2792 | | -struct page *sparse_mem_map_populate(unsigned long pnum, int nid, |
---|
2793 | | - struct vmem_altmap *altmap); |
---|
| 3210 | +struct page * __populate_section_memmap(unsigned long pfn, |
---|
| 3211 | + unsigned long nr_pages, int nid, struct vmem_altmap *altmap); |
---|
2794 | 3212 | pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); |
---|
2795 | 3213 | p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); |
---|
2796 | 3214 | pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); |
---|
2797 | 3215 | pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); |
---|
2798 | | -pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node); |
---|
| 3216 | +pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, |
---|
| 3217 | + struct vmem_altmap *altmap); |
---|
2799 | 3218 | void *vmemmap_alloc_block(unsigned long size, int node); |
---|
2800 | 3219 | struct vmem_altmap; |
---|
2801 | | -void *vmemmap_alloc_block_buf(unsigned long size, int node); |
---|
2802 | | -void *altmap_alloc_block_buf(unsigned long size, struct vmem_altmap *altmap); |
---|
| 3220 | +void *vmemmap_alloc_block_buf(unsigned long size, int node, |
---|
| 3221 | + struct vmem_altmap *altmap); |
---|
2803 | 3222 | void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); |
---|
2804 | 3223 | int vmemmap_populate_basepages(unsigned long start, unsigned long end, |
---|
2805 | | - int node); |
---|
| 3224 | + int node, struct vmem_altmap *altmap); |
---|
2806 | 3225 | int vmemmap_populate(unsigned long start, unsigned long end, int node, |
---|
2807 | 3226 | struct vmem_altmap *altmap); |
---|
2808 | 3227 | void vmemmap_populate_print_last(void); |
---|
.. | .. |
---|
2821 | 3240 | }; |
---|
2822 | 3241 | extern int memory_failure(unsigned long pfn, int flags); |
---|
2823 | 3242 | extern void memory_failure_queue(unsigned long pfn, int flags); |
---|
| 3243 | +extern void memory_failure_queue_kick(int cpu); |
---|
2824 | 3244 | extern int unpoison_memory(unsigned long pfn); |
---|
2825 | | -extern int get_hwpoison_page(struct page *page); |
---|
2826 | | -#define put_hwpoison_page(page) put_page(page) |
---|
2827 | 3245 | extern int sysctl_memory_failure_early_kill; |
---|
2828 | 3246 | extern int sysctl_memory_failure_recovery; |
---|
2829 | 3247 | extern void shake_page(struct page *p, int access); |
---|
2830 | 3248 | extern atomic_long_t num_poisoned_pages __read_mostly; |
---|
2831 | | -extern int soft_offline_page(struct page *page, int flags); |
---|
| 3249 | +extern int soft_offline_page(unsigned long pfn, int flags); |
---|
2832 | 3250 | |
---|
2833 | 3251 | |
---|
2834 | 3252 | /* |
---|
.. | .. |
---|
2863 | 3281 | MF_MSG_BUDDY, |
---|
2864 | 3282 | MF_MSG_BUDDY_2ND, |
---|
2865 | 3283 | MF_MSG_DAX, |
---|
| 3284 | + MF_MSG_UNSPLIT_THP, |
---|
2866 | 3285 | MF_MSG_UNKNOWN, |
---|
2867 | 3286 | }; |
---|
2868 | 3287 | |
---|
.. | .. |
---|
2878 | 3297 | const void __user *usr_src, |
---|
2879 | 3298 | unsigned int pages_per_huge_page, |
---|
2880 | 3299 | bool allow_pagefault); |
---|
2881 | | -#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ |
---|
2882 | 3300 | |
---|
2883 | | -extern struct page_ext_operations debug_guardpage_ops; |
---|
| 3301 | +/** |
---|
| 3302 | + * vma_is_special_huge - Are transhuge page-table entries considered special? |
---|
| 3303 | + * @vma: Pointer to the struct vm_area_struct to consider |
---|
| 3304 | + * |
---|
| 3305 | + * Whether transhuge page-table entries are considered "special" following |
---|
| 3306 | + * the definition in vm_normal_page(). |
---|
| 3307 | + * |
---|
| 3308 | + * Return: true if transhuge page-table entries should be considered special, |
---|
| 3309 | + * false otherwise. |
---|
| 3310 | + */ |
---|
| 3311 | +static inline bool vma_is_special_huge(const struct vm_area_struct *vma) |
---|
| 3312 | +{ |
---|
| 3313 | + return vma_is_dax(vma) || (vma->vm_file && |
---|
| 3314 | + (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))); |
---|
| 3315 | +} |
---|
| 3316 | + |
---|
| 3317 | +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ |
---|
2884 | 3318 | |
---|
2885 | 3319 | #ifdef CONFIG_DEBUG_PAGEALLOC |
---|
2886 | 3320 | extern unsigned int _debug_guardpage_minorder; |
---|
2887 | | -extern bool _debug_guardpage_enabled; |
---|
| 3321 | +DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled); |
---|
2888 | 3322 | |
---|
2889 | 3323 | static inline unsigned int debug_guardpage_minorder(void) |
---|
2890 | 3324 | { |
---|
.. | .. |
---|
2893 | 3327 | |
---|
2894 | 3328 | static inline bool debug_guardpage_enabled(void) |
---|
2895 | 3329 | { |
---|
2896 | | - return _debug_guardpage_enabled; |
---|
| 3330 | + return static_branch_unlikely(&_debug_guardpage_enabled); |
---|
2897 | 3331 | } |
---|
2898 | 3332 | |
---|
2899 | 3333 | static inline bool page_is_guard(struct page *page) |
---|
2900 | 3334 | { |
---|
2901 | | - struct page_ext *page_ext; |
---|
2902 | | - |
---|
2903 | 3335 | if (!debug_guardpage_enabled()) |
---|
2904 | 3336 | return false; |
---|
2905 | 3337 | |
---|
2906 | | - page_ext = lookup_page_ext(page); |
---|
2907 | | - if (unlikely(!page_ext)) |
---|
2908 | | - return false; |
---|
2909 | | - |
---|
2910 | | - return test_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); |
---|
| 3338 | + return PageGuard(page); |
---|
2911 | 3339 | } |
---|
2912 | 3340 | #else |
---|
2913 | 3341 | static inline unsigned int debug_guardpage_minorder(void) { return 0; } |
---|
.. | .. |
---|
2921 | 3349 | static inline void setup_nr_node_ids(void) {} |
---|
2922 | 3350 | #endif |
---|
2923 | 3351 | |
---|
| 3352 | +extern int memcmp_pages(struct page *page1, struct page *page2); |
---|
| 3353 | + |
---|
| 3354 | +static inline int pages_identical(struct page *page1, struct page *page2) |
---|
| 3355 | +{ |
---|
| 3356 | + return !memcmp_pages(page1, page2); |
---|
| 3357 | +} |
---|
| 3358 | + |
---|
| 3359 | +#ifdef CONFIG_MAPPING_DIRTY_HELPERS |
---|
| 3360 | +unsigned long clean_record_shared_mapping_range(struct address_space *mapping, |
---|
| 3361 | + pgoff_t first_index, pgoff_t nr, |
---|
| 3362 | + pgoff_t bitmap_pgoff, |
---|
| 3363 | + unsigned long *bitmap, |
---|
| 3364 | + pgoff_t *start, |
---|
| 3365 | + pgoff_t *end); |
---|
| 3366 | + |
---|
| 3367 | +unsigned long wp_shared_mapping_range(struct address_space *mapping, |
---|
| 3368 | + pgoff_t first_index, pgoff_t nr); |
---|
| 3369 | +#endif |
---|
| 3370 | + |
---|
| 3371 | +extern int sysctl_nr_trim_pages; |
---|
| 3372 | +extern bool pte_map_lock_addr(struct vm_fault *vmf, unsigned long addr); |
---|
| 3373 | +extern int reclaim_shmem_address_space(struct address_space *mapping); |
---|
| 3374 | + |
---|
| 3375 | +/** |
---|
| 3376 | + * seal_check_future_write - Check for F_SEAL_FUTURE_WRITE flag and handle it |
---|
| 3377 | + * @seals: the seals to check |
---|
| 3378 | + * @vma: the vma to operate on |
---|
| 3379 | + * |
---|
| 3380 | + * Check whether F_SEAL_FUTURE_WRITE is set; if so, do proper check/handling on |
---|
| 3381 | + * the vma flags. Return 0 if the check passes, or <0 for errors. |
---|
| 3382 | + */ |
---|
| 3383 | +static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) |
---|
| 3384 | +{ |
---|
| 3385 | + if (seals & F_SEAL_FUTURE_WRITE) { |
---|
| 3386 | + /* |
---|
| 3387 | + * New PROT_WRITE and MAP_SHARED mmaps are not allowed when |
---|
| 3388 | + * "future write" seal active. |
---|
| 3389 | + */ |
---|
| 3390 | + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) |
---|
| 3391 | + return -EPERM; |
---|
| 3392 | + |
---|
| 3393 | + /* |
---|
| 3394 | + * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as |
---|
| 3395 | + * MAP_SHARED and read-only, take care to not allow mprotect to |
---|
| 3396 | + * revert protections on such mappings. Do this only for shared |
---|
| 3397 | + * mappings. For private mappings, we don't need to mask |
---|
| 3398 | + * VM_MAYWRITE as we still want them to be COW-writable. |
---|
| 3399 | + */ |
---|
| 3400 | + if (vma->vm_flags & VM_SHARED) |
---|
| 3401 | + vma->vm_flags &= ~(VM_MAYWRITE); |
---|
| 3402 | + } |
---|
| 3403 | + |
---|
| 3404 | + return 0; |
---|
| 3405 | +} |
---|
| 3406 | + |
---|
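A sketch of how an mmap handler would consume the helper above (example_get_seals() is a hypothetical accessor, not part of this diff):

static int example_memfd_mmap(struct file *file, struct vm_area_struct *vma)
{
	int ret = seal_check_future_write(example_get_seals(file), vma);

	if (ret)
		return ret;	/* writable shared mapping refused under the seal */

	/* ... normal mmap setup ... */
	return 0;
}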
2924 | 3407 | #endif /* __KERNEL__ */ |
---|
2925 | 3408 | #endif /* _LINUX_MM_H */ |
---|