.. | ..
| 1 | +// SPDX-License-Identifier: GPL-2.0-only
1 | 2 | #include <linux/mm.h>
2 | 3 | #include <linux/slab.h>
3 | 4 | #include <linux/string.h>
.. | ..
6 | 7 | #include <linux/err.h>
7 | 8 | #include <linux/sched.h>
8 | 9 | #include <linux/sched/mm.h>
| 10 | +#include <linux/sched/signal.h>
9 | 11 | #include <linux/sched/task_stack.h>
10 | 12 | #include <linux/security.h>
11 | 13 | #include <linux/swap.h>
.. | ..
14 | 16 | #include <linux/hugetlb.h>
15 | 17 | #include <linux/vmalloc.h>
16 | 18 | #include <linux/userfaultfd_k.h>
| 19 | +#include <linux/elf.h>
| 20 | +#include <linux/elf-randomize.h>
| 21 | +#include <linux/personality.h>
| 22 | +#include <linux/random.h>
| 23 | +#include <linux/processor.h>
| 24 | +#include <linux/sizes.h>
| 25 | +#include <linux/compat.h>
17 | 26 |
18 | | -#include <asm/sections.h>
19 | 27 | #include <linux/uaccess.h>
20 | 28 |
21 | 29 | #include "internal.h"
22 | | -
23 | | -static inline int is_kernel_rodata(unsigned long addr)
24 | | -{
25 | | -        return addr >= (unsigned long)__start_rodata &&
26 | | -                addr < (unsigned long)__end_rodata;
27 | | -}
| 30 | +#ifndef __GENKSYMS__
| 31 | +#include <trace/hooks/syscall_check.h>
| 32 | +#include <trace/hooks/mm.h>
| 33 | +#endif
28 | 34 |
29 | 35 | /**
30 | 36 |  * kfree_const - conditionally free memory
.. | ..
43 | 49 |  * kstrdup - allocate space for and copy an existing string
44 | 50 |  * @s: the string to duplicate
45 | 51 |  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
| 52 | + *
| 53 | + * Return: newly allocated copy of @s or %NULL in case of error
46 | 54 |  */
47 | 55 | char *kstrdup(const char *s, gfp_t gfp)
48 | 56 | {
.. | ..
65 | 73 |  * @s: the string to duplicate
66 | 74 |  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
67 | 75 |  *
68 | | - * Function returns source string if it is in .rodata section otherwise it
69 | | - * fallbacks to kstrdup.
70 | | - * Strings allocated by kstrdup_const should be freed by kfree_const.
| 76 | + * Note: Strings allocated by kstrdup_const should be freed by kfree_const and
| 77 | + * must not be passed to krealloc().
| 78 | + *
| 79 | + * Return: source string if it is in .rodata section otherwise
| 80 | + * fallback to kstrdup.
71 | 81 |  */
72 | 82 | const char *kstrdup_const(const char *s, gfp_t gfp)
73 | 83 | {
.. | ..
85 | 95 |  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
86 | 96 |  *
87 | 97 |  * Note: Use kmemdup_nul() instead if the size is known exactly.
| 98 | + *
| 99 | + * Return: newly allocated copy of @s or %NULL in case of error
88 | 100 |  */
89 | 101 | char *kstrndup(const char *s, size_t max, gfp_t gfp)
90 | 102 | {
.. | ..
110 | 122 |  * @src: memory region to duplicate
111 | 123 |  * @len: memory region length
112 | 124 |  * @gfp: GFP mask to use
| 125 | + *
| 126 | + * Return: newly allocated copy of @src or %NULL in case of error
113 | 127 |  */
114 | 128 | void *kmemdup(const void *src, size_t len, gfp_t gfp)
115 | 129 | {
.. | ..
127 | 141 |  * @s: The data to stringify
128 | 142 |  * @len: The size of the data
129 | 143 |  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
| 144 | + *
| 145 | + * Return: newly allocated copy of @s with NUL-termination or %NULL in
| 146 | + * case of error
130 | 147 |  */
131 | 148 | char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
132 | 149 | {
.. | ..
150 | 167 |  * @src: source address in user space
151 | 168 |  * @len: number of bytes to copy
152 | 169 |  *
153 | | - * Returns an ERR_PTR() on failure. Result is physically
| 170 | + * Return: an ERR_PTR() on failure. Result is physically
154 | 171 |  * contiguous, to be freed by kfree().
155 | 172 |  */
156 | 173 | void *memdup_user(const void __user *src, size_t len)
157 | 174 | {
158 | 175 |         void *p;
159 | 176 |
160 | | -        p = kmalloc_track_caller(len, GFP_USER);
| 177 | +        p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);
161 | 178 |         if (!p)
162 | 179 |                 return ERR_PTR(-ENOMEM);
163 | 180 |
.. | ..
176 | 193 |  * @src: source address in user space
177 | 194 |  * @len: number of bytes to copy
178 | 195 |  *
179 | | - * Returns an ERR_PTR() on failure. Result may be not
| 196 | + * Return: an ERR_PTR() on failure. Result may be not
180 | 197 |  * physically contiguous. Use kvfree() to free.
181 | 198 |  */
182 | 199 | void *vmemdup_user(const void __user *src, size_t len)
.. | ..
200 | 217 |  * strndup_user - duplicate an existing string from user space
201 | 218 |  * @s: The string to duplicate
202 | 219 |  * @n: Maximum number of bytes to copy, including the trailing NUL.
| 220 | + *
| 221 | + * Return: newly allocated copy of @s or an ERR_PTR() in case of error
203 | 222 |  */
204 | 223 | char *strndup_user(const char __user *s, long n)
205 | 224 | {
.. | ..
231 | 250 |  * @src: source address in user space
232 | 251 |  * @len: number of bytes to copy
233 | 252 |  *
234 | | - * Returns an ERR_PTR() on failure.
| 253 | + * Return: an ERR_PTR() on failure.
235 | 254 |  */
236 | 255 | void *memdup_user_nul(const void __user *src, size_t len)
237 | 256 | {
.. | ..
257 | 276 | EXPORT_SYMBOL(memdup_user_nul);
258 | 277 |
259 | 278 | void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
260 | | -                struct vm_area_struct *prev, struct rb_node *rb_parent)
| 279 | +                struct vm_area_struct *prev)
261 | 280 | {
262 | 281 |         struct vm_area_struct *next;
263 | 282 |
.. | ..
266 | 285 |                 next = prev->vm_next;
267 | 286 |                 prev->vm_next = vma;
268 | 287 |         } else {
| 288 | +                next = mm->mmap;
269 | 289 |                 mm->mmap = vma;
270 | | -                if (rb_parent)
271 | | -                        next = rb_entry(rb_parent,
272 | | -                                        struct vm_area_struct, vm_rb);
273 | | -                else
274 | | -                        next = NULL;
275 | 290 |         }
276 | 291 |         vma->vm_next = next;
277 | 292 |         if (next)
278 | 293 |                 next->vm_prev = vma;
| 294 | +}
| 295 | +
| 296 | +void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma)
| 297 | +{
| 298 | +        struct vm_area_struct *prev, *next;
| 299 | +
| 300 | +        next = vma->vm_next;
| 301 | +        prev = vma->vm_prev;
| 302 | +        if (prev)
| 303 | +                prev->vm_next = next;
| 304 | +        else
| 305 | +                mm->mmap = next;
| 306 | +        if (next)
| 307 | +                next->vm_prev = prev;
279 | 308 | }
280 | 309 |
281 | 310 | /* Check if the vma is being used as a stack by this task */
.. | ..
286 | 315 |         return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
287 | 316 | }
288 | 317 |
289 | | -#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
| 318 | +#ifndef STACK_RND_MASK
| 319 | +#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))     /* 8MB of VA */
| 320 | +#endif
| 321 | +
| 322 | +unsigned long randomize_stack_top(unsigned long stack_top)
| 323 | +{
| 324 | +        unsigned long random_variable = 0;
| 325 | +
| 326 | +        if (current->flags & PF_RANDOMIZE) {
| 327 | +                random_variable = get_random_long();
| 328 | +                random_variable &= STACK_RND_MASK;
| 329 | +                random_variable <<= PAGE_SHIFT;
| 330 | +        }
| 331 | +#ifdef CONFIG_STACK_GROWSUP
| 332 | +        return PAGE_ALIGN(stack_top) + random_variable;
| 333 | +#else
| 334 | +        return PAGE_ALIGN(stack_top) - random_variable;
| 335 | +#endif
| 336 | +}
| 337 | +
| 338 | +/**
| 339 | + * randomize_page - Generate a random, page aligned address
| 340 | + * @start: The smallest acceptable address the caller will take.
| 341 | + * @range: The size of the area, starting at @start, within which the
| 342 | + * random address must fall.
| 343 | + *
| 344 | + * If @start + @range would overflow, @range is capped.
| 345 | + *
| 346 | + * NOTE: Historical use of randomize_range, which this replaces, presumed that
| 347 | + * @start was already page aligned. We now align it regardless.
| 348 | + *
| 349 | + * Return: A page aligned address within [start, start + range). On error,
| 350 | + * @start is returned.
| 351 | + */
| 352 | +unsigned long randomize_page(unsigned long start, unsigned long range)
| 353 | +{
| 354 | +        if (!PAGE_ALIGNED(start)) {
| 355 | +                range -= PAGE_ALIGN(start) - start;
| 356 | +                start = PAGE_ALIGN(start);
| 357 | +        }
| 358 | +
| 359 | +        if (start > ULONG_MAX - range)
| 360 | +                range = ULONG_MAX - start;
| 361 | +
| 362 | +        range >>= PAGE_SHIFT;
| 363 | +
| 364 | +        if (range == 0)
| 365 | +                return start;
| 366 | +
| 367 | +        return start + (get_random_long() % range << PAGE_SHIFT);
| 368 | +}
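
The final expression above is easy to misread because `%` binds tighter than `<<`: the random value is first reduced to a page index in [0, range) and only then scaled back up to an address. The standalone sketch below mirrors the same three steps (align @start up, cap @range against overflow, pick a random page); PG_SHIFT, PG_ALIGN() and rand() are stand-ins for the kernel's PAGE_* macros and get_random_long(), so this is an illustration of the arithmetic, not the kernel implementation.

/* Illustrative userspace sketch of the randomize_page() arithmetic. */
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

#define PG_SHIFT 12UL                           /* assumed 4 KiB pages */
#define PG_SIZE  (1UL << PG_SHIFT)
#define PG_ALIGN(x) (((x) + PG_SIZE - 1) & ~(PG_SIZE - 1))

static unsigned long demo_randomize_page(unsigned long start, unsigned long range)
{
        if (start & (PG_SIZE - 1)) {            /* align start up, shrink range accordingly */
                range -= PG_ALIGN(start) - start;
                start = PG_ALIGN(start);
        }
        if (start > ULONG_MAX - range)          /* cap range so start + range cannot overflow */
                range = ULONG_MAX - start;
        range >>= PG_SHIFT;                     /* work in whole pages from here on */
        if (range == 0)
                return start;
        /* random page index in [0, range), scaled back to an address */
        return start + (((unsigned long)rand() % range) << PG_SHIFT);
}

int main(void)
{
        printf("%#lx\n", demo_randomize_page(0x10000123UL, 1UL << 20));
        return 0;
}
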
| 369 | +
| 370 | +#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
| 371 | +unsigned long arch_randomize_brk(struct mm_struct *mm)
| 372 | +{
| 373 | +        /* Is the current task 32bit ? */
| 374 | +        if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
| 375 | +                return randomize_page(mm->brk, SZ_32M);
| 376 | +
| 377 | +        return randomize_page(mm->brk, SZ_1G);
| 378 | +}
| 379 | +
| 380 | +unsigned long arch_mmap_rnd(void)
| 381 | +{
| 382 | +        unsigned long rnd;
| 383 | +
| 384 | +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
| 385 | +        if (is_compat_task())
| 386 | +                rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
| 387 | +        else
| 388 | +#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
| 389 | +                rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
| 390 | +
| 391 | +        return rnd << PAGE_SHIFT;
| 392 | +}
| 393 | +EXPORT_SYMBOL_GPL(arch_mmap_rnd);
| 394 | +
| 395 | +static int mmap_is_legacy(struct rlimit *rlim_stack)
| 396 | +{
| 397 | +        if (current->personality & ADDR_COMPAT_LAYOUT)
| 398 | +                return 1;
| 399 | +
| 400 | +        if (rlim_stack->rlim_cur == RLIM_INFINITY)
| 401 | +                return 1;
| 402 | +
| 403 | +        return sysctl_legacy_va_layout;
| 404 | +}
| 405 | +
| 406 | +/*
| 407 | + * Leave enough space between the mmap area and the stack to honour ulimit in
| 408 | + * the face of randomisation.
| 409 | + */
| 410 | +#define MIN_GAP (SZ_128M)
| 411 | +#define MAX_GAP (STACK_TOP / 6 * 5)
| 412 | +
| 413 | +static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
| 414 | +{
| 415 | +        unsigned long gap = rlim_stack->rlim_cur;
| 416 | +        unsigned long pad = stack_guard_gap;
| 417 | +
| 418 | +        /* Account for stack randomization if necessary */
| 419 | +        if (current->flags & PF_RANDOMIZE)
| 420 | +                pad += (STACK_RND_MASK << PAGE_SHIFT);
| 421 | +
| 422 | +        /* Values close to RLIM_INFINITY can overflow. */
| 423 | +        if (gap + pad > gap)
| 424 | +                gap += pad;
| 425 | +
| 426 | +        if (gap < MIN_GAP)
| 427 | +                gap = MIN_GAP;
| 428 | +        else if (gap > MAX_GAP)
| 429 | +                gap = MAX_GAP;
| 430 | +
| 431 | +        return PAGE_ALIGN(STACK_TOP - gap - rnd);
| 432 | +}
| 433 | +
| 434 | +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
| 435 | +{
| 436 | +        unsigned long random_factor = 0UL;
| 437 | +
| 438 | +        if (current->flags & PF_RANDOMIZE)
| 439 | +                random_factor = arch_mmap_rnd();
| 440 | +
| 441 | +        if (mmap_is_legacy(rlim_stack)) {
| 442 | +                mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
| 443 | +                mm->get_unmapped_area = arch_get_unmapped_area;
| 444 | +        } else {
| 445 | +                mm->mmap_base = mmap_base(random_factor, rlim_stack);
| 446 | +                mm->get_unmapped_area = arch_get_unmapped_area_topdown;
| 447 | +        }
| 448 | +}
| 449 | +#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
290 | 450 | void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
291 | 451 | {
292 | 452 |         mm->mmap_base = TASK_UNMAPPED_BASE;
.. | ..
294 | 454 | }
295 | 455 | #endif
296 | 456 |
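
To get a concrete feel for mmap_base(), the sketch below plugs in assumed numbers: a 64-bit STACK_TOP, an 8 MiB stack rlimit, the default 0x7ff STACK_RND_MASK from this file and a 256-page stack_guard_gap. The padded gap comes to roughly 17 MiB, so MIN_GAP wins and the mmap base ends up 128 MiB plus the randomization below the stack top. The values are illustrative, not taken from any particular architecture.

/* Worked example of the mmap_base() gap calculation, illustrative only. */
#include <stdio.h>

#define PG_SIZE 4096UL
#define PG_ALIGN(x) (((x) + PG_SIZE - 1) & ~(PG_SIZE - 1))

int main(void)
{
        unsigned long stack_top = 0x7ffffffff000UL;            /* assumed STACK_TOP */
        unsigned long gap = 8UL << 20;                          /* assumed RLIMIT_STACK: 8 MiB */
        unsigned long pad = (256UL << 12) + (0x7ffUL << 12);    /* guard gap + default stack rnd */
        unsigned long min_gap = 128UL << 20;                    /* MIN_GAP */
        unsigned long max_gap = stack_top / 6 * 5;              /* MAX_GAP */
        unsigned long rnd = 0x1f3000UL;                         /* sample arch_mmap_rnd() value */

        if (gap + pad > gap)            /* same overflow guard as mmap_base() */
                gap += pad;
        if (gap < min_gap)
                gap = min_gap;          /* small ulimits are widened to MIN_GAP */
        else if (gap > max_gap)
                gap = max_gap;

        printf("mmap_base = %#lx\n", PG_ALIGN(stack_top - gap - rnd));
        return 0;
}
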
297 | | -/*
298 | | - * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
299 | | - * back to the regular GUP.
300 | | - * Note a difference with get_user_pages_fast: this always returns the
301 | | - * number of pages pinned, 0 if no pages were pinned.
302 | | - * If the architecture does not support this function, simply return with no
303 | | - * pages pinned.
| 457 | +/**
| 458 | + * __account_locked_vm - account locked pages to an mm's locked_vm
| 459 | + * @mm: mm to account against
| 460 | + * @pages: number of pages to account
| 461 | + * @inc: %true if @pages should be considered positive, %false if not
| 462 | + * @task: task used to check RLIMIT_MEMLOCK
| 463 | + * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
| 464 | + *
| 465 | + * Assumes @task and @mm are valid (i.e. at least one reference on each), and
| 466 | + * that mmap_lock is held as writer.
| 467 | + *
| 468 | + * Return:
| 469 | + * * 0 on success
| 470 | + * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
304 | 471 |  */
305 | | -int __weak __get_user_pages_fast(unsigned long start,
306 | | -                                int nr_pages, int write, struct page **pages)
| 472 | +int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
| 473 | +                        struct task_struct *task, bool bypass_rlim)
307 | 474 | {
308 | | -        return 0;
| 475 | +        unsigned long locked_vm, limit;
| 476 | +        int ret = 0;
| 477 | +
| 478 | +        mmap_assert_write_locked(mm);
| 479 | +
| 480 | +        locked_vm = mm->locked_vm;
| 481 | +        if (inc) {
| 482 | +                if (!bypass_rlim) {
| 483 | +                        limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
| 484 | +                        if (locked_vm + pages > limit)
| 485 | +                                ret = -ENOMEM;
| 486 | +                }
| 487 | +                if (!ret)
| 488 | +                        mm->locked_vm = locked_vm + pages;
| 489 | +        } else {
| 490 | +                WARN_ON_ONCE(pages > locked_vm);
| 491 | +                mm->locked_vm = locked_vm - pages;
| 492 | +        }
| 493 | +
| 494 | +        pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
| 495 | +                 (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
| 496 | +                 locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
| 497 | +                 ret ? " - exceeded" : "");
| 498 | +
| 499 | +        return ret;
309 | 500 | }
310 | | -EXPORT_SYMBOL_GPL(__get_user_pages_fast);
| 501 | +EXPORT_SYMBOL_GPL(__account_locked_vm);
311 | 502 |
312 | 503 | /**
313 | | - * get_user_pages_fast() - pin user pages in memory
314 | | - * @start: starting user address
315 | | - * @nr_pages: number of pages from start to pin
316 | | - * @write: whether pages will be written to
317 | | - * @pages: array that receives pointers to the pages pinned.
318 | | - * Should be at least nr_pages long.
| 504 | + * account_locked_vm - account locked pages to an mm's locked_vm
| 505 | + * @mm: mm to account against, may be NULL
| 506 | + * @pages: number of pages to account
| 507 | + * @inc: %true if @pages should be considered positive, %false if not
319 | 508 |  *
320 | | - * Returns number of pages pinned. This may be fewer than the number
321 | | - * requested. If nr_pages is 0 or negative, returns 0. If no pages
322 | | - * were pinned, returns -errno.
| 509 | + * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
323 | 510 |  *
324 | | - * get_user_pages_fast provides equivalent functionality to get_user_pages,
325 | | - * operating on current and current->mm, with force=0 and vma=NULL. However
326 | | - * unlike get_user_pages, it must be called without mmap_sem held.
327 | | - *
328 | | - * get_user_pages_fast may take mmap_sem and page table locks, so no
329 | | - * assumptions can be made about lack of locking. get_user_pages_fast is to be
330 | | - * implemented in a way that is advantageous (vs get_user_pages()) when the
331 | | - * user memory area is already faulted in and present in ptes. However if the
332 | | - * pages have to be faulted in, it may turn out to be slightly slower so
333 | | - * callers need to carefully consider what to use. On many architectures,
334 | | - * get_user_pages_fast simply falls back to get_user_pages.
| 511 | + * Return:
| 512 | + * * 0 on success, or if mm is NULL
| 513 | + * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
335 | 514 |  */
336 | | -int __weak get_user_pages_fast(unsigned long start,
337 | | -                                int nr_pages, int write, struct page **pages)
| 515 | +int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
338 | 516 | {
339 | | -        return get_user_pages_unlocked(start, nr_pages, pages,
340 | | -                                write ? FOLL_WRITE : 0);
| 517 | +        int ret;
| 518 | +
| 519 | +        if (pages == 0 || !mm)
| 520 | +                return 0;
| 521 | +
| 522 | +        mmap_write_lock(mm);
| 523 | +        ret = __account_locked_vm(mm, pages, inc, current,
| 524 | +                                  capable(CAP_IPC_LOCK));
| 525 | +        mmap_write_unlock(mm);
| 526 | +
| 527 | +        return ret;
341 | 528 | }
342 | | -EXPORT_SYMBOL_GPL(get_user_pages_fast);
| 529 | +EXPORT_SYMBOL_GPL(account_locked_vm);
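
account_locked_vm() is the helper that replaces the locked_vm bookkeeping drivers used to open-code around page pinning: charge before pinning, undo the charge on failure or at teardown. A minimal, hypothetical usage sketch follows; demo_charge_and_pin() and demo_pin_pages() are made-up names, not kernel APIs.

/* Hypothetical caller of account_locked_vm(), for illustration only. */
#include <linux/mm.h>
#include <linux/sched/mm.h>

int demo_pin_pages(unsigned long npages);       /* hypothetical pinning step */

static int demo_charge_and_pin(struct mm_struct *mm, unsigned long npages)
{
        int ret;

        ret = account_locked_vm(mm, npages, true);      /* -ENOMEM if RLIMIT_MEMLOCK is hit */
        if (ret)
                return ret;

        ret = demo_pin_pages(npages);
        if (ret)
                account_locked_vm(mm, npages, false);   /* roll the charge back */

        return ret;
}
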
343 | 530 |
344 | 531 | unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
345 | 532 |         unsigned long len, unsigned long prot,
.. | ..
352 | 539 |
353 | 540 |         ret = security_mmap_file(file, prot, flag);
354 | 541 |         if (!ret) {
355 | | -                if (down_write_killable(&mm->mmap_sem))
| 542 | +                if (mmap_write_lock_killable(mm))
356 | 543 |                         return -EINTR;
357 | | -                ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
358 | | -                                &populate, &uf);
359 | | -                up_write(&mm->mmap_sem);
| 544 | +                ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate,
| 545 | +                                &uf);
| 546 | +                mmap_write_unlock(mm);
360 | 547 |                 userfaultfd_unmap_complete(mm, &uf);
361 | 548 |                 if (populate)
362 | 549 |                         mm_populate(ret, populate);
363 | 550 |         }
| 551 | +        trace_android_vh_check_mmap_file(file, prot, flag, ret);
364 | 552 |         return ret;
365 | 553 | }
366 | 554 |
.. | ..
393 | 581 |  *
394 | 582 |  * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not
395 | 583 |  * fall back to vmalloc.
| 584 | + *
| 585 | + * Return: pointer to the allocated memory or %NULL in case of failure
396 | 586 |  */
397 | 587 | void *kvmalloc_node(size_t size, gfp_t flags, int node)
398 | 588 | {
399 | 589 |         gfp_t kmalloc_flags = flags;
400 | 590 |         void *ret;
| 591 | +        bool use_vmalloc = false;
401 | 592 |
402 | 593 |         /*
403 | 594 |          * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables)
.. | ..
405 | 596 |          */
406 | 597 |         if ((flags & GFP_KERNEL) != GFP_KERNEL)
407 | 598 |                 return kmalloc_node(size, flags, node);
| 599 | +
| 600 | +        trace_android_vh_kvmalloc_node_use_vmalloc(size, &kmalloc_flags, &use_vmalloc);
| 601 | +        if (use_vmalloc)
| 602 | +                goto use_vmalloc_node;
408 | 603 |
409 | 604 |         /*
410 | 605 |          * We want to attempt a large physically contiguous block first because
.. | ..
429 | 624 |         if (ret || size <= PAGE_SIZE)
430 | 625 |                 return ret;
431 | 626 |
432 | | -        return __vmalloc_node_flags_caller(size, node, flags,
| 627 | +        /* Don't even allow crazy sizes */
| 628 | +        if (unlikely(size > INT_MAX)) {
| 629 | +                WARN_ON_ONCE(!(flags & __GFP_NOWARN));
| 630 | +                return NULL;
| 631 | +        }
| 632 | +
| 633 | +use_vmalloc_node:
| 634 | +        return __vmalloc_node(size, 1, flags, node,
433 | 635 |                         __builtin_return_address(0));
434 | 636 | }
435 | 637 | EXPORT_SYMBOL(kvmalloc_node);
.. | ..
442 | 644 |  * It is slightly more efficient to use kfree() or vfree() if you are certain
443 | 645 |  * that you know which one to use.
444 | 646 |  *
445 | | - * Context: Any context except NMI.
| 647 | + * Context: Either preemptible task context or not-NMI interrupt.
446 | 648 |  */
447 | 649 | void kvfree(const void *addr)
448 | 650 | {
.. | ..
470 | 672 |         }
471 | 673 | }
472 | 674 | EXPORT_SYMBOL(kvfree_sensitive);
| 675 | +
| 676 | +void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
| 677 | +{
| 678 | +        void *newp;
| 679 | +
| 680 | +        if (oldsize >= newsize)
| 681 | +                return (void *)p;
| 682 | +        newp = kvmalloc(newsize, flags);
| 683 | +        if (!newp)
| 684 | +                return NULL;
| 685 | +        memcpy(newp, p, oldsize);
| 686 | +        kvfree(p);
| 687 | +        return newp;
| 688 | +}
| 689 | +EXPORT_SYMBOL(kvrealloc);
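
kvrealloc() rounds out the kvmalloc() family: an allocation may be kmalloc- or vmalloc-backed, so it must always be released with kvfree() rather than kfree() or vfree() directly, and a grow should go through kvrealloc(), which frees the old buffer only when the new allocation succeeds. A small, hypothetical usage sketch; demo_copy_table() is a made-up name.

/* Hypothetical caller of kvmalloc()/kvfree(), for illustration only. */
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>

static int demo_copy_table(const void __user *uptr, size_t size)
{
        void *tbl;
        int ret = 0;

        tbl = kvmalloc(size, GFP_KERNEL);       /* large sizes fall back to vmalloc */
        if (!tbl)
                return -ENOMEM;

        if (copy_from_user(tbl, uptr, size))
                ret = -EFAULT;

        kvfree(tbl);    /* correct for both kmalloc- and vmalloc-backed memory */
        return ret;
}
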
473 | 690 |
474 | 691 | static inline void *__page_rmapping(struct page *page)
475 | 692 | {
.. | ..
503 | 720 |                 return true;
504 | 721 |         if (PageHuge(page))
505 | 722 |                 return false;
506 | | -        for (i = 0; i < (1 << compound_order(page)); i++) {
| 723 | +        for (i = 0; i < compound_nr(page); i++) {
507 | 724 |                 if (atomic_read(&page[i]._mapcount) >= 0)
508 | 725 |                         return true;
509 | 726 |         }
.. | ..
584 | 801 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
585 | 802 | unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
586 | 803 |
587 | | -int overcommit_ratio_handler(struct ctl_table *table, int write,
588 | | -                        void __user *buffer, size_t *lenp,
589 | | -                        loff_t *ppos)
| 804 | +int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer,
| 805 | +                size_t *lenp, loff_t *ppos)
590 | 806 | {
591 | 807 |         int ret;
592 | 808 |
.. | ..
596 | 812 |         return ret;
597 | 813 | }
598 | 814 |
599 | | -int overcommit_kbytes_handler(struct ctl_table *table, int write,
600 | | -                        void __user *buffer, size_t *lenp,
601 | | -                        loff_t *ppos)
| 815 | +static void sync_overcommit_as(struct work_struct *dummy)
| 816 | +{
| 817 | +        percpu_counter_sync(&vm_committed_as);
| 818 | +}
| 819 | +
| 820 | +int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
| 821 | +                size_t *lenp, loff_t *ppos)
| 822 | +{
| 823 | +        struct ctl_table t;
| 824 | +        int new_policy = -1;
| 825 | +        int ret;
| 826 | +
| 827 | +        /*
| 828 | +         * The deviation of sync_overcommit_as could be big with loose policy
| 829 | +         * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
| 830 | +         * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
| 831 | +         * with the strict "NEVER", and to avoid possible race condition (even
| 832 | +         * though user usually won't too frequently do the switching to policy
| 833 | +         * OVERCOMMIT_NEVER), the switch is done in the following order:
| 834 | +         * 1. changing the batch
| 835 | +         * 2. sync percpu count on each CPU
| 836 | +         * 3. switch the policy
| 837 | +         */
| 838 | +        if (write) {
| 839 | +                t = *table;
| 840 | +                t.data = &new_policy;
| 841 | +                ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
| 842 | +                if (ret || new_policy == -1)
| 843 | +                        return ret;
| 844 | +
| 845 | +                mm_compute_batch(new_policy);
| 846 | +                if (new_policy == OVERCOMMIT_NEVER)
| 847 | +                        schedule_on_each_cpu(sync_overcommit_as);
| 848 | +                sysctl_overcommit_memory = new_policy;
| 849 | +        } else {
| 850 | +                ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
| 851 | +        }
| 852 | +
| 853 | +        return ret;
| 854 | +}
| 855 | +
| 856 | +int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer,
| 857 | +                size_t *lenp, loff_t *ppos)
602 | 858 | {
603 | 859 |         int ret;
604 | 860 |
.. | ..
618 | 874 |         if (sysctl_overcommit_kbytes)
619 | 875 |                 allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
620 | 876 |         else
621 | | -                allowed = ((totalram_pages - hugetlb_total_pages())
| 877 | +                allowed = ((totalram_pages() - hugetlb_total_pages())
622 | 878 |                         * sysctl_overcommit_ratio / 100);
623 | 879 |         allowed += total_swap_pages;
624 | 880 |
.. | ..
638 | 894 |  * balancing memory across competing virtual machines that are hosted.
639 | 895 |  * Several metrics drive this policy engine including the guest reported
640 | 896 |  * memory commitment.
| 897 | + *
| 898 | + * The time cost of this is very low for small platforms, and for big
| 899 | + * platform like a 2S/36C/72T Skylake server, in worst case where
| 900 | + * vm_committed_as's spinlock is under severe contention, the time cost
| 901 | + * could be about 30~40 microseconds.
641 | 902 |  */
642 | 903 | unsigned long vm_memory_committed(void)
643 | 904 | {
644 | | -        return percpu_counter_read_positive(&vm_committed_as);
| 905 | +        return percpu_counter_sum_positive(&vm_committed_as);
645 | 906 | }
646 | 907 | EXPORT_SYMBOL_GPL(vm_memory_committed);
647 | 908 |
.. | ..
663 | 924 |  */
664 | 925 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
665 | 926 | {
666 | | -        long free, allowed, reserve;
667 | | -
668 | | -        VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
669 | | -                        -(s64)vm_committed_as_batch * num_online_cpus(),
670 | | -                        "memory commitment underflow");
| 927 | +        long allowed;
671 | 928 |
672 | 929 |         vm_acct_memory(pages);
673 | 930 |
.. | ..
678 | 935 |                 return 0;
679 | 936 |
680 | 937 |         if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
681 | | -                free = global_zone_page_state(NR_FREE_PAGES);
682 | | -                free += global_node_page_state(NR_FILE_PAGES);
683 | | -
684 | | -                /*
685 | | -                 * shmem pages shouldn't be counted as free in this
686 | | -                 * case, they can't be purged, only swapped out, and
687 | | -                 * that won't affect the overall amount of available
688 | | -                 * memory in the system.
689 | | -                 */
690 | | -                free -= global_node_page_state(NR_SHMEM);
691 | | -
692 | | -                free += get_nr_swap_pages();
693 | | -
694 | | -                /*
695 | | -                 * Any slabs which are created with the
696 | | -                 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
697 | | -                 * which are reclaimable, under pressure. The dentry
698 | | -                 * cache and most inode caches should fall into this
699 | | -                 */
700 | | -                free += global_node_page_state(NR_SLAB_RECLAIMABLE);
701 | | -
702 | | -                /*
703 | | -                 * Part of the kernel memory, which can be released
704 | | -                 * under memory pressure.
705 | | -                 */
706 | | -                free += global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
707 | | -
708 | | -                /*
709 | | -                 * Leave reserved pages. The pages are not for anonymous pages.
710 | | -                 */
711 | | -                if (free <= totalreserve_pages)
| 938 | +                if (pages > totalram_pages() + total_swap_pages)
712 | 939 |                         goto error;
713 | | -                else
714 | | -                        free -= totalreserve_pages;
715 | | -
716 | | -                /*
717 | | -                 * Reserve some for root
718 | | -                 */
719 | | -                if (!cap_sys_admin)
720 | | -                        free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
721 | | -
722 | | -                if (free > pages)
723 | | -                        return 0;
724 | | -
725 | | -                goto error;
| 940 | +                return 0;
726 | 941 |         }
727 | 942 |
728 | 943 |         allowed = vm_commit_limit();
.. | ..
736 | 951 |          * Don't let a single process grow so big a user can't recover
737 | 952 |          */
738 | 953 |         if (mm) {
739 | | -                reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
| 954 | +                long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
| 955 | +
740 | 956 |                 allowed -= min_t(long, mm->total_vm / 32, reserve);
741 | 957 |         }
742 | 958 |
.. | ..
754 | 970 |  * @buffer: the buffer to copy to.
755 | 971 |  * @buflen: the length of the buffer. Larger cmdline values are truncated
756 | 972 |  * to this length.
757 | | - * Returns the size of the cmdline field copied. Note that the copy does
| 973 | + *
| 974 | + * Return: the size of the cmdline field copied. Note that the copy does
758 | 975 |  * not guarantee an ending NULL byte.
759 | 976 |  */
760 | 977 | int get_cmdline(struct task_struct *task, char *buffer, int buflen)
.. | ..
768 | 985 |         if (!mm->arg_end)
769 | 986 |                 goto out_mm;    /* Shh! No looking before we're done */
770 | 987 |
771 | | -        down_read(&mm->mmap_sem);
| 988 | +        spin_lock(&mm->arg_lock);
772 | 989 |         arg_start = mm->arg_start;
773 | 990 |         arg_end = mm->arg_end;
774 | 991 |         env_start = mm->env_start;
775 | 992 |         env_end = mm->env_end;
776 | | -        up_read(&mm->mmap_sem);
| 993 | +        spin_unlock(&mm->arg_lock);
777 | 994 |
778 | 995 |         len = arg_end - arg_start;
779 | 996 |
.. | ..
805 | 1022 | out:
806 | 1023 |         return res;
807 | 1024 | }
| 1025 | +
| 1026 | +int __weak memcmp_pages(struct page *page1, struct page *page2)
| 1027 | +{
| 1028 | +        char *addr1, *addr2;
| 1029 | +        int ret;
| 1030 | +
| 1031 | +        addr1 = kmap_atomic(page1);
| 1032 | +        addr2 = kmap_atomic(page2);
| 1033 | +        ret = memcmp(addr1, addr2, PAGE_SIZE);
| 1034 | +        kunmap_atomic(addr2);
| 1035 | +        kunmap_atomic(addr1);
| 1036 | +        return ret;
| 1037 | +}