.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
---|
1 | 2 | /* |
---|
2 | 3 | * Generic hugetlb support. |
---|
3 | 4 | * (C) Nadia Yvette Chambers, April 2004 |
---|
.. | .. |
---|
15 | 16 | #include <linux/compiler.h> |
---|
16 | 17 | #include <linux/cpuset.h> |
---|
17 | 18 | #include <linux/mutex.h> |
---|
18 | | -#include <linux/bootmem.h> |
---|
| 19 | +#include <linux/memblock.h> |
---|
19 | 20 | #include <linux/sysfs.h> |
---|
20 | 21 | #include <linux/slab.h> |
---|
| 22 | +#include <linux/sched/mm.h> |
---|
21 | 23 | #include <linux/mmdebug.h> |
---|
22 | 24 | #include <linux/sched/signal.h> |
---|
23 | 25 | #include <linux/rmap.h> |
---|
.. | .. |
---|
25 | 27 | #include <linux/swap.h> |
---|
26 | 28 | #include <linux/swapops.h> |
---|
27 | 29 | #include <linux/jhash.h> |
---|
| 30 | +#include <linux/numa.h> |
---|
| 31 | +#include <linux/llist.h> |
---|
| 32 | +#include <linux/cma.h> |
---|
28 | 33 | |
---|
29 | 34 | #include <asm/page.h> |
---|
30 | | -#include <asm/pgtable.h> |
---|
| 35 | +#include <asm/pgalloc.h> |
---|
31 | 36 | #include <asm/tlb.h> |
---|
32 | 37 | |
---|
33 | 38 | #include <linux/io.h> |
---|
34 | 39 | #include <linux/hugetlb.h> |
---|
35 | 40 | #include <linux/hugetlb_cgroup.h> |
---|
36 | 41 | #include <linux/node.h> |
---|
37 | | -#include <linux/userfaultfd_k.h> |
---|
38 | 42 | #include <linux/page_owner.h> |
---|
39 | 43 | #include "internal.h" |
---|
40 | 44 | |
---|
41 | 45 | int hugetlb_max_hstate __read_mostly; |
---|
42 | 46 | unsigned int default_hstate_idx; |
---|
43 | 47 | struct hstate hstates[HUGE_MAX_HSTATE]; |
---|
| 48 | + |
---|
| 49 | +#ifdef CONFIG_CMA |
---|
| 50 | +static struct cma *hugetlb_cma[MAX_NUMNODES]; |
---|
| 51 | +#endif |
---|
| 52 | +static unsigned long hugetlb_cma_size __initdata; |
---|
| 53 | + |
---|
44 | 54 | /* |
---|
45 | 55 | * Minimum page order among possible hugepage sizes, set to a proper value |
---|
46 | 56 | * at boot time. |
---|
.. | .. |
---|
52 | 62 | /* for command line parsing */ |
---|
53 | 63 | static struct hstate * __initdata parsed_hstate; |
---|
54 | 64 | static unsigned long __initdata default_hstate_max_huge_pages; |
---|
55 | | -static unsigned long __initdata default_hstate_size; |
---|
56 | 65 | static bool __initdata parsed_valid_hugepagesz = true; |
---|
| 66 | +static bool __initdata parsed_default_hugepagesz; |
---|
57 | 67 | |
---|
58 | 68 | /* |
---|
59 | 69 | * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, |
---|
.. | .. |
---|
67 | 77 | */ |
---|
68 | 78 | static int num_fault_mutexes; |
---|
69 | 79 | struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; |
---|
| 80 | + |
---|
| 81 | +static void hugetlb_unshare_pmds(struct vm_area_struct *vma, |
---|
| 82 | + unsigned long start, unsigned long end); |
---|
70 | 83 | |
---|
71 | 84 | static inline bool PageHugeFreed(struct page *head) |
---|
72 | 85 | { |
---|
.. | .. |
---|
93 | 106 | spin_unlock(&spool->lock); |
---|
94 | 107 | |
---|
95 | 108 | /* If no pages are used, and no other handles to the subpool |
---|
96 | | - * remain, give up any reservations mased on minimum size and |
---|
| 109 | + * remain, give up any reservations based on minimum size and |
---|
97 | 110 | * free the subpool */ |
---|
98 | 111 | if (free) { |
---|
99 | 112 | if (spool->min_hpages != -1) |
---|
.. | .. |
---|
138 | 151 | /* |
---|
139 | 152 | * Subpool accounting for allocating and reserving pages. |
---|
140 | 153 | * Return -ENOMEM if there are not enough resources to satisfy the |
---|
141 | | - * the request. Otherwise, return the number of pages by which the |
---|
| 154 | + * request. Otherwise, return the number of pages by which the |
---|
142 | 155 | * global pools must be adjusted (upward). The returned value may |
---|
143 | 156 | * only be different than the passed value (delta) in the case where |
---|
144 | | - * a subpool minimum size must be manitained. |
---|
| 157 | + * a subpool minimum size must be maintained. |
---|
145 | 158 | */ |
---|
146 | 159 | static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, |
---|
147 | 160 | long delta) |
---|
.. | .. |
---|
232 | 245 | return subpool_inode(file_inode(vma->vm_file)); |
---|
233 | 246 | } |
---|
234 | 247 | |
---|
235 | | -/* |
---|
236 | | - * Region tracking -- allows tracking of reservations and instantiated pages |
---|
237 | | - * across the pages in a mapping. |
---|
238 | | - * |
---|
239 | | - * The region data structures are embedded into a resv_map and protected |
---|
240 | | - * by a resv_map's lock. The set of regions within the resv_map represent |
---|
241 | | - * reservations for huge pages, or huge pages that have already been |
---|
242 | | - * instantiated within the map. The from and to elements are huge page |
---|
243 | | - * indicies into the associated mapping. from indicates the starting index |
---|
244 | | - * of the region. to represents the first index past the end of the region. |
---|
245 | | - * |
---|
246 | | - * For example, a file region structure with from == 0 and to == 4 represents |
---|
247 | | - * four huge pages in a mapping. It is important to note that the to element |
---|
248 | | - * represents the first element past the end of the region. This is used in |
---|
249 | | - * arithmetic as 4(to) - 0(from) = 4 huge pages in the region. |
---|
250 | | - * |
---|
251 | | - * Interval notation of the form [from, to) will be used to indicate that |
---|
252 | | - * the endpoint from is inclusive and to is exclusive. |
---|
| 248 | +/* Helper that removes a struct file_region from the resv_map cache and returns |
---|
| 249 | + * it for use. |
---|
253 | 250 | */ |
---|
254 | | -struct file_region { |
---|
255 | | - struct list_head link; |
---|
256 | | - long from; |
---|
257 | | - long to; |
---|
258 | | -}; |
---|
259 | | - |
---|
260 | | -/* |
---|
261 | | - * Add the huge page range represented by [f, t) to the reserve |
---|
262 | | - * map. In the normal case, existing regions will be expanded |
---|
263 | | - * to accommodate the specified range. Sufficient regions should |
---|
264 | | - * exist for expansion due to the previous call to region_chg |
---|
265 | | - * with the same range. However, it is possible that region_del |
---|
266 | | - * could have been called after region_chg and modifed the map |
---|
267 | | - * in such a way that no region exists to be expanded. In this |
---|
268 | | - * case, pull a region descriptor from the cache associated with |
---|
269 | | - * the map and use that for the new range. |
---|
270 | | - * |
---|
271 | | - * Return the number of new huge pages added to the map. This |
---|
272 | | - * number is greater than or equal to zero. |
---|
273 | | - */ |
---|
274 | | -static long region_add(struct resv_map *resv, long f, long t) |
---|
| 251 | +static struct file_region * |
---|
| 252 | +get_file_region_entry_from_cache(struct resv_map *resv, long from, long to) |
---|
275 | 253 | { |
---|
276 | | - struct list_head *head = &resv->regions; |
---|
277 | | - struct file_region *rg, *nrg, *trg; |
---|
278 | | - long add = 0; |
---|
| 254 | + struct file_region *nrg = NULL; |
---|
279 | 255 | |
---|
280 | | - spin_lock(&resv->lock); |
---|
281 | | - /* Locate the region we are either in or before. */ |
---|
282 | | - list_for_each_entry(rg, head, link) |
---|
283 | | - if (f <= rg->to) |
---|
284 | | - break; |
---|
| 256 | + VM_BUG_ON(resv->region_cache_count <= 0); |
---|
285 | 257 | |
---|
286 | | - /* |
---|
287 | | - * If no region exists which can be expanded to include the |
---|
288 | | - * specified range, the list must have been modified by an |
---|
289 | | - * interleving call to region_del(). Pull a region descriptor |
---|
290 | | - * from the cache and use it for this range. |
---|
291 | | - */ |
---|
292 | | - if (&rg->link == head || t < rg->from) { |
---|
293 | | - VM_BUG_ON(resv->region_cache_count <= 0); |
---|
| 258 | + resv->region_cache_count--; |
---|
| 259 | + nrg = list_first_entry(&resv->region_cache, struct file_region, link); |
---|
| 260 | + list_del(&nrg->link); |
---|
294 | 261 | |
---|
295 | | - resv->region_cache_count--; |
---|
296 | | - nrg = list_first_entry(&resv->region_cache, struct file_region, |
---|
297 | | - link); |
---|
298 | | - list_del(&nrg->link); |
---|
| 262 | + nrg->from = from; |
---|
| 263 | + nrg->to = to; |
---|
299 | 264 | |
---|
300 | | - nrg->from = f; |
---|
301 | | - nrg->to = t; |
---|
302 | | - list_add(&nrg->link, rg->link.prev); |
---|
| 265 | + return nrg; |
---|
| 266 | +} |
---|
303 | 267 | |
---|
304 | | - add += t - f; |
---|
305 | | - goto out_locked; |
---|
| 268 | +static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg, |
---|
| 269 | + struct file_region *rg) |
---|
| 270 | +{ |
---|
| 271 | +#ifdef CONFIG_CGROUP_HUGETLB |
---|
| 272 | + nrg->reservation_counter = rg->reservation_counter; |
---|
| 273 | + nrg->css = rg->css; |
---|
| 274 | + if (rg->css) |
---|
| 275 | + css_get(rg->css); |
---|
| 276 | +#endif |
---|
| 277 | +} |
---|
| 278 | + |
---|
| 279 | +/* Helper that records hugetlb_cgroup uncharge info. */ |
---|
| 280 | +static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg, |
---|
| 281 | + struct hstate *h, |
---|
| 282 | + struct resv_map *resv, |
---|
| 283 | + struct file_region *nrg) |
---|
| 284 | +{ |
---|
| 285 | +#ifdef CONFIG_CGROUP_HUGETLB |
---|
| 286 | + if (h_cg) { |
---|
| 287 | + nrg->reservation_counter = |
---|
| 288 | + &h_cg->rsvd_hugepage[hstate_index(h)]; |
---|
| 289 | + nrg->css = &h_cg->css; |
---|
| 290 | + /* |
---|
| 291 | + * The caller will hold exactly one h_cg->css reference for the |
---|
| 292 | + * whole contiguous reservation region. But this area might be |
---|
| 293 | + * scattered when there are already some file_regions reside in |
---|
| 294 | + * it. As a result, many file_regions may share only one css |
---|
| 295 | + * reference. In order to ensure that one file_region must hold |
---|
| 296 | + * exactly one h_cg->css reference, we should do css_get for |
---|
| 297 | + * each file_region and leave the reference held by caller |
---|
| 298 | + * untouched. |
---|
| 299 | + */ |
---|
| 300 | + css_get(&h_cg->css); |
---|
| 301 | + if (!resv->pages_per_hpage) |
---|
| 302 | + resv->pages_per_hpage = pages_per_huge_page(h); |
---|
| 303 | + /* pages_per_hpage should be the same for all entries in |
---|
| 304 | + * a resv_map. |
---|
| 305 | + */ |
---|
| 306 | + VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h)); |
---|
| 307 | + } else { |
---|
| 308 | + nrg->reservation_counter = NULL; |
---|
| 309 | + nrg->css = NULL; |
---|
| 310 | + } |
---|
| 311 | +#endif |
---|
| 312 | +} |
---|
| 313 | + |
---|
| 314 | +static void put_uncharge_info(struct file_region *rg) |
---|
| 315 | +{ |
---|
| 316 | +#ifdef CONFIG_CGROUP_HUGETLB |
---|
| 317 | + if (rg->css) |
---|
| 318 | + css_put(rg->css); |
---|
| 319 | +#endif |
---|
| 320 | +} |
---|
| 321 | + |
---|
| 322 | +static bool has_same_uncharge_info(struct file_region *rg, |
---|
| 323 | + struct file_region *org) |
---|
| 324 | +{ |
---|
| 325 | +#ifdef CONFIG_CGROUP_HUGETLB |
---|
| 326 | + return rg && org && |
---|
| 327 | + rg->reservation_counter == org->reservation_counter && |
---|
| 328 | + rg->css == org->css; |
---|
| 329 | + |
---|
| 330 | +#else |
---|
| 331 | + return true; |
---|
| 332 | +#endif |
---|
| 333 | +} |
---|
| 334 | + |
---|
| 335 | +static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) |
---|
| 336 | +{ |
---|
| 337 | + struct file_region *nrg = NULL, *prg = NULL; |
---|
| 338 | + |
---|
| 339 | + prg = list_prev_entry(rg, link); |
---|
| 340 | + if (&prg->link != &resv->regions && prg->to == rg->from && |
---|
| 341 | + has_same_uncharge_info(prg, rg)) { |
---|
| 342 | + prg->to = rg->to; |
---|
| 343 | + |
---|
| 344 | + list_del(&rg->link); |
---|
| 345 | + put_uncharge_info(rg); |
---|
| 346 | + kfree(rg); |
---|
| 347 | + |
---|
| 348 | + rg = prg; |
---|
306 | 349 | } |
---|
307 | 350 | |
---|
308 | | - /* Round our left edge to the current segment if it encloses us. */ |
---|
309 | | - if (f > rg->from) |
---|
310 | | - f = rg->from; |
---|
| 351 | + nrg = list_next_entry(rg, link); |
---|
| 352 | + if (&nrg->link != &resv->regions && nrg->from == rg->to && |
---|
| 353 | + has_same_uncharge_info(nrg, rg)) { |
---|
| 354 | + nrg->from = rg->from; |
---|
311 | 355 | |
---|
312 | | - /* Check for and consume any regions we now overlap with. */ |
---|
313 | | - nrg = rg; |
---|
314 | | - list_for_each_entry_safe(rg, trg, rg->link.prev, link) { |
---|
315 | | - if (&rg->link == head) |
---|
316 | | - break; |
---|
| 356 | + list_del(&rg->link); |
---|
| 357 | + put_uncharge_info(rg); |
---|
| 358 | + kfree(rg); |
---|
| 359 | + } |
---|
| 360 | +} |
---|
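coalesce_file_region() merges the freshly inserted region with either neighbour when the ranges touch and has_same_uncharge_info() reports they charge the same cgroup counter. A small worked example of the intended behaviour (hypothetical ranges, not taken from the patch):

	/*
	 *   before insert:           [0,2)        [5,7)   (same css/counter)
	 *   after inserting [2,5):   [0,2) [2,5)  [5,7)
	 *   after coalescing:        [0,7)                (two entries kfree'd)
	 *
	 *   Had [5,7) been charged to a different cgroup, only the left-hand
	 *   merge would apply, leaving [0,5) [5,7).
	 */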
| 361 | + |
---|
| 362 | +/* |
---|
| 363 | + * Must be called with resv->lock held. |
---|
| 364 | + * |
---|
| 365 | + * Calling this with regions_needed != NULL will count the number of pages |
---|
| 366 | + * to be added but will not modify the linked list. And regions_needed will |
---|
| 367 | + * indicate the number of file_regions needed in the cache to carry out to add |
---|
| 368 | + * the regions for this range. |
---|
| 369 | + */ |
---|
| 370 | +static long add_reservation_in_range(struct resv_map *resv, long f, long t, |
---|
| 371 | + struct hugetlb_cgroup *h_cg, |
---|
| 372 | + struct hstate *h, long *regions_needed) |
---|
| 373 | +{ |
---|
| 374 | + long add = 0; |
---|
| 375 | + struct list_head *head = &resv->regions; |
---|
| 376 | + long last_accounted_offset = f; |
---|
| 377 | + struct file_region *rg = NULL, *trg = NULL, *nrg = NULL; |
---|
| 378 | + |
---|
| 379 | + if (regions_needed) |
---|
| 380 | + *regions_needed = 0; |
---|
| 381 | + |
---|
| 382 | + /* In this loop, we essentially handle an entry for the range |
---|
| 383 | + * [last_accounted_offset, rg->from), at every iteration, with some |
---|
| 384 | + * bounds checking. |
---|
| 385 | + */ |
---|
| 386 | + list_for_each_entry_safe(rg, trg, head, link) { |
---|
| 387 | + /* Skip irrelevant regions that start before our range. */ |
---|
| 388 | + if (rg->from < f) { |
---|
| 389 | + /* If this region ends after the last accounted offset, |
---|
| 390 | + * then we need to update last_accounted_offset. |
---|
| 391 | + */ |
---|
| 392 | + if (rg->to > last_accounted_offset) |
---|
| 393 | + last_accounted_offset = rg->to; |
---|
| 394 | + continue; |
---|
| 395 | + } |
---|
| 396 | + |
---|
| 397 | + /* When we find a region that starts beyond our range, we've |
---|
| 398 | + * finished. |
---|
| 399 | + */ |
---|
317 | 400 | if (rg->from > t) |
---|
318 | 401 | break; |
---|
319 | 402 | |
---|
320 | | - /* If this area reaches higher then extend our area to |
---|
321 | | - * include it completely. If this is not the first area |
---|
322 | | - * which we intend to reuse, free it. */ |
---|
323 | | - if (rg->to > t) |
---|
324 | | - t = rg->to; |
---|
325 | | - if (rg != nrg) { |
---|
326 | | - /* Decrement return value by the deleted range. |
---|
327 | | - * Another range will span this area so that by |
---|
328 | | - * end of routine add will be >= zero |
---|
329 | | - */ |
---|
330 | | - add -= (rg->to - rg->from); |
---|
331 | | - list_del(&rg->link); |
---|
332 | | - kfree(rg); |
---|
| 403 | + /* Add an entry for last_accounted_offset -> rg->from, and |
---|
| 404 | + * update last_accounted_offset. |
---|
| 405 | + */ |
---|
| 406 | + if (rg->from > last_accounted_offset) { |
---|
| 407 | + add += rg->from - last_accounted_offset; |
---|
| 408 | + if (!regions_needed) { |
---|
| 409 | + nrg = get_file_region_entry_from_cache( |
---|
| 410 | + resv, last_accounted_offset, rg->from); |
---|
| 411 | + record_hugetlb_cgroup_uncharge_info(h_cg, h, |
---|
| 412 | + resv, nrg); |
---|
| 413 | + list_add(&nrg->link, rg->link.prev); |
---|
| 414 | + coalesce_file_region(resv, nrg); |
---|
| 415 | + } else |
---|
| 416 | + *regions_needed += 1; |
---|
333 | 417 | } |
---|
| 418 | + |
---|
| 419 | + last_accounted_offset = rg->to; |
---|
334 | 420 | } |
---|
335 | 421 | |
---|
336 | | - add += (nrg->from - f); /* Added to beginning of region */ |
---|
337 | | - nrg->from = f; |
---|
338 | | - add += t - nrg->to; /* Added to end of region */ |
---|
339 | | - nrg->to = t; |
---|
| 422 | + /* Handle the case where our range extends beyond |
---|
| 423 | + * last_accounted_offset. |
---|
| 424 | + */ |
---|
| 425 | + if (last_accounted_offset < t) { |
---|
| 426 | + add += t - last_accounted_offset; |
---|
| 427 | + if (!regions_needed) { |
---|
| 428 | + nrg = get_file_region_entry_from_cache( |
---|
| 429 | + resv, last_accounted_offset, t); |
---|
| 430 | + record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg); |
---|
| 431 | + list_add(&nrg->link, rg->link.prev); |
---|
| 432 | + coalesce_file_region(resv, nrg); |
---|
| 433 | + } else |
---|
| 434 | + *regions_needed += 1; |
---|
| 435 | + } |
---|
340 | 436 | |
---|
341 | | -out_locked: |
---|
342 | | - resv->adds_in_progress--; |
---|
| 437 | + VM_BUG_ON(add < 0); |
---|
| 438 | + return add; |
---|
| 439 | +} |
---|
| 440 | + |
---|
| 441 | +/* Must be called with resv->lock acquired. Will drop lock to allocate entries. |
---|
| 442 | + */ |
---|
| 443 | +static int allocate_file_region_entries(struct resv_map *resv, |
---|
| 444 | + int regions_needed) |
---|
| 445 | + __must_hold(&resv->lock) |
---|
| 446 | +{ |
---|
| 447 | + struct list_head allocated_regions; |
---|
| 448 | + int to_allocate = 0, i = 0; |
---|
| 449 | + struct file_region *trg = NULL, *rg = NULL; |
---|
| 450 | + |
---|
| 451 | + VM_BUG_ON(regions_needed < 0); |
---|
| 452 | + |
---|
| 453 | + INIT_LIST_HEAD(&allocated_regions); |
---|
| 454 | + |
---|
| 455 | + /* |
---|
| 456 | + * Check for sufficient descriptors in the cache to accommodate |
---|
| 457 | + * the number of in progress add operations plus regions_needed. |
---|
| 458 | + * |
---|
| 459 | + * This is a while loop because when we drop the lock, some other call |
---|
| 460 | + * to region_add or region_del may have consumed some region_entries, |
---|
| 461 | + * so we keep looping here until we finally have enough entries for |
---|
| 462 | + * (adds_in_progress + regions_needed). |
---|
| 463 | + */ |
---|
| 464 | + while (resv->region_cache_count < |
---|
| 465 | + (resv->adds_in_progress + regions_needed)) { |
---|
| 466 | + to_allocate = resv->adds_in_progress + regions_needed - |
---|
| 467 | + resv->region_cache_count; |
---|
| 468 | + |
---|
| 469 | + /* At this point, we should have enough entries in the cache |
---|
| 470 | + * for all the existings adds_in_progress. We should only be |
---|
| 471 | + * needing to allocate for regions_needed. |
---|
| 472 | + */ |
---|
| 473 | + VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress); |
---|
| 474 | + |
---|
| 475 | + spin_unlock(&resv->lock); |
---|
| 476 | + for (i = 0; i < to_allocate; i++) { |
---|
| 477 | + trg = kmalloc(sizeof(*trg), GFP_KERNEL); |
---|
| 478 | + if (!trg) |
---|
| 479 | + goto out_of_memory; |
---|
| 480 | + list_add(&trg->link, &allocated_regions); |
---|
| 481 | + } |
---|
| 482 | + |
---|
| 483 | + spin_lock(&resv->lock); |
---|
| 484 | + |
---|
| 485 | + list_splice(&allocated_regions, &resv->region_cache); |
---|
| 486 | + resv->region_cache_count += to_allocate; |
---|
| 487 | + } |
---|
| 488 | + |
---|
| 489 | + return 0; |
---|
| 490 | + |
---|
| 491 | +out_of_memory: |
---|
| 492 | + list_for_each_entry_safe(rg, trg, &allocated_regions, link) { |
---|
| 493 | + list_del(&rg->link); |
---|
| 494 | + kfree(rg); |
---|
| 495 | + } |
---|
| 496 | + return -ENOMEM; |
---|
| 497 | +} |
---|
| 498 | + |
---|
| 499 | +/* |
---|
| 500 | + * Add the huge page range represented by [f, t) to the reserve |
---|
| 501 | + * map. Regions will be taken from the cache to fill in this range. |
---|
| 502 | + * Sufficient regions should exist in the cache due to the previous |
---|
| 503 | + * call to region_chg with the same range, but in some cases the cache will not |
---|
| 504 | + * have sufficient entries due to races with other code doing region_add or |
---|
| 505 | + * region_del. The extra needed entries will be allocated. |
---|
| 506 | + * |
---|
| 507 | + * regions_needed is the out value provided by a previous call to region_chg. |
---|
| 508 | + * |
---|
| 509 | + * Return the number of new huge pages added to the map. This number is greater |
---|
| 510 | + * than or equal to zero. If file_region entries needed to be allocated for |
---|
| 511 | + * this operation and we were not able to allocate, it returns -ENOMEM. |
---|
| 512 | + * region_add of regions of length 1 never allocate file_regions and cannot |
---|
| 513 | + * fail; region_chg will always allocate at least 1 entry and a region_add for |
---|
| 514 | + * 1 page will only require at most 1 entry. |
---|
| 515 | + */ |
---|
| 516 | +static long region_add(struct resv_map *resv, long f, long t, |
---|
| 517 | + long in_regions_needed, struct hstate *h, |
---|
| 518 | + struct hugetlb_cgroup *h_cg) |
---|
| 519 | +{ |
---|
| 520 | + long add = 0, actual_regions_needed = 0; |
---|
| 521 | + |
---|
| 522 | + spin_lock(&resv->lock); |
---|
| 523 | +retry: |
---|
| 524 | + |
---|
| 525 | + /* Count how many regions are actually needed to execute this add. */ |
---|
| 526 | + add_reservation_in_range(resv, f, t, NULL, NULL, |
---|
| 527 | + &actual_regions_needed); |
---|
| 528 | + |
---|
| 529 | + /* |
---|
| 530 | + * Check for sufficient descriptors in the cache to accommodate |
---|
| 531 | + * this add operation. Note that actual_regions_needed may be greater |
---|
| 532 | + * than in_regions_needed, as the resv_map may have been modified since |
---|
| 533 | + * the region_chg call. In this case, we need to make sure that we |
---|
| 534 | + * allocate extra entries, such that we have enough for all the |
---|
| 535 | + * existing adds_in_progress, plus the excess needed for this |
---|
| 536 | + * operation. |
---|
| 537 | + */ |
---|
| 538 | + if (actual_regions_needed > in_regions_needed && |
---|
| 539 | + resv->region_cache_count < |
---|
| 540 | + resv->adds_in_progress + |
---|
| 541 | + (actual_regions_needed - in_regions_needed)) { |
---|
| 542 | + /* region_add operation of range 1 should never need to |
---|
| 543 | + * allocate file_region entries. |
---|
| 544 | + */ |
---|
| 545 | + VM_BUG_ON(t - f <= 1); |
---|
| 546 | + |
---|
| 547 | + if (allocate_file_region_entries( |
---|
| 548 | + resv, actual_regions_needed - in_regions_needed)) { |
---|
| 549 | + return -ENOMEM; |
---|
| 550 | + } |
---|
| 551 | + |
---|
| 552 | + goto retry; |
---|
| 553 | + } |
---|
| 554 | + |
---|
| 555 | + add = add_reservation_in_range(resv, f, t, h_cg, h, NULL); |
---|
| 556 | + |
---|
| 557 | + resv->adds_in_progress -= in_regions_needed; |
---|
| 558 | + |
---|
343 | 559 | spin_unlock(&resv->lock); |
---|
344 | 560 | VM_BUG_ON(add < 0); |
---|
345 | 561 | return add; |
---|
.. | .. |
---|
352 | 568 | * call to region_add that will actually modify the reserve |
---|
353 | 569 | * map to add the specified range [f, t). region_chg does |
---|
354 | 570 | * not change the number of huge pages represented by the |
---|
355 | | - * map. However, if the existing regions in the map can not |
---|
356 | | - * be expanded to represent the new range, a new file_region |
---|
357 | | - * structure is added to the map as a placeholder. This is |
---|
358 | | - * so that the subsequent region_add call will have all the |
---|
359 | | - * regions it needs and will not fail. |
---|
| 571 | + * map. A number of new file_region structures is added to the cache as a |
---|
| 572 | + * placeholder, for the subsequent region_add call to use. At least 1 |
---|
| 573 | + * file_region structure is added. |
---|
360 | 574 | * |
---|
361 | | - * Upon entry, region_chg will also examine the cache of region descriptors |
---|
362 | | - * associated with the map. If there are not enough descriptors cached, one |
---|
363 | | - * will be allocated for the in progress add operation. |
---|
| 575 | + * out_regions_needed is the number of regions added to the |
---|
| 576 | + * resv->adds_in_progress. This value needs to be provided to a follow up call |
---|
| 577 | + * to region_add or region_abort for proper accounting. |
---|
364 | 578 | * |
---|
365 | 579 | * Returns the number of huge pages that need to be added to the existing |
---|
366 | 580 | * reservation map for the range [f, t). This number is greater or equal to |
---|
367 | 581 | * zero. -ENOMEM is returned if a new file_region structure or cache entry |
---|
368 | 582 | * is needed and can not be allocated. |
---|
369 | 583 | */ |
---|
370 | | -static long region_chg(struct resv_map *resv, long f, long t) |
---|
| 584 | +static long region_chg(struct resv_map *resv, long f, long t, |
---|
| 585 | + long *out_regions_needed) |
---|
371 | 586 | { |
---|
372 | | - struct list_head *head = &resv->regions; |
---|
373 | | - struct file_region *rg, *nrg = NULL; |
---|
374 | 587 | long chg = 0; |
---|
375 | 588 | |
---|
376 | | -retry: |
---|
377 | 589 | spin_lock(&resv->lock); |
---|
378 | | -retry_locked: |
---|
379 | | - resv->adds_in_progress++; |
---|
380 | 590 | |
---|
381 | | - /* |
---|
382 | | - * Check for sufficient descriptors in the cache to accommodate |
---|
383 | | - * the number of in progress add operations. |
---|
384 | | - */ |
---|
385 | | - if (resv->adds_in_progress > resv->region_cache_count) { |
---|
386 | | - struct file_region *trg; |
---|
| 591 | + /* Count how many hugepages in this range are NOT represented. */ |
---|
| 592 | + chg = add_reservation_in_range(resv, f, t, NULL, NULL, |
---|
| 593 | + out_regions_needed); |
---|
387 | 594 | |
---|
388 | | - VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1); |
---|
389 | | - /* Must drop lock to allocate a new descriptor. */ |
---|
390 | | - resv->adds_in_progress--; |
---|
391 | | - spin_unlock(&resv->lock); |
---|
| 595 | + if (*out_regions_needed == 0) |
---|
| 596 | + *out_regions_needed = 1; |
---|
392 | 597 | |
---|
393 | | - trg = kmalloc(sizeof(*trg), GFP_KERNEL); |
---|
394 | | - if (!trg) { |
---|
395 | | - kfree(nrg); |
---|
396 | | - return -ENOMEM; |
---|
397 | | - } |
---|
| 598 | + if (allocate_file_region_entries(resv, *out_regions_needed)) |
---|
| 599 | + return -ENOMEM; |
---|
398 | 600 | |
---|
399 | | - spin_lock(&resv->lock); |
---|
400 | | - list_add(&trg->link, &resv->region_cache); |
---|
401 | | - resv->region_cache_count++; |
---|
402 | | - goto retry_locked; |
---|
403 | | - } |
---|
| 601 | + resv->adds_in_progress += *out_regions_needed; |
---|
404 | 602 | |
---|
405 | | - /* Locate the region we are before or in. */ |
---|
406 | | - list_for_each_entry(rg, head, link) |
---|
407 | | - if (f <= rg->to) |
---|
408 | | - break; |
---|
409 | | - |
---|
410 | | - /* If we are below the current region then a new region is required. |
---|
411 | | - * Subtle, allocate a new region at the position but make it zero |
---|
412 | | - * size such that we can guarantee to record the reservation. */ |
---|
413 | | - if (&rg->link == head || t < rg->from) { |
---|
414 | | - if (!nrg) { |
---|
415 | | - resv->adds_in_progress--; |
---|
416 | | - spin_unlock(&resv->lock); |
---|
417 | | - nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); |
---|
418 | | - if (!nrg) |
---|
419 | | - return -ENOMEM; |
---|
420 | | - |
---|
421 | | - nrg->from = f; |
---|
422 | | - nrg->to = f; |
---|
423 | | - INIT_LIST_HEAD(&nrg->link); |
---|
424 | | - goto retry; |
---|
425 | | - } |
---|
426 | | - |
---|
427 | | - list_add(&nrg->link, rg->link.prev); |
---|
428 | | - chg = t - f; |
---|
429 | | - goto out_nrg; |
---|
430 | | - } |
---|
431 | | - |
---|
432 | | - /* Round our left edge to the current segment if it encloses us. */ |
---|
433 | | - if (f > rg->from) |
---|
434 | | - f = rg->from; |
---|
435 | | - chg = t - f; |
---|
436 | | - |
---|
437 | | - /* Check for and consume any regions we now overlap with. */ |
---|
438 | | - list_for_each_entry(rg, rg->link.prev, link) { |
---|
439 | | - if (&rg->link == head) |
---|
440 | | - break; |
---|
441 | | - if (rg->from > t) |
---|
442 | | - goto out; |
---|
443 | | - |
---|
444 | | - /* We overlap with this area, if it extends further than |
---|
445 | | - * us then we must extend ourselves. Account for its |
---|
446 | | - * existing reservation. */ |
---|
447 | | - if (rg->to > t) { |
---|
448 | | - chg += rg->to - t; |
---|
449 | | - t = rg->to; |
---|
450 | | - } |
---|
451 | | - chg -= rg->to - rg->from; |
---|
452 | | - } |
---|
453 | | - |
---|
454 | | -out: |
---|
455 | | - spin_unlock(&resv->lock); |
---|
456 | | - /* We already know we raced and no longer need the new region */ |
---|
457 | | - kfree(nrg); |
---|
458 | | - return chg; |
---|
459 | | -out_nrg: |
---|
460 | 603 | spin_unlock(&resv->lock); |
---|
461 | 604 | return chg; |
---|
462 | 605 | } |
---|
.. | .. |
---|
466 | 609 | * of the resv_map keeps track of the operations in progress between |
---|
467 | 610 | * calls to region_chg and region_add. Operations are sometimes |
---|
468 | 611 | * aborted after the call to region_chg. In such cases, region_abort |
---|
469 | | - * is called to decrement the adds_in_progress counter. |
---|
| 612 | + * is called to decrement the adds_in_progress counter. regions_needed |
---|
| 613 | + * is the value returned by the region_chg call, it is used to decrement |
---|
| 614 | + * the adds_in_progress counter. |
---|
470 | 615 | * |
---|
471 | 616 | * NOTE: The range arguments [f, t) are not needed or used in this |
---|
472 | 617 | * routine. They are kept to make reading the calling code easier as |
---|
473 | 618 | * arguments will match the associated region_chg call. |
---|
474 | 619 | */ |
---|
475 | | -static void region_abort(struct resv_map *resv, long f, long t) |
---|
| 620 | +static void region_abort(struct resv_map *resv, long f, long t, |
---|
| 621 | + long regions_needed) |
---|
476 | 622 | { |
---|
477 | 623 | spin_lock(&resv->lock); |
---|
478 | 624 | VM_BUG_ON(!resv->region_cache_count); |
---|
479 | | - resv->adds_in_progress--; |
---|
| 625 | + resv->adds_in_progress -= regions_needed; |
---|
480 | 626 | spin_unlock(&resv->lock); |
---|
481 | 627 | } |
---|
482 | 628 | |
---|
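Taken together, the region_chg()/region_add()/region_abort() changes above turn reservation-map updates into a two-phase operation: region_chg() counts the missing pages and pre-allocates cache entries, and the caller later commits with region_add() or unwinds with region_abort(), handing back the regions_needed value either way. A minimal caller sketch using only the signatures shown in this diff; resv, f, t, h, h_cg and the failure test are placeholders, not code from the patch:

	long regions_needed, chg, add;

	chg = region_chg(resv, f, t, &regions_needed);
	if (chg < 0)
		return chg;		/* no memory for cache entries */

	/* ... charge cgroups, take pages from the pool, etc. ... */

	if (later_step_failed) {
		/* undo the adds_in_progress accounting from region_chg() */
		region_abort(resv, f, t, regions_needed);
		return -ENOSPC;
	}

	/* commit; can still return -ENOMEM if extra entries were needed */
	add = region_add(resv, f, t, regions_needed, h, h_cg);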
.. | .. |
---|
540 | 686 | } |
---|
541 | 687 | |
---|
542 | 688 | del += t - f; |
---|
| 689 | + hugetlb_cgroup_uncharge_file_region( |
---|
| 690 | + resv, rg, t - f, false); |
---|
543 | 691 | |
---|
544 | 692 | /* New entry for end of split region */ |
---|
545 | 693 | nrg->from = t; |
---|
546 | 694 | nrg->to = rg->to; |
---|
| 695 | + |
---|
| 696 | + copy_hugetlb_cgroup_uncharge_info(nrg, rg); |
---|
| 697 | + |
---|
547 | 698 | INIT_LIST_HEAD(&nrg->link); |
---|
548 | 699 | |
---|
549 | 700 | /* Original entry is trimmed */ |
---|
.. | .. |
---|
556 | 707 | |
---|
557 | 708 | if (f <= rg->from && t >= rg->to) { /* Remove entire region */ |
---|
558 | 709 | del += rg->to - rg->from; |
---|
| 710 | + hugetlb_cgroup_uncharge_file_region(resv, rg, |
---|
| 711 | + rg->to - rg->from, true); |
---|
559 | 712 | list_del(&rg->link); |
---|
560 | 713 | kfree(rg); |
---|
561 | 714 | continue; |
---|
562 | 715 | } |
---|
563 | 716 | |
---|
564 | 717 | if (f <= rg->from) { /* Trim beginning of region */ |
---|
| 718 | + hugetlb_cgroup_uncharge_file_region(resv, rg, |
---|
| 719 | + t - rg->from, false); |
---|
| 720 | + |
---|
565 | 721 | del += t - rg->from; |
---|
566 | 722 | rg->from = t; |
---|
567 | 723 | } else { /* Trim end of region */ |
---|
| 724 | + hugetlb_cgroup_uncharge_file_region(resv, rg, |
---|
| 725 | + rg->to - f, false); |
---|
| 726 | + |
---|
568 | 727 | del += rg->to - f; |
---|
569 | 728 | rg->to = f; |
---|
570 | 729 | } |
---|
.. | .. |
---|
715 | 874 | vma->vm_private_data = (void *)value; |
---|
716 | 875 | } |
---|
717 | 876 | |
---|
| 877 | +static void |
---|
| 878 | +resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map, |
---|
| 879 | + struct hugetlb_cgroup *h_cg, |
---|
| 880 | + struct hstate *h) |
---|
| 881 | +{ |
---|
| 882 | +#ifdef CONFIG_CGROUP_HUGETLB |
---|
| 883 | + if (!h_cg || !h) { |
---|
| 884 | + resv_map->reservation_counter = NULL; |
---|
| 885 | + resv_map->pages_per_hpage = 0; |
---|
| 886 | + resv_map->css = NULL; |
---|
| 887 | + } else { |
---|
| 888 | + resv_map->reservation_counter = |
---|
| 889 | + &h_cg->rsvd_hugepage[hstate_index(h)]; |
---|
| 890 | + resv_map->pages_per_hpage = pages_per_huge_page(h); |
---|
| 891 | + resv_map->css = &h_cg->css; |
---|
| 892 | + } |
---|
| 893 | +#endif |
---|
| 894 | +} |
---|
| 895 | + |
---|
718 | 896 | struct resv_map *resv_map_alloc(void) |
---|
719 | 897 | { |
---|
720 | 898 | struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); |
---|
.. | .. |
---|
731 | 909 | INIT_LIST_HEAD(&resv_map->regions); |
---|
732 | 910 | |
---|
733 | 911 | resv_map->adds_in_progress = 0; |
---|
| 912 | + /* |
---|
| 913 | + * Initialize these to 0. On shared mappings, 0's here indicate these |
---|
| 914 | + * fields don't do cgroup accounting. On private mappings, these will be |
---|
| 915 | + * re-initialized to the proper values, to indicate that hugetlb cgroup |
---|
| 916 | + * reservations are to be un-charged from here. |
---|
| 917 | + */ |
---|
| 918 | + resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL); |
---|
734 | 919 | |
---|
735 | 920 | INIT_LIST_HEAD(&resv_map->region_cache); |
---|
736 | 921 | list_add(&rg->link, &resv_map->region_cache); |
---|
.. | .. |
---|
761 | 946 | |
---|
762 | 947 | static inline struct resv_map *inode_resv_map(struct inode *inode) |
---|
763 | 948 | { |
---|
764 | | - return inode->i_mapping->private_data; |
---|
| 949 | + /* |
---|
| 950 | + * At inode evict time, i_mapping may not point to the original |
---|
| 951 | + * address space within the inode. This original address space |
---|
| 952 | + * contains the pointer to the resv_map. So, always use the |
---|
| 953 | + * address space embedded within the inode. |
---|
| 954 | + * The VERY common case is inode->mapping == &inode->i_data but, |
---|
| 955 | + * this may not be true for device special inodes. |
---|
| 956 | + */ |
---|
| 957 | + return (struct resv_map *)(&inode->i_data)->private_data; |
---|
765 | 958 | } |
---|
766 | 959 | |
---|
767 | 960 | static struct resv_map *vma_resv_map(struct vm_area_struct *vma) |
---|
.. | .. |
---|
836 | 1029 | * We know VM_NORESERVE is not set. Therefore, there SHOULD |
---|
837 | 1030 | * be a region map for all pages. The only situation where |
---|
838 | 1031 | * there is no region map is if a hole was punched via |
---|
839 | | - * fallocate. In this case, there really are no reverves to |
---|
| 1032 | + * fallocate. In this case, there really are no reserves to |
---|
840 | 1033 | * use. This situation is indicated if chg != 0. |
---|
841 | 1034 | */ |
---|
842 | 1035 | if (chg) |
---|
.. | .. |
---|
886 | 1079 | static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) |
---|
887 | 1080 | { |
---|
888 | 1081 | struct page *page; |
---|
| 1082 | + bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA); |
---|
889 | 1083 | |
---|
890 | | - list_for_each_entry(page, &h->hugepage_freelists[nid], lru) |
---|
891 | | - if (!PageHWPoison(page)) |
---|
892 | | - break; |
---|
893 | | - /* |
---|
894 | | - * if 'non-isolated free hugepage' not found on the list, |
---|
895 | | - * the allocation fails. |
---|
896 | | - */ |
---|
897 | | - if (&h->hugepage_freelists[nid] == &page->lru) |
---|
898 | | - return NULL; |
---|
899 | | - list_move(&page->lru, &h->hugepage_activelist); |
---|
900 | | - set_page_refcounted(page); |
---|
901 | | - ClearPageHugeFreed(page); |
---|
902 | | - h->free_huge_pages--; |
---|
903 | | - h->free_huge_pages_node[nid]--; |
---|
904 | | - return page; |
---|
| 1084 | + list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { |
---|
| 1085 | + if (nocma && is_migrate_cma_page(page)) |
---|
| 1086 | + continue; |
---|
| 1087 | + |
---|
| 1088 | + if (PageHWPoison(page)) |
---|
| 1089 | + continue; |
---|
| 1090 | + |
---|
| 1091 | + list_move(&page->lru, &h->hugepage_activelist); |
---|
| 1092 | + set_page_refcounted(page); |
---|
| 1093 | + ClearPageHugeFreed(page); |
---|
| 1094 | + h->free_huge_pages--; |
---|
| 1095 | + h->free_huge_pages_node[nid]--; |
---|
| 1096 | + return page; |
---|
| 1097 | + } |
---|
| 1098 | + |
---|
| 1099 | + return NULL; |
---|
905 | 1100 | } |
---|
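The new nocma test keys off PF_MEMALLOC_NOCMA in current->flags, so free huge pages that live in CMA areas are skipped for callers that must not hand out CMA memory (e.g. long-term pinning paths). A hedged sketch of how such a caller scopes the flag, assuming the memalloc_nocma_save()/memalloc_nocma_restore() helpers from <linux/sched/mm.h> (now included above) that kernels of this vintage provide; the call site itself is illustrative, not part of the patch:

	unsigned int flags;

	flags = memalloc_nocma_save();	/* sets PF_MEMALLOC_NOCMA */
	/* ... any allocation that ends up in dequeue_huge_page_node_exact() ... */
	memalloc_nocma_restore(flags);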
906 | 1101 | |
---|
907 | 1102 | static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid, |
---|
.. | .. |
---|
911 | 1106 | struct zonelist *zonelist; |
---|
912 | 1107 | struct zone *zone; |
---|
913 | 1108 | struct zoneref *z; |
---|
914 | | - int node = -1; |
---|
| 1109 | + int node = NUMA_NO_NODE; |
---|
915 | 1110 | |
---|
916 | 1111 | zonelist = node_zonelist(nid, gfp_mask); |
---|
917 | 1112 | |
---|
.. | .. |
---|
938 | 1133 | goto retry_cpuset; |
---|
939 | 1134 | |
---|
940 | 1135 | return NULL; |
---|
941 | | -} |
---|
942 | | - |
---|
943 | | -/* Movability of hugepages depends on migration support. */ |
---|
944 | | -static inline gfp_t htlb_alloc_mask(struct hstate *h) |
---|
945 | | -{ |
---|
946 | | - if (hugepage_migration_supported(h)) |
---|
947 | | - return GFP_HIGHUSER_MOVABLE; |
---|
948 | | - else |
---|
949 | | - return GFP_HIGHUSER; |
---|
950 | 1136 | } |
---|
951 | 1137 | |
---|
952 | 1138 | static struct page *dequeue_huge_page_vma(struct hstate *h, |
---|
.. | .. |
---|
1068 | 1254 | struct page *p = page + 1; |
---|
1069 | 1255 | |
---|
1070 | 1256 | atomic_set(compound_mapcount_ptr(page), 0); |
---|
| 1257 | + atomic_set(compound_pincount_ptr(page), 0); |
---|
| 1258 | + |
---|
1071 | 1259 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { |
---|
1072 | 1260 | clear_compound_head(p); |
---|
1073 | 1261 | set_page_refcounted(p); |
---|
1074 | 1262 | } |
---|
1075 | 1263 | |
---|
1076 | 1264 | set_compound_order(page, 0); |
---|
| 1265 | + page[1].compound_nr = 0; |
---|
1077 | 1266 | __ClearPageHead(page); |
---|
1078 | 1267 | } |
---|
1079 | 1268 | |
---|
1080 | 1269 | static void free_gigantic_page(struct page *page, unsigned int order) |
---|
1081 | 1270 | { |
---|
| 1271 | + /* |
---|
| 1272 | + * If the page isn't allocated using the cma allocator, |
---|
| 1273 | + * cma_release() returns false. |
---|
| 1274 | + */ |
---|
| 1275 | +#ifdef CONFIG_CMA |
---|
| 1276 | + if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order)) |
---|
| 1277 | + return; |
---|
| 1278 | +#endif |
---|
| 1279 | + |
---|
1082 | 1280 | free_contig_range(page_to_pfn(page), 1 << order); |
---|
1083 | 1281 | } |
---|
1084 | 1282 | |
---|
1085 | | -static int __alloc_gigantic_page(unsigned long start_pfn, |
---|
1086 | | - unsigned long nr_pages, gfp_t gfp_mask) |
---|
1087 | | -{ |
---|
1088 | | - unsigned long end_pfn = start_pfn + nr_pages; |
---|
1089 | | - return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, |
---|
1090 | | - gfp_mask); |
---|
1091 | | -} |
---|
1092 | | - |
---|
1093 | | -static bool pfn_range_valid_gigantic(struct zone *z, |
---|
1094 | | - unsigned long start_pfn, unsigned long nr_pages) |
---|
1095 | | -{ |
---|
1096 | | - unsigned long i, end_pfn = start_pfn + nr_pages; |
---|
1097 | | - struct page *page; |
---|
1098 | | - |
---|
1099 | | - for (i = start_pfn; i < end_pfn; i++) { |
---|
1100 | | - page = pfn_to_online_page(i); |
---|
1101 | | - if (!page) |
---|
1102 | | - return false; |
---|
1103 | | - |
---|
1104 | | - if (page_zone(page) != z) |
---|
1105 | | - return false; |
---|
1106 | | - |
---|
1107 | | - if (PageReserved(page)) |
---|
1108 | | - return false; |
---|
1109 | | - |
---|
1110 | | - if (page_count(page) > 0) |
---|
1111 | | - return false; |
---|
1112 | | - |
---|
1113 | | - if (PageHuge(page)) |
---|
1114 | | - return false; |
---|
1115 | | - } |
---|
1116 | | - |
---|
1117 | | - return true; |
---|
1118 | | -} |
---|
1119 | | - |
---|
1120 | | -static bool zone_spans_last_pfn(const struct zone *zone, |
---|
1121 | | - unsigned long start_pfn, unsigned long nr_pages) |
---|
1122 | | -{ |
---|
1123 | | - unsigned long last_pfn = start_pfn + nr_pages - 1; |
---|
1124 | | - return zone_spans_pfn(zone, last_pfn); |
---|
1125 | | -} |
---|
1126 | | - |
---|
| 1283 | +#ifdef CONFIG_CONTIG_ALLOC |
---|
1127 | 1284 | static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, |
---|
1128 | 1285 | int nid, nodemask_t *nodemask) |
---|
1129 | 1286 | { |
---|
1130 | | - unsigned int order = huge_page_order(h); |
---|
1131 | | - unsigned long nr_pages = 1 << order; |
---|
1132 | | - unsigned long ret, pfn, flags; |
---|
1133 | | - struct zonelist *zonelist; |
---|
1134 | | - struct zone *zone; |
---|
1135 | | - struct zoneref *z; |
---|
| 1287 | + unsigned long nr_pages = 1UL << huge_page_order(h); |
---|
| 1288 | + if (nid == NUMA_NO_NODE) |
---|
| 1289 | + nid = numa_mem_id(); |
---|
1136 | 1290 | |
---|
1137 | | - zonelist = node_zonelist(nid, gfp_mask); |
---|
1138 | | - for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) { |
---|
1139 | | - spin_lock_irqsave(&zone->lock, flags); |
---|
| 1291 | +#ifdef CONFIG_CMA |
---|
| 1292 | + { |
---|
| 1293 | + struct page *page; |
---|
| 1294 | + int node; |
---|
1140 | 1295 | |
---|
1141 | | - pfn = ALIGN(zone->zone_start_pfn, nr_pages); |
---|
1142 | | - while (zone_spans_last_pfn(zone, pfn, nr_pages)) { |
---|
1143 | | - if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) { |
---|
1144 | | - /* |
---|
1145 | | - * We release the zone lock here because |
---|
1146 | | - * alloc_contig_range() will also lock the zone |
---|
1147 | | - * at some point. If there's an allocation |
---|
1148 | | - * spinning on this lock, it may win the race |
---|
1149 | | - * and cause alloc_contig_range() to fail... |
---|
1150 | | - */ |
---|
1151 | | - spin_unlock_irqrestore(&zone->lock, flags); |
---|
1152 | | - ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask); |
---|
1153 | | - if (!ret) |
---|
1154 | | - return pfn_to_page(pfn); |
---|
1155 | | - spin_lock_irqsave(&zone->lock, flags); |
---|
1156 | | - } |
---|
1157 | | - pfn += nr_pages; |
---|
| 1296 | + if (hugetlb_cma[nid]) { |
---|
| 1297 | + page = cma_alloc(hugetlb_cma[nid], nr_pages, |
---|
| 1298 | + huge_page_order(h), |
---|
| 1299 | + GFP_KERNEL | __GFP_NOWARN); |
---|
| 1300 | + if (page) |
---|
| 1301 | + return page; |
---|
1158 | 1302 | } |
---|
1159 | 1303 | |
---|
1160 | | - spin_unlock_irqrestore(&zone->lock, flags); |
---|
1161 | | - } |
---|
| 1304 | + if (!(gfp_mask & __GFP_THISNODE)) { |
---|
| 1305 | + for_each_node_mask(node, *nodemask) { |
---|
| 1306 | + if (node == nid || !hugetlb_cma[node]) |
---|
| 1307 | + continue; |
---|
1162 | 1308 | |
---|
1163 | | - return NULL; |
---|
| 1309 | + page = cma_alloc(hugetlb_cma[node], nr_pages, |
---|
| 1310 | + huge_page_order(h), |
---|
| 1311 | + GFP_KERNEL | __GFP_NOWARN); |
---|
| 1312 | + if (page) |
---|
| 1313 | + return page; |
---|
| 1314 | + } |
---|
| 1315 | + } |
---|
| 1316 | + } |
---|
| 1317 | +#endif |
---|
| 1318 | + |
---|
| 1319 | + return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); |
---|
1164 | 1320 | } |
---|
1165 | 1321 | |
---|
1166 | | -static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); |
---|
1167 | | -static void prep_compound_gigantic_page(struct page *page, unsigned int order); |
---|
| 1322 | +#else /* !CONFIG_CONTIG_ALLOC */ |
---|
| 1323 | +static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, |
---|
| 1324 | + int nid, nodemask_t *nodemask) |
---|
| 1325 | +{ |
---|
| 1326 | + return NULL; |
---|
| 1327 | +} |
---|
| 1328 | +#endif /* CONFIG_CONTIG_ALLOC */ |
---|
1168 | 1329 | |
---|
1169 | 1330 | #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ |
---|
1170 | | -static inline bool gigantic_page_supported(void) { return false; } |
---|
1171 | 1331 | static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, |
---|
1172 | | - int nid, nodemask_t *nodemask) { return NULL; } |
---|
| 1332 | + int nid, nodemask_t *nodemask) |
---|
| 1333 | +{ |
---|
| 1334 | + return NULL; |
---|
| 1335 | +} |
---|
1173 | 1336 | static inline void free_gigantic_page(struct page *page, unsigned int order) { } |
---|
1174 | 1337 | static inline void destroy_compound_gigantic_page(struct page *page, |
---|
1175 | 1338 | unsigned int order) { } |
---|
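The gigantic-page paths above are symmetric: allocation prefers the per-node hugetlb_cma areas and only then falls back to alloc_contig_pages(), while the free side offers the page back to CMA first, relying on cma_release() returning false for pages that did not come from CMA. Condensed from the hunks above, with nid, gfp_mask and nodemask standing for the same values as in alloc_gigantic_page() (CONFIG_CMA and CONFIG_CONTIG_ALLOC assumed enabled):

	/* allocate */
	page = cma_alloc(hugetlb_cma[nid], 1UL << huge_page_order(h),
			 huge_page_order(h), GFP_KERNEL | __GFP_NOWARN);
	if (!page)
		page = alloc_contig_pages(1UL << huge_page_order(h),
					  gfp_mask, nid, nodemask);

	/* free */
	if (!cma_release(hugetlb_cma[page_to_nid(page)], page,
			 1 << huge_page_order(h)))
		free_contig_range(page_to_pfn(page),
				  1 << huge_page_order(h));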
.. | .. |
---|
1180 | 1343 | int i; |
---|
1181 | 1344 | struct page *subpage = page; |
---|
1182 | 1345 | |
---|
1183 | | - if (hstate_is_gigantic(h) && !gigantic_page_supported()) |
---|
| 1346 | + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) |
---|
1184 | 1347 | return; |
---|
1185 | 1348 | |
---|
1186 | 1349 | h->nr_huge_pages--; |
---|
.. | .. |
---|
1193 | 1356 | 1 << PG_writeback); |
---|
1194 | 1357 | } |
---|
1195 | 1358 | VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); |
---|
| 1359 | + VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); |
---|
1196 | 1360 | set_compound_page_dtor(page, NULL_COMPOUND_DTOR); |
---|
1197 | 1361 | set_page_refcounted(page); |
---|
1198 | 1362 | if (hstate_is_gigantic(h)) { |
---|
| 1363 | + /* |
---|
| 1364 | + * Temporarily drop the hugetlb_lock, because |
---|
| 1365 | + * we might block in free_gigantic_page(). |
---|
| 1366 | + */ |
---|
| 1367 | + spin_unlock(&hugetlb_lock); |
---|
1199 | 1368 | destroy_compound_gigantic_page(page, huge_page_order(h)); |
---|
1200 | 1369 | free_gigantic_page(page, huge_page_order(h)); |
---|
| 1370 | + spin_lock(&hugetlb_lock); |
---|
1201 | 1371 | } else { |
---|
1202 | 1372 | __free_pages(page, huge_page_order(h)); |
---|
1203 | 1373 | } |
---|
.. | .. |
---|
1260 | 1430 | page[2].mapping = NULL; |
---|
1261 | 1431 | } |
---|
1262 | 1432 | |
---|
1263 | | -void free_huge_page(struct page *page) |
---|
| 1433 | +static void __free_huge_page(struct page *page) |
---|
1264 | 1434 | { |
---|
1265 | 1435 | /* |
---|
1266 | 1436 | * Can't pass hstate in here because it is called from the |
---|
.. | .. |
---|
1272 | 1442 | (struct hugepage_subpool *)page_private(page); |
---|
1273 | 1443 | bool restore_reserve; |
---|
1274 | 1444 | |
---|
1275 | | - set_page_private(page, 0); |
---|
1276 | | - page->mapping = NULL; |
---|
1277 | 1445 | VM_BUG_ON_PAGE(page_count(page), page); |
---|
1278 | 1446 | VM_BUG_ON_PAGE(page_mapcount(page), page); |
---|
| 1447 | + |
---|
| 1448 | + set_page_private(page, 0); |
---|
| 1449 | + page->mapping = NULL; |
---|
1279 | 1450 | restore_reserve = PagePrivate(page); |
---|
1280 | 1451 | ClearPagePrivate(page); |
---|
1281 | 1452 | |
---|
.. | .. |
---|
1302 | 1473 | clear_page_huge_active(page); |
---|
1303 | 1474 | hugetlb_cgroup_uncharge_page(hstate_index(h), |
---|
1304 | 1475 | pages_per_huge_page(h), page); |
---|
| 1476 | + hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), |
---|
| 1477 | + pages_per_huge_page(h), page); |
---|
1305 | 1478 | if (restore_reserve) |
---|
1306 | 1479 | h->resv_huge_pages++; |
---|
1307 | 1480 | |
---|
.. | .. |
---|
1322 | 1495 | spin_unlock(&hugetlb_lock); |
---|
1323 | 1496 | } |
---|
1324 | 1497 | |
---|
| 1498 | +/* |
---|
| 1499 | + * As free_huge_page() can be called from a non-task context, we have |
---|
| 1500 | + * to defer the actual freeing in a workqueue to prevent potential |
---|
| 1501 | + * hugetlb_lock deadlock. |
---|
| 1502 | + * |
---|
| 1503 | + * free_hpage_workfn() locklessly retrieves the linked list of pages to |
---|
| 1504 | + * be freed and frees them one-by-one. As the page->mapping pointer is |
---|
| 1505 | + * going to be cleared in __free_huge_page() anyway, it is reused as the |
---|
| 1506 | + * llist_node structure of a lockless linked list of huge pages to be freed. |
---|
| 1507 | + */ |
---|
| 1508 | +static LLIST_HEAD(hpage_freelist); |
---|
| 1509 | + |
---|
| 1510 | +static void free_hpage_workfn(struct work_struct *work) |
---|
| 1511 | +{ |
---|
| 1512 | + struct llist_node *node; |
---|
| 1513 | + struct page *page; |
---|
| 1514 | + |
---|
| 1515 | + node = llist_del_all(&hpage_freelist); |
---|
| 1516 | + |
---|
| 1517 | + while (node) { |
---|
| 1518 | + page = container_of((struct address_space **)node, |
---|
| 1519 | + struct page, mapping); |
---|
| 1520 | + node = node->next; |
---|
| 1521 | + __free_huge_page(page); |
---|
| 1522 | + } |
---|
| 1523 | +} |
---|
| 1524 | +static DECLARE_WORK(free_hpage_work, free_hpage_workfn); |
---|
| 1525 | + |
---|
| 1526 | +void free_huge_page(struct page *page) |
---|
| 1527 | +{ |
---|
| 1528 | + /* |
---|
| 1529 | + * Defer freeing if in non-task context to avoid hugetlb_lock deadlock. |
---|
| 1530 | + */ |
---|
| 1531 | + if (!in_task()) { |
---|
| 1532 | + /* |
---|
| 1533 | + * Only call schedule_work() if hpage_freelist is previously |
---|
| 1534 | + * empty. Otherwise, schedule_work() had been called but the |
---|
| 1535 | + * workfn hasn't retrieved the list yet. |
---|
| 1536 | + */ |
---|
| 1537 | + if (llist_add((struct llist_node *)&page->mapping, |
---|
| 1538 | + &hpage_freelist)) |
---|
| 1539 | + schedule_work(&free_hpage_work); |
---|
| 1540 | + return; |
---|
| 1541 | + } |
---|
| 1542 | + |
---|
| 1543 | + __free_huge_page(page); |
---|
| 1544 | +} |
---|
| 1545 | + |
---|
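The block comment above spells out the scheme: free_huge_page() may run in non-task context, where taking hugetlb_lock could deadlock, so such callers push the page onto a lockless llist, reusing page->mapping as the llist_node, and a workqueue performs the real free. The same shape works for any object with a spare pointer-sized field; a minimal sketch reusing only the APIs already seen above (my_obj and its fields are placeholders, and kfree() stands in for the real teardown):

struct my_obj {
	void *cookie;		/* doubles as the llist_node while queued */
};

static LLIST_HEAD(deferred_objs);

static void deferred_free_workfn(struct work_struct *work)
{
	struct llist_node *node = llist_del_all(&deferred_objs);

	while (node) {
		struct my_obj *obj = container_of((void **)node,
						  struct my_obj, cookie);

		node = node->next;
		kfree(obj);	/* stand-in for the real teardown */
	}
}
static DECLARE_WORK(deferred_free_work, deferred_free_workfn);

void my_free(struct my_obj *obj)
{
	if (!in_task()) {
		/* only the add that makes the list non-empty kicks the worker */
		if (llist_add((struct llist_node *)&obj->cookie,
			      &deferred_objs))
			schedule_work(&deferred_free_work);
		return;
	}
	kfree(obj);		/* task context: free directly */
}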
1325 | 1546 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) |
---|
1326 | 1547 | { |
---|
1327 | 1548 | INIT_LIST_HEAD(&page->lru); |
---|
1328 | 1549 | set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); |
---|
1329 | | - spin_lock(&hugetlb_lock); |
---|
1330 | 1550 | set_hugetlb_cgroup(page, NULL); |
---|
| 1551 | + set_hugetlb_cgroup_rsvd(page, NULL); |
---|
| 1552 | + spin_lock(&hugetlb_lock); |
---|
1331 | 1553 | h->nr_huge_pages++; |
---|
1332 | 1554 | h->nr_huge_pages_node[nid]++; |
---|
1333 | 1555 | ClearPageHugeFreed(page); |
---|
.. | .. |
---|
1349 | 1571 | * For gigantic hugepages allocated through bootmem at |
---|
1350 | 1572 | * boot, it's safer to be consistent with the not-gigantic |
---|
1351 | 1573 | * hugepages and clear the PG_reserved bit from all tail pages |
---|
1352 | | - * too. Otherwse drivers using get_user_pages() to access tail |
---|
| 1574 | + * too. Otherwise drivers using get_user_pages() to access tail |
---|
1353 | 1575 | * pages may get the reference counting wrong if they see |
---|
1354 | 1576 | * PG_reserved set on a tail page (despite the head page not |
---|
1355 | 1577 | * having PG_reserved set). Enforcing this consistency between |
---|
.. | .. |
---|
1362 | 1584 | set_compound_head(p, page); |
---|
1363 | 1585 | } |
---|
1364 | 1586 | atomic_set(compound_mapcount_ptr(page), -1); |
---|
| 1587 | + atomic_set(compound_pincount_ptr(page), 0); |
---|
1365 | 1588 | } |
---|
1366 | 1589 | |
---|
1367 | 1590 | /* |
---|
.. | .. |
---|
1388 | 1611 | if (!PageHead(page_head)) |
---|
1389 | 1612 | return 0; |
---|
1390 | 1613 | |
---|
1391 | | - return get_compound_page_dtor(page_head) == free_huge_page; |
---|
| 1614 | + return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR; |
---|
| 1615 | +} |
---|
| 1616 | + |
---|
| 1617 | +/* |
---|
| 1618 | + * Find and lock address space (mapping) in write mode. |
---|
| 1619 | + * |
---|
| 1620 | + * Upon entry, the page is locked which means that page_mapping() is |
---|
| 1621 | + * stable. Due to locking order, we can only trylock_write. If we can |
---|
| 1622 | + * not get the lock, simply return NULL to caller. |
---|
| 1623 | + */ |
---|
| 1624 | +struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) |
---|
| 1625 | +{ |
---|
| 1626 | + struct address_space *mapping = page_mapping(hpage); |
---|
| 1627 | + |
---|
| 1628 | + if (!mapping) |
---|
| 1629 | + return mapping; |
---|
| 1630 | + |
---|
| 1631 | + if (i_mmap_trylock_write(mapping)) |
---|
| 1632 | + return mapping; |
---|
| 1633 | + |
---|
| 1634 | + return NULL; |
---|
1392 | 1635 | } |
---|
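hugetlb_page_mapping_lock_write() hands back the mapping only if the i_mmap rwsem could be trylocked for write; callers must tolerate a NULL return and are responsible for dropping the lock. A usage sketch based only on that contract (hpage is assumed locked, as the comment requires; the error handling is illustrative):

	struct address_space *mapping;

	mapping = hugetlb_page_mapping_lock_write(hpage);
	if (!mapping)
		return -EBUSY;		/* or retry / fall back */

	/* ... walk or unmap under i_mmap_rwsem held for write ... */

	i_mmap_unlock_write(mapping);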
1393 | 1636 | |
---|
1394 | 1637 | pgoff_t hugetlb_basepage_index(struct page *page) |
---|
.. | .. |
---|
1406 | 1649 | } |
---|
1407 | 1650 | |
---|
1408 | 1651 | static struct page *alloc_buddy_huge_page(struct hstate *h, |
---|
1409 | | - gfp_t gfp_mask, int nid, nodemask_t *nmask) |
---|
| 1652 | + gfp_t gfp_mask, int nid, nodemask_t *nmask, |
---|
| 1653 | + nodemask_t *node_alloc_noretry) |
---|
1410 | 1654 | { |
---|
1411 | 1655 | int order = huge_page_order(h); |
---|
1412 | 1656 | struct page *page; |
---|
| 1657 | + bool alloc_try_hard = true; |
---|
1413 | 1658 | |
---|
1414 | | - gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; |
---|
| 1659 | + /* |
---|
| 1660 | + * By default we always try hard to allocate the page with |
---|
| 1661 | + * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in |
---|
| 1662 | + * a loop (to adjust global huge page counts) and previous allocation |
---|
| 1663 | + * failed, do not continue to try hard on the same node. Use the |
---|
| 1664 | + * node_alloc_noretry bitmap to manage this state information. |
---|
| 1665 | + */ |
---|
| 1666 | + if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry)) |
---|
| 1667 | + alloc_try_hard = false; |
---|
| 1668 | + gfp_mask |= __GFP_COMP|__GFP_NOWARN; |
---|
| 1669 | + if (alloc_try_hard) |
---|
| 1670 | + gfp_mask |= __GFP_RETRY_MAYFAIL; |
---|
1415 | 1671 | if (nid == NUMA_NO_NODE) |
---|
1416 | 1672 | nid = numa_mem_id(); |
---|
1417 | 1673 | page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask); |
---|
.. | .. |
---|
1419 | 1675 | __count_vm_event(HTLB_BUDDY_PGALLOC); |
---|
1420 | 1676 | else |
---|
1421 | 1677 | __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); |
---|
| 1678 | + |
---|
| 1679 | + /* |
---|
| 1680 | + * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this |
---|
| 1681 | + * indicates an overall state change. Clear bit so that we resume |
---|
| 1682 | + * normal 'try hard' allocations. |
---|
| 1683 | + */ |
---|
| 1684 | + if (node_alloc_noretry && page && !alloc_try_hard) |
---|
| 1685 | + node_clear(nid, *node_alloc_noretry); |
---|
| 1686 | + |
---|
| 1687 | + /* |
---|
| 1688 | + * If we tried hard to get a page but failed, set bit so that |
---|
| 1689 | + * subsequent attempts will not try as hard until there is an |
---|
| 1690 | + * overall state change. |
---|
| 1691 | + */ |
---|
| 1692 | + if (node_alloc_noretry && !page && alloc_try_hard) |
---|
| 1693 | + node_set(nid, *node_alloc_noretry); |
---|
1422 | 1694 | |
---|
1423 | 1695 | return page; |
---|
1424 | 1696 | } |
---|
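The node_alloc_noretry bitmap lets a caller that grows the pool in a loop skip the costly __GFP_RETRY_MAYFAIL reclaim on nodes that just failed a hard attempt. A simplified sketch of such a caller using alloc_pool_huge_page() as changed in this diff; NODEMASK_ALLOC/NODEMASK_FREE are assumed from <linux/nodemask.h>, target and nodes_allowed stand for the caller's goal and allowed nodes, and the real resize loop also juggles surplus pages and hugetlb_lock, omitted here:

	NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);

	if (!node_alloc_noretry)
		return -ENOMEM;
	nodes_clear(*node_alloc_noretry);

	while (h->nr_huge_pages < target) {
		/* failed nodes are remembered in the bitmap and not retried hard */
		if (!alloc_pool_huge_page(h, nodes_allowed, node_alloc_noretry))
			break;
	}

	NODEMASK_FREE(node_alloc_noretry);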
.. | .. |
---|
1428 | 1700 | * should use this function to get new hugetlb pages |
---|
1429 | 1701 | */ |
---|
1430 | 1702 | static struct page *alloc_fresh_huge_page(struct hstate *h, |
---|
1431 | | - gfp_t gfp_mask, int nid, nodemask_t *nmask) |
---|
| 1703 | + gfp_t gfp_mask, int nid, nodemask_t *nmask, |
---|
| 1704 | + nodemask_t *node_alloc_noretry) |
---|
1432 | 1705 | { |
---|
1433 | 1706 | struct page *page; |
---|
1434 | 1707 | |
---|
.. | .. |
---|
1436 | 1709 | page = alloc_gigantic_page(h, gfp_mask, nid, nmask); |
---|
1437 | 1710 | else |
---|
1438 | 1711 | page = alloc_buddy_huge_page(h, gfp_mask, |
---|
1439 | | - nid, nmask); |
---|
| 1712 | + nid, nmask, node_alloc_noretry); |
---|
1440 | 1713 | if (!page) |
---|
1441 | 1714 | return NULL; |
---|
1442 | 1715 | |
---|
.. | .. |
---|
1451 | 1724 | * Allocates a fresh page to the hugetlb allocator pool in the node interleaved |
---|
1452 | 1725 | * manner. |
---|
1453 | 1726 | */ |
---|
1454 | | -static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) |
---|
| 1727 | +static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, |
---|
| 1728 | + nodemask_t *node_alloc_noretry) |
---|
1455 | 1729 | { |
---|
1456 | 1730 | struct page *page; |
---|
1457 | 1731 | int nr_nodes, node; |
---|
1458 | 1732 | gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; |
---|
1459 | 1733 | |
---|
1460 | 1734 | for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { |
---|
1461 | | - page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed); |
---|
| 1735 | + page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed, |
---|
| 1736 | + node_alloc_noretry); |
---|
1462 | 1737 | if (page) |
---|
1463 | 1738 | break; |
---|
1464 | 1739 | } |
---|
.. | .. |
---|
1623 | 1898 | goto out_unlock; |
---|
1624 | 1899 | spin_unlock(&hugetlb_lock); |
---|
1625 | 1900 | |
---|
1626 | | - page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask); |
---|
| 1901 | + page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); |
---|
1627 | 1902 | if (!page) |
---|
1628 | 1903 | return NULL; |
---|
1629 | 1904 | |
---|
.. | .. |
---|
1652 | 1927 | } |
---|
1653 | 1928 | |
---|
1654 | 1929 | static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, |
---|
1655 | | - int nid, nodemask_t *nmask) |
---|
| 1930 | + int nid, nodemask_t *nmask) |
---|
1656 | 1931 | { |
---|
1657 | 1932 | struct page *page; |
---|
1658 | 1933 | |
---|
1659 | 1934 | if (hstate_is_gigantic(h)) |
---|
1660 | 1935 | return NULL; |
---|
1661 | 1936 | |
---|
1662 | | - page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask); |
---|
| 1937 | + page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); |
---|
1663 | 1938 | if (!page) |
---|
1664 | 1939 | return NULL; |
---|
1665 | 1940 | |
---|
.. | .. |
---|
1693 | 1968 | } |
---|
1694 | 1969 | |
---|
1695 | 1970 | /* page migration callback function */ |
---|
1696 | | -struct page *alloc_huge_page_node(struct hstate *h, int nid) |
---|
1697 | | -{ |
---|
1698 | | - gfp_t gfp_mask = htlb_alloc_mask(h); |
---|
1699 | | - struct page *page = NULL; |
---|
1700 | | - |
---|
1701 | | - if (nid != NUMA_NO_NODE) |
---|
1702 | | - gfp_mask |= __GFP_THISNODE; |
---|
1703 | | - |
---|
1704 | | - spin_lock(&hugetlb_lock); |
---|
1705 | | - if (h->free_huge_pages - h->resv_huge_pages > 0) |
---|
1706 | | - page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL); |
---|
1707 | | - spin_unlock(&hugetlb_lock); |
---|
1708 | | - |
---|
1709 | | - if (!page) |
---|
1710 | | - page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL); |
---|
1711 | | - |
---|
1712 | | - return page; |
---|
1713 | | -} |
---|
1714 | | - |
---|
1715 | | -/* page migration callback function */ |
---|
1716 | 1971 | struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, |
---|
1717 | | - nodemask_t *nmask) |
---|
| 1972 | + nodemask_t *nmask, gfp_t gfp_mask) |
---|
1718 | 1973 | { |
---|
1719 | | - gfp_t gfp_mask = htlb_alloc_mask(h); |
---|
1720 | | - |
---|
1721 | 1974 | spin_lock(&hugetlb_lock); |
---|
1722 | 1975 | if (h->free_huge_pages - h->resv_huge_pages > 0) { |
---|
1723 | 1976 | struct page *page; |
---|
.. | .. |
---|
1745 | 1998 | |
---|
1746 | 1999 | gfp_mask = htlb_alloc_mask(h); |
---|
1747 | 2000 | node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); |
---|
1748 | | - page = alloc_huge_page_nodemask(h, node, nodemask); |
---|
| 2001 | + page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask); |
---|
1749 | 2002 | mpol_cond_put(mpol); |
---|
1750 | 2003 | |
---|
1751 | 2004 | return page; |
---|
.. | .. |
---|
1756 | 2009 | * of size 'delta'. |
---|
1757 | 2010 | */ |
---|
1758 | 2011 | static int gather_surplus_pages(struct hstate *h, int delta) |
---|
| 2012 | + __must_hold(&hugetlb_lock) |
---|
1759 | 2013 | { |
---|
1760 | 2014 | struct list_head surplus_list; |
---|
1761 | 2015 | struct page *page, *tmp; |
---|
.. | .. |
---|
1873 | 2127 | * evenly across all nodes with memory. Iterate across these nodes |
---|
1874 | 2128 | * until we can no longer free unreserved surplus pages. This occurs |
---|
1875 | 2129 | * when the nodes with surplus pages have no free pages. |
---|
1876 | | - * free_pool_huge_page() will balance the the freed pages across the |
---|
| 2130 | + * free_pool_huge_page() will balance the freed pages across the |
---|
1877 | 2131 | * on-line nodes with memory and will handle the hstate accounting. |
---|
1878 | 2132 | * |
---|
1879 | 2133 | * Note that we decrement resv_huge_pages as we free the pages. If |
---|
.. | .. |
---|
1931 | 2185 | struct resv_map *resv; |
---|
1932 | 2186 | pgoff_t idx; |
---|
1933 | 2187 | long ret; |
---|
| 2188 | + long dummy_out_regions_needed; |
---|
1934 | 2189 | |
---|
1935 | 2190 | resv = vma_resv_map(vma); |
---|
1936 | 2191 | if (!resv) |
---|
.. | .. |
---|
1939 | 2194 | idx = vma_hugecache_offset(h, vma, addr); |
---|
1940 | 2195 | switch (mode) { |
---|
1941 | 2196 | case VMA_NEEDS_RESV: |
---|
1942 | | - ret = region_chg(resv, idx, idx + 1); |
---|
| 2197 | + ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed); |
---|
| 2198 | + /* We assume that vma_reservation_* routines always operate on |
---|
| 2199 | + * 1 page, and that adding a 1 page entry to the resv map can |
---|
| 2200 | + * only ever require 1 region. |
---|
| 2201 | + */ |
---|
| 2202 | + VM_BUG_ON(dummy_out_regions_needed != 1); |
---|
1943 | 2203 | break; |
---|
1944 | 2204 | case VMA_COMMIT_RESV: |
---|
1945 | | - ret = region_add(resv, idx, idx + 1); |
---|
| 2205 | + ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); |
---|
| 2206 | + /* region_add calls of range 1 should never fail. */ |
---|
| 2207 | + VM_BUG_ON(ret < 0); |
---|
1946 | 2208 | break; |
---|
1947 | 2209 | case VMA_END_RESV: |
---|
1948 | | - region_abort(resv, idx, idx + 1); |
---|
| 2210 | + region_abort(resv, idx, idx + 1, 1); |
---|
1949 | 2211 | ret = 0; |
---|
1950 | 2212 | break; |
---|
1951 | 2213 | case VMA_ADD_RESV: |
---|
1952 | | - if (vma->vm_flags & VM_MAYSHARE) |
---|
1953 | | - ret = region_add(resv, idx, idx + 1); |
---|
1954 | | - else { |
---|
1955 | | - region_abort(resv, idx, idx + 1); |
---|
| 2214 | + if (vma->vm_flags & VM_MAYSHARE) { |
---|
| 2215 | + ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); |
---|
| 2216 | + /* region_add calls of range 1 should never fail. */ |
---|
| 2217 | + VM_BUG_ON(ret < 0); |
---|
| 2218 | + } else { |
---|
| 2219 | + region_abort(resv, idx, idx + 1, 1); |
---|
1956 | 2220 | ret = region_del(resv, idx, idx + 1); |
---|
1957 | 2221 | } |
---|
1958 | 2222 | break; |
---|
.. | .. |
---|
2063 | 2327 | long gbl_chg; |
---|
2064 | 2328 | int ret, idx; |
---|
2065 | 2329 | struct hugetlb_cgroup *h_cg; |
---|
| 2330 | + bool deferred_reserve; |
---|
2066 | 2331 | |
---|
2067 | 2332 | idx = hstate_index(h); |
---|
2068 | 2333 | /* |
---|
.. | .. |
---|
2100 | 2365 | gbl_chg = 1; |
---|
2101 | 2366 | } |
---|
2102 | 2367 | |
---|
| 2368 | + /* If this allocation is not consuming a reservation, charge it now. |
---|
| 2369 | + */ |
---|
| 2370 | + deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma); |
---|
| 2371 | + if (deferred_reserve) { |
---|
| 2372 | + ret = hugetlb_cgroup_charge_cgroup_rsvd( |
---|
| 2373 | + idx, pages_per_huge_page(h), &h_cg); |
---|
| 2374 | + if (ret) |
---|
| 2375 | + goto out_subpool_put; |
---|
| 2376 | + } |
---|
| 2377 | + |
---|
2103 | 2378 | ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); |
---|
2104 | 2379 | if (ret) |
---|
2105 | | - goto out_subpool_put; |
---|
| 2380 | + goto out_uncharge_cgroup_reservation; |
---|
2106 | 2381 | |
---|
2107 | 2382 | spin_lock(&hugetlb_lock); |
---|
2108 | 2383 | /* |
---|
.. | .. |
---|
2116 | 2391 | page = alloc_buddy_huge_page_with_mpol(h, vma, addr); |
---|
2117 | 2392 | if (!page) |
---|
2118 | 2393 | goto out_uncharge_cgroup; |
---|
| 2394 | + spin_lock(&hugetlb_lock); |
---|
2119 | 2395 | if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { |
---|
2120 | 2396 | SetPagePrivate(page); |
---|
2121 | 2397 | h->resv_huge_pages--; |
---|
2122 | 2398 | } |
---|
2123 | | - spin_lock(&hugetlb_lock); |
---|
2124 | | - list_move(&page->lru, &h->hugepage_activelist); |
---|
| 2399 | + list_add(&page->lru, &h->hugepage_activelist); |
---|
2125 | 2400 | /* Fall through */ |
---|
2126 | 2401 | } |
---|
2127 | 2402 | hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); |
---|
| 2403 | + /* If allocation is not consuming a reservation, also store the |
---|
| 2404 | + * hugetlb_cgroup pointer on the page. |
---|
| 2405 | + */ |
---|
| 2406 | + if (deferred_reserve) { |
---|
| 2407 | + hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), |
---|
| 2408 | + h_cg, page); |
---|
| 2409 | + } |
---|
| 2410 | + |
---|
2128 | 2411 | spin_unlock(&hugetlb_lock); |
---|
2129 | 2412 | |
---|
2130 | 2413 | set_page_private(page, (unsigned long)spool); |
---|
.. | .. |
---|
2144 | 2427 | |
---|
2145 | 2428 | rsv_adjust = hugepage_subpool_put_pages(spool, 1); |
---|
2146 | 2429 | hugetlb_acct_memory(h, -rsv_adjust); |
---|
| 2430 | + if (deferred_reserve) |
---|
| 2431 | + hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), |
---|
| 2432 | + pages_per_huge_page(h), page); |
---|
2147 | 2433 | } |
---|
2148 | 2434 | return page; |
---|
2149 | 2435 | |
---|
2150 | 2436 | out_uncharge_cgroup: |
---|
2151 | 2437 | hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); |
---|
| 2438 | +out_uncharge_cgroup_reservation: |
---|
| 2439 | + if (deferred_reserve) |
---|
| 2440 | + hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), |
---|
| 2441 | + h_cg); |
---|
2152 | 2442 | out_subpool_put: |
---|
2153 | 2443 | if (map_chg || avoid_reserve) |
---|
2154 | 2444 | hugepage_subpool_put_pages(spool, 1); |
---|
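
The new out_uncharge_cgroup_reservation label slots into the existing reverse-order unwind: each acquisition gets an undo label, and a failure jumps to the label of the previously acquired resource. A stripped-down sketch of the pattern with stub charge functions (all names are hypothetical; the stubs simply force one unwind path):

#include <stdbool.h>
#include <stdio.h>

static bool take_subpool(void)  { return true; }   /* stub: pretend success */
static void put_subpool(void)   { puts("put subpool"); }
static bool charge_rsvd(void)   { return true; }   /* stub: pretend success */
static void uncharge_rsvd(void) { puts("uncharge rsvd"); }
static bool charge_usage(void)  { return false; }  /* stub: force the unwind path */

static int allocate_with_unwind(bool deferred_reserve)
{
	if (!take_subpool())
		return -1;
	if (deferred_reserve && !charge_rsvd())
		goto out_subpool_put;
	if (!charge_usage())
		goto out_uncharge_rsvd;
	return 0;

out_uncharge_rsvd:
	if (deferred_reserve)
		uncharge_rsvd();	/* undo only what was actually charged */
out_subpool_put:
	put_subpool();
	return -1;
}

int main(void)
{
	allocate_with_unwind(true);
	return 0;
}
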
.. | .. |
---|
2166 | 2456 | for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { |
---|
2167 | 2457 | void *addr; |
---|
2168 | 2458 | |
---|
2169 | | - addr = memblock_virt_alloc_try_nid_raw( |
---|
| 2459 | + addr = memblock_alloc_try_nid_raw( |
---|
2170 | 2460 | huge_page_size(h), huge_page_size(h), |
---|
2171 | | - 0, BOOTMEM_ALLOC_ACCESSIBLE, node); |
---|
| 2461 | + 0, MEMBLOCK_ALLOC_ACCESSIBLE, node); |
---|
2172 | 2462 | if (addr) { |
---|
2173 | 2463 | /* |
---|
2174 | 2464 | * Use the beginning of the huge page to store the |
---|
.. | .. |
---|
2190 | 2480 | return 1; |
---|
2191 | 2481 | } |
---|
2192 | 2482 | |
---|
2193 | | -static void __init prep_compound_huge_page(struct page *page, |
---|
2194 | | - unsigned int order) |
---|
2195 | | -{ |
---|
2196 | | - if (unlikely(order > (MAX_ORDER - 1))) |
---|
2197 | | - prep_compound_gigantic_page(page, order); |
---|
2198 | | - else |
---|
2199 | | - prep_compound_page(page, order); |
---|
2200 | | -} |
---|
2201 | | - |
---|
2202 | | -/* Put bootmem huge pages into the standard lists after mem_map is up */ |
---|
| 2483 | +/* |
---|
| 2484 | + * Put bootmem huge pages into the standard lists after mem_map is up. |
---|
| 2485 | + * Note: This only applies to gigantic (order >= MAX_ORDER) pages. |
---|
| 2486 | + */ |
---|
2203 | 2487 | static void __init gather_bootmem_prealloc(void) |
---|
2204 | 2488 | { |
---|
2205 | 2489 | struct huge_bootmem_page *m; |
---|
.. | .. |
---|
2208 | 2492 | struct page *page = virt_to_page(m); |
---|
2209 | 2493 | struct hstate *h = m->hstate; |
---|
2210 | 2494 | |
---|
| 2495 | + VM_BUG_ON(!hstate_is_gigantic(h)); |
---|
2211 | 2496 | WARN_ON(page_count(page) != 1); |
---|
2212 | | - prep_compound_huge_page(page, h->order); |
---|
| 2497 | + prep_compound_gigantic_page(page, huge_page_order(h)); |
---|
2213 | 2498 | WARN_ON(PageReserved(page)); |
---|
2214 | 2499 | prep_new_huge_page(h, page, page_to_nid(page)); |
---|
2215 | 2500 | put_page(page); /* free it into the hugepage allocator */ |
---|
2216 | 2501 | |
---|
2217 | 2502 | /* |
---|
2218 | | - * If we had gigantic hugepages allocated at boot time, we need |
---|
2219 | | - * to restore the 'stolen' pages to totalram_pages in order to |
---|
2220 | | - * fix confusing memory reports from free(1) and another |
---|
2221 | | - * side-effects, like CommitLimit going negative. |
---|
| 2503 | + * We need to restore the 'stolen' pages to totalram_pages |
---|
| 2504 | + * in order to fix confusing memory reports from free(1) and |
---|
| 2505 | + * other side-effects, like CommitLimit going negative. |
---|
2222 | 2506 | */ |
---|
2223 | | - if (hstate_is_gigantic(h)) |
---|
2224 | | - adjust_managed_page_count(page, 1 << h->order); |
---|
| 2507 | + adjust_managed_page_count(page, pages_per_huge_page(h)); |
---|
2225 | 2508 | cond_resched(); |
---|
2226 | 2509 | } |
---|
2227 | 2510 | } |
---|
.. | .. |
---|
2229 | 2512 | static void __init hugetlb_hstate_alloc_pages(struct hstate *h) |
---|
2230 | 2513 | { |
---|
2231 | 2514 | unsigned long i; |
---|
| 2515 | + nodemask_t *node_alloc_noretry; |
---|
| 2516 | + |
---|
| 2517 | + if (!hstate_is_gigantic(h)) { |
---|
| 2518 | + /* |
---|
| 2519 | + * Bit mask controlling how hard we retry per-node allocations. |
---|
| 2520 | + * Ignore errors as lower level routines can deal with |
---|
| 2521 | + * node_alloc_noretry == NULL. If this kmalloc fails at boot |
---|
| 2522 | + * time, we are likely in bigger trouble. |
---|
| 2523 | + */ |
---|
| 2524 | + node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry), |
---|
| 2525 | + GFP_KERNEL); |
---|
| 2526 | + } else { |
---|
| 2527 | + /* allocations done at boot time */ |
---|
| 2528 | + node_alloc_noretry = NULL; |
---|
| 2529 | + } |
---|
| 2530 | + |
---|
| 2531 | + /* bit mask controlling how hard we retry per-node allocations */ |
---|
| 2532 | + if (node_alloc_noretry) |
---|
| 2533 | + nodes_clear(*node_alloc_noretry); |
---|
2232 | 2534 | |
---|
2233 | 2535 | for (i = 0; i < h->max_huge_pages; ++i) { |
---|
2234 | 2536 | if (hstate_is_gigantic(h)) { |
---|
| 2537 | + if (hugetlb_cma_size) { |
---|
| 2538 | + pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n"); |
---|
| 2539 | + goto free; |
---|
| 2540 | + } |
---|
2235 | 2541 | if (!alloc_bootmem_huge_page(h)) |
---|
2236 | 2542 | break; |
---|
2237 | 2543 | } else if (!alloc_pool_huge_page(h, |
---|
2238 | | - &node_states[N_MEMORY])) |
---|
| 2544 | + &node_states[N_MEMORY], |
---|
| 2545 | + node_alloc_noretry)) |
---|
2239 | 2546 | break; |
---|
2240 | 2547 | cond_resched(); |
---|
2241 | 2548 | } |
---|
.. | .. |
---|
2247 | 2554 | h->max_huge_pages, buf, i); |
---|
2248 | 2555 | h->max_huge_pages = i; |
---|
2249 | 2556 | } |
---|
| 2557 | +free: |
---|
| 2558 | + kfree(node_alloc_noretry); |
---|
2250 | 2559 | } |
---|
2251 | 2560 | |
---|
2252 | 2561 | static void __init hugetlb_init_hstates(void) |
---|
.. | .. |
---|
2341 | 2650 | } |
---|
2342 | 2651 | |
---|
2343 | 2652 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) |
---|
2344 | | -static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, |
---|
2345 | | - nodemask_t *nodes_allowed) |
---|
| 2653 | +static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, |
---|
| 2654 | + nodemask_t *nodes_allowed) |
---|
2346 | 2655 | { |
---|
2347 | 2656 | unsigned long min_count, ret; |
---|
| 2657 | + NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); |
---|
2348 | 2658 | |
---|
2349 | | - if (hstate_is_gigantic(h) && !gigantic_page_supported()) |
---|
2350 | | - return h->max_huge_pages; |
---|
| 2659 | + /* |
---|
| 2660 | + * Bit mask controlling how hard we retry per-node allocations. |
---|
| 2661 | + * If we can not allocate the bit mask, do not attempt to allocate |
---|
| 2662 | + * the requested huge pages. |
---|
| 2663 | + */ |
---|
| 2664 | + if (node_alloc_noretry) |
---|
| 2665 | + nodes_clear(*node_alloc_noretry); |
---|
| 2666 | + else |
---|
| 2667 | + return -ENOMEM; |
---|
| 2668 | + |
---|
| 2669 | + spin_lock(&hugetlb_lock); |
---|
| 2670 | + |
---|
| 2671 | + /* |
---|
| 2672 | + * Check for a node specific request. |
---|
| 2673 | + * Changing node specific huge page count may require a corresponding |
---|
| 2674 | + * change to the global count. In any case, the passed node mask |
---|
| 2675 | + * (nodes_allowed) will restrict alloc/free to the specified node. |
---|
| 2676 | + */ |
---|
| 2677 | + if (nid != NUMA_NO_NODE) { |
---|
| 2678 | + unsigned long old_count = count; |
---|
| 2679 | + |
---|
| 2680 | + count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; |
---|
| 2681 | + /* |
---|
| 2682 | + * User may have specified a large count value which caused the |
---|
| 2683 | + * above calculation to overflow. In this case, they wanted |
---|
| 2684 | + * to allocate as many huge pages as possible. Set count to |
---|
| 2685 | + * largest possible value to align with their intention. |
---|
| 2686 | + */ |
---|
| 2687 | + if (count < old_count) |
---|
| 2688 | + count = ULONG_MAX; |
---|
| 2689 | + } |
---|
| 2690 | + |
---|
| 2691 | + /* |
---|
| 2692 | + * Runtime allocation of gigantic pages depends on the capability for |
---|
| 2693 | + * large page range allocation. |
---|
| 2694 | + * If the system does not provide this feature, return an error when |
---|
| 2695 | + * the user tries to allocate gigantic pages but let the user free the |
---|
| 2696 | + * boottime allocated gigantic pages. |
---|
| 2697 | + */ |
---|
| 2698 | + if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { |
---|
| 2699 | + if (count > persistent_huge_pages(h)) { |
---|
| 2700 | + spin_unlock(&hugetlb_lock); |
---|
| 2701 | + NODEMASK_FREE(node_alloc_noretry); |
---|
| 2702 | + return -EINVAL; |
---|
| 2703 | + } |
---|
| 2704 | + /* Fall through to decrease pool */ |
---|
| 2705 | + } |
---|
2351 | 2706 | |
---|
2352 | 2707 | /* |
---|
2353 | 2708 | * Increase the pool size |
---|
.. | .. |
---|
2360 | 2715 | * pool might be one hugepage larger than it needs to be, but |
---|
2361 | 2716 | * within all the constraints specified by the sysctls. |
---|
2362 | 2717 | */ |
---|
2363 | | - spin_lock(&hugetlb_lock); |
---|
2364 | 2718 | while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { |
---|
2365 | 2719 | if (!adjust_pool_surplus(h, nodes_allowed, -1)) |
---|
2366 | 2720 | break; |
---|
.. | .. |
---|
2377 | 2731 | /* yield cpu to avoid soft lockup */ |
---|
2378 | 2732 | cond_resched(); |
---|
2379 | 2733 | |
---|
2380 | | - ret = alloc_pool_huge_page(h, nodes_allowed); |
---|
| 2734 | + ret = alloc_pool_huge_page(h, nodes_allowed, |
---|
| 2735 | + node_alloc_noretry); |
---|
2381 | 2736 | spin_lock(&hugetlb_lock); |
---|
2382 | 2737 | if (!ret) |
---|
2383 | 2738 | goto out; |
---|
.. | .. |
---|
2415 | 2770 | break; |
---|
2416 | 2771 | } |
---|
2417 | 2772 | out: |
---|
2418 | | - ret = persistent_huge_pages(h); |
---|
| 2773 | + h->max_huge_pages = persistent_huge_pages(h); |
---|
2419 | 2774 | spin_unlock(&hugetlb_lock); |
---|
2420 | | - return ret; |
---|
| 2775 | + |
---|
| 2776 | + NODEMASK_FREE(node_alloc_noretry); |
---|
| 2777 | + |
---|
| 2778 | + return 0; |
---|
2421 | 2779 | } |
---|
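
For a node-specific request, the count adjustment above converts the per-node target into a global one and relies on an unsigned wrap-around check to catch overflow, treating a wrapped value as a request for as many pages as possible. The arithmetic in isolation (plain C, names chosen purely for the example):

#include <limits.h>
#include <stdio.h>

/* Convert a per-node target into a global target, clamping on unsigned overflow. */
static unsigned long global_target(unsigned long node_target,
				   unsigned long global_pages,
				   unsigned long node_pages)
{
	unsigned long count = node_target + (global_pages - node_pages);

	if (count < node_target)	/* wrapped: caller wanted "as many as possible" */
		count = ULONG_MAX;
	return count;
}

int main(void)
{
	/* Request 100 on this node; 300 pages live on other nodes -> global target 400. */
	printf("%lu\n", global_target(100, 500, 200));
	/* A huge request overflows and is clamped to ULONG_MAX. */
	printf("%lu\n", global_target(ULONG_MAX - 1, 500, 200));
	return 0;
}
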
2422 | 2780 | |
---|
2423 | 2781 | #define HSTATE_ATTR_RO(_name) \ |
---|
.. | .. |
---|
2467 | 2825 | unsigned long count, size_t len) |
---|
2468 | 2826 | { |
---|
2469 | 2827 | int err; |
---|
2470 | | - NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); |
---|
| 2828 | + nodemask_t nodes_allowed, *n_mask; |
---|
2471 | 2829 | |
---|
2472 | | - if (hstate_is_gigantic(h) && !gigantic_page_supported()) { |
---|
2473 | | - err = -EINVAL; |
---|
2474 | | - goto out; |
---|
2475 | | - } |
---|
| 2830 | + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) |
---|
| 2831 | + return -EINVAL; |
---|
2476 | 2832 | |
---|
2477 | 2833 | if (nid == NUMA_NO_NODE) { |
---|
2478 | 2834 | /* |
---|
2479 | 2835 | * global hstate attribute |
---|
2480 | 2836 | */ |
---|
2481 | 2837 | if (!(obey_mempolicy && |
---|
2482 | | - init_nodemask_of_mempolicy(nodes_allowed))) { |
---|
2483 | | - NODEMASK_FREE(nodes_allowed); |
---|
2484 | | - nodes_allowed = &node_states[N_MEMORY]; |
---|
2485 | | - } |
---|
2486 | | - } else if (nodes_allowed) { |
---|
| 2838 | + init_nodemask_of_mempolicy(&nodes_allowed))) |
---|
| 2839 | + n_mask = &node_states[N_MEMORY]; |
---|
| 2840 | + else |
---|
| 2841 | + n_mask = &nodes_allowed; |
---|
| 2842 | + } else { |
---|
2487 | 2843 | /* |
---|
2488 | | - * per node hstate attribute: adjust count to global, |
---|
2489 | | - * but restrict alloc/free to the specified node. |
---|
| 2844 | + * Node specific request. count adjustment happens in |
---|
| 2845 | + * set_max_huge_pages() after acquiring hugetlb_lock. |
---|
2490 | 2846 | */ |
---|
2491 | | - count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; |
---|
2492 | | - init_nodemask_of_node(nodes_allowed, nid); |
---|
2493 | | - } else |
---|
2494 | | - nodes_allowed = &node_states[N_MEMORY]; |
---|
| 2847 | + init_nodemask_of_node(&nodes_allowed, nid); |
---|
| 2848 | + n_mask = &nodes_allowed; |
---|
| 2849 | + } |
---|
2495 | 2850 | |
---|
2496 | | - h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); |
---|
| 2851 | + err = set_max_huge_pages(h, count, nid, n_mask); |
---|
2497 | 2852 | |
---|
2498 | | - if (nodes_allowed != &node_states[N_MEMORY]) |
---|
2499 | | - NODEMASK_FREE(nodes_allowed); |
---|
2500 | | - |
---|
2501 | | - return len; |
---|
2502 | | -out: |
---|
2503 | | - NODEMASK_FREE(nodes_allowed); |
---|
2504 | | - return err; |
---|
| 2853 | + return err ? err : len; |
---|
2505 | 2854 | } |
---|
2506 | 2855 | |
---|
2507 | 2856 | static ssize_t nr_hugepages_store_common(bool obey_mempolicy, |
---|
.. | .. |
---|
2675 | 3024 | err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, |
---|
2676 | 3025 | hstate_kobjs, &hstate_attr_group); |
---|
2677 | 3026 | if (err) |
---|
2678 | | - pr_err("Hugetlb: Unable to add hstate %s", h->name); |
---|
| 3027 | + pr_err("HugeTLB: Unable to add hstate %s", h->name); |
---|
2679 | 3028 | } |
---|
2680 | 3029 | } |
---|
2681 | 3030 | |
---|
.. | .. |
---|
2779 | 3128 | nhs->hstate_kobjs, |
---|
2780 | 3129 | &per_node_hstate_attr_group); |
---|
2781 | 3130 | if (err) { |
---|
2782 | | - pr_err("Hugetlb: Unable to add hstate %s for node %d\n", |
---|
| 3131 | + pr_err("HugeTLB: Unable to add hstate %s for node %d\n", |
---|
2783 | 3132 | h->name, node->dev.id); |
---|
2784 | 3133 | hugetlb_unregister_node(node); |
---|
2785 | 3134 | break; |
---|
.. | .. |
---|
2827 | 3176 | { |
---|
2828 | 3177 | int i; |
---|
2829 | 3178 | |
---|
2830 | | - if (!hugepages_supported()) |
---|
| 3179 | + if (!hugepages_supported()) { |
---|
| 3180 | + if (hugetlb_max_hstate || default_hstate_max_huge_pages) |
---|
| 3181 | + pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n"); |
---|
2831 | 3182 | return 0; |
---|
| 3183 | + } |
---|
2832 | 3184 | |
---|
2833 | | - if (!size_to_hstate(default_hstate_size)) { |
---|
2834 | | - if (default_hstate_size != 0) { |
---|
2835 | | - pr_err("HugeTLB: unsupported default_hugepagesz %lu. Reverting to %lu\n", |
---|
2836 | | - default_hstate_size, HPAGE_SIZE); |
---|
| 3185 | + /* |
---|
| 3186 | + * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some |
---|
| 3187 | + * architectures depend on setup being done here. |
---|
| 3188 | + */ |
---|
| 3189 | + hugetlb_add_hstate(HUGETLB_PAGE_ORDER); |
---|
| 3190 | + if (!parsed_default_hugepagesz) { |
---|
| 3191 | + /* |
---|
| 3192 | + * If we did not parse a default huge page size, set |
---|
| 3193 | + * default_hstate_idx to HPAGE_SIZE hstate. And, if the |
---|
| 3194 | + * number of huge pages for this default size was implicitly |
---|
| 3195 | + * specified, set that here as well. |
---|
| 3196 | + * Note that the implicit setting will overwrite an explicit |
---|
| 3197 | + * setting. A warning will be printed in this case. |
---|
| 3198 | + */ |
---|
| 3199 | + default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE)); |
---|
| 3200 | + if (default_hstate_max_huge_pages) { |
---|
| 3201 | + if (default_hstate.max_huge_pages) { |
---|
| 3202 | + char buf[32]; |
---|
| 3203 | + |
---|
| 3204 | + string_get_size(huge_page_size(&default_hstate), |
---|
| 3205 | + 1, STRING_UNITS_2, buf, 32); |
---|
| 3206 | + pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n", |
---|
| 3207 | + default_hstate.max_huge_pages, buf); |
---|
| 3208 | + pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n", |
---|
| 3209 | + default_hstate_max_huge_pages); |
---|
| 3210 | + } |
---|
| 3211 | + default_hstate.max_huge_pages = |
---|
| 3212 | + default_hstate_max_huge_pages; |
---|
2837 | 3213 | } |
---|
2838 | | - |
---|
2839 | | - default_hstate_size = HPAGE_SIZE; |
---|
2840 | | - if (!size_to_hstate(default_hstate_size)) |
---|
2841 | | - hugetlb_add_hstate(HUGETLB_PAGE_ORDER); |
---|
2842 | | - } |
---|
2843 | | - default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size)); |
---|
2844 | | - if (default_hstate_max_huge_pages) { |
---|
2845 | | - if (!default_hstate.max_huge_pages) |
---|
2846 | | - default_hstate.max_huge_pages = default_hstate_max_huge_pages; |
---|
2847 | 3214 | } |
---|
2848 | 3215 | |
---|
| 3216 | + hugetlb_cma_check(); |
---|
2849 | 3217 | hugetlb_init_hstates(); |
---|
2850 | 3218 | gather_bootmem_prealloc(); |
---|
2851 | 3219 | report_hugepages(); |
---|
.. | .. |
---|
2870 | 3238 | } |
---|
2871 | 3239 | subsys_initcall(hugetlb_init); |
---|
2872 | 3240 | |
---|
2873 | | -/* Should be called on processing a hugepagesz=... option */ |
---|
2874 | | -void __init hugetlb_bad_size(void) |
---|
| 3241 | +/* Overwritten by architectures with more huge page sizes */ |
---|
| 3242 | +bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size) |
---|
2875 | 3243 | { |
---|
2876 | | - parsed_valid_hugepagesz = false; |
---|
| 3244 | + return size == HPAGE_SIZE; |
---|
2877 | 3245 | } |
---|
2878 | 3246 | |
---|
2879 | 3247 | void __init hugetlb_add_hstate(unsigned int order) |
---|
.. | .. |
---|
2882 | 3250 | unsigned long i; |
---|
2883 | 3251 | |
---|
2884 | 3252 | if (size_to_hstate(PAGE_SIZE << order)) { |
---|
2885 | | - pr_warn("hugepagesz= specified twice, ignoring\n"); |
---|
2886 | 3253 | return; |
---|
2887 | 3254 | } |
---|
2888 | 3255 | BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); |
---|
.. | .. |
---|
2903 | 3270 | parsed_hstate = h; |
---|
2904 | 3271 | } |
---|
2905 | 3272 | |
---|
2906 | | -static int __init hugetlb_nrpages_setup(char *s) |
---|
| 3273 | +/* |
---|
| 3274 | + * hugepages command line processing |
---|
| 3275 | + * hugepages normally follows a valid hugepagsz or default_hugepagsz |
---|
| 3276 | + * specification. If not, ignore the hugepages value. hugepages can also |
---|
| 3277 | + * be the first huge page command line option in which case it implicitly |
---|
| 3278 | + * specifies the number of huge pages for the default size. |
---|
| 3279 | + */ |
---|
| 3280 | +static int __init hugepages_setup(char *s) |
---|
2907 | 3281 | { |
---|
2908 | 3282 | unsigned long *mhp; |
---|
2909 | 3283 | static unsigned long *last_mhp; |
---|
2910 | 3284 | |
---|
2911 | 3285 | if (!parsed_valid_hugepagesz) { |
---|
2912 | | - pr_warn("hugepages = %s preceded by " |
---|
2913 | | - "an unsupported hugepagesz, ignoring\n", s); |
---|
| 3286 | + pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s); |
---|
2914 | 3287 | parsed_valid_hugepagesz = true; |
---|
2915 | | - return 1; |
---|
| 3288 | + return 0; |
---|
2916 | 3289 | } |
---|
| 3290 | + |
---|
2917 | 3291 | /* |
---|
2918 | | - * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet, |
---|
2919 | | - * so this hugepages= parameter goes to the "default hstate". |
---|
| 3292 | + * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter |
---|
| 3293 | + * yet, so this hugepages= parameter goes to the "default hstate". |
---|
| 3294 | + * Otherwise, it goes with the previously parsed hugepagesz or |
---|
| 3295 | + * default_hugepagesz. |
---|
2920 | 3296 | */ |
---|
2921 | 3297 | else if (!hugetlb_max_hstate) |
---|
2922 | 3298 | mhp = &default_hstate_max_huge_pages; |
---|
.. | .. |
---|
2924 | 3300 | mhp = &parsed_hstate->max_huge_pages; |
---|
2925 | 3301 | |
---|
2926 | 3302 | if (mhp == last_mhp) { |
---|
2927 | | - pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n"); |
---|
2928 | | - return 1; |
---|
| 3303 | + pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s); |
---|
| 3304 | + return 0; |
---|
2929 | 3305 | } |
---|
2930 | 3306 | |
---|
2931 | 3307 | if (sscanf(s, "%lu", mhp) <= 0) |
---|
.. | .. |
---|
2943 | 3319 | |
---|
2944 | 3320 | return 1; |
---|
2945 | 3321 | } |
---|
2946 | | -__setup("hugepages=", hugetlb_nrpages_setup); |
---|
| 3322 | +__setup("hugepages=", hugepages_setup); |
---|
2947 | 3323 | |
---|
2948 | | -static int __init hugetlb_default_setup(char *s) |
---|
| 3324 | +/* |
---|
| 3325 | + * hugepagesz command line processing |
---|
| 3326 | + * A specific huge page size can only be specified once with hugepagesz. |
---|
| 3327 | + * hugepagesz is followed by hugepages on the command line. The global |
---|
| 3328 | + * variable 'parsed_valid_hugepagesz' is used to determine if the prior |
---|
| 3329 | + * hugepagesz argument was valid. |
---|
| 3330 | + */ |
---|
| 3331 | +static int __init hugepagesz_setup(char *s) |
---|
2949 | 3332 | { |
---|
2950 | | - default_hstate_size = memparse(s, &s); |
---|
| 3333 | + unsigned long size; |
---|
| 3334 | + struct hstate *h; |
---|
| 3335 | + |
---|
| 3336 | + parsed_valid_hugepagesz = false; |
---|
| 3337 | + size = (unsigned long)memparse(s, NULL); |
---|
| 3338 | + |
---|
| 3339 | + if (!arch_hugetlb_valid_size(size)) { |
---|
| 3340 | + pr_err("HugeTLB: unsupported hugepagesz=%s\n", s); |
---|
| 3341 | + return 0; |
---|
| 3342 | + } |
---|
| 3343 | + |
---|
| 3344 | + h = size_to_hstate(size); |
---|
| 3345 | + if (h) { |
---|
| 3346 | + /* |
---|
| 3347 | + * hstate for this size already exists. This is normally |
---|
| 3348 | + * an error, but is allowed if the existing hstate is the |
---|
| 3349 | + * default hstate. More specifically, it is only allowed if |
---|
| 3350 | + * the number of huge pages for the default hstate was not |
---|
| 3351 | + * previously specified. |
---|
| 3352 | + */ |
---|
| 3353 | + if (!parsed_default_hugepagesz || h != &default_hstate || |
---|
| 3354 | + default_hstate.max_huge_pages) { |
---|
| 3355 | + pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s); |
---|
| 3356 | + return 0; |
---|
| 3357 | + } |
---|
| 3358 | + |
---|
| 3359 | + /* |
---|
| 3360 | + * No need to call hugetlb_add_hstate() as hstate already |
---|
| 3361 | + * exists. But, do set parsed_hstate so that a following |
---|
| 3362 | + * hugepages= parameter will be applied to this hstate. |
---|
| 3363 | + */ |
---|
| 3364 | + parsed_hstate = h; |
---|
| 3365 | + parsed_valid_hugepagesz = true; |
---|
| 3366 | + return 1; |
---|
| 3367 | + } |
---|
| 3368 | + |
---|
| 3369 | + hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); |
---|
| 3370 | + parsed_valid_hugepagesz = true; |
---|
2951 | 3371 | return 1; |
---|
2952 | 3372 | } |
---|
2953 | | -__setup("default_hugepagesz=", hugetlb_default_setup); |
---|
| 3373 | +__setup("hugepagesz=", hugepagesz_setup); |
---|
2954 | 3374 | |
---|
2955 | | -static unsigned int cpuset_mems_nr(unsigned int *array) |
---|
| 3375 | +/* |
---|
| 3376 | + * default_hugepagesz command line input |
---|
| 3377 | + * Only one instance of default_hugepagesz allowed on command line. |
---|
| 3378 | + */ |
---|
| 3379 | +static int __init default_hugepagesz_setup(char *s) |
---|
| 3380 | +{ |
---|
| 3381 | + unsigned long size; |
---|
| 3382 | + |
---|
| 3383 | + parsed_valid_hugepagesz = false; |
---|
| 3384 | + if (parsed_default_hugepagesz) { |
---|
| 3385 | + pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s); |
---|
| 3386 | + return 0; |
---|
| 3387 | + } |
---|
| 3388 | + |
---|
| 3389 | + size = (unsigned long)memparse(s, NULL); |
---|
| 3390 | + |
---|
| 3391 | + if (!arch_hugetlb_valid_size(size)) { |
---|
| 3392 | + pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s); |
---|
| 3393 | + return 0; |
---|
| 3394 | + } |
---|
| 3395 | + |
---|
| 3396 | + hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); |
---|
| 3397 | + parsed_valid_hugepagesz = true; |
---|
| 3398 | + parsed_default_hugepagesz = true; |
---|
| 3399 | + default_hstate_idx = hstate_index(size_to_hstate(size)); |
---|
| 3400 | + |
---|
| 3401 | + /* |
---|
| 3402 | + * The number of default huge pages (for this size) could have been |
---|
| 3403 | + * specified as the first hugetlb parameter: hugepages=X. If so, |
---|
| 3404 | + * then default_hstate_max_huge_pages is set. If the default huge |
---|
| 3405 | + * page size is gigantic (>= MAX_ORDER), then the pages must be |
---|
| 3406 | + * allocated here from bootmem allocator. |
---|
| 3407 | + */ |
---|
| 3408 | + if (default_hstate_max_huge_pages) { |
---|
| 3409 | + default_hstate.max_huge_pages = default_hstate_max_huge_pages; |
---|
| 3410 | + if (hstate_is_gigantic(&default_hstate)) |
---|
| 3411 | + hugetlb_hstate_alloc_pages(&default_hstate); |
---|
| 3412 | + default_hstate_max_huge_pages = 0; |
---|
| 3413 | + } |
---|
| 3414 | + |
---|
| 3415 | + return 1; |
---|
| 3416 | +} |
---|
| 3417 | +__setup("default_hugepagesz=", default_hugepagesz_setup); |
---|
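
Both hugepagesz= and default_hugepagesz= now boil down to the same steps: parse a human-readable size with memparse(), check it with arch_hugetlb_valid_size(), and convert the size to an hstate order via ilog2(size) - PAGE_SHIFT. A hedged userspace sketch of that flow; simple_memparse(), valid_size() and the 4 KiB PAGE_SHIFT are stand-ins invented for this example, not kernel interfaces:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT 12	/* assume 4 KiB base pages for the example */

/* Minimal stand-in for memparse(): number plus optional K/M/G suffix. */
static unsigned long simple_memparse(const char *s)
{
	char *end;
	unsigned long val = strtoul(s, &end, 0);

	switch (*end) {
	case 'G': case 'g': val <<= 10; /* fall through */
	case 'M': case 'm': val <<= 10; /* fall through */
	case 'K': case 'k': val <<= 10; break;
	default: break;
	}
	return val;
}

/* Stand-in for arch_hugetlb_valid_size(): accept 2 MiB and 1 GiB, like x86_64. */
static bool valid_size(unsigned long size)
{
	return size == (2UL << 20) || size == (1UL << 30);
}

int main(void)
{
	const char *arg = "2M";
	unsigned long size = simple_memparse(arg);

	if (!valid_size(size)) {
		fprintf(stderr, "unsupported hugepagesz=%s\n", arg);
		return 1;
	}
	/* order = ilog2(size) - PAGE_SHIFT; ctz equals ilog2 for a power of two */
	printf("hugepagesz=%s -> order %d\n", arg, __builtin_ctzl(size) - PAGE_SHIFT);
	return 0;
}
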
| 3418 | + |
---|
| 3419 | +static unsigned int allowed_mems_nr(struct hstate *h) |
---|
2956 | 3420 | { |
---|
2957 | 3421 | int node; |
---|
2958 | 3422 | unsigned int nr = 0; |
---|
| 3423 | + nodemask_t *mpol_allowed; |
---|
| 3424 | + unsigned int *array = h->free_huge_pages_node; |
---|
| 3425 | + gfp_t gfp_mask = htlb_alloc_mask(h); |
---|
2959 | 3426 | |
---|
2960 | | - for_each_node_mask(node, cpuset_current_mems_allowed) |
---|
2961 | | - nr += array[node]; |
---|
| 3427 | + mpol_allowed = policy_nodemask_current(gfp_mask); |
---|
| 3428 | + |
---|
| 3429 | + for_each_node_mask(node, cpuset_current_mems_allowed) { |
---|
| 3430 | + if (!mpol_allowed || |
---|
| 3431 | + (mpol_allowed && node_isset(node, *mpol_allowed))) |
---|
| 3432 | + nr += array[node]; |
---|
| 3433 | + } |
---|
2962 | 3434 | |
---|
2963 | 3435 | return nr; |
---|
2964 | 3436 | } |
---|
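
allowed_mems_nr() now sums free huge pages only over nodes permitted by the cpuset and, when the task has a memory policy, by that policy's nodemask as well. The same intersection, sketched with plain bitmasks standing in for nodemask_t (illustrative names, 64-node limit assumed):

#define MAX_NODES 64

/*
 * Sum free pages over nodes allowed by the cpuset mask and, if a
 * mempolicy mask is supplied (non-NULL), also allowed by that mask.
 */
static unsigned int count_allowed_free(const unsigned int *free_per_node,
				       unsigned long cpuset_mask,
				       const unsigned long *mpol_mask)
{
	unsigned int nr = 0;

	for (int node = 0; node < MAX_NODES; node++) {
		if (!(cpuset_mask & (1UL << node)))
			continue;		/* node not allowed by the cpuset */
		if (mpol_mask && !(*mpol_mask & (1UL << node)))
			continue;		/* node not allowed by the memory policy */
		nr += free_per_node[node];
	}
	return nr;
}
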
.. | .. |
---|
2982 | 3454 | |
---|
2983 | 3455 | static int hugetlb_sysctl_handler_common(bool obey_mempolicy, |
---|
2984 | 3456 | struct ctl_table *table, int write, |
---|
2985 | | - void __user *buffer, size_t *length, loff_t *ppos) |
---|
| 3457 | + void *buffer, size_t *length, loff_t *ppos) |
---|
2986 | 3458 | { |
---|
2987 | 3459 | struct hstate *h = &default_hstate; |
---|
2988 | 3460 | unsigned long tmp = h->max_huge_pages; |
---|
.. | .. |
---|
3004 | 3476 | } |
---|
3005 | 3477 | |
---|
3006 | 3478 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, |
---|
3007 | | - void __user *buffer, size_t *length, loff_t *ppos) |
---|
| 3479 | + void *buffer, size_t *length, loff_t *ppos) |
---|
3008 | 3480 | { |
---|
3009 | 3481 | |
---|
3010 | 3482 | return hugetlb_sysctl_handler_common(false, table, write, |
---|
.. | .. |
---|
3013 | 3485 | |
---|
3014 | 3486 | #ifdef CONFIG_NUMA |
---|
3015 | 3487 | int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, |
---|
3016 | | - void __user *buffer, size_t *length, loff_t *ppos) |
---|
| 3488 | + void *buffer, size_t *length, loff_t *ppos) |
---|
3017 | 3489 | { |
---|
3018 | 3490 | return hugetlb_sysctl_handler_common(true, table, write, |
---|
3019 | 3491 | buffer, length, ppos); |
---|
.. | .. |
---|
3021 | 3493 | #endif /* CONFIG_NUMA */ |
---|
3022 | 3494 | |
---|
3023 | 3495 | int hugetlb_overcommit_handler(struct ctl_table *table, int write, |
---|
3024 | | - void __user *buffer, |
---|
3025 | | - size_t *length, loff_t *ppos) |
---|
| 3496 | + void *buffer, size_t *length, loff_t *ppos) |
---|
3026 | 3497 | { |
---|
3027 | 3498 | struct hstate *h = &default_hstate; |
---|
3028 | 3499 | unsigned long tmp; |
---|
.. | .. |
---|
3082 | 3553 | seq_printf(m, "Hugetlb: %8lu kB\n", total / 1024); |
---|
3083 | 3554 | } |
---|
3084 | 3555 | |
---|
3085 | | -int hugetlb_report_node_meminfo(int nid, char *buf) |
---|
| 3556 | +int hugetlb_report_node_meminfo(char *buf, int len, int nid) |
---|
3086 | 3557 | { |
---|
3087 | 3558 | struct hstate *h = &default_hstate; |
---|
| 3559 | + |
---|
3088 | 3560 | if (!hugepages_supported()) |
---|
3089 | 3561 | return 0; |
---|
3090 | | - return sprintf(buf, |
---|
3091 | | - "Node %d HugePages_Total: %5u\n" |
---|
3092 | | - "Node %d HugePages_Free: %5u\n" |
---|
3093 | | - "Node %d HugePages_Surp: %5u\n", |
---|
3094 | | - nid, h->nr_huge_pages_node[nid], |
---|
3095 | | - nid, h->free_huge_pages_node[nid], |
---|
3096 | | - nid, h->surplus_huge_pages_node[nid]); |
---|
| 3562 | + |
---|
| 3563 | + return sysfs_emit_at(buf, len, |
---|
| 3564 | + "Node %d HugePages_Total: %5u\n" |
---|
| 3565 | + "Node %d HugePages_Free: %5u\n" |
---|
| 3566 | + "Node %d HugePages_Surp: %5u\n", |
---|
| 3567 | + nid, h->nr_huge_pages_node[nid], |
---|
| 3568 | + nid, h->free_huge_pages_node[nid], |
---|
| 3569 | + nid, h->surplus_huge_pages_node[nid]); |
---|
3097 | 3570 | } |
---|
3098 | 3571 | |
---|
3099 | 3572 | void hugetlb_show_meminfo(void) |
---|
.. | .. |
---|
3152 | 3625 | * we fall back to check against current free page availability as |
---|
3153 | 3626 | * a best attempt and hopefully to minimize the impact of changing |
---|
3154 | 3627 | * semantics that cpuset has. |
---|
| 3628 | + * |
---|
| 3629 | + * Apart from cpuset, there is also the memory policy mechanism, |
---|
| 3630 | + * which likewise determines from which node the kernel will |
---|
| 3631 | + * allocate memory in a NUMA system. So, similar to cpuset, we |
---|
| 3632 | + * should also consider the memory policy of the current task, |
---|
| 3633 | + * as described above. |
---|
3155 | 3634 | */ |
---|
3156 | 3635 | if (delta > 0) { |
---|
3157 | 3636 | if (gather_surplus_pages(h, delta) < 0) |
---|
3158 | 3637 | goto out; |
---|
3159 | 3638 | |
---|
3160 | | - if (delta > cpuset_mems_nr(h->free_huge_pages_node)) { |
---|
| 3639 | + if (delta > allowed_mems_nr(h)) { |
---|
3161 | 3640 | return_unused_surplus_pages(h, delta); |
---|
3162 | 3641 | goto out; |
---|
3163 | 3642 | } |
---|
.. | .. |
---|
3184 | 3663 | * after this open call completes. It is therefore safe to take a |
---|
3185 | 3664 | * new reference here without additional locking. |
---|
3186 | 3665 | */ |
---|
3187 | | - if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) |
---|
| 3666 | + if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { |
---|
| 3667 | + resv_map_dup_hugetlb_cgroup_uncharge_info(resv); |
---|
3188 | 3668 | kref_get(&resv->refs); |
---|
| 3669 | + } |
---|
3189 | 3670 | } |
---|
3190 | 3671 | |
---|
3191 | 3672 | static void hugetlb_vm_op_close(struct vm_area_struct *vma) |
---|
.. | .. |
---|
3203 | 3684 | end = vma_hugecache_offset(h, vma, vma->vm_end); |
---|
3204 | 3685 | |
---|
3205 | 3686 | reserve = (end - start) - region_count(resv, start, end); |
---|
3206 | | - |
---|
3207 | | - kref_put(&resv->refs, resv_map_release); |
---|
3208 | | - |
---|
| 3687 | + hugetlb_cgroup_uncharge_counter(resv, start, end); |
---|
3209 | 3688 | if (reserve) { |
---|
3210 | 3689 | /* |
---|
3211 | 3690 | * Decrement reserve counts. The global reserve count may be |
---|
.. | .. |
---|
3214 | 3693 | gbl_reserve = hugepage_subpool_put_pages(spool, reserve); |
---|
3215 | 3694 | hugetlb_acct_memory(h, -gbl_reserve); |
---|
3216 | 3695 | } |
---|
| 3696 | + |
---|
| 3697 | + kref_put(&resv->refs, resv_map_release); |
---|
3217 | 3698 | } |
---|
3218 | 3699 | |
---|
3219 | 3700 | static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) |
---|
3220 | 3701 | { |
---|
3221 | 3702 | if (addr & ~(huge_page_mask(hstate_vma(vma)))) |
---|
3222 | 3703 | return -EINVAL; |
---|
| 3704 | + |
---|
| 3705 | + /* |
---|
| 3706 | + * PMD sharing is only possible for PUD_SIZE-aligned address ranges |
---|
| 3707 | + * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this |
---|
| 3708 | + * split, unshare PMDs in the PUD_SIZE interval surrounding addr now. |
---|
| 3709 | + */ |
---|
| 3710 | + if (addr & ~PUD_MASK) { |
---|
| 3711 | + /* |
---|
| 3712 | + * hugetlb_vm_op_split is called right before we attempt to |
---|
| 3713 | + * split the VMA. We will need to unshare PMDs in the old and |
---|
| 3714 | + * new VMAs, so let's unshare before we split. |
---|
| 3715 | + */ |
---|
| 3716 | + unsigned long floor = addr & PUD_MASK; |
---|
| 3717 | + unsigned long ceil = floor + PUD_SIZE; |
---|
| 3718 | + |
---|
| 3719 | + if (floor >= vma->vm_start && ceil <= vma->vm_end) |
---|
| 3720 | + hugetlb_unshare_pmds(vma, floor, ceil); |
---|
| 3721 | + } |
---|
| 3722 | + |
---|
3223 | 3723 | return 0; |
---|
3224 | 3724 | } |
---|
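
The split handler only unshares when the split address breaks PUD alignment and the surrounding PUD-sized window is fully contained in the VMA. The alignment test in isolation (1 GiB PUD coverage is assumed here purely for the example):

#include <stdbool.h>
#include <stdio.h>

#define PUD_SHIFT 30			/* assume 1 GiB PUD coverage, as on x86_64 */
#define PUD_SIZE  (1UL << PUD_SHIFT)
#define PUD_MASK  (~(PUD_SIZE - 1))

/*
 * A split at 'addr' needs a preceding PMD unshare only when addr is not
 * PUD-aligned and the PUD-sized window around it lies inside the VMA.
 */
static bool split_needs_unshare(unsigned long addr, unsigned long vm_start,
				unsigned long vm_end)
{
	unsigned long floor, ceil;

	if (!(addr & ~PUD_MASK))
		return false;		/* already PUD-aligned: nothing to unshare */
	floor = addr & PUD_MASK;
	ceil = floor + PUD_SIZE;
	return floor >= vm_start && ceil <= vm_end;
}

int main(void)
{
	/* VMA covering [1 GiB, 3 GiB): splitting at 1.5 GiB breaks alignment. */
	printf("%d\n", split_needs_unshare(3UL << 29, 1UL << 30, 3UL << 30)); /* 1 */
	printf("%d\n", split_needs_unshare(2UL << 30, 1UL << 30, 3UL << 30)); /* 0 */
	return 0;
}
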
3225 | 3725 | |
---|
.. | .. |
---|
3293 | 3793 | if (huge_pte_none(pte) || pte_present(pte)) |
---|
3294 | 3794 | return false; |
---|
3295 | 3795 | swp = pte_to_swp_entry(pte); |
---|
3296 | | - if (non_swap_entry(swp) && is_migration_entry(swp)) |
---|
| 3796 | + if (is_migration_entry(swp)) |
---|
3297 | 3797 | return true; |
---|
3298 | 3798 | else |
---|
3299 | 3799 | return false; |
---|
3300 | 3800 | } |
---|
3301 | 3801 | |
---|
3302 | | -static int is_hugetlb_entry_hwpoisoned(pte_t pte) |
---|
| 3802 | +static bool is_hugetlb_entry_hwpoisoned(pte_t pte) |
---|
3303 | 3803 | { |
---|
3304 | 3804 | swp_entry_t swp; |
---|
3305 | 3805 | |
---|
3306 | 3806 | if (huge_pte_none(pte) || pte_present(pte)) |
---|
3307 | | - return 0; |
---|
| 3807 | + return false; |
---|
3308 | 3808 | swp = pte_to_swp_entry(pte); |
---|
3309 | | - if (non_swap_entry(swp) && is_hwpoison_entry(swp)) |
---|
3310 | | - return 1; |
---|
| 3809 | + if (is_hwpoison_entry(swp)) |
---|
| 3810 | + return true; |
---|
3311 | 3811 | else |
---|
3312 | | - return 0; |
---|
| 3812 | + return false; |
---|
3313 | 3813 | } |
---|
3314 | 3814 | |
---|
3315 | 3815 | int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, |
---|
.. | .. |
---|
3321 | 3821 | int cow; |
---|
3322 | 3822 | struct hstate *h = hstate_vma(vma); |
---|
3323 | 3823 | unsigned long sz = huge_page_size(h); |
---|
3324 | | - unsigned long mmun_start; /* For mmu_notifiers */ |
---|
3325 | | - unsigned long mmun_end; /* For mmu_notifiers */ |
---|
| 3824 | + struct address_space *mapping = vma->vm_file->f_mapping; |
---|
| 3825 | + struct mmu_notifier_range range; |
---|
3326 | 3826 | int ret = 0; |
---|
3327 | 3827 | |
---|
3328 | 3828 | cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; |
---|
3329 | 3829 | |
---|
3330 | | - mmun_start = vma->vm_start; |
---|
3331 | | - mmun_end = vma->vm_end; |
---|
3332 | | - if (cow) |
---|
3333 | | - mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end); |
---|
| 3830 | + if (cow) { |
---|
| 3831 | + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src, |
---|
| 3832 | + vma->vm_start, |
---|
| 3833 | + vma->vm_end); |
---|
| 3834 | + mmu_notifier_invalidate_range_start(&range); |
---|
| 3835 | + } else { |
---|
| 3836 | + /* |
---|
| 3837 | + * For shared mappings i_mmap_rwsem must be held to call |
---|
| 3838 | + * huge_pte_alloc, otherwise the returned ptep could go |
---|
| 3839 | + * away if part of a shared pmd and another thread calls |
---|
| 3840 | + * huge_pmd_unshare. |
---|
| 3841 | + */ |
---|
| 3842 | + i_mmap_lock_read(mapping); |
---|
| 3843 | + } |
---|
3334 | 3844 | |
---|
3335 | 3845 | for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { |
---|
3336 | 3846 | spinlock_t *src_ptl, *dst_ptl; |
---|
3337 | 3847 | src_pte = huge_pte_offset(src, addr, sz); |
---|
3338 | 3848 | if (!src_pte) |
---|
3339 | 3849 | continue; |
---|
3340 | | - dst_pte = huge_pte_alloc(dst, addr, sz); |
---|
| 3850 | + dst_pte = huge_pte_alloc(dst, vma, addr, sz); |
---|
3341 | 3851 | if (!dst_pte) { |
---|
3342 | 3852 | ret = -ENOMEM; |
---|
3343 | 3853 | break; |
---|
.. | .. |
---|
3406 | 3916 | } |
---|
3407 | 3917 | |
---|
3408 | 3918 | if (cow) |
---|
3409 | | - mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end); |
---|
| 3919 | + mmu_notifier_invalidate_range_end(&range); |
---|
| 3920 | + else |
---|
| 3921 | + i_mmap_unlock_read(mapping); |
---|
3410 | 3922 | |
---|
3411 | 3923 | return ret; |
---|
3412 | 3924 | } |
---|
.. | .. |
---|
3423 | 3935 | struct page *page; |
---|
3424 | 3936 | struct hstate *h = hstate_vma(vma); |
---|
3425 | 3937 | unsigned long sz = huge_page_size(h); |
---|
3426 | | - unsigned long mmun_start = start; /* For mmu_notifiers */ |
---|
3427 | | - unsigned long mmun_end = end; /* For mmu_notifiers */ |
---|
| 3938 | + struct mmu_notifier_range range; |
---|
3428 | 3939 | bool force_flush = false; |
---|
3429 | 3940 | |
---|
3430 | 3941 | WARN_ON(!is_vm_hugetlb_page(vma)); |
---|
.. | .. |
---|
3435 | 3946 | * This is a hugetlb vma, all the pte entries should point |
---|
3436 | 3947 | * to huge page. |
---|
3437 | 3948 | */ |
---|
3438 | | - tlb_remove_check_page_size_change(tlb, sz); |
---|
| 3949 | + tlb_change_page_size(tlb, sz); |
---|
3439 | 3950 | tlb_start_vma(tlb, vma); |
---|
3440 | 3951 | |
---|
3441 | 3952 | /* |
---|
3442 | 3953 | * If sharing possible, alert mmu notifiers of worst case. |
---|
3443 | 3954 | */ |
---|
3444 | | - adjust_range_if_pmd_sharing_possible(vma, &mmun_start, &mmun_end); |
---|
3445 | | - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
---|
| 3955 | + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start, |
---|
| 3956 | + end); |
---|
| 3957 | + adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); |
---|
| 3958 | + mmu_notifier_invalidate_range_start(&range); |
---|
3446 | 3959 | address = start; |
---|
3447 | 3960 | for (; address < end; address += sz) { |
---|
3448 | 3961 | ptep = huge_pte_offset(mm, address, sz); |
---|
.. | .. |
---|
3450 | 3963 | continue; |
---|
3451 | 3964 | |
---|
3452 | 3965 | ptl = huge_pte_lock(h, mm, ptep); |
---|
3453 | | - if (huge_pmd_unshare(mm, &address, ptep)) { |
---|
| 3966 | + if (huge_pmd_unshare(mm, vma, &address, ptep)) { |
---|
3454 | 3967 | spin_unlock(ptl); |
---|
3455 | 3968 | tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE); |
---|
3456 | 3969 | force_flush = true; |
---|
.. | .. |
---|
3508 | 4021 | if (ref_page) |
---|
3509 | 4022 | break; |
---|
3510 | 4023 | } |
---|
3511 | | - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
---|
| 4024 | + mmu_notifier_invalidate_range_end(&range); |
---|
3512 | 4025 | tlb_end_vma(tlb, vma); |
---|
3513 | 4026 | |
---|
3514 | 4027 | /* |
---|
.. | .. |
---|
3642 | 4155 | struct page *old_page, *new_page; |
---|
3643 | 4156 | int outside_reserve = 0; |
---|
3644 | 4157 | vm_fault_t ret = 0; |
---|
3645 | | - unsigned long mmun_start; /* For mmu_notifiers */ |
---|
3646 | | - unsigned long mmun_end; /* For mmu_notifiers */ |
---|
3647 | 4158 | unsigned long haddr = address & huge_page_mask(h); |
---|
| 4159 | + struct mmu_notifier_range range; |
---|
3648 | 4160 | |
---|
3649 | 4161 | pte = huge_ptep_get(ptep); |
---|
3650 | 4162 | old_page = pte_page(pte); |
---|
.. | .. |
---|
3689 | 4201 | * may get SIGKILLed if it later faults. |
---|
3690 | 4202 | */ |
---|
3691 | 4203 | if (outside_reserve) { |
---|
| 4204 | + struct address_space *mapping = vma->vm_file->f_mapping; |
---|
| 4205 | + pgoff_t idx; |
---|
| 4206 | + u32 hash; |
---|
| 4207 | + |
---|
3692 | 4208 | put_page(old_page); |
---|
3693 | 4209 | BUG_ON(huge_pte_none(pte)); |
---|
| 4210 | + /* |
---|
| 4211 | + * Drop hugetlb_fault_mutex and i_mmap_rwsem before |
---|
| 4212 | + * unmapping. unmapping needs to hold i_mmap_rwsem |
---|
| 4213 | + * in write mode. Dropping i_mmap_rwsem in read mode |
---|
| 4214 | + * here is OK as COW mappings do not interact with |
---|
| 4215 | + * PMD sharing. |
---|
| 4216 | + * |
---|
| 4217 | + * Reacquire both after unmap operation. |
---|
| 4218 | + */ |
---|
| 4219 | + idx = vma_hugecache_offset(h, vma, haddr); |
---|
| 4220 | + hash = hugetlb_fault_mutex_hash(mapping, idx); |
---|
| 4221 | + mutex_unlock(&hugetlb_fault_mutex_table[hash]); |
---|
| 4222 | + i_mmap_unlock_read(mapping); |
---|
| 4223 | + |
---|
3694 | 4224 | unmap_ref_private(mm, vma, old_page, haddr); |
---|
3695 | | - BUG_ON(huge_pte_none(pte)); |
---|
| 4225 | + |
---|
| 4226 | + i_mmap_lock_read(mapping); |
---|
| 4227 | + mutex_lock(&hugetlb_fault_mutex_table[hash]); |
---|
3696 | 4228 | spin_lock(ptl); |
---|
3697 | 4229 | ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); |
---|
3698 | 4230 | if (likely(ptep && |
---|
.. | .. |
---|
3722 | 4254 | pages_per_huge_page(h)); |
---|
3723 | 4255 | __SetPageUptodate(new_page); |
---|
3724 | 4256 | |
---|
3725 | | - mmun_start = haddr; |
---|
3726 | | - mmun_end = mmun_start + huge_page_size(h); |
---|
3727 | | - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
---|
| 4257 | + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr, |
---|
| 4258 | + haddr + huge_page_size(h)); |
---|
| 4259 | + mmu_notifier_invalidate_range_start(&range); |
---|
3728 | 4260 | |
---|
3729 | 4261 | /* |
---|
3730 | 4262 | * Retake the page table lock to check for racing updates |
---|
.. | .. |
---|
3737 | 4269 | |
---|
3738 | 4270 | /* Break COW */ |
---|
3739 | 4271 | huge_ptep_clear_flush(vma, haddr, ptep); |
---|
3740 | | - mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); |
---|
| 4272 | + mmu_notifier_invalidate_range(mm, range.start, range.end); |
---|
3741 | 4273 | set_huge_pte_at(mm, haddr, ptep, |
---|
3742 | 4274 | make_huge_pte(vma, new_page, 1)); |
---|
3743 | 4275 | page_remove_rmap(old_page, true); |
---|
.. | .. |
---|
3747 | 4279 | new_page = old_page; |
---|
3748 | 4280 | } |
---|
3749 | 4281 | spin_unlock(ptl); |
---|
3750 | | - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
---|
| 4282 | + mmu_notifier_invalidate_range_end(&range); |
---|
3751 | 4283 | out_release_all: |
---|
3752 | 4284 | restore_reserve_on_error(h, vma, haddr, new_page); |
---|
3753 | 4285 | put_page(new_page); |
---|
.. | .. |
---|
3814 | 4346 | return 0; |
---|
3815 | 4347 | } |
---|
3816 | 4348 | |
---|
| 4349 | +static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, |
---|
| 4350 | + struct address_space *mapping, |
---|
| 4351 | + pgoff_t idx, |
---|
| 4352 | + unsigned int flags, |
---|
| 4353 | + unsigned long haddr, |
---|
| 4354 | + unsigned long reason) |
---|
| 4355 | +{ |
---|
| 4356 | + u32 hash = hugetlb_fault_mutex_hash(mapping, idx); |
---|
| 4357 | + struct vm_fault vmf = { |
---|
| 4358 | + .vma = vma, |
---|
| 4359 | + .address = haddr, |
---|
| 4360 | + .flags = flags, |
---|
| 4361 | + /* |
---|
| 4362 | + * Hard to debug if it ends up being |
---|
| 4363 | + * used by a callee that assumes |
---|
| 4364 | + * something about the other |
---|
| 4365 | + * uninitialized fields... same as in |
---|
| 4366 | + * memory.c |
---|
| 4367 | + */ |
---|
| 4368 | + }; |
---|
| 4369 | + |
---|
| 4370 | + /* |
---|
| 4371 | + * vma_lock and hugetlb_fault_mutex must be dropped |
---|
| 4372 | + * before handling userfault. Also mmap_lock will |
---|
| 4373 | + * be dropped during handling userfault, any vma |
---|
| 4374 | + * operation should be careful from here. |
---|
| 4375 | + */ |
---|
| 4376 | + mutex_unlock(&hugetlb_fault_mutex_table[hash]); |
---|
| 4377 | + i_mmap_unlock_read(mapping); |
---|
| 4378 | + return handle_userfault(&vmf, VM_UFFD_MISSING); |
---|
| 4379 | +} |
---|
| 4380 | + |
---|
3817 | 4381 | static vm_fault_t hugetlb_no_page(struct mm_struct *mm, |
---|
3818 | 4382 | struct vm_area_struct *vma, |
---|
3819 | 4383 | struct address_space *mapping, pgoff_t idx, |
---|
.. | .. |
---|
3828 | 4392 | spinlock_t *ptl; |
---|
3829 | 4393 | unsigned long haddr = address & huge_page_mask(h); |
---|
3830 | 4394 | bool new_page = false; |
---|
| 4395 | + u32 hash = hugetlb_fault_mutex_hash(mapping, idx); |
---|
3831 | 4396 | |
---|
3832 | 4397 | /* |
---|
3833 | 4398 | * Currently, we are forced to kill the process in the event the |
---|
.. | .. |
---|
3837 | 4402 | if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { |
---|
3838 | 4403 | pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", |
---|
3839 | 4404 | current->pid); |
---|
3840 | | - return ret; |
---|
| 4405 | + goto out; |
---|
3841 | 4406 | } |
---|
3842 | 4407 | |
---|
3843 | 4408 | /* |
---|
3844 | | - * Use page lock to guard against racing truncation |
---|
3845 | | - * before we get page_table_lock. |
---|
| 4409 | + * We can not race with truncation due to holding i_mmap_rwsem. |
---|
| 4410 | + * i_size is modified when holding i_mmap_rwsem, so check here |
---|
| 4411 | + * once for faults beyond end of file. |
---|
3846 | 4412 | */ |
---|
| 4413 | + size = i_size_read(mapping->host) >> huge_page_shift(h); |
---|
| 4414 | + if (idx >= size) |
---|
| 4415 | + goto out; |
---|
| 4416 | + |
---|
3847 | 4417 | retry: |
---|
3848 | 4418 | page = find_lock_page(mapping, idx); |
---|
3849 | 4419 | if (!page) { |
---|
3850 | | - size = i_size_read(mapping->host) >> huge_page_shift(h); |
---|
3851 | | - if (idx >= size) |
---|
3852 | | - goto out; |
---|
3853 | | - |
---|
3854 | | - /* |
---|
3855 | | - * Check for page in userfault range |
---|
3856 | | - */ |
---|
| 4420 | + /* Check for page in userfault range */ |
---|
3857 | 4421 | if (userfaultfd_missing(vma)) { |
---|
3858 | | - u32 hash; |
---|
3859 | | - struct vm_fault vmf = { |
---|
3860 | | - .vma = vma, |
---|
3861 | | - .address = haddr, |
---|
3862 | | - .flags = flags, |
---|
3863 | | - /* |
---|
3864 | | - * Hard to debug if it ends up being |
---|
3865 | | - * used by a callee that assumes |
---|
3866 | | - * something about the other |
---|
3867 | | - * uninitialized fields... same as in |
---|
3868 | | - * memory.c |
---|
3869 | | - */ |
---|
3870 | | - }; |
---|
3871 | | - |
---|
3872 | | - /* |
---|
3873 | | - * hugetlb_fault_mutex must be dropped before |
---|
3874 | | - * handling userfault. Reacquire after handling |
---|
3875 | | - * fault to make calling code simpler. |
---|
3876 | | - */ |
---|
3877 | | - hash = hugetlb_fault_mutex_hash(h, mapping, idx); |
---|
3878 | | - mutex_unlock(&hugetlb_fault_mutex_table[hash]); |
---|
3879 | | - ret = handle_userfault(&vmf, VM_UFFD_MISSING); |
---|
3880 | | - mutex_lock(&hugetlb_fault_mutex_table[hash]); |
---|
| 4422 | + ret = hugetlb_handle_userfault(vma, mapping, idx, |
---|
| 4423 | + flags, haddr, |
---|
| 4424 | + VM_UFFD_MISSING); |
---|
3881 | 4425 | goto out; |
---|
3882 | 4426 | } |
---|
3883 | 4427 | |
---|
3884 | 4428 | page = alloc_huge_page(vma, haddr, 0); |
---|
3885 | 4429 | if (IS_ERR(page)) { |
---|
| 4430 | + /* |
---|
| 4431 | + * Returning error will result in faulting task being |
---|
| 4432 | + * sent SIGBUS. The hugetlb fault mutex prevents two |
---|
| 4433 | + * tasks from racing to fault in the same page which |
---|
| 4434 | + * could result in false unable to allocate errors. |
---|
| 4435 | + * Page migration does not take the fault mutex, but |
---|
| 4436 | + * does a clear then write of pte's under page table |
---|
| 4437 | + * lock. Page fault code could race with migration, |
---|
| 4438 | + * notice the clear pte and try to allocate a page |
---|
| 4439 | + * here. Before returning error, get ptl and make |
---|
| 4440 | + * sure there really is no pte entry. |
---|
| 4441 | + */ |
---|
| 4442 | + ptl = huge_pte_lock(h, mm, ptep); |
---|
| 4443 | + if (!huge_pte_none(huge_ptep_get(ptep))) { |
---|
| 4444 | + ret = 0; |
---|
| 4445 | + spin_unlock(ptl); |
---|
| 4446 | + goto out; |
---|
| 4447 | + } |
---|
| 4448 | + spin_unlock(ptl); |
---|
3886 | 4449 | ret = vmf_error(PTR_ERR(page)); |
---|
3887 | 4450 | goto out; |
---|
3888 | 4451 | } |
---|
.. | .. |
---|
3917 | 4480 | VM_FAULT_SET_HINDEX(hstate_index(h)); |
---|
3918 | 4481 | goto backout_unlocked; |
---|
3919 | 4482 | } |
---|
| 4483 | + |
---|
| 4484 | + /* Check for page in userfault range. */ |
---|
| 4485 | + if (userfaultfd_minor(vma)) { |
---|
| 4486 | + unlock_page(page); |
---|
| 4487 | + put_page(page); |
---|
| 4488 | + ret = hugetlb_handle_userfault(vma, mapping, idx, |
---|
| 4489 | + flags, haddr, |
---|
| 4490 | + VM_UFFD_MINOR); |
---|
| 4491 | + goto out; |
---|
| 4492 | + } |
---|
3920 | 4493 | } |
---|
3921 | 4494 | |
---|
3922 | 4495 | /* |
---|
.. | .. |
---|
3935 | 4508 | } |
---|
3936 | 4509 | |
---|
3937 | 4510 | ptl = huge_pte_lock(h, mm, ptep); |
---|
3938 | | - size = i_size_read(mapping->host) >> huge_page_shift(h); |
---|
3939 | | - if (idx >= size) |
---|
3940 | | - goto backout; |
---|
3941 | | - |
---|
3942 | 4511 | ret = 0; |
---|
3943 | 4512 | if (!huge_pte_none(huge_ptep_get(ptep))) |
---|
3944 | 4513 | goto backout; |
---|
.. | .. |
---|
3970 | 4539 | |
---|
3971 | 4540 | unlock_page(page); |
---|
3972 | 4541 | out: |
---|
| 4542 | + mutex_unlock(&hugetlb_fault_mutex_table[hash]); |
---|
| 4543 | + i_mmap_unlock_read(mapping); |
---|
3973 | 4544 | return ret; |
---|
3974 | 4545 | |
---|
3975 | 4546 | backout: |
---|
.. | .. |
---|
3982 | 4553 | } |
---|
3983 | 4554 | |
---|
3984 | 4555 | #ifdef CONFIG_SMP |
---|
3985 | | -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, |
---|
3986 | | - pgoff_t idx) |
---|
| 4556 | +u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) |
---|
3987 | 4557 | { |
---|
3988 | 4558 | unsigned long key[2]; |
---|
3989 | 4559 | u32 hash; |
---|
.. | .. |
---|
4000 | 4570 | * For uniprocessor systems we always use a single mutex, so just |
---|
4001 | 4571 | * return 0 and avoid the hashing overhead. |
---|
4002 | 4572 | */ |
---|
4003 | | -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, |
---|
4004 | | - pgoff_t idx) |
---|
| 4573 | +u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) |
---|
4005 | 4574 | { |
---|
4006 | 4575 | return 0; |
---|
4007 | 4576 | } |
---|
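
The bodies of these hash helpers are elided above. Purely for illustration, a hash over the (mapping, idx) pair could be built along the following lines; this is a hedged sketch, not the elided code, and it assumes jhash2() and a power-of-two num_fault_mutexes (both referenced elsewhere in this file):

```c
/* Hedged sketch only -- the elided SMP body above is authoritative. */
static u32 example_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
{
	unsigned long key[2];
	u32 hash;

	key[0] = (unsigned long)mapping;	/* identifies the file        */
	key[1] = idx;				/* huge page index within it  */
	hash = jhash2((u32 *)&key, sizeof(key) / sizeof(u32), 0);

	/* Assumes num_fault_mutexes is a power of two, so the mask picks a slot. */
	return hash & (num_fault_mutexes - 1);
}
```
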
.. | .. |
---|
4024 | 4593 | |
---|
4025 | 4594 | ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); |
---|
4026 | 4595 | if (ptep) { |
---|
| 4596 | + /* |
---|
| 4597 | + * Since we hold no locks, ptep could be stale. That is |
---|
| 4598 | + * OK as we are only making decisions based on content and |
---|
| 4599 | + * not actually modifying content here. |
---|
| 4600 | + */ |
---|
4027 | 4601 | entry = huge_ptep_get(ptep); |
---|
4028 | 4602 | if (unlikely(is_hugetlb_entry_migration(entry))) { |
---|
4029 | 4603 | migration_entry_wait_huge(vma, mm, ptep); |
---|
.. | .. |
---|
4031 | 4605 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) |
---|
4032 | 4606 | return VM_FAULT_HWPOISON_LARGE | |
---|
4033 | 4607 | VM_FAULT_SET_HINDEX(hstate_index(h)); |
---|
4034 | | - } else { |
---|
4035 | | - ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); |
---|
4036 | | - if (!ptep) |
---|
4037 | | - return VM_FAULT_OOM; |
---|
4038 | 4608 | } |
---|
4039 | 4609 | |
---|
| 4610 | + /* |
---|
| 4611 | + * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold |
---|
| 4612 | + * until finished with ptep. This serves two purposes: |
---|
| 4613 | + * 1) It prevents huge_pmd_unshare from being called elsewhere |
---|
| 4614 | + * and making the ptep no longer valid. |
---|
| 4615 | + * 2) It synchronizes us with i_size modifications during truncation. |
---|
| 4616 | + * |
---|
| 4617 | + * ptep could have already been assigned via huge_pte_offset. That |
---|
| 4618 | + * is OK, as huge_pte_alloc will return the same value unless |
---|
| 4619 | + * something has changed. |
---|
| 4620 | + */ |
---|
4040 | 4621 | mapping = vma->vm_file->f_mapping; |
---|
4041 | | - idx = vma_hugecache_offset(h, vma, haddr); |
---|
| 4622 | + i_mmap_lock_read(mapping); |
---|
| 4623 | + ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); |
---|
| 4624 | + if (!ptep) { |
---|
| 4625 | + i_mmap_unlock_read(mapping); |
---|
| 4626 | + return VM_FAULT_OOM; |
---|
| 4627 | + } |
---|
4042 | 4628 | |
---|
4043 | 4629 | /* |
---|
4044 | 4630 | * Serialize hugepage allocation and instantiation, so that we don't |
---|
4045 | 4631 | * get spurious allocation failures if two CPUs race to instantiate |
---|
4046 | 4632 | * the same page in the page cache. |
---|
4047 | 4633 | */ |
---|
4048 | | - hash = hugetlb_fault_mutex_hash(h, mapping, idx); |
---|
| 4634 | + idx = vma_hugecache_offset(h, vma, haddr); |
---|
| 4635 | + hash = hugetlb_fault_mutex_hash(mapping, idx); |
---|
4049 | 4636 | mutex_lock(&hugetlb_fault_mutex_table[hash]); |
---|
4050 | 4637 | |
---|
4051 | 4638 | entry = huge_ptep_get(ptep); |
---|
4052 | | - if (huge_pte_none(entry)) { |
---|
4053 | | - ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags); |
---|
4054 | | - goto out_mutex; |
---|
4055 | | - } |
---|
| 4639 | + if (huge_pte_none(entry)) |
---|
| 4640 | + /* |
---|
| 4641 | + * hugetlb_no_page will drop the hugetlb fault mutex and |
---|
| 4642 | + * i_mmap_rwsem internally, so return its result immediately. |
---|
| 4643 | + */ |
---|
| 4644 | + return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags); |
---|
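
With these changes the fault path's lock ordering is: i_mmap_rwsem in read mode, then the hashed hugetlb fault mutex, then the page table lock, dropped in reverse order; hugetlb_no_page now releases the first two itself before returning. A minimal sketch of that ordering, with names taken from the surrounding code and error handling omitted:

```c
/* Ordering sketch only; the real sequence is in hugetlb_fault() above. */
i_mmap_lock_read(mapping);                      /* 1: keeps ptep and i_size stable */
hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_lock(&hugetlb_fault_mutex_table[hash]);   /* 2: serializes instantiation     */
ptl = huge_pte_lock(h, mm, ptep);               /* 3: protects the pte itself      */
/* ... fault handling ... */
spin_unlock(ptl);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
i_mmap_unlock_read(mapping);
```
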
4056 | 4645 | |
---|
4057 | 4646 | ret = 0; |
---|
4058 | 4647 | |
---|
4059 | 4648 | /* |
---|
4060 | 4649 | * entry could be a migration/hwpoison entry at this point, so this |
---|
4061 | 4650 | * check prevents the kernel from going below assuming that we have |
---|
4062 | | - * a active hugepage in pagecache. This goto expects the 2nd page fault, |
---|
4063 | | - * and is_hugetlb_entry_(migration|hwpoisoned) check will properly |
---|
4064 | | - * handle it. |
---|
| 4651 | + * an active hugepage in pagecache. This goto expects the 2nd page |
---|
| 4652 | + * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will |
---|
| 4653 | + * properly handle it. |
---|
4065 | 4654 | */ |
---|
4066 | 4655 | if (!pte_present(entry)) |
---|
4067 | 4656 | goto out_mutex; |
---|
.. | .. |
---|
4132 | 4721 | } |
---|
4133 | 4722 | out_mutex: |
---|
4134 | 4723 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); |
---|
| 4724 | + i_mmap_unlock_read(mapping); |
---|
4135 | 4725 | /* |
---|
4136 | 4726 | * Generally it's safe to hold refcount during waiting page lock. But |
---|
4137 | 4727 | * here we just wait to defer the next page fault to avoid busy loop and |
---|
.. | .. |
---|
4144 | 4734 | return ret; |
---|
4145 | 4735 | } |
---|
4146 | 4736 | |
---|
| 4737 | +#ifdef CONFIG_USERFAULTFD |
---|
4147 | 4738 | /* |
---|
4148 | 4739 | * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with |
---|
4149 | 4740 | * modifications for huge pages. |
---|
.. | .. |
---|
4153 | 4744 | struct vm_area_struct *dst_vma, |
---|
4154 | 4745 | unsigned long dst_addr, |
---|
4155 | 4746 | unsigned long src_addr, |
---|
| 4747 | + enum mcopy_atomic_mode mode, |
---|
4156 | 4748 | struct page **pagep) |
---|
4157 | 4749 | { |
---|
| 4750 | + bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); |
---|
4158 | 4751 | struct address_space *mapping; |
---|
4159 | 4752 | pgoff_t idx; |
---|
4160 | 4753 | unsigned long size; |
---|
.. | .. |
---|
4164 | 4757 | spinlock_t *ptl; |
---|
4165 | 4758 | int ret; |
---|
4166 | 4759 | struct page *page; |
---|
| 4760 | + int writable; |
---|
4167 | 4761 | |
---|
4168 | | - if (!*pagep) { |
---|
| 4762 | + mapping = dst_vma->vm_file->f_mapping; |
---|
| 4763 | + idx = vma_hugecache_offset(h, dst_vma, dst_addr); |
---|
| 4764 | + |
---|
| 4765 | + if (is_continue) { |
---|
| 4766 | + ret = -EFAULT; |
---|
| 4767 | + page = find_lock_page(mapping, idx); |
---|
| 4768 | + if (!page) |
---|
| 4769 | + goto out; |
---|
| 4770 | + } else if (!*pagep) { |
---|
4169 | 4771 | /* If a page already exists, then it's UFFDIO_COPY for |
---|
4170 | 4772 | * a non-missing case. Return -EEXIST. |
---|
4171 | 4773 | */ |
---|
.. | .. |
---|
4185 | 4787 | (const void __user *) src_addr, |
---|
4186 | 4788 | pages_per_huge_page(h), false); |
---|
4187 | 4789 | |
---|
4188 | | - /* fallback to copy_from_user outside mmap_sem */ |
---|
| 4790 | + /* fallback to copy_from_user outside mmap_lock */ |
---|
4189 | 4791 | if (unlikely(ret)) { |
---|
4190 | 4792 | ret = -ENOENT; |
---|
4191 | 4793 | *pagep = page; |
---|
.. | .. |
---|
4204 | 4806 | */ |
---|
4205 | 4807 | __SetPageUptodate(page); |
---|
4206 | 4808 | |
---|
4207 | | - mapping = dst_vma->vm_file->f_mapping; |
---|
4208 | | - idx = vma_hugecache_offset(h, dst_vma, dst_addr); |
---|
4209 | | - |
---|
4210 | | - /* |
---|
4211 | | - * If shared, add to page cache |
---|
4212 | | - */ |
---|
4213 | | - if (vm_shared) { |
---|
| 4809 | + /* Add shared, newly allocated pages to the page cache. */ |
---|
| 4810 | + if (vm_shared && !is_continue) { |
---|
4214 | 4811 | size = i_size_read(mapping->host) >> huge_page_shift(h); |
---|
4215 | 4812 | ret = -EFAULT; |
---|
4216 | 4813 | if (idx >= size) |
---|
.. | .. |
---|
4255 | 4852 | hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); |
---|
4256 | 4853 | } |
---|
4257 | 4854 | |
---|
4258 | | - _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE); |
---|
4259 | | - if (dst_vma->vm_flags & VM_WRITE) |
---|
| 4855 | + /* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */ |
---|
| 4856 | + if (is_continue && !vm_shared) |
---|
| 4857 | + writable = 0; |
---|
| 4858 | + else |
---|
| 4859 | + writable = dst_vma->vm_flags & VM_WRITE; |
---|
| 4860 | + |
---|
| 4861 | + _dst_pte = make_huge_pte(dst_vma, page, writable); |
---|
| 4862 | + if (writable) |
---|
4260 | 4863 | _dst_pte = huge_pte_mkdirty(_dst_pte); |
---|
4261 | 4864 | _dst_pte = pte_mkyoung(_dst_pte); |
---|
4262 | 4865 | |
---|
.. | .. |
---|
4270 | 4873 | update_mmu_cache(dst_vma, dst_addr, dst_pte); |
---|
4271 | 4874 | |
---|
4272 | 4875 | spin_unlock(ptl); |
---|
4273 | | - set_page_huge_active(page); |
---|
4274 | | - if (vm_shared) |
---|
| 4876 | + if (!is_continue) |
---|
| 4877 | + set_page_huge_active(page); |
---|
| 4878 | + if (vm_shared || is_continue) |
---|
4275 | 4879 | unlock_page(page); |
---|
4276 | 4880 | ret = 0; |
---|
4277 | 4881 | out: |
---|
4278 | 4882 | return ret; |
---|
4279 | 4883 | out_release_unlock: |
---|
4280 | 4884 | spin_unlock(ptl); |
---|
4281 | | - if (vm_shared) |
---|
| 4885 | + if (vm_shared || is_continue) |
---|
4282 | 4886 | unlock_page(page); |
---|
4283 | 4887 | out_release_nounlock: |
---|
4284 | 4888 | put_page(page); |
---|
4285 | 4889 | goto out; |
---|
4286 | 4890 | } |
---|
| 4891 | +#endif /* CONFIG_USERFAULTFD */ |
---|
4287 | 4892 | |
---|
4288 | 4893 | long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, |
---|
4289 | 4894 | struct page **pages, struct vm_area_struct **vmas, |
---|
4290 | 4895 | unsigned long *position, unsigned long *nr_pages, |
---|
4291 | | - long i, unsigned int flags, int *nonblocking) |
---|
| 4896 | + long i, unsigned int flags, int *locked) |
---|
4292 | 4897 | { |
---|
4293 | 4898 | unsigned long pfn_offset; |
---|
4294 | 4899 | unsigned long vaddr = *position; |
---|
.. | .. |
---|
4306 | 4911 | * If we have a pending SIGKILL, don't keep faulting pages and |
---|
4307 | 4912 | * potentially allocating memory. |
---|
4308 | 4913 | */ |
---|
4309 | | - if (unlikely(fatal_signal_pending(current))) { |
---|
| 4914 | + if (fatal_signal_pending(current)) { |
---|
4310 | 4915 | remainder = 0; |
---|
4311 | 4916 | break; |
---|
4312 | 4917 | } |
---|
.. | .. |
---|
4359 | 4964 | spin_unlock(ptl); |
---|
4360 | 4965 | if (flags & FOLL_WRITE) |
---|
4361 | 4966 | fault_flags |= FAULT_FLAG_WRITE; |
---|
4362 | | - if (nonblocking) |
---|
4363 | | - fault_flags |= FAULT_FLAG_ALLOW_RETRY; |
---|
| 4967 | + if (locked) |
---|
| 4968 | + fault_flags |= FAULT_FLAG_ALLOW_RETRY | |
---|
| 4969 | + FAULT_FLAG_KILLABLE; |
---|
4364 | 4970 | if (flags & FOLL_NOWAIT) |
---|
4365 | 4971 | fault_flags |= FAULT_FLAG_ALLOW_RETRY | |
---|
4366 | 4972 | FAULT_FLAG_RETRY_NOWAIT; |
---|
4367 | 4973 | if (flags & FOLL_TRIED) { |
---|
4368 | | - VM_WARN_ON_ONCE(fault_flags & |
---|
4369 | | - FAULT_FLAG_ALLOW_RETRY); |
---|
| 4974 | + /* |
---|
| 4975 | + * Note: FAULT_FLAG_ALLOW_RETRY and |
---|
| 4976 | + * FAULT_FLAG_TRIED can co-exist |
---|
| 4977 | + */ |
---|
4370 | 4978 | fault_flags |= FAULT_FLAG_TRIED; |
---|
4371 | 4979 | } |
---|
4372 | 4980 | ret = hugetlb_fault(mm, vma, vaddr, fault_flags); |
---|
.. | .. |
---|
4376 | 4984 | break; |
---|
4377 | 4985 | } |
---|
4378 | 4986 | if (ret & VM_FAULT_RETRY) { |
---|
4379 | | - if (nonblocking && |
---|
| 4987 | + if (locked && |
---|
4380 | 4988 | !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) |
---|
4381 | | - *nonblocking = 0; |
---|
| 4989 | + *locked = 0; |
---|
4382 | 4990 | *nr_pages = 0; |
---|
4383 | 4991 | /* |
---|
4384 | 4992 | * VM_FAULT_RETRY must not return an |
---|
.. | .. |
---|
4398 | 5006 | page = pte_page(huge_ptep_get(pte)); |
---|
4399 | 5007 | |
---|
4400 | 5008 | /* |
---|
4401 | | - * Instead of doing 'try_get_page()' below in the same_page |
---|
4402 | | - * loop, just check the count once here. |
---|
| 5009 | + * If subpage information is not requested, update counters |
---|
| 5010 | + * and skip the same_page loop below. |
---|
4403 | 5011 | */ |
---|
4404 | | - if (unlikely(page_count(page) <= 0)) { |
---|
4405 | | - if (pages) { |
---|
| 5012 | + if (!pages && !vmas && !pfn_offset && |
---|
| 5013 | + (vaddr + huge_page_size(h) < vma->vm_end) && |
---|
| 5014 | + (remainder >= pages_per_huge_page(h))) { |
---|
| 5015 | + vaddr += huge_page_size(h); |
---|
| 5016 | + remainder -= pages_per_huge_page(h); |
---|
| 5017 | + i += pages_per_huge_page(h); |
---|
| 5018 | + spin_unlock(ptl); |
---|
| 5019 | + continue; |
---|
| 5020 | + } |
---|
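
To put numbers on this fast path: with 2 MiB huge pages on 4 KiB base pages (an assumption for the example), pages_per_huge_page() is 512, so a single pass through this branch advances the cursor by a whole huge page and avoids 512 iterations of the same_page loop that follows:

```c
/* Example figures only, assuming a 2 MiB huge page and 4 KiB base pages. */
vaddr     += 2UL << 20;	/* huge_page_size(h)      */
remainder -= 512;	/* pages_per_huge_page(h) */
i         += 512;
```
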
| 5021 | + |
---|
| 5022 | +same_page: |
---|
| 5023 | + if (pages) { |
---|
| 5024 | + pages[i] = mem_map_offset(page, pfn_offset); |
---|
| 5025 | + /* |
---|
| 5026 | + * try_grab_page() should always succeed here, because: |
---|
| 5027 | + * a) we hold the ptl lock, and b) we've just checked |
---|
| 5028 | + * that the huge page is present in the page tables. If |
---|
| 5029 | + * the huge page is present, then the tail pages must |
---|
| 5030 | + * also be present. The ptl prevents the head page and |
---|
| 5031 | + * tail pages from being rearranged in any way. So this |
---|
| 5032 | + * page must be available at this point, unless the page |
---|
| 5033 | + * refcount overflowed: |
---|
| 5034 | + */ |
---|
| 5035 | + if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) { |
---|
4406 | 5036 | spin_unlock(ptl); |
---|
4407 | 5037 | remainder = 0; |
---|
4408 | 5038 | err = -ENOMEM; |
---|
4409 | 5039 | break; |
---|
4410 | 5040 | } |
---|
4411 | | - } |
---|
4412 | | -same_page: |
---|
4413 | | - if (pages) { |
---|
4414 | | - pages[i] = mem_map_offset(page, pfn_offset); |
---|
4415 | | - get_page(pages[i]); |
---|
4416 | 5041 | } |
---|
4417 | 5042 | |
---|
4418 | 5043 | if (vmas) |
---|
.. | .. |
---|
4443 | 5068 | return i ? i : err; |
---|
4444 | 5069 | } |
---|
4445 | 5070 | |
---|
4446 | | -#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE |
---|
4447 | | -/* |
---|
4448 | | - * ARCHes with special requirements for evicting HUGETLB backing TLB entries can |
---|
4449 | | - * implement this. |
---|
4450 | | - */ |
---|
4451 | | -#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) |
---|
4452 | | -#endif |
---|
4453 | | - |
---|
4454 | 5071 | unsigned long hugetlb_change_protection(struct vm_area_struct *vma, |
---|
4455 | 5072 | unsigned long address, unsigned long end, pgprot_t newprot) |
---|
4456 | 5073 | { |
---|
.. | .. |
---|
4460 | 5077 | pte_t pte; |
---|
4461 | 5078 | struct hstate *h = hstate_vma(vma); |
---|
4462 | 5079 | unsigned long pages = 0; |
---|
4463 | | - unsigned long f_start = start; |
---|
4464 | | - unsigned long f_end = end; |
---|
4465 | 5080 | bool shared_pmd = false; |
---|
| 5081 | + struct mmu_notifier_range range; |
---|
4466 | 5082 | |
---|
4467 | 5083 | /* |
---|
4468 | 5084 | * In the case of shared PMDs, the area to flush could be beyond |
---|
4469 | | - * start/end. Set f_start/f_end to cover the maximum possible |
---|
| 5085 | + * start/end. Set range.start/range.end to cover the maximum possible |
---|
4470 | 5086 | * range if PMD sharing is possible. |
---|
4471 | 5087 | */ |
---|
4472 | | - adjust_range_if_pmd_sharing_possible(vma, &f_start, &f_end); |
---|
| 5088 | + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, |
---|
| 5089 | + 0, vma, mm, start, end); |
---|
| 5090 | + adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); |
---|
4473 | 5091 | |
---|
4474 | 5092 | BUG_ON(address >= end); |
---|
4475 | | - flush_cache_range(vma, f_start, f_end); |
---|
| 5093 | + flush_cache_range(vma, range.start, range.end); |
---|
4476 | 5094 | |
---|
4477 | | - mmu_notifier_invalidate_range_start(mm, f_start, f_end); |
---|
| 5095 | + mmu_notifier_invalidate_range_start(&range); |
---|
4478 | 5096 | i_mmap_lock_write(vma->vm_file->f_mapping); |
---|
4479 | 5097 | for (; address < end; address += huge_page_size(h)) { |
---|
4480 | 5098 | spinlock_t *ptl; |
---|
.. | .. |
---|
4482 | 5100 | if (!ptep) |
---|
4483 | 5101 | continue; |
---|
4484 | 5102 | ptl = huge_pte_lock(h, mm, ptep); |
---|
4485 | | - if (huge_pmd_unshare(mm, &address, ptep)) { |
---|
| 5103 | + if (huge_pmd_unshare(mm, vma, &address, ptep)) { |
---|
4486 | 5104 | pages++; |
---|
4487 | 5105 | spin_unlock(ptl); |
---|
4488 | 5106 | shared_pmd = true; |
---|
.. | .. |
---|
4509 | 5127 | continue; |
---|
4510 | 5128 | } |
---|
4511 | 5129 | if (!huge_pte_none(pte)) { |
---|
4512 | | - pte = huge_ptep_get_and_clear(mm, address, ptep); |
---|
4513 | | - pte = pte_mkhuge(huge_pte_modify(pte, newprot)); |
---|
| 5130 | + pte_t old_pte; |
---|
| 5131 | + |
---|
| 5132 | + old_pte = huge_ptep_modify_prot_start(vma, address, ptep); |
---|
| 5133 | + pte = pte_mkhuge(huge_pte_modify(old_pte, newprot)); |
---|
4514 | 5134 | pte = arch_make_huge_pte(pte, vma, NULL, 0); |
---|
4515 | | - set_huge_pte_at(mm, address, ptep, pte); |
---|
| 5135 | + huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); |
---|
4516 | 5136 | pages++; |
---|
4517 | 5137 | } |
---|
4518 | 5138 | spin_unlock(ptl); |
---|
.. | .. |
---|
4525 | 5145 | * did unshare a page of pmds, flush the range corresponding to the pud. |
---|
4526 | 5146 | */ |
---|
4527 | 5147 | if (shared_pmd) |
---|
4528 | | - flush_hugetlb_tlb_range(vma, f_start, f_end); |
---|
| 5148 | + flush_hugetlb_tlb_range(vma, range.start, range.end); |
---|
4529 | 5149 | else |
---|
4530 | 5150 | flush_hugetlb_tlb_range(vma, start, end); |
---|
4531 | 5151 | /* |
---|
.. | .. |
---|
4535 | 5155 | * See Documentation/vm/mmu_notifier.rst |
---|
4536 | 5156 | */ |
---|
4537 | 5157 | i_mmap_unlock_write(vma->vm_file->f_mapping); |
---|
4538 | | - mmu_notifier_invalidate_range_end(mm, f_start, f_end); |
---|
| 5158 | + mmu_notifier_invalidate_range_end(&range); |
---|
4539 | 5159 | |
---|
4540 | 5160 | return pages << h->order; |
---|
4541 | 5161 | } |
---|
.. | .. |
---|
4545 | 5165 | struct vm_area_struct *vma, |
---|
4546 | 5166 | vm_flags_t vm_flags) |
---|
4547 | 5167 | { |
---|
4548 | | - long ret, chg; |
---|
| 5168 | + long ret, chg, add = -1; |
---|
4549 | 5169 | struct hstate *h = hstate_inode(inode); |
---|
4550 | 5170 | struct hugepage_subpool *spool = subpool_inode(inode); |
---|
4551 | 5171 | struct resv_map *resv_map; |
---|
4552 | | - long gbl_reserve; |
---|
| 5172 | + struct hugetlb_cgroup *h_cg = NULL; |
---|
| 5173 | + long gbl_reserve, regions_needed = 0; |
---|
4553 | 5174 | |
---|
4554 | 5175 | /* This should never happen */ |
---|
4555 | 5176 | if (from > to) { |
---|
.. | .. |
---|
4572 | 5193 | * called to make the mapping read-write. Assume !vma is a shm mapping |
---|
4573 | 5194 | */ |
---|
4574 | 5195 | if (!vma || vma->vm_flags & VM_MAYSHARE) { |
---|
| 5196 | + /* |
---|
| 5197 | + * resv_map cannot be NULL as hugetlb_reserve_pages is only |
---|
| 5198 | + * called for inodes for which resv_maps were created (see |
---|
| 5199 | + * hugetlbfs_get_inode). |
---|
| 5200 | + */ |
---|
4575 | 5201 | resv_map = inode_resv_map(inode); |
---|
4576 | 5202 | |
---|
4577 | | - chg = region_chg(resv_map, from, to); |
---|
| 5203 | + chg = region_chg(resv_map, from, to, ®ions_needed); |
---|
4578 | 5204 | |
---|
4579 | 5205 | } else { |
---|
| 5206 | + /* Private mapping. */ |
---|
4580 | 5207 | resv_map = resv_map_alloc(); |
---|
4581 | 5208 | if (!resv_map) |
---|
4582 | 5209 | return -ENOMEM; |
---|
.. | .. |
---|
4592 | 5219 | goto out_err; |
---|
4593 | 5220 | } |
---|
4594 | 5221 | |
---|
| 5222 | + ret = hugetlb_cgroup_charge_cgroup_rsvd( |
---|
| 5223 | + hstate_index(h), chg * pages_per_huge_page(h), &h_cg); |
---|
| 5224 | + |
---|
| 5225 | + if (ret < 0) { |
---|
| 5226 | + ret = -ENOMEM; |
---|
| 5227 | + goto out_err; |
---|
| 5228 | + } |
---|
| 5229 | + |
---|
| 5230 | + if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) { |
---|
| 5231 | + /* For private mappings, the hugetlb_cgroup uncharge info hangs |
---|
| 5232 | + * off the resv_map. |
---|
| 5233 | + */ |
---|
| 5234 | + resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h); |
---|
| 5235 | + } |
---|
| 5236 | + |
---|
4595 | 5237 | /* |
---|
4596 | 5238 | * There must be enough pages in the subpool for the mapping. If |
---|
4597 | 5239 | * the subpool has a minimum size, there may be some global |
---|
.. | .. |
---|
4600 | 5242 | gbl_reserve = hugepage_subpool_get_pages(spool, chg); |
---|
4601 | 5243 | if (gbl_reserve < 0) { |
---|
4602 | 5244 | ret = -ENOSPC; |
---|
4603 | | - goto out_err; |
---|
| 5245 | + goto out_uncharge_cgroup; |
---|
4604 | 5246 | } |
---|
4605 | 5247 | |
---|
4606 | 5248 | /* |
---|
.. | .. |
---|
4609 | 5251 | */ |
---|
4610 | 5252 | ret = hugetlb_acct_memory(h, gbl_reserve); |
---|
4611 | 5253 | if (ret < 0) { |
---|
4612 | | - /* put back original number of pages, chg */ |
---|
4613 | | - (void)hugepage_subpool_put_pages(spool, chg); |
---|
4614 | | - goto out_err; |
---|
| 5254 | + goto out_put_pages; |
---|
4615 | 5255 | } |
---|
4616 | 5256 | |
---|
4617 | 5257 | /* |
---|
.. | .. |
---|
4626 | 5266 | * else has to be done for private mappings here |
---|
4627 | 5267 | */ |
---|
4628 | 5268 | if (!vma || vma->vm_flags & VM_MAYSHARE) { |
---|
4629 | | - long add = region_add(resv_map, from, to); |
---|
| 5269 | + add = region_add(resv_map, from, to, regions_needed, h, h_cg); |
---|
4630 | 5270 | |
---|
4631 | | - if (unlikely(chg > add)) { |
---|
| 5271 | + if (unlikely(add < 0)) { |
---|
| 5272 | + hugetlb_acct_memory(h, -gbl_reserve); |
---|
| 5273 | + ret = add; |
---|
| 5274 | + goto out_put_pages; |
---|
| 5275 | + } else if (unlikely(chg > add)) { |
---|
4632 | 5276 | /* |
---|
4633 | 5277 | * pages in this range were added to the reserve |
---|
4634 | 5278 | * map between region_chg and region_add. This |
---|
.. | .. |
---|
4638 | 5282 | */ |
---|
4639 | 5283 | long rsv_adjust; |
---|
4640 | 5284 | |
---|
| 5285 | + /* |
---|
| 5286 | + * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the |
---|
| 5287 | + * reference to h_cg->css. See comment below for detail. |
---|
| 5288 | + */ |
---|
| 5289 | + hugetlb_cgroup_uncharge_cgroup_rsvd( |
---|
| 5290 | + hstate_index(h), |
---|
| 5291 | + (chg - add) * pages_per_huge_page(h), h_cg); |
---|
| 5292 | + |
---|
4641 | 5293 | rsv_adjust = hugepage_subpool_put_pages(spool, |
---|
4642 | 5294 | chg - add); |
---|
4643 | 5295 | hugetlb_acct_memory(h, -rsv_adjust); |
---|
| 5296 | + } else if (h_cg) { |
---|
| 5297 | + /* |
---|
| 5298 | + * The file_regions will hold their own reference to |
---|
| 5299 | + * h_cg->css. So we should release the reference held |
---|
| 5300 | + * via hugetlb_cgroup_charge_cgroup_rsvd() when we are |
---|
| 5301 | + * done. |
---|
| 5302 | + */ |
---|
| 5303 | + hugetlb_cgroup_put_rsvd_cgroup(h_cg); |
---|
4644 | 5304 | } |
---|
4645 | 5305 | } |
---|
4646 | 5306 | return 0; |
---|
| 5307 | +out_put_pages: |
---|
| 5308 | + /* put back original number of pages, chg */ |
---|
| 5309 | + (void)hugepage_subpool_put_pages(spool, chg); |
---|
| 5310 | +out_uncharge_cgroup: |
---|
| 5311 | + hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), |
---|
| 5312 | + chg * pages_per_huge_page(h), h_cg); |
---|
4647 | 5313 | out_err: |
---|
4648 | 5314 | if (!vma || vma->vm_flags & VM_MAYSHARE) |
---|
4649 | | - /* Don't call region_abort if region_chg failed */ |
---|
4650 | | - if (chg >= 0) |
---|
4651 | | - region_abort(resv_map, from, to); |
---|
| 5315 | + /* Only call region_abort if the region_chg succeeded but the |
---|
| 5316 | + * region_add failed or didn't run. |
---|
| 5317 | + */ |
---|
| 5318 | + if (chg >= 0 && add < 0) |
---|
| 5319 | + region_abort(resv_map, from, to, regions_needed); |
---|
4652 | 5320 | if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) |
---|
4653 | 5321 | kref_put(&resv_map->refs, resv_map_release); |
---|
4654 | 5322 | return ret; |
---|
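
As a worked (hypothetical) example of the chg > add adjustment above: if region_chg() predicted chg = 10 new huge pages but a racing task reserved two of them before region_add() ran, then add = 8 and the excess is handed back:

```c
/* Hypothetical numbers, illustrating the 'chg > add' branch only. */
long chg = 10, add = 8;		/* prediction vs. what region_add() recorded */
long excess = chg - add;	/* 2 huge pages were double-counted          */
long rsv_adjust;

hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
				    excess * pages_per_huge_page(h), h_cg);
rsv_adjust = hugepage_subpool_put_pages(spool, excess);
hugetlb_acct_memory(h, -rsv_adjust);	/* shrink the global reserve accordingly */
```
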
.. | .. |
---|
4663 | 5331 | struct hugepage_subpool *spool = subpool_inode(inode); |
---|
4664 | 5332 | long gbl_reserve; |
---|
4665 | 5333 | |
---|
| 5334 | + /* |
---|
| 5335 | + * Since this routine can be called in the evict inode path for all |
---|
| 5336 | + * hugetlbfs inodes, resv_map could be NULL. |
---|
| 5337 | + */ |
---|
4666 | 5338 | if (resv_map) { |
---|
4667 | 5339 | chg = region_del(resv_map, start, end); |
---|
4668 | 5340 | /* |
---|
.. | .. |
---|
4727 | 5399 | return false; |
---|
4728 | 5400 | } |
---|
4729 | 5401 | |
---|
| 5402 | +bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) |
---|
| 5403 | +{ |
---|
| 5404 | +#ifdef CONFIG_USERFAULTFD |
---|
| 5405 | + if (uffd_disable_huge_pmd_share(vma)) |
---|
| 5406 | + return false; |
---|
| 5407 | +#endif |
---|
| 5408 | + return vma_shareable(vma, addr); |
---|
| 5409 | +} |
---|
| 5410 | + |
---|
4730 | 5411 | /* |
---|
4731 | 5412 | * Determine if start,end range within vma could be mapped by shared pmd. |
---|
4732 | 5413 | * If yes, adjust start and end to cover range associated with possible |
---|
.. | .. |
---|
4758 | 5439 | * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() |
---|
4759 | 5440 | * and returns the corresponding pte. While this is not necessary for the |
---|
4760 | 5441 | * !shared pmd case because we can allocate the pmd later as well, it makes the |
---|
4761 | | - * code much cleaner. pmd allocation is essential for the shared case because |
---|
4762 | | - * pud has to be populated inside the same i_mmap_rwsem section - otherwise |
---|
4763 | | - * racing tasks could either miss the sharing (see huge_pte_offset) or select a |
---|
4764 | | - * bad pmd for sharing. |
---|
| 5442 | + * code much cleaner. |
---|
| 5443 | + * |
---|
| 5444 | + * This routine must be called with i_mmap_rwsem held in at least read mode if |
---|
| 5445 | + * sharing is possible. For hugetlbfs, this prevents removal of any page |
---|
| 5446 | + * table entries associated with the address space. This is important as we |
---|
| 5447 | + * are setting up sharing based on existing page table entries (mappings). |
---|
| 5448 | + * |
---|
| 5449 | + * NOTE: This routine is only called from huge_pte_alloc. Some callers of |
---|
| 5450 | + * huge_pte_alloc know that sharing is not possible and do not take |
---|
| 5451 | + * i_mmap_rwsem as a performance optimization. This is handled by the |
---|
| 5452 | + * want_pmd_share() check in huge_pte_alloc before this routine is called. |
---|
| 5453 | + * i_mmap_rwsem is only required for subsequent processing. |
---|
4765 | 5454 | */ |
---|
4766 | | -pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) |
---|
| 5455 | +pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, |
---|
| 5456 | + unsigned long addr, pud_t *pud) |
---|
4767 | 5457 | { |
---|
4768 | | - struct vm_area_struct *vma = find_vma(mm, addr); |
---|
4769 | 5458 | struct address_space *mapping = vma->vm_file->f_mapping; |
---|
4770 | 5459 | pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + |
---|
4771 | 5460 | vma->vm_pgoff; |
---|
.. | .. |
---|
4775 | 5464 | pte_t *pte; |
---|
4776 | 5465 | spinlock_t *ptl; |
---|
4777 | 5466 | |
---|
4778 | | - if (!vma_shareable(vma, addr)) |
---|
4779 | | - return (pte_t *)pmd_alloc(mm, pud, addr); |
---|
4780 | | - |
---|
4781 | | - i_mmap_lock_write(mapping); |
---|
| 5467 | + i_mmap_assert_locked(mapping); |
---|
4782 | 5468 | vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { |
---|
4783 | 5469 | if (svma == vma) |
---|
4784 | 5470 | continue; |
---|
.. | .. |
---|
4808 | 5494 | spin_unlock(ptl); |
---|
4809 | 5495 | out: |
---|
4810 | 5496 | pte = (pte_t *)pmd_alloc(mm, pud, addr); |
---|
4811 | | - i_mmap_unlock_write(mapping); |
---|
4812 | 5497 | return pte; |
---|
4813 | 5498 | } |
---|
4814 | 5499 | |
---|
.. | .. |
---|
4819 | 5504 | * indicated by page_count > 1, unmap is achieved by clearing pud and |
---|
4820 | 5505 | * decrementing the ref count. If count == 1, the pte page is not shared. |
---|
4821 | 5506 | * |
---|
4822 | | - * called with page table lock held. |
---|
| 5507 | + * Called with page table lock held and i_mmap_rwsem held in write mode. |
---|
4823 | 5508 | * |
---|
4824 | 5509 | * returns: 1 successfully unmapped a shared pte page |
---|
4825 | 5510 | * 0 the underlying pte page is not shared, or it is the last user |
---|
4826 | 5511 | */ |
---|
4827 | | -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) |
---|
| 5512 | +int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, |
---|
| 5513 | + unsigned long *addr, pte_t *ptep) |
---|
4828 | 5514 | { |
---|
4829 | 5515 | pgd_t *pgd = pgd_offset(mm, *addr); |
---|
4830 | 5516 | p4d_t *p4d = p4d_offset(pgd, *addr); |
---|
4831 | 5517 | pud_t *pud = pud_offset(p4d, *addr); |
---|
4832 | 5518 | |
---|
| 5519 | + i_mmap_assert_write_locked(vma->vm_file->f_mapping); |
---|
4833 | 5520 | BUG_ON(page_count(virt_to_page(ptep)) == 0); |
---|
4834 | 5521 | if (page_count(virt_to_page(ptep)) == 1) |
---|
4835 | 5522 | return 0; |
---|
.. | .. |
---|
4837 | 5524 | pud_clear(pud); |
---|
4838 | 5525 | put_page(virt_to_page(ptep)); |
---|
4839 | 5526 | mm_dec_nr_pmds(mm); |
---|
4840 | | - *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; |
---|
| 5527 | + /* |
---|
| 5528 | + * This update of passed address optimizes loops sequentially |
---|
| 5529 | + * processing addresses in increments of huge page size (PMD_SIZE |
---|
| 5530 | + * in this case). By clearing the pud, a PUD_SIZE area is unmapped. |
---|
| 5531 | + * Update address to the 'last page' in the cleared area so that |
---|
| 5532 | + * the calling loop can move to the first page past this area. |
---|
| 5533 | + */ |
---|
| 5534 | + *addr |= PUD_SIZE - PMD_SIZE; |
---|
4841 | 5535 | return 1; |
---|
4842 | 5536 | } |
---|
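
A worked example of the address update above, assuming x86-64 geometry (PMD_SIZE = 2 MiB, PUD_SIZE = 1 GiB) purely for illustration:

```c
/* Assumes PMD_SIZE = 2 MiB and PUD_SIZE = 1 GiB (x86-64), for illustration only. */
unsigned long addr = 0x40200000UL;	/* a PMD-aligned address inside [1 GiB, 2 GiB) */

addr |= (1UL << 30) - (1UL << 21);	/* PUD_SIZE - PMD_SIZE                         */
/*
 * addr is now 0x7fe00000, the last PMD-sized page of that 1 GiB area, so a
 * caller stepping by huge_page_size() next visits 0x80000000 -- the first
 * page past the region whose page tables were just unshared.
 */
```
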
4843 | | -#define want_pmd_share() (1) |
---|
| 5537 | + |
---|
4844 | 5538 | #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ |
---|
4845 | | -pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) |
---|
| 5539 | +pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, |
---|
| 5540 | + unsigned long addr, pud_t *pud) |
---|
4846 | 5541 | { |
---|
4847 | 5542 | return NULL; |
---|
4848 | 5543 | } |
---|
4849 | 5544 | |
---|
4850 | | -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) |
---|
| 5545 | +int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, |
---|
| 5546 | + unsigned long *addr, pte_t *ptep) |
---|
4851 | 5547 | { |
---|
4852 | 5548 | return 0; |
---|
4853 | 5549 | } |
---|
.. | .. |
---|
4856 | 5552 | unsigned long *start, unsigned long *end) |
---|
4857 | 5553 | { |
---|
4858 | 5554 | } |
---|
4859 | | -#define want_pmd_share() (0) |
---|
| 5555 | + |
---|
| 5556 | +bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) |
---|
| 5557 | +{ |
---|
| 5558 | + return false; |
---|
| 5559 | +} |
---|
4860 | 5560 | #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ |
---|
4861 | 5561 | |
---|
4862 | 5562 | #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB |
---|
4863 | | -pte_t *huge_pte_alloc(struct mm_struct *mm, |
---|
| 5563 | +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, |
---|
4864 | 5564 | unsigned long addr, unsigned long sz) |
---|
4865 | 5565 | { |
---|
4866 | 5566 | pgd_t *pgd; |
---|
.. | .. |
---|
4878 | 5578 | pte = (pte_t *)pud; |
---|
4879 | 5579 | } else { |
---|
4880 | 5580 | BUG_ON(sz != PMD_SIZE); |
---|
4881 | | - if (want_pmd_share() && pud_none(*pud)) |
---|
4882 | | - pte = huge_pmd_share(mm, addr, pud); |
---|
| 5581 | + if (want_pmd_share(vma, addr) && pud_none(*pud)) |
---|
| 5582 | + pte = huge_pmd_share(mm, vma, addr, pud); |
---|
4883 | 5583 | else |
---|
4884 | 5584 | pte = (pte_t *)pmd_alloc(mm, pud, addr); |
---|
4885 | 5585 | } |
---|
.. | .. |
---|
4893 | 5593 | * huge_pte_offset() - Walk the page table to resolve the hugepage |
---|
4894 | 5594 | * entry at address @addr |
---|
4895 | 5595 | * |
---|
4896 | | - * Return: Pointer to page table or swap entry (PUD or PMD) for |
---|
4897 | | - * address @addr, or NULL if a p*d_none() entry is encountered and the |
---|
| 5596 | + * Return: Pointer to page table entry (PUD or PMD) for |
---|
| 5597 | + * address @addr, or NULL if a !p*d_present() entry is encountered and the |
---|
4898 | 5598 | * size @sz doesn't match the hugepage size at this level of the page |
---|
4899 | 5599 | * table. |
---|
4900 | 5600 | */ |
---|
.. | .. |
---|
4903 | 5603 | { |
---|
4904 | 5604 | pgd_t *pgd; |
---|
4905 | 5605 | p4d_t *p4d; |
---|
4906 | | - pud_t *pud, pud_entry; |
---|
4907 | | - pmd_t *pmd, pmd_entry; |
---|
| 5606 | + pud_t *pud; |
---|
| 5607 | + pmd_t *pmd; |
---|
4908 | 5608 | |
---|
4909 | 5609 | pgd = pgd_offset(mm, addr); |
---|
4910 | 5610 | if (!pgd_present(*pgd)) |
---|
.. | .. |
---|
4914 | 5614 | return NULL; |
---|
4915 | 5615 | |
---|
4916 | 5616 | pud = pud_offset(p4d, addr); |
---|
4917 | | - pud_entry = READ_ONCE(*pud); |
---|
4918 | | - if (sz != PUD_SIZE && pud_none(pud_entry)) |
---|
4919 | | - return NULL; |
---|
4920 | | - /* hugepage or swap? */ |
---|
4921 | | - if (pud_huge(pud_entry) || !pud_present(pud_entry)) |
---|
| 5617 | + if (sz == PUD_SIZE) |
---|
| 5618 | + /* must be pud huge, non-present or none */ |
---|
4922 | 5619 | return (pte_t *)pud; |
---|
| 5620 | + if (!pud_present(*pud)) |
---|
| 5621 | + return NULL; |
---|
| 5622 | + /* must have a valid entry and size to go further */ |
---|
4923 | 5623 | |
---|
4924 | 5624 | pmd = pmd_offset(pud, addr); |
---|
4925 | | - pmd_entry = READ_ONCE(*pmd); |
---|
4926 | | - if (sz != PMD_SIZE && pmd_none(pmd_entry)) |
---|
4927 | | - return NULL; |
---|
4928 | | - /* hugepage or swap? */ |
---|
4929 | | - if (pmd_huge(pmd_entry) || !pmd_present(pmd_entry)) |
---|
4930 | | - return (pte_t *)pmd; |
---|
4931 | | - |
---|
4932 | | - return NULL; |
---|
| 5625 | + /* must be pmd huge, non-present or none */ |
---|
| 5626 | + return (pte_t *)pmd; |
---|
4933 | 5627 | } |
---|
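
Because the reworked helper can now return a pointer to a non-present entry (for example a migration or hwpoison swap entry) at the matching level, callers must inspect the entry before treating it as a mapped page. A hedged sketch of the expected caller pattern, mirroring hugetlb_fault() earlier in this diff:

```c
/* Caller-side sketch only; hugetlb_fault() above is the real reference. */
ptep = huge_pte_offset(mm, addr, huge_page_size(h));
if (ptep) {
	pte_t entry = huge_ptep_get(ptep);

	if (is_hugetlb_entry_migration(entry))
		migration_entry_wait_huge(vma, mm, ptep);	/* not mapped yet   */
	else if (is_hugetlb_entry_hwpoisoned(entry))
		;						/* poisoned, bail   */
	else if (pte_present(entry))
		;						/* safe to use page */
}
```
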
4934 | 5628 | |
---|
4935 | 5629 | #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ |
---|
.. | .. |
---|
4954 | 5648 | } |
---|
4955 | 5649 | |
---|
4956 | 5650 | struct page * __weak |
---|
4957 | | -follow_huge_pmd(struct mm_struct *mm, unsigned long address, |
---|
4958 | | - pmd_t *pmd, int flags) |
---|
| 5651 | +follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags) |
---|
4959 | 5652 | { |
---|
| 5653 | + struct hstate *h = hstate_vma(vma); |
---|
| 5654 | + struct mm_struct *mm = vma->vm_mm; |
---|
4960 | 5655 | struct page *page = NULL; |
---|
4961 | 5656 | spinlock_t *ptl; |
---|
4962 | | - pte_t pte; |
---|
| 5657 | + pte_t *ptep, pte; |
---|
| 5658 | + |
---|
| 5659 | + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ |
---|
| 5660 | + if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == |
---|
| 5661 | + (FOLL_PIN | FOLL_GET))) |
---|
| 5662 | + return NULL; |
---|
| 5663 | + |
---|
4963 | 5664 | retry: |
---|
4964 | | - ptl = pmd_lockptr(mm, pmd); |
---|
4965 | | - spin_lock(ptl); |
---|
4966 | | - /* |
---|
4967 | | - * make sure that the address range covered by this pmd is not |
---|
4968 | | - * unmapped from other threads. |
---|
4969 | | - */ |
---|
4970 | | - if (!pmd_huge(*pmd)) |
---|
4971 | | - goto out; |
---|
4972 | | - pte = huge_ptep_get((pte_t *)pmd); |
---|
| 5665 | + ptep = huge_pte_offset(mm, address, huge_page_size(h)); |
---|
| 5666 | + if (!ptep) |
---|
| 5667 | + return NULL; |
---|
| 5668 | + |
---|
| 5669 | + ptl = huge_pte_lock(h, mm, ptep); |
---|
| 5670 | + pte = huge_ptep_get(ptep); |
---|
4973 | 5671 | if (pte_present(pte)) { |
---|
4974 | | - page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); |
---|
4975 | | - if (flags & FOLL_GET) |
---|
4976 | | - get_page(page); |
---|
| 5672 | + page = pte_page(pte) + |
---|
| 5673 | + ((address & ~huge_page_mask(h)) >> PAGE_SHIFT); |
---|
| 5674 | + /* |
---|
| 5675 | + * try_grab_page() should always succeed here, because: a) we |
---|
| 5676 | + * hold the pmd (ptl) lock, and b) we've just checked that the |
---|
| 5677 | + * huge pmd (head) page is present in the page tables. The ptl |
---|
| 5678 | + * prevents the head page and tail pages from being rearranged |
---|
| 5679 | + * in any way. So this page must be available at this point, |
---|
| 5680 | + * unless the page refcount overflowed: |
---|
| 5681 | + */ |
---|
| 5682 | + if (WARN_ON_ONCE(!try_grab_page(page, flags))) { |
---|
| 5683 | + page = NULL; |
---|
| 5684 | + goto out; |
---|
| 5685 | + } |
---|
4977 | 5686 | } else { |
---|
4978 | 5687 | if (is_hugetlb_entry_migration(pte)) { |
---|
4979 | 5688 | spin_unlock(ptl); |
---|
4980 | | - __migration_entry_wait(mm, (pte_t *)pmd, ptl); |
---|
| 5689 | + __migration_entry_wait(mm, ptep, ptl); |
---|
4981 | 5690 | goto retry; |
---|
4982 | 5691 | } |
---|
4983 | 5692 | /* |
---|
.. | .. |
---|
4994 | 5703 | follow_huge_pud(struct mm_struct *mm, unsigned long address, |
---|
4995 | 5704 | pud_t *pud, int flags) |
---|
4996 | 5705 | { |
---|
4997 | | - if (flags & FOLL_GET) |
---|
| 5706 | + if (flags & (FOLL_GET | FOLL_PIN)) |
---|
4998 | 5707 | return NULL; |
---|
4999 | 5708 | |
---|
5000 | 5709 | return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); |
---|
.. | .. |
---|
5003 | 5712 | struct page * __weak |
---|
5004 | 5713 | follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags) |
---|
5005 | 5714 | { |
---|
5006 | | - if (flags & FOLL_GET) |
---|
| 5715 | + if (flags & (FOLL_GET | FOLL_PIN)) |
---|
5007 | 5716 | return NULL; |
---|
5008 | 5717 | |
---|
5009 | 5718 | return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT); |
---|
5010 | 5719 | } |
---|
5011 | 5720 | |
---|
5012 | | -bool isolate_huge_page(struct page *page, struct list_head *list) |
---|
| 5721 | +int isolate_hugetlb(struct page *page, struct list_head *list) |
---|
5013 | 5722 | { |
---|
5014 | | - bool ret = true; |
---|
| 5723 | + int ret = 0; |
---|
5015 | 5724 | |
---|
5016 | 5725 | spin_lock(&hugetlb_lock); |
---|
5017 | 5726 | if (!PageHeadHuge(page) || !page_huge_active(page) || |
---|
5018 | 5727 | !get_page_unless_zero(page)) { |
---|
5019 | | - ret = false; |
---|
| 5728 | + ret = -EBUSY; |
---|
5020 | 5729 | goto unlock; |
---|
5021 | 5730 | } |
---|
5022 | 5731 | clear_page_huge_active(page); |
---|
.. | .. |
---|
5068 | 5777 | spin_unlock(&hugetlb_lock); |
---|
5069 | 5778 | } |
---|
5070 | 5779 | } |
---|
| 5780 | + |
---|
| 5781 | +static void hugetlb_unshare_pmds(struct vm_area_struct *vma, |
---|
| 5782 | + unsigned long start, |
---|
| 5783 | + unsigned long end) |
---|
| 5784 | +{ |
---|
| 5785 | + struct hstate *h = hstate_vma(vma); |
---|
| 5786 | + unsigned long sz = huge_page_size(h); |
---|
| 5787 | + struct mm_struct *mm = vma->vm_mm; |
---|
| 5788 | + struct mmu_notifier_range range; |
---|
| 5789 | + unsigned long address; |
---|
| 5790 | + spinlock_t *ptl; |
---|
| 5791 | + pte_t *ptep; |
---|
| 5792 | + |
---|
| 5793 | + if (!(vma->vm_flags & VM_MAYSHARE)) |
---|
| 5794 | + return; |
---|
| 5795 | + |
---|
| 5796 | + if (start >= end) |
---|
| 5797 | + return; |
---|
| 5798 | + |
---|
| 5799 | + flush_cache_range(vma, start, end); |
---|
| 5800 | + /* |
---|
| 5801 | + * No need to call adjust_range_if_pmd_sharing_possible(), because |
---|
| 5802 | + * we have already done the PUD_SIZE alignment. |
---|
| 5803 | + */ |
---|
| 5804 | + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, |
---|
| 5805 | + start, end); |
---|
| 5806 | + mmu_notifier_invalidate_range_start(&range); |
---|
| 5807 | + i_mmap_lock_write(vma->vm_file->f_mapping); |
---|
| 5808 | + for (address = start; address < end; address += PUD_SIZE) { |
---|
| 5809 | + unsigned long tmp = address; |
---|
| 5810 | + |
---|
| 5811 | + ptep = huge_pte_offset(mm, address, sz); |
---|
| 5812 | + if (!ptep) |
---|
| 5813 | + continue; |
---|
| 5814 | + ptl = huge_pte_lock(h, mm, ptep); |
---|
| 5815 | + /* We don't want 'address' to be changed */ |
---|
| 5816 | + huge_pmd_unshare(mm, vma, &tmp, ptep); |
---|
| 5817 | + spin_unlock(ptl); |
---|
| 5818 | + } |
---|
| 5819 | + flush_hugetlb_tlb_range(vma, start, end); |
---|
| 5820 | + i_mmap_unlock_write(vma->vm_file->f_mapping); |
---|
| 5821 | + /* |
---|
| 5822 | + * No need to call mmu_notifier_invalidate_range(), see |
---|
| 5823 | + * Documentation/vm/mmu_notifier.rst. |
---|
| 5824 | + */ |
---|
| 5825 | + mmu_notifier_invalidate_range_end(&range); |
---|
| 5826 | +} |
---|
| 5827 | + |
---|
| 5828 | +/* |
---|
| 5829 | + * This function will unconditionally remove all the shared pmd pgtable entries |
---|
| 5830 | + * within the specific vma for a hugetlbfs memory range. |
---|
| 5831 | + */ |
---|
| 5832 | +void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) |
---|
| 5833 | +{ |
---|
| 5834 | + hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE), |
---|
| 5835 | + ALIGN_DOWN(vma->vm_end, PUD_SIZE)); |
---|
| 5836 | +} |
---|
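
Only spans that are PUD-aligned on both ends can hold shared PMD page tables, hence the inward rounding above; if the VMA covers less than one aligned PUD_SIZE unit, start ends up >= end and hugetlb_unshare_pmds() returns without doing anything. A hypothetical example, assuming PUD_SIZE = 1 GiB:

```c
/* Hypothetical VMA, assuming PUD_SIZE = 1 GiB for the arithmetic. */
unsigned long vm_start = 0x7f8012345000UL;		/* not 1 GiB aligned */
unsigned long vm_end   = 0x7f80d2345000UL;		/* 3 GiB later       */

unsigned long start = ALIGN(vm_start, 1UL << 30);	/* 0x7f8040000000    */
unsigned long end   = ALIGN_DOWN(vm_end, 1UL << 30);	/* 0x7f80c0000000    */
/* Only the 2 GiB aligned middle of the mapping is eligible for unsharing. */
```
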
| 5837 | + |
---|
| 5838 | +#ifdef CONFIG_CMA |
---|
| 5839 | +static bool cma_reserve_called __initdata; |
---|
| 5840 | + |
---|
| 5841 | +static int __init cmdline_parse_hugetlb_cma(char *p) |
---|
| 5842 | +{ |
---|
| 5843 | + hugetlb_cma_size = memparse(p, &p); |
---|
| 5844 | + return 0; |
---|
| 5845 | +} |
---|
| 5846 | + |
---|
| 5847 | +early_param("hugetlb_cma", cmdline_parse_hugetlb_cma); |
---|
| 5848 | + |
---|
| 5849 | +void __init hugetlb_cma_reserve(int order) |
---|
| 5850 | +{ |
---|
| 5851 | + unsigned long size, reserved, per_node; |
---|
| 5852 | + int nid; |
---|
| 5853 | + |
---|
| 5854 | + cma_reserve_called = true; |
---|
| 5855 | + |
---|
| 5856 | + if (!hugetlb_cma_size) |
---|
| 5857 | + return; |
---|
| 5858 | + |
---|
| 5859 | + if (hugetlb_cma_size < (PAGE_SIZE << order)) { |
---|
| 5860 | + pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n", |
---|
| 5861 | + (PAGE_SIZE << order) / SZ_1M); |
---|
| 5862 | + return; |
---|
| 5863 | + } |
---|
| 5864 | + |
---|
| 5865 | + /* |
---|
| 5866 | + * If a 3 GB area is requested on a machine with 4 NUMA nodes, |
---|
| 5867 | + * allocate 1 GB on each of the first three nodes and ignore the last one. |
---|
| 5868 | + */ |
---|
| 5869 | + per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes); |
---|
| 5870 | + pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n", |
---|
| 5871 | + hugetlb_cma_size / SZ_1M, per_node / SZ_1M); |
---|
| 5872 | + |
---|
| 5873 | + reserved = 0; |
---|
| 5874 | + for_each_node_state(nid, N_ONLINE) { |
---|
| 5875 | + int res; |
---|
| 5876 | + char name[CMA_MAX_NAME]; |
---|
| 5877 | + |
---|
| 5878 | + size = min(per_node, hugetlb_cma_size - reserved); |
---|
| 5879 | + size = round_up(size, PAGE_SIZE << order); |
---|
| 5880 | + |
---|
| 5881 | + snprintf(name, sizeof(name), "hugetlb%d", nid); |
---|
| 5882 | + res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order, |
---|
| 5883 | + 0, false, name, |
---|
| 5884 | + &hugetlb_cma[nid], nid); |
---|
| 5885 | + if (res) { |
---|
| 5886 | + pr_warn("hugetlb_cma: reservation failed: err %d, node %d\n", |
---|
| 5887 | + res, nid); |
---|
| 5888 | + continue; |
---|
| 5889 | + } |
---|
| 5890 | + |
---|
| 5891 | + reserved += size; |
---|
| 5892 | + pr_info("hugetlb_cma: reserved %lu MiB on node %d\n", |
---|
| 5893 | + size / SZ_1M, nid); |
---|
| 5894 | + |
---|
| 5895 | + if (reserved >= hugetlb_cma_size) |
---|
| 5896 | + break; |
---|
| 5897 | + } |
---|
| 5898 | +} |
---|
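
For illustration, a hypothetical boot-time use of this option: memparse() accepts the usual K/M/G suffixes, and the per-node split works out as described in the comment above.

```c
/*
 * Example boot command line (hypothetical):
 *     hugetlb_cma=2G
 *
 * On a 4-node machine with 1 GiB gigantic pages, per_node is
 * DIV_ROUND_UP(2G, 4) = 512 MiB, which round_up() then raises to one 1 GiB
 * gigantic page per node; after two nodes the 2 GiB target is met and the
 * loop breaks early, so nodes 0 and 1 each get a 1 GiB CMA area.
 */
```
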
| 5899 | + |
---|
| 5900 | +void __init hugetlb_cma_check(void) |
---|
| 5901 | +{ |
---|
| 5902 | + if (!hugetlb_cma_size || cma_reserve_called) |
---|
| 5903 | + return; |
---|
| 5904 | + |
---|
| 5905 | + pr_warn("hugetlb_cma: the option isn't supported by current arch\n"); |
---|
| 5906 | +} |
---|
| 5907 | + |
---|
| 5908 | +#endif /* CONFIG_CMA */ |
---|