@@ -36,39 +36,41 @@
 
 #define pmd_read_atomic pmd_read_atomic
 /*
- * pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with
- * a "*pmdp" dereference done by gcc. Problem is, in certain places
- * where pte_offset_map_lock is called, concurrent page faults are
- * allowed, if the mmap_sem is hold for reading. An example is mincore
+ * pte_offset_map_lock() on 32-bit PAE kernels was reading the pmd_t with
+ * a "*pmdp" dereference done by GCC. The problem is that in certain places
+ * where pte_offset_map_lock() is called, concurrent page faults are
+ * allowed, if the mmap_lock is held for reading. An example is mincore
  * vs page faults vs MADV_DONTNEED. On the page fault side
- * pmd_populate rightfully does a set_64bit, but if we're reading the
+ * pmd_populate() rightfully does a set_64bit(), but if we're reading the
  * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen
- * because gcc will not read the 64bit of the pmd atomically. To fix
- * this all places running pmd_offset_map_lock() while holding the
- * mmap_sem in read mode, shall read the pmdp pointer using this
- * function to know if the pmd is null nor not, and in turn to know if
- * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd
+ * because GCC will not read the 64-bit value of the pmd atomically.
+ *
+ * To fix this, all places running pte_offset_map_lock() while holding
+ * the mmap_lock in read mode shall read the pmdp pointer using this
+ * function to know if the pmd is null or not, and in turn to know if
+ * they can run pte_offset_map_lock() or pmd_trans_huge() or other pmd
  * operations.
  *
- * Without THP if the mmap_sem is hold for reading, the pmd can only
- * transition from null to not null while pmd_read_atomic runs. So
+ * Without THP, if the mmap_lock is held for reading, the pmd can only
+ * transition from null to not null while pmd_read_atomic() runs. So
  * we can always return atomic pmd values with this function.
  *
- * With THP if the mmap_sem is hold for reading, the pmd can become
+ * With THP, if the mmap_lock is held for reading, the pmd can become
  * trans_huge or none or point to a pte (and in turn become "stable")
- * at any time under pmd_read_atomic. We could read it really
- * atomically here with a atomic64_read for the THP enabled case (and
+ * at any time under pmd_read_atomic(). We could read it truly
+ * atomically here with an atomic64_read() for the THP enabled case (and
  * it would be a whole lot simpler), but to avoid using cmpxchg8b we
  * only return an atomic pmdval if the low part of the pmdval is later
- * found stable (i.e. pointing to a pte). And we're returning a none
- * pmdval if the low part of the pmd is none. In some cases the high
- * and low part of the pmdval returned may not be consistent if THP is
- * enabled (the low part may point to previously mapped hugepage,
- * while the high part may point to a more recently mapped hugepage),
- * but pmd_none_or_trans_huge_or_clear_bad() only needs the low part
- * of the pmd to be read atomically to decide if the pmd is unstable
- * or not, with the only exception of when the low part of the pmd is
- * zero in which case we return a none pmd.
+ * found to be stable (i.e. pointing to a pte). We are also returning a
+ * 'none' (zero) pmdval if the low part of the pmd is zero.
+ *
+ * In some cases the high and low parts of the pmdval returned may not be
+ * consistent if THP is enabled (the low part may point to a previously
+ * mapped hugepage, while the high part may point to a more recently
+ * mapped hugepage), but pmd_none_or_trans_huge_or_clear_bad() only
+ * needs the low part of the pmd to be read atomically to decide if the
+ * pmd is unstable or not, with the only exception being when the low part
+ * of the pmd is zero, in which case we return a 'none' pmd.
  */
 static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
 {
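The body of pmd_read_atomic() lies outside this hunk. As review context, here is a minimal, self-contained userspace sketch of the trick the comment describes, assuming a PAE-style entry split into two 32-bit halves with no cheap atomic 64-bit load; pae_entry and read_entry_atomic are illustrative names, not kernel API, and a C11 acquire load stands in for the kernel's smp_rmb():

#include <stdatomic.h>
#include <stdint.h>

/* Hypothetical stand-in for a PAE pmd: two 32-bit halves. */
struct pae_entry {
	_Atomic uint32_t low;	/* populated last by the writer */
	_Atomic uint32_t high;
};

static uint64_t read_entry_atomic(const struct pae_entry *e)
{
	/* The acquire load orders the high-half read after this one. */
	uint64_t ret = atomic_load_explicit(&e->low, memory_order_acquire);

	/*
	 * A zero low half means a 'none' entry: return it without ever
	 * reading the high half, which may belong to a concurrent
	 * populate.  Otherwise the high half read here is at least as
	 * recent as the low half we are about to trust.
	 */
	if (ret)
		ret |= (uint64_t)atomic_load_explicit(&e->high,
						      memory_order_relaxed) << 32;
	return ret;
}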
@@ -284,53 +286,6 @@
 
 #define __pte_to_swp_entry(pte)	(__swp_entry(__pteval_swp_type(pte), \
 					 __pteval_swp_offset(pte)))
-
-#define gup_get_pte gup_get_pte
-/*
- * WARNING: only to be used in the get_user_pages_fast() implementation.
- *
- * With get_user_pages_fast(), we walk down the pagetables without taking
- * any locks. For this we would like to load the pointers atomically,
- * but that is not possible (without expensive cmpxchg8b) on PAE. What
- * we do have is the guarantee that a PTE will only either go from not
- * present to present, or present to not present or both -- it will not
- * switch to a completely different present page without a TLB flush in
- * between; something that we are blocking by holding interrupts off.
- *
- * Setting ptes from not present to present goes:
- *
- *   ptep->pte_high = h;
- *   smp_wmb();
- *   ptep->pte_low = l;
- *
- * And present to not present goes:
- *
- *   ptep->pte_low = 0;
- *   smp_wmb();
- *   ptep->pte_high = 0;
- *
- * We must ensure here that the load of pte_low sees 'l' iff pte_high
- * sees 'h'. We load pte_high *after* loading pte_low, which ensures we
- * don't see an older value of pte_high. *Then* we recheck pte_low,
- * which ensures that we haven't picked up a changed pte high. We might
- * have gotten rubbish values from pte_low and pte_high, but we are
- * guaranteed that pte_low will not have the present bit set *unless*
- * it is 'l'. Because get_user_pages_fast() only operates on present ptes
- * we're safe.
- */
-static inline pte_t gup_get_pte(pte_t *ptep)
-{
-	pte_t pte;
-
-	do {
-		pte.pte_low = ptep->pte_low;
-		smp_rmb();
-		pte.pte_high = ptep->pte_high;
-		smp_rmb();
-	} while (unlikely(pte.pte_low != ptep->pte_low));
-
-	return pte;
-}
 
 #include <asm/pgtable-invert.h>
 
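For reference while reviewing this removal, here is a minimal, self-contained sketch of the store/load protocol the deleted comment documents, with C11 atomics standing in for smp_wmb()/smp_rmb(); split_pte, set_present and get_lockless are illustrative names, not kernel API:

#include <stdatomic.h>
#include <stdint.h>

struct split_pte {
	_Atomic uint32_t low;	/* carries the present bit */
	_Atomic uint32_t high;
};

/* Writer, not present -> present: high half first, low half last
 * (release ordering plays the role of smp_wmb()). */
static void set_present(struct split_pte *p, uint32_t h, uint32_t l)
{
	atomic_store_explicit(&p->high, h, memory_order_relaxed);
	atomic_store_explicit(&p->low, l, memory_order_release);
}

/* Reader: retry until the low half is stable around the high-half load
 * (the acquire loads play the role of the two smp_rmb()s). */
static uint64_t get_lockless(struct split_pte *p)
{
	uint32_t low, high;

	do {
		low  = atomic_load_explicit(&p->low, memory_order_acquire);
		high = atomic_load_explicit(&p->high, memory_order_acquire);
	} while (low != atomic_load_explicit(&p->low, memory_order_relaxed));

	return ((uint64_t)high << 32) | low;
}

As in the deleted helper, a value caught mid-transition can still come out torn, but its low half can never carry the present bit unless it is the one paired with the high half that was read, which is why the pattern is only safe for callers that act solely on present entries.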