@@ -36,39 +36,41 @@
 
 #define pmd_read_atomic pmd_read_atomic
 /*
- * pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with
- * a "*pmdp" dereference done by gcc. Problem is, in certain places
- * where pte_offset_map_lock is called, concurrent page faults are
- * allowed, if the mmap_sem is hold for reading. An example is mincore
+ * pte_offset_map_lock() on 32-bit PAE kernels was reading the pmd_t with
+ * a "*pmdp" dereference done by GCC. Problem is, in certain places
+ * where pte_offset_map_lock() is called, concurrent page faults are
+ * allowed, if the mmap_lock is held for reading. An example is mincore
  * vs page faults vs MADV_DONTNEED. On the page fault side
- * pmd_populate rightfully does a set_64bit, but if we're reading the
+ * pmd_populate() rightfully does a set_64bit(), but if we're reading the
  * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen
- * because gcc will not read the 64bit of the pmd atomically. To fix
- * this all places running pmd_offset_map_lock() while holding the
- * mmap_sem in read mode, shall read the pmdp pointer using this
- * function to know if the pmd is null nor not, and in turn to know if
- * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd
+ * because GCC will not read the 64-bit value of the pmd atomically.
+ *
+ * To fix this, all places running pte_offset_map_lock() while holding
+ * the mmap_lock in read mode shall read the pmdp pointer using this
+ * function to know if the pmd is null or not, and in turn to know if
+ * they can run pte_offset_map_lock() or pmd_trans_huge() or other pmd
  * operations.
  *
- * Without THP if the mmap_sem is hold for reading, the pmd can only
- * transition from null to not null while pmd_read_atomic runs. So
+ * Without THP if the mmap_lock is held for reading, the pmd can only
+ * transition from null to not null while pmd_read_atomic() runs. So
  * we can always return atomic pmd values with this function.
  *
- * With THP if the mmap_sem is hold for reading, the pmd can become
+ * With THP if the mmap_lock is held for reading, the pmd can become
  * trans_huge or none or point to a pte (and in turn become "stable")
- * at any time under pmd_read_atomic. We could read it really
- * atomically here with a atomic64_read for the THP enabled case (and
+ * at any time under pmd_read_atomic(). We could read it truly
+ * atomically here with an atomic64_read() for the THP enabled case (and
  * it would be a whole lot simpler), but to avoid using cmpxchg8b we
  * only return an atomic pmdval if the low part of the pmdval is later
- * found stable (i.e. pointing to a pte). And we're returning a none
- * pmdval if the low part of the pmd is none. In some cases the high
- * and low part of the pmdval returned may not be consistent if THP is
- * enabled (the low part may point to previously mapped hugepage,
- * while the high part may point to a more recently mapped hugepage),
- * but pmd_none_or_trans_huge_or_clear_bad() only needs the low part
- * of the pmd to be read atomically to decide if the pmd is unstable
- * or not, with the only exception of when the low part of the pmd is
- * zero in which case we return a none pmd.
+ * found to be stable (i.e. pointing to a pte). We are also returning a
+ * 'none' (zero) pmdval if the low part of the pmd is zero.
+ *
+ * In some cases the high and low part of the pmdval returned may not be
+ * consistent if THP is enabled (the low part may point to previously
+ * mapped hugepage, while the high part may point to a more recently
+ * mapped hugepage), but pmd_none_or_trans_huge_or_clear_bad() only
+ * needs the low part of the pmd to be read atomically to decide if the
+ * pmd is unstable or not, with the only exception when the low part
+ * of the pmd is zero, in which case we return a 'none' pmd.
  */
 static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
 {
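
For illustration, here is a minimal sketch of the read protocol the comment above describes; the actual function body is elided from this hunk, so the code below is an assumption (note the hypothetical _sketch name), not the patch's own body. The low half is loaded first, and the high half is loaded only once the low half is known to be non-zero, with an smp_rmb() ordering the two loads:

	static inline pmd_t pmd_read_atomic_sketch(pmd_t *pmdp)
	{
		pmdval_t ret;
		u32 *tmp = (u32 *)pmdp;

		/* Load the low 32 bits first. */
		ret = (pmdval_t)(*tmp);

		/*
		 * If the low part is zero the pmd is 'none' and the high
		 * part must not be read, or we could return a partial,
		 * inconsistent pmd.
		 */
		if (ret) {
			smp_rmb();	/* order low-half load before high-half load */
			ret |= ((pmdval_t)*(tmp + 1)) << 32;
		}

		return (pmd_t) { ret };
	}
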
@@ -284,53 +286,6 @@
 
 #define __pte_to_swp_entry(pte)	(__swp_entry(__pteval_swp_type(pte), \
 					 __pteval_swp_offset(pte)))
-
-#define gup_get_pte gup_get_pte
-/*
- * WARNING: only to be used in the get_user_pages_fast() implementation.
- *
- * With get_user_pages_fast(), we walk down the pagetables without taking
- * any locks. For this we would like to load the pointers atomically,
- * but that is not possible (without expensive cmpxchg8b) on PAE. What
- * we do have is the guarantee that a PTE will only either go from not
- * present to present, or present to not present or both -- it will not
- * switch to a completely different present page without a TLB flush in
- * between; something that we are blocking by holding interrupts off.
- *
- * Setting ptes from not present to present goes:
- *
- *   ptep->pte_high = h;
- *   smp_wmb();
- *   ptep->pte_low = l;
- *
- * And present to not present goes:
- *
- *   ptep->pte_low = 0;
- *   smp_wmb();
- *   ptep->pte_high = 0;
- *
- * We must ensure here that the load of pte_low sees 'l' iff pte_high
- * sees 'h'. We load pte_high *after* loading pte_low, which ensures we
- * don't see an older value of pte_high. *Then* we recheck pte_low,
- * which ensures that we haven't picked up a changed pte high. We might
- * have gotten rubbish values from pte_low and pte_high, but we are
- * guaranteed that pte_low will not have the present bit set *unless*
- * it is 'l'. Because get_user_pages_fast() only operates on present ptes
- * we're safe.
- */
-static inline pte_t gup_get_pte(pte_t *ptep)
-{
-	pte_t pte;
-
-	do {
-		pte.pte_low = ptep->pte_low;
-		smp_rmb();
-		pte.pte_high = ptep->pte_high;
-		smp_rmb();
-	} while (unlikely(pte.pte_low != ptep->pte_low));
-
-	return pte;
-}
 
 #include <asm/pgtable-invert.h>
 
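
The removed comment spells out the store side that gup_get_pte()'s load loop pairs with: present ptes are installed high half first and torn down low half first, with an smp_wmb() between the two halves. As a minimal sketch of that writer side (the helper names here are illustrative, not the kernel's; the tree's own PAE set/clear helpers follow this same ordering):

	/*
	 * Make a pte present: store the high half first, then the low
	 * half that carries the present bit.
	 */
	static inline void pae_set_pte(pte_t *ptep, pte_t pte)
	{
		ptep->pte_high = pte.pte_high;
		smp_wmb();	/* order high-half store before low half */
		ptep->pte_low = pte.pte_low;
	}

	/*
	 * Tear a pte down in the opposite order: clear the low half
	 * (and with it the present bit) before the high half.
	 */
	static inline void pae_clear_pte(pte_t *ptep)
	{
		ptep->pte_low = 0;
		smp_wmb();	/* order low-half clear before high half */
		ptep->pte_high = 0;
	}

This preserves the invariant the lockless reader relies on: whenever pte_low has the present bit set, the matching pte_high is already visible.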
---|