@@ -36,39 +36,41 @@
 
 #define pmd_read_atomic pmd_read_atomic
 /*
- * pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with
- * a "*pmdp" dereference done by gcc. Problem is, in certain places
- * where pte_offset_map_lock is called, concurrent page faults are
- * allowed, if the mmap_sem is hold for reading. An example is mincore
+ * pte_offset_map_lock() on 32-bit PAE kernels was reading the pmd_t with
+ * a "*pmdp" dereference done by GCC. Problem is, in certain places
+ * where pte_offset_map_lock() is called, concurrent page faults are
+ * allowed if the mmap_lock is held for reading. An example is mincore
  * vs page faults vs MADV_DONTNEED. On the page fault side
- * pmd_populate rightfully does a set_64bit, but if we're reading the
+ * pmd_populate() rightfully does a set_64bit(), but if we're reading the
  * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen
- * because gcc will not read the 64bit of the pmd atomically. To fix
- * this all places running pmd_offset_map_lock() while holding the
- * mmap_sem in read mode, shall read the pmdp pointer using this
- * function to know if the pmd is null nor not, and in turn to know if
- * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd
+ * because GCC will not read the 64-bit value of the pmd atomically.
+ *
+ * To fix this, all places running pte_offset_map_lock() while holding
+ * the mmap_lock in read mode shall read the pmdp pointer using this
+ * function to know if the pmd is null or not, and in turn to know if
+ * they can run pte_offset_map_lock() or pmd_trans_huge() or other pmd
  * operations.
  *
- * Without THP if the mmap_sem is hold for reading, the pmd can only
- * transition from null to not null while pmd_read_atomic runs. So
+ * Without THP, if the mmap_lock is held for reading, the pmd can only
+ * transition from null to not null while pmd_read_atomic() runs. So
  * we can always return atomic pmd values with this function.
 *
- * With THP if the mmap_sem is hold for reading, the pmd can become
+ * With THP, if the mmap_lock is held for reading, the pmd can become
  * trans_huge or none or point to a pte (and in turn become "stable")
- * at any time under pmd_read_atomic. We could read it really
- * atomically here with a atomic64_read for the THP enabled case (and
+ * at any time under pmd_read_atomic(). We could read it truly
+ * atomically here with an atomic64_read() for the THP-enabled case (and
  * it would be a whole lot simpler), but to avoid using cmpxchg8b we
  * only return an atomic pmdval if the low part of the pmdval is later
- * found stable (i.e. pointing to a pte). And we're returning a none
- * pmdval if the low part of the pmd is none. In some cases the high
- * and low part of the pmdval returned may not be consistent if THP is
- * enabled (the low part may point to previously mapped hugepage,
- * while the high part may point to a more recently mapped hugepage),
- * but pmd_none_or_trans_huge_or_clear_bad() only needs the low part
- * of the pmd to be read atomically to decide if the pmd is unstable
- * or not, with the only exception of when the low part of the pmd is
- * zero in which case we return a none pmd.
+ * found to be stable (i.e. pointing to a pte). We are also returning a
+ * 'none' (zero) pmdval if the low part of the pmd is zero.
+ *
+ * In some cases the high and low parts of the pmdval returned may not be
+ * consistent if THP is enabled (the low part may point to previously
+ * mapped hugepage, while the high part may point to a more recently
+ * mapped hugepage), but pmd_none_or_trans_huge_or_clear_bad() only
+ * needs the low part of the pmd to be read atomically to decide if the
+ * pmd is unstable or not, with the only exception being when the low part
+ * of the pmd is zero, in which case we return a 'none' pmd.
  */
 static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
 {
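The body of pmd_read_atomic() is elided in the hunk above. As a rough sketch of the scheme the comment describes (fetch the low 32 bits first, and only fetch the high 32 bits, after a read barrier, once the low part is known to be non-null), the function can be pictured along these lines. This is an illustration of the technique, not necessarily the exact committed body; the `pmdval_t` and `.pmd` names follow the x86 PAE headers:

```c
/*
 * Illustrative sketch only, not necessarily the exact committed body.
 * Read a PAE pmd without cmpxchg8b: fetch the low 32 bits first, and
 * only fetch the high 32 bits (after a read barrier) when the low part
 * is non-null, per the rules laid out in the comment above.
 */
static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
{
	pmdval_t ret;
	u32 *tmp = (u32 *)pmdp;

	ret = (pmdval_t)*tmp;			/* low 32 bits */
	if (ret) {
		/*
		 * If the low part is null, we must not read the high
		 * part or we could end up with a torn (partial) pmd.
		 */
		smp_rmb();
		ret |= ((pmdval_t)*(tmp + 1)) << 32;	/* high 32 bits */
	}

	return (pmd_t) { .pmd = ret };
}
```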
@@ -284,53 +286,6 @@
 
 #define __pte_to_swp_entry(pte)	(__swp_entry(__pteval_swp_type(pte), \
 					     __pteval_swp_offset(pte)))
-
-#define gup_get_pte gup_get_pte
-/*
- * WARNING: only to be used in the get_user_pages_fast() implementation.
- *
- * With get_user_pages_fast(), we walk down the pagetables without taking
- * any locks.  For this we would like to load the pointers atomically,
- * but that is not possible (without expensive cmpxchg8b) on PAE.  What
- * we do have is the guarantee that a PTE will only either go from not
- * present to present, or present to not present or both -- it will not
- * switch to a completely different present page without a TLB flush in
- * between; something that we are blocking by holding interrupts off.
- *
- * Setting ptes from not present to present goes:
- *
- *   ptep->pte_high = h;
- *   smp_wmb();
- *   ptep->pte_low = l;
- *
- * And present to not present goes:
- *
- *   ptep->pte_low = 0;
- *   smp_wmb();
- *   ptep->pte_high = 0;
- *
- * We must ensure here that the load of pte_low sees 'l' iff pte_high
- * sees 'h'. We load pte_high *after* loading pte_low, which ensures we
- * don't see an older value of pte_high.  *Then* we recheck pte_low,
- * which ensures that we haven't picked up a changed pte high. We might
- * have gotten rubbish values from pte_low and pte_high, but we are
- * guaranteed that pte_low will not have the present bit set *unless*
- * it is 'l'. Because get_user_pages_fast() only operates on present ptes
- * we're safe.
- */
-static inline pte_t gup_get_pte(pte_t *ptep)
-{
-	pte_t pte;
-
-	do {
-		pte.pte_low = ptep->pte_low;
-		smp_rmb();
-		pte.pte_high = ptep->pte_high;
-		smp_rmb();
-	} while (unlikely(pte.pte_low != ptep->pte_low));
-
-	return pte;
-}
 
 #include <asm/pgtable-invert.h>
 