2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/arch/x86/include/asm/pgtable-3level.h
@@ -36,39 +36,41 @@
 
 #define pmd_read_atomic pmd_read_atomic
 /*
- * pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with
- * a "*pmdp" dereference done by gcc. Problem is, in certain places
- * where pte_offset_map_lock is called, concurrent page faults are
- * allowed, if the mmap_sem is hold for reading. An example is mincore
+ * pte_offset_map_lock() on 32-bit PAE kernels was reading the pmd_t with
+ * a "*pmdp" dereference done by GCC. Problem is, in certain places
+ * where pte_offset_map_lock() is called, concurrent page faults are
+ * allowed if the mmap_lock is held for reading. An example is mincore
  * vs page faults vs MADV_DONTNEED. On the page fault side
- * pmd_populate rightfully does a set_64bit, but if we're reading the
+ * pmd_populate() rightfully does a set_64bit(), but if we're reading the
  * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen
- * because gcc will not read the 64bit of the pmd atomically. To fix
- * this all places running pmd_offset_map_lock() while holding the
- * mmap_sem in read mode, shall read the pmdp pointer using this
- * function to know if the pmd is null nor not, and in turn to know if
- * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd
+ * because GCC will not read the 64-bit value of the pmd atomically.
+ *
+ * To fix this, all places running pte_offset_map_lock() while holding
+ * the mmap_lock in read mode shall read the pmdp pointer using this
+ * function, to know if the pmd is null or not, and in turn to know if
+ * they can run pte_offset_map_lock() or pmd_trans_huge() or other pmd
  * operations.
  *
- * Without THP if the mmap_sem is hold for reading, the pmd can only
- * transition from null to not null while pmd_read_atomic runs. So
+ * Without THP, if the mmap_lock is held for reading, the pmd can only
+ * transition from null to not null while pmd_read_atomic() runs. So
  * we can always return atomic pmd values with this function.
  *
- * With THP if the mmap_sem is hold for reading, the pmd can become
+ * With THP, if the mmap_lock is held for reading, the pmd can become
  * trans_huge or none or point to a pte (and in turn become "stable")
- * at any time under pmd_read_atomic. We could read it really
- * atomically here with a atomic64_read for the THP enabled case (and
+ * at any time under pmd_read_atomic(). We could read it truly
+ * atomically here with an atomic64_read() for the THP-enabled case (and
  * it would be a whole lot simpler), but to avoid using cmpxchg8b we
  * only return an atomic pmdval if the low part of the pmdval is later
- * found stable (i.e. pointing to a pte). And we're returning a none
- * pmdval if the low part of the pmd is none. In some cases the high
- * and low part of the pmdval returned may not be consistent if THP is
- * enabled (the low part may point to previously mapped hugepage,
- * while the high part may point to a more recently mapped hugepage),
- * but pmd_none_or_trans_huge_or_clear_bad() only needs the low part
- * of the pmd to be read atomically to decide if the pmd is unstable
- * or not, with the only exception of when the low part of the pmd is
- * zero in which case we return a none pmd.
+ * found to be stable (i.e. pointing to a pte). We also return a
+ * 'none' (zero) pmdval if the low part of the pmd is zero.
+ *
+ * In some cases the high and low parts of the pmdval returned may not be
+ * consistent if THP is enabled (the low part may point to a previously
+ * mapped hugepage, while the high part may point to a more recently
+ * mapped hugepage), but pmd_none_or_trans_huge_or_clear_bad() only
+ * needs the low part of the pmd to be read atomically to decide if the
+ * pmd is unstable or not, with the only exception being when the low
+ * part of the pmd is zero, in which case we return a 'none' pmd.
  */
 static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
 {
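
The body of pmd_read_atomic() is untouched by this hunk, but for review context here is a minimal sketch of the low-then-high read protocol the comment describes, assuming a PAE pmd is stored as two adjacent 32-bit words with the low word first (an illustration, not necessarily the exact upstream body):

        /* Sketch only: read the low half, and only if it is non-zero the high half. */
        static inline pmd_t pmd_read_atomic_sketch(pmd_t *pmdp)
        {
                pmdval_t ret;
                u32 *tmp = (u32 *)pmdp;

                ret = (pmdval_t)*tmp;           /* low 32 bits; zero means 'none' */
                if (ret) {
                        /* Pairs with the writer's set_64bit(); avoid a torn high part. */
                        smp_rmb();
                        ret |= ((pmdval_t)*(tmp + 1)) << 32;
                }

                return (pmd_t) { .pmd = ret };
        }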
@@ -284,53 +286,6 @@
 
 #define __pte_to_swp_entry(pte)	(__swp_entry(__pteval_swp_type(pte), \
 					 __pteval_swp_offset(pte)))
-
-#define gup_get_pte gup_get_pte
-/*
- * WARNING: only to be used in the get_user_pages_fast() implementation.
- *
- * With get_user_pages_fast(), we walk down the pagetables without taking
- * any locks. For this we would like to load the pointers atomically,
- * but that is not possible (without expensive cmpxchg8b) on PAE. What
- * we do have is the guarantee that a PTE will only either go from not
- * present to present, or present to not present or both -- it will not
- * switch to a completely different present page without a TLB flush in
- * between; something that we are blocking by holding interrupts off.
- *
- * Setting ptes from not present to present goes:
- *
- *   ptep->pte_high = h;
- *   smp_wmb();
- *   ptep->pte_low = l;
- *
- * And present to not present goes:
- *
- *   ptep->pte_low = 0;
- *   smp_wmb();
- *   ptep->pte_high = 0;
- *
- * We must ensure here that the load of pte_low sees 'l' iff pte_high
- * sees 'h'. We load pte_high *after* loading pte_low, which ensures we
- * don't see an older value of pte_high. *Then* we recheck pte_low,
- * which ensures that we haven't picked up a changed pte high. We might
- * have gotten rubbish values from pte_low and pte_high, but we are
- * guaranteed that pte_low will not have the present bit set *unless*
- * it is 'l'. Because get_user_pages_fast() only operates on present ptes
- * we're safe.
- */
-static inline pte_t gup_get_pte(pte_t *ptep)
-{
-	pte_t pte;
-
-	do {
-		pte.pte_low = ptep->pte_low;
-		smp_rmb();
-		pte.pte_high = ptep->pte_high;
-		smp_rmb();
-	} while (unlikely(pte.pte_low != ptep->pte_low));
-
-	return pte;
-}
 
 #include <asm/pgtable-invert.h>
 
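
The removed gup_get_pte() comment documents a classic split-load protocol: the writer orders its two 32-bit halves with smp_wmb(), and the reader loads low, then high, then rechecks low. As a hedged, self-contained illustration of the same pattern in portable C11 (struct split64 and these function names are invented for this sketch, not kernel APIs):

        #include <stdatomic.h>
        #include <stdint.h>

        struct split64 {
                _Atomic uint32_t low;
                _Atomic uint32_t high;
        };

        /* Writer, "not present" -> "present": high half first, as in the comment. */
        static void set_present(struct split64 *p, uint32_t h, uint32_t l)
        {
                atomic_store_explicit(&p->high, h, memory_order_relaxed);
                atomic_thread_fence(memory_order_release);  /* smp_wmb() analogue */
                atomic_store_explicit(&p->low, l, memory_order_relaxed);
        }

        /* Reader: load low, then high, then recheck low, as gup_get_pte() did. */
        static uint64_t read_split64(struct split64 *p)
        {
                uint32_t l, h;

                do {
                        l = atomic_load_explicit(&p->low, memory_order_relaxed);
                        atomic_thread_fence(memory_order_acquire);  /* smp_rmb() analogue */
                        h = atomic_load_explicit(&p->high, memory_order_relaxed);
                        atomic_thread_fence(memory_order_acquire);  /* smp_rmb() analogue */
                } while (l != atomic_load_explicit(&p->low, memory_order_relaxed));

                return ((uint64_t)h << 32) | l;
        }

If the two loads straddle a concurrent update, the final recheck of the low half fails and the loop retries; the guarantee quoted in the comment (a pte never switches to a different present page without a TLB flush in between, which the fast-GUP walker blocks by disabling interrupts) closes the remaining ABA window.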