.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-or-later |
---|
1 | 2 | /* |
---|
2 | 3 | * This file contains common routines for dealing with free of page tables |
---|
3 | 4 | * Along with common page table handling code |
---|
.. | .. |
---|
14 | 15 | * |
---|
15 | 16 | * Dave Engebretsen <engebret@us.ibm.com> |
---|
16 | 17 | * Rework for PPC64 port. |
---|
17 | | - * |
---|
18 | | - * This program is free software; you can redistribute it and/or |
---|
19 | | - * modify it under the terms of the GNU General Public License |
---|
20 | | - * as published by the Free Software Foundation; either version |
---|
21 | | - * 2 of the License, or (at your option) any later version. |
---|
22 | 18 | */ |
---|
23 | 19 | |
---|
24 | 20 | #include <linux/kernel.h> |
---|
.. | .. |
---|
27 | 23 | #include <linux/percpu.h> |
---|
28 | 24 | #include <linux/hardirq.h> |
---|
29 | 25 | #include <linux/hugetlb.h> |
---|
30 | | -#include <asm/pgalloc.h> |
---|
31 | 26 | #include <asm/tlbflush.h> |
---|
32 | 27 | #include <asm/tlb.h> |
---|
| 28 | +#include <asm/hugetlb.h> |
---|
33 | 29 | |
---|
34 | 30 | static inline int is_exec_fault(void) |
---|
35 | 31 | { |
---|
.. | .. |
---|
44 | 40 | static inline int pte_looks_normal(pte_t pte) |
---|
45 | 41 | { |
---|
46 | 42 | |
---|
47 | | -#if defined(CONFIG_PPC_BOOK3S_64) |
---|
48 | | - if ((pte_val(pte) & (_PAGE_PRESENT | _PAGE_SPECIAL)) == _PAGE_PRESENT) { |
---|
| 43 | + if (pte_present(pte) && !pte_special(pte)) { |
---|
49 | 44 | if (pte_ci(pte)) |
---|
50 | 45 | return 0; |
---|
51 | 46 | if (pte_user(pte)) |
---|
52 | 47 | return 1; |
---|
53 | 48 | } |
---|
54 | 49 | return 0; |
---|
55 | | -#else |
---|
56 | | - return (pte_val(pte) & |
---|
57 | | - (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER | |
---|
58 | | - _PAGE_PRIVILEGED)) == |
---|
59 | | - (_PAGE_PRESENT | _PAGE_USER); |
---|
60 | | -#endif |
---|
61 | 50 | } |
---|
62 | 51 | |
---|
63 | 52 | static struct page *maybe_pte_to_page(pte_t pte) |
---|
.. | .. |
---|
73 | 62 | return page; |
---|
74 | 63 | } |
---|
75 | 64 | |
---|
76 | | -#if defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 |
---|
| 65 | +#ifdef CONFIG_PPC_BOOK3S |
---|
77 | 66 | |
---|
78 | 67 | /* Server-style MMU handles coherency when hashing if HW exec permission |
---|
79 | 68 | * is supported per page (currently 64-bit only). If not, then, we always |
---|
.. | .. |
---|
81 | 70 | * support falls into the same category. |
---|
82 | 71 | */ |
---|
83 | 72 | |
---|
84 | | -static pte_t set_pte_filter(pte_t pte) |
---|
| 73 | +static pte_t set_pte_filter_hash(pte_t pte) |
---|
85 | 74 | { |
---|
86 | 75 | if (radix_enabled()) |
---|
87 | 76 | return pte; |
---|
.. | .. |
---|
100 | 89 | return pte; |
---|
101 | 90 | } |
---|
102 | 91 | |
---|
103 | | -static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, |
---|
104 | | - int dirty) |
---|
105 | | -{ |
---|
106 | | - return pte; |
---|
107 | | -} |
---|
| 92 | +#else /* CONFIG_PPC_BOOK3S */ |
---|
108 | 93 | |
---|
109 | | -#else /* defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 */ |
---|
| 94 | +static pte_t set_pte_filter_hash(pte_t pte) { return pte; } |
---|
| 95 | + |
---|
| 96 | +#endif /* CONFIG_PPC_BOOK3S */ |
---|
110 | 97 | |
---|
111 | 98 | /* Embedded type MMU with HW exec support. This is a bit more complicated |
---|
112 | 99 | * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so |
---|
113 | 100 | * instead we "filter out" the exec permission for non clean pages. |
---|
114 | 101 | */ |
---|
115 | | -static pte_t set_pte_filter(pte_t pte) |
---|
| 102 | +static inline pte_t set_pte_filter(pte_t pte) |
---|
116 | 103 | { |
---|
117 | 104 | struct page *pg; |
---|
118 | 105 | |
---|
| 106 | + if (mmu_has_feature(MMU_FTR_HPTE_TABLE)) |
---|
| 107 | + return set_pte_filter_hash(pte); |
---|
| 108 | + |
---|
119 | 109 | /* No exec permission in the first place, move on */ |
---|
120 | | - if (!(pte_val(pte) & _PAGE_EXEC) || !pte_looks_normal(pte)) |
---|
| 110 | + if (!pte_exec(pte) || !pte_looks_normal(pte)) |
---|
121 | 111 | return pte; |
---|
122 | 112 | |
---|
123 | 113 | /* If you set _PAGE_EXEC on weird pages you're on your own */ |
---|
.. | .. |
---|
137 | 127 | } |
---|
138 | 128 | |
---|
139 | 129 | /* Else, we filter out _PAGE_EXEC */ |
---|
140 | | - return __pte(pte_val(pte) & ~_PAGE_EXEC); |
---|
| 130 | + return pte_exprotect(pte); |
---|
141 | 131 | } |
---|
142 | 132 | |
---|
143 | 133 | static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, |
---|
.. | .. |
---|
145 | 135 | { |
---|
146 | 136 | struct page *pg; |
---|
147 | 137 | |
---|
| 138 | + if (mmu_has_feature(MMU_FTR_HPTE_TABLE)) |
---|
| 139 | + return pte; |
---|
| 140 | + |
---|
148 | 141 | /* So here, we only care about exec faults, as we use them |
---|
149 | 142 | * to recover lost _PAGE_EXEC and perform I$/D$ coherency |
---|
150 | 143 | * if necessary. Also if _PAGE_EXEC is already set, same deal, |
---|
151 | 144 | * we just bail out |
---|
152 | 145 | */ |
---|
153 | | - if (dirty || (pte_val(pte) & _PAGE_EXEC) || !is_exec_fault()) |
---|
| 146 | + if (dirty || pte_exec(pte) || !is_exec_fault()) |
---|
154 | 147 | return pte; |
---|
155 | 148 | |
---|
156 | 149 | #ifdef CONFIG_DEBUG_VM |
---|
.. | .. |
---|
176 | 169 | set_bit(PG_arch_1, &pg->flags); |
---|
177 | 170 | |
---|
178 | 171 | bail: |
---|
179 | | - return __pte(pte_val(pte) | _PAGE_EXEC); |
---|
| 172 | + return pte_mkexec(pte); |
---|
180 | 173 | } |
---|
181 | | - |
---|
182 | | -#endif /* !(defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0) */ |
---|
183 | 174 | |
---|
184 | 175 | /* |
---|
185 | 176 | * set_pte stores a linux PTE into the linux page table. |
---|
.. | .. |
---|
188 | 179 | pte_t pte) |
---|
189 | 180 | { |
---|
190 | 181 | /* |
---|
191 | | - * When handling numa faults, we already have the pte marked |
---|
192 | | - * _PAGE_PRESENT, but we can be sure that it is not in hpte. |
---|
193 | | - * Hence we can use set_pte_at for them. |
---|
| 182 | + * Make sure hardware valid bit is not set. We don't do |
---|
| 183 | + * tlb flush for this update. |
---|
194 | 184 | */ |
---|
195 | | - VM_WARN_ON(pte_present(*ptep) && !pte_protnone(*ptep)); |
---|
196 | | - |
---|
197 | | - /* Add the pte bit when trying to set a pte */ |
---|
198 | | - pte = __pte(pte_val(pte) | _PAGE_PTE); |
---|
| 185 | + VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); |
---|
199 | 186 | |
---|
200 | 187 | /* Note: mm->context.id might not yet have been assigned as |
---|
201 | 188 | * this context might not have been activated yet when this |
---|
.. | .. |
---|
205 | 192 | |
---|
206 | 193 | /* Perform the setting of the PTE */ |
---|
207 | 194 | __set_pte_at(mm, addr, ptep, pte, 0); |
---|
| 195 | +} |
---|
| 196 | + |
---|
| 197 | +void unmap_kernel_page(unsigned long va) |
---|
| 198 | +{ |
---|
| 199 | + pmd_t *pmdp = pmd_off_k(va); |
---|
| 200 | + pte_t *ptep = pte_offset_kernel(pmdp, va); |
---|
| 201 | + |
---|
| 202 | + pte_clear(&init_mm, va, ptep); |
---|
| 203 | + flush_tlb_kernel_range(va, va + PAGE_SIZE); |
---|
208 | 204 | } |
---|
209 | 205 | |
---|
210 | 206 | /* |
---|
.. | .. |
---|
229 | 225 | } |
---|
230 | 226 | |
---|
231 | 227 | #ifdef CONFIG_HUGETLB_PAGE |
---|
232 | | -extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, |
---|
233 | | - unsigned long addr, pte_t *ptep, |
---|
234 | | - pte_t pte, int dirty) |
---|
| 228 | +int huge_ptep_set_access_flags(struct vm_area_struct *vma, |
---|
| 229 | + unsigned long addr, pte_t *ptep, |
---|
| 230 | + pte_t pte, int dirty) |
---|
235 | 231 | { |
---|
236 | 232 | #ifdef HUGETLB_NEED_PRELOAD |
---|
237 | 233 | /* |
---|
.. | .. |
---|
258 | 254 | |
---|
259 | 255 | #else |
---|
260 | 256 | /* |
---|
261 | | - * Not used on non book3s64 platforms. But 8xx |
---|
262 | | - * can possibly use tsize derived from hstate. |
---|
| 257 | + * Not used on non book3s64 platforms. |
---|
| 258 | + * 8xx compares it with mmu_virtual_psize to |
---|
| 259 | + * know if it is a huge page or not. |
---|
263 | 260 | */ |
---|
264 | | - psize = 0; |
---|
| 261 | + psize = MMU_PAGE_COUNT; |
---|
265 | 262 | #endif |
---|
266 | 263 | __ptep_set_access_flags(vma, ptep, pte, addr, psize); |
---|
267 | 264 | } |
---|
268 | 265 | return changed; |
---|
269 | 266 | #endif |
---|
270 | 267 | } |
---|
| 268 | + |
---|
| 269 | +#if defined(CONFIG_PPC_8xx) |
---|
| 270 | +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) |
---|
| 271 | +{ |
---|
| 272 | + pmd_t *pmd = pmd_off(mm, addr); |
---|
| 273 | + pte_basic_t val; |
---|
| 274 | + pte_basic_t *entry = &ptep->pte; |
---|
| 275 | + int num, i; |
---|
| 276 | + |
---|
| 277 | + /* |
---|
| 278 | + * Make sure hardware valid bit is not set. We don't do |
---|
| 279 | + * tlb flush for this update. |
---|
| 280 | + */ |
---|
| 281 | + VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); |
---|
| 282 | + |
---|
| 283 | + pte = set_pte_filter(pte); |
---|
| 284 | + |
---|
| 285 | + val = pte_val(pte); |
---|
| 286 | + |
---|
| 287 | + num = number_of_cells_per_pte(pmd, val, 1); |
---|
| 288 | + |
---|
| 289 | + for (i = 0; i < num; i++, entry++, val += SZ_4K) |
---|
| 290 | + *entry = val; |
---|
| 291 | +} |
---|
| 292 | +#endif |
---|
271 | 293 | #endif /* CONFIG_HUGETLB_PAGE */ |
---|
272 | 294 | |
---|
273 | 295 | #ifdef CONFIG_DEBUG_VM |
---|
274 | 296 | void assert_pte_locked(struct mm_struct *mm, unsigned long addr) |
---|
275 | 297 | { |
---|
276 | 298 | pgd_t *pgd; |
---|
| 299 | + p4d_t *p4d; |
---|
277 | 300 | pud_t *pud; |
---|
278 | 301 | pmd_t *pmd; |
---|
279 | 302 | |
---|
.. | .. |
---|
281 | 304 | return; |
---|
282 | 305 | pgd = mm->pgd + pgd_index(addr); |
---|
283 | 306 | BUG_ON(pgd_none(*pgd)); |
---|
284 | | - pud = pud_offset(pgd, addr); |
---|
| 307 | + p4d = p4d_offset(pgd, addr); |
---|
| 308 | + BUG_ON(p4d_none(*p4d)); |
---|
| 309 | + pud = pud_offset(p4d, addr); |
---|
285 | 310 | BUG_ON(pud_none(*pud)); |
---|
286 | 311 | pmd = pmd_offset(pud, addr); |
---|
287 | 312 | /* |
---|
288 | 313 | * khugepaged to collapse normal pages to hugepage, first set |
---|
289 | | - * pmd to none to force page fault/gup to take mmap_sem. After |
---|
| 314 | + * pmd to none to force page fault/gup to take mmap_lock. After |
---|
290 | 315 | * pmd is set to none, we do a pte_clear which does this assertion |
---|
291 | 316 | * so if we find pmd none, return. |
---|
292 | 317 | */ |
---|
.. | .. |
---|
305 | 330 | return __pa(pfn_to_kaddr(pfn)) + offset_in_page(va); |
---|
306 | 331 | } |
---|
307 | 332 | EXPORT_SYMBOL_GPL(vmalloc_to_phys); |
---|
| 333 | + |
---|
| 334 | +/* |
---|
| 335 | + * We have 4 cases for pgds and pmds: |
---|
| 336 | + * (1) invalid (all zeroes) |
---|
| 337 | + * (2) pointer to next table, as normal; bottom 6 bits == 0 |
---|
| 338 | + * (3) leaf pte for huge page _PAGE_PTE set |
---|
| 339 | + * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table |
---|
| 340 | + * |
---|
| 341 | + * So long as we atomically load page table pointers we are safe against teardown, |
---|
| 342 | + * we can follow the address down to the page and take a ref on it. |
---|
| 343 | + * This function needs to be called with interrupts disabled. We use this variant |
---|
| 344 | + * when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED |
---|
| 345 | + */ |
---|
| 346 | +pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, |
---|
| 347 | + bool *is_thp, unsigned *hpage_shift) |
---|
| 348 | +{ |
---|
| 349 | + pgd_t *pgdp; |
---|
| 350 | + p4d_t p4d, *p4dp; |
---|
| 351 | + pud_t pud, *pudp; |
---|
| 352 | + pmd_t pmd, *pmdp; |
---|
| 353 | + pte_t *ret_pte; |
---|
| 354 | + hugepd_t *hpdp = NULL; |
---|
| 355 | + unsigned pdshift; |
---|
| 356 | + |
---|
| 357 | + if (hpage_shift) |
---|
| 358 | + *hpage_shift = 0; |
---|
| 359 | + |
---|
| 360 | + if (is_thp) |
---|
| 361 | + *is_thp = false; |
---|
| 362 | + |
---|
| 363 | + /* |
---|
| 364 | + * Always operate on the local stack value. This makes sure the |
---|
| 365 | + * value doesn't get updated by a parallel THP split/collapse, |
---|
| 366 | + * page fault or a page unmap. The return pte_t * is still not |
---|
| 367 | + * stable. So should be checked there for above conditions. |
---|
| 368 | + * Top level is an exception because it is folded into p4d. |
---|
| 369 | + */ |
---|
| 370 | + pgdp = pgdir + pgd_index(ea); |
---|
| 371 | + p4dp = p4d_offset(pgdp, ea); |
---|
| 372 | + p4d = READ_ONCE(*p4dp); |
---|
| 373 | + pdshift = P4D_SHIFT; |
---|
| 374 | + |
---|
| 375 | + if (p4d_none(p4d)) |
---|
| 376 | + return NULL; |
---|
| 377 | + |
---|
| 378 | + if (p4d_is_leaf(p4d)) { |
---|
| 379 | + ret_pte = (pte_t *)p4dp; |
---|
| 380 | + goto out; |
---|
| 381 | + } |
---|
| 382 | + |
---|
| 383 | + if (is_hugepd(__hugepd(p4d_val(p4d)))) { |
---|
| 384 | + hpdp = (hugepd_t *)&p4d; |
---|
| 385 | + goto out_huge; |
---|
| 386 | + } |
---|
| 387 | + |
---|
| 388 | + /* |
---|
| 389 | + * Even if we end up with an unmap, the pgtable will not |
---|
| 390 | + * be freed, because we do an rcu free and here we are |
---|
| 391 | + * irq disabled |
---|
| 392 | + */ |
---|
| 393 | + pdshift = PUD_SHIFT; |
---|
| 394 | + pudp = pud_offset(&p4d, ea); |
---|
| 395 | + pud = READ_ONCE(*pudp); |
---|
| 396 | + |
---|
| 397 | + if (pud_none(pud)) |
---|
| 398 | + return NULL; |
---|
| 399 | + |
---|
| 400 | + if (pud_is_leaf(pud)) { |
---|
| 401 | + ret_pte = (pte_t *)pudp; |
---|
| 402 | + goto out; |
---|
| 403 | + } |
---|
| 404 | + |
---|
| 405 | + if (is_hugepd(__hugepd(pud_val(pud)))) { |
---|
| 406 | + hpdp = (hugepd_t *)&pud; |
---|
| 407 | + goto out_huge; |
---|
| 408 | + } |
---|
| 409 | + |
---|
| 410 | + pdshift = PMD_SHIFT; |
---|
| 411 | + pmdp = pmd_offset(&pud, ea); |
---|
| 412 | + pmd = READ_ONCE(*pmdp); |
---|
| 413 | + |
---|
| 414 | + /* |
---|
| 415 | + * A hugepage collapse is captured by this condition, see |
---|
| 416 | + * pmdp_collapse_flush. |
---|
| 417 | + */ |
---|
| 418 | + if (pmd_none(pmd)) |
---|
| 419 | + return NULL; |
---|
| 420 | + |
---|
| 421 | +#ifdef CONFIG_PPC_BOOK3S_64 |
---|
| 422 | + /* |
---|
| 423 | + * A hugepage split is captured by this condition, see |
---|
| 424 | + * pmdp_invalidate. |
---|
| 425 | + * |
---|
| 426 | + * Huge page modification can be caught here too. |
---|
| 427 | + */ |
---|
| 428 | + if (pmd_is_serializing(pmd)) |
---|
| 429 | + return NULL; |
---|
| 430 | +#endif |
---|
| 431 | + |
---|
| 432 | + if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) { |
---|
| 433 | + if (is_thp) |
---|
| 434 | + *is_thp = true; |
---|
| 435 | + ret_pte = (pte_t *)pmdp; |
---|
| 436 | + goto out; |
---|
| 437 | + } |
---|
| 438 | + |
---|
| 439 | + if (pmd_is_leaf(pmd)) { |
---|
| 440 | + ret_pte = (pte_t *)pmdp; |
---|
| 441 | + goto out; |
---|
| 442 | + } |
---|
| 443 | + |
---|
| 444 | + if (is_hugepd(__hugepd(pmd_val(pmd)))) { |
---|
| 445 | + hpdp = (hugepd_t *)&pmd; |
---|
| 446 | + goto out_huge; |
---|
| 447 | + } |
---|
| 448 | + |
---|
| 449 | + return pte_offset_kernel(&pmd, ea); |
---|
| 450 | + |
---|
| 451 | +out_huge: |
---|
| 452 | + if (!hpdp) |
---|
| 453 | + return NULL; |
---|
| 454 | + |
---|
| 455 | + ret_pte = hugepte_offset(*hpdp, ea, pdshift); |
---|
| 456 | + pdshift = hugepd_shift(*hpdp); |
---|
| 457 | +out: |
---|
| 458 | + if (hpage_shift) |
---|
| 459 | + *hpage_shift = pdshift; |
---|
| 460 | + return ret_pte; |
---|
| 461 | +} |
---|
| 462 | +EXPORT_SYMBOL_GPL(__find_linux_pte); |
---|