| .. | .. |
| | 1 | +// SPDX-License-Identifier: GPL-2.0-or-later |
| 1 | 2 | /* |
| 2 | 3 | * This file contains common routines for dealing with free of page tables |
| 3 | 4 | * Along with common page table handling code |
| .. | .. |
| 14 | 15 | * |
| 15 | 16 | * Dave Engebretsen <engebret@us.ibm.com> |
| 16 | 17 | * Rework for PPC64 port. |
| 17 | | - * |
| 18 | | - * This program is free software; you can redistribute it and/or |
| 19 | | - * modify it under the terms of the GNU General Public License |
| 20 | | - * as published by the Free Software Foundation; either version |
| 21 | | - * 2 of the License, or (at your option) any later version. |
| 22 | 18 | */ |
| 23 | 19 | |
| 24 | 20 | #include <linux/kernel.h> |
| .. | .. |
| 27 | 23 | #include <linux/percpu.h> |
| 28 | 24 | #include <linux/hardirq.h> |
| 29 | 25 | #include <linux/hugetlb.h> |
| 30 | | -#include <asm/pgalloc.h> |
| 31 | 26 | #include <asm/tlbflush.h> |
| 32 | 27 | #include <asm/tlb.h> |
| | 28 | +#include <asm/hugetlb.h> |
| 33 | 29 | |
| 34 | 30 | static inline int is_exec_fault(void) |
| 35 | 31 | { |
| .. | .. |
| 44 | 40 | static inline int pte_looks_normal(pte_t pte) |
| 45 | 41 | { |
| 46 | 42 | |
| 47 | | -#if defined(CONFIG_PPC_BOOK3S_64) |
| 48 | | - if ((pte_val(pte) & (_PAGE_PRESENT | _PAGE_SPECIAL)) == _PAGE_PRESENT) { |
| | 43 | + if (pte_present(pte) && !pte_special(pte)) { |
| 49 | 44 | if (pte_ci(pte)) |
| 50 | 45 | return 0; |
| 51 | 46 | if (pte_user(pte)) |
| 52 | 47 | return 1; |
| 53 | 48 | } |
| 54 | 49 | return 0; |
| 55 | | -#else |
| 56 | | - return (pte_val(pte) & |
| 57 | | - (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER | |
| 58 | | - _PAGE_PRIVILEGED)) == |
| 59 | | - (_PAGE_PRESENT | _PAGE_USER); |
| 60 | | -#endif |
| 61 | 50 | } |
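
The hunk above folds two `#ifdef`'d raw-mask checks into one helper-based check. A rough mapping between the old and new forms, for orientation (informal; the exact bit definitions vary by MMU family):

```c
/*
 * Informal equivalences behind the cleanup above (illustrative only):
 *
 *   pte_present(pte)  ~  pte_val(pte) & _PAGE_PRESENT
 *   pte_special(pte)  ~  pte_val(pte) & _PAGE_SPECIAL
 *   pte_ci(pte)       ~  cache-inhibited mapping (the _PAGE_NO_CACHE test)
 *   pte_user(pte)     ~  user-accessible (_PAGE_USER / !_PAGE_PRIVILEGED)
 */
```
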
| 62 | 51 | |
| 63 | 52 | static struct page *maybe_pte_to_page(pte_t pte) |
| .. | .. |
| 73 | 62 | return page; |
| 74 | 63 | } |
| 75 | 64 | |
| 76 | | -#if defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 |
| | 65 | +#ifdef CONFIG_PPC_BOOK3S |
| 77 | 66 | |
| 78 | 67 | /* Server-style MMU handles coherency when hashing if HW exec permission |
| 79 | 68 | * is supposed per page (currently 64-bit only). If not, then, we always |
| .. | .. |
| 81 | 70 | * support falls into the same category. |
| 82 | 71 | */ |
| 83 | 72 | |
| 84 | | -static pte_t set_pte_filter(pte_t pte) |
| | 73 | +static pte_t set_pte_filter_hash(pte_t pte) |
| 85 | 74 | { |
| 86 | 75 | if (radix_enabled()) |
| 87 | 76 | return pte; |
| .. | .. |
| 100 | 89 | return pte; |
| 101 | 90 | } |
| 102 | 91 | |
| 103 | | -static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, |
| 104 | | - int dirty) |
| 105 | | -{ |
| 106 | | - return pte; |
| 107 | | -} |
| | 92 | +#else /* CONFIG_PPC_BOOK3S */ |
| 108 | 93 | |
| 109 | | -#else /* defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 */ |
| | 94 | +static pte_t set_pte_filter_hash(pte_t pte) { return pte; } |
| | 95 | + |
| | 96 | +#endif /* CONFIG_PPC_BOOK3S */ |
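
With set_pte_filter_hash() now defined in every build (real on book3s, a stub elsewhere), the old compile-time `#ifdef` dispatch can become a runtime feature test. A minimal sketch of the resulting shape, distilled from the hunks around it:

```c
/* Sketch: compile-time #ifdef dispatch becomes a runtime MMU-feature test. */
static inline pte_t set_pte_filter(pte_t pte)
{
	if (mmu_has_feature(MMU_FTR_HPTE_TABLE))	/* hash-table MMU */
		return set_pte_filter_hash(pte);	/* book3s: real, else stub */
	/* ... otherwise fall through to the embedded-MMU exec filtering
	 * shown in the next hunk ... */
	return pte;
}
```
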
| 110 | 97 | |
| 111 | 98 | /* Embedded type MMU with HW exec support. This is a bit more complicated |
| 112 | 99 | * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so |
| 113 | 100 | * instead we "filter out" the exec permission for non clean pages. |
| 114 | 101 | */ |
| 115 | | -static pte_t set_pte_filter(pte_t pte) |
| | 102 | +static inline pte_t set_pte_filter(pte_t pte) |
| 116 | 103 | { |
| 117 | 104 | struct page *pg; |
| 118 | 105 | |
| | 106 | + if (mmu_has_feature(MMU_FTR_HPTE_TABLE)) |
| | 107 | + return set_pte_filter_hash(pte); |
| | 108 | + |
| 119 | 109 | /* No exec permission in the first place, move on */ |
| 120 | | - if (!(pte_val(pte) & _PAGE_EXEC) || !pte_looks_normal(pte)) |
| | 110 | + if (!pte_exec(pte) || !pte_looks_normal(pte)) |
| 121 | 111 | return pte; |
| 122 | 112 | |
| 123 | 113 | /* If you set _PAGE_EXEC on weird pages you're on your own */ |
| .. | .. |
| 137 | 127 | } |
| 138 | 128 | |
| 139 | 129 | /* Else, we filter out _PAGE_EXEC */ |
| 140 | | - return __pte(pte_val(pte) & ~_PAGE_EXEC); |
| | 130 | + return pte_exprotect(pte); |
| 141 | 131 | } |
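
In outline, the embedded-MMU filter implements a lazy icache-coherency scheme (an informal summary; the elided middle of the function does the page lookup and cache flush):

```c
/*
 * Informal outline of set_pte_filter() on embedded MMUs:
 *
 *   1. PTE not executable, or not a "normal" page   -> return unchanged
 *   2. page already known clean (PG_arch_1 set)     -> keep _PAGE_EXEC
 *   3. exec fault on a not-yet-clean page           -> flush I$/D$,
 *      mark the page clean, keep _PAGE_EXEC
 *   4. otherwise                                    -> pte_exprotect(pte);
 *      a later exec fault restores the bit lazily (next function)
 */
```
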
| 142 | 132 | |
| 143 | 133 | static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, |
| .. | .. |
| 145 | 135 | { |
| 146 | 136 | struct page *pg; |
| 147 | 137 | |
| | 138 | + if (mmu_has_feature(MMU_FTR_HPTE_TABLE)) |
| | 139 | + return pte; |
| | 140 | + |
| 148 | 141 | /* So here, we only care about exec faults, as we use them |
| 149 | 142 | * to recover lost _PAGE_EXEC and perform I$/D$ coherency |
| 150 | 143 | * if necessary. Also if _PAGE_EXEC is already set, same deal, |
| 151 | 144 | * we just bail out |
| 152 | 145 | */ |
| 153 | | - if (dirty || (pte_val(pte) & _PAGE_EXEC) || !is_exec_fault()) |
| | 146 | + if (dirty || pte_exec(pte) || !is_exec_fault()) |
| 154 | 147 | return pte; |
| 155 | 148 | |
| 156 | 149 | #ifdef CONFIG_DEBUG_VM |
| .. | .. |
| 176 | 169 | set_bit(PG_arch_1, &pg->flags); |
| 177 | 170 | |
| 178 | 171 | bail: |
| 179 | | - return __pte(pte_val(pte) | _PAGE_EXEC); |
| | 172 | + return pte_mkexec(pte); |
| 180 | 173 | } |
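
And the recovery half of the scheme: when a user executes from a page whose _PAGE_EXEC was filtered out, the resulting exec fault funnels through here. Informally:

```c
/*
 * Informal flow of set_access_flags_filter():
 *
 *   - hash MMU, dirty update, non-exec fault, or _PAGE_EXEC already
 *     set                                          -> returned unchanged
 *   - else (exec fault on a filtered PTE): flush I$/D$ if the page is
 *     not yet clean, set PG_arch_1, and return pte_mkexec(pte) so the
 *     lost exec permission is restored
 */
```
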
| 181 | | - |
| 182 | | -#endif /* !(defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0) */ |
| 183 | 174 | |
| 184 | 175 | /* |
| 185 | 176 | * set_pte stores a linux PTE into the linux page table. |
| .. | .. |
| 188 | 179 | pte_t pte) |
| 189 | 180 | { |
| 190 | 181 | /* |
| 191 | | - * When handling numa faults, we already have the pte marked |
| 192 | | - * _PAGE_PRESENT, but we can be sure that it is not in hpte. |
| 193 | | - * Hence we can use set_pte_at for them. |
| | 182 | + * Make sure hardware valid bit is not set. We don't do |
| | 183 | + * tlb flush for this update. |
| 194 | 184 | */ |
| 195 | | - VM_WARN_ON(pte_present(*ptep) && !pte_protnone(*ptep)); |
| 196 | | - |
| 197 | | - /* Add the pte bit when trying to set a pte */ |
| 198 | | - pte = __pte(pte_val(pte) | _PAGE_PTE); |
| | 185 | + VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); |
| 199 | 186 | |
| 200 | 187 | /* Note: mm->context.id might not yet have been assigned as |
| 201 | 188 | * this context might not have been activated yet when this |
| .. | .. |
| 205 | 192 | |
| 206 | 193 | /* Perform the setting of the PTE */ |
| 207 | 194 | __set_pte_at(mm, addr, ptep, pte, 0); |
| | 195 | +} |
| | 196 | + |
| | 197 | +void unmap_kernel_page(unsigned long va) |
| | 198 | +{ |
| | 199 | + pmd_t *pmdp = pmd_off_k(va); |
| | 200 | + pte_t *ptep = pte_offset_kernel(pmdp, va); |
| | 201 | + |
| | 202 | + pte_clear(&init_mm, va, ptep); |
| | 203 | + flush_tlb_kernel_range(va, va + PAGE_SIZE); |
| 208 | 204 | } |
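
The new unmap_kernel_page() removes one page's kernel mapping and flushes its TLB entry. A hypothetical usage sketch (example_guard_page() is invented for illustration, not taken from this patch):

```c
/* Hypothetical illustration: make a kernel page fault on any access. */
static void example_guard_page(void *addr)
{
	unmap_kernel_page((unsigned long)addr & PAGE_MASK);
	/* Subsequent loads/stores to this page now take a kernel fault. */
}
```
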
| 209 | 205 | |
| 210 | 206 | /* |
| .. | .. |
| 229 | 225 | } |
| 230 | 226 | |
| 231 | 227 | #ifdef CONFIG_HUGETLB_PAGE |
| 232 | | -extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, |
| 233 | | - unsigned long addr, pte_t *ptep, |
| 234 | | - pte_t pte, int dirty) |
| | 228 | +int huge_ptep_set_access_flags(struct vm_area_struct *vma, |
| | 229 | + unsigned long addr, pte_t *ptep, |
| | 230 | + pte_t pte, int dirty) |
| 235 | 231 | { |
| 236 | 232 | #ifdef HUGETLB_NEED_PRELOAD |
| 237 | 233 | /* |
| .. | .. |
| 258 | 254 | |
| 259 | 255 | #else |
| 260 | 256 | /* |
| 261 | | - * Not used on non book3s64 platforms. But 8xx |
| 262 | | - * can possibly use tsize derived from hstate. |
| | 257 | + * Not used on non book3s64 platforms. |
| | 258 | + * 8xx compares it with mmu_virtual_psize to |
| | 259 | + * know if it is a huge page or not. |
| 263 | 260 | */ |
| 264 | | - psize = 0; |
| | 261 | + psize = MMU_PAGE_COUNT; |
| 265 | 262 | #endif |
| 266 | 263 | __ptep_set_access_flags(vma, ptep, pte, addr, psize); |
| 267 | 264 | } |
| 268 | 265 | return changed; |
| 269 | 266 | #endif |
| 270 | 267 | } |
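
Why MMU_PAGE_COUNT rather than 0: page-size indices are zero-based, so 0 is itself a valid size. A sketch of the consumer-side test the new comment alludes to (the helper name is hypothetical):

```c
/*
 * Informal note: MMU_PAGE_4K == 0, so the old "psize = 0" could alias
 * the base page size and make the 8xx side wrongly treat a huge update
 * as a normal one.  MMU_PAGE_COUNT is one past the last valid index,
 * so the comparison below always reads "huge" on this huge-only path.
 */
static inline bool example_psize_is_huge(int psize)
{
	return psize != mmu_virtual_psize;
}
```
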
| | 268 | + |
| | 269 | +#if defined(CONFIG_PPC_8xx) |
| | 270 | +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) |
| | 271 | +{ |
| | 272 | + pmd_t *pmd = pmd_off(mm, addr); |
| | 273 | + pte_basic_t val; |
| | 274 | + pte_basic_t *entry = &ptep->pte; |
| | 275 | + int num, i; |
| | 276 | + |
| | 277 | + /* |
| | 278 | + * Make sure hardware valid bit is not set. We don't do |
| | 279 | + * tlb flush for this update. |
| | 280 | + */ |
| | 281 | + VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); |
| | 282 | + |
| | 283 | + pte = set_pte_filter(pte); |
| | 284 | + |
| | 285 | + val = pte_val(pte); |
| | 286 | + |
| | 287 | + num = number_of_cells_per_pte(pmd, val, 1); |
| | 288 | + |
| | 289 | + for (i = 0; i < num; i++, entry++, val += SZ_4K) |
| | 290 | + *entry = val; |
| | 291 | +} |
| | 292 | +#endif |
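
A worked trace of the cell loop above, assuming number_of_cells_per_pte() returns 4 (an illustrative value; the real count depends on the page size and the pmd):

```c
/*
 * Illustrative walk of the loop for num == 4:
 *
 *   entry[0] = val            maps va + 0K  -> pa + 0K
 *   entry[1] = val + SZ_4K    maps va + 4K  -> pa + 4K
 *   entry[2] = val + 2*SZ_4K  maps va + 8K  -> pa + 8K
 *   entry[3] = val + 3*SZ_4K  maps va + 12K -> pa + 12K
 *
 * Each consecutive 4K PTE cell advances the physical address by 4K,
 * so one huge mapping is expressed as num contiguous normal cells.
 */
```
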
| 271 | 293 | #endif /* CONFIG_HUGETLB_PAGE */ |
| 272 | 294 | |
| 273 | 295 | #ifdef CONFIG_DEBUG_VM |
| 274 | 296 | void assert_pte_locked(struct mm_struct *mm, unsigned long addr) |
| 275 | 297 | { |
| 276 | 298 | pgd_t *pgd; |
| | 299 | + p4d_t *p4d; |
| 277 | 300 | pud_t *pud; |
| 278 | 301 | pmd_t *pmd; |
| 279 | 302 | |
| .. | .. |
| 281 | 304 | return; |
| 282 | 305 | pgd = mm->pgd + pgd_index(addr); |
| 283 | 306 | BUG_ON(pgd_none(*pgd)); |
| 284 | | - pud = pud_offset(pgd, addr); |
| | 307 | + p4d = p4d_offset(pgd, addr); |
| | 308 | + BUG_ON(p4d_none(*p4d)); |
| | 309 | + pud = pud_offset(p4d, addr); |
| 285 | 310 | BUG_ON(pud_none(*pud)); |
| 286 | 311 | pmd = pmd_offset(pud, addr); |
| 287 | 312 | /* |
| 288 | 313 | * khugepaged to collapse normal pages to hugepage, first set |
| 289 | | - * pmd to none to force page fault/gup to take mmap_sem. After |
| | 314 | + * pmd to none to force page fault/gup to take mmap_lock. After |
| 290 | 315 | * pmd is set to none, we do a pte_clear which does this assertion |
| 291 | 316 | * so if we find pmd none, return. |
| 292 | 317 | */ |
| .. | .. |
| 305 | 330 | return __pa(pfn_to_kaddr(pfn)) + offset_in_page(va); |
| 306 | 331 | } |
| 307 | 332 | EXPORT_SYMBOL_GPL(vmalloc_to_phys); |
| | 333 | + |
| | 334 | +/* |
| | 335 | + * We have 4 cases for pgds and pmds: |
| | 336 | + * (1) invalid (all zeroes) |
| | 337 | + * (2) pointer to next table, as normal; bottom 6 bits == 0 |
| | 338 | + * (3) leaf pte for huge page _PAGE_PTE set |
| | 339 | + * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table |
| | 340 | + * |
| | 341 | + * So long as we atomically load page table pointers we are safe against teardown, |
| | 342 | + * we can follow the address down to the page and take a ref on it. |
| | 343 | + * This function needs to be called with interrupts disabled. We use this variant |
| | 344 | + * when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED |
| | 345 | + */ |
| | 346 | +pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, |
| | 347 | + bool *is_thp, unsigned *hpage_shift) |
| | 348 | +{ |
| | 349 | + pgd_t *pgdp; |
| | 350 | + p4d_t p4d, *p4dp; |
| | 351 | + pud_t pud, *pudp; |
| | 352 | + pmd_t pmd, *pmdp; |
| | 353 | + pte_t *ret_pte; |
| | 354 | + hugepd_t *hpdp = NULL; |
| | 355 | + unsigned pdshift; |
| | 356 | + |
| | 357 | + if (hpage_shift) |
| | 358 | + *hpage_shift = 0; |
| | 359 | + |
| | 360 | + if (is_thp) |
| | 361 | + *is_thp = false; |
| | 362 | + |
| | 363 | + /* |
| | 364 | + * Always operate on the local stack value. This makes sure the |
| | 365 | + * value doesn't get updated by a parallel THP split/collapse, |
| | 366 | + * page fault or a page unmap. The returned pte_t * is still not |
| | 367 | + * stable, so the caller must check for the above conditions. |
| | 368 | + * Top level is an exception because it is folded into p4d. |
| | 369 | + */ |
| | 370 | + pgdp = pgdir + pgd_index(ea); |
| | 371 | + p4dp = p4d_offset(pgdp, ea); |
| | 372 | + p4d = READ_ONCE(*p4dp); |
| | 373 | + pdshift = P4D_SHIFT; |
| | 374 | + |
| | 375 | + if (p4d_none(p4d)) |
| | 376 | + return NULL; |
| | 377 | + |
| | 378 | + if (p4d_is_leaf(p4d)) { |
| | 379 | + ret_pte = (pte_t *)p4dp; |
| | 380 | + goto out; |
| | 381 | + } |
| | 382 | + |
| | 383 | + if (is_hugepd(__hugepd(p4d_val(p4d)))) { |
| | 384 | + hpdp = (hugepd_t *)&p4d; |
| | 385 | + goto out_huge; |
| | 386 | + } |
| | 387 | + |
| | 388 | + /* |
| | 389 | + * Even if we end up with an unmap, the pgtable will not |
| | 390 | + * be freed, because we do an rcu free and here we are |
| | 391 | + * irq disabled |
| | 392 | + */ |
| | 393 | + pdshift = PUD_SHIFT; |
| | 394 | + pudp = pud_offset(&p4d, ea); |
| | 395 | + pud = READ_ONCE(*pudp); |
| | 396 | + |
| | 397 | + if (pud_none(pud)) |
| | 398 | + return NULL; |
| | 399 | + |
| | 400 | + if (pud_is_leaf(pud)) { |
| | 401 | + ret_pte = (pte_t *)pudp; |
| | 402 | + goto out; |
| | 403 | + } |
| | 404 | + |
| | 405 | + if (is_hugepd(__hugepd(pud_val(pud)))) { |
| | 406 | + hpdp = (hugepd_t *)&pud; |
| | 407 | + goto out_huge; |
| | 408 | + } |
| | 409 | + |
| | 410 | + pdshift = PMD_SHIFT; |
| | 411 | + pmdp = pmd_offset(&pud, ea); |
| | 412 | + pmd = READ_ONCE(*pmdp); |
| | 413 | + |
| | 414 | + /* |
| | 415 | + * A hugepage collapse is captured by this condition, see |
| | 416 | + * pmdp_collapse_flush. |
| | 417 | + */ |
| | 418 | + if (pmd_none(pmd)) |
| | 419 | + return NULL; |
| | 420 | + |
| | 421 | +#ifdef CONFIG_PPC_BOOK3S_64 |
| | 422 | + /* |
| | 423 | + * A hugepage split is captured by this condition, see |
| | 424 | + * pmdp_invalidate. |
| | 425 | + * |
| | 426 | + * Huge page modification can be caught here too. |
| | 427 | + */ |
| | 428 | + if (pmd_is_serializing(pmd)) |
| | 429 | + return NULL; |
| | 430 | +#endif |
| | 431 | + |
| | 432 | + if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) { |
| | 433 | + if (is_thp) |
| | 434 | + *is_thp = true; |
| | 435 | + ret_pte = (pte_t *)pmdp; |
| | 436 | + goto out; |
| | 437 | + } |
| | 438 | + |
| | 439 | + if (pmd_is_leaf(pmd)) { |
| | 440 | + ret_pte = (pte_t *)pmdp; |
| | 441 | + goto out; |
| | 442 | + } |
| | 443 | + |
| | 444 | + if (is_hugepd(__hugepd(pmd_val(pmd)))) { |
| | 445 | + hpdp = (hugepd_t *)&pmd; |
| | 446 | + goto out_huge; |
| | 447 | + } |
| | 448 | + |
| | 449 | + return pte_offset_kernel(&pmd, ea); |
| | 450 | + |
| | 451 | +out_huge: |
| | 452 | + if (!hpdp) |
| | 453 | + return NULL; |
| | 454 | + |
| | 455 | + ret_pte = hugepte_offset(*hpdp, ea, pdshift); |
| | 456 | + pdshift = hugepd_shift(*hpdp); |
| | 457 | +out: |
| | 458 | + if (hpage_shift) |
| | 459 | + *hpage_shift = pdshift; |
| | 460 | + return ret_pte; |
| | 461 | +} |
| | 462 | +EXPORT_SYMBOL_GPL(__find_linux_pte); |
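
Per the header comment, __find_linux_pte() is only safe with interrupts disabled: powerpc frees page tables via RCU, and an irq-off section holds off that free. A hypothetical caller sketch (example_ea_to_pfn() is invented for illustration):

```c
/* Hypothetical caller: translate an effective address to a pfn. */
static unsigned long example_ea_to_pfn(struct mm_struct *mm, unsigned long ea)
{
	unsigned long flags, pfn = 0;
	unsigned int shift;
	bool is_thp;
	pte_t *ptep;

	local_irq_save(flags);		/* hold off RCU page-table free */
	ptep = __find_linux_pte(mm->pgd, ea, &is_thp, &shift);
	if (ptep && pte_present(*ptep))
		pfn = pte_pfn(*ptep);
	local_irq_restore(flags);	/* ptep must not be used past here */
	return pfn;
}
```
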