```diff
@@ -1,6 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright 2007 Andi Kleen, SUSE Labs.
- * Subject to the GPL, v.2
  *
  * This contains most of the x86 vDSO kernel-side code.
  */
@@ -14,15 +14,31 @@
 #include <linux/elf.h>
 #include <linux/cpu.h>
 #include <linux/ptrace.h>
+#include <linux/time_namespace.h>
+
 #include <asm/pvclock.h>
 #include <asm/vgtod.h>
 #include <asm/proto.h>
 #include <asm/vdso.h>
 #include <asm/vvar.h>
+#include <asm/tlb.h>
 #include <asm/page.h>
 #include <asm/desc.h>
 #include <asm/cpufeature.h>
-#include <asm/mshyperv.h>
+#include <clocksource/hyperv_timer.h>
+
+#undef _ASM_X86_VVAR_H
+#define EMIT_VVAR(name, offset)	\
+	const size_t name ## _offset = offset;
+#include <asm/vvar.h>
+
+struct vdso_data *arch_get_vdso_data(void *vvar_page)
+{
+	return (struct vdso_data *)(vvar_page + _vdso_data_offset);
+}
+#undef EMIT_VVAR
+
+unsigned int vclocks_used __read_mostly;
 
 #if defined(CONFIG_X86_64)
 unsigned int __read_mostly vdso64_enabled = 1;
```
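The `#undef _ASM_X86_VVAR_H` / `EMIT_VVAR` sequence above is an X-macro trick: `asm/vvar.h` describes each vvar object as `EMIT_VVAR(name, offset)`, so re-including it behind a redefined `EMIT_VVAR` turns the layout description into `name ## _offset` constants that `arch_get_vdso_data()` can use. Below is a minimal userspace sketch of the same pattern; the `LAYOUT`/`EMIT_FIELD` names and offsets are hypothetical, and the layout is inlined into a wrapper macro here rather than re-included past an include guard as the kernel does:

```c
#include <stddef.h>
#include <stdio.h>

/*
 * Hypothetical layout description, standing in for asm/vvar.h:
 * each EMIT_FIELD(name, offset) line describes one object in a
 * shared page.
 */
#define LAYOUT(X)	\
	X(foo, 0)	\
	X(bar, 128)

/* Expand the layout into offset constants, as vma.c does. */
#define EMIT_FIELD(name, offset) const size_t name ## _offset = offset;
LAYOUT(EMIT_FIELD)
#undef EMIT_FIELD

int main(void)
{
	/* bar lives 128 bytes into the page */
	printf("bar_offset = %zu\n", bar_offset);
	return 0;
}
```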
```diff
@@ -37,9 +53,10 @@
 			  image->alt_len));
 }
 
+static const struct vm_special_mapping vvar_mapping;
 struct linux_binprm;
 
-static int vdso_fault(const struct vm_special_mapping *sm,
+static vm_fault_t vdso_fault(const struct vm_special_mapping *sm,
 		      struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	const struct vdso_image *image = vma->vm_mm->context.vdso_image;
@@ -84,12 +101,74 @@
 	return 0;
 }
 
-static int vvar_fault(const struct vm_special_mapping *sm,
+static int vvar_mremap(const struct vm_special_mapping *sm,
+		struct vm_area_struct *new_vma)
+{
+	const struct vdso_image *image = new_vma->vm_mm->context.vdso_image;
+	unsigned long new_size = new_vma->vm_end - new_vma->vm_start;
+
+	if (new_size != -image->sym_vvar_start)
+		return -EINVAL;
+
+	return 0;
+}
+
+#ifdef CONFIG_TIME_NS
+static struct page *find_timens_vvar_page(struct vm_area_struct *vma)
+{
+	if (likely(vma->vm_mm == current->mm))
+		return current->nsproxy->time_ns->vvar_page;
+
+	/*
+	 * VM_PFNMAP | VM_IO protect .fault() handler from being called
+	 * through interfaces like /proc/$pid/mem or
+	 * process_vm_{readv,writev}() as long as there's no .access()
+	 * in special_mapping_vmops().
+	 * For more details check_vma_flags() and __access_remote_vm()
+	 */
+
+	WARN(1, "vvar_page accessed remotely");
+
+	return NULL;
+}
+
+/*
+ * The vvar page layout depends on whether a task belongs to the root or
+ * non-root time namespace. Whenever a task changes its namespace, the VVAR
+ * page tables are cleared and then they will be re-faulted with a
+ * corresponding layout.
+ * See also the comment near timens_setup_vdso_data() for details.
+ */
+int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
+{
+	struct mm_struct *mm = task->mm;
+	struct vm_area_struct *vma;
+
+	mmap_read_lock(mm);
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		unsigned long size = vma->vm_end - vma->vm_start;
+
+		if (vma_is_special_mapping(vma, &vvar_mapping))
+			zap_page_range(vma, vma->vm_start, size);
+	}
+
+	mmap_read_unlock(mm);
+	return 0;
+}
+#else
+static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma)
+{
+	return NULL;
+}
+#endif
+
+static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
 		      struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	const struct vdso_image *image = vma->vm_mm->context.vdso_image;
+	unsigned long pfn;
 	long sym_offset;
-	int ret = -EFAULT;
 
 	if (!image)
 		return VM_FAULT_SIGBUS;
```
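`vdso_join_timens()` runs when a task enters a time namespace: it zaps every `[vvar]` mapping under `mmap_read_lock()` so the next vDSO access re-faults and picks up the layout for the new namespace. For context, the usual way a task gets here is the time-namespace UAPI. A rough userspace sketch follows, assuming a kernel with `CONFIG_TIME_NS` and sufficient privilege (`CAP_SYS_ADMIN`); error handling is elided and `CLONE_NEWTIME` is defined manually in case libc headers predate it:

```c
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/wait.h>

#ifndef CLONE_NEWTIME
#define CLONE_NEWTIME 0x00000080	/* from linux/sched.h */
#endif

int main(void)
{
	/* Create a new time namespace; it applies to children, not to us. */
	if (unshare(CLONE_NEWTIME))
		return 1;

	/* Shift CLOCK_MONOTONIC by one day inside the new namespace.
	 * This is only writable before any task has joined it. */
	int fd = open("/proc/self/timens_offsets", O_WRONLY);
	dprintf(fd, "monotonic 86400 0\n");
	close(fd);

	if (fork() == 0) {		/* child runs inside the namespace */
		struct timespec ts;
		clock_gettime(CLOCK_MONOTONIC, &ts);	/* vDSO fast path */
		printf("child monotonic: %ld s\n", (long)ts.tv_sec);
		_exit(0);
	}
	wait(NULL);
	return 0;
}
```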
```diff
@@ -108,28 +187,59 @@
 		return VM_FAULT_SIGBUS;
 
 	if (sym_offset == image->sym_vvar_page) {
-		ret = vm_insert_pfn(vma, vmf->address,
-				__pa_symbol(&__vvar_page) >> PAGE_SHIFT);
+		struct page *timens_page = find_timens_vvar_page(vma);
+
+		pfn = __pa_symbol(&__vvar_page) >> PAGE_SHIFT;
+
+		/*
+		 * If a task belongs to a time namespace then a namespace
+		 * specific VVAR is mapped with the sym_vvar_page offset and
+		 * the real VVAR page is mapped with the sym_timens_page
+		 * offset.
+		 * See also the comment near timens_setup_vdso_data().
+		 */
+		if (timens_page) {
+			unsigned long addr;
+			vm_fault_t err;
+
+			/*
+			 * Optimization: inside time namespace pre-fault
+			 * VVAR page too. As on timens page there are only
+			 * offsets for clocks on VVAR, it'll be faulted
+			 * shortly by VDSO code.
+			 */
+			addr = vmf->address + (image->sym_timens_page - sym_offset);
+			err = vmf_insert_pfn(vma, addr, pfn);
+			if (unlikely(err & VM_FAULT_ERROR))
+				return err;
+
+			pfn = page_to_pfn(timens_page);
+		}
+
+		return vmf_insert_pfn(vma, vmf->address, pfn);
 	} else if (sym_offset == image->sym_pvclock_page) {
 		struct pvclock_vsyscall_time_info *pvti =
 			pvclock_get_pvti_cpu0_va();
-		if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
-			ret = vm_insert_pfn_prot(
-				vma,
-				vmf->address,
-				__pa(pvti) >> PAGE_SHIFT,
-				pgprot_decrypted(vma->vm_page_prot));
+		if (pvti && vclock_was_used(VDSO_CLOCKMODE_PVCLOCK)) {
+			return vmf_insert_pfn_prot(vma, vmf->address,
+					__pa(pvti) >> PAGE_SHIFT,
+					pgprot_decrypted(vma->vm_page_prot));
 		}
 	} else if (sym_offset == image->sym_hvclock_page) {
 		struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page();
 
-		if (tsc_pg && vclock_was_used(VCLOCK_HVCLOCK))
-			ret = vm_insert_pfn(vma, vmf->address,
-					vmalloc_to_pfn(tsc_pg));
-	}
+		if (tsc_pg && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK))
+			return vmf_insert_pfn(vma, vmf->address,
+					virt_to_phys(tsc_pg) >> PAGE_SHIFT);
+	} else if (sym_offset == image->sym_timens_page) {
+		struct page *timens_page = find_timens_vvar_page(vma);
 
-	if (ret == 0 || ret == -EBUSY)
-		return VM_FAULT_NOPAGE;
+		if (!timens_page)
+			return VM_FAULT_SIGBUS;
+
+		pfn = __pa_symbol(&__vvar_page) >> PAGE_SHIFT;
+		return vmf_insert_pfn(vma, vmf->address, pfn);
+	}
 
 	return VM_FAULT_SIGBUS;
 }
```
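The new `sym_vvar_page` / `sym_timens_page` branches implement a page swap: a task in a non-root time namespace sees its namespace page at the offset the vDSO reads first, and the real VVAR page one slot over. The vDSO then redirects itself at runtime. The sketch below paraphrases the generic vDSO logic in `lib/vdso/`; `pick_vdso_data()` is a hypothetical name, and the exact kernel helper and its signature vary by version:

```c
#include <vdso/datapage.h>

/*
 * Simplified sketch (not the kernel's verbatim code) of how the
 * generic vDSO honours the page swap set up by vvar_fault() above.
 */
static __always_inline
const struct vdso_data *pick_vdso_data(const struct vdso_data *vd)
{
	/*
	 * In a non-root time namespace, the page at sym_vvar_page is
	 * the timens page: its clock_mode is VDSO_CLOCKMODE_TIMENS and
	 * it carries only per-clock offsets. The real vdso_data sits at
	 * the sym_timens_page slot, which an arch helper (in the kernel,
	 * __arch_get_timens_vdso_data() or similar) resolves.
	 */
	if (IS_ENABLED(CONFIG_TIME_NS) &&
	    vd->clock_mode == VDSO_CLOCKMODE_TIMENS)
		vd = __arch_get_timens_vdso_data();	/* real VVAR page */
	return vd;
}
```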
```diff
@@ -142,6 +252,7 @@
 static const struct vm_special_mapping vvar_mapping = {
 	.name = "[vvar]",
 	.fault = vvar_fault,
+	.mremap = vvar_mremap,
 };
 
 /*
@@ -156,7 +267,7 @@
 	unsigned long text_start;
 	int ret = 0;
 
-	if (down_write_killable(&mm->mmap_sem))
+	if (mmap_write_lock_killable(mm))
 		return -EINTR;
 
 	addr = get_unmapped_area(NULL, addr,
@@ -199,7 +310,7 @@
 	}
 
 up_fail:
-	up_write(&mm->mmap_sem);
+	mmap_write_unlock(mm);
 	return ret;
 }
 
```
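The `mmap_sem` hunks here and below are mechanical conversions to the mmap locking API from `linux/mmap_lock.h`, which hides the lock behind helpers so callers no longer touch the semaphore directly. The calling pattern is unchanged; a minimal sketch of the idiom, with a hypothetical helper name:

```c
#include <linux/mmap_lock.h>
#include <linux/mm_types.h>
#include <linux/errno.h>

/* Hypothetical helper showing the write-lock idiom used in map_vdso(). */
static int modify_address_space(struct mm_struct *mm)
{
	if (mmap_write_lock_killable(mm))	/* interrupted by a fatal signal */
		return -EINTR;

	/* ... install or tear down mappings under the write lock ... */

	mmap_write_unlock(mm);
	return 0;
}
```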
```diff
@@ -261,22 +372,22 @@
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 
-	down_write(&mm->mmap_sem);
+	mmap_write_lock(mm);
 	/*
 	 * Check if we have already mapped vdso blob - fail to prevent
 	 * abusing from userspace install_speciall_mapping, which may
 	 * not do accounting and rlimit right.
 	 * We could search vma near context.vdso, but it's a slowpath,
-	 * so let's explicitely check all VMAs to be completely sure.
+	 * so let's explicitly check all VMAs to be completely sure.
 	 */
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		if (vma_is_special_mapping(vma, &vdso_mapping) ||
 		    vma_is_special_mapping(vma, &vvar_mapping)) {
-			up_write(&mm->mmap_sem);
+			mmap_write_unlock(mm);
 			return -EEXIST;
 		}
 	}
-	up_write(&mm->mmap_sem);
+	mmap_write_unlock(mm);
 
 	return map_vdso(image, addr);
 }
```
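For context, `map_vdso_once()` is reachable from userspace: x86 wires it to `arch_prctl(ARCH_MAP_VDSO_64, addr)` (used by CRIU to restore a vDSO at a chosen address), which is why it must refuse a second mapping. A rough userspace sketch; in an ordinary process this fails with `EEXIST` because a vDSO is already mapped, which demonstrates exactly the check above:

```c
#define _GNU_SOURCE
#include <sys/syscall.h>
#include <asm/prctl.h>		/* ARCH_MAP_VDSO_64 */
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	/*
	 * Ask the kernel to map a fresh 64-bit vDSO at a hinted address
	 * (the hint value here is purely illustrative). Expect -EEXIST
	 * unless the old vDSO was unmapped first, as CRIU does.
	 */
	unsigned long hint = 0x700000000000UL;
	long ret = syscall(SYS_arch_prctl, ARCH_MAP_VDSO_64, hint);
	if (ret)
		perror("arch_prctl(ARCH_MAP_VDSO_64)");
	return 0;
}
```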
```diff
@@ -329,12 +440,14 @@
 static __init int vdso_setup(char *s)
 {
 	vdso64_enabled = simple_strtoul(s, NULL, 0);
-	return 0;
+	return 1;
 }
 __setup("vdso=", vdso_setup);
 
 static int __init init_vdso(void)
 {
+	BUILD_BUG_ON(VDSO_CLOCKMODE_MAX >= 32);
+
 	init_vdso_image(&vdso_image_64);
 
 #ifdef CONFIG_X86_X32_ABI
```
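Two small correctness fixes close the section. `vclocks_used` is an `unsigned int` bitmask with one bit per clock mode, so `BUILD_BUG_ON(VDSO_CLOCKMODE_MAX >= 32)` enforces at compile time that every mode fits. And `vdso_setup()` now returns 1, because a `__setup()` handler reports "option consumed" with a non-zero return; returning 0 lets an unrecognized `vdso=` leak through to init's argument or environment list. A sketch of the handler contract, using a hypothetical `my_param=` option:

```c
#include <linux/init.h>
#include <linux/kernel.h>

static unsigned int my_param_enabled __read_mostly = 1;

/*
 * __setup() handlers: return 1 to tell the boot-parameter code the
 * option was handled; return 0 and the kernel passes "my_param=..."
 * on to init as a command-line or environment entry.
 */
static int __init my_param_setup(char *s)
{
	my_param_enabled = simple_strtoul(s, NULL, 0);
	return 1;
}
__setup("my_param=", my_param_setup);
```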
|---|