/*
 * SPDX-License-Identifier: GPL-2.0
 *
 * Copyright (C) 2016 Philippe Gerum.
 */
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/prctl.h>
#include <linux/irq_pipeline.h>
#include <linux/dovetail.h>
#include <asm/syscall.h>

static bool dovetail_enabled;

void __weak arch_inband_task_init(struct task_struct *p)
{
}

void inband_task_init(struct task_struct *p)
{
	struct thread_info *ti = task_thread_info(p);

	clear_ti_local_flags(ti, _TLF_DOVETAIL|_TLF_OOB|_TLF_OFFSTAGE);
	arch_inband_task_init(p);
}

void dovetail_init_altsched(struct dovetail_altsched_context *p)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;

	check_inband_stage();
	p->task = tsk;
	p->active_mm = mm;
	p->borrowed_mm = false;

	/*
	 * Make sure the current process will not share any private
	 * page with its child upon fork(), sparing it the random
	 * latency induced by COW. MMF_DOVETAILED is never cleared
	 * once set. We serialize with dup_mmap() which holds the mm
	 * write lock.
	 */
	if (!(tsk->flags & PF_KTHREAD) &&
	    !test_bit(MMF_DOVETAILED, &mm->flags)) {
		mmap_write_lock(mm);
		__set_bit(MMF_DOVETAILED, &mm->flags);
		mmap_write_unlock(mm);
	}
}
EXPORT_SYMBOL_GPL(dovetail_init_altsched);

void dovetail_start_altsched(void)
{
	check_inband_stage();
	set_thread_local_flags(_TLF_DOVETAIL);
}
EXPORT_SYMBOL_GPL(dovetail_start_altsched);

void dovetail_stop_altsched(void)
{
	clear_thread_local_flags(_TLF_DOVETAIL);
	clear_thread_flag(TIF_MAYDAY);
}
EXPORT_SYMBOL_GPL(dovetail_stop_altsched);

int __weak handle_oob_syscall(struct pt_regs *regs)
{
	return 0;
}

int __weak handle_pipelined_syscall(struct irq_stage *stage,
				    struct pt_regs *regs)
{
	return 0;	/* i.e. propagate to the in-band handler. */
}

void __weak handle_oob_mayday(struct pt_regs *regs)
{
}

static inline void call_mayday(struct thread_info *ti, struct pt_regs *regs)
{
	clear_ti_thread_flag(ti, TIF_MAYDAY);
	handle_oob_mayday(regs);
}

void dovetail_call_mayday(struct pt_regs *regs)
{
	struct thread_info *ti = current_thread_info();
	unsigned long flags;

	flags = hard_local_irq_save();
	call_mayday(ti, regs);
	hard_local_irq_restore(flags);
}

void inband_retuser_notify(void)
{
	clear_thread_flag(TIF_RETUSER);
	inband_event_notify(INBAND_TASK_RETUSER, current);
	/* CAUTION: we might have switched out-of-band here. */
}

int __pipeline_syscall(struct pt_regs *regs)
{
	struct thread_info *ti = current_thread_info();
	struct irq_stage *caller_stage, *target_stage;
	struct irq_stage_data *p, *this_context;
	unsigned long flags;
	int ret = 0;

	/*
	 * We should definitely not pipeline a syscall through the
	 * slow path with IRQs off.
	 */
	WARN_ON_ONCE(dovetail_debug() && hard_irqs_disabled());

	if (!dovetail_enabled)
		return 0;

	flags = hard_local_irq_save();
	caller_stage = current_irq_stage;
	this_context = current_irq_staged;
	target_stage = &oob_stage;
next:
	p = this_staged(target_stage);
	set_current_irq_staged(p);
	hard_local_irq_restore(flags);
	ret = handle_pipelined_syscall(caller_stage, regs);
	flags = hard_local_irq_save();

	/*
	 * Be careful about stage switching _and_ CPU migration that
	 * might have happened as a result of handing over the
	 * syscall to the out-of-band handler.
	 *
	 * - if a stage migration is detected, fetch the new
	 * per-stage, per-CPU context pointer.
	 *
	 * - if no stage migration happened, switch back to the
	 * initial call stage, on a possibly different CPU though.
	 */
	if (current_irq_stage != target_stage) {
		this_context = current_irq_staged;
	} else {
		p = this_staged(this_context->stage);
		set_current_irq_staged(p);
	}

	if (this_context->stage == &inband_stage) {
		if (target_stage != &inband_stage && ret == 0) {
			target_stage = &inband_stage;
			goto next;
		}
		p = this_inband_staged();
		if (stage_irqs_pending(p))
			sync_current_irq_stage();
	} else {
		if (test_ti_thread_flag(ti, TIF_MAYDAY))
			call_mayday(ti, regs);
	}

	hard_local_irq_restore(flags);

	return ret;
}
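
/*
 * Illustration only, not part of the pipeline code: a companion core
 * is expected to override the weak handle_pipelined_syscall() hook
 * above in order to claim the syscalls it recognizes. The
 * example_core_* helpers below are hypothetical. Returning zero
 * propagates the request (eventually down to the in-band
 * dispatcher), returning non-zero tells __pipeline_syscall() that the
 * request was fully handled:
 *
 *	int handle_pipelined_syscall(struct irq_stage *stage,
 *				     struct pt_regs *regs)
 *	{
 *		if (!example_core_owns_syscall(regs))
 *			return 0;
 *
 *		example_core_do_syscall(stage, regs);
 *
 *		return 1;
 *	}
 */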
static inline bool maybe_oob_syscall(unsigned int nr, struct pt_regs *regs)
{
	/*
	 * Check whether the companion core might be interested in
	 * this syscall. If the old syscall form is handled, pass the
	 * request to the core if __OOB_SYSCALL_BIT is set in
	 * @nr. Otherwise, only check whether an oob syscall is
	 * folded into a prctl() request.
	 */
	if (IS_ENABLED(CONFIG_DOVETAIL_LEGACY_SYSCALL_RANGE)) {
		if (nr & __OOB_SYSCALL_BIT)
			return true;
	}

	return arch_dovetail_is_syscall(nr) &&
		syscall_get_arg0(regs) & __OOB_SYSCALL_BIT;
}

int pipeline_syscall(unsigned int nr, struct pt_regs *regs)
{
	struct thread_info *ti = current_thread_info();
	unsigned long local_flags = READ_ONCE(ti_local_flags(ti));
	int ret;

	WARN_ON_ONCE(dovetail_debug() && hard_irqs_disabled());

	/*
	 * If the syscall signature belongs to the out-of-band
	 * syscall set and we are running out-of-band, pass the
	 * request directly to the companion core by calling the oob
	 * syscall handler.
	 *
	 * Otherwise, if this is an out-of-band syscall or alternate
	 * scheduling is enabled for the caller, propagate the
	 * syscall through the pipeline stages, so that:
	 *
	 * - the core can manipulate the current execution stage for
	 * handling the request, which includes switching the current
	 * thread back to the in-band context if the syscall is a
	 * native one, or promoting it to the oob stage if handling
	 * an oob syscall requires this.
	 *
	 * - the core can receive the initial oob syscall a thread
	 * might have to emit for enabling dovetailing from the
	 * in-band stage.
	 *
	 * Native syscalls from common (non-dovetailed) threads are
	 * not subject to pipelining, but flow down to the in-band
	 * system call handler directly.
	 *
	 * Sanity check: we bark on returning from a syscall on a
	 * stalled in-band stage, which combined with running with
	 * hard irqs on might cause interrupts to linger in the log
	 * after exiting to user.
	 */
	if ((local_flags & _TLF_OOB) && maybe_oob_syscall(nr, regs)) {
		ret = handle_oob_syscall(regs);
		if (!IS_ENABLED(CONFIG_DOVETAIL_LEGACY_SYSCALL_RANGE))
			WARN_ON_ONCE(dovetail_debug() && !ret);
		local_flags = READ_ONCE(ti_local_flags(ti));
		if (likely(ret)) {
			if (local_flags & _TLF_OOB) {
				if (test_ti_thread_flag(ti, TIF_MAYDAY))
					dovetail_call_mayday(regs);
				return 1; /* don't pass down, no tail work. */
			} else {
				WARN_ON_ONCE(dovetail_debug() &&
					irqs_disabled());
				return -1; /* don't pass down, do tail work. */
			}
		}
	}

	if ((local_flags & _TLF_DOVETAIL) || maybe_oob_syscall(nr, regs)) {
		ret = __pipeline_syscall(regs);
		local_flags = READ_ONCE(ti_local_flags(ti));
		if (local_flags & _TLF_OOB)
			return 1; /* don't pass down, no tail work. */
		if (ret) {
			WARN_ON_ONCE(dovetail_debug() && irqs_disabled());
			return -1; /* don't pass down, do tail work. */
		}
	}

	return 0; /* pass syscall down to the in-band dispatcher. */
}
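
/*
 * Sketch of how an architecture's syscall entry path is expected to
 * act on the return value of pipeline_syscall() above; the names in
 * angle brackets are placeholders, not taken from any particular
 * arch:
 *
 *	ret = pipeline_syscall(nr, regs);
 *	if (ret > 0)
 *		return;			no in-band dispatch, no tail work
 *	if (ret < 0)
 *		goto tail_work;		skip the in-band dispatcher only
 *
 *	<dispatch the syscall in-band>
 * tail_work:
 *	<handle signals and rescheduling, then return to user>
 */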
void __weak handle_oob_trap_entry(unsigned int trapnr,
				  struct pt_regs *regs)
{
}

noinstr void __oob_trap_notify(unsigned int exception,
			       struct pt_regs *regs)
{
	unsigned long flags;

	/*
	 * We send a notification about exceptions raised over a
	 * registered oob stage only. The trap_entry handler expects
	 * hard irqs off on entry; it may demote the current context
	 * to the in-band stage, and may return with hard irqs on.
	 */
	if (dovetail_enabled) {
		set_thread_local_flags(_TLF_OOBTRAP);
		flags = hard_local_irq_save();
		instrumentation_begin();
		handle_oob_trap_entry(exception, regs);
		instrumentation_end();
		hard_local_irq_restore(flags);
	}
}

void __weak handle_oob_trap_exit(unsigned int trapnr,
				 struct pt_regs *regs)
{
}

noinstr void __oob_trap_unwind(unsigned int exception, struct pt_regs *regs)
{
	/*
	 * The trap_exit handler runs only if trap_entry was called
	 * for the same trap occurrence. It expects hard irqs off on
	 * entry and may switch the current context back to the oob
	 * stage. It must return with hard irqs off.
	 */
	hard_local_irq_disable();
	clear_thread_local_flags(_TLF_OOBTRAP);
	instrumentation_begin();
	handle_oob_trap_exit(exception, regs);
	instrumentation_end();
}
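
/*
 * The notify/unwind pair above is meant to bracket the arch-level
 * fault handling code, roughly along the lines of the hypothetical
 * sketch below (do_inband_fault() is a placeholder). Whether the
 * in-band handler runs depends on the companion core demoting the
 * current context from handle_oob_trap_entry():
 *
 *	__oob_trap_notify(exception, regs);
 *	if (running_inband())
 *		do_inband_fault(exception, regs);
 *	__oob_trap_unwind(exception, regs);
 */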
void __weak handle_inband_event(enum inband_event_type event, void *data)
{
}

void inband_event_notify(enum inband_event_type event, void *data)
{
	check_inband_stage();

	if (dovetail_enabled)
		handle_inband_event(event, data);
}

void __weak resume_oob_task(struct task_struct *p)
{
}

static void finalize_oob_transition(void) /* hard IRQs off */
{
	struct irq_pipeline_data *pd;
	struct irq_stage_data *p;
	struct thread_info *ti;
	struct task_struct *t;

	pd = raw_cpu_ptr(&irq_pipeline);
	t = pd->task_inflight;
	if (t == NULL)
		return;

	/*
	 * @t which is in flight to the oob stage might have received
	 * a signal while waiting in off-stage state to be actually
	 * scheduled out. We can't act upon that signal safely from
	 * here, so we simply let the task complete the migration
	 * process to the oob stage. The pending signal will be
	 * handled when the task eventually exits the out-of-band
	 * context by the converse migration.
	 */
	pd->task_inflight = NULL;
	ti = task_thread_info(t);

	/*
	 * The transition handler in the companion core assumes the
	 * oob stage is stalled, fix this up.
	 */
	stall_oob();
	resume_oob_task(t);
	unstall_oob();
	p = this_oob_staged();
	if (stage_irqs_pending(p))
		/* Current stage (in-band) != p->stage (oob). */
		sync_irq_stage(p->stage);
}

void oob_trampoline(void)
{
	unsigned long flags;

	check_inband_stage();
	flags = hard_local_irq_save();
	finalize_oob_transition();
	hard_local_irq_restore(flags);
}

bool inband_switch_tail(void)
{
	bool oob;

	check_hard_irqs_disabled();

	/*
	 * We may run this code either over the inband or oob
	 * context. If inband, we may have a thread blocked in
	 * dovetail_leave_inband(), waiting for the companion core to
	 * schedule it back in over the oob context, in which case
	 * finalize_oob_transition() should take care of it. If oob,
	 * the core just switched us back, and we may update the
	 * context markers before returning to context_switch().
	 *
	 * Since the preemption count does not reflect the active
	 * stage yet upon inband -> oob transition, we figure out
	 * which one we are on by testing _TLF_OFFSTAGE. Having this
	 * bit set when running the inband switch tail code means
	 * that we are completing such a transition for the current
	 * task, switched in by dovetail_context_switch() over the
	 * oob stage. If so, update the context markers appropriately.
	 */
	oob = test_thread_local_flags(_TLF_OFFSTAGE);
	if (oob) {
		/*
		 * The companion core assumes a stalled stage on exit
		 * from dovetail_leave_inband().
		 */
		stall_oob();
		set_thread_local_flags(_TLF_OOB);
		if (!IS_ENABLED(CONFIG_HAVE_PERCPU_PREEMPT_COUNT)) {
			WARN_ON_ONCE(dovetail_debug() &&
				(preempt_count() & STAGE_MASK));
			preempt_count_add(STAGE_OFFSET);
		}
	} else {
		finalize_oob_transition();
		hard_local_irq_enable();
	}

	return oob;
}

void __weak inband_clock_was_set(void)
{
}

void __weak install_inband_fd(unsigned int fd, struct file *file,
			      struct files_struct *files)
{
}

void __weak uninstall_inband_fd(unsigned int fd, struct file *file,
				struct files_struct *files)
{
}

void __weak replace_inband_fd(unsigned int fd, struct file *file,
			      struct files_struct *files)
{
}

int dovetail_start(void)
{
	check_inband_stage();

	if (dovetail_enabled)
		return -EBUSY;

	if (!oob_stage_present())
		return -EAGAIN;

	dovetail_enabled = true;
	smp_wmb();

	return 0;
}
EXPORT_SYMBOL_GPL(dovetail_start);

void dovetail_stop(void)
{
	check_inband_stage();

	dovetail_enabled = false;
	smp_wmb();
}
EXPORT_SYMBOL_GPL(dovetail_stop);
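
/*
 * Illustrative bring-up sequence for a companion core: dovetail_start()
 * returns -EAGAIN unless an oob stage has been registered first, and
 * alternate scheduling is then enabled per-thread through
 * dovetail_init_altsched()/dovetail_start_altsched(). The stage name
 * and error handling below are placeholders, not a definitive
 * implementation:
 *
 *	ret = enable_oob_stage("example_core");
 *	if (!ret)
 *		ret = dovetail_start();
 *	...
 *	dovetail_stop();
 *	disable_oob_stage();
 */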