/*
 * SPDX-License-Identifier: GPL-2.0
 *
 * Copyright (C) 2019 Philippe Gerum.
 */
#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/irq_pipeline.h>
#include <asm/apic.h>
#include <asm/traps.h>
#include <asm/idtentry.h>
#include <asm/irq_regs.h>
#include <asm/irqdomain.h>

static struct irq_domain *sipic_domain;

static void sipic_irq_noop(struct irq_data *data) { }

static unsigned int sipic_irq_noop_ret(struct irq_data *data)
{
	return 0;
}

static struct irq_chip sipic_chip = {
	.name		= "SIPIC",
	.irq_startup	= sipic_irq_noop_ret,
	.irq_shutdown	= sipic_irq_noop,
	.irq_enable	= sipic_irq_noop,
	.irq_disable	= sipic_irq_noop,
	.flags		= IRQCHIP_PIPELINE_SAFE | IRQCHIP_SKIP_SET_WAKE,
};

void handle_apic_irq(struct irq_desc *desc)
{
	if (WARN_ON_ONCE(irq_pipeline_debug() && !on_pipeline_entry()))
		return;

	/*
	 * MCE events are non-maskable therefore their in-band
	 * handlers have to be oob-compatible by construction. Those
	 * handlers run immediately out of the IDT for this reason as
	 * well. We won't see them here since they are not routed via
	 * arch_handle_irq() -> generic_pipeline_irq().
	 *
	 * All we need to do at this stage is to acknowledge other
	 * APIC events, then pipeline the corresponding interrupt from
	 * our synthetic controller chip (SIPIC).
	 */
	__ack_APIC_irq();

	handle_oob_irq(desc);
}

void irq_send_oob_ipi(unsigned int ipi, const struct cpumask *cpumask)
{
	apic->send_IPI_mask_allbutself(cpumask, apicm_irq_vector(ipi));
}
EXPORT_SYMBOL_GPL(irq_send_oob_ipi);
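
/*
 * In-band tail handler for APIC system vectors which went through the
 * pipeline: the event was acknowledged on pipeline entry
 * (handle_apic_irq()) and deferred by handle_oob_irq(), then it is
 * eventually replayed from the in-band stage via
 * arch_do_IRQ_pipelined(), which lands here for interrupts mapped on
 * the SIPIC domain.
 */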
static void do_sysvec_inband(struct irq_desc *desc)
{
	unsigned int irq = irq_desc_get_irq(desc);
	struct pt_regs *regs = get_irq_regs();
	int vector = apicm_irq_vector(irq);

	/*
	 * This code only sees pipelined sysvec events tagged with
	 * DEFINE_IDTENTRY_SYSVEC_PIPELINED:
	 *
	 * arch_handle_irq(irq)
	 *     generic_pipeline_irq(irq)
	 *         handle_apic_irq(irq)
	 *             handle_oob_irq(irq)
	 *             [...irq_post_inband...]
	 *
	 * arch_do_IRQ_pipelined(desc)
	 *
	 *     |
	 *     v
	 * do_sysvec_inband(desc)
	 *
	 * System vectors which are still tagged as
	 * DEFINE_IDTENTRY_SYSVEC/DEFINE_IDTENTRY_SYSVEC_SIMPLE are
	 * directly dispatched out of the IDT, assuming their handler
	 * is oob-safe (like NMI handlers), therefore they never reach
	 * this in-band stage handler.
	 */
	switch (vector) {
#ifdef CONFIG_SMP
	case RESCHEDULE_VECTOR:
		__sysvec_reschedule_ipi(regs);
		break;
	case CALL_FUNCTION_VECTOR:
		__sysvec_call_function(regs);
		break;
	case CALL_FUNCTION_SINGLE_VECTOR:
		__sysvec_call_function_single(regs);
		break;
	case REBOOT_VECTOR:
		__sysvec_reboot(regs);
		break;
#endif
	case X86_PLATFORM_IPI_VECTOR:
		__sysvec_x86_platform_ipi(regs);
		break;
	case IRQ_WORK_VECTOR:
		__sysvec_irq_work(regs);
		break;
#ifdef CONFIG_HAVE_KVM
	case POSTED_INTR_VECTOR:
		__sysvec_kvm_posted_intr_ipi(regs);
		break;
	case POSTED_INTR_WAKEUP_VECTOR:
		__sysvec_kvm_posted_intr_wakeup_ipi(regs);
		break;
	case POSTED_INTR_NESTED_VECTOR:
		__sysvec_kvm_posted_intr_nested_ipi(regs);
		break;
#endif
#ifdef CONFIG_HYPERV
	case HYPERVISOR_CALLBACK_VECTOR:
		__sysvec_hyperv_callback(regs);
		break;
	case HYPERV_REENLIGHTENMENT_VECTOR:
		__sysvec_hyperv_reenlightenment(regs);
		break;
	case HYPERV_STIMER0_VECTOR:
		__sysvec_hyperv_stimer0(regs);
		break;
#endif
#ifdef CONFIG_ACRN_GUEST
	case HYPERVISOR_CALLBACK_VECTOR:
		__sysvec_acrn_hv_callback(regs);
		break;
#endif
#ifdef CONFIG_XEN_PVHVM
	case HYPERVISOR_CALLBACK_VECTOR:
		__sysvec_xen_hvm_callback(regs);
		break;
#endif
	case LOCAL_TIMER_VECTOR:
		__sysvec_apic_timer_interrupt(regs);
		break;
	default:
		printk_once(KERN_ERR "irq_pipeline: unexpected event"
			    " on vector #%.2x (irq=%u)\n", vector, irq);
	}
}

static irqentry_state_t pipeline_enter_rcu(void)
{
	irqentry_state_t state = {
		.exit_rcu = false,
		.stage_info = IRQENTRY_INBAND_UNSTALLED,
	};

	if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
		rcu_irq_enter();
		state.exit_rcu = true;
	} else {
		rcu_irq_enter_check_tick();
	}

	return state;
}

static void pipeline_exit_rcu(irqentry_state_t state)
{
	if (state.exit_rcu)
		rcu_irq_exit();
}

void arch_do_IRQ_pipelined(struct irq_desc *desc)
{
	struct pt_regs *regs = raw_cpu_ptr(&irq_pipeline.tick_regs);
	struct pt_regs *old_regs = set_irq_regs(regs);
	irqentry_state_t state;

	/* Emulate a kernel entry. */
	state = pipeline_enter_rcu();
	irq_enter_rcu();

	if (desc->irq_data.domain == sipic_domain)
		run_irq_on_irqstack_cond(do_sysvec_inband, desc, regs);
	else
		run_irq_on_irqstack_cond(desc->handle_irq, desc, regs);

	irq_exit_rcu();
	pipeline_exit_rcu(state);

	set_irq_regs(old_regs);
}

void arch_handle_irq(struct pt_regs *regs, u8 vector, bool irq_movable)
{
	struct irq_desc *desc;
	unsigned int irq;

	if (vector >= FIRST_SYSTEM_VECTOR) {
		irq = apicm_vector_irq(vector);
	} else {
		desc = __this_cpu_read(vector_irq[vector]);
		if (unlikely(IS_ERR_OR_NULL(desc))) {
			__ack_APIC_irq();
			if (desc == VECTOR_UNUSED) {
				pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n",
						     __func__, smp_processor_id(), vector);
			} else {
				__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
			}
			return;
		}
		if (irqd_is_setaffinity_pending(&desc->irq_data)) {
			raw_spin_lock(&desc->lock);
			if (irq_movable)
				irqd_clr_move_blocked(&desc->irq_data);
			else
				irqd_set_move_blocked(&desc->irq_data);
			raw_spin_unlock(&desc->lock);
		}
		irq = irq_desc_get_irq(desc);
	}

	generic_pipeline_irq(irq, regs);
}
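
/*
 * Common entry point for pipelined interrupts and system vectors,
 * called from the low-level interrupt entry code: the event is first
 * fed to the pipeline via arch_handle_irq(), then the epilogue
 * depends on the context (stage, stall state) which was preempted, as
 * discussed below.
 */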
noinstr void arch_pipeline_entry(struct pt_regs *regs, u8 vector)
{
	struct irq_stage_data *prevd;
	irqentry_state_t state;

	/*
	 * The tricky one: we distinguish the following cases:
	 *
	 * [1] entry from oob context, either kernel or user code was
	 * preempted by the IRQ, the in-band (virtual) interrupt state
	 * is 'undefined' (could be either stalled/unstalled, it is
	 * not relevant).
	 *
	 * [2] entry from in-band context while the stage is stalled,
	 * which means that some kernel code was preempted by the IRQ
	 * since in-band user code cannot run with interrupts
	 * (virtually) disabled.
	 *
	 * [3] entry from in-band context while the stage is
	 * unstalled: the common case for IRQ entry. Kernel or user
	 * code may have been preempted, we handle the event
	 * identically.
	 *
	 * [1] and [2] are processed almost the same way, except for
	 * one key aspect: the potential stage demotion of the
	 * preempted task which originally entered [1] on the oob
	 * stage, then left it for the in-band stage as a result of
	 * handling the IRQ (such demotion normally happens during
	 * handle_irq_pipelined_finish() if required). In this
	 * particular case, we want to run the common IRQ epilogue
	 * code before returning to user mode, so that all pending
	 * in-band work (_TIF_WORK_*) is carried out for the task
	 * which is about to exit kernel mode.
	 *
	 * If the task runs in-band at the exit point and a user mode
	 * context was preempted, then case [2] is excluded by
	 * definition so we know for sure that we just observed a
	 * stage demotion, therefore we have to run the work loop by
	 * calling irqentry_exit_to_user_mode().
	 */
	if (unlikely(running_oob() || irqs_disabled())) {
		instrumentation_begin();
		prevd = handle_irq_pipelined_prepare(regs);
		arch_handle_irq(regs, vector, false);
		kvm_set_cpu_l1tf_flush_l1d();
		handle_irq_pipelined_finish(prevd, regs);
		if (running_inband() && user_mode(regs)) {
			stall_inband_nocheck();
			irqentry_exit_to_user_mode(regs);
		}
		instrumentation_end();
		return;
	}

	/* In-band on entry, accepting interrupts. */
	state = irqentry_enter(regs);
	instrumentation_begin();

	/* Prep for handling, switching oob. */
	prevd = handle_irq_pipelined_prepare(regs);
	arch_handle_irq(regs, vector, true);
	kvm_set_cpu_l1tf_flush_l1d();

	/* irqentry_enter() stalled the in-band stage. */
	trace_hardirqs_on();
	unstall_inband_nocheck();
	handle_irq_pipelined_finish(prevd, regs);
	stall_inband_nocheck();
	trace_hardirqs_off();

	instrumentation_end();
	irqentry_exit(regs, state);
}

static int sipic_irq_map(struct irq_domain *d, unsigned int irq,
			 irq_hw_number_t hwirq)
{
	irq_set_percpu_devid(irq);
	irq_set_chip_and_handler(irq, &sipic_chip, handle_apic_irq);

	return 0;
}

static struct irq_domain_ops sipic_domain_ops = {
	.map	= sipic_irq_map,
};

static void create_x86_apic_domain(void)
{
	sipic_domain = irq_domain_add_simple(NULL, NR_APIC_VECTORS,
					     FIRST_SYSTEM_IRQ,
					     &sipic_domain_ops, NULL);
}

#ifdef CONFIG_SMP

DEFINE_IDTENTRY_SYSVEC_PIPELINED(RESCHEDULE_OOB_VECTOR,
				 sysvec_reschedule_oob_ipi)
{ /* In-band handler is unused. */ }

DEFINE_IDTENTRY_SYSVEC_PIPELINED(TIMER_OOB_VECTOR,
				 sysvec_timer_oob_ipi)
{ /* In-band handler is unused. */ }

void handle_irq_move_cleanup(struct irq_desc *desc)
{
	if (on_pipeline_entry()) {
		/* 1. on receipt from hardware. */
		__ack_APIC_irq();
		handle_oob_irq(desc);
	} else {
		/* 2. in-band delivery. */
		__sysvec_irq_move_cleanup(NULL);
	}
}
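
/*
 * Early SMP setup runs on the boot CPU only: the vector-to-IRQ
 * mapping established here is inherited by secondary CPUs when their
 * local APIC comes online.
 */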
static void smp_setup(void)
{
	int irq;

	/*
	 * The IRQ cleanup event must be pipelined to the in-band
	 * stage, so we need a valid IRQ descriptor for it. Since we
	 * are still in the early boot stage on CPU0, we ask for a 1:1
	 * mapping between the vector number and the IRQ number, to
	 * make things easier for us later on.
	 */
	irq = irq_alloc_desc_at(IRQ_MOVE_CLEANUP_VECTOR, 0);
	WARN_ON(IRQ_MOVE_CLEANUP_VECTOR != irq);

	/*
	 * Set up the vector_irq[] mapping array for the boot CPU,
	 * other CPUs will copy this entry when their APIC is going
	 * online (see lapic_online()).
	 */
	per_cpu(vector_irq, 0)[irq] = irq_to_desc(irq);
	irq_set_chip_and_handler(irq, &dummy_irq_chip,
				 handle_irq_move_cleanup);
}

#else

static void smp_setup(void) { }

#endif

void __init arch_irq_pipeline_init(void)
{
	/*
	 * Create an IRQ domain for mapping APIC system interrupts
	 * (in-band and out-of-band), with fixed sirq numbers starting
	 * from FIRST_SYSTEM_IRQ. Upon receipt of a system interrupt,
	 * the corresponding sirq is injected into the pipeline.
	 */
	create_x86_apic_domain();

	smp_setup();
}