From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:20:52 +0000
Subject: [PATCH] add new system file

---
 kernel/kernel/cpu.c | 785 ++++++++++++++++++++++++++++++++++++++++++++++---------
 1 files changed, 655 insertions(+), 130 deletions(-)

diff --git a/kernel/kernel/cpu.c b/kernel/kernel/cpu.c
index c664ec4..09d8ee3 100644
--- a/kernel/kernel/cpu.c
+++ b/kernel/kernel/cpu.c
@@ -10,6 +10,7 @@
 #include <linux/notifier.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/hotplug.h>
+#include <linux/sched/isolation.h>
 #include <linux/sched/task.h>
 #include <linux/sched/smt.h>
 #include <linux/unistd.h>
@@ -30,13 +31,21 @@
 #include <linux/smpboot.h>
 #include <linux/relay.h>
 #include <linux/slab.h>
+#include <linux/scs.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/cpuset.h>
+#include <linux/random.h>
+#include <uapi/linux/sched/types.h>
 
 #include <trace/events/power.h>
 #define CREATE_TRACE_POINTS
 #include <trace/events/cpuhp.h>
+#undef CREATE_TRACE_POINTS
+#include <trace/hooks/sched.h>
+#include <trace/hooks/cpu.h>
+
+#include "sched/sched.h"
 
 #include "smpboot.h"
 
 /**
@@ -63,7 +72,6 @@
 	bool			rollback;
 	bool			single;
 	bool			bringup;
-	bool			booted_once;
 	struct hlist_node	*node;
 	struct hlist_node	*last;
 	enum cpuhp_state	cb_state;
@@ -76,6 +84,10 @@
 static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
 	.fail = CPUHP_INVALID,
 };
+
+#ifdef CONFIG_SMP
+cpumask_t cpus_booted_once_mask;
+#endif
 
 #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
 static struct lockdep_map cpuhp_state_up_map =
@@ -269,11 +281,13 @@
 {
 	mutex_lock(&cpu_add_remove_lock);
 }
+EXPORT_SYMBOL_GPL(cpu_maps_update_begin);
 
 void cpu_maps_update_done(void)
 {
 	mutex_unlock(&cpu_add_remove_lock);
 }
+EXPORT_SYMBOL_GPL(cpu_maps_update_done);
 
 /*
  * If set, cpu_up and cpu_down will return -EBUSY and do nothing.
@@ -327,6 +341,16 @@
 	percpu_rwsem_assert_held(&cpu_hotplug_lock);
 }
 
+static void lockdep_acquire_cpus_lock(void)
+{
+	rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_);
+}
+
+static void lockdep_release_cpus_lock(void)
+{
+	rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_);
+}
+
 /*
  * Wait for currently running CPU hotplug operations to complete (if any) and
  * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
@@ -356,6 +380,17 @@
 	cpu_maps_update_done();
 }
 EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
+
+#else
+
+static void lockdep_acquire_cpus_lock(void)
+{
+}
+
+static void lockdep_release_cpus_lock(void)
+{
+}
+
 #endif	/* CONFIG_HOTPLUG_CPU */
 
 /*
@@ -369,8 +404,7 @@
 
 void __init cpu_smt_disable(bool force)
 {
-	if (cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
-	    cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
+	if (!cpu_smt_possible())
 		return;
 
 	if (force) {
@@ -410,11 +444,19 @@
 	/*
 	 * On x86 it's required to boot all logical CPUs at least once so
 	 * that the init code can get a chance to set CR4.MCE on each
-	 * CPU. Otherwise, a broadacasted MCE observing CR4.MCE=0b on any
+	 * CPU. Otherwise, a broadcasted MCE observing CR4.MCE=0b on any
	 * core will shutdown the machine.
 	 */
-	return !per_cpu(cpuhp_state, cpu).booted_once;
+	return !cpumask_test_cpu(cpu, &cpus_booted_once_mask);
 }
+
+/* Returns true if SMT is not supported or forcefully (irreversibly) disabled */
+bool cpu_smt_possible(void)
+{
+	return cpu_smt_control != CPU_SMT_FORCE_DISABLED &&
+		cpu_smt_control != CPU_SMT_NOT_SUPPORTED;
+}
+EXPORT_SYMBOL_GPL(cpu_smt_possible);
 #else
 static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
 #endif
@@ -501,7 +543,7 @@
 	/*
 	 * SMT soft disabling on X86 requires to bring the CPU out of the
 	 * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The
-	 * CPU marked itself as booted_once in cpu_notify_starting() so the
+	 * CPU marked itself as booted_once in notify_cpu_starting() so the
 	 * cpu_smt_allowed() check will now return false if this is not the
 	 * primary sibling.
 	 */
@@ -518,6 +560,12 @@
 {
 	struct task_struct *idle = idle_thread_get(cpu);
 	int ret;
+
+	/*
+	 * Reset stale stack state from the last time this CPU was online.
+	 */
+	scs_task_reset(idle);
+	kasan_unpoison_task_stack(idle);
 
 	/*
 	 * Some architectures have to walk the irq descriptors to
@@ -640,6 +688,12 @@
 	 */
 	smp_mb();
 
+	/*
+	 * The BP holds the hotplug lock, but we're now running on the AP,
+	 * ensure that anybody asserting the lock is held, will actually find
+	 * it so.
+	 */
+	lockdep_acquire_cpus_lock();
 	cpuhp_lock_acquire(bringup);
 
 	if (st->single) {
@@ -685,6 +739,7 @@
 	}
 
 	cpuhp_lock_release(bringup);
+	lockdep_release_cpus_lock();
 
 	if (!st->should_run)
 		complete_ap_thread(st, bringup);
@@ -898,6 +953,8 @@
 
 	/* Give up timekeeping duties */
 	tick_handover_do_timer();
+	/* Remove CPU from timer broadcasting */
+	tick_offline_cpu(cpu);
 	/* Park the stopper thread */
 	stop_machine_park(cpu);
 	return 0;
@@ -1005,7 +1062,7 @@
 	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 	int prev_state, ret = 0;
 
-	if (num_online_cpus() == 1)
+	if (num_active_cpus() == 1 && cpu_active(cpu))
 		return -EBUSY;
 
 	if (!cpu_present(cpu))
@@ -1068,7 +1125,7 @@
 	return _cpu_down(cpu, 0, target);
 }
 
-static int do_cpu_down(unsigned int cpu, enum cpuhp_state target)
+static int cpu_down(unsigned int cpu, enum cpuhp_state target)
 {
 	int err;
 
@@ -1078,11 +1135,315 @@
 	return err;
 }
 
-int cpu_down(unsigned int cpu)
+/**
+ * cpu_device_down - Bring down a cpu device
+ * @dev: Pointer to the cpu device to offline
+ *
+ * This function is meant to be used by device core cpu subsystem only.
+ *
+ * Other subsystems should use remove_cpu() instead.
+ */
+int cpu_device_down(struct device *dev)
 {
-	return do_cpu_down(cpu, CPUHP_OFFLINE);
+	return cpu_down(dev->id, CPUHP_OFFLINE);
 }
-EXPORT_SYMBOL(cpu_down);
+
+int remove_cpu(unsigned int cpu)
+{
+	int ret;
+
+	lock_device_hotplug();
+	ret = device_offline(get_cpu_device(cpu));
+	unlock_device_hotplug();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(remove_cpu);
+
+int __pause_drain_rq(struct cpumask *cpus)
+{
+	unsigned int cpu;
+	int err = 0;
+
+	/*
+	 * Disabling preemption prevents one of the stoppers, started from
+	 * sched_cpu_drain_rq(), from blocking the draining of the whole
+	 * cpumask.
+	 */
+	preempt_disable();
+	for_each_cpu(cpu, cpus) {
+		err = sched_cpu_drain_rq(cpu);
+		if (err)
+			break;
+	}
+	preempt_enable();
+
+	return err;
+}
+
+void __wait_drain_rq(struct cpumask *cpus)
+{
+	unsigned int cpu;
+
+	for_each_cpu(cpu, cpus)
+		sched_cpu_drain_rq_wait(cpu);
+}
+
+/* if rt task, set to cfs and return previous prio */
+static int pause_reduce_prio(void)
+{
+	int prev_prio = -1;
+
+	if (current->prio < MAX_RT_PRIO) {
+		struct sched_param param = { .sched_priority = 0 };
+
+		prev_prio = current->prio;
+		sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
+	}
+
+	return prev_prio;
+}
+
+/* if previous prio was set, restore */
+static void pause_restore_prio(int prev_prio)
+{
+	if (prev_prio >= 0 && prev_prio < MAX_RT_PRIO) {
+		struct sched_param param = { .sched_priority = MAX_RT_PRIO-1-prev_prio };
+
+		sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
+	}
+}
+
+int pause_cpus(struct cpumask *cpus)
+{
+	int err = 0;
+	int cpu;
+	u64 start_time = 0;
+	int prev_prio;
+
+	start_time = sched_clock();
+
+	cpu_maps_update_begin();
+
+	if (cpu_hotplug_disabled) {
+		err = -EBUSY;
+		goto err_cpu_maps_update;
+	}
+
+	/* Pausing an already inactive CPU isn't an error */
+	cpumask_and(cpus, cpus, cpu_active_mask);
+
+	for_each_cpu(cpu, cpus) {
+		if (!cpu_online(cpu) || dl_bw_check_overflow(cpu) ||
+		    get_cpu_device(cpu)->offline_disabled == true) {
+			err = -EBUSY;
+			goto err_cpu_maps_update;
+		}
+	}
+
+	if (cpumask_weight(cpus) >= num_active_cpus()) {
+		err = -EBUSY;
+		goto err_cpu_maps_update;
+	}
+
+	if (cpumask_empty(cpus))
+		goto err_cpu_maps_update;
+
+	/*
+	 * Lazy migration:
+	 *
+	 * We do care about how fast a CPU can go idle and stay in this
+	 * state. If we try to take the cpus_write_lock() here, we would have
+	 * to wait for a few dozens of ms, as this function might schedule.
+	 * However, we can, as a first step, flip the active mask and migrate
+	 * anything currently on the run-queue, to give the paused CPUs a
+	 * chance to quickly reach an idle state. There's a risk meanwhile for
+	 * another CPU to observe an out-of-date active_mask or to incompletely
+	 * update a cpuset. Both problems would be resolved later in the slow
+	 * path, which ensures active_mask synchronization, triggers a cpuset
+	 * rebuild and migrates any task that escaped the lazy migration.
+	 */
+	for_each_cpu(cpu, cpus)
+		set_cpu_active(cpu, false);
+	err = __pause_drain_rq(cpus);
+	if (err) {
+		__wait_drain_rq(cpus);
+		for_each_cpu(cpu, cpus)
+			set_cpu_active(cpu, true);
+		goto err_cpu_maps_update;
+	}
+
+	prev_prio = pause_reduce_prio();
+
+	/*
+	 * Slow path deactivation:
+	 *
+	 * Now that paused CPUs are most likely idle, we can go through a
+	 * complete scheduler deactivation.
+	 *
+	 * With cpu_active_mask already updated and cpus_write_lock() calling
+	 * synchronize_rcu(), we know that all preempt-disabled and RCU users
+	 * will observe the updated value.
+	 */
+	cpus_write_lock();
+
+	__wait_drain_rq(cpus);
+
+	cpuhp_tasks_frozen = 0;
+
+	if (sched_cpus_deactivate_nosync(cpus)) {
+		err = -EBUSY;
+		goto err_cpus_write_unlock;
+	}
+
+	err = __pause_drain_rq(cpus);
+	__wait_drain_rq(cpus);
+	if (err) {
+		for_each_cpu(cpu, cpus)
+			sched_cpu_activate(cpu);
+		goto err_cpus_write_unlock;
+	}
+
+	/*
+	 * Even though it lives on the side of the regular HP path, pause uses
+	 * one of the HP steps (CPUHP_AP_ACTIVE). This should be reflected in
+	 * the current state of the CPU.
+	 */
+	for_each_cpu(cpu, cpus) {
+		struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+
+		st->state = CPUHP_AP_ACTIVE - 1;
+		st->target = st->state;
+	}
+
+err_cpus_write_unlock:
+	cpus_write_unlock();
+	pause_restore_prio(prev_prio);
+err_cpu_maps_update:
+	cpu_maps_update_done();
+
+	trace_cpuhp_pause(cpus, start_time, 1);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(pause_cpus);
+
+int resume_cpus(struct cpumask *cpus)
+{
+	unsigned int cpu;
+	int err = 0;
+	u64 start_time = 0;
+	int prev_prio;
+
+	start_time = sched_clock();
+
+	cpu_maps_update_begin();
+
+	if (cpu_hotplug_disabled) {
+		err = -EBUSY;
+		goto err_cpu_maps_update;
+	}
+
+	/* Resuming an already active CPU isn't an error */
+	cpumask_andnot(cpus, cpus, cpu_active_mask);
+
+	for_each_cpu(cpu, cpus) {
+		if (!cpu_online(cpu)) {
+			err = -EBUSY;
+			goto err_cpu_maps_update;
+		}
+	}
+
+	if (cpumask_empty(cpus))
+		goto err_cpu_maps_update;
+
+	for_each_cpu(cpu, cpus)
+		set_cpu_active(cpu, true);
+
+	trace_android_rvh_resume_cpus(cpus, &err);
+	if (err)
+		goto err_cpu_maps_update;
+
+	prev_prio = pause_reduce_prio();
+
+	/* Lazy resume: build domains by scheduling a workqueue on the
+	 * resuming CPU. This way the resuming CPU can start working
+	 * earlier without adding load to other busy CPUs.
+	 */
+	cpuset_update_active_cpus_affine(cpumask_first(cpus));
+
+	cpus_write_lock();
+
+	cpuhp_tasks_frozen = 0;
+
+	if (sched_cpus_activate(cpus)) {
+		err = -EBUSY;
+		goto err_cpus_write_unlock;
+	}
+
+	/*
+	 * See pause_cpus().
+	 */
+	for_each_cpu(cpu, cpus) {
+		struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+
+		st->state = CPUHP_ONLINE;
+		st->target = st->state;
+	}
+
+err_cpus_write_unlock:
+	cpus_write_unlock();
+	pause_restore_prio(prev_prio);
+err_cpu_maps_update:
+	cpu_maps_update_done();
+
+	trace_cpuhp_pause(cpus, start_time, 0);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(resume_cpus);
+
+void smp_shutdown_nonboot_cpus(unsigned int primary_cpu)
+{
+	unsigned int cpu;
+	int error;
+
+	cpu_maps_update_begin();
+
+	/*
+	 * Make certain the CPU I'm about to reboot on is online.
+	 *
+	 * This is in line with what migrate_to_reboot_cpu() already does.
+	 */
+	if (!cpu_online(primary_cpu))
+		primary_cpu = cpumask_first(cpu_online_mask);
+
+	for_each_online_cpu(cpu) {
+		if (cpu == primary_cpu)
+			continue;
+
+		error = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
+		if (error) {
+			pr_err("Failed to offline CPU%d - error=%d",
+				cpu, error);
+			break;
+		}
+	}
+
+	/*
+	 * Ensure all but the reboot CPU are offline.
+	 */
+	BUG_ON(num_online_cpus() > 1);
+
+	/*
+	 * Make sure the CPUs won't be enabled by someone else after this
+	 * point. Kexec will reboot to a new kernel shortly resetting
+	 * everything along the way.
+	 */
+	cpu_hotplug_disabled++;
+
+	cpu_maps_update_done();
+}
 
 #else
 #define takedown_cpu		NULL
@@ -1102,7 +1463,7 @@
 	int ret;
 
 	rcu_cpu_starting(cpu);	/* Enables RCU usage on this CPU. */
-	st->booted_once = true;
+	cpumask_set_cpu(cpu, &cpus_booted_once_mask);
 	while (st->state < target) {
 		st->state++;
 		ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
@@ -1136,6 +1497,25 @@
 	complete_ap_thread(st, true);
 }
 
+static int switch_to_rt_policy(void)
+{
+	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+	unsigned int policy = current->policy;
+
+	if (policy == SCHED_NORMAL)
+		/* Switch to SCHED_FIFO from SCHED_NORMAL. */
+		return sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
+	else
+		return 1;
+}
+
+static int switch_to_fair_policy(void)
+{
+	struct sched_param param = { .sched_priority = 0 };
+
+	return sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
+}
+
 /* Requires cpu_add_remove_lock to be held */
 static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
 {
@@ -1151,8 +1531,8 @@
 	}
 
 	/*
-	 * The caller of do_cpu_up might have raced with another
-	 * caller. Ignore it for now.
+	 * The caller of cpu_up() might have raced with another
+	 * caller. Nothing to do.
 	 */
 	if (st->state >= target)
 		goto out;
@@ -1197,9 +1577,10 @@
 	return ret;
 }
 
-static int do_cpu_up(unsigned int cpu, enum cpuhp_state target)
+static int cpu_up(unsigned int cpu, enum cpuhp_state target)
 {
 	int err = 0;
+	int switch_err;
 
 	if (!cpu_possible(cpu)) {
 		pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
@@ -1210,9 +1591,23 @@
 		return -EINVAL;
 	}
 
+	trace_android_vh_cpu_up(cpu);
+
+	/*
+	 * CPU hotplug operations consist of many steps and each step
+	 * calls a callback of a core kernel subsystem. A CPU hotplug-in
+	 * operation may get preempted by other CFS tasks, delaying the
+	 * whole hotplug operation. Switch the current task to SCHED_FIFO
+	 * from SCHED_NORMAL, so that the hotplug-in operation can complete
+	 * quickly under heavy load and the new CPU can start handling the
+	 * workload.
+	 */
+
+	switch_err = switch_to_rt_policy();
+
 	err = try_online_node(cpu_to_node(cpu));
 	if (err)
-		return err;
+		goto switch_out;
 
 	cpu_maps_update_begin();
 
@@ -1228,14 +1623,76 @@
 	err = _cpu_up(cpu, 0, target);
 out:
 	cpu_maps_update_done();
+switch_out:
+	if (!switch_err) {
+		switch_err = switch_to_fair_policy();
+		if (switch_err)
+			pr_err("Hotplug policy switch err=%d Task %s pid=%d\n",
+				switch_err, current->comm, current->pid);
+	}
+
 	return err;
 }
 
-int cpu_up(unsigned int cpu)
+/**
+ * cpu_device_up - Bring up a cpu device
+ * @dev: Pointer to the cpu device to online
+ *
+ * This function is meant to be used by device core cpu subsystem only.
+ *
+ * Other subsystems should use add_cpu() instead.
+ */
+int cpu_device_up(struct device *dev)
 {
-	return do_cpu_up(cpu, CPUHP_ONLINE);
+	return cpu_up(dev->id, CPUHP_ONLINE);
 }
-EXPORT_SYMBOL_GPL(cpu_up);
+
+int add_cpu(unsigned int cpu)
+{
+	int ret;
+
+	lock_device_hotplug();
+	ret = device_online(get_cpu_device(cpu));
+	unlock_device_hotplug();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(add_cpu);
+
+/**
+ * bringup_hibernate_cpu - Bring up the CPU that we hibernated on
+ * @sleep_cpu: The cpu we hibernated on and should be brought up.
+ *
+ * On some architectures like arm64, we can hibernate on any CPU, but on
+ * wake up the CPU we hibernated on might be offline as a side effect of
+ * using maxcpus= for example.
+ */
+int bringup_hibernate_cpu(unsigned int sleep_cpu)
+{
+	int ret;
+
+	if (!cpu_online(sleep_cpu)) {
+		pr_info("Hibernated on a CPU that is offline! Bringing CPU up.\n");
+		ret = cpu_up(sleep_cpu, CPUHP_ONLINE);
+		if (ret) {
+			pr_err("Failed to bring hibernate-CPU up!\n");
+			return ret;
+		}
+	}
+	return 0;
+}
+
+void bringup_nonboot_cpus(unsigned int setup_max_cpus)
+{
+	unsigned int cpu;
+
+	for_each_present_cpu(cpu) {
+		if (num_online_cpus() >= setup_max_cpus)
+			break;
+		if (!cpu_online(cpu))
+			cpu_up(cpu, CPUHP_ONLINE);
+	}
+}
 
 #ifdef CONFIG_PM_SLEEP_SMP
 static cpumask_var_t frozen_cpus;
@@ -1245,8 +1702,15 @@
 	int cpu, error = 0;
 
 	cpu_maps_update_begin();
-	if (!cpu_online(primary))
+	if (primary == -1) {
 		primary = cpumask_first(cpu_online_mask);
+		if (!housekeeping_cpu(primary, HK_FLAG_TIMER))
+			primary = housekeeping_any_cpu(HK_FLAG_TIMER);
+	} else {
+		if (!cpu_online(primary))
+			primary = cpumask_first(cpu_online_mask);
+	}
+
 	/*
 	 * We take down all of the non-boot CPUs in one shot to avoid races
 	 * with the userspace trying to use the CPU hotplug at the same time
@@ -1257,6 +1721,13 @@
 	for_each_online_cpu(cpu) {
 		if (cpu == primary)
 			continue;
+
+		if (pm_wakeup_pending()) {
+			pr_info("Wakeup pending. Abort CPU freeze\n");
+			error = -EBUSY;
+			break;
+		}
+
 		trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
 		error = _cpu_down(cpu, 1, CPUHP_OFFLINE);
 		trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
@@ -1275,8 +1746,8 @@
 
 	/*
 	 * Make sure the CPUs won't be enabled by someone else. We need to do
-	 * this even in case of failure as all disable_nonboot_cpus() users are
-	 * supposed to do enable_nonboot_cpus() on the failure path.
+	 * this even in case of failure as all freeze_secondary_cpus() users are
+	 * supposed to do thaw_secondary_cpus() on the failure path.
 	 */
 	cpu_hotplug_disabled++;
 
@@ -1284,15 +1755,15 @@
 	return error;
 }
 
-void __weak arch_enable_nonboot_cpus_begin(void)
+void __weak arch_thaw_secondary_cpus_begin(void)
 {
 }
 
-void __weak arch_enable_nonboot_cpus_end(void)
+void __weak arch_thaw_secondary_cpus_end(void)
 {
 }
 
-void enable_nonboot_cpus(void)
+void thaw_secondary_cpus(void)
 {
 	int cpu, error;
 	struct device *cpu_device;
@@ -1305,7 +1776,7 @@
 
 	pr_info("Enabling non-boot CPUs ...\n");
 
-	arch_enable_nonboot_cpus_begin();
+	arch_thaw_secondary_cpus_begin();
 
 	for_each_cpu(cpu, frozen_cpus) {
 		trace_suspend_resume(TPS("CPU_ON"), cpu, true);
@@ -1324,7 +1795,7 @@
 		pr_warn("Error taking CPU%d up: %d\n", cpu, error);
 	}
 
-	arch_enable_nonboot_cpus_end();
+	arch_thaw_secondary_cpus_end();
 
 	cpumask_clear(frozen_cpus);
 out:
@@ -1390,6 +1861,22 @@
 
 int __boot_cpu_id;
 
+/* Horrific hacks because we can't add more to cpuhp_hp_states. */
+static int random_and_perf_prepare_fusion(unsigned int cpu)
+{
+#ifdef CONFIG_PERF_EVENTS
+	perf_event_init_cpu(cpu);
+#endif
+	random_prepare_cpu(cpu);
+	return 0;
+}
+static int random_and_workqueue_online_fusion(unsigned int cpu)
+{
+	workqueue_online_cpu(cpu);
+	random_online_cpu(cpu);
+	return 0;
+}
+
 #endif /* CONFIG_SMP */
 
 /* Boot processor state steps */
@@ -1408,7 +1895,7 @@
 	},
 	[CPUHP_PERF_PREPARE] = {
 		.name			= "perf:prepare",
-		.startup.single		= perf_event_init_cpu,
+		.startup.single		= random_and_perf_prepare_fusion,
 		.teardown.single	= perf_event_exit_cpu,
 	},
 	[CPUHP_WORKQUEUE_PREP] = {
@@ -1524,7 +2011,7 @@
 	},
 	[CPUHP_AP_WORKQUEUE_ONLINE] = {
 		.name			= "workqueue:online",
-		.startup.single		= workqueue_online_cpu,
+		.startup.single		= random_and_workqueue_online_fusion,
 		.teardown.single	= workqueue_offline_cpu,
 	},
 	[CPUHP_AP_RCUTREE_ONLINE] = {
@@ -1935,6 +2422,78 @@
 }
 EXPORT_SYMBOL(__cpuhp_remove_state);
 
+#ifdef CONFIG_HOTPLUG_SMT
+static void cpuhp_offline_cpu_device(unsigned int cpu)
+{
+	struct device *dev = get_cpu_device(cpu);
+
+	dev->offline = true;
+	/* Tell user space about the state change */
+	kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
+}
+
+static void cpuhp_online_cpu_device(unsigned int cpu)
+{
+	struct device *dev = get_cpu_device(cpu);
+
+	dev->offline = false;
+	/* Tell user space about the state change */
+	kobject_uevent(&dev->kobj, KOBJ_ONLINE);
+}
+
+int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
+{
+	int cpu, ret = 0;
+
+	cpu_maps_update_begin();
+	for_each_online_cpu(cpu) {
+		if (topology_is_primary_thread(cpu))
+			continue;
+		ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
+		if (ret)
+			break;
+		/*
+		 * As this needs to hold the cpu maps lock it's impossible
+		 * to call device_offline() because that ends up calling
+		 * cpu_down() which takes cpu maps lock. cpu maps lock
+		 * needs to be held as this might race against in kernel
+		 * abusers of the hotplug machinery (thermal management).
+		 *
+		 * So nothing would update device:offline state. That would
+		 * leave the sysfs entry stale and prevent onlining after
+		 * smt control has been changed to 'off' again. This is
+		 * called under the sysfs hotplug lock, so it is properly
+		 * serialized against the regular offline usage.
+		 */
+		cpuhp_offline_cpu_device(cpu);
+	}
+	if (!ret)
+		cpu_smt_control = ctrlval;
+	cpu_maps_update_done();
+	return ret;
+}
+
+int cpuhp_smt_enable(void)
+{
+	int cpu, ret = 0;
+
+	cpu_maps_update_begin();
+	cpu_smt_control = CPU_SMT_ENABLED;
+	for_each_present_cpu(cpu) {
+		/* Skip online CPUs and CPUs on offline nodes */
+		if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
+			continue;
+		ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
+		if (ret)
+			break;
+		/* See comment in cpuhp_smt_disable() */
+		cpuhp_online_cpu_device(cpu);
+	}
+	cpu_maps_update_done();
+	return ret;
+}
+#endif
+
 #if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU)
 static ssize_t show_cpuhp_state(struct device *dev,
 				struct device_attribute *attr, char *buf)
@@ -1977,9 +2536,11 @@
 		goto out;
 
 	if (st->state < target)
-		ret = do_cpu_up(dev->id, target);
-	else
-		ret = do_cpu_down(dev->id, target);
+		ret = cpu_up(dev->id, target);
+	else if (st->state > target)
+		ret = cpu_down(dev->id, target);
+	else if (WARN_ON(st->target != target))
+		st->target = target;
 out:
 	unlock_device_hotplug();
 	return ret ? ret : count;
 }
@@ -2089,92 +2650,9 @@
 
 #ifdef CONFIG_HOTPLUG_SMT
 
-static const char *smt_states[] = {
-	[CPU_SMT_ENABLED]		= "on",
-	[CPU_SMT_DISABLED]		= "off",
-	[CPU_SMT_FORCE_DISABLED]	= "forceoff",
-	[CPU_SMT_NOT_SUPPORTED]		= "notsupported",
-};
-
 static ssize_t
-show_smt_control(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	return snprintf(buf, PAGE_SIZE - 2, "%s\n", smt_states[cpu_smt_control]);
-}
-
-static void cpuhp_offline_cpu_device(unsigned int cpu)
-{
-	struct device *dev = get_cpu_device(cpu);
-
-	dev->offline = true;
-	/* Tell user space about the state change */
-	kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
-}
-
-static void cpuhp_online_cpu_device(unsigned int cpu)
-{
-	struct device *dev = get_cpu_device(cpu);
-
-	dev->offline = false;
-	/* Tell user space about the state change */
-	kobject_uevent(&dev->kobj, KOBJ_ONLINE);
-}
-
-int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
-{
-	int cpu, ret = 0;
-
-	cpu_maps_update_begin();
-	for_each_online_cpu(cpu) {
-		if (topology_is_primary_thread(cpu))
-			continue;
-		ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
-		if (ret)
-			break;
-		/*
-		 * As this needs to hold the cpu maps lock it's impossible
-		 * to call device_offline() because that ends up calling
-		 * cpu_down() which takes cpu maps lock. cpu maps lock
-		 * needs to be held as this might race against in kernel
-		 * abusers of the hotplug machinery (thermal management).
-		 *
-		 * So nothing would update device:offline state. That would
-		 * leave the sysfs entry stale and prevent onlining after
-		 * smt control has been changed to 'off' again. This is
-		 * called under the sysfs hotplug lock, so it is properly
-		 * serialized against the regular offline usage.
-		 */
-		cpuhp_offline_cpu_device(cpu);
-	}
-	if (!ret)
-		cpu_smt_control = ctrlval;
-	cpu_maps_update_done();
-	return ret;
-}
-
-int cpuhp_smt_enable(void)
-{
-	int cpu, ret = 0;
-
-	cpu_maps_update_begin();
-	cpu_smt_control = CPU_SMT_ENABLED;
-	for_each_present_cpu(cpu) {
-		/* Skip online CPUs and CPUs on offline nodes */
-		if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
-			continue;
-		ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
-		if (ret)
-			break;
-		/* See comment in cpuhp_smt_disable() */
-		cpuhp_online_cpu_device(cpu);
-	}
-	cpu_maps_update_done();
-	return ret;
-}
-
-static ssize_t
-store_smt_control(struct device *dev, struct device_attribute *attr,
-		  const char *buf, size_t count)
+__store_smt_control(struct device *dev, struct device_attribute *attr,
+		    const char *buf, size_t count)
 {
 	int ctrlval, ret;
 
@@ -2212,14 +2690,44 @@
 	unlock_device_hotplug();
 	return ret ? ret : count;
 }
+
+#else /* !CONFIG_HOTPLUG_SMT */
+static ssize_t
+__store_smt_control(struct device *dev, struct device_attribute *attr,
+		    const char *buf, size_t count)
+{
+	return -ENODEV;
+}
+#endif /* CONFIG_HOTPLUG_SMT */
+
+static const char *smt_states[] = {
+	[CPU_SMT_ENABLED]		= "on",
+	[CPU_SMT_DISABLED]		= "off",
+	[CPU_SMT_FORCE_DISABLED]	= "forceoff",
+	[CPU_SMT_NOT_SUPPORTED]		= "notsupported",
+	[CPU_SMT_NOT_IMPLEMENTED]	= "notimplemented",
+};
+
+static ssize_t
+show_smt_control(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	const char *state = smt_states[cpu_smt_control];
+
+	return snprintf(buf, PAGE_SIZE - 2, "%s\n", state);
+}
+
+static ssize_t
+store_smt_control(struct device *dev, struct device_attribute *attr,
+		  const char *buf, size_t count)
+{
+	return __store_smt_control(dev, attr, buf, count);
+}
 static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control);
 
 static ssize_t
 show_smt_active(struct device *dev, struct device_attribute *attr, char *buf)
 {
-	bool active = topology_max_smt_threads() > 1;
-
-	return snprintf(buf, PAGE_SIZE - 2, "%d\n", active);
+	return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active());
 }
 static DEVICE_ATTR(active, 0444, show_smt_active, NULL);
 
@@ -2235,21 +2743,17 @@
 	NULL
 };
 
-static int __init cpu_smt_state_init(void)
+static int __init cpu_smt_sysfs_init(void)
 {
 	return sysfs_create_group(&cpu_subsys.dev_root->kobj,
 				  &cpuhp_smt_attr_group);
 }
 
-#else
-static inline int cpu_smt_state_init(void) { return 0; }
-#endif
-
 static int __init cpuhp_sysfs_init(void)
 {
 	int cpu, ret;
 
-	ret = cpu_smt_state_init();
+	ret = cpu_smt_sysfs_init();
 	if (ret)
 		return ret;
 
@@ -2270,7 +2774,7 @@
 	return 0;
 }
 device_initcall(cpuhp_sysfs_init);
-#endif
+#endif /* CONFIG_SYSFS && CONFIG_HOTPLUG_CPU */
 
 /*
  * cpu_bit_bitmap[] is a special, "compressed" data structure that
@@ -2317,8 +2821,8 @@
 struct cpumask __cpu_active_mask __read_mostly;
 EXPORT_SYMBOL(__cpu_active_mask);
 
-struct cpumask __cpu_isolated_mask __read_mostly;
-EXPORT_SYMBOL(__cpu_isolated_mask);
+atomic_t __num_online_cpus __read_mostly;
+EXPORT_SYMBOL(__num_online_cpus);
 
 void init_cpu_present(const struct cpumask *src)
 {
@@ -2333,6 +2837,27 @@
 void init_cpu_online(const struct cpumask *src)
 {
 	cpumask_copy(&__cpu_online_mask, src);
+}
+
+void set_cpu_online(unsigned int cpu, bool online)
+{
+	/*
+	 * atomic_inc/dec() is required to handle the horrid abuse of this
+	 * function by the reboot and kexec code which invoke it from
+	 * IPI/NMI broadcasts when shutting down CPUs. Invocation from
+	 * regular CPU hotplug is properly serialized.
+	 *
+	 * Note, that the fact that __num_online_cpus is of type atomic_t
+	 * does not protect readers which are not serialized against
+	 * concurrent hotplug operations.
+	 */
+	if (online) {
+		if (!cpumask_test_and_set_cpu(cpu, &__cpu_online_mask))
+			atomic_inc(&__num_online_cpus);
+	} else {
+		if (cpumask_test_and_clear_cpu(cpu, &__cpu_online_mask))
+			atomic_dec(&__num_online_cpus);
+	}
 }
 
 /*
@@ -2359,7 +2884,7 @@
 void __init boot_cpu_hotplug_init(void)
 {
 #ifdef CONFIG_SMP
-	this_cpu_write(cpuhp_state.booted_once, true);
+	cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask);
 #endif
 	this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
 }
--
Gitblit v1.6.2