From 61598093bbdd283a7edc367d900f223070ead8d2 Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Fri, 10 May 2024 07:43:03 +0000 Subject: [PATCH] add ax88772C AX88772C_eeprom_tools --- kernel/kernel/sched/psi.c | 340 +++++++++++++++++++++++++++++++++++++------------------- 1 files changed, 223 insertions(+), 117 deletions(-) diff --git a/kernel/kernel/sched/psi.c b/kernel/kernel/sched/psi.c index 3afd919..adcfc82 100644 --- a/kernel/kernel/sched/psi.c +++ b/kernel/kernel/sched/psi.c @@ -142,6 +142,8 @@ #include <linux/psi.h> #include "sched.h" +#include <trace/hooks/psi.h> + static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); @@ -174,7 +176,7 @@ /* System-level pressure and stall tracking */ static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu); -static struct psi_group psi_system = { +struct psi_group psi_system = { .pcpu = &system_group_pcpu, }; @@ -193,6 +195,7 @@ INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); mutex_init(&group->avgs_lock); /* Init trigger-related members */ + atomic_set(&group->poll_scheduled, 0); mutex_init(&group->trigger_lock); INIT_LIST_HEAD(&group->triggers); memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); @@ -232,7 +235,7 @@ case PSI_MEM_FULL: return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]; case PSI_CPU_SOME: - return tasks[NR_RUNNING] > 1; + return tasks[NR_RUNNING] > tasks[NR_ONCPU]; case PSI_NONIDLE: return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING]; @@ -541,11 +544,15 @@ if (now < t->last_event_time + t->win.size) continue; + trace_android_vh_psi_event(t); + /* Generate an event */ if (cmpxchg(&t->event, 0, 1) == 0) wake_up_interruptible(&t->event_wait); t->last_event_time = now; } + + trace_android_vh_psi_group(group); if (new_stall) memcpy(group->polling_total, total, @@ -554,18 +561,17 @@ return now + group->poll_min_period; } -/* Schedule polling if it's not already scheduled. */ -static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) +/* Schedule polling if it's not already scheduled or forced. */ +static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay, + bool force) { struct task_struct *task; /* - * Do not reschedule if already scheduled. - * Possible race with a timer scheduled after this check but before - * mod_timer below can be tolerated because group->polling_next_update - * will keep updates on schedule. + * atomic_xchg should be called even when !force to provide a + * full memory barrier (see the comment inside psi_poll_work). */ - if (timer_pending(&group->poll_timer)) + if (atomic_xchg(&group->poll_scheduled, 1) && !force) return; rcu_read_lock(); @@ -577,18 +583,58 @@ */ if (likely(task)) mod_timer(&group->poll_timer, jiffies + delay); + else + atomic_set(&group->poll_scheduled, 0); rcu_read_unlock(); } static void psi_poll_work(struct psi_group *group) { + bool force_reschedule = false; u32 changed_states; u64 now; mutex_lock(&group->trigger_lock); now = sched_clock(); + + if (now > group->polling_until) { + /* + * We are either about to start or might stop polling if no + * state change was recorded. Resetting poll_scheduled leaves + * a small window for psi_group_change to sneak in and schedule + * an immegiate poll_work before we get to rescheduling. One + * potential extra wakeup at the end of the polling window + * should be negligible and polling_next_update still keeps + * updates correctly on schedule. 
+ */ + atomic_set(&group->poll_scheduled, 0); + /* + * A task change can race with the poll worker that is supposed to + * report on it. To avoid missing events, ensure ordering between + * poll_scheduled and the task state accesses, such that if the poll + * worker misses the state update, the task change is guaranteed to + * reschedule the poll worker: + * + * poll worker: + * atomic_set(poll_scheduled, 0) + * smp_mb() + * LOAD states + * + * task change: + * STORE states + * if atomic_xchg(poll_scheduled, 1) == 0: + * schedule poll worker + * + * The atomic_xchg() implies a full barrier. + */ + smp_mb(); + } else { + /* Polling window is not over, keep rescheduling */ + force_reschedule = true; + } + collect_percpu_times(group, PSI_POLL, &changed_states); @@ -615,7 +661,8 @@ group->polling_next_update = update_triggers(group, now); psi_schedule_poll_work(group, - nsecs_to_jiffies(group->polling_next_update - now) + 1); + nsecs_to_jiffies(group->polling_next_update - now) + 1, + force_reschedule); out: mutex_unlock(&group->trigger_lock); @@ -624,11 +671,8 @@ static int psi_poll_worker(void *data) { struct psi_group *group = (struct psi_group *)data; - struct sched_param param = { - .sched_priority = 1, - }; - sched_setscheduler_nocheck(current, SCHED_FIFO, ¶m); + sched_set_fifo_low(current); while (true) { wait_event_interruptible(group->poll_wait, @@ -696,13 +740,14 @@ groupc->times[PSI_NONIDLE] += delta; } -static u32 psi_group_change(struct psi_group *group, int cpu, - unsigned int clear, unsigned int set) +static void psi_group_change(struct psi_group *group, int cpu, + unsigned int clear, unsigned int set, + bool wake_clock) { struct psi_group_cpu *groupc; + u32 state_mask = 0; unsigned int t, m; enum psi_states s; - u32 state_mask = 0; groupc = per_cpu_ptr(group->pcpu, cpu); @@ -721,14 +766,15 @@ for (t = 0, m = clear; m; m &= ~(1 << t), t++) { if (!(m & (1 << t))) continue; - if (groupc->tasks[t] == 0 && !psi_bug) { - printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n", + if (groupc->tasks[t]) { + groupc->tasks[t]--; + } else if (!psi_bug) { + printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n", cpu, t, groupc->tasks[0], groupc->tasks[1], groupc->tasks[2], - clear, set); + groupc->tasks[3], clear, set); psi_bug = 1; } - groupc->tasks[t]--; } for (t = 0; set; set &= ~(1 << t), t++) @@ -744,7 +790,11 @@ write_seqcount_end(&groupc->seq); - return state_mask; + if (state_mask & group->poll_states) + psi_schedule_poll_work(group, 1, false); + + if (wake_clock && !delayed_work_pending(&group->avgs_work)) + schedule_delayed_work(&group->avgs_work, PSI_FREQ); } static struct psi_group *iterate_groups(struct task_struct *task, void **iter) @@ -771,6 +821,21 @@ return &psi_system; } +static void psi_flags_change(struct task_struct *task, int clear, int set) +{ + if (((task->psi_flags & set) || + (task->psi_flags & clear) != clear) && + !psi_bug) { + printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n", + task->pid, task->comm, task_cpu(task), + task->psi_flags, clear, set); + psi_bug = 1; + } + + task->psi_flags &= ~clear; + task->psi_flags |= set; +} + void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); @@ -781,17 +846,7 @@ if (!task->pid) return; - if (((task->psi_flags & set) || - (task->psi_flags & clear) != clear) && - !psi_bug) { - printk_deferred(KERN_ERR "psi: inconsistent task state! 
task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n", - task->pid, task->comm, cpu, - task->psi_flags, clear, set); - psi_bug = 1; - } - - task->psi_flags &= ~clear; - task->psi_flags |= set; + psi_flags_change(task, clear, set); /* * Periodic aggregation shuts off if there is a period of no @@ -804,14 +859,51 @@ wq_worker_last_func(task) == psi_avgs_work)) wake_clock = false; - while ((group = iterate_groups(task, &iter))) { - u32 state_mask = psi_group_change(group, cpu, clear, set); + while ((group = iterate_groups(task, &iter))) + psi_group_change(group, cpu, clear, set, wake_clock); +} - if (state_mask & group->poll_states) - psi_schedule_poll_work(group, 1); +void psi_task_switch(struct task_struct *prev, struct task_struct *next, + bool sleep) +{ + struct psi_group *group, *common = NULL; + int cpu = task_cpu(prev); + void *iter; - if (wake_clock && !delayed_work_pending(&group->avgs_work)) - schedule_delayed_work(&group->avgs_work, PSI_FREQ); + if (next->pid) { + psi_flags_change(next, 0, TSK_ONCPU); + /* + * When moving state between tasks, the group that + * contains them both does not change: we can stop + * updating the tree once we reach the first common + * ancestor. Iterate @next's ancestors until we + * encounter @prev's state. + */ + iter = NULL; + while ((group = iterate_groups(next, &iter))) { + if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) { + common = group; + break; + } + + psi_group_change(group, cpu, 0, TSK_ONCPU, true); + } + } + + /* + * If this is a voluntary sleep, dequeue will have taken care + * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We + * only need to deal with it during preemption. + */ + if (sleep) + return; + + if (prev->pid) { + psi_flags_change(prev, TSK_ONCPU, 0); + + iter = NULL; + while ((group = iterate_groups(prev, &iter)) && group != common) + psi_group_change(group, cpu, TSK_ONCPU, 0, true); } } @@ -845,17 +937,17 @@ if (static_branch_likely(&psi_disabled)) return; - *flags = current->flags & PF_MEMSTALL; + *flags = current->in_memstall; if (*flags) return; /* - * PF_MEMSTALL setting & accounting needs to be atomic wrt + * in_memstall setting & accounting needs to be atomic wrt * changes to the task's scheduling state, otherwise we can * race with CPU migration. */ rq = this_rq_lock_irq(&rf); - current->flags |= PF_MEMSTALL; + current->in_memstall = 1; psi_task_change(current, 0, TSK_MEMSTALL); rq_unlock_irq(rq, &rf); @@ -878,13 +970,13 @@ if (*flags) return; /* - * PF_MEMSTALL clearing & accounting needs to be atomic wrt + * in_memstall clearing & accounting needs to be atomic wrt * changes to the task's scheduling state, otherwise we could * race with CPU migration. */ rq = this_rq_lock_irq(&rf); - current->flags &= ~PF_MEMSTALL; + current->in_memstall = 0; psi_task_change(current, TSK_MEMSTALL, 0); rq_unlock_irq(rq, &rf); @@ -928,7 +1020,7 @@ */ void cgroup_move_task(struct task_struct *task, struct css_set *to) { - unsigned int task_flags = 0; + unsigned int task_flags; struct rq_flags rf; struct rq *rq; @@ -943,13 +1035,31 @@ rq = task_rq_lock(task, &rf); - if (task_on_rq_queued(task)) - task_flags = TSK_RUNNING; - else if (task->in_iowait) - task_flags = TSK_IOWAIT; - - if (task->flags & PF_MEMSTALL) - task_flags |= TSK_MEMSTALL; + /* + * We may race with schedule() dropping the rq lock between + * deactivating prev and switching to next. 
Because the psi + * updates from the deactivation are deferred to the switch + * callback to save cgroup tree updates, the task's scheduling + * state here is not coherent with its psi state: + * + * schedule() cgroup_move_task() + * rq_lock() + * deactivate_task() + * p->on_rq = 0 + * psi_dequeue() // defers TSK_RUNNING & TSK_IOWAIT updates + * pick_next_task() + * rq_unlock() + * rq_lock() + * psi_task_change() // old cgroup + * task->cgroups = to + * psi_task_change() // new cgroup + * rq_unlock() + * rq_lock() + * psi_sched_switch() // does deferred updates in new cgroup + * + * Don't rely on the scheduling state. Use psi_flags instead. + */ + task_flags = task->psi_flags; if (task_flags) psi_task_change(task, task_flags, 0); @@ -1073,7 +1183,6 @@ t->event = 0; t->last_event_time = 0; init_waitqueue_head(&t->event_wait); - kref_init(&t->refcount); mutex_lock(&group->trigger_lock); @@ -1102,20 +1211,25 @@ return t; } -static void psi_trigger_destroy(struct kref *ref) +void psi_trigger_destroy(struct psi_trigger *t) { - struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); - struct psi_group *group = t->group; + struct psi_group *group; struct task_struct *task_to_destroy = NULL; - if (static_branch_likely(&psi_disabled)) + /* + * We do not check psi_disabled since it might have been disabled after + * the trigger got created. + */ + if (!t) return; + group = t->group; /* - * Wakeup waiters to stop polling. Can happen if cgroup is deleted - * from under a polling process. + * Wakeup waiters to stop polling and clear the queue to prevent it from + * being accessed later. Can happen if cgroup is deleted from under a + * polling process. */ - wake_up_interruptible(&t->event_wait); + wake_up_pollfree(&t->event_wait); mutex_lock(&group->trigger_lock); @@ -1146,9 +1260,9 @@ mutex_unlock(&group->trigger_lock); /* - * Wait for both *trigger_ptr from psi_trigger_replace and - * poll_task RCUs to complete their read-side critical sections - * before destroying the trigger and optionally the poll_task + * Wait for psi_schedule_poll_work RCU to complete its read-side + * critical section before destroying the trigger and optionally the + * poll_task. */ synchronize_rcu(); /* @@ -1161,20 +1275,9 @@ * can no longer be found through group->poll_task. 
*/ kthread_stop(task_to_destroy); + atomic_set(&group->poll_scheduled, 0); } kfree(t); -} - -void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) -{ - struct psi_trigger *old = *trigger_ptr; - - if (static_branch_likely(&psi_disabled)) - return; - - rcu_assign_pointer(*trigger_ptr, new); - if (old) - kref_put(&old->refcount, psi_trigger_destroy); } __poll_t psi_trigger_poll(void **trigger_ptr, @@ -1186,23 +1289,14 @@ if (static_branch_likely(&psi_disabled)) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - rcu_read_lock(); - - t = rcu_dereference(*(void __rcu __force **)trigger_ptr); - if (!t) { - rcu_read_unlock(); + t = smp_load_acquire(trigger_ptr); + if (!t) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - } - kref_get(&t->refcount); - - rcu_read_unlock(); poll_wait(file, &t->event_wait, wait); if (cmpxchg(&t->event, 1, 0) == 1) ret |= EPOLLPRI; - - kref_put(&t->refcount, psi_trigger_destroy); return ret; } @@ -1227,14 +1321,24 @@ buf[buf_size - 1] = '\0'; - new = psi_trigger_create(&psi_system, buf, nbytes, res); - if (IS_ERR(new)) - return PTR_ERR(new); - seq = file->private_data; + /* Take seq->lock to protect seq->private from concurrent writes */ mutex_lock(&seq->lock); - psi_trigger_replace(&seq->private, new); + + /* Allow only one trigger per file descriptor */ + if (seq->private) { + mutex_unlock(&seq->lock); + return -EBUSY; + } + + new = psi_trigger_create(&psi_system, buf, nbytes, res); + if (IS_ERR(new)) { + mutex_unlock(&seq->lock); + return PTR_ERR(new); + } + + smp_store_release(&seq->private, new); mutex_unlock(&seq->lock); return nbytes; @@ -1269,43 +1373,45 @@ { struct seq_file *seq = file->private_data; - psi_trigger_replace(&seq->private, NULL); + psi_trigger_destroy(seq->private); return single_release(inode, file); } -static const struct file_operations psi_io_fops = { - .open = psi_io_open, - .read = seq_read, - .llseek = seq_lseek, - .write = psi_io_write, - .poll = psi_fop_poll, - .release = psi_fop_release, +static const struct proc_ops psi_io_proc_ops = { + .proc_open = psi_io_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = psi_io_write, + .proc_poll = psi_fop_poll, + .proc_release = psi_fop_release, }; -static const struct file_operations psi_memory_fops = { - .open = psi_memory_open, - .read = seq_read, - .llseek = seq_lseek, - .write = psi_memory_write, - .poll = psi_fop_poll, - .release = psi_fop_release, +static const struct proc_ops psi_memory_proc_ops = { + .proc_open = psi_memory_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = psi_memory_write, + .proc_poll = psi_fop_poll, + .proc_release = psi_fop_release, }; -static const struct file_operations psi_cpu_fops = { - .open = psi_cpu_open, - .read = seq_read, - .llseek = seq_lseek, - .write = psi_cpu_write, - .poll = psi_fop_poll, - .release = psi_fop_release, +static const struct proc_ops psi_cpu_proc_ops = { + .proc_open = psi_cpu_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = psi_cpu_write, + .proc_poll = psi_fop_poll, + .proc_release = psi_fop_release, }; static int __init psi_proc_init(void) { - proc_mkdir("pressure", NULL); - proc_create("pressure/io", 0, NULL, &psi_io_fops); - proc_create("pressure/memory", 0, NULL, &psi_memory_fops); - proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops); + if (psi_enable) { + proc_mkdir("pressure", NULL); + proc_create("pressure/io", 0, NULL, &psi_io_proc_ops); + proc_create("pressure/memory", 0, NULL, &psi_memory_proc_ops); + proc_create("pressure/cpu", 0, NULL, 
&psi_cpu_proc_ops);
+	}
 	return 0;
 }
 module_init(psi_proc_init);
--
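For context (not part of the patch): the trigger interface that the reworked psi_write(), psi_fop_poll() and psi_fop_release() implement is consumed from userspace roughly as in the minimal sketch below, following the example in Documentation/accounting/psi.rst. The threshold and window values are illustrative only. Note that with this change a second trigger string written to the same file descriptor fails with -EBUSY, since only one trigger per file descriptor is allowed, and the trigger is torn down when the descriptor is released rather than via kref.

/*
 * Minimal userspace sketch (illustrative, not part of the patch):
 * register a PSI trigger on /proc/pressure/memory and wait for it
 * to fire. Threshold = 150ms of "some" memory stall within a 1s
 * window; both values are examples from the PSI documentation.
 */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char trig[] = "some 150000 1000000";
	struct pollfd fds;
	int fd;

	fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
	if (fd < 0) {
		perror("open /proc/pressure/memory");
		return 1;
	}

	/* One trigger per fd; a second write here would return -EBUSY */
	if (write(fd, trig, strlen(trig) + 1) < 0) {
		perror("write trigger");
		return 1;
	}

	fds.fd = fd;
	fds.events = POLLPRI;	/* trigger events are delivered as POLLPRI */

	while (1) {
		int n = poll(&fds, 1, -1);

		if (n < 0) {
			perror("poll");
			break;
		}
		if (fds.revents & POLLERR) {
			/* event source went away (e.g. cgroup deleted) */
			fprintf(stderr, "event source is gone\n");
			break;
		}
		if (fds.revents & POLLPRI)
			printf("memory pressure event triggered\n");
	}

	close(fd);
	return 0;
}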