From b22da3d8526a935aa31e086e63f60ff3246cb61c Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Sat, 09 Dec 2023 07:24:11 +0000
Subject: [PATCH] add stmac read mac form eeprom
---
kernel/kernel/sched/psi.c | 333 ++++++++++++++++++++++++++++++++++++-------------------
1 files changed, 219 insertions(+), 114 deletions(-)
diff --git a/kernel/kernel/sched/psi.c b/kernel/kernel/sched/psi.c
index 3afd919..e14917c 100644
--- a/kernel/kernel/sched/psi.c
+++ b/kernel/kernel/sched/psi.c
@@ -142,6 +142,8 @@
#include <linux/psi.h>
#include "sched.h"
+#include <trace/hooks/psi.h>
+
static int psi_bug __read_mostly;
DEFINE_STATIC_KEY_FALSE(psi_disabled);
@@ -174,7 +176,7 @@
/* System-level pressure and stall tracking */
static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu);
-static struct psi_group psi_system = {
+struct psi_group psi_system = {
.pcpu = &system_group_pcpu,
};
@@ -193,6 +195,7 @@
INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
mutex_init(&group->avgs_lock);
/* Init trigger-related members */
+ atomic_set(&group->poll_scheduled, 0);
mutex_init(&group->trigger_lock);
INIT_LIST_HEAD(&group->triggers);
memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
@@ -232,7 +235,7 @@
case PSI_MEM_FULL:
return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
case PSI_CPU_SOME:
- return tasks[NR_RUNNING] > 1;
+ return tasks[NR_RUNNING] > tasks[NR_ONCPU];
case PSI_NONIDLE:
return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
tasks[NR_RUNNING];
@@ -541,11 +544,15 @@
if (now < t->last_event_time + t->win.size)
continue;
+ trace_android_vh_psi_event(t);
+
/* Generate an event */
if (cmpxchg(&t->event, 0, 1) == 0)
wake_up_interruptible(&t->event_wait);
t->last_event_time = now;
}
+
+ trace_android_vh_psi_group(group);
if (new_stall)
memcpy(group->polling_total, total,
@@ -554,18 +561,17 @@
return now + group->poll_min_period;
}
-/* Schedule polling if it's not already scheduled. */
-static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
+/* Schedule polling if it's not already scheduled or forced. */
+static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay,
+ bool force)
{
struct task_struct *task;
/*
- * Do not reschedule if already scheduled.
- * Possible race with a timer scheduled after this check but before
- * mod_timer below can be tolerated because group->polling_next_update
- * will keep updates on schedule.
+ * atomic_xchg should be called even when !force to provide a
+ * full memory barrier (see the comment inside psi_poll_work).
*/
- if (timer_pending(&group->poll_timer))
+ if (atomic_xchg(&group->poll_scheduled, 1) && !force)
return;
rcu_read_lock();
@@ -577,18 +583,58 @@
*/
if (likely(task))
mod_timer(&group->poll_timer, jiffies + delay);
+ else
+ atomic_set(&group->poll_scheduled, 0);
rcu_read_unlock();
}
static void psi_poll_work(struct psi_group *group)
{
+ bool force_reschedule = false;
u32 changed_states;
u64 now;
mutex_lock(&group->trigger_lock);
now = sched_clock();
+
+ if (now > group->polling_until) {
+ /*
+ * We are either about to start or might stop polling if no
+ * state change was recorded. Resetting poll_scheduled leaves
+ * a small window for psi_group_change to sneak in and schedule
+ * an immegiate poll_work before we get to rescheduling. One
+ * potential extra wakeup at the end of the polling window
+ * should be negligible and polling_next_update still keeps
+ * updates correctly on schedule.
+ */
+ atomic_set(&group->poll_scheduled, 0);
+ /*
+ * A task change can race with the poll worker that is supposed to
+ * report on it. To avoid missing events, ensure ordering between
+ * poll_scheduled and the task state accesses, such that if the poll
+ * worker misses the state update, the task change is guaranteed to
+ * reschedule the poll worker:
+ *
+ * poll worker:
+ * atomic_set(poll_scheduled, 0)
+ * smp_mb()
+ * LOAD states
+ *
+ * task change:
+ * STORE states
+ * if atomic_xchg(poll_scheduled, 1) == 0:
+ * schedule poll worker
+ *
+ * The atomic_xchg() implies a full barrier.
+ */
+ smp_mb();
+ } else {
+ /* Polling window is not over, keep rescheduling */
+ force_reschedule = true;
+ }
+
collect_percpu_times(group, PSI_POLL, &changed_states);
@@ -615,7 +661,8 @@
group->polling_next_update = update_triggers(group, now);
psi_schedule_poll_work(group,
- nsecs_to_jiffies(group->polling_next_update - now) + 1);
+ nsecs_to_jiffies(group->polling_next_update - now) + 1,
+ force_reschedule);
out:
mutex_unlock(&group->trigger_lock);
@@ -624,11 +671,8 @@
static int psi_poll_worker(void *data)
{
struct psi_group *group = (struct psi_group *)data;
- struct sched_param param = {
- .sched_priority = 1,
- };
- sched_setscheduler_nocheck(current, SCHED_FIFO, ¶m);
+ sched_set_fifo_low(current);
while (true) {
wait_event_interruptible(group->poll_wait,
@@ -696,13 +740,14 @@
groupc->times[PSI_NONIDLE] += delta;
}
-static u32 psi_group_change(struct psi_group *group, int cpu,
- unsigned int clear, unsigned int set)
+static void psi_group_change(struct psi_group *group, int cpu,
+ unsigned int clear, unsigned int set,
+ bool wake_clock)
{
struct psi_group_cpu *groupc;
+ u32 state_mask = 0;
unsigned int t, m;
enum psi_states s;
- u32 state_mask = 0;
groupc = per_cpu_ptr(group->pcpu, cpu);
@@ -721,14 +766,15 @@
for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
if (!(m & (1 << t)))
continue;
- if (groupc->tasks[t] == 0 && !psi_bug) {
- printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n",
+ if (groupc->tasks[t]) {
+ groupc->tasks[t]--;
+ } else if (!psi_bug) {
+ printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
cpu, t, groupc->tasks[0],
groupc->tasks[1], groupc->tasks[2],
- clear, set);
+ groupc->tasks[3], clear, set);
psi_bug = 1;
}
- groupc->tasks[t]--;
}
for (t = 0; set; set &= ~(1 << t), t++)
@@ -744,7 +790,11 @@
write_seqcount_end(&groupc->seq);
- return state_mask;
+ if (state_mask & group->poll_states)
+ psi_schedule_poll_work(group, 1, false);
+
+ if (wake_clock && !delayed_work_pending(&group->avgs_work))
+ schedule_delayed_work(&group->avgs_work, PSI_FREQ);
}
static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
@@ -771,6 +821,21 @@
return &psi_system;
}
+static void psi_flags_change(struct task_struct *task, int clear, int set)
+{
+ if (((task->psi_flags & set) ||
+ (task->psi_flags & clear) != clear) &&
+ !psi_bug) {
+ printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
+ task->pid, task->comm, task_cpu(task),
+ task->psi_flags, clear, set);
+ psi_bug = 1;
+ }
+
+ task->psi_flags &= ~clear;
+ task->psi_flags |= set;
+}
+
void psi_task_change(struct task_struct *task, int clear, int set)
{
int cpu = task_cpu(task);
@@ -781,17 +846,7 @@
if (!task->pid)
return;
- if (((task->psi_flags & set) ||
- (task->psi_flags & clear) != clear) &&
- !psi_bug) {
- printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
- task->pid, task->comm, cpu,
- task->psi_flags, clear, set);
- psi_bug = 1;
- }
-
- task->psi_flags &= ~clear;
- task->psi_flags |= set;
+ psi_flags_change(task, clear, set);
/*
* Periodic aggregation shuts off if there is a period of no
@@ -804,14 +859,51 @@
wq_worker_last_func(task) == psi_avgs_work))
wake_clock = false;
- while ((group = iterate_groups(task, &iter))) {
- u32 state_mask = psi_group_change(group, cpu, clear, set);
+ while ((group = iterate_groups(task, &iter)))
+ psi_group_change(group, cpu, clear, set, wake_clock);
+}
- if (state_mask & group->poll_states)
- psi_schedule_poll_work(group, 1);
+void psi_task_switch(struct task_struct *prev, struct task_struct *next,
+ bool sleep)
+{
+ struct psi_group *group, *common = NULL;
+ int cpu = task_cpu(prev);
+ void *iter;
- if (wake_clock && !delayed_work_pending(&group->avgs_work))
- schedule_delayed_work(&group->avgs_work, PSI_FREQ);
+ if (next->pid) {
+ psi_flags_change(next, 0, TSK_ONCPU);
+ /*
+ * When moving state between tasks, the group that
+ * contains them both does not change: we can stop
+ * updating the tree once we reach the first common
+ * ancestor. Iterate @next's ancestors until we
+ * encounter @prev's state.
+ */
+ iter = NULL;
+ while ((group = iterate_groups(next, &iter))) {
+ if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+ common = group;
+ break;
+ }
+
+ psi_group_change(group, cpu, 0, TSK_ONCPU, true);
+ }
+ }
+
+ /*
+ * If this is a voluntary sleep, dequeue will have taken care
+ * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We
+ * only need to deal with it during preemption.
+ */
+ if (sleep)
+ return;
+
+ if (prev->pid) {
+ psi_flags_change(prev, TSK_ONCPU, 0);
+
+ iter = NULL;
+ while ((group = iterate_groups(prev, &iter)) && group != common)
+ psi_group_change(group, cpu, TSK_ONCPU, 0, true);
}
}
@@ -845,17 +937,17 @@
if (static_branch_likely(&psi_disabled))
return;
- *flags = current->flags & PF_MEMSTALL;
+ *flags = current->in_memstall;
if (*flags)
return;
/*
- * PF_MEMSTALL setting & accounting needs to be atomic wrt
+ * in_memstall setting & accounting needs to be atomic wrt
* changes to the task's scheduling state, otherwise we can
* race with CPU migration.
*/
rq = this_rq_lock_irq(&rf);
- current->flags |= PF_MEMSTALL;
+ current->in_memstall = 1;
psi_task_change(current, 0, TSK_MEMSTALL);
rq_unlock_irq(rq, &rf);
@@ -878,13 +970,13 @@
if (*flags)
return;
/*
- * PF_MEMSTALL clearing & accounting needs to be atomic wrt
+ * in_memstall clearing & accounting needs to be atomic wrt
* changes to the task's scheduling state, otherwise we could
* race with CPU migration.
*/
rq = this_rq_lock_irq(&rf);
- current->flags &= ~PF_MEMSTALL;
+ current->in_memstall = 0;
psi_task_change(current, TSK_MEMSTALL, 0);
rq_unlock_irq(rq, &rf);
@@ -928,7 +1020,7 @@
*/
void cgroup_move_task(struct task_struct *task, struct css_set *to)
{
- unsigned int task_flags = 0;
+ unsigned int task_flags;
struct rq_flags rf;
struct rq *rq;
@@ -943,13 +1035,31 @@
rq = task_rq_lock(task, &rf);
- if (task_on_rq_queued(task))
- task_flags = TSK_RUNNING;
- else if (task->in_iowait)
- task_flags = TSK_IOWAIT;
-
- if (task->flags & PF_MEMSTALL)
- task_flags |= TSK_MEMSTALL;
+ /*
+ * We may race with schedule() dropping the rq lock between
+ * deactivating prev and switching to next. Because the psi
+ * updates from the deactivation are deferred to the switch
+ * callback to save cgroup tree updates, the task's scheduling
+ * state here is not coherent with its psi state:
+ *
+ * schedule() cgroup_move_task()
+ * rq_lock()
+ * deactivate_task()
+ * p->on_rq = 0
+ * psi_dequeue() // defers TSK_RUNNING & TSK_IOWAIT updates
+ * pick_next_task()
+ * rq_unlock()
+ * rq_lock()
+ * psi_task_change() // old cgroup
+ * task->cgroups = to
+ * psi_task_change() // new cgroup
+ * rq_unlock()
+ * rq_lock()
+ * psi_sched_switch() // does deferred updates in new cgroup
+ *
+ * Don't rely on the scheduling state. Use psi_flags instead.
+ */
+ task_flags = task->psi_flags;
if (task_flags)
psi_task_change(task, task_flags, 0);
@@ -1073,7 +1183,6 @@
t->event = 0;
t->last_event_time = 0;
init_waitqueue_head(&t->event_wait);
- kref_init(&t->refcount);
mutex_lock(&group->trigger_lock);
@@ -1102,15 +1211,19 @@
return t;
}
-static void psi_trigger_destroy(struct kref *ref)
+void psi_trigger_destroy(struct psi_trigger *t)
{
- struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
- struct psi_group *group = t->group;
+ struct psi_group *group;
struct task_struct *task_to_destroy = NULL;
- if (static_branch_likely(&psi_disabled))
+ /*
+ * We do not check psi_disabled since it might have been disabled after
+ * the trigger got created.
+ */
+ if (!t)
return;
+ group = t->group;
/*
* Wakeup waiters to stop polling. Can happen if cgroup is deleted
* from under a polling process.
@@ -1146,9 +1259,9 @@
mutex_unlock(&group->trigger_lock);
/*
- * Wait for both *trigger_ptr from psi_trigger_replace and
- * poll_task RCUs to complete their read-side critical sections
- * before destroying the trigger and optionally the poll_task
+ * Wait for psi_schedule_poll_work RCU to complete its read-side
+ * critical section before destroying the trigger and optionally the
+ * poll_task.
*/
synchronize_rcu();
/*
@@ -1161,20 +1274,9 @@
* can no longer be found through group->poll_task.
*/
kthread_stop(task_to_destroy);
+ atomic_set(&group->poll_scheduled, 0);
}
kfree(t);
-}
-
-void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new)
-{
- struct psi_trigger *old = *trigger_ptr;
-
- if (static_branch_likely(&psi_disabled))
- return;
-
- rcu_assign_pointer(*trigger_ptr, new);
- if (old)
- kref_put(&old->refcount, psi_trigger_destroy);
}
__poll_t psi_trigger_poll(void **trigger_ptr,
@@ -1186,23 +1288,14 @@
if (static_branch_likely(&psi_disabled))
return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
- rcu_read_lock();
-
- t = rcu_dereference(*(void __rcu __force **)trigger_ptr);
- if (!t) {
- rcu_read_unlock();
+ t = smp_load_acquire(trigger_ptr);
+ if (!t)
return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
- }
- kref_get(&t->refcount);
-
- rcu_read_unlock();
poll_wait(file, &t->event_wait, wait);
if (cmpxchg(&t->event, 1, 0) == 1)
ret |= EPOLLPRI;
-
- kref_put(&t->refcount, psi_trigger_destroy);
return ret;
}
@@ -1227,14 +1320,24 @@
buf[buf_size - 1] = '\0';
- new = psi_trigger_create(&psi_system, buf, nbytes, res);
- if (IS_ERR(new))
- return PTR_ERR(new);
-
seq = file->private_data;
+
/* Take seq->lock to protect seq->private from concurrent writes */
mutex_lock(&seq->lock);
- psi_trigger_replace(&seq->private, new);
+
+ /* Allow only one trigger per file descriptor */
+ if (seq->private) {
+ mutex_unlock(&seq->lock);
+ return -EBUSY;
+ }
+
+ new = psi_trigger_create(&psi_system, buf, nbytes, res);
+ if (IS_ERR(new)) {
+ mutex_unlock(&seq->lock);
+ return PTR_ERR(new);
+ }
+
+ smp_store_release(&seq->private, new);
mutex_unlock(&seq->lock);
return nbytes;
@@ -1269,43 +1372,45 @@
{
struct seq_file *seq = file->private_data;
- psi_trigger_replace(&seq->private, NULL);
+ psi_trigger_destroy(seq->private);
return single_release(inode, file);
}
-static const struct file_operations psi_io_fops = {
- .open = psi_io_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = psi_io_write,
- .poll = psi_fop_poll,
- .release = psi_fop_release,
+static const struct proc_ops psi_io_proc_ops = {
+ .proc_open = psi_io_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = psi_io_write,
+ .proc_poll = psi_fop_poll,
+ .proc_release = psi_fop_release,
};
-static const struct file_operations psi_memory_fops = {
- .open = psi_memory_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = psi_memory_write,
- .poll = psi_fop_poll,
- .release = psi_fop_release,
+static const struct proc_ops psi_memory_proc_ops = {
+ .proc_open = psi_memory_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = psi_memory_write,
+ .proc_poll = psi_fop_poll,
+ .proc_release = psi_fop_release,
};
-static const struct file_operations psi_cpu_fops = {
- .open = psi_cpu_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = psi_cpu_write,
- .poll = psi_fop_poll,
- .release = psi_fop_release,
+static const struct proc_ops psi_cpu_proc_ops = {
+ .proc_open = psi_cpu_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = psi_cpu_write,
+ .proc_poll = psi_fop_poll,
+ .proc_release = psi_fop_release,
};
static int __init psi_proc_init(void)
{
- proc_mkdir("pressure", NULL);
- proc_create("pressure/io", 0, NULL, &psi_io_fops);
- proc_create("pressure/memory", 0, NULL, &psi_memory_fops);
- proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops);
+ if (psi_enable) {
+ proc_mkdir("pressure", NULL);
+ proc_create("pressure/io", 0, NULL, &psi_io_proc_ops);
+ proc_create("pressure/memory", 0, NULL, &psi_memory_proc_ops);
+ proc_create("pressure/cpu", 0, NULL, &psi_cpu_proc_ops);
+ }
return 0;
}
module_init(psi_proc_init);
--
Gitblit v1.6.2