hc
2024-05-10 61598093bbdd283a7edc367d900f223070ead8d2
kernel/kernel/sched/psi.c
@@ -142,6 +142,8 @@
 #include <linux/psi.h>
 #include "sched.h"
 
+#include <trace/hooks/psi.h>
+
 static int psi_bug __read_mostly;
 
 DEFINE_STATIC_KEY_FALSE(psi_disabled);
@@ -174,7 +176,7 @@
 
 /* System-level pressure and stall tracking */
 static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu);
-static struct psi_group psi_system = {
+struct psi_group psi_system = {
         .pcpu = &system_group_pcpu,
 };
 
@@ -193,6 +195,7 @@
         INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
         mutex_init(&group->avgs_lock);
         /* Init trigger-related members */
+        atomic_set(&group->poll_scheduled, 0);
         mutex_init(&group->trigger_lock);
         INIT_LIST_HEAD(&group->triggers);
         memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
@@ -232,7 +235,7 @@
         case PSI_MEM_FULL:
                 return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
         case PSI_CPU_SOME:
-                return tasks[NR_RUNNING] > 1;
+                return tasks[NR_RUNNING] > tasks[NR_ONCPU];
         case PSI_NONIDLE:
                 return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
                         tasks[NR_RUNNING];
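
Note on the test_state() hunk above: with per-group NR_ONCPU accounting, PSI_CPU_SOME no longer means "more than one runnable task" but "more runnable tasks than this group currently has on the CPU", so a lone runnable task is counted as stalled while the CPU runs a task from another group. A minimal user-space sketch of that predicate (hypothetical helper, not the kernel's test_state()):

#include <stdbool.h>

/* Task-count buckets, mirroring the kernel's NR_* indices at this point. */
enum { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, NR_ONCPU, NR_PSI_TASK_COUNTS };

/*
 * CPU SOME pressure: at least one runnable task of this group is waiting
 * for the CPU. If the CPU is busy with another group's task, NR_ONCPU is 0
 * here and even a single runnable task (NR_RUNNING == 1) counts as stalled;
 * the old "> 1" test missed exactly that case.
 */
static bool cpu_some_pressure(const unsigned int tasks[NR_PSI_TASK_COUNTS])
{
        return tasks[NR_RUNNING] > tasks[NR_ONCPU];
}
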
@@ -541,11 +544,15 @@
                 if (now < t->last_event_time + t->win.size)
                         continue;
 
+                trace_android_vh_psi_event(t);
+
                 /* Generate an event */
                 if (cmpxchg(&t->event, 0, 1) == 0)
                         wake_up_interruptible(&t->event_wait);
                 t->last_event_time = now;
         }
+
+        trace_android_vh_psi_group(group);
 
         if (new_stall)
                 memcpy(group->polling_total, total,
@@ -554,18 +561,17 @@
         return now + group->poll_min_period;
 }
 
-/* Schedule polling if it's not already scheduled. */
-static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
+/* Schedule polling if it's not already scheduled or forced. */
+static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay,
+                                   bool force)
 {
         struct task_struct *task;
 
         /*
-         * Do not reschedule if already scheduled.
-         * Possible race with a timer scheduled after this check but before
-         * mod_timer below can be tolerated because group->polling_next_update
-         * will keep updates on schedule.
+         * atomic_xchg should be called even when !force to provide a
+         * full memory barrier (see the comment inside psi_poll_work).
          */
-        if (timer_pending(&group->poll_timer))
+        if (atomic_xchg(&group->poll_scheduled, 1) && !force)
                 return;
 
         rcu_read_lock();
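
Note on the psi_schedule_poll_work() hunk above: the timer_pending() check is replaced by an atomic claim of group->poll_scheduled. The exchange is done even when force is set, because the xchg doubles as the full memory barrier the poll worker's handshake relies on; force only overrides the early return. A stand-alone sketch of the claim-or-skip guard (hypothetical names, C11 atomics in place of the kernel primitives):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int poll_scheduled;       /* stand-in for group->poll_scheduled */

static void arm_poll_timer(unsigned long delay)
{
        (void)delay;    /* placeholder for what mod_timer() does in the kernel */
}

static void schedule_poll(unsigned long delay, bool force)
{
        /*
         * Exchange unconditionally so the operation always acts as a full
         * barrier; only the decision to bail out depends on 'force'.
         */
        if (atomic_exchange(&poll_scheduled, 1) && !force)
                return;         /* already scheduled and no forced reschedule */

        arm_poll_timer(delay);
}
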
@@ -577,18 +583,58 @@
          */
         if (likely(task))
                 mod_timer(&group->poll_timer, jiffies + delay);
+        else
+                atomic_set(&group->poll_scheduled, 0);
 
         rcu_read_unlock();
 }
 
 static void psi_poll_work(struct psi_group *group)
 {
+        bool force_reschedule = false;
         u32 changed_states;
         u64 now;
 
         mutex_lock(&group->trigger_lock);
 
         now = sched_clock();
+
+        if (now > group->polling_until) {
+                /*
+                 * We are either about to start or might stop polling if no
+                 * state change was recorded. Resetting poll_scheduled leaves
+                 * a small window for psi_group_change to sneak in and schedule
+                 * an immediate poll_work before we get to rescheduling. One
+                 * potential extra wakeup at the end of the polling window
+                 * should be negligible and polling_next_update still keeps
+                 * updates correctly on schedule.
+                 */
+                atomic_set(&group->poll_scheduled, 0);
+                /*
+                 * A task change can race with the poll worker that is supposed to
+                 * report on it. To avoid missing events, ensure ordering between
+                 * poll_scheduled and the task state accesses, such that if the poll
+                 * worker misses the state update, the task change is guaranteed to
+                 * reschedule the poll worker:
+                 *
+                 * poll worker:
+                 *   atomic_set(poll_scheduled, 0)
+                 *   smp_mb()
+                 *   LOAD states
+                 *
+                 * task change:
+                 *   STORE states
+                 *   if atomic_xchg(poll_scheduled, 1) == 0:
+                 *     schedule poll worker
+                 *
+                 * The atomic_xchg() implies a full barrier.
+                 */
+                smp_mb();
+        } else {
+                /* Polling window is not over, keep rescheduling */
+                force_reschedule = true;
+        }
+
 
         collect_percpu_times(group, PSI_POLL, &changed_states);
 
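
Note on the psi_poll_work() hunk above: the ordering diagram in the comment is the classic store-buffering pattern. Either the worker's post-barrier load sees the freshly stored task state, or the task-change side finds poll_scheduled already cleared and rearms the worker, so no state change is lost. A compact user-space analogue (hypothetical names, C11 atomics standing in for atomic_set()/smp_mb()/atomic_xchg()):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int poll_scheduled;       /* 1 while a poll run is pending */
static atomic_int task_states;          /* stand-in for the per-CPU state bits */

/* Poll-worker side: give up the slot, then (re)read the states. */
static int poll_worker_snapshot(void)
{
        atomic_store(&poll_scheduled, 0);
        atomic_thread_fence(memory_order_seq_cst);      /* plays the role of smp_mb() */
        return atomic_load(&task_states);
}

/* Task-change side: publish the state, then try to claim the slot. */
static bool task_change(int new_states)
{
        atomic_store(&task_states, new_states);
        /* atomic_exchange is a full barrier, like atomic_xchg() */
        return atomic_exchange(&poll_scheduled, 1) == 0;        /* true: reschedule the worker */
}
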
@@ -615,7 +661,8 @@
         group->polling_next_update = update_triggers(group, now);
 
         psi_schedule_poll_work(group,
-                nsecs_to_jiffies(group->polling_next_update - now) + 1);
+                nsecs_to_jiffies(group->polling_next_update - now) + 1,
+                force_reschedule);
 
 out:
         mutex_unlock(&group->trigger_lock);
@@ -624,11 +671,8 @@
 static int psi_poll_worker(void *data)
 {
         struct psi_group *group = (struct psi_group *)data;
-        struct sched_param param = {
-                .sched_priority = 1,
-        };
 
-        sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
+        sched_set_fifo_low(current);
 
         while (true) {
                 wait_event_interruptible(group->poll_wait,
@@ -696,13 +740,14 @@
                 groupc->times[PSI_NONIDLE] += delta;
 }
 
-static u32 psi_group_change(struct psi_group *group, int cpu,
-                            unsigned int clear, unsigned int set)
+static void psi_group_change(struct psi_group *group, int cpu,
+                             unsigned int clear, unsigned int set,
+                             bool wake_clock)
 {
         struct psi_group_cpu *groupc;
+        u32 state_mask = 0;
         unsigned int t, m;
         enum psi_states s;
-        u32 state_mask = 0;
 
         groupc = per_cpu_ptr(group->pcpu, cpu);
 
@@ -721,14 +766,15 @@
         for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
                 if (!(m & (1 << t)))
                         continue;
-                if (groupc->tasks[t] == 0 && !psi_bug) {
-                        printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n",
+                if (groupc->tasks[t]) {
+                        groupc->tasks[t]--;
+                } else if (!psi_bug) {
+                        printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
                                         cpu, t, groupc->tasks[0],
                                         groupc->tasks[1], groupc->tasks[2],
-                                        clear, set);
+                                        groupc->tasks[3], clear, set);
                         psi_bug = 1;
                 }
-                groupc->tasks[t]--;
         }
 
         for (t = 0; set; set &= ~(1 << t), t++)
@@ -744,7 +790,11 @@
 
         write_seqcount_end(&groupc->seq);
 
-        return state_mask;
+        if (state_mask & group->poll_states)
+                psi_schedule_poll_work(group, 1, false);
+
+        if (wake_clock && !delayed_work_pending(&group->avgs_work))
+                schedule_delayed_work(&group->avgs_work, PSI_FREQ);
 }
 
 static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
@@ -771,6 +821,21 @@
         return &psi_system;
 }
 
+static void psi_flags_change(struct task_struct *task, int clear, int set)
+{
+        if (((task->psi_flags & set) ||
+             (task->psi_flags & clear) != clear) &&
+            !psi_bug) {
+                printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
+                                task->pid, task->comm, task_cpu(task),
+                                task->psi_flags, clear, set);
+                psi_bug = 1;
+        }
+
+        task->psi_flags &= ~clear;
+        task->psi_flags |= set;
+}
+
 void psi_task_change(struct task_struct *task, int clear, int set)
 {
         int cpu = task_cpu(task);
@@ -781,17 +846,7 @@
         if (!task->pid)
                 return;
 
-        if (((task->psi_flags & set) ||
-             (task->psi_flags & clear) != clear) &&
-            !psi_bug) {
-                printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
-                                task->pid, task->comm, cpu,
-                                task->psi_flags, clear, set);
-                psi_bug = 1;
-        }
-
-        task->psi_flags &= ~clear;
-        task->psi_flags |= set;
+        psi_flags_change(task, clear, set);
 
         /*
          * Periodic aggregation shuts off if there is a period of no
@@ -804,14 +859,51 @@
               wq_worker_last_func(task) == psi_avgs_work))
                 wake_clock = false;
 
-        while ((group = iterate_groups(task, &iter))) {
-                u32 state_mask = psi_group_change(group, cpu, clear, set);
+        while ((group = iterate_groups(task, &iter)))
+                psi_group_change(group, cpu, clear, set, wake_clock);
+}
 
-                if (state_mask & group->poll_states)
-                        psi_schedule_poll_work(group, 1);
+void psi_task_switch(struct task_struct *prev, struct task_struct *next,
+                     bool sleep)
+{
+        struct psi_group *group, *common = NULL;
+        int cpu = task_cpu(prev);
+        void *iter;
 
-                if (wake_clock && !delayed_work_pending(&group->avgs_work))
-                        schedule_delayed_work(&group->avgs_work, PSI_FREQ);
+        if (next->pid) {
+                psi_flags_change(next, 0, TSK_ONCPU);
+                /*
+                 * When moving state between tasks, the group that
+                 * contains them both does not change: we can stop
+                 * updating the tree once we reach the first common
+                 * ancestor. Iterate @next's ancestors until we
+                 * encounter @prev's state.
+                 */
+                iter = NULL;
+                while ((group = iterate_groups(next, &iter))) {
+                        if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+                                common = group;
+                                break;
+                        }
+
+                        psi_group_change(group, cpu, 0, TSK_ONCPU, true);
+                }
+        }
+
+        /*
+         * If this is a voluntary sleep, dequeue will have taken care
+         * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We
+         * only need to deal with it during preemption.
+         */
+        if (sleep)
+                return;
+
+        if (prev->pid) {
+                psi_flags_change(prev, TSK_ONCPU, 0);
+
+                iter = NULL;
+                while ((group = iterate_groups(prev, &iter)) && group != common)
+                        psi_group_change(group, cpu, TSK_ONCPU, 0, true);
         }
 }
 
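
Note on the psi_task_switch() hunk above: every psi group that contains both prev and next keeps exactly one ONCPU task across the switch, so only the non-shared tails of the two ancestor chains need updating. The walk up from next stops at the first group that already has an ONCPU task on this CPU (the first common ancestor), and the walk up from prev stops at that same group. A simplified sketch of that cutoff on a plain parent-pointer hierarchy (hypothetical types, not the kernel's iterate_groups() machinery):

/* Minimal stand-in for a psi group hierarchy: a leaf-to-root parent chain. */
struct group {
        struct group *parent;
        int nr_oncpu;           /* tasks of this group currently on the CPU */
};

static void switch_oncpu(struct group *prev_leaf, struct group *next_leaf)
{
        struct group *g, *common = NULL;

        /*
         * Walk up from next: the first group that already has an ONCPU task
         * must also contain prev, i.e. it is the first common ancestor.
         * Groups from there up to the root see no net change and are skipped.
         */
        for (g = next_leaf; g; g = g->parent) {
                if (g->nr_oncpu) {
                        common = g;
                        break;
                }
                g->nr_oncpu++;
        }

        /* Walk up from prev, clearing ONCPU, and stop at the same cutoff. */
        for (g = prev_leaf; g && g != common; g = g->parent)
                g->nr_oncpu--;
}
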
@@ -845,17 +937,17 @@
         if (static_branch_likely(&psi_disabled))
                 return;
 
-        *flags = current->flags & PF_MEMSTALL;
+        *flags = current->in_memstall;
         if (*flags)
                 return;
         /*
-         * PF_MEMSTALL setting & accounting needs to be atomic wrt
+         * in_memstall setting & accounting needs to be atomic wrt
          * changes to the task's scheduling state, otherwise we can
          * race with CPU migration.
          */
         rq = this_rq_lock_irq(&rf);
 
-        current->flags |= PF_MEMSTALL;
+        current->in_memstall = 1;
         psi_task_change(current, 0, TSK_MEMSTALL);
 
         rq_unlock_irq(rq, &rf);
@@ -878,13 +970,13 @@
         if (*flags)
                 return;
         /*
-         * PF_MEMSTALL clearing & accounting needs to be atomic wrt
+         * in_memstall clearing & accounting needs to be atomic wrt
          * changes to the task's scheduling state, otherwise we could
          * race with CPU migration.
          */
         rq = this_rq_lock_irq(&rf);
 
-        current->flags &= ~PF_MEMSTALL;
+        current->in_memstall = 0;
         psi_task_change(current, TSK_MEMSTALL, 0);
 
         rq_unlock_irq(rq, &rf);
@@ -928,7 +1020,7 @@
  */
 void cgroup_move_task(struct task_struct *task, struct css_set *to)
 {
-        unsigned int task_flags = 0;
+        unsigned int task_flags;
         struct rq_flags rf;
         struct rq *rq;
 
@@ -943,13 +1035,31 @@
 
         rq = task_rq_lock(task, &rf);
 
-        if (task_on_rq_queued(task))
-                task_flags = TSK_RUNNING;
-        else if (task->in_iowait)
-                task_flags = TSK_IOWAIT;
-
-        if (task->flags & PF_MEMSTALL)
-                task_flags |= TSK_MEMSTALL;
+        /*
+         * We may race with schedule() dropping the rq lock between
+         * deactivating prev and switching to next. Because the psi
+         * updates from the deactivation are deferred to the switch
+         * callback to save cgroup tree updates, the task's scheduling
+         * state here is not coherent with its psi state:
+         *
+         * schedule()                   cgroup_move_task()
+         *   rq_lock()
+         *   deactivate_task()
+         *     p->on_rq = 0
+         *     psi_dequeue() // defers TSK_RUNNING & TSK_IOWAIT updates
+         *   pick_next_task()
+         *     rq_unlock()
+         *                              rq_lock()
+         *                              psi_task_change() // old cgroup
+         *                              task->cgroups = to
+         *                              psi_task_change() // new cgroup
+         *                              rq_unlock()
+         *     rq_lock()
+         *     psi_sched_switch() // does deferred updates in new cgroup
+         *
+         * Don't rely on the scheduling state. Use psi_flags instead.
+         */
+        task_flags = task->psi_flags;
 
         if (task_flags)
                 psi_task_change(task, task_flags, 0);
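
Note on the cgroup_move_task() hunks above: because psi_dequeue() defers the TSK_RUNNING/TSK_IOWAIT clearing to the switch callback, task->on_rq and task->in_iowait may disagree with what psi has actually accounted at this point; task->psi_flags is the authoritative record. A rough outline of the resulting move sequence, assuming the unchanged surrounding code follows the usual pattern of clearing the flags in the old cgroup and re-setting them in the new one (hypothetical helper, not the full kernel function):

/*
 * Hypothetical outline of moving a task's psi state between cgroups while
 * the rq lock is held; the psi_flags snapshot replaces any guess derived
 * from scheduler state that may be mid-switch.
 */
static void move_task_psi_state(struct task_struct *task, struct css_set *to)
{
        unsigned int task_flags = task->psi_flags;      /* what psi accounted so far */

        if (task_flags)
                psi_task_change(task, task_flags, 0);   /* leave the old cgroup's groups */

        rcu_assign_pointer(task->cgroups, to);          /* switch the hierarchy */

        if (task_flags)
                psi_task_change(task, 0, task_flags);   /* enter the new cgroup's groups */
}
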
@@ -1073,7 +1183,6 @@
         t->event = 0;
         t->last_event_time = 0;
         init_waitqueue_head(&t->event_wait);
-        kref_init(&t->refcount);
 
         mutex_lock(&group->trigger_lock);
 
@@ -1102,20 +1211,25 @@
         return t;
 }
 
-static void psi_trigger_destroy(struct kref *ref)
+void psi_trigger_destroy(struct psi_trigger *t)
 {
-        struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
-        struct psi_group *group = t->group;
+        struct psi_group *group;
         struct task_struct *task_to_destroy = NULL;
 
-        if (static_branch_likely(&psi_disabled))
+        /*
+         * We do not check psi_disabled since it might have been disabled after
+         * the trigger got created.
+         */
+        if (!t)
                 return;
 
+        group = t->group;
         /*
-         * Wakeup waiters to stop polling. Can happen if cgroup is deleted
-         * from under a polling process.
+         * Wakeup waiters to stop polling and clear the queue to prevent it from
+         * being accessed later. Can happen if cgroup is deleted from under a
+         * polling process.
          */
-        wake_up_interruptible(&t->event_wait);
+        wake_up_pollfree(&t->event_wait);
 
         mutex_lock(&group->trigger_lock);
 
@@ -1146,9 +1260,9 @@
         mutex_unlock(&group->trigger_lock);
 
         /*
-         * Wait for both *trigger_ptr from psi_trigger_replace and
-         * poll_task RCUs to complete their read-side critical sections
-         * before destroying the trigger and optionally the poll_task
+         * Wait for psi_schedule_poll_work RCU to complete its read-side
+         * critical section before destroying the trigger and optionally the
+         * poll_task.
          */
         synchronize_rcu();
         /*
@@ -1161,20 +1275,9 @@
          * can no longer be found through group->poll_task.
          */
                 kthread_stop(task_to_destroy);
+                atomic_set(&group->poll_scheduled, 0);
         }
         kfree(t);
-}
-
-void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new)
-{
-        struct psi_trigger *old = *trigger_ptr;
-
-        if (static_branch_likely(&psi_disabled))
-                return;
-
-        rcu_assign_pointer(*trigger_ptr, new);
-        if (old)
-                kref_put(&old->refcount, psi_trigger_destroy);
 }
 
 __poll_t psi_trigger_poll(void **trigger_ptr,
@@ -1186,23 +1289,14 @@
         if (static_branch_likely(&psi_disabled))
                 return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
 
-        rcu_read_lock();
-
-        t = rcu_dereference(*(void __rcu __force **)trigger_ptr);
-        if (!t) {
-                rcu_read_unlock();
+        t = smp_load_acquire(trigger_ptr);
+        if (!t)
                 return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
-        }
-        kref_get(&t->refcount);
-
-        rcu_read_unlock();
 
         poll_wait(file, &t->event_wait, wait);
 
         if (cmpxchg(&t->event, 1, 0) == 1)
                 ret |= EPOLLPRI;
-
-        kref_put(&t->refcount, psi_trigger_destroy);
 
         return ret;
 }
@@ -1227,14 +1321,24 @@
 
         buf[buf_size - 1] = '\0';
 
-        new = psi_trigger_create(&psi_system, buf, nbytes, res);
-        if (IS_ERR(new))
-                return PTR_ERR(new);
-
         seq = file->private_data;
+
         /* Take seq->lock to protect seq->private from concurrent writes */
         mutex_lock(&seq->lock);
-        psi_trigger_replace(&seq->private, new);
+
+        /* Allow only one trigger per file descriptor */
+        if (seq->private) {
+                mutex_unlock(&seq->lock);
+                return -EBUSY;
+        }
+
+        new = psi_trigger_create(&psi_system, buf, nbytes, res);
+        if (IS_ERR(new)) {
+                mutex_unlock(&seq->lock);
+                return PTR_ERR(new);
+        }
+
+        smp_store_release(&seq->private, new);
         mutex_unlock(&seq->lock);
 
         return nbytes;
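
Note on the psi_trigger_poll() and psi_write() hunks above: with the kref gone, the trigger's lifetime is tied to its file descriptor, so what remains is safely publishing the fully initialised trigger to concurrent pollers. The smp_store_release() in the write path pairs with the smp_load_acquire() in the poll path: a poller that observes the non-NULL pointer is guaranteed to also observe the initialised fields behind it. A small user-space analogue of that publish/observe pairing (hypothetical type, C11 atomics in place of the kernel macros):

#include <stdatomic.h>
#include <stddef.h>

struct trigger {
        int threshold;
        int event;
};

static _Atomic(struct trigger *) seq_private;   /* stand-in for seq->private */

/*
 * Writer side (assumed serialised externally, as seq->lock does in the
 * kernel): fully initialise the object, then publish it with release
 * semantics so a poller never sees a half-built trigger.
 */
static void publish_trigger(struct trigger *t)
{
        t->event = 0;           /* initialisation must happen before the store */
        atomic_store_explicit(&seq_private, t, memory_order_release);
}

/* Poller side: the acquire load pairs with the release store above. */
static struct trigger *get_trigger(void)
{
        return atomic_load_explicit(&seq_private, memory_order_acquire);
}
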
@@ -1269,43 +1373,45 @@
 {
         struct seq_file *seq = file->private_data;
 
-        psi_trigger_replace(&seq->private, NULL);
+        psi_trigger_destroy(seq->private);
         return single_release(inode, file);
 }
 
-static const struct file_operations psi_io_fops = {
-        .open = psi_io_open,
-        .read = seq_read,
-        .llseek = seq_lseek,
-        .write = psi_io_write,
-        .poll = psi_fop_poll,
-        .release = psi_fop_release,
+static const struct proc_ops psi_io_proc_ops = {
+        .proc_open = psi_io_open,
+        .proc_read = seq_read,
+        .proc_lseek = seq_lseek,
+        .proc_write = psi_io_write,
+        .proc_poll = psi_fop_poll,
+        .proc_release = psi_fop_release,
 };
 
-static const struct file_operations psi_memory_fops = {
-        .open = psi_memory_open,
-        .read = seq_read,
-        .llseek = seq_lseek,
-        .write = psi_memory_write,
-        .poll = psi_fop_poll,
-        .release = psi_fop_release,
+static const struct proc_ops psi_memory_proc_ops = {
+        .proc_open = psi_memory_open,
+        .proc_read = seq_read,
+        .proc_lseek = seq_lseek,
+        .proc_write = psi_memory_write,
+        .proc_poll = psi_fop_poll,
+        .proc_release = psi_fop_release,
 };
 
-static const struct file_operations psi_cpu_fops = {
-        .open = psi_cpu_open,
-        .read = seq_read,
-        .llseek = seq_lseek,
-        .write = psi_cpu_write,
-        .poll = psi_fop_poll,
-        .release = psi_fop_release,
+static const struct proc_ops psi_cpu_proc_ops = {
+        .proc_open = psi_cpu_open,
+        .proc_read = seq_read,
+        .proc_lseek = seq_lseek,
+        .proc_write = psi_cpu_write,
+        .proc_poll = psi_fop_poll,
+        .proc_release = psi_fop_release,
 };
 
 static int __init psi_proc_init(void)
 {
-        proc_mkdir("pressure", NULL);
-        proc_create("pressure/io", 0, NULL, &psi_io_fops);
-        proc_create("pressure/memory", 0, NULL, &psi_memory_fops);
-        proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops);
+        if (psi_enable) {
+                proc_mkdir("pressure", NULL);
+                proc_create("pressure/io", 0, NULL, &psi_io_proc_ops);
+                proc_create("pressure/memory", 0, NULL, &psi_memory_proc_ops);
+                proc_create("pressure/cpu", 0, NULL, &psi_cpu_proc_ops);
+        }
         return 0;
 }
 module_init(psi_proc_init);