2024-05-10 61598093bbdd283a7edc367d900f223070ead8d2
kernel/kernel/sched/cputime.c
@@ -1,8 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Simple CPU accounting cgroup controller
  */
 #include <linux/cpufreq_times.h>
 #include "sched.h"
+#include <trace/hooks/sched.h>
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 
@@ -18,6 +20,7 @@
  * compromise in place of having locks on each irq in account_system_time.
  */
 DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
+EXPORT_PER_CPU_SYMBOL_GPL(cpu_irqtime);
 
 static int sched_clock_irqtime;
 
@@ -70,6 +73,8 @@
 		irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
 	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
 		irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
+
+	trace_android_rvh_account_irq(curr, cpu, delta);
 }
 EXPORT_SYMBOL_GPL(irqtime_account_irq);
 
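The hunk above exports cpu_irqtime and invokes the restricted vendor hook trace_android_rvh_account_irq() so a vendor module can observe per-CPU irq time. A minimal sketch of such a consumer follows; the registration helper register_trace_android_rvh_account_irq() and the handler prototype are assumptions about the DECLARE_RESTRICTED_HOOK() machinery in the Android common kernel's <trace/hooks/sched.h>, not something defined by this diff.

#include <linux/module.h>
#include <linux/sched.h>
#include <trace/hooks/sched.h>

/* Hypothetical handler; the (void *data, curr, cpu, delta) shape mirrors the
 * trace_android_rvh_account_irq(curr, cpu, delta) call site above. */
static void vendor_account_irq(void *data, struct task_struct *curr,
			       int cpu, s64 delta)
{
	/* e.g. fold delta into a vendor-private per-CPU irq-time counter */
}

static int __init vendor_irqtime_init(void)
{
	/* Restricted vendor hooks cannot be unregistered, so registration is
	 * done once at init and the module is never unloaded. Helper name is
	 * an assumption about the Android tree. */
	return register_trace_android_rvh_account_irq(vendor_account_irq, NULL);
}
module_init(vendor_irqtime_init);
MODULE_LICENSE("GPL");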
@@ -361,7 +366,7 @@
  * softirq as those do not count in task exec_runtime any more.
  */
 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
-					 struct rq *rq, int ticks)
+					 int ticks)
 {
 	u64 other, cputime = TICK_NSEC * ticks;
 
@@ -387,51 +392,48 @@
 		account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
 	} else if (user_tick) {
 		account_user_time(p, cputime);
-	} else if (p == rq->idle) {
+	} else if (p == this_rq()->idle) {
 		account_idle_time(cputime);
 	} else if (p->flags & PF_VCPU) { /* System time or guest time */
 		account_guest_time(p, cputime);
 	} else {
 		account_system_index_time(p, cputime, CPUTIME_SYSTEM);
 	}
+	trace_android_vh_irqtime_account_process_tick(p, this_rq(), user_tick, ticks);
 }
 
 static void irqtime_account_idle_ticks(int ticks)
 {
-	struct rq *rq = this_rq();
-
-	irqtime_account_process_tick(current, 0, rq, ticks);
+	irqtime_account_process_tick(current, 0, ticks);
 }
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 static inline void irqtime_account_idle_ticks(int ticks) { }
 static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
-						struct rq *rq, int nr_ticks) { }
+						int nr_ticks) { }
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 
 /*
  * Use precise platform statistics if available:
  */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+
 # ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_common_task_switch(struct task_struct *prev)
+void vtime_task_switch(struct task_struct *prev)
 {
 	if (is_idle_task(prev))
 		vtime_account_idle(prev);
 	else
-		vtime_account_system(prev);
+		vtime_account_kernel(prev);
 
 	vtime_flush(prev);
 	arch_vtime_task_switch(prev);
 }
 # endif
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
 
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 /*
  * Archs that account the whole time spent in the idle task
  * (outside irq) as idle time can rely on this and just implement
- * vtime_account_system() and vtime_account_idle(). Archs that
+ * vtime_account_kernel() and vtime_account_idle(). Archs that
  * have other meaning of the idle time (s390 only includes the
  * time spent by the CPU when it's in low power mode) must override
  * vtime_account().
@@ -442,7 +444,7 @@
 	if (!in_interrupt() && is_idle_task(tsk))
 		vtime_account_idle(tsk);
 	else
-		vtime_account_system(tsk);
+		vtime_account_kernel(tsk);
 }
 EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
 #endif /* __ARCH_HAS_VTIME_ACCOUNT */
@@ -470,6 +472,7 @@
 	*ut = cputime.utime;
 	*st = cputime.stime;
 }
+EXPORT_SYMBOL_GPL(thread_group_cputime_adjusted);
 
 #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */
 
@@ -481,13 +484,13 @@
 void account_process_tick(struct task_struct *p, int user_tick)
 {
 	u64 cputime, steal;
-	struct rq *rq = this_rq();
 
-	if (vtime_accounting_cpu_enabled())
+	if (vtime_accounting_enabled_this_cpu())
 		return;
+	trace_android_vh_account_task_time(p, this_rq(), user_tick);
 
 	if (sched_clock_irqtime) {
-		irqtime_account_process_tick(p, user_tick, rq, 1);
+		irqtime_account_process_tick(p, user_tick, 1);
 		return;
 	}
 
@@ -501,7 +504,7 @@
 
 	if (user_tick)
 		account_user_time(p, cputime);
-	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
+	else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET))
 		account_system_time(p, HARDIRQ_OFFSET, cputime);
 	else
 		account_idle_time(cputime);
@@ -528,50 +531,6 @@
 
 	cputime -= steal;
 	account_idle_time(cputime);
-}
-
-/*
- * Perform (stime * rtime) / total, but avoid multiplication overflow by
- * loosing precision when the numbers are big.
- */
-static u64 scale_stime(u64 stime, u64 rtime, u64 total)
-{
-	u64 scaled;
-
-	for (;;) {
-		/* Make sure "rtime" is the bigger of stime/rtime */
-		if (stime > rtime)
-			swap(rtime, stime);
-
-		/* Make sure 'total' fits in 32 bits */
-		if (total >> 32)
-			goto drop_precision;
-
-		/* Does rtime (and thus stime) fit in 32 bits? */
-		if (!(rtime >> 32))
-			break;
-
-		/* Can we just balance rtime/stime rather than dropping bits? */
-		if (stime >> 31)
-			goto drop_precision;
-
-		/* We can grow stime and shrink rtime and try to make them both fit */
-		stime <<= 1;
-		rtime >>= 1;
-		continue;
-
-drop_precision:
-		/* We drop from rtime, it has more bits than stime */
-		rtime >>= 1;
-		total >>= 1;
-	}
-
-	/*
-	 * Make sure gcc understands that this is a 32x32->64 multiply,
-	 * followed by a 64/32->64 divide.
-	 */
-	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
-	return scaled;
 }
 
 /*
@@ -633,7 +592,7 @@
 		goto update;
 	}
 
-	stime = scale_stime(stime, rtime, stime + utime);
+	stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
 
 update:
 	/*
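The removed scale_stime() loop approximated stime * rtime / (stime + utime) by shifting precision away until the operands fit in 32 bits; the replacement mul_u64_u64_div_u64() performs the same division with a full 64x64->128-bit intermediate where the hardware or compiler allows it. A small userspace sketch of the arithmetic, with local names and assuming a compiler that provides __uint128_t:

#include <stdint.h>
#include <stdio.h>

/* Same math as mul_u64_u64_div_u64(stime, rtime, total) when 128-bit
 * arithmetic is available: no precision is dropped before the divide. */
static uint64_t scale_stime_sketch(uint64_t stime, uint64_t rtime, uint64_t total)
{
	return (uint64_t)(((__uint128_t)stime * rtime) / total);
}

int main(void)
{
	/* Values large enough that a plain 64-bit multiply would overflow. */
	uint64_t stime = 3ULL << 40, utime = 1ULL << 40;
	uint64_t rtime = 5ULL << 40;

	printf("scaled stime = %llu\n",
	       (unsigned long long)scale_stime_sketch(stime, rtime, stime + utime));
	return 0;
}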
@@ -684,6 +643,8 @@
 	thread_group_cputime(p, &cputime);
 	cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
 }
+EXPORT_SYMBOL_GPL(thread_group_cputime_adjusted);
+
 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
@@ -717,8 +678,8 @@
 	return delta - other;
 }
 
-static void __vtime_account_system(struct task_struct *tsk,
-				   struct vtime *vtime)
+static void vtime_account_system(struct task_struct *tsk,
+				 struct vtime *vtime)
 {
 	vtime->stime += get_vtime_delta(vtime);
 	if (vtime->stime >= TICK_NSEC) {
@@ -737,7 +698,17 @@
 	}
 }
 
-void vtime_account_system(struct task_struct *tsk)
+static void __vtime_account_kernel(struct task_struct *tsk,
+				   struct vtime *vtime)
+{
+	/* We might have scheduled out from guest path */
+	if (vtime->state == VTIME_GUEST)
+		vtime_account_guest(tsk, vtime);
+	else
+		vtime_account_system(tsk, vtime);
+}
+
+void vtime_account_kernel(struct task_struct *tsk)
 {
 	struct vtime *vtime = &tsk->vtime;
 
@@ -745,11 +716,7 @@
 		return;
 
 	write_seqcount_begin(&vtime->seqcount);
-	/* We might have scheduled out from guest path */
-	if (tsk->flags & PF_VCPU)
-		vtime_account_guest(tsk, vtime);
-	else
-		__vtime_account_system(tsk, vtime);
+	__vtime_account_kernel(tsk, vtime);
 	write_seqcount_end(&vtime->seqcount);
 }
 
@@ -758,7 +725,7 @@
 	struct vtime *vtime = &tsk->vtime;
 
 	write_seqcount_begin(&vtime->seqcount);
-	__vtime_account_system(tsk, vtime);
+	vtime_account_system(tsk, vtime);
 	vtime->state = VTIME_USER;
 	write_seqcount_end(&vtime->seqcount);
 }
@@ -788,8 +755,9 @@
 	 * that can thus safely catch up with a tickless delta.
 	 */
 	write_seqcount_begin(&vtime->seqcount);
-	__vtime_account_system(tsk, vtime);
+	vtime_account_system(tsk, vtime);
 	tsk->flags |= PF_VCPU;
+	vtime->state = VTIME_GUEST;
 	write_seqcount_end(&vtime->seqcount);
 }
 EXPORT_SYMBOL_GPL(vtime_guest_enter);
@@ -801,6 +769,7 @@
 	write_seqcount_begin(&vtime->seqcount);
 	vtime_account_guest(tsk, vtime);
 	tsk->flags &= ~PF_VCPU;
+	vtime->state = VTIME_SYS;
 	write_seqcount_end(&vtime->seqcount);
 }
 EXPORT_SYMBOL_GPL(vtime_guest_exit);
@@ -810,19 +779,30 @@
 	account_idle_time(get_vtime_delta(&tsk->vtime));
 }
 
-void arch_vtime_task_switch(struct task_struct *prev)
+void vtime_task_switch_generic(struct task_struct *prev)
 {
 	struct vtime *vtime = &prev->vtime;
 
 	write_seqcount_begin(&vtime->seqcount);
+	if (vtime->state == VTIME_IDLE)
+		vtime_account_idle(prev);
+	else
+		__vtime_account_kernel(prev, vtime);
 	vtime->state = VTIME_INACTIVE;
+	vtime->cpu = -1;
 	write_seqcount_end(&vtime->seqcount);
 
 	vtime = &current->vtime;
 
 	write_seqcount_begin(&vtime->seqcount);
-	vtime->state = VTIME_SYS;
+	if (is_idle_task(current))
+		vtime->state = VTIME_IDLE;
+	else if (current->flags & PF_VCPU)
+		vtime->state = VTIME_GUEST;
+	else
+		vtime->state = VTIME_SYS;
 	vtime->starttime = sched_clock();
+	vtime->cpu = smp_processor_id();
 	write_seqcount_end(&vtime->seqcount);
 }
 
@@ -833,8 +813,9 @@
 
 	local_irq_save(flags);
 	write_seqcount_begin(&vtime->seqcount);
-	vtime->state = VTIME_SYS;
+	vtime->state = VTIME_IDLE;
 	vtime->starttime = sched_clock();
+	vtime->cpu = cpu;
 	write_seqcount_end(&vtime->seqcount);
 	local_irq_restore(flags);
 }
@@ -852,7 +833,7 @@
 		seq = read_seqcount_begin(&vtime->seqcount);
 
 		gtime = t->gtime;
-		if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
+		if (vtime->state == VTIME_GUEST)
 			gtime += vtime->gtime + vtime_delta(vtime);
 
 	} while (read_seqcount_retry(&vtime->seqcount, seq));
@@ -883,20 +864,233 @@
 		*utime = t->utime;
 		*stime = t->stime;
 
-		/* Task is sleeping, nothing to add */
-		if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
+		/* Task is sleeping or idle, nothing to add */
+		if (vtime->state < VTIME_SYS)
 			continue;
 
 		delta = vtime_delta(vtime);
 
 		/*
-		 * Task runs either in user or kernel space, add pending nohz time to
-		 * the right place.
+		 * Task runs either in user (including guest) or kernel space,
+		 * add pending nohz time to the right place.
 		 */
-		if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
-			*utime += vtime->utime + delta;
-		else if (vtime->state == VTIME_SYS)
+		if (vtime->state == VTIME_SYS)
 			*stime += vtime->stime + delta;
+		else
+			*utime += vtime->utime + delta;
 	} while (read_seqcount_retry(&vtime->seqcount, seq));
 }
+
+static int vtime_state_fetch(struct vtime *vtime, int cpu)
+{
+	int state = READ_ONCE(vtime->state);
+
+	/*
+	 * We raced against a context switch, fetch the
+	 * kcpustat task again.
+	 */
+	if (vtime->cpu != cpu && vtime->cpu != -1)
+		return -EAGAIN;
+
+	/*
+	 * Two possible things here:
+	 * 1) We are seeing the scheduling out task (prev) or any past one.
+	 * 2) We are seeing the scheduling in task (next) but it hasn't
+	 *    passed though vtime_task_switch() yet so the pending
+	 *    cputime of the prev task may not be flushed yet.
+	 *
+	 * Case 1) is ok but 2) is not. So wait for a safe VTIME state.
+	 */
+	if (state == VTIME_INACTIVE)
+		return -EAGAIN;
+
+	return state;
+}
+
+static u64 kcpustat_user_vtime(struct vtime *vtime)
+{
+	if (vtime->state == VTIME_USER)
+		return vtime->utime + vtime_delta(vtime);
+	else if (vtime->state == VTIME_GUEST)
+		return vtime->gtime + vtime_delta(vtime);
+	return 0;
+}
+
+static int kcpustat_field_vtime(u64 *cpustat,
+				struct task_struct *tsk,
+				enum cpu_usage_stat usage,
+				int cpu, u64 *val)
+{
+	struct vtime *vtime = &tsk->vtime;
+	unsigned int seq;
+
+	do {
+		int state;
+
+		seq = read_seqcount_begin(&vtime->seqcount);
+
+		state = vtime_state_fetch(vtime, cpu);
+		if (state < 0)
+			return state;
+
+		*val = cpustat[usage];
+
+		/*
+		 * Nice VS unnice cputime accounting may be inaccurate if
+		 * the nice value has changed since the last vtime update.
+		 * But proper fix would involve interrupting target on nice
+		 * updates which is a no go on nohz_full (although the scheduler
+		 * may still interrupt the target if rescheduling is needed...)
+		 */
+		switch (usage) {
+		case CPUTIME_SYSTEM:
+			if (state == VTIME_SYS)
+				*val += vtime->stime + vtime_delta(vtime);
+			break;
+		case CPUTIME_USER:
+			if (task_nice(tsk) <= 0)
+				*val += kcpustat_user_vtime(vtime);
+			break;
+		case CPUTIME_NICE:
+			if (task_nice(tsk) > 0)
+				*val += kcpustat_user_vtime(vtime);
+			break;
+		case CPUTIME_GUEST:
+			if (state == VTIME_GUEST && task_nice(tsk) <= 0)
+				*val += vtime->gtime + vtime_delta(vtime);
+			break;
+		case CPUTIME_GUEST_NICE:
+			if (state == VTIME_GUEST && task_nice(tsk) > 0)
+				*val += vtime->gtime + vtime_delta(vtime);
+			break;
+		default:
+			break;
+		}
+	} while (read_seqcount_retry(&vtime->seqcount, seq));
+
+	return 0;
+}
+
+u64 kcpustat_field(struct kernel_cpustat *kcpustat,
+		   enum cpu_usage_stat usage, int cpu)
+{
+	u64 *cpustat = kcpustat->cpustat;
+	u64 val = cpustat[usage];
+	struct rq *rq;
+	int err;
+
+	if (!vtime_accounting_enabled_cpu(cpu))
+		return val;
+
+	rq = cpu_rq(cpu);
+
+	for (;;) {
+		struct task_struct *curr;
+
+		rcu_read_lock();
+		curr = rcu_dereference(rq->curr);
+		if (WARN_ON_ONCE(!curr)) {
+			rcu_read_unlock();
+			return cpustat[usage];
+		}
+
+		err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val);
+		rcu_read_unlock();
+
+		if (!err)
+			return val;
+
+		cpu_relax();
+	}
+}
+EXPORT_SYMBOL_GPL(kcpustat_field);
+
+static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
+				    const struct kernel_cpustat *src,
+				    struct task_struct *tsk, int cpu)
+{
+	struct vtime *vtime = &tsk->vtime;
+	unsigned int seq;
+
+	do {
+		u64 *cpustat;
+		u64 delta;
+		int state;
+
+		seq = read_seqcount_begin(&vtime->seqcount);
+
+		state = vtime_state_fetch(vtime, cpu);
+		if (state < 0)
+			return state;
+
+		*dst = *src;
+		cpustat = dst->cpustat;
+
+		/* Task is sleeping, dead or idle, nothing to add */
+		if (state < VTIME_SYS)
+			continue;
+
+		delta = vtime_delta(vtime);
+
+		/*
+		 * Task runs either in user (including guest) or kernel space,
+		 * add pending nohz time to the right place.
+		 */
+		if (state == VTIME_SYS) {
+			cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
+		} else if (state == VTIME_USER) {
+			if (task_nice(tsk) > 0)
+				cpustat[CPUTIME_NICE] += vtime->utime + delta;
+			else
+				cpustat[CPUTIME_USER] += vtime->utime + delta;
+		} else {
+			WARN_ON_ONCE(state != VTIME_GUEST);
+			if (task_nice(tsk) > 0) {
+				cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
+				cpustat[CPUTIME_NICE] += vtime->gtime + delta;
+			} else {
+				cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
+				cpustat[CPUTIME_USER] += vtime->gtime + delta;
+			}
+		}
+	} while (read_seqcount_retry(&vtime->seqcount, seq));
+
+	return 0;
+}
+
+void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
+{
+	const struct kernel_cpustat *src = &kcpustat_cpu(cpu);
+	struct rq *rq;
+	int err;
+
+	if (!vtime_accounting_enabled_cpu(cpu)) {
+		*dst = *src;
+		return;
+	}
+
+	rq = cpu_rq(cpu);
+
+	for (;;) {
+		struct task_struct *curr;
+
+		rcu_read_lock();
+		curr = rcu_dereference(rq->curr);
+		if (WARN_ON_ONCE(!curr)) {
+			rcu_read_unlock();
+			*dst = *src;
+			return;
+		}
+
+		err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu);
+		rcu_read_unlock();
+
+		if (!err)
+			return;
+
+		cpu_relax();
+	}
+}
+EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch);
+
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
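The kcpustat_field() and kcpustat_cpu_fetch() accessors added in the last hunk let readers such as the /proc files fold in the pending, not-yet-flushed vtime delta of the task currently running on a nohz_full CPU. A minimal sketch of a kernel-side consumer; the helper name below is illustrative and not taken from this diff, while kcpustat_field() and kcpustat_cpu() are the interfaces shown above.

#include <linux/kernel_stat.h>
#include <linux/cpumask.h>

/* Illustrative only: sum user time across CPUs through the vtime-aware
 * accessor instead of reading kcpustat_cpu(cpu).cpustat[] directly, so
 * tickless CPUs report up-to-date values. */
static u64 total_user_time_ns(void)
{
	u64 user = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		user += kcpustat_field(&kcpustat_cpu(cpu), CPUTIME_USER, cpu);

	return user;
}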