On Fri, Aug 03, 2018 at 06:56:41PM +0200, Peter Zijlstra wrote:
> On Wed, Aug 01, 2018 at 11:19:57AM -0400, Johannes Weiner wrote:
> > +static bool psi_update_stats(struct psi_group *group)
> > +{
> > +   u64 deltas[NR_PSI_STATES - 1] = { 0, };
> > +   unsigned long missed_periods = 0;
> > +   unsigned long nonidle_total = 0;
> > +   u64 now, expires, period;
> > +   int cpu;
> > +   int s;
> > +
> > +   mutex_lock(&group->stat_lock);
> > +
> > +   /*
> > +    * Collect the per-cpu time buckets and average them into a
> > +    * single time sample that is normalized to wallclock time.
> > +    *
> > +    * For averaging, each CPU is weighted by its non-idle time in
> > +    * the sampling period. This eliminates artifacts from uneven
> > +    * loading, or even entirely idle CPUs.
> > +    *
> > +    * We don't need to synchronize against CPU hotplugging. If we
> > +    * see a CPU that's online and has samples, we incorporate it.
> > +    */
> > +   for_each_online_cpu(cpu) {
> > +           struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
> > +           u32 uninitialized_var(nonidle);
> 
> urgh.. I can see why the compiler got confused. Dodgy :-)

:-) I think we can make this cleaner. Something like this (modulo the
READ_ONCE/WRITE_ONCE you pointed out in the other email)?

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index abccfddba5d5..ce6f02ada1cd 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -220,6 +220,49 @@ static bool test_state(unsigned int *tasks, enum 
psi_states state)
        }
 }
 
+static u32 read_update_delta(struct psi_group_cpu *groupc,
+                            enum psi_states state, int cpu)
+{
+       u32 time, delta;
+
+       time = READ_ONCE(groupc->times[state]);
+       /*
+        * In addition to already concluded states, we also
+        * incorporate currently active states on the CPU, since
+        * states may last for many sampling periods.
+        *
+        * This way we keep our delta sampling buckets small (u32) and
+        * our reported pressure close to what's actually happening.
+        */
+       if (test_state(groupc->tasks, state)) {
+               /*
+                * We can race with a state change and need to make
+                * sure the state_start update is ordered against the
+                * updates to the live state and the time buckets
+                * (groupc->times).
+                *
+                * 1. If we observe task state that needs to be
+                * recorded, make sure we see state_start from when
+                * that state went into effect or we'll count time
+                * from the previous state.
+                *
+                * 2. If the time delta has already been added to the
+                * bucket, make sure we don't see it in state_start or
+                * we'll count it twice.
+                *
+                * If the time delta is out of state_start but not in
+                * the time bucket yet, we'll miss it entirely and
+                * handle it in the next period.
+                */
+               smp_rmb();
+               time += cpu_clock(cpu) - groupc->state_start;
+       }
+       delta = time - groupc->times_prev[state];
+       groupc->times_prev[state] = time;
+
+       return delta;
+}
+
 static bool psi_update_stats(struct psi_group *group)
 {
        u64 deltas[NR_PSI_STATES - 1] = { 0, };
@@ -244,60 +287,17 @@ static bool psi_update_stats(struct psi_group *group)
         */
        for_each_online_cpu(cpu) {
                struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
-               u32 uninitialized_var(nonidle);
-
-               BUILD_BUG_ON(PSI_NONIDLE != NR_PSI_STATES - 1);
-
-               for (s = PSI_NONIDLE; s >= 0; s--) {
-                       u32 time, delta;
-
-                       time = READ_ONCE(groupc->times[s]);
-                       /*
-                        * In addition to already concluded states, we
-                        * also incorporate currently active states on
-                        * the CPU, since states may last for many
-                        * sampling periods.
-                        *
-                        * This way we keep our delta sampling buckets
-                        * small (u32) and our reported pressure close
-                        * to what's actually happening.
-                        */
-                       if (test_state(groupc->tasks, cpu, s)) {
-                               /*
-                                * We can race with a state change and
-                                * need to make sure the state_start
-                                * update is ordered against the
-                                * updates to the live state and the
-                                * time buckets (groupc->times).
-                                *
-                                * 1. If we observe task state that
-                                * needs to be recorded, make sure we
-                                * see state_start from when that
-                                * state went into effect or we'll
-                                * count time from the previous state.
-                                *
-                                * 2. If the time delta has already
-                                * been added to the bucket, make sure
-                                * we don't see it in state_start or
-                                * we'll count it twice.
-                                *
-                                * If the time delta is out of
-                                * state_start but not in the time
-                                * bucket yet, we'll miss it entirely
-                                * and handle it in the next period.
-                                */
-                               smp_rmb();
-                               time += cpu_clock(cpu) - groupc->state_start;
-                       }
-                       delta = time - groupc->times_prev[s];
-                       groupc->times_prev[s] = time;
-
-                       if (s == PSI_NONIDLE) {
-                               nonidle = nsecs_to_jiffies(delta);
-                               nonidle_total += nonidle;
-                       } else {
-                               deltas[s] += (u64)delta * nonidle;
-                       }
+               u32 nonidle;
+
+               nonidle = read_update_delta(groupc, PSI_NONIDLE, cpu);
+               nonidle = nsecs_to_jiffies(nonidle);
+               nonidle_total += nonidle;
+
+               for (s = 0; s < PSI_NONIDLE; s++) {
+                       u32 delta;
+
+                       delta = read_update_delta(groupc, s, cpu);
+                       deltas[s] += (u64)delta * nonidle;
                }
        }
 

Reply via email to