Currently it is possible for an NMI (or FIQ on ARM) to come in and
read sched_clock() whilst update_sched_clock() has locked the seqcount
for writing. This results in the NMI handler locking up when it calls
raw_read_seqcount_begin().

This patch fixes that problem by providing banked clock data in a
similar manner to Thomas Gleixner's 4396e058c52e("timekeeping: Provide
fast and NMI safe access to CLOCK_MONOTONIC").

Changing the mode of operation of the seqcount away from the traditional
LSB-means-interrupted-write to a banked approach also revealed a good deal
of "fake" write locking within sched_clock_register(). This is likely
to be a latent issue because sched_clock_register() is typically called
before we enable interrupts. Nevertheless the issue has been eliminated
by increasing the scope of the read locking performed by sched_clock().

Suggested-by: Stephen Boyd <[email protected]>
Signed-off-by: Daniel Thompson <[email protected]>
---

Notes:
    Tested on i.MX6 with perf drowning the system in FIQs and using the perf
    handler to check that sched_clock() returns monotonic values. At the
    same time I forcefully reduced kt_wrap so that update_sched_clock()
    is being called at >1000Hz.
    
    Without the patch the above system is grossly unstable, surviving
    [9K,115K,25K] perf event cycles during three separate runs. With the
    patch I ran for over 11M perf event cycles before getting bored.
    
    v2:
    
    * Extended the scope of the read lock in sched_clock() so we can bank
      all data consumed there (John Stultz)
    

 kernel/time/sched_clock.c | 157 +++++++++++++++++++++++++++++-----------------
 1 file changed, 100 insertions(+), 57 deletions(-)

diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 01d2d15aa662..a2ea66944bc1 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -18,28 +18,28 @@
 #include <linux/seqlock.h>
 #include <linux/bitops.h>

-struct clock_data {
-       ktime_t wrap_kt;
+struct clock_data_banked {
        u64 epoch_ns;
        u64 epoch_cyc;
-       seqcount_t seq;
-       unsigned long rate;
+       u64 (*read_sched_clock)(void);
+       u64 sched_clock_mask;
        u32 mult;
        u32 shift;
        bool suspended;
 };

+struct clock_data {
+       ktime_t wrap_kt;
+       seqcount_t seq;
+       unsigned long rate;
+       struct clock_data_banked bank[2];
+};
+
 static struct hrtimer sched_clock_timer;
 static int irqtime = -1;

 core_param(irqtime, irqtime, int, 0400);

-static struct clock_data cd = {
-       .mult   = NSEC_PER_SEC / HZ,
-};
-
-static u64 __read_mostly sched_clock_mask;
-
 static u64 notrace jiffy_sched_clock_read(void)
 {
        /*
@@ -49,7 +49,14 @@ static u64 notrace jiffy_sched_clock_read(void)
        return (u64)(jiffies - INITIAL_JIFFIES);
 }

-static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
+static struct clock_data cd = {
+       .bank = {
+               [0] = {
+                       .mult   = NSEC_PER_SEC / HZ,
+                       .read_sched_clock = jiffy_sched_clock_read,
+               },
+       },
+};

 static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
 {
@@ -58,50 +65,82 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 
shift)

 unsigned long long notrace sched_clock(void)
 {
-       u64 epoch_ns;
-       u64 epoch_cyc;
        u64 cyc;
        unsigned long seq;
-
-       if (cd.suspended)
-               return cd.epoch_ns;
+       struct clock_data_banked *b;
+       u64 res;

        do {
-               seq = raw_read_seqcount_begin(&cd.seq);
-               epoch_cyc = cd.epoch_cyc;
-               epoch_ns = cd.epoch_ns;
+               seq = raw_read_seqcount(&cd.seq);
+               b = cd.bank + (seq & 1);
+               if (b->suspended) {
+                       res = b->epoch_ns;
+               } else {
+                       cyc = b->read_sched_clock();
+                       cyc = (cyc - b->epoch_cyc) & b->sched_clock_mask;
+                       res = b->epoch_ns + cyc_to_ns(cyc, b->mult, b->shift);
+               }
        } while (read_seqcount_retry(&cd.seq, seq));

-       cyc = read_sched_clock();
-       cyc = (cyc - epoch_cyc) & sched_clock_mask;
-       return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift);
+       return res;
+}
+
+/*
+ * Start updating the banked clock data.
+ *
+ * sched_clock will never observe mis-matched data even if called from
+ * an NMI. We do this by maintaining an odd/even copy of the data and
+ * steering sched_clock to one or the other using a sequence counter.
+ * In order to preserve the data cache profile of sched_clock as much
+ * as possible the system reverts back to the even copy when the update
+ * completes; the odd copy is used *only* during an update.
+ *
+ * The caller is responsible for avoiding simultaneous updates.
+ */
+static struct clock_data_banked *update_bank_begin(void)
+{
+       /* update the backup (odd) bank and steer readers towards it */
+       memcpy(cd.bank + 1, cd.bank, sizeof(struct clock_data_banked));
+       raw_write_seqcount_latch(&cd.seq);
+
+       return cd.bank;
+}
+
+/*
+ * Finalize update of banked clock data.
+ *
+ * This is just a trivial switch back to the primary (even) copy.
+ */
+static void update_bank_end(void)
+{
+       raw_write_seqcount_latch(&cd.seq);
 }

 /*
  * Atomically update the sched_clock epoch.
  */
-static void notrace update_sched_clock(void)
+static void notrace update_sched_clock(bool suspended)
 {
-       unsigned long flags;
+       struct clock_data_banked *b;
        u64 cyc;
        u64 ns;

-       cyc = read_sched_clock();
-       ns = cd.epoch_ns +
-               cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
-                         cd.mult, cd.shift);
-
-       raw_local_irq_save(flags);
-       raw_write_seqcount_begin(&cd.seq);
-       cd.epoch_ns = ns;
-       cd.epoch_cyc = cyc;
-       raw_write_seqcount_end(&cd.seq);
-       raw_local_irq_restore(flags);
+       b = update_bank_begin();
+
+       cyc = b->read_sched_clock();
+       ns = b->epoch_ns + cyc_to_ns((cyc - b->epoch_cyc) & b->sched_clock_mask,
+                                    b->mult, b->shift);
+
+       b->epoch_ns = ns;
+       b->epoch_cyc = cyc;
+       b->suspended = suspended;
+
+       update_bank_end();
 }

 static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
 {
-       update_sched_clock();
+       update_sched_clock(false);
        hrtimer_forward_now(hrt, cd.wrap_kt);
        return HRTIMER_RESTART;
 }
@@ -111,9 +150,9 @@ void __init sched_clock_register(u64 (*read)(void), int 
bits,
 {
        u64 res, wrap, new_mask, new_epoch, cyc, ns;
        u32 new_mult, new_shift;
-       ktime_t new_wrap_kt;
        unsigned long r;
        char r_unit;
+       struct clock_data_banked *b;

        if (cd.rate > rate)
                return;
@@ -122,29 +161,30 @@ void __init sched_clock_register(u64 (*read)(void), int 
bits,

        /* calculate the mult/shift to convert counter ticks to ns. */
        clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600);
+       cd.rate = rate;

        new_mask = CLOCKSOURCE_MASK(bits);

        /* calculate how many ns until we wrap */
        wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask);
-       new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
+       cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
+
+       b = update_bank_begin();

        /* update epoch for new counter and update epoch_ns from old counter*/
        new_epoch = read();
-       cyc = read_sched_clock();
-       ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
-                         cd.mult, cd.shift);
+       cyc = b->read_sched_clock();
+       ns = b->epoch_ns + cyc_to_ns((cyc - b->epoch_cyc) & b->sched_clock_mask,
+                                    b->mult, b->shift);

-       raw_write_seqcount_begin(&cd.seq);
-       read_sched_clock = read;
-       sched_clock_mask = new_mask;
-       cd.rate = rate;
-       cd.wrap_kt = new_wrap_kt;
-       cd.mult = new_mult;
-       cd.shift = new_shift;
-       cd.epoch_cyc = new_epoch;
-       cd.epoch_ns = ns;
-       raw_write_seqcount_end(&cd.seq);
+       b->read_sched_clock = read;
+       b->sched_clock_mask = new_mask;
+       b->mult = new_mult;
+       b->shift = new_shift;
+       b->epoch_cyc = new_epoch;
+       b->epoch_ns = ns;
+
+       update_bank_end();

        r = rate;
        if (r >= 4000000) {
@@ -175,10 +215,10 @@ void __init sched_clock_postinit(void)
         * If no sched_clock function has been provided at that point,
         * make it the final one one.
         */
-       if (read_sched_clock == jiffy_sched_clock_read)
+       if (cd.bank[0].read_sched_clock == jiffy_sched_clock_read)
                sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);

-       update_sched_clock();
+       update_sched_clock(false);

        /*
         * Start the timer to keep sched_clock() properly updated and
@@ -191,17 +231,20 @@ void __init sched_clock_postinit(void)

 static int sched_clock_suspend(void)
 {
-       update_sched_clock();
+       update_sched_clock(true);
        hrtimer_cancel(&sched_clock_timer);
-       cd.suspended = true;
        return 0;
 }

 static void sched_clock_resume(void)
 {
-       cd.epoch_cyc = read_sched_clock();
+       struct clock_data_banked *b;
+
+       b = update_bank_begin();
+       b->epoch_cyc = b->read_sched_clock();
        hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
-       cd.suspended = false;
+       b->suspended = false;
+       update_bank_end();
 }

 static struct syscore_ops sched_clock_ops = {
--
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to