irq-am library helps I/O devices implement interrupt moderation in
an adaptive fashion, based on online stats.

The consumer initializes an irq-am context with a callback that performs the
device specific moderation programming, and with the number of am (adaptive
moderation) levels. The levels themselves are abstract, which allows for
device specific tuning.

The irq-am code samples once every nr_events events and checks for a
significant change in workload characteristics (completions per second,
events per second). If it detects one, it performs an am level update
(called a step).

The irq-am code assumes that the am levels are sorted in increasing order,
where the lowest level corresponds to the optimum latency tuning (short time
and low completion-count), gradually increasing towards the optimum
throughput tuning (longer time and higher completion-count). So there is a
trend and a tuning direction tracked by the moderator. When the moderator has
collected sufficient statistics (also controlled by the consumer, which
defines nr_events), it compares the current stats with the previous stats.
If a significant change was observed in the load, the moderator attempts to
increment/decrement its current level (a step) and schedules a program
dispatch work.

Signed-off-by: Sagi Grimberg <s...@grimberg.me>
---
 include/linux/irq-am.h | 116 +++++++++++++++++++++++++++++++
 lib/Kconfig            |   5 ++
 lib/Makefile           |   1 +
 lib/irq-am.c           | 182 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 304 insertions(+)
 create mode 100644 include/linux/irq-am.h
 create mode 100644 lib/irq-am.c

diff --git a/include/linux/irq-am.h b/include/linux/irq-am.h
new file mode 100644
index 000000000000..5ddd5ca268aa
--- /dev/null
+++ b/include/linux/irq-am.h
@@ -0,0 +1,116 @@
+/*
+ * Adaptive moderation support for I/O devices.
+ * Copyright (c) 2018 Lightbits Labs.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#ifndef _IRQ_AM_H
+#define _IRQ_AM_H
+
+#include <linux/ktime.h>
+#include <linux/workqueue.h>
+
+struct irq_am;
+/*
+ * irq_am_fn - device specific moderation programming callback
+ *
+ * Called from the irq-am work item with the moderation context and the
+ * level to program. A non-zero return value triggers a one-time warning.
+ */
+typedef int (irq_am_fn)(struct irq_am *, unsigned short level);
+
+/*
+ * struct irq_am_sample_stats - sample stats for adaptive moderation
+ * @cps:        completions per-second
+ * @eps:        events per-second
+ * @cpe:        completions per event
+ */
+struct irq_am_sample_stats {
+       u32 cps;
+       u32 eps;
+       u32 cpe;
+};
+
+/*
+ * struct irq_am_sample - per-irq interrupt batch sample unit
+ * @time:      time at which the sample was taken
+ * @comps:     value of the running completions counter at sample time
+ * @events:    value of the running events counter at sample time
+ */
+struct irq_am_sample {
+       ktime_t time;
+       u64     comps;
+       u64     events;
+};
+
+/*
+ * enum irq_am_state - adaptive moderation monitor states
+ * @IRQ_AM_START_MEASURING:        collect first sample (start_sample)
+ * @IRQ_AM_MEASURING:              measurement in progress
+ * @IRQ_AM_PROGRAM_MODERATION:     moderation program scheduled,
+ *                                 so we should not react to any stats
+ *                                 from the old moderation profile.
+ */
+enum irq_am_state {
+       IRQ_AM_START_MEASURING,
+       IRQ_AM_MEASURING,
+       IRQ_AM_PROGRAM_MODERATION,
+};
+
+/*
+ * enum irq_am_tune_state - direction in which the moderation level is stepped
+ * @IRQ_AM_GOING_UP:       steps increment the current level
+ * @IRQ_AM_GOING_DOWN:     steps decrement the current level
+ */
+enum irq_am_tune_state {
+       IRQ_AM_GOING_UP,
+       IRQ_AM_GOING_DOWN,
+};
+
+/*
+ * enum irq_am_relative_diff - current stats compared to the previous stats
+ * @IRQ_AM_STATS_WORSE:    stats degraded, the tuning direction is reversed
+ * @IRQ_AM_STATS_SAME:     no significant change, no step is taken
+ * @IRQ_AM_STATS_BETTER:   stats improved, a step in the same direction is tried
+ */
+enum irq_am_relative_diff {
+       IRQ_AM_STATS_WORSE,
+       IRQ_AM_STATS_SAME,
+       IRQ_AM_STATS_BETTER,
+};
+
+/*
+ * struct irq_am_stats - running counters of a moderation context
+ * @events:    total events, incremented by irq_am_add_event()
+ * @comps:     total completions, accumulated by irq_am_add_comps()
+ */
+struct irq_am_stats {
+       u64     events;
+       u64     comps;
+};
+
+/*
+ * struct irq_am - irq adaptive moderation monitor
+ * @state:             adaptive moderation monitor state
+ * @tune_state:        tuning state (direction) of the moderation monitor
+ * @am_stats:          overall completions and events counters
+ * @start_sample:      first sample in moderation batch
+ * @prev_stats:        previous stats for trend detection
+ * @nr_events:         number of events between samples (stored as u16)
+ * @nr_levels:         number of moderation levels
+ * @curr_level:        current moderation level
+ * @work:              work item that runs the moderation program handler
+ * @program:           device specific moderation program handler
+ */
+struct irq_am {
+       enum irq_am_state               state;
+       enum irq_am_tune_state          tune_state;
+
+       struct irq_am_stats             am_stats;
+       struct irq_am_sample            start_sample;
+       struct irq_am_sample_stats      prev_stats;
+
+       u16                             nr_events;
+       unsigned short                  nr_levels;
+       unsigned short                  curr_level;
+
+       struct work_struct              work;
+       irq_am_fn                       *program;
+};
+
+/* Account one event; every nr_events events the stats are re-evaluated. */
+void irq_am_add_event(struct irq_am *am);
+/* Add @n completions to the running completions counter of @am. */
+static inline void irq_am_add_comps(struct irq_am *am, u64 n)
+{
+       am->am_stats.comps += n;
+}
+
+/* Flush the moderation programming work item of @am. */
+void irq_am_cleanup(struct irq_am *am);
+/*
+ * Initialize @am: sample every @nr_events events, use @nr_levels moderation
+ * levels starting at @start_level, and program the device through @fn.
+ */
+void irq_am_init(struct irq_am *am, unsigned int nr_events,
+       unsigned short nr_levels, unsigned short start_level, irq_am_fn *fn);
+
+#endif
diff --git a/lib/Kconfig b/lib/Kconfig
index 4dd5c11366f9..bbb4c9eea84d 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -504,6 +504,11 @@ config DDR
          information. This data is useful for drivers handling
          DDR SDRAM controllers.
 
+config IRQ_AM
+       bool "IRQ adaptive moderation library"
+       help
+         Helper library to implement adaptive moderation for I/O devices.
+
 config IRQ_POLL
        bool "IRQ polling library"
        help
diff --git a/lib/Makefile b/lib/Makefile
index d11c48ec8ffd..795583a685b9 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -193,6 +193,7 @@ obj-$(CONFIG_SG_SPLIT) += sg_split.o
 obj-$(CONFIG_SG_POOL) += sg_pool.o
 obj-$(CONFIG_STMP_DEVICE) += stmp_device.o
 obj-$(CONFIG_IRQ_POLL) += irq_poll.o
+obj-$(CONFIG_IRQ_AM) += irq-am.o
 
 obj-$(CONFIG_STACKDEPOT) += stackdepot.o
 KASAN_SANITIZE_stackdepot.o := n
diff --git a/lib/irq-am.c b/lib/irq-am.c
new file mode 100644
index 000000000000..ed7befd7a560
--- /dev/null
+++ b/lib/irq-am.c
@@ -0,0 +1,182 @@
+/*
+ * Adaptive moderation support for I/O devices.
+ * Copyright (c) 2018 Lightbits Labs.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#include <linux/irq-am.h>
+#include <linux/kernel.h>
+
+/*
+ * irq_am_try_step - move one level in the current tuning direction
+ * @am:        moderation context
+ *
+ * The level is left untouched when it already sits on the boundary of the
+ * current direction. Range comparisons are used instead of equality so that
+ * an empty level table (nr_levels == 0) or a level that is already past the
+ * top can never be incremented any further.
+ */
+static void irq_am_try_step(struct irq_am *am)
+{
+       if (am->tune_state == IRQ_AM_GOING_UP) {
+               /* promoted to int, so "+ 1" cannot wrap */
+               if (am->curr_level + 1 < am->nr_levels)
+                       am->curr_level++;
+       } else if (am->tune_state == IRQ_AM_GOING_DOWN) {
+               if (am->curr_level > 0)
+                       am->curr_level--;
+       }
+}
+
+/* True when the current level is the lowest or the highest level. */
+static inline bool irq_am_on_edge(struct irq_am *am)
+{
+       if (am->curr_level == 0)
+               return true;
+
+       return am->curr_level == am->nr_levels - 1;
+}
+
+/* Reverse the tuning direction, then try to take a step the new way. */
+static void irq_am_turn(struct irq_am *am)
+{
+       if (am->tune_state == IRQ_AM_GOING_UP)
+               am->tune_state = IRQ_AM_GOING_DOWN;
+       else
+               am->tune_state = IRQ_AM_GOING_UP;
+
+       irq_am_try_step(am);
+}
+
+/*
+ * irq_am_significant_diff - did @val move by more than 20% relative to @ref?
+ * @val:       current value
+ * @ref:       reference (previous) value
+ *
+ * Equivalent to (100 * |val - ref|) / ref > 20 with integer division, but
+ * computed in 64 bits and without dividing: the product can no longer
+ * overflow for large differences, and a zero @ref yields false instead of
+ * a division by zero.
+ */
+static inline bool irq_am_significant_diff(u32 val, u32 ref)
+{
+       u64 diff = val > ref ? val - ref : ref - val;
+
+       /* floor(100 * diff / ref) > 20  <=>  100 * diff >= 21 * ref */
+       return ref && 100 * diff >= 21 * (u64)ref;
+}
+
+/* more than 20% difference */
+#define IRQ_AM_SIGNIFICANT_DIFF(val, ref) irq_am_significant_diff(val, ref)
+
+/*
+ * irq_am_stats_compare - classify @curr against the previously saved stats
+ * @am:        moderation context holding the previous stats
+ * @curr:      stats of the window that just ended
+ *
+ * Returns one of the enum irq_am_relative_diff values. Completions per
+ * second are checked first, then events per second, then the special case
+ * of a single completion per event.
+ */
+static int irq_am_stats_compare(struct irq_am *am,
+               struct irq_am_sample_stats *curr)
+{
+       struct irq_am_sample_stats *last = &am->prev_stats;
+
+       /* nothing recorded yet, so there is nothing to compare against */
+       if (last->cps == 0)
+               return IRQ_AM_STATS_SAME;
+
+       /* completions per second: higher is better */
+       if (IRQ_AM_SIGNIFICANT_DIFF(curr->cps, last->cps)) {
+               if (curr->cps > last->cps)
+                       return IRQ_AM_STATS_BETTER;
+               return IRQ_AM_STATS_WORSE;
+       }
+
+       /* events per second: lower is better */
+       if (IRQ_AM_SIGNIFICANT_DIFF(curr->eps, last->eps)) {
+               if (curr->eps < last->eps)
+                       return IRQ_AM_STATS_BETTER;
+               return IRQ_AM_STATS_WORSE;
+       }
+
+       /* unless we see exactly one completion per event above level 0 ... */
+       if (curr->cpe != 1 || am->curr_level == 0)
+               return IRQ_AM_STATS_SAME;
+
+       /*
+        * ... there is no point in trying to aggregate any further, so
+        * start declining moderation: going up counts as worse, going
+        * down as better.
+        */
+       if (am->tune_state == IRQ_AM_GOING_UP)
+               return IRQ_AM_STATS_WORSE;
+
+       return IRQ_AM_STATS_BETTER;
+}
+
+/*
+ * irq_am_decision - pick the next moderation level from the latest stats
+ * @am:          moderation context
+ * @curr_stats:  stats of the window that just ended
+ *
+ * Returns true when the level changed or when the level sits on either
+ * edge of the level range. @curr_stats becomes the new reference in that
+ * case, and also when no reference has been recorded yet.
+ */
+static bool irq_am_decision(struct irq_am *am,
+               struct irq_am_sample_stats *curr_stats)
+{
+       unsigned short old_level = am->curr_level;
+       enum irq_am_relative_diff verdict;
+       bool changed;
+
+       verdict = irq_am_stats_compare(am, curr_stats);
+       if (verdict == IRQ_AM_STATS_WORSE)
+               irq_am_turn(am);
+       else if (verdict == IRQ_AM_STATS_BETTER)
+               irq_am_try_step(am);
+       /* IRQ_AM_STATS_SAME or any other value: keep the current level */
+
+       changed = irq_am_on_edge(am) || am->curr_level != old_level;
+       if (changed || am->prev_stats.cps == 0)
+               am->prev_stats = *curr_stats;
+
+       return changed;
+}
+
+/* Snapshot the current time and the running counters of @am into @s. */
+static void irq_am_sample(struct irq_am *am, struct irq_am_sample *s)
+{
+       *s = (struct irq_am_sample) {
+               .time   = ktime_get(),
+               .comps  = am->am_stats.comps,
+               .events = am->am_stats.events,
+       };
+}
+
+/*
+ * irq_am_calc_stats - derive the rates of one sampling window
+ * @am:          moderation context (nr_events is the window size)
+ * @start:       sample taken when the window opened
+ * @end:         sample taken when the window closed
+ * @curr_stats:  filled in on success
+ *
+ * Returns true when @curr_stats was filled in. Returns false for a
+ * degenerate window (no measurable time elapsed, or nr_events is zero);
+ * @curr_stats is then left untouched and must not be consumed.
+ */
+static bool irq_am_calc_stats(struct irq_am *am, struct irq_am_sample *start,
+               struct irq_am_sample *end,
+               struct irq_am_sample_stats *curr_stats)
+{
+       /* u32 holds up to 71 minutes, should be enough */
+       u32 delta_us = ktime_us_delta(end->time, start->time);
+       u32 ncomps = end->comps - start->comps;
+
+       if (!delta_us || !am->nr_events)
+               return false;
+
+       /*
+        * Widen before scaling: USEC_PER_SEC is a long, so with a 32-bit
+        * long the products would wrap once the count exceeds ~4294.
+        */
+       curr_stats->cps = DIV_ROUND_UP_ULL((u64)ncomps * USEC_PER_SEC,
+                                          delta_us);
+       curr_stats->eps = DIV_ROUND_UP_ULL((u64)am->nr_events * USEC_PER_SEC,
+                                          delta_us);
+       curr_stats->cpe = DIV_ROUND_UP(ncomps, am->nr_events);
+
+       return true;
+}
+
+/*
+ * irq_am_add_event - account one event and drive the moderation monitor
+ * @am:        moderation context
+ *
+ * While measuring, once nr_events events were seen since the start sample
+ * the window stats are computed and a level decision is taken. If the
+ * decision asks for reprogramming, the work item is scheduled and events
+ * are only counted until it has run. Otherwise a new window is started.
+ */
+void irq_am_add_event(struct irq_am *am)
+{
+       struct irq_am_sample end_sample;
+       struct irq_am_sample_stats curr_stats;
+       u16 nr_events;
+
+       am->am_stats.events++;
+
+       switch (am->state) {
+       case IRQ_AM_MEASURING:
+               nr_events = am->am_stats.events - am->start_sample.events;
+               if (nr_events < am->nr_events)
+                       break;
+
+               irq_am_sample(am, &end_sample);
+               /*
+                * Only act on a window that produced usable stats; a
+                * degenerate window falls through and is simply restarted.
+                */
+               if (irq_am_calc_stats(am, &am->start_sample, &end_sample,
+                                     &curr_stats) &&
+                   irq_am_decision(am, &curr_stats)) {
+                       am->state = IRQ_AM_PROGRAM_MODERATION;
+                       schedule_work(&am->work);
+                       break;
+               }
+               /* fall through */
+       case IRQ_AM_START_MEASURING:
+               irq_am_sample(am, &am->start_sample);
+               am->state = IRQ_AM_MEASURING;
+               break;
+       case IRQ_AM_PROGRAM_MODERATION:
+               /* stats belong to the old profile, ignore them */
+               break;
+       }
+}
+EXPORT_SYMBOL_GPL(irq_am_add_event);
+
+/*
+ * Work handler: program the current level through the consumer callback
+ * (warning once if it reports an error), then restart measuring.
+ */
+static void irq_am_program_moderation_work(struct work_struct *w)
+{
+       struct irq_am *am = container_of(w, struct irq_am, work);
+       int ret;
+
+       ret = am->program(am, am->curr_level);
+       WARN_ON_ONCE(ret);
+
+       am->state = IRQ_AM_START_MEASURING;
+}
+
+
+/*
+ * irq_am_cleanup - flush the moderation programming work item
+ * @am:        moderation context
+ *
+ * NOTE(review): nothing here stops irq_am_add_event() from scheduling the
+ * work again afterwards; presumably the consumer quiesces its event source
+ * before calling this - confirm against callers.
+ */
+void irq_am_cleanup(struct irq_am *am)
+{
+       flush_work(&am->work);
+}
+EXPORT_SYMBOL_GPL(irq_am_cleanup);
+
+/*
+ * irq_am_init - initialize an adaptive moderation context
+ * @am:           context to initialize (fully overwritten)
+ * @nr_events:    number of events between samples; clamped to [1, U16_MAX]
+ *                because it is stored in a u16
+ * @nr_levels:    number of moderation levels
+ * @start_level:  initial moderation level; must be below @nr_levels,
+ *                otherwise a warning is emitted and the highest valid
+ *                level is used instead
+ * @fn:           device specific moderation programming callback
+ *
+ * The context starts in the IRQ_AM_START_MEASURING state, tuning upwards.
+ */
+void irq_am_init(struct irq_am *am, unsigned int nr_events,
+       unsigned short nr_levels, unsigned short start_level, irq_am_fn *fn)
+{
+       /* an out-of-range start level could never be stepped back in range */
+       if (WARN_ON_ONCE(start_level >= nr_levels))
+               start_level = nr_levels ? nr_levels - 1 : 0;
+
+       memset(am, 0, sizeof(*am));
+       am->state = IRQ_AM_START_MEASURING;
+       am->tune_state = IRQ_AM_GOING_UP;
+       am->nr_levels = nr_levels;
+       /* clamp instead of silently truncating; 0 is not a valid window */
+       am->nr_events = clamp_t(unsigned int, nr_events, 1, U16_MAX);
+       am->curr_level = start_level;
+       am->program = fn;
+       INIT_WORK(&am->work, irq_am_program_moderation_work);
+}
+EXPORT_SYMBOL_GPL(irq_am_init);
-- 
2.14.1

Reply via email to