[lng-odp] [API-NEXT PATCH v2 11/16] Add scalable scheduler build config

Brian Brooks Tue, 04 Apr 2017 11:56:20 -0700

Signed-off-by: Brian Brooks <[email protected]>
---
 .../linux-generic/include/odp_config_internal.h    | 80 ++++++++++++++++++++++
 1 file changed, 80 insertions(+)


diff --git a/platform/linux-generic/include/odp_config_internal.h b/platform/linux-generic/include/odp_config_internal.h
index 09e82bb9..3bc89017 100644
--- a/platform/linux-generic/include/odp_config_internal.h
+++ b/platform/linux-generic/include/odp_config_internal.h
@@ -136,6 +136,21 @@ extern "C" {
  */
 #define CONFIG_BURST_SIZE 16
 
+/* Default weight (in events) for WRR in the scalable scheduler.
+ *
+ * This controls the per-queue weight for weighted round-robin (WRR) between
+ * queues of the same priority in the scalable scheduler.
+ * A higher value improves throughput, while a lower value increases fairness
+ * and thus likely decreases latency.
+ *
+ * If WRR is undesired, set the value to ~0, which will use the largest
+ * possible weight.
+ *
+ * Note: an API for specifying this on a per-queue basis would be useful but is
+ * not yet available
+ */
+#define CONFIG_WRR_WEIGHT 64
+
 /*
  * Maximum number of events in a pool
  */
@@ -146,6 +161,71 @@ extern "C" {
  */
 #define CONFIG_POOL_CACHE_SIZE 256
 
+/*
+ * Split queue producer/consumer metadata into separate cache lines.
+ * This is beneficial on e.g. Cortex-A57 but not so much on A53.
+ */
+#define ODP_CONFIG_USE_SPLIT_PRODCONS
+
+/*
+ * Split queue read/write metadata into separate cache lines.
+ * This enhances scalability even further on Cortex-A57.
+ */
+#define ODP_CONFIG_USE_SPLIT_READWRITE
+
+/*
+ * Use locks to protect queue (ring buffer) and scheduler state updates.
+ * On x86, this decreases overhead but also degrades scalability.
+ */
+#ifndef __ARM_ARCH
+#define CONFIG_QSCHST_LOCK
+#endif
+
+#ifdef CONFIG_QSCHST_LOCK
+/* Keep all ring buffer/qschst data together when using locks
+ */
+#undef ODP_CONFIG_USE_SPLIT_PRODCONS
+#undef ODP_CONFIG_USE_SPLIT_READWRITE
+#endif
+
+/*
+ * Size of atomic bit set. This limits the max number of threads,
+ * scheduler groups and reorder windows. On ARMv8/64-bit and x86-64, the
+ * (lock-free) max is 128
+ */
+#define ATOM_BITSET_SIZE 64
+
+/*
+ * Use LLD/SCD atomic primitives instead of lock-based code path in llqueue.
+ * On ARM, LLD/SCD is the fastest way to enqueue and dequeue elements from a
+ * linked list queue.
+ */
+#ifdef __ARM_ARCH
+#define ODP_CONFIG_LLDSCD
+#endif
+
+/*
+ * Use DMB;STR instead of STLR on ARM
+ * On early ARMv8 implementations (e.g. Cortex-A57) this is noticeably more
+ * performant than using store-release.
+ * This also allows for load-only barriers (DMB ISHLD) which are much cheaper
+ * than a full barrier
+ */
+#ifdef __ARM_ARCH
+#define ODP_CONFIG_DMBSTR
+#endif
+
+/*
+ * Use ARM event signalling mechanism
+ * Event signalling minimises spinning (busy waiting) which decreases
+ * cache coherency traffic when spinning on shared locations (thus faster and
+ * more scalable) and enables the CPU to enter a sleep state (lower power
+ * consumption).
+ */
+#ifdef __ARM_ARCH
+#define ODP_CONFIG_USE_WFE
+#endif
+
 #ifdef __cplusplus
 }
 #endif
-- 
2.12.2

[lng-odp] [API-NEXT PATCH v2 11/16] Add scalable scheduler build config

Reply via email to