Signed-off-by: Brian Brooks <[email protected]>
---
.../linux-generic/include/odp_config_internal.h | 80 ++++++++++++++++++++++
1 file changed, 80 insertions(+)
diff --git a/platform/linux-generic/include/odp_config_internal.h
b/platform/linux-generic/include/odp_config_internal.h
index 09e82bb9..3bc89017 100644
--- a/platform/linux-generic/include/odp_config_internal.h
+++ b/platform/linux-generic/include/odp_config_internal.h
@@ -136,6 +136,21 @@ extern "C" {
*/
#define CONFIG_BURST_SIZE 16
+/* Default weight (in events) for weighted round robin (WRR) scheduling in
+ * the scalable scheduler.
+ *
+ * This controls the per-queue weight for WRR between queues of the same
+ * priority. A higher value improves throughput, while a lower value
+ * increases fairness and thus likely decreases latency.
+ *
+ * If WRR is undesired, set the value to ~0, which selects the largest
+ * possible weight.
+ *
+ * Note: an API for specifying this on a per-queue basis would be useful but is
+ * not yet available.
+ */
+#define CONFIG_WRR_WEIGHT 64
+
/*
* Maximum number of events in a pool
*/
@@ -146,6 +161,71 @@ extern "C" {
*/
#define CONFIG_POOL_CACHE_SIZE 256
+/*
+ * Split queue producer/consumer metadata into separate cache lines.
+ * This is beneficial on e.g. Cortex-A57 but not so much on A53.
+ */
+#define ODP_CONFIG_USE_SPLIT_PRODCONS
+
+/*
+ * Split queue read/write metadata into separate cache lines.
+ * This enhances scalability even further on Cortex-A57.
+ */
+#define ODP_CONFIG_USE_SPLIT_READWRITE
+
+/*
+ * Use locks to protect queue (ring buffer) and scheduler state updates on
+ * non-ARM targets. On x86, this lowers overhead but degrades scalability.
+ */
+#ifndef __ARM_ARCH
+#define CONFIG_QSCHST_LOCK
+#endif
+
+#ifdef CONFIG_QSCHST_LOCK
+/* Keep all ring buffer/qschst data together when using locks
+ */
+#undef ODP_CONFIG_USE_SPLIT_PRODCONS
+#undef ODP_CONFIG_USE_SPLIT_READWRITE
+#endif
+
+/*
+ * Size of atomic bit set. This limits the max number of threads,
+ * scheduler groups and reorder windows. On ARMv8/64-bit and x86-64, the
+ * (lock-free) max is 128
+ */
+#define ATOM_BITSET_SIZE 64
+
+/*
+ * Use LLD/SCD atomic primitives instead of the lock-based code path in
+ * llqueue. On ARM, LLD/SCD is the fastest way to enqueue and dequeue
+ * elements from a linked list queue.
+ */
+#ifdef __ARM_ARCH
+#define ODP_CONFIG_LLDSCD
+#endif
+
+/*
+ * Use DMB;STR instead of STLR on ARM.
+ * On early ARMv8 implementations (e.g. Cortex-A57) this is noticeably more
+ * performant than using store-release.
+ * This also allows for load-only barriers (DMB ISHLD), which are much cheaper
+ * than a full barrier.
+ */
+#ifdef __ARM_ARCH
+#define ODP_CONFIG_DMBSTR
+#endif
+
+/*
+ * Use the ARM event signalling mechanism (WFE).
+ * Event signalling minimises spinning (busy waiting), which decreases
+ * cache coherency traffic when spinning on shared locations (thus faster and
+ * more scalable) and enables the CPU to enter a sleep state (lower power
+ * consumption).
+ */
+#ifdef __ARM_ARCH
+#define ODP_CONFIG_USE_WFE
+#endif
+
#ifdef __cplusplus
}
#endif
--
2.12.2