On 20/06/2017, 15:58, "Savolainen, Petri (Nokia - FI/Espoo)" <petri.savolai...@nokia.com> wrote:
>> --- a/platform/linux-generic/include/odp_config_internal.h >> +++ b/platform/linux-generic/include/odp_config_internal.h >> @@ -7,9 +7,7 @@ >> #ifndef ODP_CONFIG_INTERNAL_H_ >> #define ODP_CONFIG_INTERNAL_H_ >> >> -#ifdef __cplusplus >> -extern "C" { >> -#endif >> +#include <odp_schedule_scalable_config.h> > >Why these configs need global visibility? This file should contain >general configuration options. > >> >> /* >> * Maximum number of pools >> @@ -22,6 +20,13 @@ extern "C" { >> #define ODP_CONFIG_QUEUES 1024 >> >> /* >> + * Maximum queue depth. Maximum number of elements that can be stored >>in >> a >> + * queue. This value is used only when the size is not explicitly >> provided >> + * during queue creation. >> + */ >> +#define CONFIG_QUEUE_SIZE 4096 >> + >> +/* >> * Maximum number of ordered locks per queue >> */ >> #define CONFIG_QUEUE_MAX_ORD_LOCKS 4 >> @@ -120,7 +125,7 @@ extern "C" { >> * >> * This the the number of separate SHM areas that can be reserved >> concurrently >> */ >> -#define ODPDRV_CONFIG_SHM_BLOCKS 48 >> +#define ODPDRV_CONFIG_SHM_BLOCKS ODP_CONFIG_SHM_BLOCKS > > >Is this change necessary? Increases driver memory usage for no reason? The scalable scheduler should have anything to do with the drivers and their shmem use. So I think this change is unnecessary and should be reverted. > > >> + >> +#endif /* ODP_SCHEDULE_SCALABLE_H */ >> diff --git >>a/platform/linux-generic/include/odp_schedule_scalable_config.h >> b/platform/linux-generic/include/odp_schedule_scalable_config.h >> new file mode 100644 >> index 00000000..febf379b >> --- /dev/null >> +++ b/platform/linux-generic/include/odp_schedule_scalable_config.h >> @@ -0,0 +1,55 @@ >> +/* Copyright (c) 2017, ARM Limited >> + * All rights reserved. >> + * >> + * SPDX-License-Identifier: BSD-3-Clause >> + */ >> + >> +#ifndef ODP_SCHEDULE_SCALABLE_CONFIG_H_ >> +#define ODP_SCHEDULE_SCALABLE_CONFIG_H_ >> + >> +/* >> + * Default scaling factor for the scheduler group >> + * >> + * This scaling factor is used when the application creates a scheduler >> + * group with no worker threads. >> + */ >> +#define CONFIG_DEFAULT_XFACTOR 4 >> + >> +/* >> + * Default weight (in events) for WRR in scalable scheduler >> + * >> + * This controls the per-queue weight for WRR between queues of the >>same >> + * priority in the scalable scheduler >> + * A higher value improves throughput while a lower value increases >> fairness >> + * and thus likely decreases latency >> + * >> + * If WRR is undesired, set the value to ~0 which will use the largest >> possible >> + * weight >> + * >> + * Note: an API for specifying this on a per-queue basis would be >>useful >> but is >> + * not yet available >> + */ >> +#define CONFIG_WRR_WEIGHT 64 >> + >> +/* >> + * Split queue producer/consumer metadata into separate cache lines. >> + * This is beneficial on e.g. Cortex-A57 but not so much on A53. >> + */ >> +#define CONFIG_SPLIT_PRODCONS >> + >> +/* >> + * Use locks to protect queue (ring buffer) and scheduler state updates >> + * On x86, this decreases overhead noticeably. >> + */ >> +#ifndef __ARM_ARCH >> +#define CONFIG_QSCHST_LOCK >> +/* Keep all ring buffer/qschst data together when using locks */ >> +#undef CONFIG_SPLIT_PRODCONS >> +#endif >> + >> +/* >> + * Maximum number of ordered locks per queue. >> + */ >> +#define CONFIG_MAX_ORDERED_LOCKS_PER_QUEUE 2 > > >There's already CONFIG_QUEUE_MAX_ORD_LOCKS 4, in the general config file. >Should not add the same define twice (with different value). Yes it is unnecessary for the scalable scheduler to use a separate define for this. > > > >> + >> +#endif /* ODP_SCHEDULE_SCALABLE_CONFIG_H_ */ >> diff --git a/platform/linux- >> generic/include/odp_schedule_scalable_ordered.h b/platform/linux- >> generic/include/odp_schedule_scalable_ordered.h >> new file mode 100644 >> index 00000000..9f3acf7a >> --- /dev/null >> +++ b/platform/linux-generic/include/odp_schedule_scalable_ordered.h >> @@ -0,0 +1,132 @@ >> +/* Copyright (c) 2017, ARM Limited >> + * All rights reserved. >> + * >> + * SPDX-License-Identifier: BSD-3-Clause >> + */ >> + >> +#ifndef ODP_SCHEDULE_SCALABLE_ORDERED_H >> +#define ODP_SCHEDULE_SCALABLE_ORDERED_H >> + >> +#include <odp/api/shared_memory.h> >> + >> +#include <odp_internal.h> >> +#include <odp_align_internal.h> >> +#include <odp_bitset.h> >> +#include <_ishmpool_internal.h> > >> + >> +/* Number of reorder contects in the reorder window. >> + * Should be at least one per CPU. >> + */ >> +#define RWIN_SIZE 32 >> +ODP_STATIC_ASSERT(CHECK_IS_POWER2(RWIN_SIZE), "RWIN_SIZE is not a power >> of 2"); >> + >> +#define NUM_OLOCKS 2 > >Is this the same as CONFIG_MAX_ORDERED_LOCKS_PER_QUEUE or something >different with similar name ? This is the define that is actually usedŠ There is an obvious need of a cleanup here. > > >> diff --git a/platform/linux-generic/odp_queue_if.c b/platform/linux- >> generic/odp_queue_if.c >> index c91f00eb..d7471dfc 100644 >> --- a/platform/linux-generic/odp_queue_if.c >> +++ b/platform/linux-generic/odp_queue_if.c >> @@ -6,11 +6,19 @@ >> >> #include <odp_queue_if.h> >> >> +extern const queue_api_t queue_scalable_api; >> +extern const queue_fn_t queue_scalable_fn; >> + >> extern const queue_api_t queue_default_api; >> extern const queue_fn_t queue_default_fn; >> >> +#ifdef ODP_SCHEDULE_SCALABLE >> +const queue_api_t *queue_api = &queue_scalable_api; >> +const queue_fn_t *queue_fn = &queue_scalable_fn; >> +#else >> const queue_api_t *queue_api = &queue_default_api; >> const queue_fn_t *queue_fn = &queue_default_fn; >> +#endif >> >> odp_queue_t odp_queue_create(const char *name, const odp_queue_param_t >> *param) >> { >> diff --git a/platform/linux-generic/odp_queue_scalable.c >>b/platform/linux- >> generic/odp_queue_scalable.c >> new file mode 100644 >> index 00000000..d5c6d0ae >> --- /dev/null >> +++ b/platform/linux-generic/odp_queue_scalable.c >> @@ -0,0 +1,1020 @@ >> +/* Copyright (c) 2017, ARM Limited >> + * All rights reserved. >> + * >> + * SPDX-License-Identifier: BSD-3-Clause >> + */ >> + > > >> +static int queue_init(queue_entry_t *queue, const char *name, >> + const odp_queue_param_t *param) >> +{ >> + ringidx_t ring_idx; >> + sched_elem_t *sched_elem; >> + uint32_t ring_size; >> + odp_buffer_hdr_t **ring; >> + uint32_t size; >> + >> + sched_elem = &queue->s.sched_elem; >> + ring_size = param->size > 0 ? >> + ROUNDUP_POWER2_U32(param->size) : CONFIG_QUEUE_SIZE; >> + strncpy(queue->s.name, name ? name : "", ODP_QUEUE_NAME_LEN - >> 1); >> + queue->s.name[ODP_QUEUE_NAME_LEN - 1] = 0; >> + memcpy(&queue->s.param, param, sizeof(odp_queue_param_t)); >> + >> + size = ring_size * sizeof(odp_buffer_hdr_t *); >> + ring = (odp_buffer_hdr_t >> **)shm_pool_alloc_align(queue_shm_pool, size); >> + if (NULL == ring) >> + return -1; >> + >> + for (ring_idx = 0; ring_idx < ring_size; ring_idx++) >> + ring[ring_idx] = NULL; >> + >> + queue->s.type = queue->s.param.type; >> + queue->s.enqueue = _queue_enq; >> + queue->s.dequeue = _queue_deq; >> + queue->s.enqueue_multi = _queue_enq_multi; >> + queue->s.dequeue_multi = _queue_deq_multi; >> + queue->s.pktin = PKTIN_INVALID; >> + >> + sched_elem->node.next = NULL; >> +#ifdef CONFIG_QSCHST_LOCK >> + LOCK_INIT(&sched_elem->qschlock); >> +#endif > >There are about 30 of these CONFIG_QSCHST_LOCK ifdefs. It would be >cleaner to embed ifdefs into the lock macros instead of surrounding each >macro call with an ifdef. Except the ifdefs not always cover only the lock/unlock calls. Sometimes non-MT-safe versions of functions can be called as the MT-safeness is provided by the lock, this is faster than calling the default MT-safe functions. > >Also some of these CONFIG_QSCHST_LOCK ifdefs contain a large amount of >code, entire functions like: sched_update_deq, sched_update_deq_sc, >_odp_queue_enq. Again, one version of a function is built for ARM, >another version for other archs. It is not necessarily an ARM-vs-other-architecture issue. Both code paths work on ARM, both code paths work on x86. The lock-based code path is noticeably faster on x86 (spin locks are fast, cmpxchg not so fast). On ARM I have seen varying results but in general relaxed atomic RMW operations seem fast on ARM (at least the microarchitectures I have tested). The non-lock code path has better characteristics (no locking until you need to actually push/pop a queue to/from a scheduler queue), less dependencies between threads, this should enhance scalability. > Could these be built always and make selection with a minimal amount of >ifdef'ed code? The different functions could always be built but wouldn¹t the compiler complain that functions are defined but not used? CC odp_schedule_scalable.lo odp_schedule_scalable.c:2092:13: error: Œfunction_not_used¹ defined but not used [-Werror=unused-function] static void function_not_used(void) ^ cc1: all warnings being treated as errors > > >-Petri > >