[dpdk-dev] [PATCH v2] malloc: fix malloc and free linear complexity

2014-05-16 Thread rsanfo...@gmail.com
Problems with lib rte_malloc:
 1. Rte_malloc searches a heap's entire free list looking for
the best fit, resulting in linear complexity.
 2. Heaps store free blocks in a singly-linked list, resulting
in linear complexity when rte_free needs to remove an
adjacent block.
 3. The library inserts and removes free blocks with ad hoc,
in-line code, rather than using linked-list functions or
macros.

This patch addresses those problems as follows:
 1. Replace single free list with a handful of free lists.
Each free list contains blocks of a specified size range,
for example:
  list[0]: (0   , 2^7]
  list[1]: (2^7 , 2^9]
  list[2]: (2^9 , 2^11]
  list[3]: (2^11, 2^13]
  list[4]: (2^13, MAX_SIZE]

When allocating a block, start at the first list that can
contain a big enough block. Search subsequent lists, if
necessary. Terminate the search as soon as we find a block
that is big enough.
 2. Use doubly-linked lists, so that we can remove free blocks
in constant time.
 3. Use BSD LIST macros, as defined in sys/queue.h and the
QUEUE(3) man page.

Signed-off-by: Robert Sanford 
---
 lib/librte_eal/common/include/rte_malloc_heap.h |6 +-
 lib/librte_malloc/malloc_elem.c |  121 +++
 lib/librte_malloc/malloc_elem.h |   17 +++-
 lib/librte_malloc/malloc_heap.c |   67 ++---
 4 files changed, 128 insertions(+), 83 deletions(-)

diff --git a/lib/librte_eal/common/include/rte_malloc_heap.h 
b/lib/librte_eal/common/include/rte_malloc_heap.h
index 5e139cf..1f5d653 100644
--- a/lib/librte_eal/common/include/rte_malloc_heap.h
+++ b/lib/librte_eal/common/include/rte_malloc_heap.h
@@ -35,14 +35,18 @@
 #define _RTE_MALLOC_HEAP_H_

 #include 
+#include 
 #include 

+/* Number of free lists per heap, grouped by size. */
+#define RTE_HEAP_NUM_FREELISTS  5
+
 /**
  * Structure to hold malloc heap
  */
 struct malloc_heap {
rte_spinlock_t lock;
-   struct malloc_elem * volatile free_head;
+   LIST_HEAD(, malloc_elem) free_head[RTE_HEAP_NUM_FREELISTS];
unsigned mz_count;
unsigned alloc_count;
size_t total_size;
diff --git a/lib/librte_malloc/malloc_elem.c b/lib/librte_malloc/malloc_elem.c
index f0da640..13cd5d3 100644
--- a/lib/librte_malloc/malloc_elem.c
+++ b/lib/librte_malloc/malloc_elem.c
@@ -33,6 +33,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 

 #include 
@@ -60,7 +61,8 @@ malloc_elem_init(struct malloc_elem *elem,
 {
elem->heap = heap;
elem->mz = mz;
-   elem->prev = elem->next_free = NULL;
+   elem->prev = NULL;
+   memset(&elem->free_list, 0, sizeof(elem->free_list));
elem->state = ELEM_FREE;
elem->size = size;
elem->pad = 0;
@@ -125,14 +127,71 @@ split_elem(struct malloc_elem *elem, struct malloc_elem 
*split_pt)
 }

 /*
+ * Given an element size, compute its freelist index.
+ * We free an element into the freelist containing similarly-sized elements.
+ * We try to allocate elements starting with the freelist containing
+ * similarly-sized elements, and if necessary, we search freelists
+ * containing larger elements.
+ *
+ * Example element size ranges for a heap with five free lists:
+ *   heap->free_head[0] - (0   , 2^7]
+ *   heap->free_head[1] - (2^7 , 2^9]
+ *   heap->free_head[2] - (2^9 , 2^11]
+ *   heap->free_head[3] - (2^11, 2^13]
+ *   heap->free_head[4] - (2^13, MAX_SIZE]
+ */
+size_t
+malloc_elem_free_list_index(size_t size)
+{
+#define MALLOC_MINSIZE_LOG2   7
+#define MALLOC_LOG2_INCREMENT 2
+
+   size_t log2;
+   size_t index;
+
+   if (size <= (1UL << MALLOC_MINSIZE_LOG2))
+   return 0;
+
+   /* Find next power of 2 >= size. */
+   log2 = sizeof(size) * 8 - __builtin_clzl(size-1);
+
+   /* Compute freelist index, based on log2(size). */
+   index = (log2 - MALLOC_MINSIZE_LOG2 + MALLOC_LOG2_INCREMENT - 1) /
+   MALLOC_LOG2_INCREMENT;
+
+   return (index <= RTE_HEAP_NUM_FREELISTS-1?
+   index: RTE_HEAP_NUM_FREELISTS-1);
+}
+
+/*
+ * Add the specified element to its heap's free list.
+ */
+void
+malloc_elem_free_list_insert(struct malloc_elem *elem)
+{
+   size_t idx = malloc_elem_free_list_index(elem->size - 
MALLOC_ELEM_HEADER_LEN);
+
+   elem->state = ELEM_FREE;
+   LIST_INSERT_HEAD(&elem->heap->free_head[idx], elem, free_list);
+}
+
+/*
+ * Remove the specified element from its heap's free list.
+ */
+static void
+elem_free_list_remove(struct malloc_elem *elem)
+{
+   LIST_REMOVE(elem, free_list);
+}
+
+/*
  * reserve a block of data in an existing malloc_elem. If the malloc_elem
  * is much larger than the data block requested, we split the element in two.
  * This function is only called from malloc_heap_alloc so parameter checking
  * is not done here, as it's done there previously.
  */
 struct malloc_elem *
-malloc_elem_alloc(struct malloc_elem *elem, size_t size,
-  

[dpdk-dev] [RFC PATCH] rte_timer: Fix rte_timer_reset return value

2015-02-03 Thread rsanfo...@gmail.com
From: Robert Sanford 

- API rte_timer_reset() should return -1 when the timer is in the
RUNNING or CONFIG state. Instead, it ignores the return value of
internal function __rte_timer_reset() and always returns 0.
We change rte_timer_reset() to return the value returned by
__rte_timer_reset().

- Change API rte_timer_reset_sync() to invoke rte_pause() while
spin-waiting for rte_timer_reset() to succeed.

- Enhance timer stress test 2 to report how many timer reset
collisions occur, i.e., how many times rte_timer_reset() fails
due to a timer being in the CONFIG state.

Signed-off-by: Robert Sanford 

---
 app/test/test_timer.c|   25 ++---
 lib/librte_timer/rte_timer.c |7 +++
 2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/app/test/test_timer.c b/app/test/test_timer.c
index 4b4800b..2f27f84 100644
--- a/app/test/test_timer.c
+++ b/app/test/test_timer.c
@@ -247,12 +247,15 @@ static int
 timer_stress2_main_loop(__attribute__((unused)) void *arg)
 {
static struct rte_timer *timers;
-   int i;
+   int i, ret;
static volatile int ready = 0;
uint64_t delay = rte_get_timer_hz() / 4;
unsigned lcore_id = rte_lcore_id();
+   int32_t my_collisions = 0;
+   static rte_atomic32_t collisions = RTE_ATOMIC32_INIT(0);

if (lcore_id == rte_get_master_lcore()) {
+   cb_count = 0;
timers = rte_malloc(NULL, sizeof(*timers) * NB_STRESS2_TIMERS, 
0);
if (timers == NULL) {
printf("Test Failed\n");
@@ -268,15 +271,24 @@ timer_stress2_main_loop(__attribute__((unused)) void *arg)
}

/* have all cores schedule all timers on master lcore */
-   for (i = 0; i < NB_STRESS2_TIMERS; i++)
-   rte_timer_reset(&timers[i], delay, SINGLE, 
rte_get_master_lcore(),
+   for (i = 0; i < NB_STRESS2_TIMERS; i++) {
+   ret = rte_timer_reset(&timers[i], delay, SINGLE, 
rte_get_master_lcore(),
timer_stress2_cb, NULL);
+   /* there will be collisions when multiple cores simultaneously
+* configure the same timers */
+   if (ret != 0)
+   my_collisions++;
+   }
+   if (my_collisions != 0)
+   rte_atomic32_add(&collisions, my_collisions);

ready = 0;
rte_delay_ms(500);

/* now check that we get the right number of callbacks */
if (lcore_id == rte_get_master_lcore()) {
+   if ((my_collisions = rte_atomic32_read(&collisions)) != 0)
+   printf("- %d timer reset collisions (OK)\n", 
my_collisions);
rte_timer_manage();
if (cb_count != NB_STRESS2_TIMERS) {
printf("Test Failed\n");
@@ -311,6 +323,13 @@ timer_stress2_main_loop(__attribute__((unused)) void *arg)
/* now check that we get the right number of callbacks */
if (lcore_id == rte_get_master_lcore()) {
rte_timer_manage();
+
+   /* clean up statics, in case we run again */
+   rte_free(timers);
+   timers = 0;
+   ready = 0;
+   rte_atomic32_set(&collisions, 0);
+
if (cb_count != NB_STRESS2_TIMERS) {
printf("Test Failed\n");
printf("- Stress test 2, part 2 failed\n");
diff --git a/lib/librte_timer/rte_timer.c b/lib/librte_timer/rte_timer.c
index 269a992..d18abf5 100644
--- a/lib/librte_timer/rte_timer.c
+++ b/lib/librte_timer/rte_timer.c
@@ -424,10 +424,8 @@ rte_timer_reset(struct rte_timer *tim, uint64_t ticks,
else
period = 0;

-   __rte_timer_reset(tim,  cur_time + ticks, period, tim_lcore,
+   return __rte_timer_reset(tim,  cur_time + ticks, period, tim_lcore,
  fct, arg, 0);
-
-   return 0;
 }

 /* loop until rte_timer_reset() succeed */
@@ -437,7 +435,8 @@ rte_timer_reset_sync(struct rte_timer *tim, uint64_t ticks,
 rte_timer_cb_t fct, void *arg)
 {
while (rte_timer_reset(tim, ticks, type, tim_lcore,
-  fct, arg) != 0);
+  fct, arg) != 0)
+   rte_pause();
 }

 /* Stop the timer associated with the timer handle tim */
-- 
1.7.1



[dpdk-dev] [PATCH 0/3] timer: fix rte_timer_manage and improve unit tests

2015-07-23 Thread rsanfo...@gmail.com
From: Robert Sanford 

This patchset fixes a bug in timer stress test 2, adds a new stress test
to expose a race condition bug in API rte_timer_manage(), and then fixes
the rte_timer_manage() bug.
--

Patch 1, app/test timer stress test 2: Sometimes this test fails and
seg-faults because the slave lcores get out of phase with the master.
The master uses a single int, 'ready', to synchronize multiple slave
lcores through multiple phases of the test.

To resolve, we construct simple synchronization primitives that use one
atomic-int state variable per slave. The master tells the slaves when to
start, and then waits for all of them to finish. Each slave waits for
the master to tell it to start, and then tells the master when it has
finished.
--

Description of rte_timer_manage() race condition bug: Through code
inspection, we noticed a potential problem in rte_timer_manage() that
leads to corruption of per-lcore pending-lists (implemented as
skip-lists). The race condition occurs when rte_timer_manage() expires
multiple timers on lcore A, while lcore B simultaneously invokes
rte_timer_reset() for one of the expiring timers (other than the first
one).

Lcore A splits its pending-list, creating a local list of expired timers
linked through their sl_next[0] pointers, and sets the first expired
timer to the RUNNING state, all during one list-lock round trip. Lcore A
then unlocks the list-lock to run the first callback, and that is when
lcore B can misinterpret the subsequent expired timers' true states.
Lcore B sees an expired timer still in the PENDING state, locks A's
list-lock, and reinserts the timer into A's pending-list. The two lcores
then try to use the same next-pointer to maintain both lists!

One solution is to remove timers from the pending-list and set them to
the RUNNING state in one atomic step, i.e., we should perform these two
actions within one ownership of the list-lock.
--

Patch 2, new timer-manage race-condition test: We wrote a test to
confirm our suspicion that we could crash rte_timer_manage() (RTM) under
the right circumstances. We repeatedly set several timers to expire at
roughly the same time on the master core. The master lcore just delays
and runs RTM about ten times per second. The slave lcores all watch the
first timer (timer-0) to see when RTM is running on the master, i.e.,
timer-0's state is not PENDING. At this point, each slave attempts to
reset a subset of the timers to a later expiration time. The goal here
is to have the slaves moving most of the timers to a different place in
the master's pending-list, while the master is working with the same
next-pointers and running callback functions. This eventually results in
the master traversing a corrupted linked-list. In our observations, it
results in an infinite loop.
--

Patch 3, eliminate race condition in rte_timer_manage(): After splitting
the pending-list at the current time point, we try to set all expired
timers to the RUNNING state, and put back into the pending-list any
timers that we fail to set to the RUNNING state, all during one
list-lock round trip. It is then safe to run the callback functions
for all expired timers that remain on our local list, without the lock.

Robert Sanford (3):
  timer: fix stress test 2 synchronization bug
  timer: add timer-manage race condition test
  timer: fix race condition in rte_timer_manage()

 app/test/Makefile  |1 +
 app/test/test_timer.c  |  149 ++---
 app/test/test_timer_racecond.c |  209 
 lib/librte_timer/rte_timer.c   |   45 ++---
 4 files changed, 355 insertions(+), 49 deletions(-)
 create mode 100644 app/test/test_timer_racecond.c



[dpdk-dev] [PATCH 1/3] timer: fix stress test 2 synchronization bug

2015-07-23 Thread rsanfo...@gmail.com
From: Robert Sanford 

Signed-off-by: Robert Sanford 
---
 app/test/test_timer.c |  149 ++---
 1 files changed, 116 insertions(+), 33 deletions(-)

diff --git a/app/test/test_timer.c b/app/test/test_timer.c
index 73da5b6..dd63421 100644
--- a/app/test/test_timer.c
+++ b/app/test/test_timer.c
@@ -137,13 +137,13 @@
 #include 
 #include 

-
 #define TEST_DURATION_S 20 /* in seconds */
 #define NB_TIMER 4

 #define RTE_LOGTYPE_TESTTIMER RTE_LOGTYPE_USER3

 static volatile uint64_t end_time;
+static volatile int test_failed;

 struct mytimerinfo {
struct rte_timer tim;
@@ -230,6 +230,59 @@ timer_stress_main_loop(__attribute__((unused)) void *arg)
return 0;
 }

+/* Need to synchronize slave lcores through multiple steps. */
+enum { SLAVE_WAITING=1, SLAVE_RUN_SIGNAL, SLAVE_RUNNING, SLAVE_FINISHED };
+static rte_atomic16_t test_slave_state[RTE_MAX_LCORE];
+
+static void
+master_init_slaves(void)
+{
+   unsigned i;
+   RTE_LCORE_FOREACH_SLAVE(i) {
+   rte_atomic16_set(&test_slave_state[i], SLAVE_WAITING);
+   }
+}
+
+static void
+master_start_slaves(void)
+{
+   unsigned i;
+   RTE_LCORE_FOREACH_SLAVE(i) {
+   rte_atomic16_set(&test_slave_state[i], SLAVE_RUN_SIGNAL);
+   }
+   RTE_LCORE_FOREACH_SLAVE(i) {
+   while (rte_atomic16_read(&test_slave_state[i]) != SLAVE_RUNNING)
+   rte_pause();
+   }
+}
+
+static void
+master_wait_for_slaves(void)
+{
+   unsigned i;
+   RTE_LCORE_FOREACH_SLAVE(i) {
+   while (rte_atomic16_read(&test_slave_state[i]) != 
SLAVE_FINISHED)
+   rte_pause();
+   }
+}
+
+static void
+slave_wait_to_start(void)
+{
+   unsigned lcore_id = rte_lcore_id();
+   while (rte_atomic16_read(&test_slave_state[lcore_id]) != 
SLAVE_RUN_SIGNAL)
+   rte_pause();
+   rte_atomic16_set(&test_slave_state[lcore_id], SLAVE_RUNNING);
+}
+
+static void
+slave_finish(void)
+{
+   unsigned lcore_id = rte_lcore_id();
+   rte_atomic16_set(&test_slave_state[lcore_id], SLAVE_FINISHED);
+}
+
+
 static volatile int cb_count = 0;

 /* callback for second stress test. will only be called
@@ -247,31 +300,37 @@ timer_stress2_main_loop(__attribute__((unused)) void *arg)
 {
static struct rte_timer *timers;
int i, ret;
-   static volatile int ready = 0;
uint64_t delay = rte_get_timer_hz() / 4;
unsigned lcore_id = rte_lcore_id();
+   unsigned master = rte_get_master_lcore();
int32_t my_collisions = 0;
-   static rte_atomic32_t collisions = RTE_ATOMIC32_INIT(0);
+   static rte_atomic32_t collisions;

-   if (lcore_id == rte_get_master_lcore()) {
+   if (lcore_id == master) {
cb_count = 0;
+   test_failed = 0;
+   rte_atomic32_set(&collisions, 0);
+   master_init_slaves();
timers = rte_malloc(NULL, sizeof(*timers) * NB_STRESS2_TIMERS, 
0);
if (timers == NULL) {
printf("Test Failed\n");
printf("- Cannot allocate memory for timers\n" );
-   return -1;
+   test_failed = 1;
+   master_start_slaves();
+   goto cleanup;
}
for (i = 0; i < NB_STRESS2_TIMERS; i++)
rte_timer_init(&timers[i]);
-   ready = 1;
+   master_start_slaves();
} else {
-   while (!ready)
-   rte_pause();
+   slave_wait_to_start();
+   if (test_failed)
+   goto cleanup;
}

/* have all cores schedule all timers on master lcore */
for (i = 0; i < NB_STRESS2_TIMERS; i++) {
-   ret = rte_timer_reset(&timers[i], delay, SINGLE, 
rte_get_master_lcore(),
+   ret = rte_timer_reset(&timers[i], delay, SINGLE, master,
timer_stress2_cb, NULL);
/* there will be collisions when multiple cores simultaneously
 * configure the same timers */
@@ -281,11 +340,18 @@ timer_stress2_main_loop(__attribute__((unused)) void *arg)
if (my_collisions != 0)
rte_atomic32_add(&collisions, my_collisions);

-   ready = 0;
+   /* wait long enough for timers to expire */
rte_delay_ms(500);

+   /* all cores rendezvous */
+   if (lcore_id == master) {
+   master_wait_for_slaves();
+   } else {
+   slave_finish();
+   }
+
/* now check that we get the right number of callbacks */
-   if (lcore_id == rte_get_master_lcore()) {
+   if (lcore_id == master) {
my_collisions = rte_atomic32_read(&collisions);
if (my_collisions != 0)
printf("- %d timer reset collisions (OK)\n", 
my_collisions);
@@ -295,49 +361,

[dpdk-dev] [PATCH 2/3] timer: add timer-manage race condition test

2015-07-23 Thread rsanfo...@gmail.com
From: Robert Sanford 

Signed-off-by: Robert Sanford 
---
 app/test/Makefile  |1 +
 app/test/test_timer_racecond.c |  209 
 2 files changed, 210 insertions(+), 0 deletions(-)
 create mode 100644 app/test/test_timer_racecond.c

diff --git a/app/test/Makefile b/app/test/Makefile
index caa359c..e7f148f 100644
--- a/app/test/Makefile
+++ b/app/test/Makefile
@@ -71,6 +71,7 @@ SRCS-y += test_rwlock.c

 SRCS-$(CONFIG_RTE_LIBRTE_TIMER) += test_timer.c
 SRCS-$(CONFIG_RTE_LIBRTE_TIMER) += test_timer_perf.c
+SRCS-$(CONFIG_RTE_LIBRTE_TIMER) += test_timer_racecond.c

 SRCS-y += test_mempool.c
 SRCS-y += test_mempool_perf.c
diff --git a/app/test/test_timer_racecond.c b/app/test/test_timer_racecond.c
new file mode 100644
index 000..6f60a7a
--- /dev/null
+++ b/app/test/test_timer_racecond.c
@@ -0,0 +1,209 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2015 Akamai Technologies.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "test.h"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#undef TEST_TIMER_RACECOND_VERBOSE
+
+#ifdef RTE_EXEC_ENV_LINUXAPP
+#define usec_delay(us) usleep(us)
+#else
+#define usec_delay(us) rte_delay_us(us)
+#endif
+
+#define BILLION (1UL << 30)
+
+#define TEST_DURATION_S 20 /* in seconds */
+#define N_TIMERS50
+
+static struct rte_timer timer[N_TIMERS];
+static unsigned timer_lcore_id[N_TIMERS];
+
+static unsigned master;
+static volatile unsigned stop_slaves;
+
+static int reload_timer(struct rte_timer *tim);
+
+static void
+timer_cb(struct rte_timer *tim, void *arg __rte_unused)
+{
+   /* Simulate slow callback function, 100 us. */
+   rte_delay_us(100);
+
+#ifdef TEST_TIMER_RACECOND_VERBOSE
+   if (tim == &timer[0])
+   printf("\n");
+   printf("timer_cb: core %u timer %lu\n",
+   rte_lcore_id(), tim - timer);
+#endif
+   (void)reload_timer(tim);
+}
+
+RTE_DEFINE_PER_LCORE(unsigned, n_reset_collisions);
+
+static int
+reload_timer(struct rte_timer *tim)
+{
+   /* Make timer expire roughly when the TSC hits the next BILLION 
multiple.
+* Add in timer's index to make them expire in somewhat sorted order.
+* This makes all timers somewhat synchronized, firing ~2-3 times per
+* second, assuming 2-3 GHz TSCs.
+*/
+   uint64_t ticks = BILLION - (rte_get_timer_cycles() % BILLION) +
+   (tim - timer);
+   int ret;
+
+   ret = rte_timer_reset(tim, ticks, PERIODICAL, master, timer_cb, NULL);
+   if (ret != 0) {
+#ifdef TEST_TIMER_RACECOND_VERBOSE
+   printf("- core %u failed to reset timer %lu (OK)\n",
+   rte_lcore_id(), tim - timer);
+#endif
+   RTE_PER_LCORE(n_reset_collisions) += 1;
+   }
+   return ret;
+}
+
+static int
+slave_main_loop(__attribute__((unused)) void *arg)
+{
+   unsigned lcore_id = rte_lcore_id();
+   unsigned i;
+
+   RTE_PER_LCORE(n_reset_collisions) = 0;
+
+   printf("Starting main loop on core %u\n", lcore_id);
+
+   while (!stop_slaves) {
+   /* Wait until the timer manager is running.
+* We know it's running when we see timer[0] NOT pending.
+*/
+   if (rte_timer_pending(&timer[0])) {
+   rte_p

[dpdk-dev] [PATCH 3/3] timer: fix race condition in rte_timer_manage()

2015-07-23 Thread rsanfo...@gmail.com
From: Robert Sanford 

Signed-off-by: Robert Sanford 
---
 lib/librte_timer/rte_timer.c |   45 +++--
 1 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/lib/librte_timer/rte_timer.c b/lib/librte_timer/rte_timer.c
index 8e9243a..51e6038 100644
--- a/lib/librte_timer/rte_timer.c
+++ b/lib/librte_timer/rte_timer.c
@@ -504,6 +504,7 @@ void rte_timer_manage(void)
 {
union rte_timer_status status;
struct rte_timer *tim, *next_tim;
+   struct rte_timer *run_first_tim, **pprev;
unsigned lcore_id = rte_lcore_id();
struct rte_timer *prev[MAX_SKIPLIST_DEPTH + 1];
uint64_t cur_time;
@@ -531,8 +532,10 @@ void rte_timer_manage(void)

/* if nothing to do just unlock and return */
if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL ||
-   priv_timer[lcore_id].pending_head.sl_next[0]->expire > 
cur_time)
-   goto done;
+   priv_timer[lcore_id].pending_head.sl_next[0]->expire > 
cur_time) {
+   rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
+   return;
+   }

/* save start of list of expired timers */
tim = priv_timer[lcore_id].pending_head.sl_next[0];
@@ -546,24 +549,40 @@ void rte_timer_manage(void)
prev[i] ->sl_next[i] = NULL;
}

-   /* now scan expired list and call callbacks */
+   /* transition run-list from PENDING to RUNNING */
+   run_first_tim = tim;
+   pprev = &run_first_tim;
+
for ( ; tim != NULL; tim = next_tim) {
next_tim = tim->sl_next[0];

ret = timer_set_running_state(tim);
+   if (likely(ret == 0)) {
+   pprev = &tim->sl_next[0];
+   } else {
+   /* another core is trying to re-config this one,
+* remove it from local expired list and put it
+* back on the priv_timer[] skip list */
+   *pprev = next_tim;
+   timer_add(tim, lcore_id, 1);
+   }
+   }

-   /* this timer was not pending, continue */
-   if (ret < 0)
-   continue;
+   /* update the next to expire timer value */
+   priv_timer[lcore_id].pending_head.expire =
+   (priv_timer[lcore_id].pending_head.sl_next[0] == NULL) 
? 0 :
+   
priv_timer[lcore_id].pending_head.sl_next[0]->expire;

-   rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
+   rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);

+   /* now scan expired list and call callbacks */
+   for (tim = run_first_tim; tim != NULL; tim = next_tim) {
+   next_tim = tim->sl_next[0];
priv_timer[lcore_id].updated = 0;

/* execute callback function with list unlocked */
tim->f(tim, tim->arg);

-   rte_spinlock_lock(&priv_timer[lcore_id].list_lock);
__TIMER_STAT_ADD(pending, -1);
/* the timer was stopped or reloaded by the callback
 * function, we have nothing to do here */
@@ -579,6 +598,7 @@ void rte_timer_manage(void)
}
else {
/* keep it in list and mark timer as pending */
+   rte_spinlock_lock(&priv_timer[lcore_id].list_lock);
status.state = RTE_TIMER_PENDING;
__TIMER_STAT_ADD(pending, 1);
status.owner = (int16_t)lcore_id;
@@ -586,16 +606,9 @@ void rte_timer_manage(void)
tim->status.u32 = status.u32;
__rte_timer_reset(tim, cur_time + tim->period,
tim->period, lcore_id, tim->f, 
tim->arg, 1);
+   rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
}
}
-
-   /* update the next to expire timer value */
-   priv_timer[lcore_id].pending_head.expire =
-   (priv_timer[lcore_id].pending_head.sl_next[0] == NULL) 
? 0 :
-   
priv_timer[lcore_id].pending_head.sl_next[0]->expire;
-done:
-   /* job finished, unlock the list lock */
-   rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
 }

 /* dump statistics about timers */
-- 
1.7.1



[dpdk-dev] [PATCH v2 0/3] timer: fix rte_timer_manage and improve unit tests

2015-07-27 Thread rsanfo...@gmail.com
From: Robert Sanford 

This patchset fixes a bug in timer stress test 2, adds a new stress test
to expose a race condition bug in API rte_timer_manage(), and then fixes
the rte_timer_manage() bug.

Description of rte_timer_manage() race condition bug: Through code
inspection, we notice a potential problem in rte_timer_manage() that
leads to corruption of per-lcore pending-lists (implemented as
skip-lists). The race condition occurs when rte_timer_manage() expires
multiple timers on lcore A, while lcore B simultaneously invokes
rte_timer_reset() for one of the expiring timers (other than the first
one).

Lcore A splits its pending-list, creating a local list of expired timers
linked through their sl_next[0] pointers, and sets the first expired
timer to the RUNNING state, all during one list-lock round trip.
Lcore A then unlocks the list-lock to run the first callback, and that
is when A and B can have different interpretations of the subsequent
expired timers' true state. Lcore B sees an expired timer still in the
PENDING state, atomically changes the timer to the CONFIG state, locks
lcore A's list-lock, and reinserts the timer into A's pending-list.
The two lcores try to use the same next-pointers to maintain both lists!

v2 changes:
Move patch descriptions to their respective patches.
Correct checkpatch warnings.

Robert Sanford (3):
  fix stress test 2 sync bug
  add timer manage race condition test
  fix race condition in rte_timer_manage

 app/test/Makefile  |1 +
 app/test/test_timer.c  |  154 +++---
 app/test/test_timer_racecond.c |  209 
 lib/librte_timer/rte_timer.c   |   56 +++
 4 files changed, 366 insertions(+), 54 deletions(-)
 create mode 100644 app/test/test_timer_racecond.c



[dpdk-dev] [PATCH v2 1/3] timer: fix stress test 2 synchronization bug

2015-07-27 Thread rsanfo...@gmail.com
From: Robert Sanford 

Fix app/test timer stress test 2: Sometimes this test fails and
seg-faults because the slave lcores get out of phase with the master.
The master uses a single int, 'ready', to synchronize multiple slave
lcores through multiple phases of the test.

To resolve, we construct simple synchronization primitives that use one
atomic-int state variable per slave. The master tells the slaves when to
start, and then waits for all of them to finish. Each slave waits for
the master to tell it to start, and then tells the master when it has
finished.

Signed-off-by: Robert Sanford 
---
 app/test/test_timer.c |  154 ++---
 1 files changed, 121 insertions(+), 33 deletions(-)

diff --git a/app/test/test_timer.c b/app/test/test_timer.c
index 73da5b6..944e2ad 100644
--- a/app/test/test_timer.c
+++ b/app/test/test_timer.c
@@ -137,13 +137,13 @@
 #include 
 #include 

-
 #define TEST_DURATION_S 20 /* in seconds */
 #define NB_TIMER 4

 #define RTE_LOGTYPE_TESTTIMER RTE_LOGTYPE_USER3

 static volatile uint64_t end_time;
+static volatile int test_failed;

 struct mytimerinfo {
struct rte_timer tim;
@@ -230,6 +230,64 @@ timer_stress_main_loop(__attribute__((unused)) void *arg)
return 0;
 }

+/* Need to synchronize slave lcores through multiple steps. */
+enum { SLAVE_WAITING = 1, SLAVE_RUN_SIGNAL, SLAVE_RUNNING, SLAVE_FINISHED };
+static rte_atomic16_t slave_state[RTE_MAX_LCORE];
+
+static void
+master_init_slaves(void)
+{
+   unsigned i;
+
+   RTE_LCORE_FOREACH_SLAVE(i) {
+   rte_atomic16_set(&slave_state[i], SLAVE_WAITING);
+   }
+}
+
+static void
+master_start_slaves(void)
+{
+   unsigned i;
+
+   RTE_LCORE_FOREACH_SLAVE(i) {
+   rte_atomic16_set(&slave_state[i], SLAVE_RUN_SIGNAL);
+   }
+   RTE_LCORE_FOREACH_SLAVE(i) {
+   while (rte_atomic16_read(&slave_state[i]) != SLAVE_RUNNING)
+   rte_pause();
+   }
+}
+
+static void
+master_wait_for_slaves(void)
+{
+   unsigned i;
+
+   RTE_LCORE_FOREACH_SLAVE(i) {
+   while (rte_atomic16_read(&slave_state[i]) != SLAVE_FINISHED)
+   rte_pause();
+   }
+}
+
+static void
+slave_wait_to_start(void)
+{
+   unsigned lcore_id = rte_lcore_id();
+
+   while (rte_atomic16_read(&slave_state[lcore_id]) != SLAVE_RUN_SIGNAL)
+   rte_pause();
+   rte_atomic16_set(&slave_state[lcore_id], SLAVE_RUNNING);
+}
+
+static void
+slave_finish(void)
+{
+   unsigned lcore_id = rte_lcore_id();
+
+   rte_atomic16_set(&slave_state[lcore_id], SLAVE_FINISHED);
+}
+
+
 static volatile int cb_count = 0;

 /* callback for second stress test. will only be called
@@ -247,31 +305,37 @@ timer_stress2_main_loop(__attribute__((unused)) void *arg)
 {
static struct rte_timer *timers;
int i, ret;
-   static volatile int ready = 0;
uint64_t delay = rte_get_timer_hz() / 4;
unsigned lcore_id = rte_lcore_id();
+   unsigned master = rte_get_master_lcore();
int32_t my_collisions = 0;
-   static rte_atomic32_t collisions = RTE_ATOMIC32_INIT(0);
+   static rte_atomic32_t collisions;

-   if (lcore_id == rte_get_master_lcore()) {
+   if (lcore_id == master) {
cb_count = 0;
+   test_failed = 0;
+   rte_atomic32_set(&collisions, 0);
+   master_init_slaves();
timers = rte_malloc(NULL, sizeof(*timers) * NB_STRESS2_TIMERS, 
0);
if (timers == NULL) {
printf("Test Failed\n");
printf("- Cannot allocate memory for timers\n" );
-   return -1;
+   test_failed = 1;
+   master_start_slaves();
+   goto cleanup;
}
for (i = 0; i < NB_STRESS2_TIMERS; i++)
rte_timer_init(&timers[i]);
-   ready = 1;
+   master_start_slaves();
} else {
-   while (!ready)
-   rte_pause();
+   slave_wait_to_start();
+   if (test_failed)
+   goto cleanup;
}

/* have all cores schedule all timers on master lcore */
for (i = 0; i < NB_STRESS2_TIMERS; i++) {
-   ret = rte_timer_reset(&timers[i], delay, SINGLE, 
rte_get_master_lcore(),
+   ret = rte_timer_reset(&timers[i], delay, SINGLE, master,
timer_stress2_cb, NULL);
/* there will be collisions when multiple cores simultaneously
 * configure the same timers */
@@ -281,11 +345,18 @@ timer_stress2_main_loop(__attribute__((unused)) void *arg)
if (my_collisions != 0)
rte_atomic32_add(&collisions, my_collisions);

-   ready = 0;
+   /* wait long enough for timers to expire */
rte_delay_ms(500);

+   /* all

[dpdk-dev] [PATCH v2 2/3] timer: add timer-manage race condition test

2015-07-27 Thread rsanfo...@gmail.com
From: Robert Sanford 

Add new timer-manage race-condition test: We wrote a test to confirm
our suspicion that we could crash rte_timer_manage() under the right
circumstances. We repeatedly set several timers to expire at roughly
the same time on the master core. The master lcore just delays and runs
rte_timer_manage() about ten times per second. The slave lcores all
watch the first timer (timer-0) to see when rte_timer_manage() is
running on the master, i.e., timer-0's state is not PENDING.
At this point, each slave attempts to reset a subset of the timers to
a later expiration time. The goal here is to have the slaves moving
most of the timers to a different place in the master's pending-list,
while the master is traversing the same next-pointers (the slaves'
sl_next[0] pointers) and running callback functions. This eventually
results in the master traversing a corrupted linked-list.
In our observations, it results in an infinite loop.

Signed-off-by: Robert Sanford 
---
 app/test/Makefile  |1 +
 app/test/test_timer_racecond.c |  209 
 2 files changed, 210 insertions(+), 0 deletions(-)
 create mode 100644 app/test/test_timer_racecond.c

diff --git a/app/test/Makefile b/app/test/Makefile
index caa359c..e7f148f 100644
--- a/app/test/Makefile
+++ b/app/test/Makefile
@@ -71,6 +71,7 @@ SRCS-y += test_rwlock.c

 SRCS-$(CONFIG_RTE_LIBRTE_TIMER) += test_timer.c
 SRCS-$(CONFIG_RTE_LIBRTE_TIMER) += test_timer_perf.c
+SRCS-$(CONFIG_RTE_LIBRTE_TIMER) += test_timer_racecond.c

 SRCS-y += test_mempool.c
 SRCS-y += test_mempool_perf.c
diff --git a/app/test/test_timer_racecond.c b/app/test/test_timer_racecond.c
new file mode 100644
index 000..32693d8
--- /dev/null
+++ b/app/test/test_timer_racecond.c
@@ -0,0 +1,209 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2015 Akamai Technologies.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "test.h"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#undef TEST_TIMER_RACECOND_VERBOSE
+
+#ifdef RTE_EXEC_ENV_LINUXAPP
+#define usec_delay(us) usleep(us)
+#else
+#define usec_delay(us) rte_delay_us(us)
+#endif
+
+#define BILLION (1UL << 30)
+
+#define TEST_DURATION_S 20 /* in seconds */
+#define N_TIMERS    50
+
+static struct rte_timer timer[N_TIMERS];
+static unsigned timer_lcore_id[N_TIMERS];
+
+static unsigned master;
+static volatile unsigned stop_slaves;
+
+static int reload_timer(struct rte_timer *tim);
+
+static void
+timer_cb(struct rte_timer *tim, void *arg __rte_unused)
+{
+   /* Simulate slow callback function, 100 us. */
+   rte_delay_us(100);
+
+#ifdef TEST_TIMER_RACECOND_VERBOSE
+   if (tim == &timer[0])
+   printf("\n");
+   printf("timer_cb: core %u timer %lu\n",
+   rte_lcore_id(), tim - timer);
+#endif
+   (void)reload_timer(tim);
+}
+
+RTE_DEFINE_PER_LCORE(unsigned, n_reset_collisions);
+
+static int
+reload_timer(struct rte_timer *tim)
+{
+   /* Make timer expire roughly when the TSC hits the next BILLION
+* multiple. Add in timer's index to make them expire in nearly
+* sorted order. This makes all timers somewhat synchronized,
+* firing ~2-3 times per second, assuming 2-3 GHz TSCs.
+*/
+   uint64_t ticks = BILLION - (rte_get_timer_cycles() % BILLION) +
+  

[dpdk-dev] [PATCH v2 3/3] timer: fix race condition in rte_timer_manage()

2015-07-27 Thread rsanfo...@gmail.com
From: Robert Sanford 

Eliminate problematic race condition in rte_timer_manage() that can
lead to corruption of per-lcore pending-lists (implemented as
skip-lists). The race condition occurs when rte_timer_manage() expires
multiple timers on lcore A, while lcore B simultaneously invokes
rte_timer_reset() for one of the expiring timers (other than the first
one).

Lcore A splits its pending-list, creating a local list of expired timers
linked through their sl_next[0] pointers, and sets the first expired
timer to the RUNNING state, all during one list-lock round trip.
Lcore A then unlocks the list-lock to run the first callback, and that
is when A and B can have different interpretations of the subsequent
expired timers' true state. Lcore B sees an expired timer still in the
PENDING state, atomically changes the timer to the CONFIG state, locks
lcore A's list-lock, and reinserts the timer into A's pending-list.
The two lcores try to use the same next-pointers to maintain both lists!

Our solution is to remove expired timers from the pending-list and try
to set them all to the RUNNING state in one atomic step, i.e.,
rte_timer_manage() should perform these two actions within one
ownership of the list-lock.

After splitting the pending-list at the current point in time and trying
to set all expired timers to the RUNNING state, we must put back into
the pending-list any timers that we failed to set to the RUNNING state,
all while still holding the list-lock. It is then safe to release the
lock and run the callback functions for all expired timers that remain
on our local run-list.

Signed-off-by: Robert Sanford 
---
 lib/librte_timer/rte_timer.c |   56 ++---
 1 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/lib/librte_timer/rte_timer.c b/lib/librte_timer/rte_timer.c
index 8e9243a..3dcdab5 100644
--- a/lib/librte_timer/rte_timer.c
+++ b/lib/librte_timer/rte_timer.c
@@ -504,6 +504,7 @@ void rte_timer_manage(void)
 {
union rte_timer_status status;
struct rte_timer *tim, *next_tim;
+   struct rte_timer *run_first_tim, **pprev;
unsigned lcore_id = rte_lcore_id();
struct rte_timer *prev[MAX_SKIPLIST_DEPTH + 1];
uint64_t cur_time;
@@ -519,9 +520,9 @@ void rte_timer_manage(void)
cur_time = rte_get_timer_cycles();

 #ifdef RTE_ARCH_X86_64
-   /* on 64-bit the value cached in the pending_head.expired will be updated
-* atomically, so we can consult that for a quick check here outside the
-* lock */
+   /* on 64-bit the value cached in the pending_head.expired will be
+* updated atomically, so we can consult that for a quick check here
+* outside the lock */
if (likely(priv_timer[lcore_id].pending_head.expire > cur_time))
return;
 #endif
@@ -531,8 +532,10 @@ void rte_timer_manage(void)

/* if nothing to do just unlock and return */
if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL ||
-   priv_timer[lcore_id].pending_head.sl_next[0]->expire > cur_time)
-   goto done;
+   priv_timer[lcore_id].pending_head.sl_next[0]->expire > cur_time) {
+   rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
+   return;
+   }

/* save start of list of expired timers */
tim = priv_timer[lcore_id].pending_head.sl_next[0];
@@ -540,30 +543,47 @@ void rte_timer_manage(void)
/* break the existing list at current time point */
timer_get_prev_entries(cur_time, lcore_id, prev);
for (i = priv_timer[lcore_id].curr_skiplist_depth -1; i >= 0; i--) {
-   priv_timer[lcore_id].pending_head.sl_next[i] = prev[i]->sl_next[i];
+   priv_timer[lcore_id].pending_head.sl_next[i] =
+   prev[i]->sl_next[i];
if (prev[i]->sl_next[i] == NULL)
priv_timer[lcore_id].curr_skiplist_depth--;
prev[i] ->sl_next[i] = NULL;
}

-   /* now scan expired list and call callbacks */
+   /* transition run-list from PENDING to RUNNING */
+   run_first_tim = tim;
+   pprev = &run_first_tim;
+
for ( ; tim != NULL; tim = next_tim) {
next_tim = tim->sl_next[0];

ret = timer_set_running_state(tim);
+   if (likely(ret == 0)) {
+   pprev = &tim->sl_next[0];
+   } else {
+   /* another core is trying to re-config this one,
+* remove it from local expired list and put it
+* back on the priv_timer[] skip list */
+   *pprev = next_tim;
+   timer_add(tim, lcore_id, 1);
+   }
+   }

-   /* this timer was not pending, continue */
-   if (ret < 0)
-   continue;
+   /* update the next to expire timer value */
+   priv_timer[lcore_