Hi,

I think there is a concurrency issue with Spinlocks in linuxmodule
multi-threaded click (running a 2.6.19.2 patched kernel, the e1000-NAPI
driver and today's trunk). I've put together a temporary patch, but there
might be further issues. Sorry, I don't have the time to investigate more.


Problems:
The series of events are:
-Thread A is running on CPU 0 and thread B is running on CPU 1.
-A acquires the Spinlock and executes code in the critical section.
-Linux schedules Thread A to run on CPU 1 and thread B to run on CPU 0.
-B acquires the Spinlock (works because CPU 0 is the owner of the lock,
not the thread)
-A and B are both operating in the critical section
-(if you're lucky) A releases the Spinlock and generates a “releasing
someone else's lock” message
-(if you're unlucky) Oops

Attached is a test element and a configuration to elicit the behavior. I
could only replicate the problem when using a polling input source under
heavy load.

After fixing the above problem I was still seeing two threads inside the
CS. For some reason the atomic_uint32_t::swap was not working atomically
as expected. Changing this to atomic_uint32::compare_and_swap solved the
problem. We're running x86_64 quad core Xeon CPUs.


Solution:
In the patch attached I added a new function click_current_thread() which
returns a thread_info pointer. Inside of Spinlock I replace
click_current_processor() uses with click_current_thread(). This way even
if the thread is assigned to another core it will still have the same
thread_info pointer.

The problem with this solution is that it does not work as an index into
an array nicely and cannot simply replace all uses of
click_current_processor(). There may be other places in the code which
make the same incorrect use of click_current_processor(), but a better
approach will be needed if the value is used to index into an array (such
as in ReadWriteLock).


Thanks!
Kevin Springborn
--- click.orig/include/click/glue.hh
+++ click.new/include/click/glue.hh
@@ -336,6 +336,42 @@
 #endif
 }
 
+// THREAD IDENTITIES
+#if CLICK_LINUXMODULE
+typedef thread_info* click_thread_t;
+#elif CLICK_USERLEVEL && HAVE_MULTITHREAD
+typedef pthread_t click_thread_t;
+#else
+typedef int8_t click_thread_t;
+#endif
+
+inline click_thread_t
+click_current_thread()
+{
+#if CLICK_LINUXMODULE
+# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 0)
+    return current_thread_info();
+# else
+    return current->processor;
+# endif
+#elif CLICK_USERLEVEL && HAVE_MULTITHREAD
+    return pthread_self();
+#else
+    return 0;
+#endif
+}
+
+inline click_thread_t
+click_invalid_thread()
+{
+#if CLICK_LINUXMODULE
+    return NULL;
+#elif CLICK_USERLEVEL && HAVE_MULTITHREAD
+    return 0;
+#else
+    return -1;
+#endif
+}
 
 // TIMEVALS AND JIFFIES
 // click_jiffies_t is the type of click_jiffies() and is usually unsigned.
--- click.orig/include/click/sync.hh
+++ click.new/include/click/sync.hh
@@ -64,7 +64,7 @@
   private:
     atomic_uint32_t _lock;
     int32_t _depth;
-    click_processor_t _owner;
+    click_thread_t _owner;
 #endif
   
 };
@@ -73,7 +73,7 @@
 inline
 Spinlock::Spinlock()
 #if CLICK_MULTITHREAD_SPINLOCK
-    : _depth(0), _owner(click_invalid_processor())
+    : _depth(0), _owner(click_invalid_thread())
 #endif
 {
 #if CLICK_MULTITHREAD_SPINLOCK
@@ -101,11 +101,11 @@
 Spinlock::acquire()
 {
 #if CLICK_MULTITHREAD_SPINLOCK
-    if (_owner != click_current_processor()) {
-	while (_lock.swap(1) != 0)
+    if (_owner != click_current_thread()) {
+	while (!_lock.compare_and_swap(0, 1))
 	    while (_lock != 0)
 		asm volatile ("" : : : "memory");
-	_owner = click_current_processor();
+	_owner = click_current_thread();
     }
     _depth++;
 #endif
@@ -121,10 +121,10 @@
 Spinlock::attempt()
 {
 #if CLICK_MULTITHREAD_SPINLOCK
-    if (_owner != click_current_processor()) {
-	if (_lock.swap(1) != 0)
+    if (_owner != click_current_thread()) {
+	if (!_lock.compare_and_swap(0, 1))
 	    return false;
-	_owner = click_current_processor();
+	_owner = click_current_thread();
     }
     _depth++;
     return true;
@@ -142,11 +142,11 @@
 Spinlock::release()
 {
 #if CLICK_MULTITHREAD_SPINLOCK
-    if (unlikely(_owner != click_current_processor()))
+    if (unlikely(_owner != click_current_thread()))
 	click_chatter("releasing someone else's lock");
     if (likely(_depth > 0)) {
 	if (--_depth == 0) {
-	    _owner = click_invalid_processor();
+	    _owner = click_invalid_thread();
 	    _lock = 0;
 	}
     } else
#include <click/config.h>
#include "test.hh"
#include <click/glue.hh>

CLICK_DECLS

void 
TestElement::push(int, Packet * p_in)
{
  click_spinlock.acquire();
  for (int i = 0; i < 10000; i++)
    asm volatile ("" : : : "memory");
  count++;
  click_spinlock.release();

  checked_output_push(0, p_in);
}


CLICK_ENDDECLS
EXPORT_ELEMENT(TestElement)
#ifndef TEST_ELEMENT_HH
#define TEST_ELEMENT_HH
#include <click/element.hh>
#include <click/sync.hh>

CLICK_DECLS

class TestElement : public Element {
public:
  TestElement() 
  {
    count = 0;
  }
  ~TestElement(){click_chatter("Count %u\n", count);}

  const char * class_name() const { return "TestElement";}
  int configure(Vector<String> &conf, ErrorHandler *errh){return 1;}
  const char *port_count() const { return "1/1"; }

  void push(int x, Packet * p_in);

private:
  uint count;
  Spinlock click_spinlock;
};

CLICK_ENDDECLS
#endif

Attachment: test.click
Description: Binary data

_______________________________________________
click mailing list
[email protected]
https://amsterdam.lcs.mit.edu/mailman/listinfo/click

Reply via email to