Hi, Mathieu, There is a big compatibility problem in URCU which should be fixed in the next round.
LB: liburcu built on the system which has sys_membarrier(). LU: liburcu built on the system which does NOT have sys_membarrier(). LBM: liburcu-mb .... LUM: liburcu-mb ... AB: application(-lliburcu) built on the system which has sys_membarrier(). AU: application(-lliburcu) built on the system which does NOT have sys_membarrier(). ABM: application(-lliburcu-mb) ... AUM: application(-lliburcu-mb) ... AB/AU + LB/LU: 4 combinations ABM/AUM + LBM/LUM: 4 combinations I remember some of the 8 combinations can't work because their symbols are mismatched. Only LU+AB and LB+AU? Could you check it? How to fix it: in LU and AU, keep all the symbol names/ABI the same as in LB and AB, but have only the behavior fall back to URCU_MB. Thanks, Lai On 05/06/2013 08:44 PM, Mathieu Desnoyers wrote: > Since we are bumping the urcu soname version number to 2.0.0 for the > upcoming urcu 0.8 anyway, it's time to merge this patch. > > Thanks! > > Mathieu > > * Lai Jiangshan ([email protected]) wrote: >> @rcu_gp_ctr and @registry share the same cache line, which causes >> false sharing and slows down both the read site and the update site. >> >> Fix: Use a different cache line for each of them. >> >> Although rcu_gp_futex is updated less often than rcu_gp_ctr, >> they are almost always accessed at the same time, so we also move rcu_gp_futex >> to the cache line of rcu_gp_ctr to reduce the cache-line usage and cache misses >> of the read site. 
>> >> >> test: (4X6=24 CPUs) >> >> Before patch: >> >> [root@localhost userspace-rcu]# ./tests/test_urcu_mb 20 1 20 >> SUMMARY ./tests/test_urcu_mb testdur 20 nr_readers 20 rdur 0 >> wdur 0 nr_writers 1 wdelay 0 nr_reads 2100285330 nr_writes >> 3390219 nr_ops 2103675549 >> [root@localhost userspace-rcu]# ./tests/test_urcu_mb 20 1 20 >> SUMMARY ./tests/test_urcu_mb testdur 20 nr_readers 20 rdur 0 >> wdur 0 nr_writers 1 wdelay 0 nr_reads 1619868562 nr_writes >> 3529478 nr_ops 1623398040 >> [root@localhost userspace-rcu]# ./tests/test_urcu_mb 20 1 20 >> SUMMARY ./tests/test_urcu_mb testdur 20 nr_readers 20 rdur 0 >> wdur 0 nr_writers 1 wdelay 0 nr_reads 1949067038 nr_writes >> 3469334 nr_ops 1952536372 >> >> >> after patch: >> >> [root@localhost userspace-rcu]# ./tests/test_urcu_mb 20 1 20 >> SUMMARY ./tests/test_urcu_mb testdur 20 nr_readers 20 rdur 0 >> wdur 0 nr_writers 1 wdelay 0 nr_reads 3380191848 nr_writes >> 4903248 nr_ops 3385095096 >> [root@localhost userspace-rcu]# ./tests/test_urcu_mb 20 1 20 >> SUMMARY ./tests/test_urcu_mb testdur 20 nr_readers 20 rdur 0 >> wdur 0 nr_writers 1 wdelay 0 nr_reads 3397637486 nr_writes >> 4129809 nr_ops 3401767295 >> >> Signed-off-by: Lai Jiangshan <[email protected]> >> --- >> diff --git a/urcu.c b/urcu.c >> index 15def09..436d71c 100644 >> --- a/urcu.c >> +++ b/urcu.c >> @@ -83,16 +83,7 @@ void __attribute__((destructor)) rcu_exit(void); >> #endif >> >> static pthread_mutex_t rcu_gp_lock = PTHREAD_MUTEX_INITIALIZER; >> - >> -int32_t rcu_gp_futex; >> - >> -/* >> - * Global grace period counter. >> - * Contains the current RCU_GP_CTR_PHASE. >> - * Also has a RCU_GP_COUNT of 1, to accelerate the reader fast path. >> - * Written to only by writer with mutex taken. Read by both writer and >> readers. >> - */ >> -unsigned long rcu_gp_ctr = RCU_GP_COUNT; >> +struct urcu_gp rcu_gp = { .ctr = RCU_GP_COUNT }; >> >> /* >> * Written to only by each individual reader. 
Read by both the reader and >> the >> @@ -217,8 +208,8 @@ static void wait_gp(void) >> { >> /* Read reader_gp before read futex */ >> smp_mb_master(RCU_MB_GROUP); >> - if (uatomic_read(&rcu_gp_futex) == -1) >> - futex_async(&rcu_gp_futex, FUTEX_WAIT, -1, >> + if (uatomic_read(&rcu_gp.futex) == -1) >> + futex_async(&rcu_gp.futex, FUTEX_WAIT, -1, >> NULL, NULL, 0); >> } >> >> @@ -232,12 +223,12 @@ static void wait_for_readers(struct cds_list_head >> *input_readers, >> /* >> * Wait for each thread URCU_TLS(rcu_reader).ctr to either >> * indicate quiescence (not nested), or observe the current >> - * rcu_gp_ctr value. >> + * rcu_gp.ctr value. >> */ >> for (;;) { >> wait_loops++; >> if (wait_loops == RCU_QS_ACTIVE_ATTEMPTS) { >> - uatomic_dec(&rcu_gp_futex); >> + uatomic_dec(&rcu_gp.futex); >> /* Write futex before read reader_gp */ >> smp_mb_master(RCU_MB_GROUP); >> } >> @@ -270,7 +261,7 @@ static void wait_for_readers(struct cds_list_head >> *input_readers, >> if (wait_loops == RCU_QS_ACTIVE_ATTEMPTS) { >> /* Read reader_gp before write futex */ >> smp_mb_master(RCU_MB_GROUP); >> - uatomic_set(&rcu_gp_futex, 0); >> + uatomic_set(&rcu_gp.futex, 0); >> } >> break; >> } else { >> @@ -289,7 +280,7 @@ static void wait_for_readers(struct cds_list_head >> *input_readers, >> if (wait_loops == RCU_QS_ACTIVE_ATTEMPTS) { >> /* Read reader_gp before write futex */ >> smp_mb_master(RCU_MB_GROUP); >> - uatomic_set(&rcu_gp_futex, 0); >> + uatomic_set(&rcu_gp.futex, 0); >> } >> break; >> } else { >> @@ -357,10 +348,10 @@ void synchronize_rcu(void) >> >> /* >> * Must finish waiting for quiescent state for original parity before >> - * committing next rcu_gp_ctr update to memory. Failure to do so could >> + * committing next rcu_gp.ctr update to memory. Failure to do so could >> * result in the writer waiting forever while new readers are always >> * accessing data (no progress). Enforce compiler-order of load >> - * URCU_TLS(rcu_reader).ctr before store to rcu_gp_ctr. 
>> + * URCU_TLS(rcu_reader).ctr before store to rcu_gp.ctr. >> */ >> cmm_barrier(); >> >> @@ -372,13 +363,13 @@ void synchronize_rcu(void) >> cmm_smp_mb(); >> >> /* Switch parity: 0 -> 1, 1 -> 0 */ >> - CMM_STORE_SHARED(rcu_gp_ctr, rcu_gp_ctr ^ RCU_GP_CTR_PHASE); >> + CMM_STORE_SHARED(rcu_gp.ctr, rcu_gp.ctr ^ RCU_GP_CTR_PHASE); >> >> /* >> - * Must commit rcu_gp_ctr update to memory before waiting for quiescent >> + * Must commit rcu_gp.ctr update to memory before waiting for quiescent >> * state. Failure to do so could result in the writer waiting forever >> * while new readers are always accessing data (no progress). Enforce >> - * compiler-order of store to rcu_gp_ctr before load rcu_reader ctr. >> + * compiler-order of store to rcu_gp.ctr before load rcu_reader ctr. >> */ >> cmm_barrier(); >> >> diff --git a/urcu/static/urcu.h b/urcu/static/urcu.h >> index 973826a..0dd733e 100644 >> --- a/urcu/static/urcu.h >> +++ b/urcu/static/urcu.h >> @@ -213,12 +213,20 @@ static inline void smp_mb_slave(int group) >> #define RCU_GP_CTR_PHASE (1UL << (sizeof(unsigned long) << 2)) >> #define RCU_GP_CTR_NEST_MASK (RCU_GP_CTR_PHASE - 1) >> >> -/* >> - * Global quiescent period counter with low-order bits unused. >> - * Using a int rather than a char to eliminate false register dependencies >> - * causing stalls on some architectures. >> - */ >> -extern unsigned long rcu_gp_ctr; >> +struct urcu_gp { >> + /* >> + * Global grace period counter. >> + * Contains the current RCU_GP_CTR_PHASE. >> + * Also has a RCU_GP_COUNT of 1, to accelerate the reader fast path. >> + * Written to only by writer with mutex taken. >> + * Read by both writer and readers. 
>> + */ >> + unsigned long ctr; >> + >> + int32_t futex; >> +} __attribute__((aligned(CAA_CACHE_LINE_SIZE))); >> + >> +extern struct urcu_gp rcu_gp; >> >> struct rcu_reader { >> /* Data used by both reader and synchronize_rcu() */ >> @@ -231,16 +239,14 @@ struct rcu_reader { >> >> extern DECLARE_URCU_TLS(struct rcu_reader, rcu_reader); >> >> -extern int32_t rcu_gp_futex; >> - >> /* >> * Wake-up waiting synchronize_rcu(). Called from many concurrent threads. >> */ >> static inline void wake_up_gp(void) >> { >> - if (caa_unlikely(uatomic_read(&rcu_gp_futex) == -1)) { >> - uatomic_set(&rcu_gp_futex, 0); >> - futex_async(&rcu_gp_futex, FUTEX_WAKE, 1, >> + if (caa_unlikely(uatomic_read(&rcu_gp.futex) == -1)) { >> + uatomic_set(&rcu_gp.futex, 0); >> + futex_async(&rcu_gp.futex, FUTEX_WAKE, 1, >> NULL, NULL, 0); >> } >> } >> @@ -256,13 +262,13 @@ static inline enum rcu_state rcu_reader_state(unsigned >> long *ctr) >> v = CMM_LOAD_SHARED(*ctr); >> if (!(v & RCU_GP_CTR_NEST_MASK)) >> return RCU_READER_INACTIVE; >> - if (!((v ^ rcu_gp_ctr) & RCU_GP_CTR_PHASE)) >> + if (!((v ^ rcu_gp.ctr) & RCU_GP_CTR_PHASE)) >> return RCU_READER_ACTIVE_CURRENT; >> return RCU_READER_ACTIVE_OLD; >> } >> >> /* >> - * Helper for _rcu_read_lock(). The format of rcu_gp_ctr (as well as >> + * Helper for _rcu_read_lock(). The format of rcu_gp.ctr (as well as >> * the per-thread rcu_reader.ctr) has the upper bits containing a count of >> * _rcu_read_lock() nesting, and a lower-order bit that contains either zero >> * or RCU_GP_CTR_PHASE. 
The smp_mb_slave() ensures that the accesses in >> @@ -271,7 +277,7 @@ static inline enum rcu_state rcu_reader_state(unsigned >> long *ctr) >> static inline void _rcu_read_lock_update(unsigned long tmp) >> { >> if (caa_likely(!(tmp & RCU_GP_CTR_NEST_MASK))) { >> - _CMM_STORE_SHARED(URCU_TLS(rcu_reader).ctr, >> _CMM_LOAD_SHARED(rcu_gp_ctr)); >> + _CMM_STORE_SHARED(URCU_TLS(rcu_reader).ctr, >> _CMM_LOAD_SHARED(rcu_gp.ctr)); >> smp_mb_slave(RCU_MB_GROUP); >> } else >> _CMM_STORE_SHARED(URCU_TLS(rcu_reader).ctr, tmp + RCU_GP_COUNT); > _______________________________________________ lttng-dev mailing list [email protected] http://lists.lttng.org/cgi-bin/mailman/listinfo/lttng-dev
