Since we are bumping the urcu soname version number to 2.0.0 for the
upcoming urcu 0.8 anyway, it's time to merge this patch.

Thanks!

Mathieu

* Lai Jiangshan ([email protected]) wrote:
> @rcu_gp_ctr and @registry share the same cache line, which causes
> false sharing and slows down both the read side and the update side.
> 
> Fix: Use different cache lines for them.
> 
> Although rcu_gp_futex is updated less often than rcu_gp_ctr, they are
> always accessed at almost the same time, so we also move rcu_gp_futex
> to the cache line of rcu_gp_ctr to reduce the cache-line usage and cache
> misses on the read side.
> 
> 
> test: (4X6=24 CPUs)
> 
> Before patch:
> 
> [root@localhost userspace-rcu]# ./tests/test_urcu_mb 20 1 20
> SUMMARY ./tests/test_urcu_mb      testdur   20 nr_readers  20 rdur      0 
> wdur      0 nr_writers   1 wdelay      0 nr_reads   2100285330 nr_writes      
> 3390219 nr_ops   2103675549
> [root@localhost userspace-rcu]# ./tests/test_urcu_mb 20 1 20
> SUMMARY ./tests/test_urcu_mb      testdur   20 nr_readers  20 rdur      0 
> wdur      0 nr_writers   1 wdelay      0 nr_reads   1619868562 nr_writes      
> 3529478 nr_ops   1623398040
> [root@localhost userspace-rcu]# ./tests/test_urcu_mb 20 1 20
> SUMMARY ./tests/test_urcu_mb      testdur   20 nr_readers  20 rdur      0 
> wdur      0 nr_writers   1 wdelay      0 nr_reads   1949067038 nr_writes      
> 3469334 nr_ops   1952536372
> 
> 
> After patch:
> 
> [root@localhost userspace-rcu]# ./tests/test_urcu_mb 20 1 20
> SUMMARY ./tests/test_urcu_mb      testdur   20 nr_readers  20 rdur      0 
> wdur      0 nr_writers   1 wdelay      0 nr_reads   3380191848 nr_writes      
> 4903248 nr_ops   3385095096
> [root@localhost userspace-rcu]# ./tests/test_urcu_mb 20 1 20
> SUMMARY ./tests/test_urcu_mb      testdur   20 nr_readers  20 rdur      0 
> wdur      0 nr_writers   1 wdelay      0 nr_reads   3397637486 nr_writes      
> 4129809 nr_ops   3401767295
> 
> Signed-off-by: Lai Jiangshan <[email protected]>
> ---
> diff --git a/urcu.c b/urcu.c
> index 15def09..436d71c 100644
> --- a/urcu.c
> +++ b/urcu.c
> @@ -83,16 +83,7 @@ void __attribute__((destructor)) rcu_exit(void);
>  #endif
>  
>  static pthread_mutex_t rcu_gp_lock = PTHREAD_MUTEX_INITIALIZER;
> -
> -int32_t rcu_gp_futex;
> -
> -/*
> - * Global grace period counter.
> - * Contains the current RCU_GP_CTR_PHASE.
> - * Also has a RCU_GP_COUNT of 1, to accelerate the reader fast path.
> - * Written to only by writer with mutex taken. Read by both writer and 
> readers.
> - */
> -unsigned long rcu_gp_ctr = RCU_GP_COUNT;
> +struct urcu_gp rcu_gp = { .ctr = RCU_GP_COUNT };
>  
>  /*
>   * Written to only by each individual reader. Read by both the reader and the
> @@ -217,8 +208,8 @@ static void wait_gp(void)
>  {
>       /* Read reader_gp before read futex */
>       smp_mb_master(RCU_MB_GROUP);
> -     if (uatomic_read(&rcu_gp_futex) == -1)
> -             futex_async(&rcu_gp_futex, FUTEX_WAIT, -1,
> +     if (uatomic_read(&rcu_gp.futex) == -1)
> +             futex_async(&rcu_gp.futex, FUTEX_WAIT, -1,
>                     NULL, NULL, 0);
>  }
>  
> @@ -232,12 +223,12 @@ static void wait_for_readers(struct cds_list_head 
> *input_readers,
>       /*
>        * Wait for each thread URCU_TLS(rcu_reader).ctr to either
>        * indicate quiescence (not nested), or observe the current
> -      * rcu_gp_ctr value.
> +      * rcu_gp.ctr value.
>        */
>       for (;;) {
>               wait_loops++;
>               if (wait_loops == RCU_QS_ACTIVE_ATTEMPTS) {
> -                     uatomic_dec(&rcu_gp_futex);
> +                     uatomic_dec(&rcu_gp.futex);
>                       /* Write futex before read reader_gp */
>                       smp_mb_master(RCU_MB_GROUP);
>               }
> @@ -270,7 +261,7 @@ static void wait_for_readers(struct cds_list_head 
> *input_readers,
>                       if (wait_loops == RCU_QS_ACTIVE_ATTEMPTS) {
>                               /* Read reader_gp before write futex */
>                               smp_mb_master(RCU_MB_GROUP);
> -                             uatomic_set(&rcu_gp_futex, 0);
> +                             uatomic_set(&rcu_gp.futex, 0);
>                       }
>                       break;
>               } else {
> @@ -289,7 +280,7 @@ static void wait_for_readers(struct cds_list_head 
> *input_readers,
>                       if (wait_loops == RCU_QS_ACTIVE_ATTEMPTS) {
>                               /* Read reader_gp before write futex */
>                               smp_mb_master(RCU_MB_GROUP);
> -                             uatomic_set(&rcu_gp_futex, 0);
> +                             uatomic_set(&rcu_gp.futex, 0);
>                       }
>                       break;
>               } else {
> @@ -357,10 +348,10 @@ void synchronize_rcu(void)
>  
>       /*
>        * Must finish waiting for quiescent state for original parity before
> -      * committing next rcu_gp_ctr update to memory. Failure to do so could
> +      * committing next rcu_gp.ctr update to memory. Failure to do so could
>        * result in the writer waiting forever while new readers are always
>        * accessing data (no progress).  Enforce compiler-order of load
> -      * URCU_TLS(rcu_reader).ctr before store to rcu_gp_ctr.
> +      * URCU_TLS(rcu_reader).ctr before store to rcu_gp.ctr.
>        */
>       cmm_barrier();
>  
> @@ -372,13 +363,13 @@ void synchronize_rcu(void)
>       cmm_smp_mb();
>  
>       /* Switch parity: 0 -> 1, 1 -> 0 */
> -     CMM_STORE_SHARED(rcu_gp_ctr, rcu_gp_ctr ^ RCU_GP_CTR_PHASE);
> +     CMM_STORE_SHARED(rcu_gp.ctr, rcu_gp.ctr ^ RCU_GP_CTR_PHASE);
>  
>       /*
> -      * Must commit rcu_gp_ctr update to memory before waiting for quiescent
> +      * Must commit rcu_gp.ctr update to memory before waiting for quiescent
>        * state. Failure to do so could result in the writer waiting forever
>        * while new readers are always accessing data (no progress). Enforce
> -      * compiler-order of store to rcu_gp_ctr before load rcu_reader ctr.
> +      * compiler-order of store to rcu_gp.ctr before load rcu_reader ctr.
>        */
>       cmm_barrier();
>  
> diff --git a/urcu/static/urcu.h b/urcu/static/urcu.h
> index 973826a..0dd733e 100644
> --- a/urcu/static/urcu.h
> +++ b/urcu/static/urcu.h
> @@ -213,12 +213,20 @@ static inline void smp_mb_slave(int group)
>  #define RCU_GP_CTR_PHASE     (1UL << (sizeof(unsigned long) << 2))
>  #define RCU_GP_CTR_NEST_MASK (RCU_GP_CTR_PHASE - 1)
>  
> -/*
> - * Global quiescent period counter with low-order bits unused.
> - * Using a int rather than a char to eliminate false register dependencies
> - * causing stalls on some architectures.
> - */
> -extern unsigned long rcu_gp_ctr;
> +struct urcu_gp {
> +     /*
> +      * Global grace period counter.
> +      * Contains the current RCU_GP_CTR_PHASE.
> +      * Also has a RCU_GP_COUNT of 1, to accelerate the reader fast path.
> +      * Written to only by writer with mutex taken.
> +      * Read by both writer and readers.
> +      */
> +     unsigned long ctr;
> +
> +     int32_t futex;
> +} __attribute__((aligned(CAA_CACHE_LINE_SIZE)));
> +
> +extern struct urcu_gp rcu_gp;
>  
>  struct rcu_reader {
>       /* Data used by both reader and synchronize_rcu() */
> @@ -231,16 +239,14 @@ struct rcu_reader {
>  
>  extern DECLARE_URCU_TLS(struct rcu_reader, rcu_reader);
>  
> -extern int32_t rcu_gp_futex;
> -
>  /*
>   * Wake-up waiting synchronize_rcu(). Called from many concurrent threads.
>   */
>  static inline void wake_up_gp(void)
>  {
> -     if (caa_unlikely(uatomic_read(&rcu_gp_futex) == -1)) {
> -             uatomic_set(&rcu_gp_futex, 0);
> -             futex_async(&rcu_gp_futex, FUTEX_WAKE, 1,
> +     if (caa_unlikely(uatomic_read(&rcu_gp.futex) == -1)) {
> +             uatomic_set(&rcu_gp.futex, 0);
> +             futex_async(&rcu_gp.futex, FUTEX_WAKE, 1,
>                     NULL, NULL, 0);
>       }
>  }
> @@ -256,13 +262,13 @@ static inline enum rcu_state rcu_reader_state(unsigned 
> long *ctr)
>       v = CMM_LOAD_SHARED(*ctr);
>       if (!(v & RCU_GP_CTR_NEST_MASK))
>               return RCU_READER_INACTIVE;
> -     if (!((v ^ rcu_gp_ctr) & RCU_GP_CTR_PHASE))
> +     if (!((v ^ rcu_gp.ctr) & RCU_GP_CTR_PHASE))
>               return RCU_READER_ACTIVE_CURRENT;
>       return RCU_READER_ACTIVE_OLD;
>  }
>  
>  /*
> - * Helper for _rcu_read_lock().  The format of rcu_gp_ctr (as well as
> + * Helper for _rcu_read_lock().  The format of rcu_gp.ctr (as well as
>   * the per-thread rcu_reader.ctr) has the upper bits containing a count of
>   * _rcu_read_lock() nesting, and a lower-order bit that contains either zero
>   * or RCU_GP_CTR_PHASE.  The smp_mb_slave() ensures that the accesses in
> @@ -271,7 +277,7 @@ static inline enum rcu_state rcu_reader_state(unsigned 
> long *ctr)
>  static inline void _rcu_read_lock_update(unsigned long tmp)
>  {
>       if (caa_likely(!(tmp & RCU_GP_CTR_NEST_MASK))) {
> -             _CMM_STORE_SHARED(URCU_TLS(rcu_reader).ctr, 
> _CMM_LOAD_SHARED(rcu_gp_ctr));
> +             _CMM_STORE_SHARED(URCU_TLS(rcu_reader).ctr, 
> _CMM_LOAD_SHARED(rcu_gp.ctr));
>               smp_mb_slave(RCU_MB_GROUP);
>       } else
>               _CMM_STORE_SHARED(URCU_TLS(rcu_reader).ctr, tmp + RCU_GP_COUNT);

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

_______________________________________________
lttng-dev mailing list
[email protected]
http://lists.lttng.org/cgi-bin/mailman/listinfo/lttng-dev

Reply via email to