> It's still unproven whether it'd be an improvement, but you could expect to prove it one way or the other with a well-defined amount of testing.
I've hacked the code to use adaptive pthread mutexes instead of spinlocks. see attached patch. The patch is for the git head, but it can easily be applied for 9.1.3, which is what I did for my tests. This had disastrous effects on Solaris because it does not use anything similar to futexes for PTHREAD_PROCESS_SHARED mutexes (only the _PRIVATE mutexes do without syscalls for the simple case). But I was surprised to see that it works relatively well on linux. Here's a glimpse of my results: hacked code 9.1.3: -bash-4.1$ rsync -av --delete /tmp/test_template_data/ ../data/ ; /usr/bin/time ./postgres -D ../data -p 55502 & ppid=$! ; pid=$(pgrep -P $ppid ) ; sleep 15 ; ./pgbench -c 768 -t 20 -j 128 -p 55502 postgres ; kill $pid sending incremental file list ... ransaction type: TPC-B (sort of) scaling factor: 10 query mode: simple number of clients: 768 number of threads: 128 number of transactions per client: 20 number of transactions actually processed: 15360/15360 tps = 476.873261 (including connections establishing) tps = 485.964355 (excluding connections establishing) LOG: received smart shutdown request LOG: autovacuum launcher shutting down -bash-4.1$ LOG: shutting down LOG: database system is shut down 210.58user 78.88system 0:50.64elapsed 571%CPU (0avgtext+0avgdata 1995968maxresident)k 0inputs+1153872outputs (0major+2464649minor)pagefaults 0swaps original code (vanilla build on amd64) 9.1.3: -bash-4.1$ rsync -av --delete /tmp/test_template_data/ ../data/ ; /usr/bin/time ./postgres -D ../data -p 55502 & ppid=$! ; pid=$(pgrep -P $ppid ) ; sleep 15 ; ./pgbench -c 768 -t 20 -j 128 -p 55502 postgres ; kill $pid sending incremental file list ... 
transaction type: TPC-B (sort of) scaling factor: 10 query mode: simple number of clients: 768 number of threads: 128 number of transactions per client: 20 number of transactions actually processed: 15360/15360 tps = 499.993685 (including connections establishing) tps = 510.410883 (excluding connections establishing) LOG: received smart shutdown request -bash-4.1$ LOG: autovacuum launcher shutting down LOG: shutting down LOG: database system is shut down 196.21user 71.38system 0:47.99elapsed 557%CPU (0avgtext+0avgdata 1360800maxresident)k 0inputs+1147904outputs (0major+2375965minor)pagefaults 0swaps config: -bash-4.1$ egrep '^[a-z]' /tmp/test_template_data/postgresql.conf max_connections = 1800 # (change requires restart) shared_buffers = 10GB # min 128kB temp_buffers = 64MB # min 800kB work_mem = 256MB # min 64kB,d efault 1MB maintenance_work_mem = 2GB # min 1MB, default 16MB bgwriter_delay = 10ms # 10-10000ms between rounds bgwriter_lru_maxpages = 1000 # 0-1000 max buffers written/round bgwriter_lru_multiplier = 10.0 # 0-10.0 multipler on buffers scanned/round wal_level = hot_standby # minimal, archive, or hot_standby wal_buffers = 64MB # min 32kB, -1 sets based on shared_buffers commit_delay = 10000 # range 0-100000, in microseconds datestyle = 'iso, mdy' lc_messages = 'en_US.UTF-8' # locale for system error message lc_monetary = 'en_US.UTF-8' # locale for monetary formatting lc_numeric = 'en_US.UTF-8' # locale for number formatting lc_time = 'en_US.UTF-8' # locale for time formatting default_text_search_config = 'pg_catalog.english' seq_page_cost = 1.0 # measured on an arbitrary scale random_page_cost = 1.5 # same scale as above (default: 4.0) cpu_tuple_cost = 0.005 cpu_index_tuple_cost = 0.0025 cpu_operator_cost = 0.0001 effective_cache_size = 192GB So it looks like using pthread_mutexes could at least be an option on Linux. Using futexes directly could be even cheaper. 
As a side note, it looks like I have not expressed myself clearly: I did not intend to suggest replacing proven, working code (which is probably the best you can get for some platforms) with POSIX calls. I apologize for the provocative question. Regarding the actual production issue, I did not manage to synthetically provoke the saturation we are seeing in production using pgbench - I could not even get anywhere near the production load. So I cannot currently test whether reducing the amount of spinning and waking up exactly one waiter (which is what linux/nptl pthread_mutex_unlock does) would solve/mitigate the production issue I am working on, and I'd highly appreciate any pointers in this direction. Cheers, Nils
diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c index bc8d89f..a45fdf6 100644 --- a/src/backend/storage/lmgr/s_lock.c +++ b/src/backend/storage/lmgr/s_lock.c @@ -20,6 +20,8 @@ #include "storage/s_lock.h" +#ifndef USE_PTHREAD_SLOCK + slock_t dummy_spinlock; static int spins_per_delay = DEFAULT_SPINS_PER_DELAY; @@ -200,9 +202,95 @@ update_spins_per_delay(int shared_spins_per_delay) * because the definitions for these are split between this file and s_lock.h. */ +#endif /* !USE_PTHREAD_SLOCK */ #ifdef HAVE_SPINLOCKS /* skip spinlocks if requested */ +#ifdef USE_PTHREAD_SLOCK +#include <pthread.h> +void +posix_lock(volatile slock_t *lock, const char *file, int line) +{ + struct slock *s_lock = (slock_t *)lock; + int ret; + + /* XXX error recovery ! */ + + do { + ret = pthread_mutex_lock(&s_lock->mutex); + } while (ret != 0); + s_lock->held = 1; +} + +void +posix_unlock(volatile slock_t *lock, const char *file, int line) +{ + struct slock *s_lock = (slock_t *)lock; + int ret; + + /* XXX error recovery ! 
*/ + + s_lock->held = 0; + do { + ret = pthread_mutex_unlock(&s_lock->mutex); + } while (ret != 0); +} + +void +posix_init(volatile slock_t *lock) +{ + struct slock *s_lock = (slock_t *)lock; + pthread_mutexattr_t attr; + int ret; + + ret = pthread_mutexattr_init(&attr); + Assert(ret == 0); + +#ifdef PTHREAD_MUTEX_ADAPTIVE_NP + (void)pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP); +#else + (void)pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_NORMAL); +#endif + +#ifdef USE_PRIO_INHERIT + /* + * looks like this incurs significant syscall overhead on linux + */ + (void)pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT); +#endif + + ret = pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED); + Assert(ret == 0); + + s_lock->held = 0; + + ret = pthread_mutex_init(&s_lock->mutex, &attr); + Assert(ret == 0); + + pthread_mutexattr_destroy(&attr); +} + +int +posix_lock_free(volatile slock_t *lock) +{ + struct slock *s_lock = (slock_t *)lock; + + return (s_lock->held == 0); +} + +#if 0 +void +posix_lock_free(slock_t *lock) +{ + struct slock *s_lock = lock; + int ret; + + ret = pthread_mutex_destroy(&s_lock->mutex); + Assert(ret == 0); +} +#endif + +#else /* !USE_PTHREAD_SLOCK */ #if defined(__GNUC__) @@ -282,6 +370,7 @@ tas_dummy() /* really means: extern int tas(slock_t } #endif /* sun3 */ #endif /* not __GNUC__ */ +#endif /* USE_PTHREAD_SLOCK */ #endif /* HAVE_SPINLOCKS */ diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h index d4a783f..200af8f 100644 --- a/src/include/storage/s_lock.h +++ b/src/include/storage/s_lock.h @@ -97,6 +97,30 @@ #ifdef HAVE_SPINLOCKS /* skip spinlocks if requested */ +#ifdef USE_PTHREAD_SLOCK +#include <pthread.h> +struct slock { + pthread_mutex_t mutex; + int held; +}; + +typedef struct slock slock_t; + +extern void posix_lock(volatile slock_t *lock, const char *file, int line); +extern void posix_unlock(volatile slock_t *lock, const char *file, int line); +extern void posix_init(volatile slock_t 
*lock); +extern int posix_lock_free(volatile slock_t *lock); + +#define S_LOCK(lock) posix_lock((lock), __FILE__, __LINE__) +#define S_UNLOCK(lock) posix_unlock((lock), __FILE__, __LINE__) +#define S_INIT_LOCK(lock) posix_init(lock) +#define S_LOCK_FREE(lock) posix_lock_free(lock) +#define SPIN_DELAY() ((void) 0) + +#define set_spins_per_delay(s) ((void) 0) +#define update_spins_per_delay(s) (0) +#define DEFAULT_SPINS_PER_DELAY 0 +#else /* !USE_PTHREAD_SLOCK */ #if defined(__GNUC__) || defined(__INTEL_COMPILER) /************************************************************************* @@ -947,7 +971,7 @@ spin_delay(void) #error PostgreSQL does not have native spinlock support on this platform. To continue the compilation, rerun configure using --disable-spinlocks. However, performance will be poor. Please report this to pgsql-b...@postgresql.org. #endif - +#endif /* USE_PTHREAD_SLOCK */ #else /* !HAVE_SPINLOCKS */ @@ -968,7 +992,6 @@ extern int tas_sema(volatile slock_t *lock); #define S_INIT_LOCK(lock) s_init_lock_sema(lock) #define TAS(lock) tas_sema(lock) - #endif /* HAVE_SPINLOCKS */ @@ -996,6 +1019,8 @@ extern int tas_sema(volatile slock_t *lock); #define S_INIT_LOCK(lock) S_UNLOCK(lock) #endif /* S_INIT_LOCK */ +#ifndef USE_PTHREAD_SLOCK + #if !defined(SPIN_DELAY) #define SPIN_DELAY() ((void) 0) #endif /* SPIN_DELAY */ @@ -1004,6 +1029,7 @@ extern int tas_sema(volatile slock_t *lock); extern int tas(volatile slock_t *lock); /* in port/.../tas.s, or * s_lock.c */ + #define TAS(lock) tas(lock) #endif /* TAS */ @@ -1022,5 +1048,6 @@ extern void s_lock(volatile slock_t *lock, const char *file, int line); extern void set_spins_per_delay(int shared_spins_per_delay); extern int update_spins_per_delay(int shared_spins_per_delay); +#endif /* !USE_PTHREAD_SLOCK */ #endif /* S_LOCK_H */
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers