Hello, Robert

> Thanks, this looks way better.  Some more comments:
> 
> - I don't think there's any good reason for this patch to change the
> calling convention for ShmemInitHash().  Maybe that's a good idea and
> maybe it isn't, but it's a separate issue from what this patch is
> doing, and if we're going to do it at all, it should be discussed
> separately.  Let's leave it out of this patch.
> 
> - I would not provide an option to change the number of freelist
> mutexes.  Let's dump DEFAULT_MUTEXES_NUM and MAX_MUTEXES_NUM and have
> FREELIST_MUTEXES_NUM.  The value of 32 which you selected is fine with
> me.
> 
> - The change to LOG2_NUM_LOCK_PARTITIONS in lwlock.h looks like an
> independent change.  Again, leave it out of this patch and submit it
> separately with its own benchmarking data if you want to argue for it.
> 
> I think if you make these changes this patch will be quite a bit
> smaller and in pretty good shape.
> 
> Thanks for working on this.
> 

Here is a corrected patch. I decided to keep a small change to the
InitLocks procedure (lock.c), since the ShmemInitHash description
clearly states: "For a table whose maximum size is certain,
[init_size] should be equal to max_size". I also added two items to
my TODO list, so I will send more patches as soon as (and if) this
patch is applied.
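
In other words, the calls in InitLocks now pass max_table_size for
both arguments, e.g. (as in the patch below):

    LockMethodLockHash = ShmemInitHash("LOCK hash",
                                       max_table_size,   /* init_size */
                                       max_table_size,   /* max_size */
                                       &info,
                                       HASH_ELEM | HASH_BLOBS | HASH_PARTITION);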

Thanks for your help with this patch.
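
Since the interesting part of the diff is the dynahash change, here
is a short standalone sketch of the idea: an array of MUTEXES_NUM
spinlock-protected freelists instead of a single one, with borrowing
between freelists when one runs empty. This is only an illustration
with invented names (NFREELISTS, Header, Element, lock_t), not the
patch code; the real get_hash_entry() first tries element_alloc()
and borrows only if that allocation fails.

#include <stddef.h>

#define NFREELISTS 32               /* must be a power of 2 */

typedef struct Element
{
    struct Element *link;           /* next free element */
} Element;

typedef int lock_t;                 /* stand-in for slock_t */
static void lock_acquire(lock_t *lock) { (void) lock; /* SpinLockAcquire() */ }
static void lock_release(lock_t *lock) { (void) lock; /* SpinLockRelease() */ }

typedef struct Header
{
    lock_t   mutex[NFREELISTS];     /* one spinlock per freelist */
    long     nentries[NFREELISTS];  /* per-freelist entry counts */
    Element *freeList[NFREELISTS];  /* per-freelist chains of free elements */
} Header;

/* Map a hash value to a freelist, as FREELIST_IDX() does in the patch. */
#define FREELIST_IDX(hashvalue) ((hashvalue) % NFREELISTS)

static Element *
get_entry(Header *hdr, unsigned int hashvalue)
{
    int      idx = FREELIST_IDX(hashvalue);
    int      i;
    Element *elem;

    /* Fast path: take an element from "our" freelist. */
    lock_acquire(&hdr->mutex[idx]);
    elem = hdr->freeList[idx];
    if (elem != NULL)
    {
        hdr->freeList[idx] = elem->link;
        hdr->nentries[idx]++;
        lock_release(&hdr->mutex[idx]);
        return elem;
    }
    lock_release(&hdr->mutex[idx]);

    /* Slow path: borrow from the next non-empty freelist, wrapping around. */
    for (i = (idx + 1) % NFREELISTS; i != idx; i = (i + 1) % NFREELISTS)
    {
        lock_acquire(&hdr->mutex[i]);
        elem = hdr->freeList[i];
        if (elem != NULL)
        {
            hdr->freeList[i] = elem->link;
            lock_release(&hdr->mutex[i]);

            /* The borrowed entry is accounted to the original freelist. */
            lock_acquire(&hdr->mutex[idx]);
            hdr->nentries[idx]++;
            lock_release(&hdr->mutex[idx]);
            return elem;
        }
        lock_release(&hdr->mutex[i]);
    }

    return NULL;                    /* every freelist is empty */
}

The patch itself follows.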
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index e3e9599..5296e77 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -374,18 +374,10 @@ void
 InitLocks(void)
 {
 	HASHCTL		info;
-	long		init_table_size,
-				max_table_size;
+	long		max_table_size;
 	bool		found;
 
 	/*
-	 * Compute init/max size to request for lock hashtables.  Note these
-	 * calculations must agree with LockShmemSize!
-	 */
-	max_table_size = NLOCKENTS();
-	init_table_size = max_table_size / 2;
-
-	/*
 	 * Allocate hash table for LOCK structs.  This stores per-locked-object
 	 * information.
 	 */
@@ -393,16 +385,16 @@ InitLocks(void)
 	info.keysize = sizeof(LOCKTAG);
 	info.entrysize = sizeof(LOCK);
 	info.num_partitions = NUM_LOCK_PARTITIONS;
+	max_table_size = NLOCKENTS();
 
 	LockMethodLockHash = ShmemInitHash("LOCK hash",
-									   init_table_size,
+									   max_table_size,
 									   max_table_size,
 									   &info,
 									HASH_ELEM | HASH_BLOBS | HASH_PARTITION);
 
 	/* Assume an average of 2 holders per lock */
 	max_table_size *= 2;
-	init_table_size *= 2;
 
 	/*
 	 * Allocate hash table for PROCLOCK structs.  This stores
@@ -414,7 +406,7 @@ InitLocks(void)
 	info.num_partitions = NUM_LOCK_PARTITIONS;
 
 	LockMethodProcLockHash = ShmemInitHash("PROCLOCK hash",
-										   init_table_size,
+										   max_table_size,
 										   max_table_size,
 										   &info,
 								 HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c
index 24a53da..1a3244a 100644
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -15,7 +15,7 @@
  * to hash_create.  This prevents any attempt to split buckets on-the-fly.
  * Therefore, each hash bucket chain operates independently, and no fields
  * of the hash header change after init except nentries and freeList.
- * A partitioned table uses a spinlock to guard changes of those two fields.
+ * A partitioned table uses spinlocks to guard changes of those fields.
  * This lets any subset of the hash buckets be treated as a separately
  * lockable partition.  We expect callers to use the low-order bits of a
  * lookup key's hash value as a partition number --- this will work because
@@ -111,6 +111,11 @@
 #define DEF_DIRSIZE			   256
 #define DEF_FFACTOR			   1	/* default fill factor */
 
+/*
+ * Number of mutexes, and correspondingly the number of nentries counters
+ * and freeLists (see below). Should be a power of 2.
+ */
+#define MUTEXES_NUM    32
 
 /* A hash bucket is a linked list of HASHELEMENTs */
 typedef HASHELEMENT *HASHBUCKET;
@@ -128,12 +133,23 @@ typedef HASHBUCKET *HASHSEGMENT;
  */
 struct HASHHDR
 {
-	/* In a partitioned table, take this lock to touch nentries or freeList */
-	slock_t		mutex;			/* unused if not partitioned table */
-
-	/* These fields change during entry addition/deletion */
-	long		nentries;		/* number of entries in hash table */
-	HASHELEMENT *freeList;		/* linked list of free elements */
+	/*
+	 * There are two fields declared below: nentries and freeList. nentries
+	 * stores the current number of entries in the hash table; freeList is a
+	 * linked list of free elements.
+	 *
+	 * To keep these fields consistent in a partitioned table, access to them
+	 * has to be synchronized with a spinlock. It turned out, however, that a
+	 * single spinlock can become a bottleneck. To reduce lock contention, an
+	 * array of MUTEXES_NUM spinlocks is used instead; each spinlock protects
+	 * one element of the nentries and freeList arrays.
+	 *
+	 * If the hash table is not partitioned, only nentries[0] and freeList[0]
+	 * are used, and the spinlocks are not used at all.
+	 */
+	slock_t		mutex[MUTEXES_NUM];		/* array of spinlocks */
+	long		nentries[MUTEXES_NUM];	/* number of entries */
+	HASHELEMENT *freeList[MUTEXES_NUM]; /* lists of free elements */
 
 	/* These fields can change, but not in a partitioned table */
 	/* Also, dsize can't change in a shared table, even if unpartitioned */
@@ -166,6 +182,8 @@ struct HASHHDR
 
 #define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)
 
+#define FREELIST_IDX(hctl, hashcode) (IS_PARTITIONED(hctl) ? (hashcode) % MUTEXES_NUM : 0)
+
 /*
  * Top control structure for a hashtable --- in a shared table, each backend
  * has its own copy (OK since no fields change at runtime)
@@ -219,10 +237,10 @@ static long hash_accesses,
  */
 static void *DynaHashAlloc(Size size);
 static HASHSEGMENT seg_alloc(HTAB *hashp);
-static bool element_alloc(HTAB *hashp, int nelem);
+static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
 static bool dir_realloc(HTAB *hashp);
 static bool expand_table(HTAB *hashp);
-static HASHBUCKET get_hash_entry(HTAB *hashp);
+static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
 static void hdefault(HTAB *hashp);
 static int	choose_nelem_alloc(Size entrysize);
 static bool init_htab(HTAB *hashp, long nelem);
@@ -282,6 +300,10 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
 {
 	HTAB	   *hashp;
 	HASHHDR    *hctl;
+	int			i,
+				partitions_number,
+				nelem_alloc,
+				nelem_alloc_first;
 
 	/*
 	 * For shared hash tables, we have a local hash header (HTAB struct) that
@@ -482,10 +504,34 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
 	if ((flags & HASH_SHARED_MEM) ||
 		nelem < hctl->nelem_alloc)
 	{
-		if (!element_alloc(hashp, (int) nelem))
-			ereport(ERROR,
-					(errcode(ERRCODE_OUT_OF_MEMORY),
-					 errmsg("out of memory")));
+		/*
+		 * If the hash table is partitioned, all freeLists receive an equal
+		 * number of elements. Otherwise only freeList[0] is used.
+		 */
+		if (IS_PARTITIONED(hashp->hctl))
+			partitions_number = MUTEXES_NUM;
+		else
+			partitions_number = 1;
+
+		nelem_alloc = nelem / partitions_number;
+		if (nelem_alloc == 0)
+			nelem_alloc = 1;
+
+		if (nelem_alloc * partitions_number < nelem)
+			/* give the remainder to the first freelist */
+			nelem_alloc_first = nelem - nelem_alloc * (partitions_number - 1);
+		else
+			nelem_alloc_first = nelem_alloc;
+
+		for (i = 0; i < partitions_number; i++)
+		{
+			int			temp = (i == 0) ? nelem_alloc_first : nelem_alloc;
+
+			if (!element_alloc(hashp, temp, i))
+				ereport(ERROR,
+						(errcode(ERRCODE_OUT_OF_MEMORY),
+						 errmsg("out of memory")));
+		}
 	}
 
 	if (flags & HASH_FIXED_SIZE)
@@ -503,9 +549,6 @@ hdefault(HTAB *hashp)
 
 	MemSet(hctl, 0, sizeof(HASHHDR));
 
-	hctl->nentries = 0;
-	hctl->freeList = NULL;
-
 	hctl->dsize = DEF_DIRSIZE;
 	hctl->nsegs = 0;
 
@@ -572,12 +615,14 @@ init_htab(HTAB *hashp, long nelem)
 	HASHSEGMENT *segp;
 	int			nbuckets;
 	int			nsegs;
+	int			i;
 
 	/*
-	 * initialize mutex if it's a partitioned table
+	 * initialize mutexes if it's a partitioned table
 	 */
 	if (IS_PARTITIONED(hctl))
-		SpinLockInit(&hctl->mutex);
+		for (i = 0; i < MUTEXES_NUM; i++)
+			SpinLockInit(&hctl->mutex[i]);
 
 	/*
 	 * Divide number of elements by the fill factor to determine a desired
@@ -648,7 +693,7 @@ init_htab(HTAB *hashp, long nelem)
 			"HIGH MASK       ", hctl->high_mask,
 			"LOW  MASK       ", hctl->low_mask,
 			"NSEGS           ", hctl->nsegs,
-			"NENTRIES        ", hctl->nentries);
+			"NENTRIES        ", hash_get_num_entries(hashp));
 #endif
 	return true;
 }
@@ -769,7 +814,7 @@ hash_stats(const char *where, HTAB *hashp)
 			where, hashp->hctl->accesses, hashp->hctl->collisions);
 
 	fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
-			hashp->hctl->nentries, (long) hashp->hctl->keysize,
+			hash_get_num_entries(hashp), (long) hashp->hctl->keysize,
 			hashp->hctl->max_bucket, hashp->hctl->nsegs);
 	fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
 			where, hash_accesses, hash_collisions);
@@ -863,6 +908,7 @@ hash_search_with_hash_value(HTAB *hashp,
 	HASHBUCKET	currBucket;
 	HASHBUCKET *prevBucketPtr;
 	HashCompareFunc match;
+	int			freelist_idx = FREELIST_IDX(hctl, hashvalue);
 
 #if HASH_STATISTICS
 	hash_accesses++;
@@ -885,7 +931,7 @@ hash_search_with_hash_value(HTAB *hashp,
 		 * order of these tests is to try to check cheaper conditions first.
 		 */
 		if (!IS_PARTITIONED(hctl) && !hashp->frozen &&
-			hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
+		hctl->nentries[0] / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
 			!has_seq_scans(hashp))
 			(void) expand_table(hashp);
 	}
@@ -943,20 +989,20 @@ hash_search_with_hash_value(HTAB *hashp,
 			{
 				/* if partitioned, must lock to touch nentries and freeList */
 				if (IS_PARTITIONED(hctl))
-					SpinLockAcquire(&hctl->mutex);
+					SpinLockAcquire(&hctl->mutex[freelist_idx]);
 
-				Assert(hctl->nentries > 0);
-				hctl->nentries--;
+				Assert(hctl->nentries[freelist_idx] > 0);
+				hctl->nentries[freelist_idx]--;
 
 				/* remove record from hash bucket's chain. */
 				*prevBucketPtr = currBucket->link;
 
 				/* add the record to the freelist for this table.  */
-				currBucket->link = hctl->freeList;
-				hctl->freeList = currBucket;
+				currBucket->link = hctl->freeList[freelist_idx];
+				hctl->freeList[freelist_idx] = currBucket;
 
 				if (IS_PARTITIONED(hctl))
-					SpinLockRelease(&hctl->mutex);
+					SpinLockRelease(&hctl->mutex[freelist_idx]);
 
 				/*
 				 * better hope the caller is synchronizing access to this
@@ -982,7 +1028,7 @@ hash_search_with_hash_value(HTAB *hashp,
 				elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
 					 hashp->tabname);
 
-			currBucket = get_hash_entry(hashp);
+			currBucket = get_hash_entry(hashp, freelist_idx);
 			if (currBucket == NULL)
 			{
 				/* out of memory */
@@ -1175,41 +1221,70 @@ hash_update_hash_key(HTAB *hashp,
  * create a new entry if possible
  */
 static HASHBUCKET
-get_hash_entry(HTAB *hashp)
+get_hash_entry(HTAB *hashp, int freelist_idx)
 {
-	HASHHDR *hctl = hashp->hctl;
+	HASHHDR    *hctl = hashp->hctl;
 	HASHBUCKET	newElement;
+	int			borrow_from_idx;
 
 	for (;;)
 	{
 		/* if partitioned, must lock to touch nentries and freeList */
 		if (IS_PARTITIONED(hctl))
-			SpinLockAcquire(&hctl->mutex);
+			SpinLockAcquire(&hctl->mutex[freelist_idx]);
 
 		/* try to get an entry from the freelist */
-		newElement = hctl->freeList;
+		newElement = hctl->freeList[freelist_idx];
+
 		if (newElement != NULL)
-			break;
+		{
+			/* remove entry from freelist, bump nentries */
+			hctl->freeList[freelist_idx] = newElement->link;
+			hctl->nentries[freelist_idx]++;
+			if (IS_PARTITIONED(hctl))
+				SpinLockRelease(&hctl->mutex[freelist_idx]);
+
+			return newElement;
+		}
 
-		/* no free elements.  allocate another chunk of buckets */
 		if (IS_PARTITIONED(hctl))
-			SpinLockRelease(&hctl->mutex);
+			SpinLockRelease(&hctl->mutex[freelist_idx]);
 
-		if (!element_alloc(hashp, hctl->nelem_alloc))
+		/* no free elements.  allocate another chunk of buckets */
+		if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
 		{
-			/* out of memory */
-			return NULL;
-		}
-	}
+			if (!IS_PARTITIONED(hctl))
+				return NULL;	/* out of memory */
 
-	/* remove entry from freelist, bump nentries */
-	hctl->freeList = newElement->link;
-	hctl->nentries++;
+			/* try to borrow an element from another freelist */
+			borrow_from_idx = freelist_idx;
+			for (;;)
+			{
+				borrow_from_idx = (borrow_from_idx + 1) % MUTEXES_NUM;
+				if (borrow_from_idx == freelist_idx)
+					break;
 
-	if (IS_PARTITIONED(hctl))
-		SpinLockRelease(&hctl->mutex);
+				SpinLockAcquire(&hctl->mutex[borrow_from_idx]);
+				newElement = hctl->freeList[borrow_from_idx];
+
+				if (newElement != NULL)
+				{
+					hctl->freeList[borrow_from_idx] = newElement->link;
+					SpinLockRelease(&hctl->mutex[borrow_from_idx]);
 
-	return newElement;
+					SpinLockAcquire(&hctl->mutex[freelist_idx]);
+					hctl->nentries[freelist_idx]++;
+					SpinLockRelease(&hctl->mutex[freelist_idx]);
+
+					break;
+				}
+
+				SpinLockRelease(&hctl->mutex[borrow_from_idx]);
+			}
+
+			return newElement;
+		}
+	}
 }
 
 /*
@@ -1218,11 +1293,21 @@ get_hash_entry(HTAB *hashp)
 long
 hash_get_num_entries(HTAB *hashp)
 {
+	int			i;
+	long		sum = hashp->hctl->nentries[0];
+
 	/*
-	 * We currently don't bother with the mutex; it's only sensible to call
+	 * We currently don't bother with the mutexes; it's only sensible to call
 	 * this function if you've got lock on all partitions of the table.
 	 */
-	return hashp->hctl->nentries;
+
+	if (!IS_PARTITIONED(hashp->hctl))
+		return sum;
+
+	for (i = 1; i < MUTEXES_NUM; i++)
+		sum += hashp->hctl->nentries[i];
+
+	return sum;
 }
 
 /*
@@ -1530,9 +1615,9 @@ seg_alloc(HTAB *hashp)
  * allocate some new elements and link them into the free list
  */
 static bool
-element_alloc(HTAB *hashp, int nelem)
+element_alloc(HTAB *hashp, int nelem, int freelist_idx)
 {
-	HASHHDR *hctl = hashp->hctl;
+	HASHHDR    *hctl = hashp->hctl;
 	Size		elementSize;
 	HASHELEMENT *firstElement;
 	HASHELEMENT *tmpElement;
@@ -1563,14 +1648,14 @@ element_alloc(HTAB *hashp, int nelem)
 
 	/* if partitioned, must lock to touch freeList */
 	if (IS_PARTITIONED(hctl))
-		SpinLockAcquire(&hctl->mutex);
+		SpinLockAcquire(&hctl->mutex[freelist_idx]);
 
 	/* freelist could be nonempty if two backends did this concurrently */
-	firstElement->link = hctl->freeList;
-	hctl->freeList = prevElement;
+	firstElement->link = hctl->freeList[freelist_idx];
+	hctl->freeList[freelist_idx] = prevElement;
 
 	if (IS_PARTITIONED(hctl))
-		SpinLockRelease(&hctl->mutex);
+		SpinLockRelease(&hctl->mutex[freelist_idx]);
 
 	return true;
 }