On 11.7.2014 19:25, Tomas Vondra wrote:
> 2) walking through the tuples sequentially
> ------------------------------------------
> 
> The other option is not to walk the tuples through the buckets, but to
> walk through the chunks - we know the tuples are stored as
> HashJoinTuple/MinimalTuple, so it shouldn't be that difficult. And by
> walking the tuples in the order they're stored, we may just rearrange
> the tuples in already allocated memory. And maybe it could be even
> faster than the current code, because of the sequential access to the
> memory (as opposed to the random access through buckets) and CPU caches.
> So I like this approach - it's simple, clean and shouldn't add any
> overhead (memory or CPU).

So, attached is a patch that implements this. It's not very complicated
and the resulting code is surprisingly readable (IMHO comparable to the
previous code).

Basics of the patch:

(1) adds HashChunkData structure - a buffer used for dense allocation
    of tuples, 16kB by default

(2) adds 'chunks_batch' into HashJoinTableData, which is a linked list
    of HashChunkData

(3) the allocation itself is done in chunk_alloc - in short, it either
    places the tuple into the current chunk, or allocates a new chunk
    when needed (see the simplified sketch below the list)

(4) ExecHashTableInsert calls chunk_alloc (instead of the regular
    MemoryContextAlloc)

(5) the main change is in ExecHashIncreaseNumBatches, which now can't
    pfree individual tuples (that does not work with dense allocation),
    so it reads the tuples from the chunks directly and simply rebuilds
    the buckets from scratch (thanks to this we only need one extra
    chunk of memory, not a complete copy, and we don't need to
    duplicate the buckets at all)
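
To illustrate, here is the allocation scheme from (1)-(4) boiled down to
a standalone sketch - plain malloc() instead of MemoryContextAlloc, and
without the oversized-tuple path the actual patch has, so it assumes
size <= HASH_CHUNK_SIZE:

#include <stddef.h>
#include <stdlib.h>

#define HASH_CHUNK_SIZE		(16*1024L)

typedef struct HashChunkData
{
	size_t		maxlen;			/* usable length of data[] */
	size_t		used;			/* bytes already handed out */
	struct HashChunkData *next;	/* linked list of chunks */
	char		data[1];		/* tuples are packed densely in here */
} HashChunkData;

typedef HashChunkData *HashChunk;

static HashChunk chunks = NULL;	/* head of the chunk list */

static char *
chunk_alloc(size_t size)
{
	/* no chunk yet, or not enough free space left => start a new chunk */
	if (chunks == NULL || (chunks->maxlen - chunks->used) < size)
	{
		HashChunk	newChunk = malloc(offsetof(HashChunkData, data) +
									  HASH_CHUNK_SIZE);

		newChunk->maxlen = HASH_CHUNK_SIZE;
		newChunk->used = 0;
		newChunk->next = chunks;
		chunks = newChunk;
	}

	/* bump the fill pointer and return the space just reserved */
	chunks->used += size;
	return chunks->data + (chunks->used - size);
}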

From the tests I've done so far this seems to work perfectly - it still
saves the memory overhead, and I've seen no difference in performance.

The current patch only implements this for tuples in the main hash
table, not for skew buckets. I plan to do that too, but it will require
separate chunks for each skew bucket (so we can remove a single bucket
without messing with all of them). The chunks for skew buckets should be
smaller (I'm thinking about ~4kB), because there'll be more of them, but
OTOH those buckets hold frequent values, so the chunks should not remain
mostly empty. A rough sketch of this follows below.
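
Roughly what I have in mind for the skew part is a per-bucket variant of
chunk_alloc. This is just a sketch - chunk_alloc_skew doesn't exist in
the attached patch, treating chunks_skew as an array indexed by the skew
bucket number is tentative, and it ignores the oversized-tuple and
alignment questions just like chunk_alloc does:

static char *
chunk_alloc_skew(HashJoinTable hashtable, int bucketNumber, int size)
{
	/* one chunk list per skew bucket, so that removing a bucket can free
	 * its chunks without touching the chunks of the other buckets */
	HashChunk  *head = &hashtable->chunks_skew[bucketNumber];

	/* no chunk for this bucket yet, or not enough free space => new chunk */
	if ((*head == NULL) || ((*head)->maxlen - (*head)->used) < size)
	{
		HashChunk	newChunk = (HashChunk) MemoryContextAlloc(
						hashtable->batchCxt,
						offsetof(HashChunkData, data) + HASH_SKEW_CHUNK_SIZE);

		newChunk->maxlen = HASH_SKEW_CHUNK_SIZE;
		newChunk->used = 0;
		newChunk->ntuples = 0;
		newChunk->next = *head;
		*head = newChunk;
	}

	(*head)->used += size;
	(*head)->ntuples += 1;

	return (*head)->data + ((*head)->used - size);
}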

But let's see if the current patch misses something important first.


regards
Tomas
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 589b2f1..5f9f2df 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -47,6 +47,7 @@ static void ExecHashSkewTableInsert(HashJoinTable hashtable,
 						int bucketNumber);
 static void ExecHashRemoveNextSkewBucket(HashJoinTable hashtable);
 
+static char * chunk_alloc(HashJoinTable hashtable, int tupleSize);
 
 /* ----------------------------------------------------------------
  *		ExecHash
@@ -130,6 +131,9 @@ MultiExecHash(HashState *node)
 	if (node->ps.instrument)
 		InstrStopNode(node->ps.instrument, hashtable->totalTuples);
 
+	/* print some context stats */
+	MemoryContextStats(hashtable->batchCxt);
+
 	/*
 	 * We do not return the hash table directly because it's not a subtype of
 	 * Node, and so would violate the MultiExecProcNode API.  Instead, our
@@ -223,7 +227,6 @@ ExecEndHash(HashState *node)
 	ExecEndNode(outerPlan);
 }
 
-
 /* ----------------------------------------------------------------
  *		ExecHashTableCreate
  *
@@ -294,6 +297,8 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
 	hashtable->spaceAllowedSkew =
 		hashtable->spaceAllowed * SKEW_WORK_MEM_PERCENT / 100;
 
+	hashtable->chunks_batch = NULL;
+
 	/*
 	 * Get info about the hash functions to be used for each hash key. Also
 	 * remember whether the join operators are strict.
@@ -556,10 +561,10 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 	int			oldnbatch = hashtable->nbatch;
 	int			curbatch = hashtable->curbatch;
 	int			nbatch;
-	int			i;
 	MemoryContext oldcxt;
 	long		ninmemory;
 	long		nfreed;
+	HashChunk	oldchunks = hashtable->chunks_batch;
 
 	/* do nothing if we've decided to shut off growth */
 	if (!hashtable->growEnabled)
@@ -612,51 +617,70 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 	 */
 	ninmemory = nfreed = 0;
 
-	for (i = 0; i < hashtable->nbuckets; i++)
-	{
-		HashJoinTuple prevtuple;
-		HashJoinTuple tuple;
+	/* We're going to walk the chunks directly instead of the buckets, so reset
+	 * the buckets now and rebuild them as we reinsert the tuples - set to NULL */
+	memset(hashtable->buckets, 0, sizeof(void*) * hashtable->nbuckets);
 
-		prevtuple = NULL;
-		tuple = hashtable->buckets[i];
+	/* reset the chunk list (we kept a copy in oldchunks) - new chunks will be allocated as we reinsert */
+	hashtable->chunks_batch = NULL;
 
-		while (tuple != NULL)
-		{
-			/* save link in case we delete */
-			HashJoinTuple nexttuple = tuple->next;
-			int			bucketno;
-			int			batchno;
+	/* so, let's scan through the old chunks, and all tuples in each chunk */
+	while (oldchunks != NULL) {
+
+		/* pointer to the next memory chunk (may be NULL for the last chunk) */
+		HashChunk nextchunk = oldchunks->next;
+
+		/* position within the buffer (up to oldchunks->used) */
+		size_t idx = 0;
+
+		/* process all tuples stored in this chunk (and then free it) */
+		while (idx < oldchunks->used) {
+
+			/* get the hashjoin tuple and mintuple stored right after it */
+			HashJoinTuple hashTuple = (HashJoinTuple)(oldchunks->data + idx);
+			MinimalTuple tuple = HJTUPLE_MINTUPLE(hashTuple);
+
+			int		hashTupleSize = (HJTUPLE_OVERHEAD + tuple->t_len);
+
+			int		bucketno;
+			int		batchno;
 
 			ninmemory++;
-			ExecHashGetBucketAndBatch(hashtable, tuple->hashvalue,
+			ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue,
 									  &bucketno, &batchno);
-			Assert(bucketno == i);
+
 			if (batchno == curbatch)
 			{
-				/* keep tuple */
-				prevtuple = tuple;
+				/* keep tuple - this means we need to copy it into the new chunks */
+				HashJoinTuple copyTuple = (HashJoinTuple) chunk_alloc(hashtable, hashTupleSize);
+				memcpy(copyTuple, hashTuple, hashTupleSize);
+
+				/* and of course add it to the new buckets - just push the copy onto the front
+				 * of the bucket's list */
+				copyTuple->next = hashtable->buckets[bucketno];
+				hashtable->buckets[bucketno] = copyTuple;
 			}
 			else
 			{
 				/* dump it out */
 				Assert(batchno > curbatch);
-				ExecHashJoinSaveTuple(HJTUPLE_MINTUPLE(tuple),
-									  tuple->hashvalue,
+				ExecHashJoinSaveTuple(HJTUPLE_MINTUPLE(hashTuple),
+									  hashTuple->hashvalue,
 									  &hashtable->innerBatchFile[batchno]);
-				/* and remove from hash table */
-				if (prevtuple)
-					prevtuple->next = nexttuple;
-				else
-					hashtable->buckets[i] = nexttuple;
-				/* prevtuple doesn't change */
-				hashtable->spaceUsed -=
-					HJTUPLE_OVERHEAD + HJTUPLE_MINTUPLE(tuple)->t_len;
-				pfree(tuple);
+
+				hashtable->spaceUsed -= hashTupleSize;
 				nfreed++;
 			}
 
-			tuple = nexttuple;
+			/* bytes occupied in memory: HJ tuple overhead + actual tuple length */
+			idx += hashTupleSize;
+
 		}
+
+		/* so, we're done with this chunk - free it and proceed to the next one */
+		pfree(oldchunks);
+		oldchunks = nextchunk;
+
 	}
 
 #ifdef HJDEBUG
@@ -717,8 +741,8 @@ ExecHashTableInsert(HashJoinTable hashtable,
 
 		/* Create the HashJoinTuple */
 		hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len;
-		hashTuple = (HashJoinTuple) MemoryContextAlloc(hashtable->batchCxt,
-													   hashTupleSize);
+		hashTuple = (HashJoinTuple) chunk_alloc(hashtable, hashTupleSize);
+
 		hashTuple->hashvalue = hashvalue;
 		memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len);
 
@@ -1068,6 +1092,11 @@ ExecHashTableReset(HashJoinTable hashtable)
 	hashtable->spaceUsed = 0;
 
 	MemoryContextSwitchTo(oldcxt);
+
+	/* reset the chunks too (the memory was allocated within batchCxt, so it's
+	 * already freed by resetting the batch context - just set it to NULL) */
+	hashtable->chunks_batch = NULL;
+
 }
 
 /*
@@ -1318,6 +1347,61 @@ ExecHashGetSkewBucket(HashJoinTable hashtable, uint32 hashvalue)
 	return INVALID_SKEW_BUCKET_NO;
 }
 
+static char *
+chunk_alloc(HashJoinTable hashtable, int size) {
+
+	/* maybe we should use MAXALIGN(size) here ... */
+
+	/* if the tuple is larger than 1/8 of the chunk size, allocate a separate chunk */
+	if (size > (HASH_CHUNK_SIZE/8)) {
+
+		/* allocate new chunk and put it at the beginning of the list */
+		HashChunk newChunk = (HashChunk)MemoryContextAlloc(hashtable->batchCxt, offsetof(HashChunkData, data) + size);
+
+		newChunk->maxlen = size;
+		newChunk->used = 0;
+		newChunk->ntuples = 0;
+
+		/* If there already is a chunk, add the new one after it so we can still use the free space in the current chunk */
+		if (hashtable->chunks_batch != NULL) {
+			newChunk->next = hashtable->chunks_batch->next;
+			hashtable->chunks_batch->next = newChunk;
+		} else {
+			newChunk->next = hashtable->chunks_batch;
+			hashtable->chunks_batch = newChunk;
+		}
+
+		newChunk->used += size;
+		newChunk->ntuples += 1;
+
+		return newChunk->data;
+
+	}
+
+	/* OK, it fits within the chunk size - see if we have enough space in the current
+	 * chunk; if not, allocate a new one (chunks_batch == NULL means this is the first chunk) */
+	if ((hashtable->chunks_batch == NULL) || (hashtable->chunks_batch->maxlen - hashtable->chunks_batch->used) < size) {
+
+		/* allocate new chunk and put it at the beginning of the list */
+		HashChunk newChunk = (HashChunk)MemoryContextAlloc(hashtable->batchCxt, offsetof(HashChunkData, data) + HASH_CHUNK_SIZE);
+
+		newChunk->maxlen = HASH_CHUNK_SIZE;
+		newChunk->used = 0;
+		newChunk->ntuples = 0;
+
+		newChunk->next = hashtable->chunks_batch;
+		hashtable->chunks_batch = newChunk;
+	}
+
+	/* OK, we have enough space in the chunk, let's add the tuple */
+	hashtable->chunks_batch->used += size;
+	hashtable->chunks_batch->ntuples += 1;
+
+	/* return a pointer to the start of the space just reserved for the tuple */
+	return hashtable->chunks_batch->data + (hashtable->chunks_batch->used - size);
+
+}
+
 /*
  * ExecHashSkewTableInsert
  *
@@ -1340,6 +1424,7 @@ ExecHashSkewTableInsert(HashJoinTable hashtable,
 	hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len;
 	hashTuple = (HashJoinTuple) MemoryContextAlloc(hashtable->batchCxt,
 												   hashTupleSize);
+
 	hashTuple->hashvalue = hashvalue;
 	memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len);
 	HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple));
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index 3beae40..7b55707 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -102,6 +102,22 @@ typedef struct HashSkewBucket
 #define SKEW_WORK_MEM_PERCENT  2
 #define SKEW_MIN_OUTER_FRACTION  0.01
 
+#define HASH_CHUNK_SIZE			(16*1024L)	/* 16kB chunks by default */
+#define HASH_SKEW_CHUNK_SIZE	(4*1024L)	/* 4kB chunks for skew buckets */
+
+typedef struct HashChunkData
+{
+	int			ntuples;	/* number of tuples stored in this chunk */
+	size_t		maxlen;		/* length of the buffer */
+	size_t		used;		/* number of chunk bytes already used */
+
+	struct HashChunkData   *next;	/* pointer to the next chunk (linked list) */
+
+	char		data[1];	/* buffer allocated at the end */
+} HashChunkData;
+
+typedef struct HashChunkData* HashChunk;
+
 
 typedef struct HashJoinTableData
 {
@@ -157,6 +173,11 @@ typedef struct HashJoinTableData
 
 	MemoryContext hashCxt;		/* context for whole-hash-join storage */
 	MemoryContext batchCxt;		/* context for this-batch-only storage */
+
+	/* used for dense allocation of tuples (linked chunks) */
+	HashChunk	chunks_batch;	/* memory for dense-packing tuples (one list for the batch) */
+	HashChunk  *chunks_skew;	/* memory for dense-packing tuples (list per skew bucket) */
+
 }	HashJoinTableData;
 
 #endif   /* HASHJOIN_H */