On Thu, 2020-05-21 at 21:13 +0200, Tomas Vondra wrote:
> 2) We could make it self-tuning, by increasing the number of blocks
> we pre-allocate. So every time we exhaust the range, we double the
> number of blocks (with a reasonable maximum, like 1024 or so). Or we
> might just increment it by 32, or something.
Attached a new version that uses the doubling behavior, and cleans it
up a bit. It also returns the unused prealloc blocks back to lts-
>freeBlocks when the tape is rewound for reading.
> IIUC the danger of pre-allocating blocks is that we might not fill
> them,
> resulting in temp file much larger than necessary. It might be
> harmless
> on some (most?) current filesystems that don't actually allocate
> space
> for blocks that are never written, but it also confuses our
> accounting
> of temporary file sizes. So we should try to limit that, and growing
> the
> number of pre-allocated blocks over time seems reasonable.
There's another danger here: it doesn't matter how well the filesystem
deals with sparse writes, because ltsWriteBlock fills in the holes with
zeros anyway. That's potentially a significant amount of wasted IO
effort if we aren't careful.
Regards,
Jeff Davis
diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c
index bc8d56807e2..84f9d6d2358 100644
--- a/src/backend/utils/sort/logtape.c
+++ b/src/backend/utils/sort/logtape.c
@@ -110,6 +110,8 @@ typedef struct TapeBlockTrailer
#define TapeBlockSetNBytes(buf, nbytes) \
(TapeBlockGetTrailer(buf)->next = -(nbytes))
+#define TAPE_WRITE_PREALLOC_MIN 8
+#define TAPE_WRITE_PREALLOC_MAX 128
/*
* This data structure represents a single "logical tape" within the set
@@ -151,6 +153,21 @@ typedef struct LogicalTape
int max_size; /* highest useful, safe buffer_size */
int pos; /* next read/write position in buffer */
int nbytes; /* total # of valid bytes in buffer */
+
+ /*
+ * When multiple tapes are being written to concurrently (as in HashAgg),
+ * avoid excessive fragmentation by preallocating block numbers to
+ * individual tapes. The preallocated block numbers are held in an array
+ * sorted in descending order; blocks are consumed from the end of the
+ * array (lowest block numbers first).
+ *
+ * No filesystem operations are performed for preallocation; only the
+ * block numbers are reserved. This may lead to sparse writes, which will
+ * cause ltsWriteBlock() to fill in holes with zeros.
+ */
+ long *prealloc;
+ int nprealloc; /* number of elements in list */
+ int prealloc_size; /* number of elements list can hold */
} LogicalTape;
/*
@@ -198,6 +215,7 @@ struct LogicalTapeSet
static void ltsWriteBlock(LogicalTapeSet *lts, long blocknum, void *buffer);
static void ltsReadBlock(LogicalTapeSet *lts, long blocknum, void *buffer);
static long ltsGetFreeBlock(LogicalTapeSet *lts);
+static long ltsGetPreallocBlock(LogicalTapeSet *lts, LogicalTape *lt);
static void ltsReleaseBlock(LogicalTapeSet *lts, long blocknum);
static void ltsConcatWorkerTapes(LogicalTapeSet *lts, TapeShare *shared,
SharedFileSet *fileset);
@@ -397,6 +415,51 @@ ltsGetFreeBlock(LogicalTapeSet *lts)
return blocknum;
}
+/*
+ * Return the lowest free block number from the tape's preallocation list.
+ * Refill the preallocation list if necessary.
+ */
+static long
+ltsGetPreallocBlock(LogicalTapeSet *lts, LogicalTape *lt)
+{
+ /* sorted in descending order, so return the last element */
+ if (lt->nprealloc > 0)
+ return lt->prealloc[--lt->nprealloc];
+
+ if (lt->prealloc == NULL)
+ {
+ lt->prealloc_size = TAPE_WRITE_PREALLOC_MIN;
+ lt->prealloc = (long *) palloc(sizeof(long) * lt->prealloc_size);
+ }
+ else
+ {
+ /* when the preallocation list runs out, double the size */
+ if (lt->prealloc_size * 2 <= TAPE_WRITE_PREALLOC_MAX)
+ lt->prealloc_size *= 2;
+ lt->prealloc = (long *) repalloc(lt->prealloc,
+ sizeof(long) * lt->prealloc_size);
+ }
+
+ /* refill preallocation list */
+ lt->nprealloc = lt->prealloc_size;
+ for (int i = lt->nprealloc; i > 0; i--)
+ lt->prealloc[i - 1] = ltsGetFreeBlock(lts);
+
+#ifdef USE_ASSERT_CHECKING
+ {
+ /* verify that list is in reverse sorted order */
+ long last_block_number = -1;
+ for (int i = lt->nprealloc; i > 0; i--)
+ {
+ Assert(lt->prealloc[i - 1] > last_block_number);
+ last_block_number = lt->prealloc[i - 1];
+ }
+ }
+#endif
+
+ return lt->prealloc[--lt->nprealloc];
+}
+
/*
* Return a block# to the freelist.
*/
@@ -557,6 +620,9 @@ ltsInitTape(LogicalTape *lt)
lt->max_size = MaxAllocSize;
lt->pos = 0;
lt->nbytes = 0;
+ lt->prealloc = NULL;
+ lt->nprealloc = 0;
+ lt->prealloc_size = 0;
}
/*
@@ -709,7 +775,7 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
Assert(lt->firstBlockNumber == -1);
Assert(lt->pos == 0);
- lt->curBlockNumber = ltsGetFreeBlock(lts);
+ lt->curBlockNumber = ltsGetPreallocBlock(lts, lt);
lt->firstBlockNumber = lt->curBlockNumber;
TapeBlockGetTrailer(lt->buffer)->prev = -1L;
@@ -733,7 +799,7 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
* First allocate the next block, so that we can store it in the
* 'next' pointer of this block.
*/
- nextBlockNumber = ltsGetFreeBlock(lts);
+ nextBlockNumber = ltsGetPreallocBlock(lts, lt);
/* set the next-pointer and dump the current block. */
TapeBlockGetTrailer(lt->buffer)->next = nextBlockNumber;
@@ -835,13 +901,23 @@ LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, size_t buffer_size)
Assert(lt->frozen);
}
- /* Allocate a read buffer (unless the tape is empty) */
if (lt->buffer)
pfree(lt->buffer);
/* the buffer is lazily allocated, but set the size here */
lt->buffer = NULL;
lt->buffer_size = buffer_size;
+
+ /* free the preallocation list, and return unused block numbers */
+ if (lt->prealloc != NULL)
+ {
+ for (int i = lt->nprealloc; i > 0; i--)
+ ltsReleaseBlock(lts, lt->prealloc[i - 1]);
+ pfree(lt->prealloc);
+ lt->prealloc = NULL;
+ lt->nprealloc = 0;
+ lt->prealloc_size = 0;
+ }
}
/*