Hi,
I started to analyze XLogInsert because it was the major bottleneck when
creating some materialized view/cached tables/whatever.
Analyzing it I could see that content of the COMP_CRC32 macro was taking most
of the time which isn't immediately obvious when you profile because it
obviously doesn't show up as a separate function.
I first put it into functions to make it easier to profile. I couldn't measure
any difference for COPY, CTAS and a simple pgbench run on 3 kinds of hardware
(Core2, older Xeon, older Sparc systems).
I looked a bit around for faster implementations of CRC32 and found one in
zlib. After adapting it (pg uses slightly different computation (non-
inverted)) I found that it increases the speed of the CRC32 calculation itself
3 fold.
It does that by not only using one lookup table but four (one for each byte of
a word). Those four calculations are independent and thus are considerably
faster on somewhat recent hardware.
Also it does memory lookups in 4 byte steps instead of 1 byte as the pg
version (thats only about ~8% benefit in itself).
I wrote a preliminary patch which includes both, the original implementation
and the new one switchable via an #define.
I tested performance differences in a small number of scenarios:
- CTAS/INSERT ... SELECT (8-30%)
- COPY (3-20%)
- pgbench (no real difference unless directly after a checkpoint)
Setup:
CREATE TABLE blub (ai int, bi int, aibi int);
CREATE TABLE speedtest (ai int, bi int, aibi int);
INSERT ... SELECT:
Statement:
INSERT INTO blub SELECT a.i, b.i, a.i *b.i FROM generate_series(1, 10000)
a(i), generate_series(1, 1000) b(i);
legacy crc:
11526.588
11406.518
11412.182
11430.245
zlib:
9977.394
9945.408
9840.907
9842.875
COPY:
Statement:
('blub' enlarged here 4 times, as otherwise the variances were to large)
COPY blub TO '/tmp/b' BINARY;
...
CHECKPOINT;TRUNCATE speedtest; COPY speedtest FROM '/tmp/b' BINARY;
legacy:
44835.840
44832.876
zlib:
39530.549
39365.109
39295.167
The performance differences are bigger if the table rows are significantly
bigger.
Do you think something like that is sensible? If yes, I will make it into a
proper patch and such.
Thanks,
Andres
INSERT ... SELECT profile before patch:
20.22% postgres postgres [.] comp_crc32
5.77% postgres postgres [.] XLogInsert
5.55% postgres postgres [.] LWLockAcquire
5.21% postgres [kernel. [k] copy_user_generic_string
4.64% postgres postgres [.] LWLockRelease
4.39% postgres postgres [.] ReadBuffer_common
2.75% postgres postgres [.] heap_insert
2.22% postgres libc-2.1 [.] memcpy
2.09% postgres postgres [.] UnlockReleaseBuffer
1.85% postgres postgres [.] hash_any
1.77% postgres [kernel. [k] clear_page_c
1.69% postgres postgres [.] hash_search_with_hash_value
1.61% postgres postgres [.] heapgettup_pagemode
1.50% postgres postgres [.] PageAddItem
1.42% postgres postgres [.] MarkBufferDirty
1.28% postgres postgres [.] RelationGetBufferForTuple
1.15% postgres postgres [.] ExecModifyTable
1.06% postgres postgres [.] RelationPutHeapTuple
After:
9.97% postgres postgres [.] comp_crc32
5.95% postgres [kernel. [k] copy_user_generic_string
5.94% postgres postgres [.] LWLockAcquire
5.64% postgres postgres [.] XLogInsert
5.11% postgres postgres [.] LWLockRelease
4.63% postgres postgres [.] ReadBuffer_common
3.45% postgres postgres [.] heap_insert
2.54% postgres libc-2.1 [.] memcpy
2.03% postgres postgres [.] UnlockReleaseBuffer
1.94% postgres postgres [.] hash_search_with_hash_value
1.84% postgres postgres [.] hash_any
1.73% postgres [kernel. [k] clear_page_c
1.68% postgres postgres [.] PageAddItem
1.62% postgres postgres [.] heapgettup_pagemode
1.52% postgres postgres [.] RelationGetBufferForTuple
1.47% postgres postgres [.] MarkBufferDirty
1.30% postgres postgres [.] ExecModifyTable
1.23% postgres postgres [.] RelationPutHeapTuple
From f8ec18769e581cf039535730d2088466c461d8f6 Mon Sep 17 00:00:00 2001
From: Andres Freund <[email protected]>
Date: Thu, 29 Apr 2010 22:19:08 +0200
Subject: [PATCH] Preliminary patch using an improved out of line crc32 computation for
the xlog.
---
src/backend/access/transam/xlog.c | 66 +++++++++---------
src/backend/utils/hash/pg_crc.c | 142 ++++++++++++++++++++++++++++++++++++-
src/include/utils/pg_crc.h | 9 ++-
3 files changed, 180 insertions(+), 37 deletions(-)
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 992a337..ffb9fc4 100644
*** a/src/backend/access/transam/xlog.c
--- b/src/backend/access/transam/xlog.c
*************** begin:;
*** 700,706 ****
{
/* Simple data, just include it */
len += rdt->len;
! COMP_CRC32(rdata_crc, rdt->data, rdt->len);
}
else
{
--- 700,706 ----
{
/* Simple data, just include it */
len += rdt->len;
! rdata_crc = comp_crc32(rdata_crc, rdt->data, rdt->len);
}
else
{
*************** begin:;
*** 715,721 ****
else if (rdt->data)
{
len += rdt->len;
! COMP_CRC32(rdata_crc, rdt->data, rdt->len);
}
break;
}
--- 715,721 ----
else if (rdt->data)
{
len += rdt->len;
! rdata_crc = comp_crc32(rdata_crc, rdt->data, rdt->len);
}
break;
}
*************** begin:;
*** 732,738 ****
else if (rdt->data)
{
len += rdt->len;
! COMP_CRC32(rdata_crc, rdt->data, rdt->len);
}
break;
}
--- 732,738 ----
else if (rdt->data)
{
len += rdt->len;
! rdata_crc = comp_crc32(rdata_crc, rdt->data, rdt->len);
}
break;
}
*************** begin:;
*** 757,781 ****
BkpBlock *bkpb = &(dtbuf_xlg[i]);
char *page;
! COMP_CRC32(rdata_crc,
! (char *) bkpb,
! sizeof(BkpBlock));
page = (char *) BufferGetBlock(dtbuf[i]);
if (bkpb->hole_length == 0)
{
! COMP_CRC32(rdata_crc,
! page,
! BLCKSZ);
}
else
{
/* must skip the hole */
! COMP_CRC32(rdata_crc,
! page,
! bkpb->hole_offset);
! COMP_CRC32(rdata_crc,
! page + (bkpb->hole_offset + bkpb->hole_length),
! BLCKSZ - (bkpb->hole_offset + bkpb->hole_length));
}
}
}
--- 757,781 ----
BkpBlock *bkpb = &(dtbuf_xlg[i]);
char *page;
! rdata_crc = comp_crc32(rdata_crc,
! (char *) bkpb,
! sizeof(BkpBlock));
page = (char *) BufferGetBlock(dtbuf[i]);
if (bkpb->hole_length == 0)
{
! rdata_crc = comp_crc32(rdata_crc,
! page,
! BLCKSZ);
}
else
{
/* must skip the hole */
! rdata_crc = comp_crc32(rdata_crc,
! page,
! bkpb->hole_offset);
! rdata_crc = comp_crc32(rdata_crc,
! page + (bkpb->hole_offset + bkpb->hole_length),
! BLCKSZ - (bkpb->hole_offset + bkpb->hole_length));
}
}
}
*************** begin:;
*** 980,987 ****
record->xl_rmid = rmid;
/* Now we can finish computing the record's CRC */
! COMP_CRC32(rdata_crc, (char *) record + sizeof(pg_crc32),
! SizeOfXLogRecord - sizeof(pg_crc32));
FIN_CRC32(rdata_crc);
record->xl_crc = rdata_crc;
--- 980,987 ----
record->xl_rmid = rmid;
/* Now we can finish computing the record's CRC */
! rdata_crc = comp_crc32(rdata_crc, (char *) record + sizeof(pg_crc32),
! SizeOfXLogRecord - sizeof(pg_crc32));
FIN_CRC32(rdata_crc);
record->xl_crc = rdata_crc;
*************** RecordIsValid(XLogRecord *record, XLogRe
*** 3550,3556 ****
/* First the rmgr data */
INIT_CRC32(crc);
! COMP_CRC32(crc, XLogRecGetData(record), len);
/* Add in the backup blocks, if any */
blk = (char *) XLogRecGetData(record) + len;
--- 3550,3556 ----
/* First the rmgr data */
INIT_CRC32(crc);
! crc = comp_crc32(crc, XLogRecGetData(record), len);
/* Add in the backup blocks, if any */
blk = (char *) XLogRecGetData(record) + len;
*************** RecordIsValid(XLogRecord *record, XLogRe
*** 3570,3576 ****
return false;
}
blen = sizeof(BkpBlock) + BLCKSZ - bkpb.hole_length;
! COMP_CRC32(crc, blk, blen);
blk += blen;
}
--- 3570,3576 ----
return false;
}
blen = sizeof(BkpBlock) + BLCKSZ - bkpb.hole_length;
! crc = comp_crc32(crc, blk, blen);
blk += blen;
}
*************** RecordIsValid(XLogRecord *record, XLogRe
*** 3584,3591 ****
}
/* Finally include the record header */
! COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
! SizeOfXLogRecord - sizeof(pg_crc32));
FIN_CRC32(crc);
if (!EQ_CRC32(record->xl_crc, crc))
--- 3584,3591 ----
}
/* Finally include the record header */
! crc = comp_crc32(crc, (char *) record + sizeof(pg_crc32),
! SizeOfXLogRecord - sizeof(pg_crc32));
FIN_CRC32(crc);
if (!EQ_CRC32(record->xl_crc, crc))
*************** WriteControlFile(void)
*** 4450,4458 ****
/* Contents are protected with a CRC */
INIT_CRC32(ControlFile->crc);
! COMP_CRC32(ControlFile->crc,
! (char *) ControlFile,
! offsetof(ControlFileData, crc));
FIN_CRC32(ControlFile->crc);
/*
--- 4450,4458 ----
/* Contents are protected with a CRC */
INIT_CRC32(ControlFile->crc);
! ControlFile->crc = comp_crc32(ControlFile->crc,
! (char *) ControlFile,
! offsetof(ControlFileData, crc));
FIN_CRC32(ControlFile->crc);
/*
*************** ReadControlFile(void)
*** 4550,4558 ****
/* Now check the CRC. */
INIT_CRC32(crc);
! COMP_CRC32(crc,
! (char *) ControlFile,
! offsetof(ControlFileData, crc));
FIN_CRC32(crc);
if (!EQ_CRC32(crc, ControlFile->crc))
--- 4550,4558 ----
/* Now check the CRC. */
INIT_CRC32(crc);
! crc = comp_crc32(crc,
! (char *) ControlFile,
! offsetof(ControlFileData, crc));
FIN_CRC32(crc);
if (!EQ_CRC32(crc, ControlFile->crc))
*************** UpdateControlFile(void)
*** 4688,4696 ****
int fd;
INIT_CRC32(ControlFile->crc);
! COMP_CRC32(ControlFile->crc,
! (char *) ControlFile,
! offsetof(ControlFileData, crc));
FIN_CRC32(ControlFile->crc);
fd = BasicOpenFile(XLOG_CONTROL_FILE,
--- 4688,4696 ----
int fd;
INIT_CRC32(ControlFile->crc);
! ControlFile->crc = comp_crc32(ControlFile->crc,
! (char *) ControlFile,
! offsetof(ControlFileData, crc));
FIN_CRC32(ControlFile->crc);
fd = BasicOpenFile(XLOG_CONTROL_FILE,
*************** BootStrapXLOG(void)
*** 4900,4908 ****
memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
INIT_CRC32(crc);
! COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
! COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
! SizeOfXLogRecord - sizeof(pg_crc32));
FIN_CRC32(crc);
record->xl_crc = crc;
--- 4900,4908 ----
memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
INIT_CRC32(crc);
! crc = comp_crc32(crc, &checkPoint, sizeof(checkPoint));
! crc = comp_crc32(crc, (char *) record + sizeof(pg_crc32),
! SizeOfXLogRecord - sizeof(pg_crc32));
FIN_CRC32(crc);
record->xl_crc = crc;
diff --git a/src/backend/utils/hash/pg_crc.c b/src/backend/utils/hash/pg_crc.c
index 2ef62e5..32ba6f6 100644
*** a/src/backend/utils/hash/pg_crc.c
--- b/src/backend/utils/hash/pg_crc.c
***************
*** 26,32 ****
/* Use c.h so that this file can be built in either frontend or backend */
#include "c.h"
!
/*
* This table is based on the polynomial
--- 26,32 ----
/* Use c.h so that this file can be built in either frontend or backend */
#include "c.h"
! #include "utils/pg_crc.h"
/*
* This table is based on the polynomial
*************** const uint32 pg_crc32_table[256] = {
*** 100,105 ****
--- 100,245 ----
0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
};
+ /*
+ * 1: pg version
+ * 2: adapted zlib version
+ */
+ #define CRC_VERSION 2
+ #if CRC_VERSION == 1
+
+ /* Accumulate some (more) bytes into a CRC */
+ pg_crc32 comp_crc32(pg_crc32 crc, const void *data, uint32 len){
+ pg_crc32 start = crc;
+ uint32 start_len = len;
+ do {
+ unsigned char *__data = (unsigned char *) (data);
+ while (len-- > 0){
+ int __tab_index = ((int) ((crc) >> 24) ^ *__data++) & 0xFF;
+ crc = pg_crc32_table[__tab_index] ^ ((crc) << 8);
+ }
+ } while (0);
+
+ return crc;
+ }
+
+
+ #elif CRC_VERSION == 2
+
+ static inline uint32 swab32(const uint32 x);
+ static inline uint32 swab32(const uint32 x){
+ return ((x & (uint32)0x000000ffUL) << 24) |
+ ((x & (uint32)0x0000ff00UL) << 8) |
+ ((x & (uint32)0x00ff0000UL) >> 8) |
+ ((x & (uint32)0xff000000UL) >> 24);
+ }
+
+ #if defined __BIG_ENDIAN__
+ #define cpu_to_be32(x)
+ #else
+ #define cpu_to_be32(x) swab32(x)
+ #endif
+
+ #define unlikely(x) __builtin_expect(!!(x), 0)
+ #define likely(x) __builtin_expect(!!(x), 1)
+
+ static pg_crc32 crc_table[4][256];
+
+ static void prepare(){
+ /* generate crc for each value followed by one, two, and three zeros*/
+ int n;
+ int k;
+ pg_crc32 c;
+
+ for (n = 0; n < 256; n++) {
+ c = pg_crc32_table[n];
+ crc_table[0][n] = c;
+ }
+
+ for (n = 0; n < 256; n++) {
+ c = crc_table[0][n];
+ for (k = 1; k < 4; k++) {
+ //orig:
+ //c = crc_table[0][c & 0xff] ^ (c >> 8);
+ //pg compliant
+ c = crc_table[0][(c >> 24)] ^ (c << 8);
+ crc_table[k][n] = c;
+ }
+ }
+ }
+
+
+ /* ========================================================================= */
+ #define DOCRC4 d = *buf4++; c ^= cpu_to_be32(d); \
+ c = crc_table[0][c & 0xff] ^ crc_table[1][(c >> 8) & 0xff] ^ \
+ crc_table[2][(c >> 16) & 0xff] ^ crc_table[3][c >> 24]
+ #define DOCRC32 DOCRC4; DOCRC4; DOCRC4; DOCRC4; DOCRC4; DOCRC4; DOCRC4; DOCRC4
+
+ /* ========================================================================= */
+ pg_crc32 comp_crc32(pg_crc32 crc, const void *data2, uint32 len)
+ {
+ /*
+ * XXX: That table should likely be precomputed at compile time,
+ * but for now I didnt bother.
+ */
+ static int initialized = 0;
+ if(!initialized){
+ prepare();
+ initialized = 1;
+ }
+
+ unsigned char *data = (unsigned char *) (data2);
+ uint32 c = crc;
+ const uint32 *buf4;
+ uint32 d;
+
+ /*
+ * If the input is not on a 4byte boundary, align it. Its dubious
+ * if we actually need that, but the costs of that check should be
+ * marginal.
+ */
+ while (unlikely(len && ((ptrdiff_t)data & 3))) {
+ //printf("align\n");
+ c = crc_table[0][((c >> 24) ^ *data++) & 0xff] ^ (c << 8);
+ len--;
+ }
+
+
+ buf4 = (const uint32 *)(const void *)data;
+ /*
+ * manually unrolling the loop seems to be faster on most
+ * hardware/compiler combination I tried.
+ */
+ while (likely(len >= 32)) {
+ DOCRC32;
+ len -= 32;
+ //printf("crc: %x after bigstep\n", c);
+ }
+
+ /*
+ * There very validly might be input which is not a multiple of 32byte.
+ */
+ while (len >= 4) {
+ DOCRC4;
+ len -= 4;
+ //printf("crc: %x after step2\n", c);
+ }
+
+ data = (unsigned char *)buf4;
+ /*
+ * Computed unaligned end of data
+ */
+ if (unlikely(len)){
+ //printf("unalign\n");
+ do {
+ c = crc_table[0][((c >> 24) ^ *data++) & 0xff] ^ (c << 8);
+ }
+ while (--len);
+ }
+ return c;
+ }
+ #undef DOCRC4
+ #undef DOCRC32
+ #endif
#ifdef PROVIDE_64BIT_CRC
diff --git a/src/include/utils/pg_crc.h b/src/include/utils/pg_crc.h
index 3e34d62..32de738 100644
*** a/src/include/utils/pg_crc.h
--- b/src/include/utils/pg_crc.h
***************
*** 31,36 ****
--- 31,42 ----
typedef uint32 pg_crc32;
+ extern pg_crc32 comp_crc32(pg_crc32 crc, const void *data, uint32 len);
+
+
+ /* Constant table for CRC calculation */
+ extern CRCDLLIMPORT const uint32 pg_crc32_table[];
+
/* Initialize a CRC accumulator */
#define INIT_CRC32(crc) ((crc) = 0xFFFFFFFF)
*************** do { \
*** 53,61 ****
/* Check for equality of two CRCs */
#define EQ_CRC32(c1,c2) ((c1) == (c2))
- /* Constant table for CRC calculation */
- extern CRCDLLIMPORT const uint32 pg_crc32_table[];
-
#ifdef PROVIDE_64BIT_CRC
--- 59,64 ----
--
1.6.5.12.gd65df24
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers