Hi, Commit 9556aa01c69 (Use single-byte Boyer-Moore-Horspool search even with multibyte encodings), was a speed improvement for the majority of cases, but when the match was toward the end of the string, the slowdown in text_position_get_match_pos() was noticeable. It was found that there was a lot of overhead in pg_mblen(). [1]
The attached exploratory PoC improves this for utf-8. It applies on
top v25 of my utf-8 verification patch in [2], since one approach
relies on the DFA from it. The other three approaches are:
- a version of pg_utf_mblen() that uses a lookup table [3]
- an inlined copy of pg_utf_mblen()
- an ascii fast path with a fallback to the inlined copy of pg_utf_mblen()
The test is attached and the test function is part of the patch. It's
based on the test used in the commit above. The test searches for a
string that's at the end of a ~1 million byte string. This is on gcc
11 with 2-3 runs to ensure repeatability, but I didn't bother with
statistics because the differences are pretty big:
patch | no match | ascii | mulitbyte
-----------------------------------------+----------+-------+-----------
PG11 | 1120 | 1100 | 900
master | 381 | 2350 | 1900
DFA | 386 | 1640 | 1640
branchless utf mblen | 387 | 4100 | 2600
inline pg_utf_mblen() | 380 | 1080 | 920
inline pg_utf_mblen() + ascii fast path | 382 | 470 | 918
Neither of the branchless approaches worked well. The DFA can't work
as well here as in verification because it must do additional work.
Inlining pg_utf_mblen() restores worst-case performance to PG11
levels. The ascii fast path is a nice improvement on top of that. A
similar approach could work for pg_mbstrlen() as well, but I haven't
looked into that yet. There are other callers of pg_mblen(), but I
haven't looked into whether they are performance-sensitive. A more
general application would be preferable to a targeted one.
[1]
https://www.postgresql.org/message-id/b65df3d8-1f59-3bd7-ebbe-68b81d5a76a4%40iki.fi
[2]
https://www.postgresql.org/message-id/CAFBsxsHG%3Dg6W8Mie%2B_NO8dV6O0pO2stxrnS%3Dme5ZmGqk--fd5g%40mail.gmail.com
[3] https://github.com/skeeto/branchless-utf8/blob/master/utf8.h
--
John Naylor
EDB: http://www.enterprisedb.com
src/backend/utils/adt/varlena.c | 112 ++++++++++++++++++++++++++++++++++++++++
src/common/wchar.c | 90 ++------------------------------
src/include/mb/pg_wchar.h | 53 -------------------
src/test/regress/regress.c | 18 +++++++
4 files changed, 133 insertions(+), 140 deletions(-)
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index bd3091bbfb..cc93818007 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -26,6 +26,7 @@
#include "common/unicode_norm.h"
#include "lib/hyperloglog.h"
#include "libpq/pqformat.h"
+#include "mb/pg_utf8.h"
#include "miscadmin.h"
#include "nodes/execnodes.h"
#include "parser/scansup.h"
@@ -1468,6 +1469,116 @@ text_position_get_match_pos(TextPositionState *state)
{
if (!state->is_multibyte)
return state->last_match - state->str1 + 1;
+ else if (GetDatabaseEncoding() == PG_UTF8)
+ {
+#if 0 /* dfa */
+
+ int utf8_state_count[UTF8_LAST_STATE + 1] = {0};
+ uint32 dfa_state = BGN;
+
+ /*
+ * See pg_utf8.h for an explanation of the state machine. Unlike in
+ * pg_wchar.c, we must calculate the exact state so we can use it as
+ * an array index.
+ */
+ while (state->refpoint < state->last_match)
+ {
+ const unsigned char *s = (const unsigned char *) state->refpoint++;
+
+ dfa_state = (Utf8Transition[*s] >> dfa_state) & 31;
+ utf8_state_count[dfa_state]++;
+ }
+ Assert(state->refpoint == state->last_match);
+ state->refpos += utf8_state_count[END];
+ return state->refpos + 1;
+
+#endif
+#if 0 /* like pg_utf_mblen(), but different algorithm: */
+
+ static const char lengths[] = {
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 1
+ };
+
+ while (state->refpoint < state->last_match)
+ {
+ const unsigned char *s = (const unsigned char *) state->refpoint;
+
+ state->refpoint += lengths[s[0] >> 3];
+ state->refpos++;
+ }
+
+ Assert(state->refpoint == state->last_match);
+ return state->refpos + 1;
+#endif
+#if 0 /* inline pg_utf_mblen()*/
+
+ int len = 0;
+
+ while (state->refpoint < state->last_match)
+ {
+ const unsigned char *s = (const unsigned char *) state->refpoint;
+
+ if ((*s & 0x80) == 0)
+ len = 1;
+ else if ((*s & 0xe0) == 0xc0)
+ len = 2;
+ else if ((*s & 0xf0) == 0xe0)
+ len = 3;
+ else if ((*s & 0xf8) == 0xf0)
+ len = 4;
+ else
+ len = 1;
+
+ state->refpoint += len;
+ state->refpos++;
+ }
+
+ Assert(state->refpoint == state->last_match);
+ return state->refpos + 1;
+#endif
+#if 1 /* try ASCII fast path, then fall back to inlined pg_utf_mblen() */
+
+ int len = 0;
+
+#define STRIDE_LENGTH 16
+
+ /* fast path for ascii text. */
+ while (state->refpoint + STRIDE_LENGTH <= state->last_match)
+ {
+ if (is_valid_ascii((const unsigned char *) state->refpoint, STRIDE_LENGTH))
+ {
+ state->refpoint += STRIDE_LENGTH;
+ state->refpos += STRIDE_LENGTH;
+ }
+ else
+ break;
+ }
+
+ while (state->refpoint < state->last_match)
+ {
+ const unsigned char *s = (const unsigned char *) state->refpoint;
+
+ if ((*s & 0x80) == 0)
+ len = 1;
+ else if ((*s & 0xe0) == 0xc0)
+ len = 2;
+ else if ((*s & 0xf0) == 0xe0)
+ len = 3;
+ else if ((*s & 0xf8) == 0xf0)
+ len = 4;
+ else
+ len = 1;
+
+ state->refpoint += len;
+ state->refpos++;
+ }
+
+ Assert(state->refpoint == state->last_match);
+ return state->refpos + 1;
+#endif
+ }
+
else
{
/* Convert the byte position to char position. */
@@ -1481,6 +1592,7 @@ text_position_get_match_pos(TextPositionState *state)
}
}
+
/*
* Reset search state to the initial state installed by text_position_setup.
*
diff --git a/src/common/wchar.c b/src/common/wchar.c
index be931c5e92..2c51c3b6f9 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -12,6 +12,7 @@
*/
#include "c.h"
+#include "mb/pg_utf8.h"
#include "mb/pg_wchar.h"
@@ -1750,94 +1751,9 @@ pg_utf8_verifychar(const unsigned char *s, int len)
return l;
}
-/*
- * The fast path of the UTF-8 verifier uses a deterministic finite automaton
- * (DFA) for multibyte characters. In a traditional table-driven DFA, the
- * input byte and current state are used to compute an index into an array of
- * state transitions. Since the address of the next transition is dependent
- * on this computation, there is latency in executing the load instruction,
- * and the CPU is not kept busy.
- *
- * Instead, we use a "shift-based" DFA as described by Per Vognsen:
- *
- * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
- *
- * In a shift-based DFA, the input byte is an index into array of integers
- * whose bit pattern encodes the state transitions. To compute the next
- * state, we simply right-shift the integer by the current state and apply a
- * mask. In this scheme, the address of the transition only depends on the
- * input byte, so there is better pipelining.
- *
- * The naming convention for states and transitions was adopted from a UTF-8
- * to UTF-16/32 transcoder, whose table is reproduced below:
- *
- * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
- *
- * ILL ASC CR1 CR2 CR3 L2A L3A L3B L3C L4A L4B L4C CLASS / STATE
- * ==========================================================================
- * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B, | BGN/END
- * err, err, err, err, err, err, err, err, err, err, err, err, | ERR
- * |
- * err, err, END, END, END, err, err, err, err, err, err, err, | CS1
- * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err, | CS2
- * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err, | CS3
- * |
- * err, err, err, err, CS1, err, err, err, err, err, err, err, | P3A
- * err, err, CS1, CS1, err, err, err, err, err, err, err, err, | P3B
- * |
- * err, err, err, CS2, CS2, err, err, err, err, err, err, err, | P4A
- * err, err, CS2, err, err, err, err, err, err, err, err, err, | P4B
- *
- * In the most straightforward implementation, a shift-based DFA for UTF-8
- * requires 64-bit integers to encode the transitions, but with an SMT solver
- * it's possible to find state numbers such that the transitions fit within
- * 32-bit integers, as Dougall Johnson demonstrated:
- *
- * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
- *
- * This packed representation is the reason for the seemingly odd choice of
- * state values below.
- */
-/* Error */
-#define ERR 0
-/* Begin */
-#define BGN 11
-/* Continuation states, expect 1/2/3 continuation bytes */
-#define CS1 16
-#define CS2 1
-#define CS3 5
-/* Leading byte was E0/ED, expect 1 more continuation byte */
-#define P3A 6
-#define P3B 20
-/* Leading byte was F0/F4, expect 2 more continuation bytes */
-#define P4A 25
-#define P4B 30
-/* Begin and End are the same state */
-#define END BGN
-
-/* the encoded state transitions for the lookup table */
-
-/* ASCII */
-#define ASC (END << BGN)
-/* 2-byte lead */
-#define L2A (CS1 << BGN)
-/* 3-byte lead */
-#define L3A (P3A << BGN)
-#define L3B (CS2 << BGN)
-#define L3C (P3B << BGN)
-/* 4-byte lead */
-#define L4A (P4A << BGN)
-#define L4B (CS3 << BGN)
-#define L4C (P4B << BGN)
-/* continuation byte */
-#define CR1 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B)
-#define CR2 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A)
-#define CR3 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A)
-/* invalid byte */
-#define ILL ERR
-
-static const uint32 Utf8Transition[256] =
+/* transition table for the UTF-8 DFA -- see pg_utf8.h for details */
+const uint32 Utf8Transition[256] =
{
/* ASCII */
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 6bd996b3d0..d93ccac263 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -699,57 +699,4 @@ extern int mic2latin_with_table(const unsigned char *mic, unsigned char *p,
extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len);
#endif
-
-/*
- * Verify a chunk of bytes for valid ASCII.
- *
- * Returns false if the input contains any zero bytes or bytes with the
- * high-bit set. Input len must be a multiple of 8.
- */
-static inline bool
-is_valid_ascii(const unsigned char *s, int len)
-{
- uint64 chunk,
- highbit_cum = UINT64CONST(0),
- zero_cum = UINT64CONST(0x8080808080808080);
-
- Assert(len % sizeof(chunk) == 0);
-
- while (len > 0)
- {
- memcpy(&chunk, s, sizeof(chunk));
-
- /*
- * Capture any zero bytes in this chunk.
- *
- * First, add 0x7f to each byte. This sets the high bit in each byte,
- * unless it was a zero. If any resulting high bits are zero, the
- * corresponding high bits in the zero accumulator will be cleared.
- *
- * If none of the bytes in the chunk had the high bit set, the max
- * value each byte can have after the addition is 0x7f + 0x7f = 0xfe,
- * and we don't need to worry about carrying over to the next byte. If
- * any input bytes did have the high bit set, it doesn't matter
- * because we check for those separately.
- */
- zero_cum &= (chunk + UINT64CONST(0x7f7f7f7f7f7f7f7f));
-
- /* Capture any set bits in this chunk. */
- highbit_cum |= chunk;
-
- s += sizeof(chunk);
- len -= sizeof(chunk);
- }
-
- /* Check if any high bits in the high bit accumulator got set. */
- if (highbit_cum & UINT64CONST(0x8080808080808080))
- return false;
-
- /* Check if any high bits in the zero accumulator got cleared. */
- if (zero_cum != UINT64CONST(0x8080808080808080))
- return false;
-
- return true;
-}
-
#endif /* PG_WCHAR_H */
diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c
index 351d79e1f0..7f23cffa21 100644
--- a/src/test/regress/regress.c
+++ b/src/test/regress/regress.c
@@ -1206,3 +1206,21 @@ binary_coercible(PG_FUNCTION_ARGS)
PG_RETURN_BOOL(IsBinaryCoercible(srctype, targettype));
}
+
+
+PG_FUNCTION_INFO_V1(benchmark_textpos);
+Datum
+benchmark_textpos(PG_FUNCTION_ARGS)
+{
+ text *t1 = PG_GETARG_TEXT_PP(0);
+ text *t2 = PG_GETARG_TEXT_PP(1);
+ int loops = PG_GETARG_INT32(2);
+ int i;
+ Datum result = Int32GetDatum(0);
+
+ for (i = 0; i < loops; i++)
+ {
+ result = DirectFunctionCall2Coll(textpos, PG_GET_COLLATION(), PointerGetDatum(t1), PointerGetDatum(t2));
+ }
+ return result;
+}
textpos-benchmark-utf8.sql
Description: application/sql
