While poking at that xml issue I tried to compare the memory usage of
the xml queries with vaguely comparable regexp_split_to_table queries,
and found that I could not do so; the regexp queries simply did not
complete in any sensible timeframe.
Investigation showed that in UTF8 (or other multibyte encodings), the
performance of regexp_matches and regexp_split_to_* is subject to
quadratic slowdowns thanks to the use of character offsets and calls to
text_substr, which must scan the string from the start each time to
count characters.
This patch changes that by keeping the wide-char copy of the string
available rather than freeing it, and converting the required substrings
back to the db encoding via a pre-allocated conversion buffer.
This gives noticable speedups on even very small strings (all timings
below are with -O2 --disable-cassert):
select count(*)
from (select 'aaaaa,aaaaa,aaaaa'::text as content
from generate_series(1,1000000) i offset 0) s,
regexp_split_to_table(content, ',') r;
-- ~10% faster with patch: 2.8 s -> 2.5 s
but on strings of even modest size (~1kb) the improvement is vast:
select count(*)
from (select repeat(repeat('a',10) || ',', 100+(i*0))::text
as content -- 1100 bytes, 101 matches
from generate_series(1,100000) i offset 0) s,
regexp_split_to_table(content, ',') r;
-- over 8 times faster: 51.8 sec -> 6.3 sec
and it only gets bigger:
select count(*)
from regexp_split_to_table(repeat('aaa,',10000), ',') r; -- 40KB
-- 270 times faster: 1628ms -> 6ms
This patch passes regression but I haven't yet tested it on complex
multibyte data (or non-UTF8 encodings but that shouldn't matter).
Comments?
--
Andrew (irc:RhodiumToad)
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c
index 5025a449fb..8839fb4ce2 100644
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -61,6 +61,9 @@ typedef struct regexp_matches_ctx
/* workspace for build_regexp_match_result() */
Datum *elems; /* has npatterns elements */
bool *nulls; /* has npatterns elements */
+ pg_wchar *wide_str; /* wide-char version of original string */
+ char *conv_buf; /* conversion buffer */
+ size_t conv_bufsiz; /* size thereof */
} regexp_matches_ctx;
/*
@@ -111,7 +114,8 @@ static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
pg_re_flags *flags,
Oid collation,
bool use_subpatterns,
- bool ignore_degenerate);
+ bool ignore_degenerate,
+ bool fetching_unmatched);
static void cleanup_regexp_matches(regexp_matches_ctx *matchctx);
static ArrayType *build_regexp_match_result(regexp_matches_ctx *matchctx);
static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
@@ -863,7 +867,7 @@ regexp_match(PG_FUNCTION_ARGS)
errhint("Use the regexp_matches function instead.")));
matchctx = setup_regexp_matches(orig_str, pattern, &re_flags,
- PG_GET_COLLATION(), true, false);
+ PG_GET_COLLATION(), true, false, false);
if (matchctx->nmatches == 0)
PG_RETURN_NULL();
@@ -911,7 +915,7 @@ regexp_matches(PG_FUNCTION_ARGS)
matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
&re_flags,
PG_GET_COLLATION(),
- true, false);
+ true, false, false);
/* Pre-create workspace that build_regexp_match_result needs */
matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
@@ -954,7 +958,7 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS)
* all the matching in one swoop. The returned regexp_matches_ctx contains
* the locations of all the substrings matching the pattern.
*
- * The two bool parameters have only two patterns (one for matching, one for
+ * The three bool parameters have only two patterns (one for matching, one for
* splitting) but it seems clearer to distinguish the functionality this way
* than to key it all off one "is_split" flag.
*/
@@ -962,9 +966,11 @@ static regexp_matches_ctx *
setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
Oid collation,
bool use_subpatterns,
- bool ignore_degenerate)
+ bool ignore_degenerate,
+ bool fetching_unmatched)
{
regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
+ int eml = pg_database_encoding_max_length();
int orig_len;
pg_wchar *wide_str;
int wide_len;
@@ -975,6 +981,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
int array_idx;
int prev_match_end;
int start_search;
+ int maxlen = 0; /* largest fetch length in characters */
/* save original string --- we'll extract result substrings from it */
matchctx->orig_str = orig_str;
@@ -1024,7 +1031,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
pmatch[0].rm_eo > prev_match_end))
{
/* enlarge output space if needed */
- while (array_idx + matchctx->npatterns * 2 > array_len)
+ while (array_idx + matchctx->npatterns * 2 + 1 > array_len)
{
array_len *= 2;
matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
@@ -1038,16 +1045,29 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
for (i = 1; i <= matchctx->npatterns; i++)
{
- matchctx->match_locs[array_idx++] = pmatch[i].rm_so;
- matchctx->match_locs[array_idx++] = pmatch[i].rm_eo;
+ int so = pmatch[i].rm_so;
+ int eo = pmatch[i].rm_eo;
+ matchctx->match_locs[array_idx++] = so;
+ matchctx->match_locs[array_idx++] = eo;
+ if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
+ maxlen = (eo - so);
}
}
else
{
- matchctx->match_locs[array_idx++] = pmatch[0].rm_so;
- matchctx->match_locs[array_idx++] = pmatch[0].rm_eo;
+ int so = pmatch[0].rm_so;
+ int eo = pmatch[0].rm_eo;
+ matchctx->match_locs[array_idx++] = so;
+ matchctx->match_locs[array_idx++] = eo;
+ if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
+ maxlen = (eo - so);
}
matchctx->nmatches++;
+
+ if (fetching_unmatched &&
+ pmatch[0].rm_so >= 0 &&
+ (pmatch[0].rm_so - prev_match_end) > maxlen)
+ maxlen = (pmatch[0].rm_so - prev_match_end);
}
prev_match_end = pmatch[0].rm_eo;
@@ -1068,8 +1088,45 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
break;
}
+ if (fetching_unmatched &&
+ (wide_len - prev_match_end) > maxlen)
+ maxlen = (wide_len - prev_match_end);
+
+ /*
+ * Keep a note of the end position of the string for the benefit of
+ * splitting code.
+ */
+ matchctx->match_locs[array_idx] = wide_len;
+
+ if (eml > 1)
+ {
+ int64 maxsiz = (int64) maxlen * eml;
+ size_t conv_bufsiz;
+
+ /*
+ * worst case: assume we need the maximum size (maxlen*eml), but take
+ * advantage of the fact that the original string length in bytes is an
+ * upper bound on the byte length of any fetched substring (and we know
+ * that len+1 is safe to allocate because the varlena header is longer
+ * than 1 byte).
+ */
+ if (maxsiz > orig_len)
+ conv_bufsiz = (size_t) orig_len + 1;
+ else
+ conv_bufsiz = (size_t) maxsiz + 1; /* safe since maxsiz < 2^30 */
+ matchctx->conv_buf = palloc(conv_bufsiz);
+ matchctx->conv_bufsiz = conv_bufsiz;
+ matchctx->wide_str = wide_str;
+ }
+ else
+ {
+ pfree(wide_str);
+ matchctx->wide_str = NULL;
+ matchctx->conv_buf = NULL;
+ matchctx->conv_bufsiz = 0;
+ }
+
/* Clean up temp storage */
- pfree(wide_str);
pfree(pmatch);
return matchctx;
@@ -1082,6 +1139,10 @@ static void
cleanup_regexp_matches(regexp_matches_ctx *matchctx)
{
pfree(matchctx->orig_str);
+ if (matchctx->wide_str)
+ pfree(matchctx->wide_str);
+ if (matchctx->conv_buf)
+ pfree(matchctx->conv_buf);
pfree(matchctx->match_locs);
if (matchctx->elems)
pfree(matchctx->elems);
@@ -1096,6 +1157,7 @@ cleanup_regexp_matches(regexp_matches_ctx *matchctx)
static ArrayType *
build_regexp_match_result(regexp_matches_ctx *matchctx)
{
+ int eml = pg_database_encoding_max_length();
Datum *elems = matchctx->elems;
bool *nulls = matchctx->nulls;
int dims[1];
@@ -1115,6 +1177,17 @@ build_regexp_match_result(regexp_matches_ctx *matchctx)
elems[i] = (Datum) 0;
nulls[i] = true;
}
+ else if (eml > 1)
+ {
+ size_t bufsiz PG_USED_FOR_ASSERTS_ONLY = matchctx->conv_bufsiz;
+ char *buf = matchctx->conv_buf;
+ int len = pg_wchar2mb_with_len(matchctx->wide_str + so,
+ buf,
+ eo - so);
+ Assert(len < bufsiz);
+ elems[i] = PointerGetDatum(cstring_to_text_with_len(buf, len));
+ nulls[i] = false;
+ }
else
{
elems[i] = DirectFunctionCall3(text_substr,
@@ -1168,7 +1241,7 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
&re_flags,
PG_GET_COLLATION(),
- false, true);
+ false, true, true);
MemoryContextSwitchTo(oldcontext);
funcctx->user_fctx = (void *) splitctx;
@@ -1224,7 +1297,7 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
PG_GETARG_TEXT_PP(1),
&re_flags,
PG_GET_COLLATION(),
- false, true);
+ false, true, true);
while (splitctx->next_match <= splitctx->nmatches)
{
@@ -1261,6 +1334,7 @@ regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
static Datum
build_regexp_split_result(regexp_matches_ctx *splitctx)
{
+ int eml = pg_database_encoding_max_length();
int startpos;
int endpos;
@@ -1271,7 +1345,22 @@ build_regexp_split_result(regexp_matches_ctx *splitctx)
if (startpos < 0)
elog(ERROR, "invalid match ending position");
- if (splitctx->next_match < splitctx->nmatches)
+ if (eml > 1)
+ {
+ size_t bufsiz PG_USED_FOR_ASSERTS_ONLY = splitctx->conv_bufsiz;
+ char *buf = splitctx->conv_buf;
+ int len;
+
+ endpos = splitctx->match_locs[splitctx->next_match * 2];
+ if (endpos < startpos)
+ elog(ERROR, "invalid match starting position");
+ len = pg_wchar2mb_with_len(splitctx->wide_str + startpos,
+ buf,
+ endpos-startpos);
+ Assert(len < bufsiz);
+ return PointerGetDatum(cstring_to_text_with_len(buf, len));
+ }
+ else if (splitctx->next_match < splitctx->nmatches)
{
endpos = splitctx->match_locs[splitctx->next_match * 2];
if (endpos < startpos)