Re: Another regexp performance improvement: skip useless paren-captures

Tom Lane Sun, 08 Aug 2021 13:25:22 -0700

Mark Dilger <[email protected]> writes:
> Hmm.  This changes the behavior when applied against master 
> (c1132aae336c41cf9d316222e525d8d593c2b5d2):


>  select regexp_split_to_array('uuuzkodphfbfbfb', '((.))(\1\2)', 'ntw');
>   regexp_split_to_array
>  -----------------------
> - {"",zkodphfbfbfb}
> + {uuuzkodphfbfbfb}
>  (1 row)

Ugh.  The regex engine is finding the match correctly, but it's failing to
tell the caller where it is :-(.  I was a little too cute in optimizing
the regmatch_t result-vector copying in pg_regexec, and forgot to ensure
that the overall match position would be reported.

Thanks for the testing!

                        regards, tom lane

diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c
index bf1dea6352..64816dd370 100644
--- a/contrib/pg_trgm/trgm_regexp.c
+++ b/contrib/pg_trgm/trgm_regexp.c
@@ -541,9 +541,11 @@ createTrgmNFA(text *text_re, Oid collation,
 	 * Stage 1: Compile the regexp into a NFA, using the regexp library.
 	 */
 #ifdef IGNORECASE
-	RE_compile(&regex, text_re, REG_ADVANCED | REG_ICASE, collation);
+	RE_compile(&regex, text_re,
+			   REG_ADVANCED | REG_NOSUB | REG_ICASE, collation);
 #else
-	RE_compile(&regex, text_re, REG_ADVANCED, collation);
+	RE_compile(&regex, text_re,
+			   REG_ADVANCED | REG_NOSUB, collation);
 #endif
 
 	/*
diff --git a/src/backend/regex/regc_lex.c b/src/backend/regex/regc_lex.c
index 7673dab76f..45727ffa01 100644
--- a/src/backend/regex/regc_lex.c
+++ b/src/backend/regex/regc_lex.c
@@ -528,10 +528,7 @@ next(struct vars *v)
 				}
 				assert(NOTREACHED);
 			}
-			if (v->cflags & REG_NOSUB)
-				RETV('(', 0);	/* all parens non-capturing */
-			else
-				RETV('(', 1);
+			RETV('(', 1);
 			break;
 		case CHR(')'):
 			if (LASTTYPE('('))
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index 60a220c57a..ae3a7b6a38 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -65,7 +65,7 @@ static struct subre *subre(struct vars *, int, int, struct state *, struct state
 static void freesubre(struct vars *, struct subre *);
 static void freesubreandsiblings(struct vars *, struct subre *);
 static void freesrnode(struct vars *, struct subre *);
-static void optst(struct vars *, struct subre *);
+static void removecaptures(struct vars *, struct subre *);
 static int	numst(struct subre *, int);
 static void markst(struct subre *);
 static void cleanst(struct vars *);
@@ -431,7 +431,8 @@ pg_regcomp(regex_t *re,
 		dumpst(v->tree, debug, 1);
 	}
 #endif
-	optst(v, v->tree);
+	if (v->cflags & REG_NOSUB)
+		removecaptures(v, v->tree);
 	v->ntree = numst(v->tree, 1);
 	markst(v->tree);
 	cleanst(v);
@@ -1013,14 +1014,16 @@ parseqatom(struct vars *v,
 			break;
 		case BACKREF:			/* the Feature From The Black Lagoon */
 			INSIST(type != LACON, REG_ESUBREG);
-			INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
-			INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
+			subno = v->nextvalue;
+			assert(subno > 0);
+			INSIST(subno < v->nsubs, REG_ESUBREG);
+			NOERRN();
+			INSIST(v->subs[subno] != NULL, REG_ESUBREG);
 			NOERRN();
-			assert(v->nextvalue > 0);
 			atom = subre(v, 'b', BACKR, lp, rp);
 			NOERRN();
-			subno = v->nextvalue;
 			atom->backno = subno;
+			v->subs[subno]->flags |= BRUSE;
 			EMPTYARC(lp, rp);	/* temporarily, so there's something */
 			NEXT();
 			break;
@@ -2141,19 +2144,54 @@ freesrnode(struct vars *v,		/* might be NULL */
 }
 
 /*
- * optst - optimize a subRE subtree
+ * removecaptures - remove unnecessary capture subREs
+ *
+ * If the caller said that it doesn't care about subexpression match data,
+ * we may delete the "capture" markers on subREs that are not referenced
+ * by any backrefs, and then simplify anything that's become non-messy.
+ * Call this only if REG_NOSUB flag is set.
  */
 static void
-optst(struct vars *v,
-	  struct subre *t)
+removecaptures(struct vars *v,
+			   struct subre *t)
 {
+	struct subre *t2;
+
+	assert(t != NULL);
+
+	/*
+	 * If this isn't itself a backref target, clear capno and tentatively
+	 * clear CAP flag.
+	 */
+	if (!(t->flags & BRUSE))
+	{
+		t->capno = 0;
+		t->flags &= ~CAP;
+	}
+
+	/* Now recurse to children */
+	for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
+	{
+		removecaptures(v, t2);
+		/* Propagate child CAP flag back up, if it's still set */
+		if (t2->flags & CAP)
+			t->flags |= CAP;
+	}
+
 	/*
-	 * DGP (2007-11-13): I assume it was the programmer's intent to eventually
-	 * come back and add code to optimize subRE trees, but the routine coded
-	 * just spends effort traversing the tree and doing nothing. We can do
-	 * nothing with less effort.
+	 * If t now contains neither captures nor backrefs, there's no longer any
+	 * need to care where its sub-match boundaries are, so we can reduce it to
+	 * a simple DFA node.  (Note in particular that MIXED child greediness is
+	 * not a hindrance here, so we don't use the MESSY() macro.)
 	 */
-	return;
+	if ((t->flags & (CAP | BACKR)) == 0)
+	{
+		if (t->child)
+			freesubreandsiblings(v, t->child);
+		t->child = NULL;
+		t->op = '=';
+		t->flags &= ~MIXED;
+	}
 }
 
 /*
@@ -2501,6 +2539,8 @@ stdump(struct subre *t,
 		fprintf(f, " hascapture");
 	if (t->flags & BACKR)
 		fprintf(f, " hasbackref");
+	if (t->flags & BRUSE)
+		fprintf(f, " isreferenced");
 	if (!(t->flags & INUSE))
 		fprintf(f, " UNUSED");
 	if (t->latype != (char) -1)
diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c
index 5b9a087820..c08a224618 100644
--- a/src/backend/regex/regexec.c
+++ b/src/backend/regex/regexec.c
@@ -213,12 +213,10 @@ pg_regexec(regex_t *re,
 		return REG_NOMATCH;
 	backref = (v->g->info & REG_UBACKREF) ? 1 : 0;
 	v->eflags = flags;
-	if (v->g->cflags & REG_NOSUB)
-		nmatch = 0;				/* override client */
 	v->nmatch = nmatch;
-	if (backref)
+	if (backref && nmatch <= v->g->nsub)
 	{
-		/* need work area */
+		/* need larger work area */
 		if (v->g->nsub + 1 <= LOCALMAT)
 			v->pmatch = mat;
 		else
@@ -229,7 +227,13 @@ pg_regexec(regex_t *re,
 		v->nmatch = v->g->nsub + 1;
 	}
 	else
+	{
+		/* we can store results directly in caller's array */
 		v->pmatch = pmatch;
+		/* ensure any extra entries in caller's array are filled with -1 */
+		if (nmatch > 0)
+			zapallsubs(pmatch, nmatch);
+	}
 	v->details = details;
 	v->start = (chr *) string;
 	v->search_start = (chr *) string + search_start;
@@ -290,12 +294,20 @@ pg_regexec(regex_t *re,
 	else
 		st = find(v, &v->g->tree->cnfa, &v->g->cmap);
 
-	/* copy (portion of) match vector over if necessary */
-	if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0)
+	/* on success, ensure caller's match vector is filled correctly */
+	if (st == REG_OKAY && nmatch > 0)
 	{
-		zapallsubs(pmatch, nmatch);
-		n = (nmatch < v->nmatch) ? nmatch : v->nmatch;
-		memcpy(VS(pmatch), VS(v->pmatch), n * sizeof(regmatch_t));
+		if (v->pmatch != pmatch)
+		{
+			/* copy portion of match vector over from (larger) work area */
+			assert(nmatch <= v->nmatch);
+			memcpy(VS(pmatch), VS(v->pmatch), nmatch * sizeof(regmatch_t));
+		}
+		if (v->g->cflags & REG_NOSUB)
+		{
+			/* don't expose possibly-partial sub-match results to caller */
+			zapallsubs(pmatch, nmatch);
+		}
 	}
 
 	/* clean up */
@@ -752,7 +764,6 @@ cdissect(struct vars *v,
 			break;
 		case '(':				/* no-op capture node */
 			assert(t->child != NULL);
-			assert(t->capno > 0);
 			er = cdissect(v, t->child, begin, end);
 			break;
 		default:
diff --git a/src/backend/utils/adt/jsonpath_gram.y b/src/backend/utils/adt/jsonpath_gram.y
index de3d97931e..bd5d4488a0 100644
--- a/src/backend/utils/adt/jsonpath_gram.y
+++ b/src/backend/utils/adt/jsonpath_gram.y
@@ -583,6 +583,14 @@ jspConvertRegexFlags(uint32 xflags)
 					 errmsg("XQuery \"x\" flag (expanded regular expressions) is not implemented")));
 	}
 
+	/*
+	 * We'll never need sub-match details at execution.  While
+	 * RE_compile_and_execute would set this flag anyway, force it on here to
+	 * ensure that the regex cache entries created by makeItemLikeRegex are
+	 * useful.
+	 */
+	cflags |= REG_NOSUB;
+
 	return cflags;
 }
 
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c
index 484d4265fd..268cee1cbe 100644
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -347,6 +347,10 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len,
 {
 	regex_t    *re;
 
+	/* Use REG_NOSUB if caller does not want sub-match details */
+	if (nmatch < 2)
+		cflags |= REG_NOSUB;
+
 	/* Compile RE */
 	re = RE_compile_and_cache(text_re, cflags, collation);
 
@@ -1412,6 +1416,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
 	int			orig_len;
 	pg_wchar   *wide_str;
 	int			wide_len;
+	int			cflags;
 	regex_t    *cpattern;
 	regmatch_t *pmatch;
 	int			pmatch_len;
@@ -1430,7 +1435,10 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
 	wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);
 
 	/* set up the compiled pattern */
-	cpattern = RE_compile_and_cache(pattern, re_flags->cflags, collation);
+	cflags = re_flags->cflags;
+	if (!use_subpatterns)
+		cflags |= REG_NOSUB;
+	cpattern = RE_compile_and_cache(pattern, cflags, collation);
 
 	/* do we want to remember subpatterns? */
 	if (use_subpatterns && cpattern->re_nsub > 0)
@@ -1952,7 +1960,7 @@ regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation,
 	if (case_insensitive)
 		cflags |= REG_ICASE;
 
-	re = RE_compile_and_cache(text_re, cflags, collation);
+	re = RE_compile_and_cache(text_re, cflags | REG_NOSUB, collation);
 
 	/* Examine it to see if there's a fixed prefix */
 	re_result = pg_regprefix(re, &str, &slen);
diff --git a/src/include/regex/regex.h b/src/include/regex/regex.h
index 2b48a19fb7..0455ae8069 100644
--- a/src/include/regex/regex.h
+++ b/src/include/regex/regex.h
@@ -106,7 +106,7 @@ typedef struct
 #define REG_QUOTE	000004		/* no special characters, none */
 #define REG_NOSPEC	REG_QUOTE	/* historical synonym */
 #define REG_ICASE	000010		/* ignore case */
-#define REG_NOSUB	000020		/* don't care about subexpressions */
+#define REG_NOSUB	000020		/* caller doesn't need subexpr match data */
 #define REG_EXPANDED	000040	/* expanded format, white space & comments */
 #define REG_NLSTOP	000100		/* \n doesn't match . or [^ ] */
 #define REG_NLANCH	000200		/* ^ matches after \n, $ before */
diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h
index 8db631c83b..91a52840c4 100644
--- a/src/include/regex/regguts.h
+++ b/src/include/regex/regguts.h
@@ -477,13 +477,14 @@ struct subre
 #define  MIXED	 04				/* mixed preference below */
 #define  CAP	 010			/* capturing parens here or below */
 #define  BACKR	 020			/* back reference here or below */
+#define  BRUSE	 040			/* is referenced by a back reference */
 #define  INUSE	 0100			/* in use in final tree */
-#define  NOPROP  03				/* bits which may not propagate up */
+#define  UPPROP  (MIXED|CAP|BACKR)	/* flags which should propagate up */
 #define  LMIX(f) ((f)<<2)		/* LONGER -> MIXED */
 #define  SMIX(f) ((f)<<1)		/* SHORTER -> MIXED */
-#define  UP(f)	 (((f)&~NOPROP) | (LMIX(f) & SMIX(f) & MIXED))
+#define  UP(f)	 (((f)&UPPROP) | (LMIX(f) & SMIX(f) & MIXED))
 #define  MESSY(f)	 ((f)&(MIXED|CAP|BACKR))
-#define  PREF(f) ((f)&NOPROP)
+#define  PREF(f)	 ((f)&(LONGER|SHORTER))
 #define  PREF2(f1, f2)	 ((PREF(f1) != 0) ? PREF(f1) : PREF(f2))
 #define  COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
 	char		latype;			/* LATYPE code, if lookaround constraint */
diff --git a/src/test/modules/test_regex/expected/test_regex.out b/src/test/modules/test_regex/expected/test_regex.out
index 44da7d2019..5a6cdf47c2 100644
--- a/src/test/modules/test_regex/expected/test_regex.out
+++ b/src/test/modules/test_regex/expected/test_regex.out
@@ -146,12 +146,12 @@ select * from test_regex('(a)e', 'ae', '-');
  {ae,a}
 (2 rows)
 
--- expectMatch	4.2  o		(a)e		ae
-select * from test_regex('(a)e', 'ae', 'o');
- test_regex 
-------------
- {0}
- {NULL}
+-- expectMatch	4.2  oPR	(.)\1e		abeaae	aae	{}
+select * from test_regex('(.)\1e', 'abeaae', 'oPR');
+           test_regex           
+--------------------------------
+ {1,REG_UBACKREF,REG_UNONPOSIX}
+ {aae,NULL}
 (2 rows)
 
 -- expectMatch	4.3  b		{\(a\)b}	ab	ab	a
@@ -2658,6 +2658,20 @@ select * from test_regex('(\w+(?= )).*\1', 'abc abcd abd', 'HLRP');
  {"abc abc",abc}
 (2 rows)
 
+-- exercise oversize-regmatch_t-array paths in regexec()
+-- (that case is not reachable via test_regex, sadly)
+select substring('fffoooooooooooooooooooooooooooooooo', '^(.)\1(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)');
+ substring 
+-----------
+ f
+(1 row)
+
+select regexp_split_to_array('abcxxxdefyyyghi', '((.))(\1\2)');
+ regexp_split_to_array 
+-----------------------
+ {abc,def,ghi}
+(1 row)
+
 -- doing 15 "octal escapes vs back references"
 -- # initial zero is always octal
 -- expectMatch	15.1  MP	"a\\010b"	"a\bb"	"a\bb"
@@ -3476,6 +3490,22 @@ select * from test_regex('((.))(\2){0}', 'xy', 'RPQ');
  {x,x,x,NULL}
 (2 rows)
 
+-- expectMatch	21.37 RP	((.))(\2)	xyy	yy	y	y	y
+select * from test_regex('((.))(\2)', 'xyy', 'RP');
+           test_regex           
+--------------------------------
+ {3,REG_UBACKREF,REG_UNONPOSIX}
+ {yy,y,y,y}
+(2 rows)
+
+-- expectMatch	21.38 oRP	((.))(\2)	xyy	yy	{}	{}	{}
+select * from test_regex('((.))(\2)', 'xyy', 'oRP');
+           test_regex           
+--------------------------------
+ {3,REG_UBACKREF,REG_UNONPOSIX}
+ {yy,NULL,NULL,NULL}
+(2 rows)
+
 -- doing 22 "multicharacter collating elements"
 -- # again ugh
 -- MCCEs are not implemented in Postgres, so we skip all these tests
diff --git a/src/test/modules/test_regex/sql/test_regex.sql b/src/test/modules/test_regex/sql/test_regex.sql
index 9224fdfdd3..3419564203 100644
--- a/src/test/modules/test_regex/sql/test_regex.sql
+++ b/src/test/modules/test_regex/sql/test_regex.sql
@@ -63,8 +63,8 @@ select * from test_regex('ab', 'ab', 'b');
 
 -- expectMatch	4.1  -		(a)e		ae	ae	a
 select * from test_regex('(a)e', 'ae', '-');
--- expectMatch	4.2  o		(a)e		ae
-select * from test_regex('(a)e', 'ae', 'o');
+-- expectMatch	4.2  oPR	(.)\1e		abeaae	aae	{}
+select * from test_regex('(.)\1e', 'abeaae', 'oPR');
 -- expectMatch	4.3  b		{\(a\)b}	ab	ab	a
 select * from test_regex('\(a\)b', 'ab', 'b');
 -- expectMatch	4.4  -		a((b)c)		abc	abc	bc	b
@@ -775,6 +775,11 @@ select * from test_regex('(^\w+).*\1', 'abc abc abc', 'LRP');
 select * from test_regex('(^\w+\M).*\1', 'abc abcd abd', 'LRP');
 select * from test_regex('(\w+(?= )).*\1', 'abc abcd abd', 'HLRP');
 
+-- exercise oversize-regmatch_t-array paths in regexec()
+-- (that case is not reachable via test_regex, sadly)
+select substring('fffoooooooooooooooooooooooooooooooo', '^(.)\1(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)');
+select regexp_split_to_array('abcxxxdefyyyghi', '((.))(\1\2)');
+
 -- doing 15 "octal escapes vs back references"
 
 -- # initial zero is always octal
@@ -1011,6 +1016,10 @@ select * from test_regex('(a*)*', 'bc', 'N');
 select * from test_regex(' TO (([a-z0-9._]+|"([^"]+|"")+")+)', 'asd TO foo', 'M');
 -- expectMatch	21.36 RPQ	((.))(\2){0}	xy	x	x	x	{}
 select * from test_regex('((.))(\2){0}', 'xy', 'RPQ');
+-- expectMatch	21.37 RP	((.))(\2)	xyy	yy	y	y	y
+select * from test_regex('((.))(\2)', 'xyy', 'RP');
+-- expectMatch	21.38 oRP	((.))(\2)	xyy	yy	{}	{}	{}
+select * from test_regex('((.))(\2)', 'xyy', 'oRP');
 
 -- doing 22 "multicharacter collating elements"
 -- # again ugh

Re: Another regexp performance improvement: skip useless paren-captures

Reply via email to