I wrote:

> > I'll look for other rules that could be more
> > easily optimized, but I'm not terribly optimistic.
>
> I found a possible other way to bring the size of the transition table
> under 32k entries while keeping the existing no-backup rules in place:
> Replace the "quotecontinue" rule with a new state. In the attached
> draft patch, when Flex encounters a quote while inside any kind of
> quoted string, it saves the current state and enters %xqs (think
> 'quotestop'). If it then sees {whitespace_with_newline}{quote}, it
> reenters the previous state and continues to slurp the string;
> otherwise, it throws back everything and returns the string it just
> exited. Doing it this way is a bit uglier, but with some extra
> commentary it might not be too bad.

I had an epiphany and managed to get rid of the backup states.
Regression tests pass. The transition table is down to 30367 entries,
and the binary is smaller by 172 kB on Linux x86-64. Performance is
identical to master on both tests mentioned upthread. I'll clean this
up and add it to the commitfest.

--
John Naylor                https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
index e1cae859e8..67ad06da4f 100644
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -56,6 +56,8 @@ fprintf_to_ereport(const char *fmt, const char *msg)
 	ereport(ERROR, (errmsg_internal("%s", msg)));
 }
 
+static int state_before;
+
 /*
  * GUC variables.  This is a DIRECT violation of the warning given at the
  * head of gram.y, ie flex/bison code must not depend on any GUC variables;
@@ -168,6 +170,7 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner);
  *  <xd> delimited identifiers (double-quoted identifiers)
  *  <xh> hexadecimal numeric string
  *  <xq> standard quoted strings
+ *  <xqs> quote stop (detect continued strings)
  *  <xe> extended quoted strings (support backslash escape sequences)
  *  <xdolq> $foo$ quoted strings
  *  <xui> quoted identifier with Unicode escapes
@@ -185,6 +188,7 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner);
 %x xd
 %x xh
 %x xq
+%x xqs
 %x xe
 %x xdolq
 %x xui
@@ -231,19 +235,7 @@ special_whitespace		({space}+|{comment}{newline})
 horiz_whitespace		({horiz_space}|{comment})
 whitespace_with_newline	({horiz_whitespace}*{newline}{special_whitespace}*)
 
-/*
- * To ensure that {quotecontinue} can be scanned without having to back up
- * if the full pattern isn't matched, we include trailing whitespace in
- * {quotestop}.  This matches all cases where {quotecontinue} fails to match,
- * except for {quote} followed by whitespace and just one "-" (not two,
- * which would start a {comment}).  To cover that we have {quotefail}.
- * The actions for {quotestop} and {quotefail} must throw back characters
- * beyond the quote proper.
- */
 quote			'
-quotestop		{quote}{whitespace}*
-quotecontinue	{quote}{whitespace_with_newline}{quote}
-quotefail		{quote}{whitespace}*"-"
 
 /* Bit string
  * It is tempting to scan the string for only those characters
@@ -476,21 +468,10 @@ other			.
 					startlit();
 					addlitchar('b', yyscanner);
 				}
-<xb>{quotestop}	|
-<xb>{quotefail} {
-					yyless(1);
-					BEGIN(INITIAL);
-					yylval->str = litbufdup(yyscanner);
-					return BCONST;
-				}
 <xh>{xhinside}	|
 <xb>{xbinside}	{
 					addlit(yytext, yyleng, yyscanner);
 				}
-<xh>{quotecontinue}	|
-<xb>{quotecontinue}	{
-					/* ignore */
-				}
 <xb><<EOF>>		{ yyerror("unterminated bit string literal"); }
 
 {xhstart}		{
@@ -505,13 +486,6 @@ other			.
 					startlit();
 					addlitchar('x', yyscanner);
 				}
-<xh>{quotestop}	|
-<xh>{quotefail} {
-					yyless(1);
-					BEGIN(INITIAL);
-					yylval->str = litbufdup(yyscanner);
-					return XCONST;
-				}
 <xh><<EOF>>		{ yyerror("unterminated hexadecimal string literal"); }
 
 {xnstart}		{
@@ -568,28 +542,65 @@ other			.
 					BEGIN(xus);
 					startlit();
 				}
-<xq,xe>{quotestop}	|
-<xq,xe>{quotefail} {
-					yyless(1);
-					BEGIN(INITIAL);
+
+<xb,xh,xq,xe,xus>{quote} {
+					state_before = YYSTATE;
+					BEGIN(xqs);
+				}
+<xqs>{whitespace_with_newline}{quote} {
+					/* resume scanning string that started on a previous line */
+					BEGIN(state_before);
+				}
+<xqs>{quote} {
 					/*
-					 * check that the data remains valid if it might have been
-					 * made invalid by unescaping any chars.
+					 * SQL requires at least one newline in the whitespace separating
+					 * string literals that are to be concatenated, so throw an error
+					 * if we see the start of a new string on the same line.
 					 */
-					if (yyextra->saw_non_ascii)
-						pg_verifymbstr(yyextra->literalbuf,
-									   yyextra->literallen,
-									   false);
-					yylval->str = litbufdup(yyscanner);
-					return SCONST;
+					SET_YYLLOC();
+					ADVANCE_YYLLOC(yyleng - 1);
+					yyerror("syntax error");
 				}
-<xus>{quotestop} |
-<xus>{quotefail} {
-					/* throw back all but the quote */
-					yyless(1);
-					/* xusend state looks for possible UESCAPE */
-					BEGIN(xusend);
+<xqs>{whitespace}*[^']? |
+<xqs><<EOF>> {
+					/* throw back everything and handle the string we just scanned */
+					yyless(0);
+
+					switch (state_before)
+					{
+						case xb:
+							BEGIN(INITIAL);
+							yylval->str = litbufdup(yyscanner);
+							return BCONST;
+						case xh:
+							BEGIN(INITIAL);
+							yylval->str = litbufdup(yyscanner);
+							return XCONST;
+						case xe:
+							/* fallthrough */
+						case xq:
+							BEGIN(INITIAL);
+
+							/*
+							 * check that the data remains valid if it might have been
+							 * made invalid by unescaping any chars.
+							 */
+							if (yyextra->saw_non_ascii)
+								pg_verifymbstr(yyextra->literalbuf,
+											   yyextra->literallen,
+											   false);
+							yylval->str = litbufdup(yyscanner);
+							return SCONST;
+						case xus:
+							/* xusend state looks for possible UESCAPE */
+							BEGIN(xusend);
+							break;
+						default:
+							yyerror("unhandled previous state in quote continuation");
+					}
+
 				}
+
 <xusend>{whitespace} {
 					/* stay in xusend state over whitespace */
 				}
@@ -693,9 +704,6 @@ other			.
 					if (c == '\0' || IS_HIGHBIT_SET(c))
 						yyextra->saw_non_ascii = true;
 				}
-<xq,xe,xus>{quotecontinue} {
-					/* ignore */
-				}
 <xe>.			{
 					/* This is only needed for \ just before EOF */
 					addlitchar(yytext[0], yyscanner);

Reply via email to