From 9b9b2882905409b91a26ee8f92961450af6591d7 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@2ndquadrant.com>
Date: Thu, 27 Jun 2019 13:52:58 +0800
Subject: [PATCH v3 2/2] Replace the Flex quotestop rules with a new exclusive
 state

When the scanner encounters a quote while inside any kind of quoted
string, it now saves the current start state and enters a new exclusive
state, xqs, to detect a string continuation, if any. This replaces the
per-state quotestop, quotefail and quotecontinue rules and brings the
number of scanner states down to 29521, which is small enough to allow
Flex to use 16-bit types in the yy_transition array. That in turn
reduces the size of the postgres binary by 171kB.
---
 src/backend/parser/scan.l | 110 ++++++++++++++++++++------------------
 1 file changed, 59 insertions(+), 51 deletions(-)

diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
index 90f96c446f..525cef4b02 100644
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -56,6 +56,8 @@ fprintf_to_ereport(const char *fmt, const char *msg)
 	ereport(ERROR, (errmsg_internal("%s", msg)));
 }
 
+static int state_before;	/* start state saved when entering <xqs>; NOTE(review): file-level static in a scanner that takes yyscan_t -- consider keeping this in yyextra */
+
 /*
  * GUC variables.  This is a DIRECT violation of the warning given at the
  * head of gram.y, ie flex/bison code must not depend on any GUC variables;
@@ -168,6 +170,7 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner);
  *  <xd> delimited identifiers (double-quoted identifiers)
  *  <xh> hexadecimal numeric string
  *  <xq> standard quoted strings
+ *  <xqs> quote stop (detect continued strings)
  *  <xe> extended quoted strings (support backslash escape sequences)
  *  <xdolq> $foo$ quoted strings
  *  <xui> quoted identifier with Unicode escapes
@@ -185,6 +188,7 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner);
 %x xd
 %x xh
 %x xq
+%x xqs
 %x xe
 %x xdolq
 %x xui
@@ -231,19 +235,9 @@ special_whitespace		({space}|{comment}{newline})
 horiz_whitespace		({horiz_space}|{comment})
 whitespace_with_newline	({horiz_whitespace}*{newline}{special_whitespace}*)
 
-/*
- * To ensure that {quotecontinue} can be scanned without having to back up
- * if the full pattern isn't matched, we include trailing whitespace in
- * {quotestop}.  This matches all cases where {quotecontinue} fails to match,
- * except for {quote} followed by whitespace and just one "-" (not two,
- * which would start a {comment}).  To cover that we have {quotefail}.
- * The actions for {quotestop} and {quotefail} must throw back characters
- * beyond the quote proper.
- */
 quote			'
-quotestop		{quote}{whitespace}*
-quotecontinue	{quote}{whitespace_with_newline}{quote}
-quotefail		{quote}{whitespace}*"-"
+quotecontinue		{whitespace_with_newline}{quote}
+quotecontinuefail	{whitespace}*{other}?
 
 /* Bit string
  * It is tempting to scan the string for only those characters
@@ -476,21 +470,10 @@ other			.
 					startlit();
 					addlitchar('b', yyscanner);
 				}
-<xb>{quotestop}	|
-<xb>{quotefail} {
-					yyless(1);
-					BEGIN(INITIAL);
-					yylval->str = litbufdup(yyscanner);
-					return BCONST;
-				}
 <xh>{xhinside}	|
 <xb>{xbinside}	{
 					addlit(yytext, yyleng, yyscanner);
 				}
-<xh>{quotecontinue}	|
-<xb>{quotecontinue}	{
-					/* ignore */
-				}
 <xb><<EOF>>		{ yyerror("unterminated bit string literal"); }
 
 {xhstart}		{
@@ -505,13 +488,6 @@ other			.
 					startlit();
 					addlitchar('x', yyscanner);
 				}
-<xh>{quotestop}	|
-<xh>{quotefail} {
-					yyless(1);
-					BEGIN(INITIAL);
-					yylval->str = litbufdup(yyscanner);
-					return XCONST;
-				}
 <xh><<EOF>>		{ yyerror("unterminated hexadecimal string literal"); }
 
 {xnstart}		{
@@ -568,28 +544,63 @@ other			.
 					BEGIN(xus);
 					startlit();
 				}
-<xq,xe>{quotestop}	|
-<xq,xe>{quotefail} {
-					yyless(1);
-					BEGIN(INITIAL);
+
+<xb,xh,xq,xe,xus>{quote} {
 					/*
-					 * check that the data remains valid if it might have been
-					 * made invalid by unescaping any chars.
+					 * A quote seen inside a quoted string either ends the
+					 * literal or precedes a continuation.  Save the current
+					 * start state and switch to <xqs>, which looks ahead to
+					 * decide; the quote is consumed, not added to the literal.
 					 */
-					if (yyextra->saw_non_ascii)
-						pg_verifymbstr(yyextra->literalbuf,
-									   yyextra->literallen,
-									   false);
-					yylval->str = litbufdup(yyscanner);
-					return SCONST;
+					state_before = YYSTATE;
+					BEGIN(xqs);
 				}
-<xus>{quotestop} |
-<xus>{quotefail} {
-					/* throw back all but the quote */
-					yyless(1);
-					/* xusend state looks for possible UESCAPE */
-					BEGIN(xusend);
+<xqs>{quotecontinue} {
+					BEGIN(state_before);	/* continuation found: resume the saved in-string state */
+				}
+<xqs><<EOF>> |
+<xqs>{quotecontinuefail} {
+					/*
+					 * No continuation follows: push back all lookahead
+					 * and finish the literal per the saved start state.
+					 */
+					yyless(0);
+
+					switch (state_before)
+					{
+						case xb:
+							BEGIN(INITIAL);
+							yylval->str = litbufdup(yyscanner);
+							return BCONST;
+						case xh:
+							BEGIN(INITIAL);
+							yylval->str = litbufdup(yyscanner);
+							return XCONST;
+						case xe:
+							/* fallthrough */
+						case xq:
+							BEGIN(INITIAL);
+
+							/*
+							 * Check that the data remains valid if it
+							 * might have been made invalid by unescaping
+							 * any chars.
+							 */
+							if (yyextra->saw_non_ascii)
+								pg_verifymbstr(yyextra->literalbuf,
+											   yyextra->literallen,
+											   false);
+							yylval->str = litbufdup(yyscanner);
+							return SCONST;
+						case xus:
+							/* xusend state looks for possible UESCAPE */
+							BEGIN(xusend);
+							break;	/* no token returned here; the action ends with xusend active */
+						default:
+							yyerror("unhandled previous state after endquote");
+					}
 				}
+
 <xusend>{whitespace} {
 					/* stay in xusend state over whitespace */
 				}
@@ -693,9 +704,6 @@ other			.
 					if (c == '\0' || IS_HIGHBIT_SET(c))
 						yyextra->saw_non_ascii = true;
 				}
-<xq,xe,xus>{quotecontinue} {
-					/* ignore */
-				}
 <xe>\\			{
 					addlitchar(yytext[0], yyscanner);
 				}
-- 
2.17.2 (Apple Git-113)

