This is an automated email from the ASF dual-hosted git repository.
jhyde pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/calcite.git
The following commit(s) were added to refs/heads/main by this push:
new f8c33df67e [CALCITE-5305] Support character literals with C-style
escapes
f8c33df67e is described below
commit f8c33df67e4197206af5369fce8f67ad94deddaf
Author: dssysolyatin <[email protected]>
AuthorDate: Fri Sep 30 18:31:58 2022 +0300
[CALCITE-5305] Support character literals with C-style escapes
Document the new literal format in the SQL Reference.
Contains code copied from
https://github.com/crate/crate/blob/master/libs/sql-parser/src/main/java/io/crate/sql/Literals.java
Close apache/calcite#2926
---
core/src/main/codegen/templates/Parser.jj | 13 ++
.../apache/calcite/sql/parser/SqlParserUtil.java | 186 ++++++++++++++++++++-
site/_docs/reference.md | 2 +-
.../apache/calcite/sql/parser/SqlParserTest.java | 68 ++++++++
4 files changed, 263 insertions(+), 6 deletions(-)
diff --git a/core/src/main/codegen/templates/Parser.jj
b/core/src/main/codegen/templates/Parser.jj
index 7f26f806f3..f258e8235d 100644
--- a/core/src/main/codegen/templates/Parser.jj
+++ b/core/src/main/codegen/templates/Parser.jj
@@ -4504,6 +4504,17 @@ SqlNode StringLiteral() :
return SqlStdOperatorTable.LITERAL_CHAIN.createCall(pos2, rands);
}
}
+|
+ <C_STYLE_ESCAPED_STRING_LITERAL>
+ {
+ try {
+ p = SqlParserUtil.parseCString(getToken(0).image);
+ } catch (SqlParserUtil.MalformedUnicodeEscape e) {
+ throw SqlUtil.newContextException(getPos(),
+ RESOURCE.unicodeEscapeMalformed(e.i));
+ }
+ return SqlLiteral.createCharString(p, "UTF16", getPos());
+ }
|
<BIG_QUERY_DOUBLE_QUOTED_STRING>
{
@@ -8178,6 +8189,8 @@ void NonReservedKeyWord2of3() :
< PREFIXED_STRING_LITERAL: ("_" <CHARSETNAME> | "N") <QUOTED_STRING> >
|
< UNICODE_STRING_LITERAL: "U" "&" <QUOTED_STRING> >
+|
+ < C_STYLE_ESCAPED_STRING_LITERAL: "E" <QUOTE> ( (~["'", "\\"]) | ("\\"
~[]) | "''")* <QUOTE> >
|
< #CHARSETNAME: (["a"-"z","A"-"Z","0"-"9"])
(["a"-"z","A"-"Z","0"-"9",":",".","-","_"])*
diff --git
a/core/src/main/java/org/apache/calcite/sql/parser/SqlParserUtil.java
b/core/src/main/java/org/apache/calcite/sql/parser/SqlParserUtil.java
index f625c9a0b1..5141c19d2e 100644
--- a/core/src/main/java/org/apache/calcite/sql/parser/SqlParserUtil.java
+++ b/core/src/main/java/org/apache/calcite/sql/parser/SqlParserUtil.java
@@ -61,12 +61,12 @@ import java.util.ArrayList;
import java.util.Calendar;
import java.util.List;
import java.util.Locale;
-import java.util.Objects;
import java.util.StringTokenizer;
import java.util.function.Predicate;
import static org.apache.calcite.util.Static.RESOURCE;
+import static java.lang.Integer.parseInt;
import static java.util.Objects.requireNonNull;
/**
@@ -110,6 +110,158 @@ public final class SqlParserUtil {
return strip(s, "'", "'", "''", Casing.UNCHANGED);
}
+ /**
+ * Converts the contents of a SQL quoted character literal with C-style
+ * escapes into the corresponding Java string representation.
+ *
+ * <p>Works in two steps: the literal is first passed to
+ * {@code parseString} (defined outside this hunk; presumably it removes the
+ * enclosing quotes - confirm against that method), and the result is then
+ * passed to {@link #replaceEscapedChars(String)}, which decodes the
+ * backslash escapes.
+ *
+ * @param s Literal as it appears in the SQL source text
+ * @return Value of the literal with escapes decoded
+ * @throws MalformedUnicodeEscape if input contains invalid unicode escapes
+ */
+ public static String parseCString(String s) throws MalformedUnicodeEscape {
+ final String s2 = parseString(s);
+ return replaceEscapedChars(s2);
+ }
+
+ /**
+ * Converts the contents of a character literal with escapes like those used
+ * in the C programming language to the corresponding Java string
+ * representation.
+ *
+ * <p>If the literal "{@code E'a\tc'}" occurs in the SQL source text, then
+ * this method will be invoked with the string "{@code a\tc}" (4 characters)
+ * and will return a Java string with the three characters 'a', TAB, 'c'.
+ *
+ * <p>Recognized escapes, each introduced by a backslash:
+ * <ul>
+ * <li>{@code b}, {@code f}, {@code n}, {@code r}, {@code t}: backspace,
+ * form feed, newline, carriage return, tab;
+ * <li>a backslash or a single quote: that character itself;
+ * <li>{@code u} followed by exactly 4 hex digits, or {@code U} followed by
+ * exactly 8 hex digits: the Unicode code point with that value;
+ * <li>{@code x} followed by 1 or 2 hex digits: the character with that
+ * value; if no hex digit follows, the output is just {@code x};
+ * <li>a digit {@code 0} to {@code 3}, optionally followed by up to 2 more
+ * octal digits: the character with that octal value.
+ * </ul>
+ *
+ * <p>A backslash followed by any other character, and a backslash at the
+ * very end of the input, are copied to the output unchanged.
+ *
+ * <p>The format is modelled on Postgres; see
+ * <a href="https://www.postgresql.org/docs/14/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS">
+ * Postgres 4.1.2.2. String Constants With C-Style Escapes</a>.
+ *
+ * @param input String that contains C-style escapes
+ * @return String with escapes converted into Java characters
+ * @throws MalformedUnicodeEscape if a unicode escape is too short, contains
+ * a character that is not a hex digit, or does not denote a valid Unicode
+ * code point; the exception carries the index of the escape's backslash
+ */
+ public static String replaceEscapedChars(String input)
+ throws MalformedUnicodeEscape {
+ // The implementation of this method is based on Crate's method
+ // Literals.replaceEscapedChars.
+ final int length = input.length();
+ if (length <= 1) {
+ // a string this short cannot contain a complete escape
+ return input;
+ }
+ final StringBuilder builder = new StringBuilder(length);
+ int endIdx;
+ for (int i = 0; i < length; i++) {
+ char currentChar = input.charAt(i);
+ if (currentChar == '\\' && i + 1 < length) {
+ char nextChar = input.charAt(i + 1);
+ switch (nextChar) {
+ case 'b':
+ builder.append('\b');
+ i++;
+ break;
+ case 'f':
+ builder.append('\f');
+ i++;
+ break;
+ case 'n':
+ builder.append('\n');
+ i++;
+ break;
+ case 'r':
+ builder.append('\r');
+ i++;
+ break;
+ case 't':
+ builder.append('\t');
+ i++;
+ break;
+ case '\\':
+ case '\'':
+ builder.append(nextChar);
+ i++;
+ break;
+ case 'u':
+ case 'U':
+ // handle unicode case - exactly 4 (lower case) or 8 (upper case)
+ // hex digits are required
+ final int charsToConsume = (nextChar == 'u') ? 4 : 8;
+ if (i + 1 + charsToConsume >= length) {
+ throw new MalformedUnicodeEscape(i);
+ }
+ endIdx = calculateMaxCharsInSequence(input,
+ i + 2,
+ charsToConsume,
+ SqlParserUtil::isHexDigit);
+ if (endIdx != i + 2 + charsToConsume) {
+ throw new MalformedUnicodeEscape(i);
+ }
+ // Eight hex digits may exceed Integer.MAX_VALUE, which would make a
+ // signed parse throw NumberFormatException; parse as unsigned, then
+ // reject anything outside the Unicode range (values with the top bit
+ // set come back negative and fail the validity check as well), so
+ // that callers only ever see MalformedUnicodeEscape.
+ final int codePoint =
+ Integer.parseUnsignedInt(input.substring(i + 2, endIdx), 16);
+ if (!Character.isValidCodePoint(codePoint)) {
+ throw new MalformedUnicodeEscape(i);
+ }
+ builder.appendCodePoint(codePoint);
+ i = endIdx - 1; // skip already consumed chars
+ break;
+ case 'x':
+ // handle hex byte case - up to 2 chars for hex value
+ endIdx = calculateMaxCharsInSequence(input,
+ i + 2,
+ 2,
+ SqlParserUtil::isHexDigit);
+ if (endIdx > i + 2) {
+ builder.appendCodePoint(
+ parseInt(input.substring(i + 2, endIdx), 16));
+ i = endIdx - 1; // skip already consumed chars
+ } else {
+ // hex sequence unmatched - output original char
+ builder.append(nextChar);
+ i++;
+ }
+ break;
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ // handle octal case - up to 3 chars
+ endIdx = calculateMaxCharsInSequence(input,
+ i + 2,
+ 2, // first char is already "consumed"
+ SqlParserUtil::isOctalDigit);
+ builder.appendCodePoint(parseInt(input.substring(i + 1, endIdx), 8));
+ i = endIdx - 1; // skip already consumed chars
+ break;
+ default:
+ // non-valid escaped char sequence: keep the backslash; the character
+ // after it is copied by the next iteration
+ builder.append(currentChar);
+ }
+ } else {
+ builder.append(currentChar);
+ }
+ }
+ return builder.toString();
+ }
+
+ /**
+ * Scans the {@link CharSequence} argument, starting from
+ * {@code beginIndex}, for as long as its characters match a given
+ * {@link Predicate}. At most {@code maxCharsToMatch} characters are
+ * examined, and the scan never goes past the end of the sequence.
+ *
+ * <p>Despite the method name, the value returned is an index, not a count:
+ * the number of characters matched is the result minus {@code beginIndex}.
+ *
+ * <p>Examples:
+ * <pre>
+ * {@code
+ * calculateMaxCharsInSequence("12345", 0, 2, Character::isDigit) -> 2
+ * calculateMaxCharsInSequence("12345", 3, 2, Character::isDigit) -> 5
+ * calculateMaxCharsInSequence("12345", 4, 2, Character::isDigit) -> 5
+ * }
+ * </pre>
+ *
+ * @param seq Sequence to scan
+ * @param beginIndex Index of the first character to examine
+ * @param maxCharsToMatch Maximum number of characters to examine
+ * @param predicate Test that a character must pass to be matched
+ * @return the index just past the last matched character; this is
+ * {@code beginIndex} if the first character examined does not match, and
+ * may be the index of a matching character if the cap was reached
+ */
+ private static int calculateMaxCharsInSequence(CharSequence seq,
+ int beginIndex,
+ int maxCharsToMatch,
+ Predicate<Character> predicate) {
+ int idx = beginIndex;
+ final int end = Math.min(seq.length(), beginIndex + maxCharsToMatch);
+ while (idx < end && predicate.test(seq.charAt(idx))) {
+ idx++;
+ }
+ return idx;
+ }
+
public static BigDecimal parseDecimal(String s) {
return new BigDecimal(s);
}
@@ -118,6 +270,22 @@ public final class SqlParserUtil {
return new BigDecimal(s);
}
+ /**
+ * Returns true if the specified character is a base-8 digit, that is, one
+ * of the characters {@code '0'} through {@code '7'}.
+ */
+ public static boolean isOctalDigit(final char ch) {
+ return ch >= '0' && ch <= '7';
+ }
+
+ /**
+ * Returns true if the specified character is a base-16 digit, that is, one
+ * of {@code '0'} through {@code '9'}, {@code 'A'} through {@code 'F'}, or
+ * {@code 'a'} through {@code 'f'}.
+ */
+ public static boolean isHexDigit(final char ch) {
+ return (ch >= '0' && ch <= '9')
+ || (ch >= 'A' && ch <= 'F')
+ || (ch >= 'a' && ch <= 'f');
+ }
+
// CHECKSTYLE: IGNORE 1
/** @deprecated this method is not localized for Farrago standards */
@Deprecated // to be removed before 2.0
@@ -302,7 +470,7 @@ public final class SqlParserUtil {
if (value.charAt(0) == '-') {
throw new NumberFormatException(value);
}
- return Integer.parseInt(value);
+ return parseInt(value);
}
/**
@@ -348,9 +516,8 @@ public final class SqlParserUtil {
public static String strip(String s, @Nullable String startQuote,
@Nullable String endQuote, @Nullable String escape, Casing casing) {
if (startQuote != null) {
- return stripQuotes(s, Objects.requireNonNull(startQuote, "startQuote"),
- Objects.requireNonNull(endQuote, "endQuote"),
Objects.requireNonNull(escape, "escape"),
- casing);
+ return stripQuotes(s, startQuote, requireNonNull(endQuote, "endQuote"),
+ requireNonNull(escape, "escape"), casing);
} else {
return toCase(s, casing);
}
@@ -969,4 +1136,13 @@ public final class SqlParserUtil {
final DateFormat date =
new SimpleDateFormat(DateTimeUtils.DATE_FORMAT_STRING, Locale.ROOT);
}
+
+ /** Thrown by {@link #replaceEscapedChars(String)} when a unicode escape in
+ * its input is malformed.
+ *
+ * <p>The public field {@code i} holds the value given to the constructor;
+ * {@code replaceEscapedChars} passes the index, within its input string, of
+ * the backslash that starts the offending escape. The constructor does not
+ * set a detail message. */
+ public static class MalformedUnicodeEscape extends Exception {
+ public final int i;
+
+ MalformedUnicodeEscape(int i) {
+ this.i = i;
+ }
+ }
}
diff --git a/site/_docs/reference.md b/site/_docs/reference.md
index 7e6aeba4bf..66daae80bd 100644
--- a/site/_docs/reference.md
+++ b/site/_docs/reference.md
@@ -1123,7 +1123,7 @@ name will have been converted to upper case also.
| NUMERIC | Fixed point |
| REAL, FLOAT | 4 byte floating point | 6 decimal digits precision
| DOUBLE | 8 byte floating point | 15 decimal digits precision
-| CHAR(n), CHARACTER(n) | Fixed-width character string | 'Hello', '' (empty
string), _latin1'Hello', n'Hello', _UTF16'Hello', 'Hello' 'there' (literal
split into multiple parts)
+| CHAR(n), CHARACTER(n) | Fixed-width character string | 'Hello', '' (empty
string), _latin1'Hello', n'Hello', _UTF16'Hello', 'Hello' 'there' (literal
split into multiple parts), e'Hello\nthere' (literal containing C-style escapes)
| VARCHAR(n), CHARACTER VARYING(n) | Variable-length character string | As
CHAR(n)
| BINARY(n) | Fixed-width binary string | x'45F0AB', x'' (empty binary
string), x'AB' 'CD' (multi-part binary string literal)
| VARBINARY(n), BINARY VARYING(n) | Variable-length binary string | As
BINARY(n)
diff --git
a/testkit/src/main/java/org/apache/calcite/sql/parser/SqlParserTest.java
b/testkit/src/main/java/org/apache/calcite/sql/parser/SqlParserTest.java
index 9272197acd..c14c291734 100644
--- a/testkit/src/main/java/org/apache/calcite/sql/parser/SqlParserTest.java
+++ b/testkit/src/main/java/org/apache/calcite/sql/parser/SqlParserTest.java
@@ -72,6 +72,7 @@ import java.util.function.Consumer;
import java.util.function.UnaryOperator;
import java.util.stream.Collectors;
+import static org.apache.calcite.util.Static.RESOURCE;
import static org.apache.calcite.util.Util.toLinux;
import static org.hamcrest.CoreMatchers.containsString;
@@ -1900,6 +1901,73 @@ public class SqlParserTest {
expr("'a' || 'b'").ok("('a' || 'b')");
}
+ /** Tests character literals with C-style escapes, such as
+ * {@code E'Apache\tCalcite'}. */
+ @Test void testCStyleEscapedString() {
+ // Single-character escapes.
+ expr("E'Apache\\tCalcite'")
+ .ok("_UTF16'Apache\tCalcite'");
+ expr("E'Apache\\bCalcite'")
+ .ok("_UTF16'Apache\bCalcite'");
+ expr("E'Apache\\fCalcite'")
+ .ok("_UTF16'Apache\fCalcite'");
+ expr("E'Apache\\nCalcite'")
+ .ok("_UTF16'Apache\nCalcite'");
+ expr("E'Apache\\rCalcite'")
+ .ok("_UTF16'Apache\rCalcite'");
+ expr("E'\\t\\n\\f'")
+ .ok("_UTF16'\t\n\f'");
+ // An unrecognized escape keeps its backslash.
+ expr("E'\\Apache Calcite'")
+ .ok("_UTF16'\\Apache Calcite'");
+
+ // Octal escapes. The backslash must be doubled in the Java source so that
+ // it reaches the SQL parser; with a single backslash the Java compiler
+ // would itself decode the octal escape and the SQL text would contain a
+ // plain 'a'.
+ expr("E'\\141'")
+ .ok("_UTF16'a'");
+ expr("E'\\141\\141\\141'")
+ .ok("_UTF16'aaa'");
+ expr("E'\\1411'")
+ .ok("_UTF16'a1'");
+
+ // Hex escapes: one or two hex digits.
+ expr("E'\\x61'")
+ .ok("_UTF16'a'");
+ expr("E'\\x61\\x61\\x61'")
+ .ok("_UTF16'aaa'");
+ expr("E'\\xDg0000'")
+ .ok("_UTF16'\rg0000'");
+
+ // Unicode escapes with 4 hex digits.
+ expr("E'\\u0061'")
+ .ok("_UTF16'a'");
+ expr("E'\\u0061\\u0061\\u0061'")
+ .ok("_UTF16'aaa'");
+
+ // Unicode escapes with 8 hex digits.
+ expr("E'\\U00000061'")
+ .ok("_UTF16'a'");
+ expr("E'\\U00000061\\U00000061\\U00000061'")
+ .ok("_UTF16'aaa'");
+
+ // Short octal escapes.
+ expr("E'\\0'")
+ .ok("_UTF16'\u0000'");
+ expr("E'\\07'")
+ .ok("_UTF16'\u0007'");
+
+ // Quotes may be escaped with a backslash or by doubling.
+ expr("E'a\\'a\\'a\\''")
+ .ok("_UTF16'a''a''a'''");
+ expr("E'a''a''a'''")
+ .ok("_UTF16'a''a''a'''");
+
+ // Unterminated literals, and text following the closing quote.
+ expr("E^'\\'^")
+ .fails("(?s).*Encountered .*");
+ expr("E^'a\\'^")
+ .fails("(?s).*Encountered.*");
+ expr("E'a\\''^a^'")
+ .fails("(?s).*Encountered.*");
+
+ // Malformed unicode escapes: too few digits, and a non-hex digit.
+ expr("^E'A\\U0061'^")
+ .fails(RESOURCE.unicodeEscapeMalformed(1).str());
+ expr("^E'AB\\U0000006G'^")
+ .fails(RESOURCE.unicodeEscapeMalformed(2).str());
+ }
+
@Test void testReverseSolidus() {
expr("'\\'").ok("'\\'");
}