[calcite] branch main updated: [CALCITE-5305] Support character literals with C-style escapes

jhyde Wed, 19 Oct 2022 16:04:08 -0700

This is an automated email from the ASF dual-hosted git repository.

jhyde pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/calcite.git



The following commit(s) were added to refs/heads/main by this push:
     new f8c33df67e [CALCITE-5305] Support character literals with C-style 
escapes
f8c33df67e is described below

commit f8c33df67e4197206af5369fce8f67ad94deddaf
Author: dssysolyatin <[email protected]>
AuthorDate: Fri Sep 30 18:31:58 2022 +0300

    [CALCITE-5305] Support character literals with C-style escapes
    
    Document the new literal format in the SQL Reference.
    
    Contains code copied from
    
https://github.com/crate/crate/blob/master/libs/sql-parser/src/main/java/io/crate/sql/Literals.java
    
    Close apache/calcite#2926
---
 core/src/main/codegen/templates/Parser.jj          |  13 ++
 .../apache/calcite/sql/parser/SqlParserUtil.java   | 186 ++++++++++++++++++++-
 site/_docs/reference.md                            |   2 +-
 .../apache/calcite/sql/parser/SqlParserTest.java   |  68 ++++++++
 4 files changed, 263 insertions(+), 6 deletions(-)

diff --git a/core/src/main/codegen/templates/Parser.jj 
b/core/src/main/codegen/templates/Parser.jj
index 7f26f806f3..f258e8235d 100644
--- a/core/src/main/codegen/templates/Parser.jj
+++ b/core/src/main/codegen/templates/Parser.jj
@@ -4504,6 +4504,17 @@ SqlNode StringLiteral() :
             return SqlStdOperatorTable.LITERAL_CHAIN.createCall(pos2, rands);
         }
     }
+|
+    <C_STYLE_ESCAPED_STRING_LITERAL>
+    {
+        try {
+            p = SqlParserUtil.parseCString(getToken(0).image);
+        } catch (SqlParserUtil.MalformedUnicodeEscape e) {
+            throw SqlUtil.newContextException(getPos(),
+                RESOURCE.unicodeEscapeMalformed(e.i));
+       }
+       return SqlLiteral.createCharString(p, "UTF16", getPos());
+    }
 |
     <BIG_QUERY_DOUBLE_QUOTED_STRING>
     {
@@ -8178,6 +8189,8 @@ void NonReservedKeyWord2of3() :
     < PREFIXED_STRING_LITERAL: ("_" <CHARSETNAME> | "N") <QUOTED_STRING> >
 |
     < UNICODE_STRING_LITERAL: "U" "&" <QUOTED_STRING> >
+|
+    < C_STYLE_ESCAPED_STRING_LITERAL: "E" <QUOTE> ( (~["'", "\\"]) | ("\\" 
~[]) | "''")* <QUOTE> >
 |
     < #CHARSETNAME: (["a"-"z","A"-"Z","0"-"9"])
     (["a"-"z","A"-"Z","0"-"9",":",".","-","_"])*
diff --git 
a/core/src/main/java/org/apache/calcite/sql/parser/SqlParserUtil.java 
b/core/src/main/java/org/apache/calcite/sql/parser/SqlParserUtil.java
index f625c9a0b1..5141c19d2e 100644
--- a/core/src/main/java/org/apache/calcite/sql/parser/SqlParserUtil.java
+++ b/core/src/main/java/org/apache/calcite/sql/parser/SqlParserUtil.java
@@ -61,12 +61,12 @@ import java.util.ArrayList;
 import java.util.Calendar;
 import java.util.List;
 import java.util.Locale;
-import java.util.Objects;
 import java.util.StringTokenizer;
 import java.util.function.Predicate;
 
 import static org.apache.calcite.util.Static.RESOURCE;
 
+import static java.lang.Integer.parseInt;
 import static java.util.Objects.requireNonNull;
 
 /**
@@ -110,6 +110,158 @@ public final class SqlParserUtil {
     return strip(s, "'", "'", "''", Casing.UNCHANGED);
   }
 
+  /**
+   * Converts the contents of a SQL quoted character literal with C-style
+   * escapes into the corresponding Java string representation.
+   *
+   * @throws MalformedUnicodeEscape if input contains invalid unicode escapes
+   */
+  public static String parseCString(String s) throws MalformedUnicodeEscape {
+    final String s2 = parseString(s);
+    return replaceEscapedChars(s2);
+  }
+
+  /**
+   * Converts the contents of a character literal with escapes like those used
+   * in the C programming language to the corresponding Java string
+   * representation.
+   *
+   * <p>If the literal "{@code E'a\tc'}" occurs in the SQL source text, then
+   * this method will be invoked with the string "{@code a\tc}" (4 characters)
+   * and will return a Java string with the three characters 'a', TAB, 'c'.
+   *
+   * <p>The format is the same as in Postgres; see
+   * <a href="https://www.postgresql.org/docs/14/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS">
+   * Postgres 4.1.2.2. String Constants With C-Style Escapes</a>.
+   *
+   * @param input String that contains C-style escapes
+   * @return String with escapes converted into Java characters
+   * @throws MalformedUnicodeEscape if input contains invalid unicode escapes
+   */
+  public static String replaceEscapedChars(String input)
+      throws MalformedUnicodeEscape {
+    // The implementation of this method is based on Crate's method
+    // Literals.replaceEscapedChars.
+    final int length = input.length();
+    if (length <= 1) {
+      return input;
+    }
+    final StringBuilder builder = new StringBuilder(length);
+    int endIdx;
+    for (int i = 0; i < length; i++) {
+      char currentChar = input.charAt(i);
+      if (currentChar == '\\' && i + 1 < length) {
+        char nextChar = input.charAt(i + 1);
+        switch (nextChar) {
+        case 'b':
+          builder.append('\b');
+          i++;
+          break;
+        case 'f':
+          builder.append('\f');
+          i++;
+          break;
+        case 'n':
+          builder.append('\n');
+          i++;
+          break;
+        case 'r':
+          builder.append('\r');
+          i++;
+          break;
+        case 't':
+          builder.append('\t');
+          i++;
+          break;
+        case '\\':
+        case '\'':
+          builder.append(nextChar);
+          i++;
+          break;
+        case 'u':
+        case 'U':
+          // handle unicode case
+          final int charsToConsume = (nextChar == 'u') ? 4 : 8;
+          if (i + 1 + charsToConsume >= length) {
+            throw new MalformedUnicodeEscape(i);
+          }
+          endIdx = calculateMaxCharsInSequence(input,
+              i + 2,
+              charsToConsume,
+              SqlParserUtil::isHexDigit);
+          if (endIdx != i + 2 + charsToConsume) {
+            throw new MalformedUnicodeEscape(i);
+          }
+          builder.appendCodePoint(parseInt(input.substring(i + 2, endIdx), 
16));
+          i = endIdx - 1; // skip already consumed chars
+          break;
+        case 'x':
+          // handle hex byte case - up to 2 chars for hex value
+          endIdx = calculateMaxCharsInSequence(input,
+              i + 2,
+              2,
+              SqlParserUtil::isHexDigit);
+          if (endIdx > i + 2) {
+            builder.appendCodePoint(parseInt(input.substring(i + 2, endIdx), 
16));
+            i = endIdx - 1; // skip already consumed chars
+          } else {
+            // hex sequence unmatched - output original char
+            builder.append(nextChar);
+            i++;
+          }
+          break;
+        case '0':
+        case '1':
+        case '2':
+        case '3':
+          // handle octal case - up to 3 chars
+          endIdx = calculateMaxCharsInSequence(input,
+              i + 2,
+              2,      // first char is already "consumed"
+              SqlParserUtil::isOctalDigit);
+          builder.appendCodePoint(parseInt(input.substring(i + 1, endIdx), 8));
+          i = endIdx - 1; // skip already consumed chars
+          break;
+        default:
+          // non-valid escaped char sequence
+          builder.append(currentChar);
+        }
+      } else {
+        builder.append(currentChar);
+      }
+    }
+    return builder.toString();
+  }
+
+  /**
+   * Calculates the maximum number of consecutive characters of the
+   * {@link CharSequence} argument, starting from {@code beginIndex}, that
+   * match a given {@link Predicate}. The number of characters to match is
+   * capped by the {@code maxCharsToMatch} parameter or by the end of the
+   * sequence, whichever comes first.
+   *
+   * <p>Examples:
+   * <pre>
+   * {@code
+   *    calculateMaxCharsInSequence("12345", 0, 2, Character::isDigit) -> 2
+   *    calculateMaxCharsInSequence("12345", 3, 2, Character::isDigit) -> 5
+   *    calculateMaxCharsInSequence("12345", 4, 2, Character::isDigit) -> 5
+   * }
+   * </pre>
+   *
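+   * @return the index at which matching stopped: the index of the first
+   *     non-matching character, or the end of the capped range if every
+   *     character in it matched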
+   */
+  private static int calculateMaxCharsInSequence(CharSequence seq,
+      int beginIndex,
+      int maxCharsToMatch,
+      Predicate<Character> predicate) {
+    int idx = beginIndex;
+    final int end = Math.min(seq.length(), beginIndex + maxCharsToMatch);
+    while (idx < end && predicate.test(seq.charAt(idx))) {
+      idx++;
+    }
+    return idx;
+  }
+
   public static BigDecimal parseDecimal(String s) {
     return new BigDecimal(s);
   }
@@ -118,6 +270,22 @@ public final class SqlParserUtil {
     return new BigDecimal(s);
   }
 
+  /**
+   * Returns true if the specified character is a base-8 digit.
+   */
+  public static boolean isOctalDigit(final char ch) {
+    return ch >= '0' && ch <= '7';
+  }
+
+  /**
+   * Returns true if the specified character is a base-16 digit.
+   */
+  public static boolean isHexDigit(final char ch) {
+    return (ch >= '0' && ch <= '9')
+        || (ch >= 'A' && ch <= 'F')
+        || (ch >= 'a' && ch <= 'f');
+  }
+
   // CHECKSTYLE: IGNORE 1
   /** @deprecated this method is not localized for Farrago standards */
   @Deprecated // to be removed before 2.0
@@ -302,7 +470,7 @@ public final class SqlParserUtil {
     if (value.charAt(0) == '-') {
       throw new NumberFormatException(value);
     }
-    return Integer.parseInt(value);
+    return parseInt(value);
   }
 
   /**
@@ -348,9 +516,8 @@ public final class SqlParserUtil {
   public static String strip(String s, @Nullable String startQuote,
       @Nullable String endQuote, @Nullable String escape, Casing casing) {
     if (startQuote != null) {
-      return stripQuotes(s, Objects.requireNonNull(startQuote, "startQuote"),
-          Objects.requireNonNull(endQuote, "endQuote"), 
Objects.requireNonNull(escape, "escape"),
-          casing);
+      return stripQuotes(s, startQuote, requireNonNull(endQuote, "endQuote"),
+          requireNonNull(escape, "escape"), casing);
     } else {
       return toCase(s, casing);
     }
@@ -969,4 +1136,13 @@ public final class SqlParserUtil {
     final DateFormat date =
         new SimpleDateFormat(DateTimeUtils.DATE_FORMAT_STRING, Locale.ROOT);
   }
+
+  /** Thrown by {@link #replaceEscapedChars(String)}. */
+  public static class MalformedUnicodeEscape extends Exception {
+    public final int i;
+
+    MalformedUnicodeEscape(int i) {
+      this.i = i;
+    }
+  }
 }
diff --git a/site/_docs/reference.md b/site/_docs/reference.md
index 7e6aeba4bf..66daae80bd 100644
--- a/site/_docs/reference.md
+++ b/site/_docs/reference.md
@@ -1123,7 +1123,7 @@ name will have been converted to upper case also.
 | NUMERIC     | Fixed point               |
 | REAL, FLOAT | 4 byte floating point     | 6 decimal digits precision
 | DOUBLE      | 8 byte floating point     | 15 decimal digits precision
-| CHAR(n), CHARACTER(n) | Fixed-width character string | 'Hello', '' (empty 
string), _latin1'Hello', n'Hello', _UTF16'Hello', 'Hello' 'there' (literal 
split into multiple parts)
+| CHAR(n), CHARACTER(n) | Fixed-width character string | 'Hello', '' (empty 
string), _latin1'Hello', n'Hello', _UTF16'Hello', 'Hello' 'there' (literal 
split into multiple parts), e'Hello\nthere' (literal containing C-style escapes)
 | VARCHAR(n), CHARACTER VARYING(n) | Variable-length character string | As 
CHAR(n)
 | BINARY(n)   | Fixed-width binary string | x'45F0AB', x'' (empty binary 
string), x'AB' 'CD' (multi-part binary string literal)
 | VARBINARY(n), BINARY VARYING(n) | Variable-length binary string | As 
BINARY(n)
diff --git 
a/testkit/src/main/java/org/apache/calcite/sql/parser/SqlParserTest.java 
b/testkit/src/main/java/org/apache/calcite/sql/parser/SqlParserTest.java
index 9272197acd..c14c291734 100644
--- a/testkit/src/main/java/org/apache/calcite/sql/parser/SqlParserTest.java
+++ b/testkit/src/main/java/org/apache/calcite/sql/parser/SqlParserTest.java
@@ -72,6 +72,7 @@ import java.util.function.Consumer;
 import java.util.function.UnaryOperator;
 import java.util.stream.Collectors;
 
+import static org.apache.calcite.util.Static.RESOURCE;
 import static org.apache.calcite.util.Util.toLinux;
 
 import static org.hamcrest.CoreMatchers.containsString;
@@ -1900,6 +1901,73 @@ public class SqlParserTest {
     expr("'a' || 'b'").ok("('a' || 'b')");
   }
 
+  @Test void testCStyleEscapedString() {
+    expr("E'Apache\\tCalcite'")
+        .ok("_UTF16'Apache\tCalcite'");
+    expr("E'Apache\\bCalcite'")
+        .ok("_UTF16'Apache\bCalcite'");
+    expr("E'Apache\\fCalcite'")
+        .ok("_UTF16'Apache\fCalcite'");
+    expr("E'Apache\\nCalcite'")
+        .ok("_UTF16'Apache\nCalcite'");
+    expr("E'Apache\\rCalcite'")
+        .ok("_UTF16'Apache\rCalcite'");
+    expr("E'\\t\\n\\f'")
+        .ok("_UTF16'\t\n\f'");
+    expr("E'\\Apache Calcite'")
+        .ok("_UTF16'\\Apache Calcite'");
+
+    expr("E'\141'")
+        .ok("_UTF16'a'");
+    expr("E'\141\141\141'")
+        .ok("_UTF16'aaa'");
+    expr("E'\1411'")
+        .ok("_UTF16'a1'");
+
+    expr("E'\\x61'")
+        .ok("_UTF16'a'");
+    expr("E'\\x61\\x61\\x61'")
+        .ok("_UTF16'aaa'");
+    expr("E'\\x61\\x61\\x61'")
+        .ok("_UTF16'aaa'");
+    expr("E'\\xDg0000'")
+        .ok("_UTF16'\rg0000'");
+
+    expr("E'\\u0061'")
+        .ok("_UTF16'a'");
+    expr("E'\\u0061\\u0061\\u0061'")
+        .ok("_UTF16'aaa'");
+
+    expr("E'\\U00000061'")
+        .ok("_UTF16'a'");
+    expr("E'\\U00000061\\U00000061\\U00000061'")
+        .ok("_UTF16'aaa'");
+
+    expr("E'\\0'")
+        .ok("_UTF16'\u0000'");
+    expr("E'\\07'")
+        .ok("_UTF16'\u0007'");
+    expr("E'\\07'")
+        .ok("_UTF16'\u0007'");
+
+    expr("E'a\\'a\\'a\\''")
+        .ok("_UTF16'a''a''a'''");
+    expr("E'a''a''a'''")
+        .ok("_UTF16'a''a''a'''");
+
+    expr("E^'\\'^")
+        .fails("(?s).*Encountered .*");
+    expr("E^'a\\'^")
+        .fails("(?s).*Encountered.*");
+    expr("E'a\\''^a^'")
+        .fails("(?s).*Encountered.*");
+
+    expr("^E'A\\U0061'^")
+        .fails(RESOURCE.unicodeEscapeMalformed(1).str());
+    expr("^E'AB\\U0000006G'^")
+        .fails(RESOURCE.unicodeEscapeMalformed(2).str());
+  }
+
   @Test void testReverseSolidus() {
     expr("'\\'").ok("'\\'");
   }

[calcite] branch main updated: [CALCITE-5305] Support character literals with C-style escapes

Reply via email to