[CALCITE-2619] Reduce string literal creation cost by deferring and caching 
charset conversion (Ted Xu)

Close apache/calcite#911


Project: http://git-wip-us.apache.org/repos/asf/calcite/repo
Commit: http://git-wip-us.apache.org/repos/asf/calcite/commit/36bd5fb2
Tree: http://git-wip-us.apache.org/repos/asf/calcite/tree/36bd5fb2
Diff: http://git-wip-us.apache.org/repos/asf/calcite/diff/36bd5fb2

Branch: refs/heads/master
Commit: 36bd5fb2e6db955a581de5f55b00a089c4bc1389
Parents: 96605a8
Author: Ted Xu <[email protected]>
Authored: Sat Nov 10 14:46:24 2018 +0800
Committer: Julian Hyde <[email protected]>
Committed: Sat Dec 1 14:42:49 2018 -0800

----------------------------------------------------------------------
 .../java/org/apache/calcite/rex/RexBuilder.java |  17 ++
 .../java/org/apache/calcite/sql/SqlUtil.java    |  65 +++++++-
 .../java/org/apache/calcite/util/NlsString.java | 155 +++++++++++++------
 .../org/apache/calcite/rex/RexBuilderTest.java  |  50 ++++++
 .../calcite/sql/parser/SqlParserTest.java       |   1 +
 5 files changed, 234 insertions(+), 54 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/calcite/blob/36bd5fb2/core/src/main/java/org/apache/calcite/rex/RexBuilder.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/calcite/rex/RexBuilder.java 
b/core/src/main/java/org/apache/calcite/rex/RexBuilder.java
index 5741a11..d346ec7 100644
--- a/core/src/main/java/org/apache/calcite/rex/RexBuilder.java
+++ b/core/src/main/java/org/apache/calcite/rex/RexBuilder.java
@@ -28,6 +28,7 @@ import org.apache.calcite.rel.type.RelDataTypeFactory;
 import org.apache.calcite.rel.type.RelDataTypeField;
 import org.apache.calcite.runtime.FlatLists;
 import org.apache.calcite.sql.SqlAggFunction;
+import org.apache.calcite.sql.SqlCollation;
 import org.apache.calcite.sql.SqlIntervalQualifier;
 import org.apache.calcite.sql.SqlKind;
 import org.apache.calcite.sql.SqlOperator;
@@ -1034,6 +1035,22 @@ public class RexBuilder {
   }
 
   /**
+   * Creates a character string literal with type CHAR from encoded bytes.
+   *
+   * @param value       Bytes of the string, encoded in the given charset
+   * @param charsetName SQL-level charset name
+   * @param collation   SQL collation
+   * @return Character string literal
+   */
+  protected RexLiteral makePreciseStringLiteral(ByteString value,
+      String charsetName, SqlCollation collation) {
+    return makeLiteral(
+        new NlsString(value, charsetName, collation),
+        typeFactory.createSqlType(SqlTypeName.CHAR),
+        SqlTypeName.CHAR);
+  }
+
+  /**
    * Ensures expression is interpreted as a specified type. The returned
    * expression may be wrapped with a cast.
    *

http://git-wip-us.apache.org/repos/asf/calcite/blob/36bd5fb2/core/src/main/java/org/apache/calcite/sql/SqlUtil.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/calcite/sql/SqlUtil.java 
b/core/src/main/java/org/apache/calcite/sql/SqlUtil.java
index 52ba9a4..461b7ed 100644
--- a/core/src/main/java/org/apache/calcite/sql/SqlUtil.java
+++ b/core/src/main/java/org/apache/calcite/sql/SqlUtil.java
@@ -16,6 +16,7 @@
  */
 package org.apache.calcite.sql;
 
+import org.apache.calcite.avatica.util.ByteString;
 import org.apache.calcite.linq4j.Ord;
 import org.apache.calcite.linq4j.function.Functions;
 import org.apache.calcite.rel.type.RelDataType;
@@ -38,11 +39,14 @@ import org.apache.calcite.util.Pair;
 import org.apache.calcite.util.Util;
 
 import com.google.common.base.Predicates;
+import com.google.common.base.Utf8;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Iterators;
 import com.google.common.collect.Lists;
 
 import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.nio.charset.UnsupportedCharsetException;
 import java.sql.DatabaseMetaData;
 import java.sql.SQLException;
 import java.text.MessageFormat;
@@ -836,18 +840,63 @@ public abstract class SqlUtil {
    * @return Java-level name, or null if SQL-level name is unknown
    */
   public static String translateCharacterSetName(String name) {
-    if (name.equals("LATIN1")) {
+    switch (name) {
+    case "BIG5":
+      return "Big5";
+    case "LATIN1":
       return "ISO-8859-1";
-    } else if (name.equals("UTF16")) {
-      return ConversionUtil.NATIVE_UTF16_CHARSET_NAME;
-    } else if (name.equals(ConversionUtil.NATIVE_UTF16_CHARSET_NAME)) {
-      // no translation needed
+    case "GB2312":
+    case "GBK":
       return name;
-    } else if (name.equals("ISO-8859-1")) {
-      // no translation needed
+    case "UTF8":
+      return "UTF-8";
+    case "UTF16":
+      return ConversionUtil.NATIVE_UTF16_CHARSET_NAME;
+    case "UTF-16BE":
+    case "UTF-16LE":
+    case "ISO-8859-1":
+    case "UTF-8":
       return name;
+    default:
+      return null;
+    }
+  }
+
+  /**
+   * Returns the Java-level {@link Charset} for a given SQL-level name.
+   *
+   * @param charsetName SQL charset name (case-insensitive), must not be null
+   * @return Charset; never null
+   * @throws UnsupportedCharsetException If the name has no known translation,
+   *     or the translated charset is unavailable in this Java virtual machine
+   */
+  public static Charset getCharset(String charsetName) {
+    assert charsetName != null;
+    charsetName = charsetName.toUpperCase(Locale.ROOT);
+    String javaCharsetName = translateCharacterSetName(charsetName);
+    if (javaCharsetName == null) {
+      throw new UnsupportedCharsetException(charsetName);
+    }
+    return Charset.forName(javaCharsetName);
+  }
+
+  /**
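+   * Validates a value against a charset; only UTF-8 is actually checked.
+   *
+   * @param value   Encoded string, as a byte string
+   * @param charset Charset the bytes are expected to be encoded in
+   * @throws RuntimeException If charset is UTF-8 and value is not well-formed
+   *     UTF-8; values in any other charset are accepted unchecked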
+   */
+  public static void validateCharset(ByteString value, Charset charset) {
+    if (charset == StandardCharsets.UTF_8) {
+      final byte[] bytes = value.getBytes();
+      if (!Utf8.isWellFormed(bytes)) {
+        //CHECKSTYLE: IGNORE 1
+        final String string = new String(bytes, charset);
+        throw RESOURCE.charsetEncoding(string, charset.name()).ex();
+      }
     }
-    return null;
   }
 
   /** If a node is "AS", returns the underlying expression; otherwise returns

http://git-wip-us.apache.org/repos/asf/calcite/blob/36bd5fb2/core/src/main/java/org/apache/calcite/util/NlsString.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/calcite/util/NlsString.java 
b/core/src/main/java/org/apache/calcite/util/NlsString.java
index fbb9386..fd41811 100644
--- a/core/src/main/java/org/apache/calcite/util/NlsString.java
+++ b/core/src/main/java/org/apache/calcite/util/NlsString.java
@@ -16,19 +16,25 @@
  */
 package org.apache.calcite.util;
 
+import org.apache.calcite.avatica.util.ByteString;
 import org.apache.calcite.runtime.SqlFunctions;
 import org.apache.calcite.sql.SqlCollation;
 import org.apache.calcite.sql.SqlUtil;
 
-import java.nio.CharBuffer;
+import com.google.common.cache.CacheBuilder;
+import com.google.common.cache.CacheLoader;
+import com.google.common.cache.LoadingCache;
+
+import java.nio.ByteBuffer;
 import java.nio.charset.CharacterCodingException;
 import java.nio.charset.Charset;
-import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CharsetDecoder;
 import java.nio.charset.IllegalCharsetNameException;
 import java.nio.charset.UnsupportedCharsetException;
 import java.util.List;
 import java.util.Locale;
 import java.util.Objects;
+import javax.annotation.Nonnull;
 
 import static org.apache.calcite.util.Static.RESOURCE;
 
@@ -39,8 +45,31 @@ import static org.apache.calcite.util.Static.RESOURCE;
 public class NlsString implements Comparable<NlsString>, Cloneable {
   //~ Instance fields --------------------------------------------------------
 
+  private static final LoadingCache<Pair<ByteString, Charset>, String>
+      DECODE_MAP = // decoded strings, keyed by (bytes, charset)
+      CacheBuilder.newBuilder()
+          .softValues() // entries may be dropped under memory pressure
+          .build(
+              new CacheLoader<Pair<ByteString, Charset>, String>() {
+                public String load(@Nonnull Pair<ByteString, Charset> key) {
+                  final Charset charset = key.right;
+                  final CharsetDecoder decoder = charset.newDecoder();
+                  final byte[] bytes = key.left.getBytes();
+                  final ByteBuffer buffer = ByteBuffer.wrap(bytes);
+                  try {
+                    return decoder.decode(buffer).toString();
+                  } catch (CharacterCodingException ex) {
+                    throw RESOURCE.charsetEncoding(
+                        //CHECKSTYLE: IGNORE 1
+                        new String(bytes, Charset.defaultCharset()),
+                        charset.name()).ex();
+                  }
+                }
+              });
+
+  private final String stringValue;
+  private final ByteString bytesValue;
   private final String charsetName;
-  private final String value;
   private final Charset charset;
   private final SqlCollation collation;
 
@@ -49,43 +78,71 @@ public class NlsString implements Comparable<NlsString>, 
Cloneable {
   /**
    * Creates a string in a specified character set.
    *
-   * @param value       String constant, must not be null
-   * @param charsetName Name of the character set, may be null
+   * @param bytesValue  Byte array constant, must not be null
+   * @param charsetName Name of the character set, must not be null
    * @param collation   Collation, may be null
+   *
    * @throws IllegalCharsetNameException If the given charset name is illegal
    * @throws UnsupportedCharsetException If no support for the named charset
    *     is available in this instance of the Java virtual machine
    * @throws RuntimeException If the given value cannot be represented in the
    *     given charset
    */
-  public NlsString(
-      String value,
-      String charsetName,
+  public NlsString(ByteString bytesValue, String charsetName,
       SqlCollation collation) {
-    assert value != null;
-    if (null != charsetName) {
-      charsetName = charsetName.toUpperCase(Locale.ROOT);
-      this.charsetName = charsetName;
-      String javaCharsetName =
-          SqlUtil.translateCharacterSetName(charsetName);
-      if (javaCharsetName == null) {
-        throw new UnsupportedCharsetException(charsetName);
-      }
-      this.charset = Charset.forName(javaCharsetName);
-      CharsetEncoder encoder = charset.newEncoder();
-
-      // dry run to see if encoding hits any problems
-      try {
-        encoder.encode(CharBuffer.wrap(value));
-      } catch (CharacterCodingException ex) {
-        throw RESOURCE.charsetEncoding(value, javaCharsetName).ex();
-      }
+    this(null, Objects.requireNonNull(bytesValue),
+        Objects.requireNonNull(charsetName), collation);
+  }
+
+  /**
+   * Creates a string from a Java string, optionally tagged with a charset.
+   *
+   * @param stringValue String constant, must not be null
+   * @param charsetName Name of the character set, may be null
+   * @param collation Collation, may be null
+   *
+   * @throws IllegalCharsetNameException If the given charset name is illegal
+   * @throws UnsupportedCharsetException If no support for the named charset
+   *     is available in this instance of the Java virtual machine
+   * @throws RuntimeException If the given value cannot be represented in the
+   *     given charset
+   */
+  public NlsString(String stringValue, String charsetName,
+      SqlCollation collation) {
+    this(Objects.requireNonNull(stringValue), null, charsetName, collation);
+  }
+
+  /** Internal constructor; other constructors must call it. */
+  private NlsString(String stringValue, ByteString bytesValue,
+      String charsetName, SqlCollation collation) {
+    if (charsetName != null) {
+      this.charsetName = charsetName.toUpperCase(Locale.ROOT);
+      this.charset = SqlUtil.getCharset(charsetName);
     } else {
       this.charsetName = null;
       this.charset = null;
     }
+    if ((stringValue != null) == (bytesValue != null)) {
+      throw new IllegalArgumentException("Specify stringValue or bytesValue");
+    }
+    if (bytesValue != null) {
+      if (charsetName == null) {
+        throw new IllegalArgumentException("Bytes value requires charset");
+      }
+      SqlUtil.validateCharset(bytesValue, charset);
+    } else {
+      // A Java string may contain characters that LATIN1 cannot represent.
+      if (this.charsetName != null
+          && (this.charsetName.equals("LATIN1")
+          || this.charsetName.equals("ISO-8859-1"))) {
+        if (!charset.newEncoder().canEncode(stringValue)) {
+          throw RESOURCE.charsetEncoding(stringValue, charset.name()).ex();
+        }
+      }
+    }
     this.collation = collation;
-    this.value = value;
+    this.stringValue = stringValue;
+    this.bytesValue = bytesValue;
   }
 
   //~ Methods ----------------------------------------------------------------
@@ -99,25 +156,22 @@ public class NlsString implements Comparable<NlsString>, 
Cloneable {
   }
 
   public int hashCode() {
-    return Objects.hash(value, charsetName, collation);
+    return Objects.hash(stringValue, bytesValue, charsetName, collation);
   }
 
   public boolean equals(Object obj) {
-    if (!(obj instanceof NlsString)) {
-      return false;
-    }
-    NlsString that = (NlsString) obj;
-    return Objects.equals(value, that.value)
-        && Objects.equals(charsetName, that.charsetName)
-        && Objects.equals(collation, that.collation);
+    return this == obj
+        || obj instanceof NlsString
+        && Objects.equals(stringValue, ((NlsString) obj).stringValue)
+        && Objects.equals(bytesValue, ((NlsString) obj).bytesValue)
+        && Objects.equals(charsetName, ((NlsString) obj).charsetName)
+        && Objects.equals(collation, ((NlsString) obj).collation);
   }
 
-  // implement Comparable
-  public int compareTo(NlsString other) {
+  @Override public int compareTo(NlsString other) {
     // TODO jvs 18-Jan-2006:  Actual collation support.  This just uses
     // the default collation.
-
-    return value.compareTo(other.value);
+    return getValue().compareTo(other.getValue());
   }
 
   public String getCharsetName() {
@@ -133,7 +187,11 @@ public class NlsString implements Comparable<NlsString>, 
Cloneable {
   }
 
   public String getValue() {
-    return value;
+    if (stringValue == null) {
+      assert bytesValue != null;
+      return DECODE_MAP.getUnchecked(Pair.of(bytesValue, charset));
+    }
+    return stringValue;
   }
 
   /**
@@ -141,8 +199,8 @@ public class NlsString implements Comparable<NlsString>, 
Cloneable {
    * right.
    */
   public NlsString rtrim() {
-    String trimmed = SqlFunctions.rtrim(value);
-    if (!trimmed.equals(value)) {
+    String trimmed = SqlFunctions.rtrim(getValue());
+    if (!trimmed.equals(getValue())) {
       return new NlsString(trimmed, charsetName, collation);
     }
     return this;
@@ -165,7 +223,7 @@ public class NlsString implements Comparable<NlsString>, 
Cloneable {
       ret.append(charsetName);
     }
     ret.append("'");
-    ret.append(Util.replace(value, "'", "''"));
+    ret.append(Util.replace(getValue(), "'", "''"));
     ret.append("'");
 
     // NOTE jvs 3-Feb-2005:  see FRG-78 for why this should go away
@@ -200,12 +258,12 @@ public class NlsString implements Comparable<NlsString>, 
Cloneable {
     }
     String charSetName = args.get(0).charsetName;
     SqlCollation collation = args.get(0).collation;
-    int length = args.get(0).value.length();
+    int length = args.get(0).getValue().length();
 
     // sum string lengths and validate
     for (int i = 1; i < args.size(); i++) {
       final NlsString arg = args.get(i);
-      length += arg.value.length();
+      length += arg.getValue().length();
       if (!((arg.charsetName == null)
           || arg.charsetName.equals(charSetName))) {
         throw new IllegalArgumentException("mismatched charsets");
@@ -218,7 +276,7 @@ public class NlsString implements Comparable<NlsString>, 
Cloneable {
 
     StringBuilder sb = new StringBuilder(length);
     for (NlsString arg : args) {
-      sb.append(arg.value);
+      sb.append(arg.getValue());
     }
     return new NlsString(
         sb.toString(),
@@ -231,6 +289,11 @@ public class NlsString implements Comparable<NlsString>, 
Cloneable {
   public NlsString copy(String value) {
     return new NlsString(value, charsetName, collation);
   }
+
+  /** Returns the encoded bytes, or null if created from a Java string. */
+  public ByteString getValueBytes() {
+    return bytesValue;
+  }
 }
 
 // End NlsString.java

http://git-wip-us.apache.org/repos/asf/calcite/blob/36bd5fb2/core/src/test/java/org/apache/calcite/rex/RexBuilderTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/calcite/rex/RexBuilderTest.java 
b/core/src/test/java/org/apache/calcite/rex/RexBuilderTest.java
index f959a76..1445e95 100644
--- a/core/src/test/java/org/apache/calcite/rex/RexBuilderTest.java
+++ b/core/src/test/java/org/apache/calcite/rex/RexBuilderTest.java
@@ -16,12 +16,15 @@
  */
 package org.apache.calcite.rex;
 
+import org.apache.calcite.avatica.util.ByteString;
 import org.apache.calcite.rel.type.RelDataType;
 import org.apache.calcite.rel.type.RelDataTypeFactory;
 import org.apache.calcite.rel.type.RelDataTypeSystem;
+import org.apache.calcite.sql.SqlCollation;
 import org.apache.calcite.sql.type.SqlTypeFactoryImpl;
 import org.apache.calcite.sql.type.SqlTypeName;
 import org.apache.calcite.util.DateString;
+import org.apache.calcite.util.NlsString;
 import org.apache.calcite.util.TimeString;
 import org.apache.calcite.util.TimestampString;
 import org.apache.calcite.util.TimestampWithTimeZoneString;
@@ -29,6 +32,7 @@ import org.apache.calcite.util.Util;
 
 import org.junit.Test;
 
+import java.nio.charset.StandardCharsets;
 import java.util.Calendar;
 import java.util.TimeZone;
 
@@ -489,6 +493,52 @@ public class RexBuilderTest {
     }
   }
 
+  /**
+   * Test string literal encoding.
+   */
+  @Test public void testStringLiteral() {
+    final RelDataTypeFactory typeFactory =
+        new SqlTypeFactoryImpl(RelDataTypeSystem.DEFAULT);
+    final RelDataType varchar =
+        typeFactory.createSqlType(SqlTypeName.VARCHAR);
+    final RexBuilder builder = new RexBuilder(typeFactory);
+
+    final NlsString latin1 = new NlsString("foobar", "LATIN1", SqlCollation.IMPLICIT);
+    final NlsString utf8 = new NlsString("foobar", "UTF8", SqlCollation.IMPLICIT);
+
+    RexNode literal = builder.makePreciseStringLiteral("foobar");
+    assertEquals("'foobar'", literal.toString());
+    literal = builder.makePreciseStringLiteral(
+        new ByteString(new byte[] { 'f', 'o', 'o', 'b', 'a', 'r'}),
+        "UTF8",
+        SqlCollation.IMPLICIT);
+    assertEquals("_UTF8'foobar'", literal.toString());
+    literal = builder.makePreciseStringLiteral(
+        new ByteString("\u82f1\u56fd".getBytes(StandardCharsets.UTF_8)),
+        "UTF8",
+        SqlCollation.IMPLICIT);
+    assertEquals("_UTF8'\u82f1\u56fd'", literal.toString());
+    // Test again to check decode cache.
+    literal = builder.makePreciseStringLiteral(
+        new ByteString("\u82f1".getBytes(StandardCharsets.UTF_8)),
+        "UTF8",
+        SqlCollation.IMPLICIT);
+    assertEquals("_UTF8'\u82f1'", literal.toString());
+    try {
+      literal = builder.makePreciseStringLiteral(
+          new ByteString("\u82f1\u56fd".getBytes(StandardCharsets.UTF_8)),
+          "GB2312",
+          SqlCollation.IMPLICIT);
+      fail("expected exception, got " + literal);
+    } catch (RuntimeException e) {
+      assertThat(e.getMessage(), containsString("Failed to encode"));
+    }
+    literal = builder.makeLiteral(latin1, varchar, false);
+    assertEquals("_LATIN1'foobar'", literal.toString());
+    literal = builder.makeLiteral(utf8, varchar, false);
+    assertEquals("_UTF8'foobar'", literal.toString());
+  }
+
 }
 
 // End RexBuilderTest.java

http://git-wip-us.apache.org/repos/asf/calcite/blob/36bd5fb2/core/src/test/java/org/apache/calcite/sql/parser/SqlParserTest.java
----------------------------------------------------------------------
diff --git 
a/core/src/test/java/org/apache/calcite/sql/parser/SqlParserTest.java 
b/core/src/test/java/org/apache/calcite/sql/parser/SqlParserTest.java
index a7a1233..af49b2f 100644
--- a/core/src/test/java/org/apache/calcite/sql/parser/SqlParserTest.java
+++ b/core/src/test/java/org/apache/calcite/sql/parser/SqlParserTest.java
@@ -3727,6 +3727,7 @@ public class SqlParserTest {
     checkExp(
         "_iso-8859-1'bye' \n\n--\n-- this is a comment\n' bye'",
         "_ISO-8859-1'bye'\n' bye'");
+    checkExp("_utf8'hi'", "_UTF8'hi'");
 
     // newline in string literal
     checkExp("'foo\rbar'", "'foo\rbar'");

Reply via email to