IMPALA-7586: fix predicate pushdown of escaped strings

This fixes a class of bugs where the planner incorrectly uses the raw
string from the parser instead of the unescaped string. This occurs in
several places that push predicates down to the storage layer:
* Kudu scans
* HBase scans
* Data source scans

There are some more complex issues with escapes and the LIKE predicate
that are tracked separately by IMPALA-2422.

This also uncovered a different issue with RCFiles that is tracked by
IMPALA-7778 and is worked around by the tests added.

In order to make bugs like this more obvious in future, I renamed
getValue() to getValueWithOriginalEscapes().

Testing:
Added regression test that tests handling of backslash escapes on all
file formats. I did not add a regression test for the data source bug
since it seems to require some major modification of the data source
test infrastructure.

Change-Id: I53d6e20dd48ab6837ddd325db8a9d49ee04fed28
Reviewed-on: http://gerrit.cloudera.org:8080/11814
Reviewed-by: Tim Armstrong <tarmstr...@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/c124d26b
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/c124d26b
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/c124d26b

Branch: refs/heads/branch-3.1.0
Commit: c124d26b546b40e1a2d98bfe396898ab563ccafc
Parents: e3a7027
Author: Tim Armstrong <tarmstr...@cloudera.com>
Authored: Fri Oct 26 16:50:22 2018 -0700
Committer: Zoltan Borok-Nagy <borokna...@cloudera.com>
Committed: Tue Nov 13 12:50:23 2018 +0100

----------------------------------------------------------------------
 .../org/apache/impala/analysis/AdminFnStmt.java |  2 +-
 .../java/org/apache/impala/analysis/Expr.java   |  2 +-
 .../apache/impala/analysis/ExtractFromExpr.java |  6 +-
 .../apache/impala/analysis/LikePredicate.java   |  5 +-
 .../apache/impala/analysis/StringLiteral.java   |  8 ++-
 .../impala/planner/DataSourceScanNode.java      |  3 +-
 .../apache/impala/planner/HBaseScanNode.java    |  4 +-
 .../org/apache/impala/planner/KuduScanNode.java |  4 +-
 testdata/data/README                            |  3 +
 testdata/data/strings_with_quotes.csv           | 11 ++++
 .../functional/functional_schema_template.sql   | 28 +++++++++
 .../datasets/functional/schema_constraints.csv  |  1 +
 .../QueryTest/string-escaping-rcfile-bug.test   | 66 ++++++++++++++++++++
 .../queries/QueryTest/string-escaping.test      | 62 ++++++++++++++++++
 tests/query_test/test_scanners.py               |  8 +++
 15 files changed, 199 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/fe/src/main/java/org/apache/impala/analysis/AdminFnStmt.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/AdminFnStmt.java 
b/fe/src/main/java/org/apache/impala/analysis/AdminFnStmt.java
index 2f2eb2e..1e1f022 100644
--- a/fe/src/main/java/org/apache/impala/analysis/AdminFnStmt.java
+++ b/fe/src/main/java/org/apache/impala/analysis/AdminFnStmt.java
@@ -117,7 +117,7 @@ public class AdminFnStmt extends StatementBase {
         throw new AnalysisException(
             "Invalid backend, must be a string literal: " + 
backendExpr.toSql());
       }
-      backend_ = parseBackendAddress(((StringLiteral) backendExpr).getValue());
+      backend_ = parseBackendAddress(((StringLiteral) 
backendExpr).getUnescapedValue());
     }
     if (deadlineExpr != null) {
       deadlineSecs_ = deadlineExpr.evalToNonNegativeInteger(analyzer, 
"deadline");

http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/fe/src/main/java/org/apache/impala/analysis/Expr.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/Expr.java 
b/fe/src/main/java/org/apache/impala/analysis/Expr.java
index 912ec8a..fdf639e 100644
--- a/fe/src/main/java/org/apache/impala/analysis/Expr.java
+++ b/fe/src/main/java/org/apache/impala/analysis/Expr.java
@@ -1486,7 +1486,7 @@ abstract public class Expr extends TreeNode<Expr> 
implements ParseNode, Cloneabl
         return DEFAULT_AVG_STRING_LENGTH;
       }
     } else if (e instanceof StringLiteral) {
-      return ((StringLiteral) e).getValue().length();
+      return ((StringLiteral) e).getUnescapedValue().length();
     } else {
       // TODO(tmarshall): Extend this to support other string Exprs, such as
       // function calls that return string.

http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/fe/src/main/java/org/apache/impala/analysis/ExtractFromExpr.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/ExtractFromExpr.java 
b/fe/src/main/java/org/apache/impala/analysis/ExtractFromExpr.java
index 40020e0..9f46187 100644
--- a/fe/src/main/java/org/apache/impala/analysis/ExtractFromExpr.java
+++ b/fe/src/main/java/org/apache/impala/analysis/ExtractFromExpr.java
@@ -20,7 +20,6 @@ package org.apache.impala.analysis;
 import java.util.Set;
 
 import org.apache.impala.catalog.BuiltinsDb;
-import org.apache.impala.catalog.Catalog;
 import org.apache.impala.catalog.Type;
 import org.apache.impala.common.AnalysisException;
 import org.apache.impala.thrift.TExtractField;
@@ -81,7 +80,8 @@ public class ExtractFromExpr extends FunctionCallExpr {
 
     super.analyzeImpl(analyzer);
 
-    String extractFieldIdent = ((StringLiteral)children_.get(1)).getValue();
+    String extractFieldIdent =
+        ((StringLiteral)children_.get(1)).getValueWithOriginalEscapes();
     Preconditions.checkNotNull(extractFieldIdent);
     if (!EXTRACT_FIELDS.contains(extractFieldIdent.toUpperCase())) {
       throw new AnalysisException("Time unit '" + extractFieldIdent + "' in 
expression '"
@@ -102,7 +102,7 @@ public class ExtractFromExpr extends FunctionCallExpr {
     StringBuilder strBuilder = new StringBuilder();
     strBuilder.append(getFnName().toString().toUpperCase());
     strBuilder.append("(");
-    strBuilder.append(((StringLiteral)getChild(1)).getValue());
+    
strBuilder.append(((StringLiteral)getChild(1)).getValueWithOriginalEscapes());
     strBuilder.append(" FROM ");
     strBuilder.append(getChild(0).toSql());
     strBuilder.append(")");

http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/fe/src/main/java/org/apache/impala/analysis/LikePredicate.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/LikePredicate.java 
b/fe/src/main/java/org/apache/impala/analysis/LikePredicate.java
index bc10b2a..3830c95 100644
--- a/fe/src/main/java/org/apache/impala/analysis/LikePredicate.java
+++ b/fe/src/main/java/org/apache/impala/analysis/LikePredicate.java
@@ -135,7 +135,7 @@ public class LikePredicate extends Predicate {
       // TODO: this checks that it's a Java-supported regex, but the syntax 
supported
       // by the backend is Posix; add a call to the backend to check the re 
syntax
       try {
-        Pattern.compile(((StringLiteral) getChild(1)).getValue());
+        Pattern.compile(((StringLiteral) 
getChild(1)).getValueWithOriginalEscapes());
       } catch (PatternSyntaxException e) {
         throw new AnalysisException(
             "invalid regular expression in '" + this.toSql() + "'");
@@ -148,7 +148,8 @@ public class LikePredicate extends Predicate {
   protected float computeEvalCost() {
     if (!hasChildCosts()) return UNKNOWN_COST;
     if (getChild(1).isLiteral() && !getChild(1).isNullLiteral() &&
-      Pattern.matches("[%_]*[^%_]*[%_]*", ((StringLiteral) 
getChild(1)).getValue())) {
+      Pattern.matches("[%_]*[^%_]*[%_]*",
+          ((StringLiteral) getChild(1)).getValueWithOriginalEscapes())) {
       // This pattern only has wildcards as leading or trailing character,
       // so it is linear.
       return getChildCosts() +

http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/fe/src/main/java/org/apache/impala/analysis/StringLiteral.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/StringLiteral.java 
b/fe/src/main/java/org/apache/impala/analysis/StringLiteral.java
index 5c51c45..12ad635 100644
--- a/fe/src/main/java/org/apache/impala/analysis/StringLiteral.java
+++ b/fe/src/main/java/org/apache/impala/analysis/StringLiteral.java
@@ -79,7 +79,11 @@ public class StringLiteral extends LiteralExpr {
     msg.string_literal = new TStringLiteral(val);
   }
 
-  public String getValue() { return value_; }
+  /**
+   * Returns the original value that the string literal was constructed with,
+   * without escaping or unescaping it.
+   */
+  public String getValueWithOriginalEscapes() { return value_; }
 
   public String getUnescapedValue() {
     // Unescape string exactly like Hive does. Hive's method assumes
@@ -126,7 +130,7 @@ public class StringLiteral extends LiteralExpr {
 
   @Override
   public String getStringValue() {
-    return value_;
+    return getValueWithOriginalEscapes();
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/fe/src/main/java/org/apache/impala/planner/DataSourceScanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/DataSourceScanNode.java 
b/fe/src/main/java/org/apache/impala/planner/DataSourceScanNode.java
index a41630b..1ddb394 100644
--- a/fe/src/main/java/org/apache/impala/planner/DataSourceScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/DataSourceScanNode.java
@@ -129,7 +129,8 @@ public class DataSourceScanNode extends ScanNode {
         return new TColumnValue().setDouble_val(
             ((NumericLiteral) expr).getDoubleValue());
       case STRING:
-        return new TColumnValue().setString_val(((StringLiteral) 
expr).getValue());
+        return new TColumnValue().setString_val(
+            ((StringLiteral) expr).getUnescapedValue());
       case DECIMAL:
       case DATE:
       case DATETIME:

http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java 
b/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java
index 06aca1f..13ecb6a 100644
--- a/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java
@@ -166,7 +166,7 @@ public class HBaseScanNode extends ScanNode {
             analyzer.getQueryCtx());
         if (val instanceof StringLiteral) {
           StringLiteral litVal = (StringLiteral) val;
-          startKey_ = convertToBytes(litVal.getStringValue(),
+          startKey_ = convertToBytes(litVal.getUnescapedValue(),
               !rowRange.getLowerBoundInclusive());
         } else {
           // lower bound is null.
@@ -182,7 +182,7 @@ public class HBaseScanNode extends ScanNode {
             analyzer.getQueryCtx());
         if (val instanceof StringLiteral) {
           StringLiteral litVal = (StringLiteral) val;
-          stopKey_ = convertToBytes(litVal.getStringValue(),
+          stopKey_ = convertToBytes(litVal.getUnescapedValue(),
               rowRange.getUpperBoundInclusive());
         } else {
           // lower bound is null.

http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java 
b/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java
index 56c7602..adeaa72 100644
--- a/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java
@@ -415,7 +415,7 @@ public class KuduScanNode extends ScanNode {
       case VARCHAR:
       case CHAR: {
         kuduPredicate = KuduPredicate.newComparisonPredicate(column, op,
-            ((StringLiteral)literal).getStringValue());
+            ((StringLiteral)literal).getUnescapedValue());
         break;
       }
       case TIMESTAMP: {
@@ -523,7 +523,7 @@ public class KuduScanNode extends ScanNode {
       case BIGINT: return ((NumericLiteral) e).getLongValue();
       case FLOAT: return (float) ((NumericLiteral) e).getDoubleValue();
       case DOUBLE: return ((NumericLiteral) e).getDoubleValue();
-      case STRING: return ((StringLiteral) e).getValue();
+      case STRING: return ((StringLiteral) e).getUnescapedValue();
       case TIMESTAMP: {
         try {
           // TODO: Simplify when Impala supports a 64-bit TIMESTAMP type.

http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/testdata/data/README
----------------------------------------------------------------------
diff --git a/testdata/data/README b/testdata/data/README
index 599b009..db055c7 100644
--- a/testdata/data/README
+++ b/testdata/data/README
@@ -230,3 +230,6 @@ valid range [0..24H). Before the fix, select * returned 
these values:
 1970-01-01 00:00:00
 1970-01-01 23:59:59.999999999
 1970-01-01 24:00:00 (invalid - time of day should be less than a whole day)
+
+strings_with_quotes.csv:
+Various strings with quotes in them to reproduce bugs like IMPALA-7586.

http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/testdata/data/strings_with_quotes.csv
----------------------------------------------------------------------
diff --git a/testdata/data/strings_with_quotes.csv 
b/testdata/data/strings_with_quotes.csv
new file mode 100644
index 0000000..0d82f03
--- /dev/null
+++ b/testdata/data/strings_with_quotes.csv
@@ -0,0 +1,11 @@
+",1
+"",2
+\\",3
+'',4
+',5
+foo',6
+'foo,7
+"foo",8
+"foo,9
+foo"bar,10
+foo\\"bar,11

http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/testdata/datasets/functional/functional_schema_template.sql
----------------------------------------------------------------------
diff --git a/testdata/datasets/functional/functional_schema_template.sql 
b/testdata/datasets/functional/functional_schema_template.sql
index cfd9030..0712b7c 100644
--- a/testdata/datasets/functional/functional_schema_template.sql
+++ b/testdata/datasets/functional/functional_schema_template.sql
@@ -2194,3 +2194,31 @@ CREATE TABLE IF NOT EXISTS 
{db_name}{db_suffix}.{table_name} (i1 integer)
 STORED AS {file_format}
 TBLPROPERTIES('skip.header.line.count'='2');
 ====
+---- DATASET
+functional
+---- BASE_TABLE_NAME
+strings_with_quotes
+---- COLUMNS
+s string
+i int
+---- ROW_FORMAT
+delimited fields terminated by ','  escaped by '\\'
+---- LOAD
+LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/strings_with_quotes.csv'
+OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
+---- DEPENDENT_LOAD
+INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name}
+SELECT s, i
+FROM {db_name}.{table_name};
+---- CREATE_KUDU
+DROP TABLE IF EXISTS {db_name}{db_suffix}.{table_name};
+CREATE TABLE {db_name}{db_suffix}.{table_name} (
+  s string PRIMARY KEY,
+  i int
+)
+PARTITION BY HASH (s) PARTITIONS 3 STORED AS KUDU;
+---- DEPENDENT_LOAD_KUDU
+INSERT into TABLE {db_name}{db_suffix}.{table_name}
+SELECT s, i
+FROM {db_name}.{table_name};
+====

http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/testdata/datasets/functional/schema_constraints.csv
----------------------------------------------------------------------
diff --git a/testdata/datasets/functional/schema_constraints.csv 
b/testdata/datasets/functional/schema_constraints.csv
index baf0306..e09ca36 100644
--- a/testdata/datasets/functional/schema_constraints.csv
+++ b/testdata/datasets/functional/schema_constraints.csv
@@ -193,6 +193,7 @@ table_name:nulltable, constraint:only, 
table_format:kudu/none/none
 table_name:nullescapedtable, constraint:only, table_format:kudu/none/none
 table_name:decimal_tbl, constraint:only, table_format:kudu/none/none
 table_name:decimal_tiny, constraint:only, table_format:kudu/none/none
+table_name:strings_with_quotes, constraint:only, table_format:kudu/none/none
 
 # Skipping header lines is only effective with text tables
 table_name:table_with_header, constraint:restrict_to, 
table_format:text/none/none

http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/testdata/workloads/functional-query/queries/QueryTest/string-escaping-rcfile-bug.test
----------------------------------------------------------------------
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/string-escaping-rcfile-bug.test
 
b/testdata/workloads/functional-query/queries/QueryTest/string-escaping-rcfile-bug.test
new file mode 100644
index 0000000..d757480
--- /dev/null
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/string-escaping-rcfile-bug.test
@@ -0,0 +1,66 @@
+# This file contains the current output for queries against strings_with_quotes
+# for RCFile. The output is different because of a bug in the RCFile scanner.
+# This file can be removed once IMPALA-7778 is fixed.
+====
+---- QUERY
+# Check that all strings in the table are returned correctly.
+# IMPALA-7778: escapes are ignored so output is incorrect
+select s
+from strings_with_quotes
+---- RESULTS
+'"'
+'""'
+'\\\\"'
+''''''
+''''
+'foo'''
+'''foo'
+'"foo"'
+'"foo'
+'foo"bar'
+'foo\\\\"bar'
+---- TYPES
+STRING
+====
+---- QUERY
+# Regression test for IMPALA-7586: predicate pushed down with incorrect string 
escaping.
+select s
+from strings_with_quotes
+where s = '"'
+---- RESULTS
+'"'
+---- TYPES
+STRING
+====
+---- QUERY
+# Regression test for IMPALA-7586: predicate pushed down with incorrect string 
escaping.
+# IMPALA-7778: escapes are ignored so output is incorrect
+select s
+from strings_with_quotes
+where s = '\\"'
+---- RESULTS
+---- TYPES
+STRING
+====
+---- QUERY
+# Regression test for IMPALA-7586: predicate pushed down with incorrect string 
escaping.
+select s
+from strings_with_quotes
+where s in ('"', 'foo"bar')
+---- RESULTS
+'"'
+'foo"bar'
+---- TYPES
+STRING
+====
+---- QUERY
+# Regression test for IMPALA-7586: predicate pushed down with incorrect string 
escaping.
+# IMPALA-7778: escapes are ignored so output is incorrect
+select s
+from strings_with_quotes
+where s in ('\\"', 'foo"bar')
+---- RESULTS
+'foo"bar'
+---- TYPES
+STRING
+====

http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/testdata/workloads/functional-query/queries/QueryTest/string-escaping.test
----------------------------------------------------------------------
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/string-escaping.test 
b/testdata/workloads/functional-query/queries/QueryTest/string-escaping.test
new file mode 100644
index 0000000..a2f2c5d
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/string-escaping.test
@@ -0,0 +1,62 @@
+====
+---- QUERY
+# Check that all strings in the table are returned correctly.
+select s
+from strings_with_quotes
+---- RESULTS
+'"'
+'""'
+'\\"'
+''''''
+''''
+'foo'''
+'''foo'
+'"foo"'
+'"foo'
+'foo"bar'
+'foo\\"bar'
+---- TYPES
+STRING
+====
+---- QUERY
+# Regression test for IMPALA-7586: predicate pushed down with incorrect string 
escaping.
+select s
+from strings_with_quotes
+where s = '"'
+---- RESULTS
+'"'
+---- TYPES
+STRING
+====
+---- QUERY
+# Regression test for IMPALA-7586: predicate pushed down with incorrect string 
escaping.
+select s
+from strings_with_quotes
+where s = '\\"'
+---- RESULTS
+'\\"'
+---- TYPES
+STRING
+====
+---- QUERY
+# Regression test for IMPALA-7586: predicate pushed down with incorrect string 
escaping.
+select s
+from strings_with_quotes
+where s in ('"', 'foo"bar')
+---- RESULTS
+'"'
+'foo"bar'
+---- TYPES
+STRING
+====
+---- QUERY
+# Regression test for IMPALA-7586: predicate pushed down with incorrect string 
escaping.
+select s
+from strings_with_quotes
+where s in ('\\"', 'foo"bar')
+---- RESULTS
+'\\"'
+'foo"bar'
+---- TYPES
+STRING
+====

http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/tests/query_test/test_scanners.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_scanners.py 
b/tests/query_test/test_scanners.py
index 91f0449..f4ddad4 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -97,6 +97,14 @@ class TestScannersAllTableFormats(ImpalaTestSuite):
       pytest.skip()
     self.run_test_case('QueryTest/hdfs_scanner_profile', vector)
 
+  def test_string_escaping(self, vector):
+    """Test handling of string escape sequences."""
+    if vector.get_value('table_format').file_format == 'rc':
+      # IMPALA-7778: RCFile scanner incorrectly ignores escapes for now.
+      self.run_test_case('QueryTest/string-escaping-rcfile-bug', vector)
+    else:
+      self.run_test_case('QueryTest/string-escaping', vector)
+
 # Test all the scanners with a simple limit clause. The limit clause triggers
 # cancellation in the scanner code paths.
 class TestScannersAllTableFormatsWithLimit(ImpalaTestSuite):

Reply via email to