IMPALA-7586: fix predicate pushdown of escaped strings This fixes a class of bugs where the planner incorrectly uses the raw string from the parser instead of the unescaped string. This occurs in several places that push predicates down to the storage layer: * Kudu scans * HBase scans * Data source scans
There are some more complex issues with escapes and the LIKE predicate that are tracked separately by IMPALA-2422. This also uncovered a different issue with RCFiles that is tracked by IMPALA-7778 and is worked around by the tests added. In order to make bugs like this more obvious in future, I renamed getValue() to getValueWithOriginalEscapes(). Testing: Added regression test that tests handling of backslash escapes on all file formats. I did not add a regression test for the data source bug since it seems to require some major modification of the data source test infrastructure. Change-Id: I53d6e20dd48ab6837ddd325db8a9d49ee04fed28 Reviewed-on: http://gerrit.cloudera.org:8080/11814 Reviewed-by: Tim Armstrong <tarmstr...@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> Project: http://git-wip-us.apache.org/repos/asf/impala/repo Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/c124d26b Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/c124d26b Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/c124d26b Branch: refs/heads/branch-3.1.0 Commit: c124d26b546b40e1a2d98bfe396898ab563ccafc Parents: e3a7027 Author: Tim Armstrong <tarmstr...@cloudera.com> Authored: Fri Oct 26 16:50:22 2018 -0700 Committer: Zoltan Borok-Nagy <borokna...@cloudera.com> Committed: Tue Nov 13 12:50:23 2018 +0100 ---------------------------------------------------------------------- .../org/apache/impala/analysis/AdminFnStmt.java | 2 +- .../java/org/apache/impala/analysis/Expr.java | 2 +- .../apache/impala/analysis/ExtractFromExpr.java | 6 +- .../apache/impala/analysis/LikePredicate.java | 5 +- .../apache/impala/analysis/StringLiteral.java | 8 ++- .../impala/planner/DataSourceScanNode.java | 3 +- .../apache/impala/planner/HBaseScanNode.java | 4 +- .../org/apache/impala/planner/KuduScanNode.java | 4 +- testdata/data/README | 3 + testdata/data/strings_with_quotes.csv | 11 ++++ .../functional/functional_schema_template.sql | 28 +++++++++ .../datasets/functional/schema_constraints.csv | 1 + .../QueryTest/string-escaping-rcfile-bug.test | 66 ++++++++++++++++++++ .../queries/QueryTest/string-escaping.test | 62 ++++++++++++++++++ tests/query_test/test_scanners.py | 8 +++ 15 files changed, 199 insertions(+), 14 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/fe/src/main/java/org/apache/impala/analysis/AdminFnStmt.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/analysis/AdminFnStmt.java b/fe/src/main/java/org/apache/impala/analysis/AdminFnStmt.java index 2f2eb2e..1e1f022 100644 --- a/fe/src/main/java/org/apache/impala/analysis/AdminFnStmt.java +++ b/fe/src/main/java/org/apache/impala/analysis/AdminFnStmt.java @@ -117,7 +117,7 @@ public class AdminFnStmt extends StatementBase { throw new AnalysisException( "Invalid backend, must be a string literal: " + backendExpr.toSql()); } - backend_ = parseBackendAddress(((StringLiteral) backendExpr).getValue()); + backend_ = parseBackendAddress(((StringLiteral) backendExpr).getUnescapedValue()); } if (deadlineExpr != null) { deadlineSecs_ = deadlineExpr.evalToNonNegativeInteger(analyzer, "deadline"); http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/fe/src/main/java/org/apache/impala/analysis/Expr.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/analysis/Expr.java b/fe/src/main/java/org/apache/impala/analysis/Expr.java index 912ec8a..fdf639e 100644 --- a/fe/src/main/java/org/apache/impala/analysis/Expr.java +++ b/fe/src/main/java/org/apache/impala/analysis/Expr.java @@ -1486,7 +1486,7 @@ abstract public class Expr extends TreeNode<Expr> implements ParseNode, Cloneabl return DEFAULT_AVG_STRING_LENGTH; } } else if (e instanceof StringLiteral) { - return ((StringLiteral) e).getValue().length(); + return ((StringLiteral) e).getUnescapedValue().length(); } else { // TODO(tmarshall): Extend this to support other string Exprs, such as // function calls that return string. http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/fe/src/main/java/org/apache/impala/analysis/ExtractFromExpr.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/analysis/ExtractFromExpr.java b/fe/src/main/java/org/apache/impala/analysis/ExtractFromExpr.java index 40020e0..9f46187 100644 --- a/fe/src/main/java/org/apache/impala/analysis/ExtractFromExpr.java +++ b/fe/src/main/java/org/apache/impala/analysis/ExtractFromExpr.java @@ -20,7 +20,6 @@ package org.apache.impala.analysis; import java.util.Set; import org.apache.impala.catalog.BuiltinsDb; -import org.apache.impala.catalog.Catalog; import org.apache.impala.catalog.Type; import org.apache.impala.common.AnalysisException; import org.apache.impala.thrift.TExtractField; @@ -81,7 +80,8 @@ public class ExtractFromExpr extends FunctionCallExpr { super.analyzeImpl(analyzer); - String extractFieldIdent = ((StringLiteral)children_.get(1)).getValue(); + String extractFieldIdent = + ((StringLiteral)children_.get(1)).getValueWithOriginalEscapes(); Preconditions.checkNotNull(extractFieldIdent); if (!EXTRACT_FIELDS.contains(extractFieldIdent.toUpperCase())) { throw new AnalysisException("Time unit '" + extractFieldIdent + "' in expression '" @@ -102,7 +102,7 @@ public class ExtractFromExpr extends FunctionCallExpr { StringBuilder strBuilder = new StringBuilder(); strBuilder.append(getFnName().toString().toUpperCase()); strBuilder.append("("); - strBuilder.append(((StringLiteral)getChild(1)).getValue()); + strBuilder.append(((StringLiteral)getChild(1)).getValueWithOriginalEscapes()); strBuilder.append(" FROM "); strBuilder.append(getChild(0).toSql()); strBuilder.append(")"); http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/fe/src/main/java/org/apache/impala/analysis/LikePredicate.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/analysis/LikePredicate.java b/fe/src/main/java/org/apache/impala/analysis/LikePredicate.java index bc10b2a..3830c95 100644 --- a/fe/src/main/java/org/apache/impala/analysis/LikePredicate.java +++ b/fe/src/main/java/org/apache/impala/analysis/LikePredicate.java @@ -135,7 +135,7 @@ public class LikePredicate extends Predicate { // TODO: this checks that it's a Java-supported regex, but the syntax supported // by the backend is Posix; add a call to the backend to check the re syntax try { - Pattern.compile(((StringLiteral) getChild(1)).getValue()); + Pattern.compile(((StringLiteral) getChild(1)).getValueWithOriginalEscapes()); } catch (PatternSyntaxException e) { throw new AnalysisException( "invalid regular expression in '" + this.toSql() + "'"); @@ -148,7 +148,8 @@ public class LikePredicate extends Predicate { protected float computeEvalCost() { if (!hasChildCosts()) return UNKNOWN_COST; if (getChild(1).isLiteral() && !getChild(1).isNullLiteral() && - Pattern.matches("[%_]*[^%_]*[%_]*", ((StringLiteral) getChild(1)).getValue())) { + Pattern.matches("[%_]*[^%_]*[%_]*", + ((StringLiteral) getChild(1)).getValueWithOriginalEscapes())) { // This pattern only has wildcards as leading or trailing character, // so it is linear. return getChildCosts() + http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/fe/src/main/java/org/apache/impala/analysis/StringLiteral.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/analysis/StringLiteral.java b/fe/src/main/java/org/apache/impala/analysis/StringLiteral.java index 5c51c45..12ad635 100644 --- a/fe/src/main/java/org/apache/impala/analysis/StringLiteral.java +++ b/fe/src/main/java/org/apache/impala/analysis/StringLiteral.java @@ -79,7 +79,11 @@ public class StringLiteral extends LiteralExpr { msg.string_literal = new TStringLiteral(val); } - public String getValue() { return value_; } + /** + * Returns the original value that the string literal was constructed with, + * without escaping or unescaping it. + */ + public String getValueWithOriginalEscapes() { return value_; } public String getUnescapedValue() { // Unescape string exactly like Hive does. Hive's method assumes @@ -126,7 +130,7 @@ public class StringLiteral extends LiteralExpr { @Override public String getStringValue() { - return value_; + return getValueWithOriginalEscapes(); } @Override http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/fe/src/main/java/org/apache/impala/planner/DataSourceScanNode.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/planner/DataSourceScanNode.java b/fe/src/main/java/org/apache/impala/planner/DataSourceScanNode.java index a41630b..1ddb394 100644 --- a/fe/src/main/java/org/apache/impala/planner/DataSourceScanNode.java +++ b/fe/src/main/java/org/apache/impala/planner/DataSourceScanNode.java @@ -129,7 +129,8 @@ public class DataSourceScanNode extends ScanNode { return new TColumnValue().setDouble_val( ((NumericLiteral) expr).getDoubleValue()); case STRING: - return new TColumnValue().setString_val(((StringLiteral) expr).getValue()); + return new TColumnValue().setString_val( + ((StringLiteral) expr).getUnescapedValue()); case DECIMAL: case DATE: case DATETIME: http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java b/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java index 06aca1f..13ecb6a 100644 --- a/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java +++ b/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java @@ -166,7 +166,7 @@ public class HBaseScanNode extends ScanNode { analyzer.getQueryCtx()); if (val instanceof StringLiteral) { StringLiteral litVal = (StringLiteral) val; - startKey_ = convertToBytes(litVal.getStringValue(), + startKey_ = convertToBytes(litVal.getUnescapedValue(), !rowRange.getLowerBoundInclusive()); } else { // lower bound is null. @@ -182,7 +182,7 @@ public class HBaseScanNode extends ScanNode { analyzer.getQueryCtx()); if (val instanceof StringLiteral) { StringLiteral litVal = (StringLiteral) val; - stopKey_ = convertToBytes(litVal.getStringValue(), + stopKey_ = convertToBytes(litVal.getUnescapedValue(), rowRange.getUpperBoundInclusive()); } else { // lower bound is null. http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java b/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java index 56c7602..adeaa72 100644 --- a/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java +++ b/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java @@ -415,7 +415,7 @@ public class KuduScanNode extends ScanNode { case VARCHAR: case CHAR: { kuduPredicate = KuduPredicate.newComparisonPredicate(column, op, - ((StringLiteral)literal).getStringValue()); + ((StringLiteral)literal).getUnescapedValue()); break; } case TIMESTAMP: { @@ -523,7 +523,7 @@ public class KuduScanNode extends ScanNode { case BIGINT: return ((NumericLiteral) e).getLongValue(); case FLOAT: return (float) ((NumericLiteral) e).getDoubleValue(); case DOUBLE: return ((NumericLiteral) e).getDoubleValue(); - case STRING: return ((StringLiteral) e).getValue(); + case STRING: return ((StringLiteral) e).getUnescapedValue(); case TIMESTAMP: { try { // TODO: Simplify when Impala supports a 64-bit TIMESTAMP type. http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/testdata/data/README ---------------------------------------------------------------------- diff --git a/testdata/data/README b/testdata/data/README index 599b009..db055c7 100644 --- a/testdata/data/README +++ b/testdata/data/README @@ -230,3 +230,6 @@ valid range [0..24H). Before the fix, select * returned these values: 1970-01-01 00:00:00 1970-01-01 23:59:59.999999999 1970-01-01 24:00:00 (invalid - time of day should be less than a whole day) + +strings_with_quotes.csv: +Various strings with quotes in them to reproduce bugs like IMPALA-7586. http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/testdata/data/strings_with_quotes.csv ---------------------------------------------------------------------- diff --git a/testdata/data/strings_with_quotes.csv b/testdata/data/strings_with_quotes.csv new file mode 100644 index 0000000..0d82f03 --- /dev/null +++ b/testdata/data/strings_with_quotes.csv @@ -0,0 +1,11 @@ +",1 +"",2 +\\",3 +'',4 +',5 +foo',6 +'foo,7 +"foo",8 +"foo,9 +foo"bar,10 +foo\\"bar,11 http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/testdata/datasets/functional/functional_schema_template.sql ---------------------------------------------------------------------- diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql index cfd9030..0712b7c 100644 --- a/testdata/datasets/functional/functional_schema_template.sql +++ b/testdata/datasets/functional/functional_schema_template.sql @@ -2194,3 +2194,31 @@ CREATE TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} (i1 integer) STORED AS {file_format} TBLPROPERTIES('skip.header.line.count'='2'); ==== +---- DATASET +functional +---- BASE_TABLE_NAME +strings_with_quotes +---- COLUMNS +s string +i int +---- ROW_FORMAT +delimited fields terminated by ',' escaped by '\\' +---- LOAD +LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/strings_with_quotes.csv' +OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; +---- DEPENDENT_LOAD +INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} +SELECT s, i +FROM {db_name}.{table_name}; +---- CREATE_KUDU +DROP TABLE IF EXISTS {db_name}{db_suffix}.{table_name}; +CREATE TABLE {db_name}{db_suffix}.{table_name} ( + s string PRIMARY KEY, + i int +) +PARTITION BY HASH (s) PARTITIONS 3 STORED AS KUDU; +---- DEPENDENT_LOAD_KUDU +INSERT into TABLE {db_name}{db_suffix}.{table_name} +SELECT s, i +FROM {db_name}.{table_name}; +==== http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/testdata/datasets/functional/schema_constraints.csv ---------------------------------------------------------------------- diff --git a/testdata/datasets/functional/schema_constraints.csv b/testdata/datasets/functional/schema_constraints.csv index baf0306..e09ca36 100644 --- a/testdata/datasets/functional/schema_constraints.csv +++ b/testdata/datasets/functional/schema_constraints.csv @@ -193,6 +193,7 @@ table_name:nulltable, constraint:only, table_format:kudu/none/none table_name:nullescapedtable, constraint:only, table_format:kudu/none/none table_name:decimal_tbl, constraint:only, table_format:kudu/none/none table_name:decimal_tiny, constraint:only, table_format:kudu/none/none +table_name:strings_with_quotes, constraint:only, table_format:kudu/none/none # Skipping header lines is only effective with text tables table_name:table_with_header, constraint:restrict_to, table_format:text/none/none http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/testdata/workloads/functional-query/queries/QueryTest/string-escaping-rcfile-bug.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/string-escaping-rcfile-bug.test b/testdata/workloads/functional-query/queries/QueryTest/string-escaping-rcfile-bug.test new file mode 100644 index 0000000..d757480 --- /dev/null +++ b/testdata/workloads/functional-query/queries/QueryTest/string-escaping-rcfile-bug.test @@ -0,0 +1,66 @@ +# This file contains the current output for queries against strings_with_quotes +# for RCFile. The output is different because of a bug in the RCFile scanner. +# This file can be removed once IMPALA-7778 is fixed. +==== +---- QUERY +# Check that all strings in the table are returned correctly. +# IMPALA-7778: escapes are ignored so output is incorrect +select s +from strings_with_quotes +---- RESULTS +'"' +'""' +'\\\\"' +'''''' +'''' +'foo''' +'''foo' +'"foo"' +'"foo' +'foo"bar' +'foo\\\\"bar' +---- TYPES +STRING +==== +---- QUERY +# Regression test for IMPALA-7586: predicate pushed down with incorrect string escaping. +select s +from strings_with_quotes +where s = '"' +---- RESULTS +'"' +---- TYPES +STRING +==== +---- QUERY +# Regression test for IMPALA-7586: predicate pushed down with incorrect string escaping. +# IMPALA-7778: escapes are ignored so output is incorrect +select s +from strings_with_quotes +where s = '\\"' +---- RESULTS +---- TYPES +STRING +==== +---- QUERY +# Regression test for IMPALA-7586: predicate pushed down with incorrect string escaping. +select s +from strings_with_quotes +where s in ('"', 'foo"bar') +---- RESULTS +'"' +'foo"bar' +---- TYPES +STRING +==== +---- QUERY +# Regression test for IMPALA-7586: predicate pushed down with incorrect string escaping. +# IMPALA-7778: escapes are ignored so output is incorrect +select s +from strings_with_quotes +where s in ('\\"', 'foo"bar') +---- RESULTS +'foo"bar' +---- TYPES +STRING +==== http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/testdata/workloads/functional-query/queries/QueryTest/string-escaping.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/string-escaping.test b/testdata/workloads/functional-query/queries/QueryTest/string-escaping.test new file mode 100644 index 0000000..a2f2c5d --- /dev/null +++ b/testdata/workloads/functional-query/queries/QueryTest/string-escaping.test @@ -0,0 +1,62 @@ +==== +---- QUERY +# Check that all strings in the table are returned correctly. +select s +from strings_with_quotes +---- RESULTS +'"' +'""' +'\\"' +'''''' +'''' +'foo''' +'''foo' +'"foo"' +'"foo' +'foo"bar' +'foo\\"bar' +---- TYPES +STRING +==== +---- QUERY +# Regression test for IMPALA-7586: predicate pushed down with incorrect string escaping. +select s +from strings_with_quotes +where s = '"' +---- RESULTS +'"' +---- TYPES +STRING +==== +---- QUERY +# Regression test for IMPALA-7586: predicate pushed down with incorrect string escaping. +select s +from strings_with_quotes +where s = '\\"' +---- RESULTS +'\\"' +---- TYPES +STRING +==== +---- QUERY +# Regression test for IMPALA-7586: predicate pushed down with incorrect string escaping. +select s +from strings_with_quotes +where s in ('"', 'foo"bar') +---- RESULTS +'"' +'foo"bar' +---- TYPES +STRING +==== +---- QUERY +# Regression test for IMPALA-7586: predicate pushed down with incorrect string escaping. +select s +from strings_with_quotes +where s in ('\\"', 'foo"bar') +---- RESULTS +'\\"' +'foo"bar' +---- TYPES +STRING +==== http://git-wip-us.apache.org/repos/asf/impala/blob/c124d26b/tests/query_test/test_scanners.py ---------------------------------------------------------------------- diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py index 91f0449..f4ddad4 100644 --- a/tests/query_test/test_scanners.py +++ b/tests/query_test/test_scanners.py @@ -97,6 +97,14 @@ class TestScannersAllTableFormats(ImpalaTestSuite): pytest.skip() self.run_test_case('QueryTest/hdfs_scanner_profile', vector) + def test_string_escaping(self, vector): + """Test handling of string escape sequences.""" + if vector.get_value('table_format').file_format == 'rc': + # IMPALA-7778: RCFile scanner incorrectly ignores escapes for now. + self.run_test_case('QueryTest/string-escaping-rcfile-bug', vector) + else: + self.run_test_case('QueryTest/string-escaping', vector) + # Test all the scanners with a simple limit clause. The limit clause triggers # cancellation in the scanner code paths. class TestScannersAllTableFormatsWithLimit(ImpalaTestSuite):