IMPALA-2835: introduce PARQUET_FALLBACK_SCHEMA_RESOLUTION query option This patch introduces a new query option, PARQUET_FALLBACK_SCHEMA_RESOLUTION which allows Parquet files' schemas to be resolved by either name or position. It's "fallback" because eventually field IDs will be the primary schema resolution scheme, and we don't want to create an option that we will have to change the name of later. The default is still by position. I chose to do a query option because it will make testing easier and also be easier to diagnose resolution problems quickly in the field. If users want to switch the default behavior to be by name (like Hive), they can use the --default_query_options flag.
This patch also introduces a new test section, SHELL, which can be used to execute shell commands in a .test file. This is useful for copying files into test tables. Change-Id: Id0c715ea23792b2a6872610839a40532aabbb5a6 Reviewed-on: http://gerrit.cloudera.org:8080/2384 Reviewed-by: Skye Wanderman-Milne <[email protected]> Tested-by: Internal Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/9b51b2b6 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/9b51b2b6 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/9b51b2b6 Branch: refs/heads/master Commit: 9b51b2b6e64b29d8af5147945e4b0d2679611350 Parents: 3f2840f Author: Skye Wanderman-Milne <[email protected]> Authored: Wed Mar 30 16:05:25 2016 -0700 Committer: Internal Jenkins <[email protected]> Committed: Sat Apr 2 04:04:25 2016 +0000 ---------------------------------------------------------------------- be/src/exec/hdfs-parquet-scanner.cc | 75 +++++- be/src/exec/hdfs-parquet-scanner.h | 15 +- be/src/service/query-options.cc | 19 ++ be/src/service/query-options.h | 5 +- common/thrift/ImpalaInternalService.thrift | 10 + common/thrift/ImpalaService.thrift | 4 + testdata/parquet_schema_resolution/README | 12 + .../parquet_schema_resolution/switched_map.avsc | 8 + .../parquet_schema_resolution/switched_map.json | 4 + .../parquet_schema_resolution/switched_map.parq | Bin 0 -> 586 bytes .../QueryTest/parquet-resolution-by-name.test | 234 +++++++++++++++++++ tests/common/impala_test_suite.py | 18 +- tests/conftest.py | 2 + tests/query_test/test_scanners.py | 5 + tests/util/test_file_parser.py | 2 +- 15 files changed, 395 insertions(+), 18 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/9b51b2b6/be/src/exec/hdfs-parquet-scanner.cc ---------------------------------------------------------------------- 
diff --git a/be/src/exec/hdfs-parquet-scanner.cc b/be/src/exec/hdfs-parquet-scanner.cc index 5adc97f..ad8e360 100644 --- a/be/src/exec/hdfs-parquet-scanner.cc +++ b/be/src/exec/hdfs-parquet-scanner.cc @@ -1976,7 +1976,7 @@ Status HdfsParquetScanner::ResolvePathHelper(ArrayEncoding array_encoding, for (int i = 0; i < path.size(); ++i) { // Advance '*node' if necessary if (i == 0 || col_type->type != TYPE_ARRAY || array_encoding == THREE_LEVEL) { - *node = NextSchemaNode(path, i, *node, missing_field); + *node = NextSchemaNode(col_type, path, i, *node, missing_field); if (*missing_field) return Status::OK(); } else { // We just resolved an array, meaning *node is set to the repeated field of the @@ -2017,22 +2017,79 @@ Status HdfsParquetScanner::ResolvePathHelper(ArrayEncoding array_encoding, return Status::OK(); } -HdfsParquetScanner::SchemaNode* HdfsParquetScanner::NextSchemaNode(const SchemaPath& path, - int next_idx, SchemaNode* node, bool* missing_field) { +HdfsParquetScanner::SchemaNode* HdfsParquetScanner::NextSchemaNode( + const ColumnType* col_type, const SchemaPath& path, int next_idx, SchemaNode* node, + bool* missing_field) { DCHECK_LT(next_idx, path.size()); - // The first index in a path includes the table's partition keys - int file_idx = - next_idx == 0 ? path[next_idx] - scan_node_->num_partition_keys() : path[next_idx]; - if (node->children.size() <= file_idx) { - // The selected field is not in the file + if (next_idx != 0) DCHECK(col_type != NULL); + + int file_idx; + int table_idx = path[next_idx]; + bool resolve_by_name = state_->query_options().parquet_fallback_schema_resolution == + TParquetFallbackSchemaResolution::NAME; + if (resolve_by_name) { + if (next_idx == 0) { + // Resolve top-level table column by name. 
+ DCHECK_LT(table_idx, scan_node_->hdfs_table()->col_descs().size()); + const string& name = scan_node_->hdfs_table()->col_descs()[table_idx].name(); + file_idx = FindChildWithName(node, name); + } else if (col_type->type == TYPE_STRUCT) { + // Resolve struct field by name. + DCHECK_LT(table_idx, col_type->field_names.size()); + const string& name = col_type->field_names[table_idx]; + file_idx = FindChildWithName(node, name); + } else if (col_type->type == TYPE_ARRAY) { + // Arrays have only one child in the file. + DCHECK_EQ(table_idx, SchemaPathConstants::ARRAY_ITEM); + file_idx = table_idx; + } else { + DCHECK_EQ(col_type->type, TYPE_MAP); + // Maps have two values, "key" and "value". These are supposed to be ordered and may + // not have the right field names, but try to resolve by name in case they're + // switched and otherwise use the order. See + // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps for + // more details. + DCHECK(table_idx == SchemaPathConstants::MAP_KEY || + table_idx == SchemaPathConstants::MAP_VALUE); + const string& name = table_idx == SchemaPathConstants::MAP_KEY ? "key" : "value"; + file_idx = FindChildWithName(node, name); + if (file_idx >= node->children.size()) { + // Couldn't resolve by name, fall back to resolution by position. + file_idx = table_idx; + } + } + } else { + // Resolution by position. + DCHECK_EQ(state_->query_options().parquet_fallback_schema_resolution, + TParquetFallbackSchemaResolution::POSITION); + if (next_idx == 0) { + // For top-level columns, the first index in a path includes the table's partition + // keys. + file_idx = table_idx - scan_node_->num_partition_keys(); + } else { + file_idx = table_idx; + } + } + + if (file_idx >= node->children.size()) { VLOG_FILE << Substitute( - "File '$0' does not contain path '$1'", filename(), PrintPath(path)); + "File '$0' does not contain path '$1' (resolving by $2)", filename(), + PrintPath(path), resolve_by_name ? 
"name" : "position"); *missing_field = true; return NULL; } return &node->children[file_idx]; } +int HdfsParquetScanner::FindChildWithName(HdfsParquetScanner::SchemaNode* node, + const string& name) { + int idx; + for (idx = 0; idx < node->children.size(); ++idx) { + if (node->children[idx].element->name == name) break; + } + return idx; +} + // There are three types of array encodings: // // 1. One-level encoding http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/9b51b2b6/be/src/exec/hdfs-parquet-scanner.h ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-parquet-scanner.h b/be/src/exec/hdfs-parquet-scanner.h index 1b97363..c238c6a 100644 --- a/be/src/exec/hdfs-parquet-scanner.h +++ b/be/src/exec/hdfs-parquet-scanner.h @@ -418,7 +418,7 @@ class HdfsParquetScanner : public HdfsScanner { /// Version of the application that wrote this file. FileVersion file_version_; - /// The root schema node for this file + /// The root schema node for this file. SchemaNode schema_; /// Scan range for the metadata. @@ -590,10 +590,15 @@ class HdfsParquetScanner : public HdfsScanner { /// Helper functions for ResolvePathHelper(). - /// Advances 'node' to one of its children based on path[next_idx]. Returns the child - /// node or sets 'missing_field' to true. - SchemaNode* NextSchemaNode(const SchemaPath& path, int next_idx, SchemaNode* node, - bool* missing_field); + /// Advances 'node' to one of its children based on path[next_idx] and + /// 'col_type'. 'col_type' is NULL if 'node' is the root node, otherwise it's the type + /// associated with 'node'. Returns the child node or sets 'missing_field' to true. + SchemaNode* NextSchemaNode(const ColumnType* col_type, const SchemaPath& path, + int next_idx, SchemaNode* node, bool* missing_field); + + /// Returns the index of 'node's child with 'name', or the number of children if not + /// found. 
+ int FindChildWithName(SchemaNode* node, const string& name); /// The ResolvePathHelper() logic for arrays. Status ResolveArray(ArrayEncoding array_encoding, const SchemaPath& path, int idx, http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/9b51b2b6/be/src/service/query-options.cc ---------------------------------------------------------------------- diff --git a/be/src/service/query-options.cc b/be/src/service/query-options.cc index 2b5b879..00f55a3 100644 --- a/be/src/service/query-options.cc +++ b/be/src/service/query-options.cc @@ -33,6 +33,7 @@ using boost::algorithm::is_any_of; using boost::algorithm::token_compress_on; using boost::algorithm::split; using boost::algorithm::trim; +using std::to_string; using namespace impala; using namespace strings; @@ -100,6 +101,9 @@ int GetQueryOptionForKey(const string& key) { return -1; } +// Note that we allow numerical values for boolean and enum options. This is because +// TQueryOptionsToMap() will output the numerical values, and we need to parse its output +// configuration. Status impala::SetQueryOption(const string& key, const string& value, TQueryOptions* query_options, QueryOptionsMask* set_query_options_mask) { int option = GetQueryOptionForKey(key); @@ -367,6 +371,21 @@ Status impala::SetQueryOption(const string& key, const string& value, iequals(value, "true") || iequals(value, "1")); break; } + case TImpalaQueryOptions::PARQUET_FALLBACK_SCHEMA_RESOLUTION: { + if (iequals(value, "position") || + iequals(value, to_string(TParquetFallbackSchemaResolution::POSITION))) { + query_options->__set_parquet_fallback_schema_resolution( + TParquetFallbackSchemaResolution::POSITION); + } else if (iequals(value, "name") || + iequals(value, to_string(TParquetFallbackSchemaResolution::NAME))) { + query_options->__set_parquet_fallback_schema_resolution( + TParquetFallbackSchemaResolution::NAME); + } else { + return Status(Substitute("Invalid PARQUET_FALLBACK_SCHEMA_RESOLUTION option: " + "'$0'. 
Valid options are 'POSITION' and 'NAME'.", value)); + } + break; + } default: // We hit this DCHECK(false) if we forgot to add the corresponding entry here // when we add a new query option. http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/9b51b2b6/be/src/service/query-options.h ---------------------------------------------------------------------- diff --git a/be/src/service/query-options.h b/be/src/service/query-options.h index 6964583..c445658 100644 --- a/be/src/service/query-options.h +++ b/be/src/service/query-options.h @@ -32,7 +32,7 @@ class TQueryOptions; // the DCHECK. #define QUERY_OPTS_TABLE\ DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(),\ - TImpalaQueryOptions::PARQUET_ANNOTATE_STRINGS_UTF8 + 1);\ + TImpalaQueryOptions::PARQUET_FALLBACK_SCHEMA_RESOLUTION + 1);\ QUERY_OPT_FN(abort_on_default_limit_exceeded, ABORT_ON_DEFAULT_LIMIT_EXCEEDED)\ QUERY_OPT_FN(abort_on_error, ABORT_ON_ERROR)\ QUERY_OPT_FN(allow_unsupported_formats, ALLOW_UNSUPPORTED_FORMATS)\ @@ -74,7 +74,8 @@ class TQueryOptions; QUERY_OPT_FN(runtime_filter_wait_time_ms, RUNTIME_FILTER_WAIT_TIME_MS)\ QUERY_OPT_FN(disable_row_runtime_filtering, DISABLE_ROW_RUNTIME_FILTERING)\ QUERY_OPT_FN(max_num_runtime_filters, MAX_NUM_RUNTIME_FILTERS)\ - QUERY_OPT_FN(parquet_annotate_strings_utf8, PARQUET_ANNOTATE_STRINGS_UTF8); + QUERY_OPT_FN(parquet_annotate_strings_utf8, PARQUET_ANNOTATE_STRINGS_UTF8)\ + QUERY_OPT_FN(parquet_fallback_schema_resolution, PARQUET_FALLBACK_SCHEMA_RESOLUTION); /// Converts a TQueryOptions struct into a map of key, value pairs. 
void TQueryOptionsToMap(const TQueryOptions& query_options, http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/9b51b2b6/common/thrift/ImpalaInternalService.thrift ---------------------------------------------------------------------- diff --git a/common/thrift/ImpalaInternalService.thrift b/common/thrift/ImpalaInternalService.thrift index d3d3080..2af20e3 100644 --- a/common/thrift/ImpalaInternalService.thrift +++ b/common/thrift/ImpalaInternalService.thrift @@ -42,6 +42,11 @@ const i32 INVALID_PLAN_NODE_ID = -1 // Constant default partition ID, must be < 0 to avoid collisions const i64 DEFAULT_PARTITION_ID = -1; +enum TParquetFallbackSchemaResolution { + POSITION, + NAME +} + // Query options that correspond to ImpalaService.ImpalaQueryOptions, with their // respective defaults. Query options can be set in the following ways: // @@ -170,6 +175,11 @@ struct TQueryOptions { // This is disabled by default in order to preserve the existing behavior of legacy // workloads. In addition, Impala strings are not necessarily UTF8-encoded. 42: optional bool parquet_annotate_strings_utf8 = false + + // Determines how to resolve Parquet files' schemas in the absence of field IDs (which + // is always, since field IDs are NYI). Valid values are "position" (default) and + // "name". + 43: optional TParquetFallbackSchemaResolution parquet_fallback_schema_resolution = 0 } // Impala currently has two types of sessions: Beeswax and HiveServer2 http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/9b51b2b6/common/thrift/ImpalaService.thrift ---------------------------------------------------------------------- diff --git a/common/thrift/ImpalaService.thrift b/common/thrift/ImpalaService.thrift index cbeaa65..3f5273e 100644 --- a/common/thrift/ImpalaService.thrift +++ b/common/thrift/ImpalaService.thrift @@ -205,6 +205,10 @@ enum TImpalaQueryOptions { // If true, use UTF-8 annotation for string columns. Note that char and varchar columns // always use the annotation. 
PARQUET_ANNOTATE_STRINGS_UTF8 + + // Determines how to resolve Parquet files' schemas in the absence of field IDs (which + // is always, since field IDs are NYI). Valid values are "position" and "name". + PARQUET_FALLBACK_SCHEMA_RESOLUTION } // The summary of an insert. http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/9b51b2b6/testdata/parquet_schema_resolution/README ---------------------------------------------------------------------- diff --git a/testdata/parquet_schema_resolution/README b/testdata/parquet_schema_resolution/README new file mode 100644 index 0000000..840067f --- /dev/null +++ b/testdata/parquet_schema_resolution/README @@ -0,0 +1,12 @@ +switched_map.parq was generated by modifying parquet-mr to switch the key and value fields +of maps, and then converting switched_map.json to parquet using switched_map.avsc as the +schema. switched_map.parq has the following schema according to parquet-tools: + +message com.cloudera.impala.switched_map { + required group int_map (MAP) { + repeated group map (MAP_KEY_VALUE) { + required int32 value; + required binary key (UTF8); + } + } +} http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/9b51b2b6/testdata/parquet_schema_resolution/switched_map.avsc ---------------------------------------------------------------------- diff --git a/testdata/parquet_schema_resolution/switched_map.avsc b/testdata/parquet_schema_resolution/switched_map.avsc new file mode 100644 index 0000000..8805261 --- /dev/null +++ b/testdata/parquet_schema_resolution/switched_map.avsc @@ -0,0 +1,8 @@ + +{"type": "record", + "namespace": "com.cloudera.impala", + "name": "switched_map", + "fields": [ + {"name": "int_map", "type": {"type": "map", "values": "int"}} + ] +} http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/9b51b2b6/testdata/parquet_schema_resolution/switched_map.json ---------------------------------------------------------------------- diff --git a/testdata/parquet_schema_resolution/switched_map.json 
b/testdata/parquet_schema_resolution/switched_map.json new file mode 100644 index 0000000..d6cac13 --- /dev/null +++ b/testdata/parquet_schema_resolution/switched_map.json @@ -0,0 +1,4 @@ +[ + {"int_map": {"a": 1, "b": 2}}, + {"int_map": {"c": 3}} +] http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/9b51b2b6/testdata/parquet_schema_resolution/switched_map.parq ---------------------------------------------------------------------- diff --git a/testdata/parquet_schema_resolution/switched_map.parq b/testdata/parquet_schema_resolution/switched_map.parq new file mode 100644 index 0000000..9306145 Binary files /dev/null and b/testdata/parquet_schema_resolution/switched_map.parq differ http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/9b51b2b6/testdata/workloads/functional-query/queries/QueryTest/parquet-resolution-by-name.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-resolution-by-name.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-resolution-by-name.test new file mode 100644 index 0000000..c81c468 --- /dev/null +++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-resolution-by-name.test @@ -0,0 +1,234 @@ +==== +---- QUERY +# Create a table and populate with data file +drop table if exists resolution_by_name_test; +create table resolution_by_name_test stored as parquet +as select * from functional_parquet.tinytable; +select a, b from resolution_by_name_test; +---- TYPES +string,string +---- RESULTS +'aaaaaaa','bbbbbbb' +'ccccc','dddd' +'eeeeeeee','f' +==== +---- QUERY +# Rearrange the columns and make sure we can still resolve by name +alter table resolution_by_name_test replace columns (b string, a string); +set parquet_fallback_schema_resolution="NAME"; +select a, b from resolution_by_name_test; +---- TYPES +string,string +---- RESULTS +'aaaaaaa','bbbbbbb' +'ccccc','dddd' +'eeeeeeee','f' +==== +---- QUERY +# 
Renaming a column will cause the column to not be resolved +alter table resolution_by_name_test change a new_a string; +select new_a from resolution_by_name_test; +---- TYPES +string +---- RESULTS +'NULL' +'NULL' +'NULL' +==== +---- QUERY +# Can still resolve by ordinal +set parquet_fallback_schema_resolution="POSITION"; +select b, new_a from resolution_by_name_test; +---- TYPES +string,string +---- RESULTS +'aaaaaaa','bbbbbbb' +'ccccc','dddd' +'eeeeeeee','f' +==== +---- QUERY +# Check that we can parse the integer enum value as well +set parquet_fallback_schema_resolution=1; +select new_a from resolution_by_name_test; +---- TYPES +string +---- RESULTS +'NULL' +'NULL' +'NULL' +==== +---- QUERY +set parquet_fallback_schema_resolution=0; +select b, new_a from resolution_by_name_test; +---- TYPES +string,string +---- RESULTS +'aaaaaaa','bbbbbbb' +'ccccc','dddd' +'eeeeeeee','f' +==== +---- QUERY +drop table resolution_by_name_test; +==== +---- QUERY +# Test nested types resolution +drop table if exists nested_resolution_by_name_test; +create table nested_resolution_by_name_test like functional_parquet.complextypestbl; +==== +---- SHELL +hadoop fs -cp $FILESYSTEM_PREFIX/test-warehouse/complextypestbl_parquet/nullable.parq \ +$FILESYSTEM_PREFIX/test-warehouse/$DATABASE.db/nested_resolution_by_name_test/ +hadoop fs -cp $FILESYSTEM_PREFIX/test-warehouse/complextypestbl_parquet/nonnullable.parq \ +$FILESYSTEM_PREFIX/test-warehouse/$DATABASE.db/nested_resolution_by_name_test/ +==== +---- QUERY +select id, nested_struct.a, b.item +from nested_resolution_by_name_test t, t.nested_struct.b +---- TYPES +bigint,int,int +---- RESULTS +1,1,1 +2,NULL,NULL +7,7,2 +7,7,3 +7,7,NULL +8,-1,-1 +==== +---- QUERY +# Can safely ignore extra fields in nested_struct +alter table nested_resolution_by_name_test change nested_struct nested_struct +struct<a:int, b: array<int>>; +select id, nested_struct.a, b.item +from nested_resolution_by_name_test t, t.nested_struct.b +---- TYPES +bigint,int,int 
+---- RESULTS +1,1,1 +2,NULL,NULL +7,7,2 +7,7,3 +7,7,NULL +8,-1,-1 +==== +---- QUERY +# Rearrange nested_struct's fields and make sure we can still resolve by name +alter table nested_resolution_by_name_test change nested_struct nested_struct +struct<b: array<int>, a: int>; +set parquet_fallback_schema_resolution="name"; +select id, nested_struct.a, b.item +from nested_resolution_by_name_test t, t.nested_struct.b +---- TYPES +bigint,int,int +---- RESULTS +1,1,1 +2,NULL,NULL +7,7,2 +7,7,3 +7,7,NULL +8,-1,-1 +==== +---- QUERY +# Can add back a single field +alter table nested_resolution_by_name_test change nested_struct nested_struct +struct<b: array<int>, a: int, g: map<string, struct<h: struct<i: array<float>>>>>; +select id, g.key +from nested_resolution_by_name_test t, t.nested_struct.g +---- TYPES +bigint,string +---- RESULTS +1,'foo' +2,'g1' +2,'g2' +2,'g3' +2,'g4' +2,'g5' +5,'foo' +==== +---- QUERY +# Add back single more nested field (and remove 'g' field) +alter table nested_resolution_by_name_test change nested_struct nested_struct +struct<b: array<int>, a: int, c: struct<d: array<array<struct<f: string>>>>>; +select tmp.f from nested_resolution_by_name_test.nested_struct.c.d.item tmp; +---- TYPES +string +---- RESULTS +'aaa' +'bbb' +'c' +'NULL' +'aaa' +'NULL' +'bbb' +'NULL' +'c' +'NULL' +'NULL' +'nonnullable' +==== +---- QUERY +# Can't rename nested field +alter table nested_resolution_by_name_test change nested_struct nested_struct +struct<b: array<int>, a: int, c: struct<d: array<array<struct<renamed: string>>>>>; +select tmp.renamed from nested_resolution_by_name_test.nested_struct.c.d.item tmp; +---- TYPES +string +---- RESULTS +'NULL' +'NULL' +'NULL' +'NULL' +'NULL' +'NULL' +'NULL' +'NULL' +'NULL' +'NULL' +'NULL' +'NULL' +==== +---- QUERY +drop table nested_resolution_by_name_test; +==== +---- QUERY +# Test switched key/value map fields +drop table if exists switched_map_fields_resolution_test; +create table switched_map_fields_resolution_test 
(int_map map<string,int>) +stored as parquet; +==== +---- SHELL +hadoop fs -copyFromLocal \ +$IMPALA_HOME/testdata/parquet_schema_resolution/switched_map.parq \ +$FILESYSTEM_PREFIX/test-warehouse/$DATABASE.db/switched_map_fields_resolution_test/ +==== +---- QUERY +# Switched map fields should be resolvable by name. +set parquet_fallback_schema_resolution="name"; +select key, value from switched_map_fields_resolution_test.int_map +---- TYPES +string,int +---- RESULTS +'a',1 +'b',2 +'c',3 +==== +---- QUERY +# Can't resolve switched map fields by position since types are switched. +set parquet_fallback_schema_resolution="position"; +select key, value from switched_map_fields_resolution_test.int_map +---- CATCH +File '$NAMENODE/test-warehouse/$DATABASE.db/switched_map_fields_resolution_test/ +switched_map.parq' has an incompatible Parquet schema for column + '$DATABASE.switched_map_fields_resolution_test.int_map.key'. + Column type: STRING, Parquet schema: +required int32 value [i:0 d:1 r:1] +==== +---- QUERY +drop table switched_map_fields_resolution_test +==== +---- QUERY +# Check that we handle bad options gracefully +set parquet_fallback_schema_resolution="FOO" +---- CATCH +Invalid PARQUET_FALLBACK_SCHEMA_RESOLUTION option: 'FOO'. + Valid options are 'POSITION' and 'NAME'. 
+==== http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/9b51b2b6/tests/common/impala_test_suite.py ---------------------------------------------------------------------- diff --git a/tests/common/impala_test_suite.py b/tests/common/impala_test_suite.py index 8f8bcce..1bacc16 100644 --- a/tests/common/impala_test_suite.py +++ b/tests/common/impala_test_suite.py @@ -27,6 +27,7 @@ from getpass import getuser from functools import wraps from impala._thrift_gen.ImpalaService.ttypes import TImpalaQueryOptions from random import choice +from subprocess import check_call from tests.common.impala_service import ImpaladService from tests.common.impala_connection import ImpalaConnection, create_connection from tests.common.test_dimensions import * @@ -221,6 +222,17 @@ class ImpalaTestSuite(BaseTestSuite): sections = self.load_query_test_file(self.get_workload(), test_file_name, encoding=encoding) for test_section in sections: + if 'SHELL' in test_section: + assert len(test_section) == 1, \ + "SHELL test sections can't contain other sections" + cmd = test_section['SHELL']\ + .replace('$FILESYSTEM_PREFIX', FILESYSTEM_PREFIX)\ + .replace('$IMPALA_HOME', IMPALA_HOME) + if use_db: cmd = cmd.replace('$DATABASE', use_db) + LOG.info("Shell command: " + cmd) + check_call(cmd, shell=True) + continue + if 'QUERY' not in test_section: assert 0, 'Error in test file %s. 
Test cases require a -- QUERY section.\n%s' %\ (test_file_name, pprint.pformat(test_section)) @@ -265,7 +277,11 @@ class ImpalaTestSuite(BaseTestSuite): .replace('$FILESYSTEM_PREFIX', FILESYSTEM_PREFIX) \ .replace('$NAMENODE', NAMENODE) \ .replace('$IMPALA_HOME', IMPALA_HOME) - assert expected_str in str(e) + if use_db: expected_str = expected_str.replace('$DATABASE', use_db) + # Strip newlines so we can split error message into multiple lines + expected_str = expected_str.replace('\n', '') + actual_str = str(e).replace('\n', '') + assert expected_str in actual_str continue raise finally: http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/9b51b2b6/tests/conftest.py ---------------------------------------------------------------------- diff --git a/tests/conftest.py b/tests/conftest.py index a9329f3..c6a22c0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -219,6 +219,8 @@ def unique_database(request, testid_checksum): 'characters.'.format(db_name)) def cleanup(): + # Make sure we don't try to drop the current session database + request.instance.execute_query_expect_success(request.instance.client, "use default") request.instance.execute_query_expect_success( request.instance.client, 'DROP DATABASE `{0}` CASCADE'.format(db_name)) LOG.info('Dropped database "{0}" for test ID "{1}"'.format(db_name, http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/9b51b2b6/tests/query_test/test_scanners.py ---------------------------------------------------------------------- diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py index 7a4290f..7768a40 100644 --- a/tests/query_test/test_scanners.py +++ b/tests/query_test/test_scanners.py @@ -336,6 +336,11 @@ class TestParquet(ImpalaTestSuite): assert c_schema_elt.converted_type == ConvertedType.UTF8 assert d_schema_elt.converted_type == None + @SkipIfS3.insert + def test_resolution_by_name(self, unique_database, vector): + 
self.run_test_case('QueryTest/parquet-resolution-by-name', vector, + use_db=unique_database) + # We use various scan range lengths to exercise corner cases in the HDFS scanner more # thoroughly. In particular, it will exercise: # 1. default scan range http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/9b51b2b6/tests/util/test_file_parser.py ---------------------------------------------------------------------- diff --git a/tests/util/test_file_parser.py b/tests/util/test_file_parser.py index 57105b2..f608aa7 100644 --- a/tests/util/test_file_parser.py +++ b/tests/util/test_file_parser.py @@ -78,7 +78,7 @@ def parse_query_test_file(file_name, valid_section_names=None, encoding=None): section_names = valid_section_names if section_names is None: section_names = ['QUERY', 'RESULTS', 'TYPES', 'LABELS', 'SETUP', 'CATCH', 'ERRORS', - 'USER', 'RUNTIME_PROFILE'] + 'USER', 'RUNTIME_PROFILE', 'SHELL'] return parse_test_file(file_name, section_names, encoding=encoding, skip_unknown_sections=False)
