IMPALA-4675: Case-insensitive matching of Parquet fields. The query option PARQUET_FALLBACK_SCHEMA_RESOLUTION allows matching of Parquet fields by name instead of by index (the default).
Parquet column names are case sensitive, but Impala treats db/table/column/field names as case-insensitive. Today, there is no way to select Parquet columns with mixed casing via SQL using the name-based field resolution policy. This patch changes the matching of Parquet fields to be case-insensitive. Testing: - Modified the data files backing complextypestbl to contain fields with mixed casing. - Several existing tests run against this table, including the test for name-based resolution. - I confirmed that without this fix, the existing name-based resolution tests fail on the modified data files. - I locally ran test_scanners.py and test_nested_types.py on exhaustive with this fix. Change-Id: I87395f84ba29b4c3d8e41be1ea4e89e500b8a9f4 Reviewed-on: http://gerrit.cloudera.org:8080/5891 Reviewed-by: Alex Behm <[email protected]> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/34353218 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/34353218 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/34353218 Branch: refs/heads/master Commit: 34353218cebbcd02630b07681624c1a1d5c9fd5b Parents: 7ea96c6 Author: Nathan Salmon <[email protected]> Authored: Wed Mar 1 13:23:41 2017 -0800 Committer: Impala Public Jenkins <[email protected]> Committed: Fri Mar 3 10:20:07 2017 +0000 ---------------------------------------------------------------------- be/src/exec/parquet-metadata-utils.cc | 3 +- be/src/exec/parquet-metadata-utils.h | 4 +- testdata/ComplexTypesTbl/nonnullable.avsc | 16 ++-- testdata/ComplexTypesTbl/nonnullable.json | 14 ++-- testdata/ComplexTypesTbl/nonnullable.parq | Bin 3190 -> 3186 bytes testdata/ComplexTypesTbl/nullable.avsc | 16 ++-- testdata/ComplexTypesTbl/nullable.json | 80 +++++++++---------- testdata/ComplexTypesTbl/nullable.parq | Bin 3900 -> 3896 bytes 
.../QueryTest/parquet-resolution-by-name.test | 12 --- tests/query_test/test_scanners.py | 2 +- 10 files changed, 71 insertions(+), 76 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/be/src/exec/parquet-metadata-utils.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/parquet-metadata-utils.cc b/be/src/exec/parquet-metadata-utils.cc index 7fb2313..2527219 100644 --- a/be/src/exec/parquet-metadata-utils.cc +++ b/be/src/exec/parquet-metadata-utils.cc @@ -20,6 +20,7 @@ #include <string> #include <sstream> #include <vector> +#include <strings.h> #include <boost/algorithm/string.hpp> #include <gutil/strings/substitute.h> @@ -546,7 +547,7 @@ int ParquetSchemaResolver::FindChildWithName(SchemaNode* node, const string& name) const { int idx; for (idx = 0; idx < node->children.size(); ++idx) { - if (node->children[idx].element->name == name) break; + if (strcasecmp(node->children[idx].element->name.c_str(), name.c_str()) == 0) break; } return idx; } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/be/src/exec/parquet-metadata-utils.h ---------------------------------------------------------------------- diff --git a/be/src/exec/parquet-metadata-utils.h b/be/src/exec/parquet-metadata-utils.h index fe7552e..fdfc463 100644 --- a/be/src/exec/parquet-metadata-utils.h +++ b/be/src/exec/parquet-metadata-utils.h @@ -188,7 +188,9 @@ class ParquetSchemaResolver { int next_idx, SchemaNode* node, bool* missing_field) const; /// Returns the index of 'node's child with 'name', or the number of children if not - /// found. + /// found. The name comparison is case-insensitive because that's how Impala treats + /// db/table/column/field names. If there are several matches with different casing, + /// then the index of the first match is returned. 
int FindChildWithName(SchemaNode* node, const string& name) const; /// The ResolvePathHelper() logic for arrays. http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/testdata/ComplexTypesTbl/nonnullable.avsc ---------------------------------------------------------------------- diff --git a/testdata/ComplexTypesTbl/nonnullable.avsc b/testdata/ComplexTypesTbl/nonnullable.avsc index 5f78dcf..b82c423 100644 --- a/testdata/ComplexTypesTbl/nonnullable.avsc +++ b/testdata/ComplexTypesTbl/nonnullable.avsc @@ -1,26 +1,28 @@ {"type": "record", "namespace": "org.apache.impala", "name": "ComplexTypesTbl", + /* Field names have mixed casing to test the case-insensitive matching of + fields in Parquet files. */ "fields": [ - {"name": "id", "type": "long"}, - {"name": "int_array", "type": {"type": "array", "items": "int"}}, + {"name": "ID", "type": "long"}, + {"name": "Int_Array", "type": {"type": "array", "items": "int"}}, {"name": "int_array_array", "type": {"type": "array", "items": {"type": "array", "items": "int"}}}, - {"name": "int_map", "type": {"type": "map", "values": "int"}}, + {"name": "Int_Map", "type": {"type": "map", "values": "int"}}, {"name": "int_map_array", "type": {"type": "array", "items": {"type": "map", "values": "int"}}}, - {"name": "nested_struct", "type": + {"name": "nested_Struct", "type": {"type": "record", "name": "r1", "fields": [ {"name": "a", "type": "int"}, - {"name": "b", "type": {"type": "array", "items": "int"}}, + {"name": "B", "type": {"type": "array", "items": "int"}}, {"name": "c", "type": {"type": "record", "name": "r2", "fields": [ - {"name": "d", "type": {"type": "array", "items": + {"name": "D", "type": {"type": "array", "items": {"type": "array", "items": {"type": "record", "name": "r3", "fields": [ {"name": "e", "type": "int"}, {"name": "f", "type": "string"}]}}}} ]}}, - {"name": "g", "type": {"type": "map", "values": { + {"name": "G", "type": {"type": "map", "values": { "type": "record", "name": "r4", "fields": [ 
{"name": "h", "type": {"type": "record", "name": "r5", "fields": [ {"name": "i", "type": {"type": "array", "items": "double"}}]}} http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/testdata/ComplexTypesTbl/nonnullable.json ---------------------------------------------------------------------- diff --git a/testdata/ComplexTypesTbl/nonnullable.json b/testdata/ComplexTypesTbl/nonnullable.json index 9c49460..4da4968 100644 --- a/testdata/ComplexTypesTbl/nonnullable.json +++ b/testdata/ComplexTypesTbl/nonnullable.json @@ -1,14 +1,14 @@ [ -{"id": 8, - "int_array": [-1], +{"ID": 8, + "Int_Array": [-1], "int_array_array": [[-1,-2],[]], - "int_map": {"k1": -1}, + "Int_Map": {"k1": -1}, "int_map_array": [{}, {"k1": 1}, {}, {}], - "nested_struct": { + "nested_Struct": { "a": -1, - "b": [-1], + "B": [-1], "c": { - "d": [ + "D": [ [{"e": -1, "f": "nonnullable"}]]}, - "g": {}}} + "G": {}}} ] http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/testdata/ComplexTypesTbl/nonnullable.parq ---------------------------------------------------------------------- diff --git a/testdata/ComplexTypesTbl/nonnullable.parq b/testdata/ComplexTypesTbl/nonnullable.parq index e13d9eb..f4be082 100644 Binary files a/testdata/ComplexTypesTbl/nonnullable.parq and b/testdata/ComplexTypesTbl/nonnullable.parq differ http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/testdata/ComplexTypesTbl/nullable.avsc ---------------------------------------------------------------------- diff --git a/testdata/ComplexTypesTbl/nullable.avsc b/testdata/ComplexTypesTbl/nullable.avsc index 564bea8..b3bd874 100644 --- a/testdata/ComplexTypesTbl/nullable.avsc +++ b/testdata/ComplexTypesTbl/nullable.avsc @@ -1,28 +1,30 @@ {"type": "record", "namespace": "org.apache.impala", "name": "ComplexTypesTbl", + /* Field names have mixed casing to test the case-insensitive matching of + fields in Parquet files. 
*/ "fields": [ {"name": "id", "type": ["null", "long"]}, {"name": "int_array", "type": ["null", {"type": "array", "items": ["null", "int"]}]}, - {"name": "int_array_array", "type": ["null", {"type": "array", "items": + {"name": "int_array_Array", "type": ["null", {"type": "array", "items": ["null", {"type": "array", "items": ["null", "int"]}]}]}, {"name": "int_map", "type": ["null", {"type": "map", "values": ["null", "int"]}]}, - {"name": "int_map_array", "type": ["null", {"type": "array", "items": + {"name": "int_Map_Array", "type": ["null", {"type": "array", "items": ["null", {"type": "map", "values": ["null", "int"]}]}]}, {"name": "nested_struct", "type": ["null", {"type": "record", "name": "r1", "fields": [ - {"name": "a", "type": ["null", "int"]}, + {"name": "A", "type": ["null", "int"]}, {"name": "b", "type": ["null", {"type": "array", "items": ["null", "int"]}]}, - {"name": "c", "type": ["null", {"type": "record", "name": "r2", "fields": [ + {"name": "C", "type": ["null", {"type": "record", "name": "r2", "fields": [ {"name": "d", "type": ["null", {"type": "array", "items": ["null", {"type": "array", "items": ["null", {"type": "record", "name": "r3", "fields": [ - {"name": "e", "type": ["null", "int"]}, - {"name": "f", "type": ["null", "string"]}]}]}]}]} + {"name": "E", "type": ["null", "int"]}, + {"name": "F", "type": ["null", "string"]}]}]}]}]} ]}]}, {"name": "g", "type": ["null", {"type": "map", "values": ["null", {"type": "record", "name": "r4", "fields": [ - {"name": "h", "type": + {"name": "H", "type": ["null", {"type": "record", "name": "r5", "fields": [ {"name": "i", "type": ["null", {"type": "array", "items": ["null", "double"]}]}]}]}]}]}]} http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/testdata/ComplexTypesTbl/nullable.json ---------------------------------------------------------------------- diff --git a/testdata/ComplexTypesTbl/nullable.json b/testdata/ComplexTypesTbl/nullable.json index 251b522..6f49c20 100644 --- 
a/testdata/ComplexTypesTbl/nullable.json +++ b/testdata/ComplexTypesTbl/nullable.json @@ -1,86 +1,86 @@ [ {"id": 1, "int_array": [1,2,3], - "int_array_array": [[1,2],[3,4]], + "int_array_Array": [[1,2],[3,4]], "int_map": {"k1": 1, "k2": 100}, - "int_map_array": [{"k1": 1}], + "int_Map_Array": [{"k1": 1}], "nested_struct": { - "a": 1, + "A": 1, "b": [1], - "c": { + "C": { "d": [ - [{"e": 10, "f": "aaa"}, - {"e": -10, "f": "bbb"}], - [{"e": 11, "f": "c"}]]}, - "g": {"foo": {"h": {"i": [1.1]}}}}}, + [{"E": 10, "F": "aaa"}, + {"E": -10, "F": "bbb"}], + [{"E": 11, "F": "c"}]]}, + "g": {"foo": {"H": {"i": [1.1]}}}}}, {"id": 2, "int_array": [null,1,2,null,3,null], - "int_array_array": [[null,1,2,null],[3,null,4],[], null], + "int_array_Array": [[null,1,2,null],[3,null,4],[], null], "int_map": {"k1": 2, "k2": null}, - "int_map_array": [{"k3": null, "k1": 1}, null, {}], + "int_Map_Array": [{"k3": null, "k1": 1}, null, {}], "nested_struct": { - "a": null, + "A": null, "b": [null], - "c": {"d": [ - [{"e": null, "f": null}, - {"e": 10, "f": "aaa"}, - {"e": null, "f": null}, - {"e": -10, "f": "bbb"}, - {"e": null, "f": null}], - [{"e": 11, "f": "c"}, + "C": {"d": [ + [{"E": null, "F": null}, + {"E": 10, "F": "aaa"}, + {"E": null, "F": null}, + {"E": -10, "F": "bbb"}, + {"E": null, "F": null}], + [{"E": 11, "F": "c"}, null], [], null]}, "g": { - "g1": {"h": {"i": [2.2, null]}}, - "g2": {"h": {"i": []}}, + "g1": {"H": {"i": [2.2, null]}}, + "g2": {"H": {"i": []}}, "g3": null, - "g4": {"h": {"i": null}}, - "g5": {"h": null}}}}, + "g4": {"H": {"i": null}}, + "g5": {"H": null}}}}, {"id": 3, "int_array": [], - "int_array_array": [null], + "int_array_Array": [null], "int_map": {}, - "int_map_array": [null, null], + "int_Map_Array": [null, null], "nested_struct": { - "a": null, + "A": null, "b": null, - "c": {"d": []}, + "C": {"d": []}, "g": {}}}, {"id": 4, "int_array": null, - "int_array_array": [], + "int_array_Array": [], "int_map": {}, - "int_map_array": [], + "int_Map_Array": [], 
"nested_struct": { - "a": null, + "A": null, "b": null, - "c": {"d": null}, + "C": {"d": null}, "g": null}}, {"id": 5, "int_array": null, - "int_array_array": null, + "int_array_Array": null, "int_map": {}, "nested_struct": { - "a": null, + "A": null, "b": null, - "c": null, - "g": {"foo": {"h": {"i": [2.2, 3.3]}}}}}, + "C": null, + "g": {"foo": {"H": {"i": [2.2, 3.3]}}}}}, {"id": 6, "int_array": null, - "int_array_array": null, + "int_array_Array": null, "int_map": null, - "int_map_array": null, + "int_Map_Array": null, "nested_struct": null}, {"id": 7, "int_array": null, - "int_array_array": [null,[5,6]], + "int_array_Array": [null,[5,6]], "int_map": {"k1": null, "k3": null}, - "int_map_array": null, + "int_Map_Array": null, "nested_struct": { - "a": 7, + "A": 7, "b": [2,3,null], - "c": {"d": [ + "C": {"d": [ [], [null], null]}, http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/testdata/ComplexTypesTbl/nullable.parq ---------------------------------------------------------------------- diff --git a/testdata/ComplexTypesTbl/nullable.parq b/testdata/ComplexTypesTbl/nullable.parq index 01955a6..2c72f52 100644 Binary files a/testdata/ComplexTypesTbl/nullable.parq and b/testdata/ComplexTypesTbl/nullable.parq differ http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/testdata/workloads/functional-query/queries/QueryTest/parquet-resolution-by-name.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-resolution-by-name.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-resolution-by-name.test index 2546e9c..45c739e 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/parquet-resolution-by-name.test +++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-resolution-by-name.test @@ -1,7 +1,6 @@ ==== ---- QUERY # Create a table and populate with data file -drop table if exists 
resolution_by_name_test; create table resolution_by_name_test stored as parquet as select * from functional_parquet.tinytable; select a, b from resolution_by_name_test; @@ -69,11 +68,7 @@ string,string 'eeeeeeee','f' ==== ---- QUERY -drop table resolution_by_name_test; -==== ----- QUERY # Test nested types resolution -drop table if exists nested_resolution_by_name_test; create table nested_resolution_by_name_test like functional_parquet.complextypestbl; ==== ---- SHELL @@ -191,11 +186,7 @@ string 'NULL' ==== ---- QUERY -drop table nested_resolution_by_name_test; -==== ----- QUERY # Test switched key/value map fields -drop table if exists switched_map_fields_resolution_test; create table switched_map_fields_resolution_test (int_map map<string,int>) stored as parquet; ==== @@ -227,9 +218,6 @@ switched_map.parq' has an incompatible Parquet schema for column required int32 value [i:0 d:1 r:1] ==== ---- QUERY -drop table switched_map_fields_resolution_test -==== ----- QUERY # Check that we handle bad options gracefully set parquet_fallback_schema_resolution="FOO" ---- CATCH http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/34353218/tests/query_test/test_scanners.py ---------------------------------------------------------------------- diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py index de4f826..c286c3a 100644 --- a/tests/query_test/test_scanners.py +++ b/tests/query_test/test_scanners.py @@ -507,7 +507,7 @@ class TestParquet(ImpalaTestSuite): assert d_schema_elt.converted_type == None @SkipIfOldAggsJoins.nested_types - def test_resolution_by_name(self, unique_database, vector): + def test_resolution_by_name(self, vector, unique_database): self.run_test_case('QueryTest/parquet-resolution-by-name', vector, use_db=unique_database)
