IMPALA-4826: Fix error during a scan on repeated root schema in Parquet. Having the repetition level set to REPEATED on the root schema caused a scan to fail with an error when Impala tried to parse that table.
As a solution, the 'REPEATED' repetition level is ignored when the root schema is processed. The reasoning behind this is that the Parquet format description says that the repetition level of the root schema should not be set to REPEATED anyway, so it's safe to ignore it in case it is set to this value for some reason. Change-Id: I7ea84589e1d122ad9d43adde46893ec0ecc5f9c4 Reviewed-on: http://gerrit.cloudera.org:8080/7870 Reviewed-by: Dan Hecht <[email protected]> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/545eab6d Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/545eab6d Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/545eab6d Branch: refs/heads/master Commit: 545eab6d6202ca3968279d14fad28bd2a6d566f6 Parents: 387bde0 Author: Gabor Kaszab <[email protected]> Authored: Mon Aug 28 17:03:31 2017 +0200 Committer: Impala Public Jenkins <[email protected]> Committed: Wed Sep 6 20:07:56 2017 +0000 ---------------------------------------------------------------------- be/src/exec/parquet-metadata-utils.cc | 4 +++- testdata/data/README | 10 ++++++++++ testdata/data/repeated_root_schema.parquet | Bin 0 -> 7598 bytes tests/query_test/test_scanners.py | 14 ++++++++++++++ 4 files changed, 27 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/545eab6d/be/src/exec/parquet-metadata-utils.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/parquet-metadata-utils.cc b/be/src/exec/parquet-metadata-utils.cc index 8ec0abd..fc34eda 100644 --- a/be/src/exec/parquet-metadata-utils.cc +++ b/be/src/exec/parquet-metadata-utils.cc @@ -339,6 +339,7 @@ Status ParquetSchemaResolver::CreateSchemaTree( return Status(Substitute("File '$0' corrupt: could not reconstruct schema tree from 
" "flattened schema in file metadata", filename_)); } + bool is_root_schema = (*idx == 0); node->element = &schema[*idx]; ++(*idx); @@ -363,7 +364,8 @@ Status ParquetSchemaResolver::CreateSchemaTree( if (node->element->repetition_type == parquet::FieldRepetitionType::OPTIONAL) { ++max_def_level; - } else if (node->element->repetition_type == parquet::FieldRepetitionType::REPEATED) { + } else if (node->element->repetition_type == parquet::FieldRepetitionType::REPEATED && + !is_root_schema /*PARQUET-843*/) { ++max_rep_level; // Repeated fields add a definition level. This is used to distinguish between an // empty list and a list with an item in it. http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/545eab6d/testdata/data/README ---------------------------------------------------------------------- diff --git a/testdata/data/README b/testdata/data/README index 4066a8d..231a901 100644 --- a/testdata/data/README +++ b/testdata/data/README @@ -106,3 +106,13 @@ deprecated_statistics.parquet: Generated with with hive shell, which uses parquet-mr version 1.5.0-cdh5.12.0-SNAPSHOT Contains a copy of the data in functional.alltypessmall with statistics that use the old 'min'/'max' fields. + +repeated_root_schema.parquet: +Generated by hacking Impala's Parquet writer. +Created to reproduce IMPALA-4826. Contains a table of 300 rows where the +repetition level of the root schema is set to REPEATED. +Reproduction steps: +1: Extend HdfsParquetTableWriter::CreateSchema with the following line: + file_metadata_.schema[0].__set_repetition_type(FieldRepetitionType::REQUIRED); +2: Run test_compute_stats and grab the created Parquet file for + alltypes_parquet table. 
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/545eab6d/testdata/data/repeated_root_schema.parquet ---------------------------------------------------------------------- diff --git a/testdata/data/repeated_root_schema.parquet b/testdata/data/repeated_root_schema.parquet new file mode 100755 index 0000000..4e0fb6e Binary files /dev/null and b/testdata/data/repeated_root_schema.parquet differ http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/545eab6d/tests/query_test/test_scanners.py ---------------------------------------------------------------------- diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py index e9fd457..d355081 100644 --- a/tests/query_test/test_scanners.py +++ b/tests/query_test/test_scanners.py @@ -298,6 +298,20 @@ class TestParquet(ImpalaTestSuite): vector.get_value('exec_option')['abort_on_error'] = 1 self.run_test_case('QueryTest/parquet-zero-rows', vector, unique_database) + def test_repeated_root_schema(self, vector, unique_database): + """IMPALA-4826: Tests that running a scan on a schema where the root schema's + repetetion level is set to REPEATED succeeds without errors.""" + self.client.execute("create table %s.repeated_root_schema (i int) " + "stored as parquet" % unique_database) + repeated_root_schema_loc = get_fs_path( + "/test-warehouse/%s.db/%s" % (unique_database, "repeated_root_schema")) + check_call(['hdfs', 'dfs', '-copyFromLocal', + os.environ['IMPALA_HOME'] + "/testdata/data/repeated_root_schema.parquet", + repeated_root_schema_loc]) + + result = self.client.execute("select * from %s.repeated_root_schema" % unique_database) + assert len(result.data) == 300 + def test_huge_num_rows(self, vector, unique_database): """IMPALA-5021: Tests that a zero-slot scan on a file with a huge num_rows in the footer succeeds without errors."""
