Repository: impala Updated Branches: refs/heads/master 2e6034786 -> 95f166630
IMPALA-6077: remove Parquet BIT_PACKED def level support The encoding was added in an early version of the Parquet spec and deprecated even in the Parquet 1.0 spec. Parquet-MR switched to generating RLE at the same time as the spec changed in mid-2013. Impala always wrote RLE: see commit 6e293090e60aea300f9e83db67f56a5efd07c35c. The Impala implementation of BIT_PACKED was never correct because it implemented little endian bit unpacking instead of the big endian unpacking required by the spec for levels. Testing: Updated tests to reflect expected behaviour for supported and unsupported def level encodings. Cherry-picks: not for 2.x. Change-Id: I12c75b7f162dd7de8e26cf31be142b692e3624ae Reviewed-on: http://gerrit.cloudera.org:8080/9241 Reviewed-by: Tim Armstrong <tarmstr...@cloudera.com> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/impala/repo Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/95f16663 Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/95f16663 Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/95f16663 Branch: refs/heads/master Commit: 95f16663092af3ffa6dddf745973c7065c164436 Parents: 2e60347 Author: Tim Armstrong <tarmstr...@cloudera.com> Authored: Wed Feb 7 10:06:58 2018 -0800 Committer: Impala Public Jenkins <impala-public-jenk...@gerrit.cloudera.org> Committed: Mon Feb 12 21:59:37 2018 +0000 ---------------------------------------------------------------------- be/src/exec/parquet-column-readers.cc | 20 ++-------- be/src/exec/parquet-column-readers.h | 6 +-- common/thrift/generate_error_codes.py | 6 ++- .../queries/QueryTest/parquet-def-levels.test | 41 ++++++++++++++++++-- tests/query_test/test_scanners.py | 12 +++--- 5 files changed, 53 insertions(+), 32 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/impala/blob/95f16663/be/src/exec/parquet-column-readers.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/parquet-column-readers.cc b/be/src/exec/parquet-column-readers.cc index 099fdce..7cf89ba 100644 --- a/be/src/exec/parquet-column-readers.cc +++ b/be/src/exec/parquet-column-readers.cc @@ -47,9 +47,6 @@ DEFINE_bool(convert_legacy_hive_parquet_utc_timestamps, false, "When true, TIMESTAMPs read from files written by Parquet-MR (used by Hive) will " "be converted from UTC to local time. Writes are unaffected."); -// Throttle deprecation warnings to - only print warning with this frequency. -static const int BITPACKED_DEPRECATION_WARNING_FREQUENCY = 100; - // Max data page header size in bytes. This is an estimate and only needs to be an upper // bound. It is theoretically possible to have a page header of any size due to string // value statistics, but in practice we'll have trouble reading string values this large. @@ -104,13 +101,7 @@ Status ParquetLevelDecoder::Init(const string& filename, break; } case parquet::Encoding::BIT_PACKED: - num_bytes = BitUtil::Ceil(num_buffered_values, 8); - bit_reader_.Reset(*data, num_bytes); - LOG_EVERY_N(WARNING, BITPACKED_DEPRECATION_WARNING_FREQUENCY) - << filename << " uses deprecated Parquet BIT_PACKED encoding for rep or def " - << "levels. This will be removed in the future - see IMPALA-6077. Warning " - << "every " << BITPACKED_DEPRECATION_WARNING_FREQUENCY << " occurrences."; - break; + return Status(TErrorCode::PARQUET_BIT_PACKED_LEVELS, filename); default: { stringstream ss; ss << "Unsupported encoding: " << encoding; @@ -189,13 +180,8 @@ bool ParquetLevelDecoder::FillCache(int batch_size, int* num_cached_levels) { *num_cached_levels = batch_size; return true; } - if (encoding_ == parquet::Encoding::RLE) { - return FillCacheRle(batch_size, num_cached_levels); - } else { - DCHECK_EQ(encoding_, parquet::Encoding::BIT_PACKED); - *num_cached_levels = bit_reader_.UnpackBatch(1, batch_size, cached_levels_); - return true; - } + DCHECK_EQ(encoding_, parquet::Encoding::RLE); + return FillCacheRle(batch_size, num_cached_levels); } bool ParquetLevelDecoder::FillCacheRle(int batch_size, int* num_cached_levels) { http://git-wip-us.apache.org/repos/asf/impala/blob/95f16663/be/src/exec/parquet-column-readers.h ---------------------------------------------------------------------- diff --git a/be/src/exec/parquet-column-readers.h b/be/src/exec/parquet-column-readers.h index 3a8ad70..86ca239 100644 --- a/be/src/exec/parquet-column-readers.h +++ b/be/src/exec/parquet-column-readers.h @@ -99,9 +99,6 @@ class ParquetLevelDecoder { /// RLE decoder, used if 'encoding_' is RLE. RleBatchDecoder<uint8_t> rle_decoder_; - /// Bit unpacker, used if 'encoding_' is BIT_PACKED. - BatchedBitReader bit_reader_; - /// Buffer for a batch of levels. The memory is allocated and owned by a pool passed /// in Init(). uint8_t* cached_levels_ = nullptr; @@ -112,8 +109,7 @@ class ParquetLevelDecoder { /// Current index into cached_levels_. int cached_level_idx_ = 0; - /// The parquet encoding used for the levels. Usually RLE but the deprecated BIT_PACKED - /// encoding is also allowed. + /// The parquet encoding used for the levels. Only RLE is supported for now. parquet::Encoding::type encoding_ = parquet::Encoding::PLAIN; /// For error checking and reporting. http://git-wip-us.apache.org/repos/asf/impala/blob/95f16663/common/thrift/generate_error_codes.py ---------------------------------------------------------------------- diff --git a/common/thrift/generate_error_codes.py b/common/thrift/generate_error_codes.py index bdeb1ed..c0f6894 100755 --- a/common/thrift/generate_error_codes.py +++ b/common/thrift/generate_error_codes.py @@ -347,7 +347,11 @@ error_codes = ( ("SASL_APP_NAME_MISMATCH", 114, "InitAuth() called multiple times with different names. Was called with $0. " - "Now using $1.") + "Now using $1."), + + ("PARQUET_BIT_PACKED_LEVELS", 115, + "Can not read Parquet file $0 with deprecated BIT_PACKED encoding for rep or " + "def levels. Support was removed in Impala 3.0 - see IMPALA-6077."), ) import sys http://git-wip-us.apache.org/repos/asf/impala/blob/95f16663/testdata/workloads/functional-query/queries/QueryTest/parquet-def-levels.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-def-levels.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-def-levels.test index d48c333..e55fc4d 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/parquet-def-levels.test +++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-def-levels.test @@ -1,19 +1,20 @@ ==== ---- QUERY +# Test the default encoding - RLE. # Verify that total counts of non-null values are correct. select count(id), count(tinyint_col), count(smallint_col), count(int_col), count(bigint_col), count(float_col), count(double_col), count(date_string_col), count(string_col), count(timestamp_col), count(year), count(month), count(day) -from alltypesagg +from functional_parquet.alltypesagg ---- TYPES BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT ---- RESULTS 11000,9000,10800,10980,10980,10980,10980,11000,11000,11000,11000,11000,10000 ==== ---- QUERY -# Spot-check a subset of values. +# Test the default encoding - RLE. Spot-check a subset of values. select * -from alltypesagg +from functional_parquet.alltypesagg where year = 2010 and month = 1 and int_col is null or int_col % 1000 = 77 order by id, year, month, day ---- TYPES @@ -50,3 +51,37 @@ INT,BOOLEAN,TINYINT,SMALLINT,INT,BIGINT,FLOAT,DOUBLE,STRING,STRING,TIMESTAMP,INT 9000,true,NULL,NULL,NULL,NULL,NULL,NULL,'01/10/10','0',2010-01-10 00:00:00,2010,1,NULL 9077,false,7,77,77,770,84.69999694824219,777.6999999999999,'01/10/10','77',2010-01-10 01:17:29.260000000,2010,1,10 ==== +---- QUERY +# IMPALA-6077: unsupported BIT_PACKED encoding fails when materializing columns. +select id +from alltypesagg_bitpacked +---- CATCH +deprecated BIT_PACKED encoding for rep or def levels. +==== +---- QUERY +# IMPALA-6077: do not need to decode BIT_PACKED encoding when not materializing columns. +select count(*) +from alltypesagg_bitpacked +---- RESULTS +11000 +---- TYPES +BIGINT +==== +---- QUERY +# IMPALA-6077: in future we could return results for this query from metadata, in which +# case it should either work or fail gracefully. For now it still requires materialising +# levels. +select count(id) +from alltypesagg_bitpacked +---- CATCH +deprecated BIT_PACKED encoding for rep or def levels. +==== +---- QUERY +# IMPALA-6077: in future we could return results for this query from metadata, in which +# case it should either work or fail gracefully. For now it still requires materialising +# levels. +select min(int_col) +from alltypesagg_bitpacked +---- CATCH +deprecated BIT_PACKED encoding for rep or def levels. +==== http://git-wip-us.apache.org/repos/asf/impala/blob/95f16663/tests/query_test/test_scanners.py ---------------------------------------------------------------------- diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py index 9b252da..97e72a9 100644 --- a/tests/query_test/test_scanners.py +++ b/tests/query_test/test_scanners.py @@ -401,19 +401,19 @@ class TestParquet(ImpalaTestSuite): self.run_test_case('QueryTest/parquet-bad-compressed-dict-page-size', vector, unique_database) - def test_bitpacked_def_levels(self, vector, unique_database): - """Test that Impala can read a Parquet file with the deprecated bit-packed def - level encoding.""" - self.client.execute(("""CREATE TABLE {0}.alltypesagg ( + def test_def_levels(self, vector, unique_database): + """Test that Impala behaves as expected when decoding def levels with different + encodings - RLE, BIT_PACKED, etc.""" + self.client.execute(("""CREATE TABLE {0}.alltypesagg_bitpacked ( id INT, bool_col BOOLEAN, tinyint_col TINYINT, smallint_col SMALLINT, int_col INT, bigint_col BIGINT, float_col FLOAT, double_col DOUBLE, date_string_col STRING, string_col STRING, timestamp_col TIMESTAMP, year INT, month INT, day INT) STORED AS PARQUET""").format(unique_database)) alltypesagg_loc = get_fs_path( - "/test-warehouse/{0}.db/{1}".format(unique_database, "alltypesagg")) + "/test-warehouse/{0}.db/{1}".format(unique_database, "alltypesagg_bitpacked")) check_call(['hdfs', 'dfs', '-copyFromLocal', os.environ['IMPALA_HOME'] + "/testdata/data/alltypes_agg_bitpacked_def_levels.parquet", alltypesagg_loc]) - self.client.execute("refresh {0}.alltypesagg".format(unique_database)); + self.client.execute("refresh {0}.alltypesagg_bitpacked".format(unique_database)); self.run_test_case('QueryTest/parquet-def-levels', vector, unique_database)