Repository: impala
Updated Branches:
  refs/heads/master 2e6034786 -> 95f166630


IMPALA-6077: remove Parquet BIT_PACKED def level support

The encoding was added in an early version of the Parquet
spec and deprecated even in the Parquet 1.0 spec.

Parquet-MR switched to generating RLE at the same time as
the spec changed in mid-2013. Impala always wrote RLE:
see commit 6e293090e60aea300f9e83db67f56a5efd07c35c.

The Impala implementation of BIT_PACKED was never correct
because it implemented little endian bit unpacking instead of
the big endian unpacking required by the spec for levels.

Testing:
Updated tests to reflect expected behaviour for supported
and unsupported def level encodings.

Cherry-picks: not for 2.x.

Change-Id: I12c75b7f162dd7de8e26cf31be142b692e3624ae
Reviewed-on: http://gerrit.cloudera.org:8080/9241
Reviewed-by: Tim Armstrong <tarmstr...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/95f16663
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/95f16663
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/95f16663

Branch: refs/heads/master
Commit: 95f16663092af3ffa6dddf745973c7065c164436
Parents: 2e60347
Author: Tim Armstrong <tarmstr...@cloudera.com>
Authored: Wed Feb 7 10:06:58 2018 -0800
Committer: Impala Public Jenkins <impala-public-jenk...@gerrit.cloudera.org>
Committed: Mon Feb 12 21:59:37 2018 +0000

----------------------------------------------------------------------
 be/src/exec/parquet-column-readers.cc           | 20 ++--------
 be/src/exec/parquet-column-readers.h            |  6 +--
 common/thrift/generate_error_codes.py           |  6 ++-
 .../queries/QueryTest/parquet-def-levels.test   | 41 ++++++++++++++++++--
 tests/query_test/test_scanners.py               | 12 +++---
 5 files changed, 53 insertions(+), 32 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/95f16663/be/src/exec/parquet-column-readers.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/parquet-column-readers.cc 
b/be/src/exec/parquet-column-readers.cc
index 099fdce..7cf89ba 100644
--- a/be/src/exec/parquet-column-readers.cc
+++ b/be/src/exec/parquet-column-readers.cc
@@ -47,9 +47,6 @@ DEFINE_bool(convert_legacy_hive_parquet_utc_timestamps, false,
     "When true, TIMESTAMPs read from files written by Parquet-MR (used by 
Hive) will "
     "be converted from UTC to local time. Writes are unaffected.");
 
-// Throttle deprecation warnings to - only print warning with this frequency.
-static const int BITPACKED_DEPRECATION_WARNING_FREQUENCY = 100;
-
 // Max data page header size in bytes. This is an estimate and only needs to 
be an upper
 // bound. It is theoretically possible to have a page header of any size due 
to string
 // value statistics, but in practice we'll have trouble reading string values 
this large.
@@ -104,13 +101,7 @@ Status ParquetLevelDecoder::Init(const string& filename,
       break;
     }
     case parquet::Encoding::BIT_PACKED:
-      num_bytes = BitUtil::Ceil(num_buffered_values, 8);
-      bit_reader_.Reset(*data, num_bytes);
-      LOG_EVERY_N(WARNING, BITPACKED_DEPRECATION_WARNING_FREQUENCY)
-          << filename << " uses deprecated Parquet BIT_PACKED encoding for rep 
or def "
-          << "levels. This will be removed in the future - see IMPALA-6077. 
Warning "
-          << "every " << BITPACKED_DEPRECATION_WARNING_FREQUENCY << " 
occurrences.";
-      break;
+      return Status(TErrorCode::PARQUET_BIT_PACKED_LEVELS, filename);
     default: {
       stringstream ss;
       ss << "Unsupported encoding: " << encoding;
@@ -189,13 +180,8 @@ bool ParquetLevelDecoder::FillCache(int batch_size, int* 
num_cached_levels) {
     *num_cached_levels = batch_size;
     return true;
   }
-  if (encoding_ == parquet::Encoding::RLE) {
-    return FillCacheRle(batch_size, num_cached_levels);
-  } else {
-    DCHECK_EQ(encoding_, parquet::Encoding::BIT_PACKED);
-    *num_cached_levels = bit_reader_.UnpackBatch(1, batch_size, 
cached_levels_);
-    return true;
-  }
+  DCHECK_EQ(encoding_, parquet::Encoding::RLE);
+  return FillCacheRle(batch_size, num_cached_levels);
 }
 
 bool ParquetLevelDecoder::FillCacheRle(int batch_size, int* num_cached_levels) 
{

http://git-wip-us.apache.org/repos/asf/impala/blob/95f16663/be/src/exec/parquet-column-readers.h
----------------------------------------------------------------------
diff --git a/be/src/exec/parquet-column-readers.h 
b/be/src/exec/parquet-column-readers.h
index 3a8ad70..86ca239 100644
--- a/be/src/exec/parquet-column-readers.h
+++ b/be/src/exec/parquet-column-readers.h
@@ -99,9 +99,6 @@ class ParquetLevelDecoder {
   /// RLE decoder, used if 'encoding_' is RLE.
   RleBatchDecoder<uint8_t> rle_decoder_;
 
-  /// Bit unpacker, used if 'encoding_' is BIT_PACKED.
-  BatchedBitReader bit_reader_;
-
   /// Buffer for a batch of levels. The memory is allocated and owned by a 
pool passed
   /// in Init().
   uint8_t* cached_levels_ = nullptr;
@@ -112,8 +109,7 @@ class ParquetLevelDecoder {
   /// Current index into cached_levels_.
   int cached_level_idx_ = 0;
 
-  /// The parquet encoding used for the levels. Usually RLE but the deprecated 
BIT_PACKED
-  /// encoding is also allowed.
+  /// The parquet encoding used for the levels. Only RLE is supported for now.
   parquet::Encoding::type encoding_ = parquet::Encoding::PLAIN;
 
   /// For error checking and reporting.

http://git-wip-us.apache.org/repos/asf/impala/blob/95f16663/common/thrift/generate_error_codes.py
----------------------------------------------------------------------
diff --git a/common/thrift/generate_error_codes.py 
b/common/thrift/generate_error_codes.py
index bdeb1ed..c0f6894 100755
--- a/common/thrift/generate_error_codes.py
+++ b/common/thrift/generate_error_codes.py
@@ -347,7 +347,11 @@ error_codes = (
 
   ("SASL_APP_NAME_MISMATCH", 114,
    "InitAuth() called multiple times with different names. Was called with $0. 
"
-   "Now using $1.")
+   "Now using $1."),
+
+  ("PARQUET_BIT_PACKED_LEVELS", 115,
+      "Can not read Parquet file $0 with deprecated BIT_PACKED encoding for 
rep or "
+      "def levels. Support was removed in Impala 3.0 - see IMPALA-6077."),
 )
 
 import sys

http://git-wip-us.apache.org/repos/asf/impala/blob/95f16663/testdata/workloads/functional-query/queries/QueryTest/parquet-def-levels.test
----------------------------------------------------------------------
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/parquet-def-levels.test 
b/testdata/workloads/functional-query/queries/QueryTest/parquet-def-levels.test
index d48c333..e55fc4d 100644
--- 
a/testdata/workloads/functional-query/queries/QueryTest/parquet-def-levels.test
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/parquet-def-levels.test
@@ -1,19 +1,20 @@
 ====
 ---- QUERY
+# Test the default encoding - RLE.
 # Verify that total counts of non-null values are correct.
 select count(id), count(tinyint_col), count(smallint_col), count(int_col),
   count(bigint_col), count(float_col), count(double_col), 
count(date_string_col),
   count(string_col), count(timestamp_col), count(year), count(month), 
count(day)
-from alltypesagg
+from functional_parquet.alltypesagg
 ---- TYPES
 
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
 ---- RESULTS
 11000,9000,10800,10980,10980,10980,10980,11000,11000,11000,11000,11000,10000
 ====
 ---- QUERY
-# Spot-check a subset of values.
+# Test the default encoding - RLE. Spot-check a subset of values.
 select *
-from alltypesagg
+from functional_parquet.alltypesagg
 where year = 2010 and month = 1 and int_col is null or int_col % 1000 = 77
 order by id, year, month, day
 ---- TYPES
@@ -50,3 +51,37 @@ 
INT,BOOLEAN,TINYINT,SMALLINT,INT,BIGINT,FLOAT,DOUBLE,STRING,STRING,TIMESTAMP,INT
 9000,true,NULL,NULL,NULL,NULL,NULL,NULL,'01/10/10','0',2010-01-10 
00:00:00,2010,1,NULL
 
9077,false,7,77,77,770,84.69999694824219,777.6999999999999,'01/10/10','77',2010-01-10
 01:17:29.260000000,2010,1,10
 ====
+---- QUERY
+# IMPALA-6077: unsupported BIT_PACKED encoding fails when materializing 
columns.
+select id
+from alltypesagg_bitpacked
+---- CATCH
+deprecated BIT_PACKED encoding for rep or def levels.
+====
+---- QUERY
+# IMPALA-6077: do not need to decode BIT_PACKED encoding when not 
materializing columns.
+select count(*)
+from alltypesagg_bitpacked
+---- RESULTS
+11000
+---- TYPES
+BIGINT
+====
+---- QUERY
+# IMPALA-6077: in future we could return results for this query from metadata, 
in which
+# case it should either work or fail gracefully. For now it still requires 
materialising
+# levels.
+select count(id)
+from alltypesagg_bitpacked
+---- CATCH
+deprecated BIT_PACKED encoding for rep or def levels.
+====
+---- QUERY
+# IMPALA-6077: in future we could return results for this query from metadata, 
in which
+# case it should either work or fail gracefully. For now it still requires 
materialising
+# levels.
+select min(int_col)
+from alltypesagg_bitpacked
+---- CATCH
+deprecated BIT_PACKED encoding for rep or def levels.
+====

http://git-wip-us.apache.org/repos/asf/impala/blob/95f16663/tests/query_test/test_scanners.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_scanners.py 
b/tests/query_test/test_scanners.py
index 9b252da..97e72a9 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -401,19 +401,19 @@ class TestParquet(ImpalaTestSuite):
     self.run_test_case('QueryTest/parquet-bad-compressed-dict-page-size', 
vector,
         unique_database)
 
-  def test_bitpacked_def_levels(self, vector, unique_database):
-    """Test that Impala can read a Parquet file with the deprecated bit-packed 
def
-       level encoding."""
-    self.client.execute(("""CREATE TABLE {0}.alltypesagg (
+  def test_def_levels(self, vector, unique_database):
+    """Test that Impala behaves as expected when decoding def levels with 
different
+       encodings - RLE, BIT_PACKED, etc."""
+    self.client.execute(("""CREATE TABLE {0}.alltypesagg_bitpacked (
           id INT, bool_col BOOLEAN, tinyint_col TINYINT, smallint_col SMALLINT,
           int_col INT, bigint_col BIGINT, float_col FLOAT, double_col DOUBLE,
           date_string_col STRING, string_col STRING, timestamp_col TIMESTAMP,
           year INT, month INT, day INT) STORED AS 
PARQUET""").format(unique_database))
     alltypesagg_loc = get_fs_path(
-        "/test-warehouse/{0}.db/{1}".format(unique_database, "alltypesagg"))
+        "/test-warehouse/{0}.db/{1}".format(unique_database, 
"alltypesagg_bitpacked"))
     check_call(['hdfs', 'dfs', '-copyFromLocal', os.environ['IMPALA_HOME'] +
         "/testdata/data/alltypes_agg_bitpacked_def_levels.parquet", 
alltypesagg_loc])
-    self.client.execute("refresh {0}.alltypesagg".format(unique_database));
+    self.client.execute("refresh 
{0}.alltypesagg_bitpacked".format(unique_database));
 
     self.run_test_case('QueryTest/parquet-def-levels', vector, unique_database)
 

Reply via email to