IMPALA-5008: Fix reading stats for TINYINT and SMALLINT TINYINT and SMALLINT types use 1 and 2 byte slots respectively. However, statistics for the corresponding INT_8 and INT_16 Parquet types are encoded using 4 bytes. When reading back we were missing a conversion to the smaller types, thus overwriting the memory behind them. This was caught by the address sanitizer. The fix is to perform the necessary conversion.
Change-Id: I9b10508db53747e7b08c8bd9a69c763b82135a78 Reviewed-on: http://gerrit.cloudera.org:8080/6226 Reviewed-by: Lars Volker <[email protected]> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/94a8b1a9 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/94a8b1a9 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/94a8b1a9 Branch: refs/heads/master Commit: 94a8b1a9c1689d0302291ce6faf933f43839dc23 Parents: 47f7de9 Author: Lars Volker <[email protected]> Authored: Thu Mar 2 09:31:26 2017 -0800 Committer: Impala Public Jenkins <[email protected]> Committed: Fri Mar 3 08:26:18 2017 +0000 ---------------------------------------------------------------------- be/src/exec/parquet-column-stats.cc | 30 ++++++++++++++++++++++++++---- be/src/runtime/coordinator.cc | 1 - 2 files changed, 26 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/94a8b1a9/be/src/exec/parquet-column-stats.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/parquet-column-stats.cc b/be/src/exec/parquet-column-stats.cc index 202dc02..5985017 100644 --- a/be/src/exec/parquet-column-stats.cc +++ b/be/src/exec/parquet-column-stats.cc @@ -17,6 +17,8 @@ #include "parquet-column-stats.inline.h" +#include <limits> + namespace impala { bool ColumnStatsBase::ReadFromThrift(const parquet::Statistics& thrift_stats, @@ -24,10 +26,30 @@ bool ColumnStatsBase::ReadFromThrift(const parquet::Statistics& thrift_stats, switch (col_type.type) { case TYPE_BOOLEAN: return ColumnStats<bool>::ReadFromThrift(thrift_stats, stats_field, slot); - case TYPE_TINYINT: - return ColumnStats<int32_t>::ReadFromThrift(thrift_stats, stats_field, slot); - case TYPE_SMALLINT: - return ColumnStats<int32_t>::ReadFromThrift(thrift_stats, stats_field, slot); + case TYPE_TINYINT: { + // parquet::Statistics encodes INT_8 values using 4 bytes. + int32_t col_stats; + bool ret = ColumnStats<int32_t>::ReadFromThrift(thrift_stats, stats_field, + &col_stats); + if (!ret || col_stats < std::numeric_limits<int8_t>::min() || + col_stats > std::numeric_limits<int8_t>::max()) { + return false; + } + *static_cast<int8_t*>(slot) = col_stats; + return true; + } + case TYPE_SMALLINT: { + // parquet::Statistics encodes INT_16 values using 4 bytes. + int32_t col_stats; + bool ret = ColumnStats<int32_t>::ReadFromThrift(thrift_stats, stats_field, + &col_stats); + if (!ret || col_stats < std::numeric_limits<int16_t>::min() || + col_stats > std::numeric_limits<int16_t>::max()) { + return false; + } + *static_cast<int16_t*>(slot) = col_stats; + return true; + } case TYPE_INT: return ColumnStats<int32_t>::ReadFromThrift(thrift_stats, stats_field, slot); case TYPE_BIGINT: http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/94a8b1a9/be/src/runtime/coordinator.cc ---------------------------------------------------------------------- diff --git a/be/src/runtime/coordinator.cc b/be/src/runtime/coordinator.cc index f8e3dfd..3fcdb54 100644 --- a/be/src/runtime/coordinator.cc +++ b/be/src/runtime/coordinator.cc @@ -17,7 +17,6 @@ #include "runtime/coordinator.h" -#include <limits> #include <map> #include <memory> #include <thrift/protocol/TDebugProtocol.h>
