IMPALA-5347: Parquet scanner microoptimizations

A mix of microoptimizations that profiling the Parquet scanner revealed. All
lead to some measurable improvement and add up to significant speedups for
various scans.

* Add ALWAYS_INLINE to hot functions that GCC was mistakenly not inlining in
  all cases (see the sketch after this list).
* Apply __restrict__ in a few places so the compiler knows that it is safe to
  cache values accessed via those pointers.
* memset() the whole batch instead of just the null indicators in cases where
  it is almost certainly cheaper.
* Avoid unnecessary initialization of the often-unused 'val' in ReadSlot().
* Shave a few instructions off the (still very expensive) bit unpacking and
  dict decoding logic.
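As a concrete illustration of the first bullet, a minimal, hypothetical sketch
(BitCursor and GetBits() are made up for illustration and are not part of this
patch):

    #include <cstdint>

    #define ALWAYS_INLINE __attribute__((always_inline))

    struct BitCursor {
      uint64_t word_ = 0;
      int pos_ = 0;

      // Defined in-class, so already implicitly 'inline'; ALWAYS_INLINE
      // additionally overrides GCC's cost heuristics, which sometimes keep
      // small helpers like this out of line inside large, hot loop bodies.
      ALWAYS_INLINE uint64_t GetBits(int n) {  // assumes 0 < n < 64
        uint64_t v = (word_ >> pos_) & ((1ULL << n) - 1);
        pos_ += n;
        return v;
      }
    };

    uint64_t SumValues(BitCursor* c, int count, int width) {
      uint64_t sum = 0;
      // A hot loop like this is where per-value call overhead shows up.
      for (int i = 0; i < count; ++i) sum += c->GetBits(width);
      return sum;
    }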
Performance: Some local TPC-H and targeted-perf benchmarks showed average
speedups of ~5%. I did some benchmarks targeted at measuring column
materialisation performance using a version of lineitem with duplicated data
to make it bigger. These queries all got significantly faster.

Dict-encoded DECIMAL: 2.23s -> 1.23s
  SELECT count(*) FROM biglineitem WHERE l_quantity > 49

Plain-encoded BIGINT: 2.33s -> 1.62s
  SELECT count(*) FROM biglineitem WHERE l_orderkey != 10

Dict-encoded STRING: 2.73s -> 1.72s
  SELECT count(*) FROM biglineitem WHERE l_returnflag = 'A'

Plain-encoded STRING: 7.07s -> 6.08s (most time spent in Snappy)
  SELECT count(*) FROM biglineitem WHERE length(l_comment) > 37

Multiple columns: 5.15s -> 3.74s
  SELECT count(*) FROM biglineitem
  WHERE l_quantity > 49 and l_partkey != 199 and l_tax < 100

Change-Id: I49ec523a65542fdbabd53fbcc4a8901d769e5cd5
Reviewed-on: http://gerrit.cloudera.org:8080/6950
Reviewed-by: Tim Armstrong <[email protected]>
Tested-by: Impala Public Jenkins

Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/b2782774
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/b2782774
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/b2782774

Branch: refs/heads/master
Commit: b2782774491b5e280c4b51e39c393810deaa1e22
Parents: 25290a0
Author: Tim Armstrong <[email protected]>
Authored: Sun May 21 15:04:21 2017 -0700
Committer: Impala Public Jenkins <[email protected]>
Committed: Thu May 25 11:38:06 2017 +0000

----------------------------------------------------------------------
 be/src/common/compiler-util.h         | 10 +++++++
 be/src/exec/hdfs-parquet-scanner.cc   | 15 ++++-------
 be/src/exec/hdfs-scanner.h            | 42 ++++++++++++++++++++++++++---
 be/src/exec/parquet-column-readers.cc | 43 +++++++++++++++++++-----------
 be/src/runtime/tuple.h                |  7 +++--
 be/src/util/bit-stream-utils.inline.h | 17 +++++++++---
 be/src/util/bit-util.h                |  7 +++--
 be/src/util/dict-encoding.h           | 14 ++++++----
 be/src/util/rle-encoding.h            |  6 +++--
 9 files changed, 114 insertions(+), 47 deletions(-)
----------------------------------------------------------------------
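Before the diffs, a short sketch of the aliasing problem that the new RESTRICT
macro addresses (hypothetical struct, not from the patch). Without
__restrict__, GCC must assume a store through 'out' may modify 'count_', so it
reloads the member from memory on every iteration; with the qualifier it can
keep it in a register:

    struct Reader {
      int count_;

      // Conservative aliasing: *out may overlap this->count_, so count_ is
      // re-read from memory after every store.
      void Fill(int* out, int n) {
        for (int i = 0; i < n; ++i) out[i] = count_;
      }

      // __restrict__ promises that 'out' does not alias anything else
      // accessed here, so count_ can be cached in a register for the loop.
      void FillRestrict(int* __restrict__ out, int n) {
        for (int i = 0; i < n; ++i) out[i] = count_;
      }
    };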
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2782774/be/src/common/compiler-util.h
----------------------------------------------------------------------
diff --git a/be/src/common/compiler-util.h b/be/src/common/compiler-util.h
index 7c0379b..5132eb0 100644
--- a/be/src/common/compiler-util.h
+++ b/be/src/common/compiler-util.h
@@ -43,6 +43,16 @@
 /// decision, e.g. not inlining a small function on a hot path.
 #define ALWAYS_INLINE __attribute__((always_inline))
 
+/// Clang is pedantic about __restrict__ (e.g. it never allows calling a
+/// non-__restrict__ member function from a __restrict__-ed member function) and
+/// has some apparent bugs (e.g. it can't convert a __restrict__ reference to a
+/// const& __restrict__ reference). Just disable it.
+#ifdef __clang__
+#define RESTRICT
+#else
+#define RESTRICT __restrict__
+#endif
+
 namespace impala {
 
 /// The size of an L1 cache line in bytes on x86-64.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2782774/be/src/exec/hdfs-parquet-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-scanner.cc b/be/src/exec/hdfs-parquet-scanner.cc
index 0119efd..4f90cb1 100644
--- a/be/src/exec/hdfs-parquet-scanner.cc
+++ b/be/src/exec/hdfs-parquet-scanner.cc
@@ -930,12 +930,7 @@ Status HdfsParquetScanner::AssembleRows(
   while (!column_readers[0]->RowGroupAtEnd()) {
     // Start a new scratch batch.
     RETURN_IF_ERROR(scratch_batch_->Reset(state_));
-    int scratch_capacity = scratch_batch_->capacity;
-
-    // Initialize tuple memory.
-    for (int i = 0; i < scratch_capacity; ++i) {
-      InitTuple(template_tuple_, scratch_batch_->GetTuple(i));
-    }
+    InitTupleBuffer(template_tuple_, scratch_batch_->tuple_mem, scratch_batch_->capacity);
 
     // Materialize the top-level slots into the scratch batch column-by-column.
     int last_num_tuples = -1;
@@ -944,12 +939,12 @@ Status HdfsParquetScanner::AssembleRows(
     for (int c = 0; c < num_col_readers; ++c) {
       ParquetColumnReader* col_reader = column_readers[c];
       if (col_reader->max_rep_level() > 0) {
-        continue_execution = col_reader->ReadValueBatch(
-            &scratch_batch_->aux_mem_pool, scratch_capacity,
-            tuple_byte_size_, scratch_batch_->tuple_mem, &scratch_batch_->num_tuples);
+        continue_execution = col_reader->ReadValueBatch(&scratch_batch_->aux_mem_pool,
+            scratch_batch_->capacity, tuple_byte_size_, scratch_batch_->tuple_mem,
+            &scratch_batch_->num_tuples);
       } else {
         continue_execution = col_reader->ReadNonRepeatedValueBatch(
-            &scratch_batch_->aux_mem_pool, scratch_capacity, tuple_byte_size_,
+            &scratch_batch_->aux_mem_pool, scratch_batch_->capacity, tuple_byte_size_,
             scratch_batch_->tuple_mem, &scratch_batch_->num_tuples);
       }
       // Check that all column readers populated the same number of values.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2782774/be/src/exec/hdfs-scanner.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-scanner.h b/be/src/exec/hdfs-scanner.h
index 24a3c4f..035b41c 100644
--- a/be/src/exec/hdfs-scanner.h
+++ b/be/src/exec/hdfs-scanner.h
@@ -452,17 +452,51 @@ class HdfsScanner {
   /// Initialize a tuple.
   /// TODO: only copy over non-null slots.
-  /// TODO: InitTuple is called frequently, avoid the if, perhaps via templatization.
   void InitTuple(const TupleDescriptor* desc, Tuple* template_tuple, Tuple* tuple) {
     if (template_tuple != NULL) {
-      memcpy(tuple, template_tuple, desc->byte_size());
+      InitTupleFromTemplate(template_tuple, tuple, desc->byte_size());
     } else {
       tuple->ClearNullBits(*desc);
     }
   }
 
-  // TODO: replace this function with above once we can inline constants from
-  // scan_node_->tuple_desc() via codegen
+  /// Initialize 'tuple' with size 'tuple_byte_size' from 'template_tuple'.
+  void InitTupleFromTemplate(Tuple* template_tuple, Tuple* tuple, int tuple_byte_size) {
+    memcpy(tuple, template_tuple, tuple_byte_size);
+  }
+
+  /// Initialize a dense array of 'num_tuples' tuples.
+  /// TODO: we could do better here if we inlined the tuple and null indicator byte
+  /// widths with codegen to eliminate all the branches in memcpy()/memset().
+  void InitTupleBuffer(
+      Tuple* template_tuple, uint8_t* __restrict__ tuple_mem, int64_t num_tuples) {
+    const TupleDescriptor* desc = scan_node_->tuple_desc();
+    const int tuple_byte_size = desc->byte_size();
+    // Handle the different template/non-template cases with different loops to avoid
+    // unnecessary branches inside the loop.
+    if (template_tuple != nullptr) {
+      for (int64_t i = 0; i < num_tuples; ++i) {
+        InitTupleFromTemplate(
+            template_tuple, reinterpret_cast<Tuple*>(tuple_mem), tuple_byte_size);
+        tuple_mem += tuple_byte_size;
+      }
+    } else if (tuple_byte_size <= CACHE_LINE_SIZE) {
+      // If each tuple fits in a cache line, it is quicker to zero the whole memory
+      // buffer instead of just the null indicators. This is because we are fetching
+      // the cache line anyway and zeroing a cache line is cheap (a couple of AVX2
+      // instructions) compared with the overhead of calling memset() row-by-row.
+      memset(tuple_mem, 0, num_tuples * tuple_byte_size);
+    } else {
+      const int null_bytes_offset = desc->null_bytes_offset();
+      const int num_null_bytes = desc->num_null_bytes();
+      for (int64_t i = 0; i < num_tuples; ++i) {
+        reinterpret_cast<Tuple*>(tuple_mem)->ClearNullBits(
+            null_bytes_offset, num_null_bytes);
+        tuple_mem += tuple_byte_size;
+      }
+    }
+  }
+
   void InitTuple(Tuple* template_tuple, Tuple* tuple) {
     InitTuple(scan_node_->tuple_desc(), template_tuple, tuple);
   }
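The cache-line branch of InitTupleBuffer() above rests on a simple cost model:
once a line must be fetched anyway, zeroing all of it costs almost nothing,
while clearing null bytes row-by-row pays per-tuple overhead for the same
cache traffic. A standalone sketch of the two strategies (sizes hypothetical,
not from the patch):

    #include <cstdint>
    #include <cstring>

    constexpr int kTupleSize = 64;       // fits in one cache line
    constexpr int kNullByteOffset = 60;  // lone null-indicator byte
    constexpr int kNumTuples = 1024;

    // One large memset: the libc implementation vectorizes this and touches
    // exactly the same cache lines the scan will touch anyway.
    void ZeroWholeBuffer(uint8_t* tuple_mem) {
      memset(tuple_mem, 0, kNumTuples * kTupleSize);
    }

    // Per-row clearing: the same cache lines get fetched, but with loop and
    // addressing overhead for every single-byte store.
    void ZeroNullBytesOnly(uint8_t* tuple_mem) {
      for (int i = 0; i < kNumTuples; ++i) {
        tuple_mem[i * kTupleSize + kNullByteOffset] = 0;
      }
    }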
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2782774/be/src/exec/parquet-column-readers.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/parquet-column-readers.cc b/be/src/exec/parquet-column-readers.cc
index 3465632..5c7c14d 100644
--- a/be/src/exec/parquet-column-readers.cc
+++ b/be/src/exec/parquet-column-readers.cc
@@ -296,9 +296,14 @@ class ScalarColumnReader : public BaseScalarColumnReader {
   /// caching of rep/def levels. Once a data page and cached levels are available,
   /// it calls into a more specialized MaterializeValueBatch() for doing the actual
   /// value materialization using the level caches.
-  template<bool IN_COLLECTION>
-  bool ReadValueBatch(MemPool* pool, int max_values, int tuple_size,
-      uint8_t* tuple_mem, int* num_values) {
+  /// Use RESTRICT so that the compiler knows that it is safe to cache member
+  /// variables in registers or on the stack (otherwise GCC's alias analysis
+  /// conservatively assumes that buffers like 'tuple_mem', 'num_values' or the
+  /// 'def_levels_'/'rep_levels_' buffers may alias 'this', especially with
+  /// -fno-strict-aliasing).
+  template <bool IN_COLLECTION>
+  bool ReadValueBatch(MemPool* RESTRICT pool, int max_values, int tuple_size,
+      uint8_t* RESTRICT tuple_mem, int* RESTRICT num_values) RESTRICT {
     // Repetition level is only present if this column is nested in a collection type.
     if (!IN_COLLECTION) DCHECK_EQ(max_rep_level(), 0) << slot_desc()->DebugString();
     if (IN_COLLECTION) DCHECK_GT(max_rep_level(), 0) << slot_desc()->DebugString();
@@ -362,9 +367,14 @@ class ScalarColumnReader : public BaseScalarColumnReader {
   /// level caches have been populated.
   /// For efficiency, the simple special case of !MATERIALIZED && !IN_COLLECTION is not
   /// handled in this function.
-  template<bool IN_COLLECTION, bool IS_DICT_ENCODED>
-  bool MaterializeValueBatch(MemPool* pool, int max_values, int tuple_size,
-      uint8_t* tuple_mem, int* num_values) {
+  /// Use RESTRICT so that the compiler knows that it is safe to cache member
+  /// variables in registers or on the stack (otherwise GCC's alias analysis
+  /// conservatively assumes that buffers like 'tuple_mem', 'num_values' or the
+  /// 'def_levels_'/'rep_levels_' buffers may alias 'this', especially with
+  /// -fno-strict-aliasing).
+  template <bool IN_COLLECTION, bool IS_DICT_ENCODED>
+  bool MaterializeValueBatch(MemPool* RESTRICT pool, int max_values, int tuple_size,
+      uint8_t* RESTRICT tuple_mem, int* RESTRICT num_values) RESTRICT {
     DCHECK(MATERIALIZED || IN_COLLECTION);
     DCHECK_GT(num_buffered_values_, 0);
     DCHECK(def_levels_.CacheHasNext());
@@ -372,7 +382,7 @@ class ScalarColumnReader : public BaseScalarColumnReader {
 
     uint8_t* curr_tuple = tuple_mem;
     int val_count = 0;
-    while (def_levels_.CacheHasNext()) {
+    while (def_levels_.CacheHasNext() && val_count < max_values) {
       Tuple* tuple = reinterpret_cast<Tuple*>(curr_tuple);
       int def_level = def_levels_.CacheGetNext();
@@ -400,10 +410,8 @@ class ScalarColumnReader : public BaseScalarColumnReader {
           tuple->SetNull(null_indicator_offset_);
         }
       }
-
       curr_tuple += tuple_size;
       ++val_count;
-      if (UNLIKELY(val_count == max_values)) break;
     }
     *num_values = val_count;
     return true;
@@ -460,11 +468,15 @@ class ScalarColumnReader : public BaseScalarColumnReader {
   /// Returns false if execution should be aborted for some reason, e.g. parse_error_ is
   /// set, the query is cancelled, or the scan node limit was reached. Otherwise returns
   /// true.
-  template<bool IS_DICT_ENCODED>
-  inline bool ReadSlot(Tuple* tuple, MemPool* pool) {
+  ///
+  /// Force inlining - GCC does not always inline this into hot loops.
+  template <bool IS_DICT_ENCODED>
+  ALWAYS_INLINE bool ReadSlot(Tuple* tuple, MemPool* pool) {
     void* slot = tuple->GetSlot(tuple_offset_);
-    T val;
-    T* val_ptr = NeedsConversionInline() ? &val : reinterpret_cast<T*>(slot);
+    // Use an uninitialized stack allocation for the temporary value to avoid running
+    // constructors that do work unnecessarily, e.g. if T == StringValue.
+    alignas(T) uint8_t val_buf[sizeof(T)];
+    T* val_ptr = reinterpret_cast<T*>(NeedsConversionInline() ? val_buf : slot);
     if (IS_DICT_ENCODED) {
       DCHECK_EQ(page_encoding_, parquet::Encoding::PLAIN_DICTIONARY);
       if (UNLIKELY(!dict_decoder_.GetNextValue(val_ptr))) {
@@ -485,9 +497,8 @@ class ScalarColumnReader : public BaseScalarColumnReader {
     if (UNLIKELY(NeedsValidationInline() && !ValidateSlot(val_ptr, tuple))) {
       return false;
     }
-    if (UNLIKELY(NeedsConversionInline() &&
-        !tuple->IsNull(null_indicator_offset_) &&
-        !ConvertSlot(&val, reinterpret_cast<T*>(slot), pool))) {
+    if (UNLIKELY(NeedsConversionInline() && !tuple->IsNull(null_indicator_offset_)
+        && !ConvertSlot(val_ptr, reinterpret_cast<T*>(slot), pool))) {
       return false;
     }
     return true;
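The val_buf change in ReadSlot() above is a general trick for types whose
default constructor does real work: reserve correctly aligned raw storage and
let the decoder overwrite it. A minimal sketch (the struct and decoder are
stand-ins, not the patch's types):

    #include <cstdint>

    struct StringValueLike {
      char* ptr = nullptr;  // default construction zeroes both fields...
      int len = 0;          // ...work that is wasted if we decode over them
    };

    // Stand-in for the dictionary/plain decoder filling in a value.
    bool DecodeInto(StringValueLike* out) {
      out->ptr = nullptr;
      out->len = 3;
      return true;
    }

    bool ReadOne(bool needs_conversion, void* slot) {
      // Aligned raw bytes: no constructor runs, unlike 'StringValueLike val;'.
      alignas(StringValueLike) uint8_t val_buf[sizeof(StringValueLike)];
      StringValueLike* val_ptr =
          reinterpret_cast<StringValueLike*>(needs_conversion ? val_buf : slot);
      return DecodeInto(val_ptr);
    }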
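The TrailingBits() rewrite in bit-util.h above replaces two dependent shifts
and an extra branch with a single mask; a worked check of the identity (plain
C++, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Old form: needs the num_bits == 0 branch because v << 64 is undefined.
    uint64_t TrailingBitsOld(uint64_t v, int num_bits) {
      if (num_bits == 0) return 0;
      if (num_bits >= 64) return v;
      int n = 64 - num_bits;
      return (v << n) >> n;
    }

    // New form: (1UL << 0) - 1 == 0, so num_bits == 0 yields an empty mask
    // without a branch; only the num_bits >= 64 guard remains.
    uint64_t TrailingBitsNew(uint64_t v, int num_bits) {
      if (num_bits >= 64) return v;
      return ((1UL << num_bits) - 1) & v;
    }

    int main() {
      for (int b = 0; b <= 64; ++b) {
        assert(TrailingBitsOld(0xDEADBEEFCAFEF00DUL, b)
            == TrailingBitsNew(0xDEADBEEFCAFEF00DUL, b));
      }
      return 0;
    }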
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2782774/be/src/util/dict-encoding.h
----------------------------------------------------------------------
diff --git a/be/src/util/dict-encoding.h b/be/src/util/dict-encoding.h
index e842495..75e9340 100644
--- a/be/src/util/dict-encoding.h
+++ b/be/src/util/dict-encoding.h
@@ -22,8 +22,9 @@
 
 #include <boost/unordered_map.hpp>
 
-#include "gutil/strings/substitute.h"
+#include "common/compiler-util.h"
 #include "exec/parquet-common.h"
+#include "gutil/strings/substitute.h"
 #include "runtime/mem-pool.h"
 #include "runtime/string-value.h"
 #include "util/bit-util.h"
@@ -285,8 +286,9 @@ inline int DictEncoder<StringValue>::AddToTable(const StringValue& value,
   return bytes_added;
 }
 
-template<typename T>
-inline bool DictDecoder<T>::GetNextValue(T* value) {
+// Force inlining - GCC does not always inline this into hot loops in the Parquet scanner.
+template <typename T>
+ALWAYS_INLINE inline bool DictDecoder<T>::GetNextValue(T* value) {
   int index = -1;  // Initialize to avoid compiler warning.
   bool result = data_decoder_.Get(&index);
   // Use & to avoid branches.
@@ -297,8 +299,10 @@ inline bool DictDecoder<T>::GetNextValue(T* value) {
   return false;
 }
 
-template<>
-inline bool DictDecoder<Decimal16Value>::GetNextValue(Decimal16Value* value) {
+// Force inlining - GCC does not always inline this into hot loops in the Parquet scanner.
+template <>
+ALWAYS_INLINE inline bool DictDecoder<Decimal16Value>::GetNextValue(
+    Decimal16Value* value) {
   int index;
   bool result = data_decoder_.Get(&index);
   if (!result) return false;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2782774/be/src/util/rle-encoding.h
----------------------------------------------------------------------
diff --git a/be/src/util/rle-encoding.h b/be/src/util/rle-encoding.h
index 9f39697..2e2dd7f 100644
--- a/be/src/util/rle-encoding.h
+++ b/be/src/util/rle-encoding.h
@@ -245,8 +245,10 @@ class RleEncoder {
   uint8_t* literal_indicator_byte_;
 };
 
-template<typename T>
-inline bool RleDecoder::Get(T* val) {
+// Force inlining - this is used in perf-critical loops in Parquet and GCC often
+// doesn't inline it in cases where it's beneficial.
+template <typename T>
+ALWAYS_INLINE inline bool RleDecoder::Get(T* val) {
   DCHECK_GE(bit_width_, 0);
   // Profiling has shown that the quality and performance of the generated code is very
   // sensitive to the exact shape of this check. For example, the version below performs
