IMPALA-5988: optimise MemPool::TryAllocate() Testing: Ran core tests.
Perf: Experiments using this on top of a WIP Avro patch for IMPALA-5307 showed noticable improvements in CPU efficiency - up to 10% Change-Id: I088012084fe535a67a3cd1103ced5a02027f961a Reviewed-on: http://gerrit.cloudera.org:8080/8145 Reviewed-by: Tim Armstrong <[email protected]> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/226c99e0 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/226c99e0 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/226c99e0 Branch: refs/heads/master Commit: 226c99e01c49d3d25840c5982474740a7dcb9c62 Parents: 4dd0f1b Author: Tim Armstrong <[email protected]> Authored: Thu Sep 21 16:01:15 2017 -0700 Committer: Impala Public Jenkins <[email protected]> Committed: Thu Oct 5 02:50:36 2017 +0000 ---------------------------------------------------------------------- be/src/exec/analytic-eval-node.cc | 3 ++- be/src/exec/data-source-scan-node.cc | 3 ++- be/src/exec/hdfs-text-scanner.cc | 2 +- be/src/exec/text-converter.inline.h | 2 +- be/src/exprs/scalar-expr-evaluator.cc | 2 +- be/src/runtime/mem-pool.h | 41 ++++++++++++++++++------------ 6 files changed, 32 insertions(+), 21 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/226c99e0/be/src/exec/analytic-eval-node.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/analytic-eval-node.cc b/be/src/exec/analytic-eval-node.cc index af4f866..a3551f3 100644 --- a/be/src/exec/analytic-eval-node.cc +++ b/be/src/exec/analytic-eval-node.cc @@ -388,7 +388,8 @@ Status AnalyticEvalNode::AddResultTuple(int64_t stream_idx) { StringValue* sv = reinterpret_cast<StringValue*>( result_tuple->GetSlot(slot_desc->tuple_offset())); if (sv == nullptr || sv->len == 0) continue; - char* new_ptr = reinterpret_cast<char*>(cur_tuple_pool->TryAllocate(sv->len)); + char* new_ptr = reinterpret_cast<char*>( + cur_tuple_pool->TryAllocateUnaligned(sv->len)); if (UNLIKELY(new_ptr == nullptr)) { return cur_tuple_pool->mem_tracker()->MemLimitExceeded(nullptr, "Failed to allocate memory for analytic function's result.", sv->len); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/226c99e0/be/src/exec/data-source-scan-node.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/data-source-scan-node.cc b/be/src/exec/data-source-scan-node.cc index 78ba492..2639510 100644 --- a/be/src/exec/data-source-scan-node.cc +++ b/be/src/exec/data-source-scan-node.cc @@ -222,7 +222,8 @@ Status DataSourceScanNode::MaterializeNextRow(MemPool* tuple_pool, Tuple* tuple) } const string& val = col.string_vals[val_idx]; size_t val_size = val.size(); - char* buffer = reinterpret_cast<char*>(tuple_pool->TryAllocate(val_size)); + char* buffer = reinterpret_cast<char*>( + tuple_pool->TryAllocateUnaligned(val_size)); if (UNLIKELY(buffer == NULL)) { string details = Substitute(ERROR_MEM_LIMIT_EXCEEDED, "MaterializeNextRow", val_size, "string slot"); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/226c99e0/be/src/exec/hdfs-text-scanner.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-text-scanner.cc b/be/src/exec/hdfs-text-scanner.cc index eea4e80..a548ca1 100644 --- a/be/src/exec/hdfs-text-scanner.cc +++ b/be/src/exec/hdfs-text-scanner.cc @@ -870,7 +870,7 @@ Status HdfsTextScanner::CopyBoundaryField(FieldLocation* data, MemPool* pool) { bool needs_escape = data->len < 0; int copy_len = needs_escape ? -data->len : data->len; int64_t total_len = copy_len + boundary_column_.len(); - char* str_data = reinterpret_cast<char*>(pool->TryAllocate(total_len)); + char* str_data = reinterpret_cast<char*>(pool->TryAllocateUnaligned(total_len)); if (UNLIKELY(str_data == nullptr)) { string details = Substitute("HdfsTextScanner::CopyBoundaryField() failed to allocate " "$0 bytes.", total_len); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/226c99e0/be/src/exec/text-converter.inline.h ---------------------------------------------------------------------- diff --git a/be/src/exec/text-converter.inline.h b/be/src/exec/text-converter.inline.h index d8dea91..49f4e8a 100644 --- a/be/src/exec/text-converter.inline.h +++ b/be/src/exec/text-converter.inline.h @@ -79,7 +79,7 @@ inline bool TextConverter::WriteSlot(const SlotDescriptor* slot_desc, Tuple* tup // 3. HdfsScanner::WriteCompleteTuple() always calls this function with // 'copy_string' == false. str.ptr = type.IsVarLenStringType() ? - reinterpret_cast<char*>(pool->TryAllocate(buffer_len)) : + reinterpret_cast<char*>(pool->TryAllocateUnaligned(buffer_len)) : reinterpret_cast<char*>(slot); if (UNLIKELY(str.ptr == NULL)) return false; if (need_escape) { http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/226c99e0/be/src/exprs/scalar-expr-evaluator.cc ---------------------------------------------------------------------- diff --git a/be/src/exprs/scalar-expr-evaluator.cc b/be/src/exprs/scalar-expr-evaluator.cc index b2e0aa6..a327dd4 100644 --- a/be/src/exprs/scalar-expr-evaluator.cc +++ b/be/src/exprs/scalar-expr-evaluator.cc @@ -252,7 +252,7 @@ Status ScalarExprEvaluator::GetConstValue(RuntimeState* state, const ScalarExpr& StringVal* sv = reinterpret_cast<StringVal*>(*const_val); if (!sv->is_null && sv->len > 0) { // Make sure the memory is owned by this evaluator. - char* ptr_copy = reinterpret_cast<char*>(mem_pool_->TryAllocate(sv->len)); + char* ptr_copy = reinterpret_cast<char*>(mem_pool_->TryAllocateUnaligned(sv->len)); if (ptr_copy == nullptr) { return mem_pool_->mem_tracker()->MemLimitExceeded( state, "Could not allocate constant string value", sv->len); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/226c99e0/be/src/runtime/mem-pool.h ---------------------------------------------------------------------- diff --git a/be/src/runtime/mem-pool.h b/be/src/runtime/mem-pool.h index 648e382..b748f7f 100644 --- a/be/src/runtime/mem-pool.h +++ b/be/src/runtime/mem-pool.h @@ -118,6 +118,13 @@ class MemPool { return Allocate<true>(size, alignment); } + /// Same as TryAllocate() except returned memory is not aligned at all. + uint8_t* TryAllocateUnaligned(int64_t size) noexcept { + // Call templated implementation directly so that it is inlined here and the + // alignment logic can be optimised out. + return Allocate<true>(size, 1); + } + /// Returns 'byte_size' to the current chunk back to the mem pool. This can /// only be used to return either all or part of the previous allocation returned /// by Allocate(). @@ -234,31 +241,33 @@ class MemPool { } template <bool CHECK_LIMIT_FIRST> - uint8_t* Allocate(int64_t size, int alignment) noexcept { + uint8_t* ALWAYS_INLINE Allocate(int64_t size, int alignment) noexcept { DCHECK_GE(size, 0); if (UNLIKELY(size == 0)) return reinterpret_cast<uint8_t*>(&zero_length_region_); - bool fits_in_chunk = false; if (current_chunk_idx_ != -1) { + ChunkInfo& info = chunks_[current_chunk_idx_]; int64_t aligned_allocated_bytes = BitUtil::RoundUpToPowerOf2( - chunks_[current_chunk_idx_].allocated_bytes, alignment); - if (aligned_allocated_bytes + size <= chunks_[current_chunk_idx_].size) { + info.allocated_bytes, alignment); + if (aligned_allocated_bytes + size <= info.size) { // Ensure the requested alignment is respected. - total_allocated_bytes_ += - aligned_allocated_bytes - chunks_[current_chunk_idx_].allocated_bytes; - chunks_[current_chunk_idx_].allocated_bytes = aligned_allocated_bytes; - fits_in_chunk = true; + int64_t padding = aligned_allocated_bytes - info.allocated_bytes; + uint8_t* result = info.data + aligned_allocated_bytes; + ASAN_UNPOISON_MEMORY_REGION(result, size); + DCHECK_LE(info.allocated_bytes + size, info.size); + info.allocated_bytes += padding + size; + total_allocated_bytes_ += padding + size; + DCHECK_LE(current_chunk_idx_, chunks_.size() - 1); + return result; } } - if (!fits_in_chunk) { - // If we couldn't allocate a new chunk, return NULL. malloc() guarantees alignment - // of alignof(std::max_align_t), so we do not need to do anything additional to - // guarantee alignment. - static_assert( - INITIAL_CHUNK_SIZE >= alignof(std::max_align_t), "Min chunk size too low"); - if (UNLIKELY(!FindChunk(size, CHECK_LIMIT_FIRST))) return NULL; - } + // If we couldn't allocate a new chunk, return NULL. malloc() guarantees alignment + // of alignof(std::max_align_t), so we do not need to do anything additional to + // guarantee alignment. + static_assert( + INITIAL_CHUNK_SIZE >= alignof(std::max_align_t), "Min chunk size too low"); + if (UNLIKELY(!FindChunk(size, CHECK_LIMIT_FIRST))) return NULL; ChunkInfo& info = chunks_[current_chunk_idx_]; uint8_t* result = info.data + info.allocated_bytes;
