IMPALA-6120: Add thread timers for reporting codegen time Add thread times for accurate reporting of codegen time. Also cleaned up a few places where time elapsed was being counted twice.
Sample Profile: Query: SELECT count(*) FROM tpch_parquet.lineitem WHERE l_partkey in (1,6,11,16,21,26,31,36,41); CodeGen:(Total: 37.948ms, non-child: 37.948ms, % non-child: 100.00%) - CodegenInvoluntaryContextSwitches: 0 (0) - CodegenTotalWallClockTime: 37.942ms - CodegenSysTime: 0.000ns - CodegenUserTime: 36.938ms - CodegenVoluntaryContextSwitches: 0 (0) - CompileTime: 2.065ms - IrGenerationTime: 392.351us - LoadTime: 0.000ns - ModuleBitcodeSize: 2.26 MB (2373148) - NumFunctions: 22 (22) - NumInstructions: 381 (381) - OptimizationTime: 21.416ms - PeakMemoryUsage: 190.50 KB (195072) - PrepareTime: 13.496ms Sample Profile with an added 2 sec sleep time to "OptimizationTime": CodeGen:(Total: 2s037ms, non-child: 2s037ms, % non-child: 100.00%) - CodegenInvoluntaryContextSwitches: 0 (0) - CodegenTotalWallClockTime: 2s037ms - CodegenSysTime: 0.000ns - CodegenUserTime: 37.672ms - CodegenVoluntaryContextSwitches: 1 (1) - CompileTime: 2.032ms - IrGenerationTime: 386.948us - LoadTime: 0.000ns - ModuleBitcodeSize: 2.26 MB (2373148) - NumFunctions: 22 (22) - NumInstructions: 381 (381) - OptimizationTime: 2s023ms - PeakMemoryUsage: 190.50 KB (195072) - PrepareTime: 11.598ms Change-Id: I24d5a46b8870bc959b89045432d2e86af72b30e5 Reviewed-on: http://gerrit.cloudera.org:8080/9960 Reviewed-by: Bikramjeet Vig <bikramjeet....@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> Project: http://git-wip-us.apache.org/repos/asf/impala/repo Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/fef527d6 Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/fef527d6 Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/fef527d6 Branch: refs/heads/master Commit: fef527d60b1dc33cc083d44f0d697086dab4f361 Parents: 51cf5b2 Author: Bikramjeet Vig <bikramjeet....@cloudera.com> Authored: Mon Apr 9 14:03:52 2018 -0700 Committer: Impala Public Jenkins <impala-public-jenk...@cloudera.com> Committed: Sat Apr 14 02:21:31 2018 +0000 ---------------------------------------------------------------------- be/src/codegen/llvm-codegen.cc | 21 +++++++++------------ be/src/codegen/llvm-codegen.h | 18 ++++++++++-------- be/src/exec/hdfs-avro-scanner.cc | 2 -- be/src/exec/hdfs-parquet-scanner.cc | 1 - be/src/exec/hdfs-scanner.cc | 2 -- be/src/exec/partitioned-aggregation-node.cc | 5 ----- be/src/exec/select-node.cc | 1 - be/src/exec/text-converter.cc | 1 - be/src/runtime/fragment-instance-state.cc | 16 +++++++++++----- be/src/runtime/tuple.cc | 1 - be/src/util/tuple-row-compare.cc | 1 - 11 files changed, 30 insertions(+), 39 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/impala/blob/fef527d6/be/src/codegen/llvm-codegen.cc ---------------------------------------------------------------------- diff --git a/be/src/codegen/llvm-codegen.cc b/be/src/codegen/llvm-codegen.cc index c8fd8eb..5d5ed15 100644 --- a/be/src/codegen/llvm-codegen.cc +++ b/be/src/codegen/llvm-codegen.cc @@ -201,11 +201,12 @@ LlvmCodeGen::LlvmCodeGen(RuntimeState* state, ObjectPool* pool, load_module_timer_ = ADD_TIMER(profile_, "LoadTime"); prepare_module_timer_ = ADD_TIMER(profile_, "PrepareTime"); module_bitcode_size_ = ADD_COUNTER(profile_, "ModuleBitcodeSize", TUnit::BYTES); - codegen_timer_ = ADD_TIMER(profile_, "CodegenTime"); + ir_generation_timer_ = ADD_TIMER(profile_, "IrGenerationTime"); optimization_timer_ = ADD_TIMER(profile_, "OptimizationTime"); compile_timer_ = ADD_TIMER(profile_, "CompileTime"); num_functions_ = ADD_COUNTER(profile_, "NumFunctions", TUnit::UNIT); num_instructions_ = ADD_COUNTER(profile_, "NumInstructions", TUnit::UNIT); + llvm_thread_counters_ = ADD_THREAD_COUNTERS(profile_, "Codegen"); } Status LlvmCodeGen::CreateFromFile(RuntimeState* state, ObjectPool* pool, @@ -213,6 +214,7 @@ Status LlvmCodeGen::CreateFromFile(RuntimeState* state, ObjectPool* pool, scoped_ptr<LlvmCodeGen>* codegen) { codegen->reset(new LlvmCodeGen(state, pool, parent_mem_tracker, id)); SCOPED_TIMER((*codegen)->profile_->total_time_counter()); + SCOPED_THREAD_COUNTER_MEASUREMENT((*codegen)->llvm_thread_counters()); unique_ptr<llvm::Module> loaded_module; Status status = (*codegen)->LoadModuleFromFile(file, &loaded_module); @@ -229,6 +231,8 @@ Status LlvmCodeGen::CreateFromMemory(RuntimeState* state, ObjectPool* pool, MemTracker* parent_mem_tracker, const string& id, scoped_ptr<LlvmCodeGen>* codegen) { codegen->reset(new LlvmCodeGen(state, pool, parent_mem_tracker, id)); SCOPED_TIMER((*codegen)->profile_->total_time_counter()); + SCOPED_TIMER((*codegen)->prepare_module_timer_); + SCOPED_THREAD_COUNTER_MEASUREMENT((*codegen)->llvm_thread_counters()); // Select the appropriate IR version. We cannot use LLVM IR with SSE4.2 instructions on // a machine without SSE4.2 support. @@ -282,7 +286,6 @@ Status LlvmCodeGen::LoadModuleFromFile( Status LlvmCodeGen::LoadModuleFromMemory(unique_ptr<llvm::MemoryBuffer> module_ir_buf, string module_name, unique_ptr<llvm::Module>* module) { DCHECK(!module_name.empty()); - SCOPED_TIMER(prepare_module_timer_); COUNTER_ADD(module_bitcode_size_, module_ir_buf->getMemBufferRef().getBufferSize()); llvm::Expected<unique_ptr<llvm::Module>> tmp_module = getOwningLazyBitcodeModule(move(module_ir_buf), context()); @@ -305,7 +308,6 @@ Status LlvmCodeGen::LoadModuleFromMemory(unique_ptr<llvm::MemoryBuffer> module_i // TODO: Create separate counters/timers (file size, load time) for each module linked Status LlvmCodeGen::LinkModuleFromLocalFs(const string& file) { - SCOPED_TIMER(profile_->total_time_counter()); unique_ptr<llvm::Module> new_module; RETURN_IF_ERROR(LoadModuleFromFile(file, &new_module)); @@ -366,6 +368,7 @@ Status LlvmCodeGen::CreateImpalaCodegen(RuntimeState* state, // Parse module for cross compiled functions and types SCOPED_TIMER(codegen->profile_->total_time_counter()); SCOPED_TIMER(codegen->prepare_module_timer_); + SCOPED_THREAD_COUNTER_MEASUREMENT(codegen->llvm_thread_counters_); // Get type for StringValue codegen->string_value_type_ = codegen->GetStructType<StringValue>(); @@ -621,7 +624,7 @@ void LlvmCodeGen::CreateIfElseBlocks(llvm::Function* fn, const string& if_name, *else_block = llvm::BasicBlock::Create(context(), else_name, fn, insert_before); } -Status LlvmCodeGen::MaterializeFunctionHelper(llvm::Function* fn) { +Status LlvmCodeGen::MaterializeFunction(llvm::Function* fn) { DCHECK(!is_compiled_); if (fn->isIntrinsic() || !fn->isMaterializable()) return Status::OK(); @@ -642,18 +645,12 @@ Status LlvmCodeGen::MaterializeFunctionHelper(llvm::Function* fn) { for (const string& callee : *callees) { llvm::Function* callee_fn = module_->getFunction(callee); DCHECK(callee_fn != nullptr); - RETURN_IF_ERROR(MaterializeFunctionHelper(callee_fn)); + RETURN_IF_ERROR(MaterializeFunction(callee_fn)); } } return Status::OK(); } -Status LlvmCodeGen::MaterializeFunction(llvm::Function* fn) { - SCOPED_TIMER(profile_->total_time_counter()); - SCOPED_TIMER(prepare_module_timer_); - return MaterializeFunctionHelper(fn); -} - llvm::Function* LlvmCodeGen::GetFunction(const string& symbol, bool clone) { llvm::Function* fn = module_->getFunction(symbol.c_str()); if (fn == NULL) { @@ -1038,7 +1035,6 @@ Status LlvmCodeGen::MaterializeModule() { // It's okay to call this function even if the module has been materialized. Status LlvmCodeGen::FinalizeLazyMaterialization() { - SCOPED_TIMER(prepare_module_timer_); for (llvm::Function& fn : module_->functions()) { if (fn.isMaterializable()) { DCHECK(!module_->isMaterialized()); @@ -1078,6 +1074,7 @@ Status LlvmCodeGen::FinalizeModule() { if (is_corrupt_) return Status("Module is corrupt."); SCOPED_TIMER(profile_->total_time_counter()); + SCOPED_THREAD_COUNTER_MEASUREMENT(llvm_thread_counters_); // Clean up handcrafted functions that have not been finalized. Clean up is done by // deleting the function from the module. Any reference to deleted functions in the http://git-wip-us.apache.org/repos/asf/impala/blob/fef527d6/be/src/codegen/llvm-codegen.h ---------------------------------------------------------------------- diff --git a/be/src/codegen/llvm-codegen.h b/be/src/codegen/llvm-codegen.h index 783269b..53569ca 100644 --- a/be/src/codegen/llvm-codegen.h +++ b/be/src/codegen/llvm-codegen.h @@ -169,7 +169,8 @@ class LlvmCodeGen { void Close(); RuntimeProfile* runtime_profile() { return profile_; } - RuntimeProfile::Counter* codegen_timer() { return codegen_timer_; } + RuntimeProfile::Counter* ir_generation_timer() { return ir_generation_timer_; } + RuntimeProfile::ThreadCounters* llvm_thread_counters() { return llvm_thread_counters_; } /// Turns on/off optimization passes void EnableOptimizations(bool enable); @@ -688,10 +689,6 @@ class LlvmCodeGen { /// This function parses the bitcode of 'fn' to populate basic blocks, instructions /// and other data structures attached to the function object. Return error status /// for any error. - Status MaterializeFunctionHelper(llvm::Function* fn); - - /// Entry point for materializing function 'fn'. Invokes MaterializeFunctionHelper() - /// to do the actual work. Return error status for any error. Status MaterializeFunction(llvm::Function* fn); /// Materialize the module owned by this codegen object. This will materialize all @@ -754,11 +751,12 @@ class LlvmCodeGen { /// Time spent reading the .ir file from the file system. RuntimeProfile::Counter* load_module_timer_; - /// Time spent constructing the in-memory module from the ir. + /// Time spent creating the initial module with the cross-compiled Impala IR. RuntimeProfile::Counter* prepare_module_timer_; - /// Time spent doing codegen (adding IR to the module) - RuntimeProfile::Counter* codegen_timer_; + /// Time spent by ExecNodes while adding IR to the module. Update by + /// FragmentInstanceState during its 'CODEGEN_START' state. + RuntimeProfile::Counter* ir_generation_timer_; /// Time spent optimizing the module. RuntimeProfile::Counter* optimization_timer_; @@ -774,6 +772,10 @@ class LlvmCodeGen { RuntimeProfile::Counter* num_functions_; RuntimeProfile::Counter* num_instructions_; + /// Aggregated llvm thread counters. Also includes the phase represented by + /// 'ir_generation_timer_' and hence is also updated by FragmentInstanceState. + RuntimeProfile::ThreadCounters* llvm_thread_counters_; + /// whether or not optimizations are enabled bool optimizations_enabled_; http://git-wip-us.apache.org/repos/asf/impala/blob/fef527d6/be/src/exec/hdfs-avro-scanner.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-avro-scanner.cc b/be/src/exec/hdfs-avro-scanner.cc index fe1bed4..e74b589 100644 --- a/be/src/exec/hdfs-avro-scanner.cc +++ b/be/src/exec/hdfs-avro-scanner.cc @@ -1062,8 +1062,6 @@ Status HdfsAvroScanner::CodegenReadScalar(const AvroSchemaElement& element, Status HdfsAvroScanner::CodegenDecodeAvroData(const HdfsScanNodeBase* node, LlvmCodeGen* codegen, const vector<ScalarExpr*>& conjuncts, llvm::Function** decode_avro_data_fn) { - SCOPED_TIMER(codegen->codegen_timer()); - llvm::Function* materialize_tuple_fn; RETURN_IF_ERROR(CodegenMaterializeTuple(node, codegen, &materialize_tuple_fn)); DCHECK(materialize_tuple_fn != nullptr); http://git-wip-us.apache.org/repos/asf/impala/blob/fef527d6/be/src/exec/hdfs-parquet-scanner.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-parquet-scanner.cc b/be/src/exec/hdfs-parquet-scanner.cc index ae22149..73dd29b 100644 --- a/be/src/exec/hdfs-parquet-scanner.cc +++ b/be/src/exec/hdfs-parquet-scanner.cc @@ -1019,7 +1019,6 @@ Status HdfsParquetScanner::Codegen(HdfsScanNodeBase* node, *process_scratch_batch_fn = nullptr; LlvmCodeGen* codegen = node->runtime_state()->codegen(); DCHECK(codegen != nullptr); - SCOPED_TIMER(codegen->codegen_timer()); llvm::Function* fn = codegen->GetFunction(IRFunction::PROCESS_SCRATCH_BATCH, true); DCHECK(fn != nullptr); http://git-wip-us.apache.org/repos/asf/impala/blob/fef527d6/be/src/exec/hdfs-scanner.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-scanner.cc b/be/src/exec/hdfs-scanner.cc index a4aee4d..d191d3f 100644 --- a/be/src/exec/hdfs-scanner.cc +++ b/be/src/exec/hdfs-scanner.cc @@ -323,7 +323,6 @@ Status HdfsScanner::CodegenWriteCompleteTuple(const HdfsScanNodeBase* node, LlvmCodeGen* codegen, const vector<ScalarExpr*>& conjuncts, llvm::Function** write_complete_tuple_fn) { *write_complete_tuple_fn = NULL; - SCOPED_TIMER(codegen->codegen_timer()); RuntimeState* state = node->runtime_state(); // Cast away const-ness. The codegen only sets the cached typed llvm struct. @@ -531,7 +530,6 @@ Status HdfsScanner::CodegenWriteAlignedTuples(const HdfsScanNodeBase* node, LlvmCodeGen* codegen, llvm::Function* write_complete_tuple_fn, llvm::Function** write_aligned_tuples_fn) { *write_aligned_tuples_fn = NULL; - SCOPED_TIMER(codegen->codegen_timer()); DCHECK(write_complete_tuple_fn != NULL); llvm::Function* write_tuples_fn = http://git-wip-us.apache.org/repos/asf/impala/blob/fef527d6/be/src/exec/partitioned-aggregation-node.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/partitioned-aggregation-node.cc b/be/src/exec/partitioned-aggregation-node.cc index c6c6189..d7b8c0a 100644 --- a/be/src/exec/partitioned-aggregation-node.cc +++ b/be/src/exec/partitioned-aggregation-node.cc @@ -1721,8 +1721,6 @@ Status PartitionedAggregationNode::CodegenCallUda(LlvmCodeGen* codegen, // Status PartitionedAggregationNode::CodegenUpdateTuple( LlvmCodeGen* codegen, llvm::Function** fn) { - SCOPED_TIMER(codegen->codegen_timer()); - for (const SlotDescriptor* slot_desc : intermediate_tuple_desc_->slots()) { if (slot_desc->type().type == TYPE_CHAR) { return Status::Expected("PartitionedAggregationNode::CodegenUpdateTuple(): cannot " @@ -1811,8 +1809,6 @@ Status PartitionedAggregationNode::CodegenUpdateTuple( Status PartitionedAggregationNode::CodegenProcessBatch(LlvmCodeGen* codegen, TPrefetchMode::type prefetch_mode) { - SCOPED_TIMER(codegen->codegen_timer()); - llvm::Function* update_tuple_fn; RETURN_IF_ERROR(CodegenUpdateTuple(codegen, &update_tuple_fn)); @@ -1884,7 +1880,6 @@ Status PartitionedAggregationNode::CodegenProcessBatch(LlvmCodeGen* codegen, Status PartitionedAggregationNode::CodegenProcessBatchStreaming( LlvmCodeGen* codegen, TPrefetchMode::type prefetch_mode) { DCHECK(is_streaming_preagg_); - SCOPED_TIMER(codegen->codegen_timer()); IRFunction::Type ir_fn = IRFunction::PART_AGG_NODE_PROCESS_BATCH_STREAMING; llvm::Function* process_batch_streaming_fn = codegen->GetFunction(ir_fn, true); http://git-wip-us.apache.org/repos/asf/impala/blob/fef527d6/be/src/exec/select-node.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/select-node.cc b/be/src/exec/select-node.cc index 0f0683b..df57db0 100644 --- a/be/src/exec/select-node.cc +++ b/be/src/exec/select-node.cc @@ -48,7 +48,6 @@ void SelectNode::Codegen(RuntimeState* state) { DCHECK(state->ShouldCodegen()); ExecNode::Codegen(state); if (IsNodeCodegenDisabled()) return; - SCOPED_TIMER(state->codegen()->codegen_timer()); Status codegen_status = CodegenCopyRows(state); runtime_profile()->AddCodegenMsg(codegen_status.ok(), codegen_status); } http://git-wip-us.apache.org/repos/asf/impala/blob/fef527d6/be/src/exec/text-converter.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/text-converter.cc b/be/src/exec/text-converter.cc index 9e919a3..783e384 100644 --- a/be/src/exec/text-converter.cc +++ b/be/src/exec/text-converter.cc @@ -112,7 +112,6 @@ Status TextConverter::CodegenWriteSlot(LlvmCodeGen* codegen, return Status("TextConverter::CodegenWriteSlot(): Char isn't supported for" " CodegenWriteSlot"); } - SCOPED_TIMER(codegen->codegen_timer()); // Codegen is_null_string bool is_default_null = (len == 2 && null_col_val[0] == '\\' && null_col_val[1] == 'N'); http://git-wip-us.apache.org/repos/asf/impala/blob/fef527d6/be/src/runtime/fragment-instance-state.cc ---------------------------------------------------------------------- diff --git a/be/src/runtime/fragment-instance-state.cc b/be/src/runtime/fragment-instance-state.cc index 7322519..1a0d452 100644 --- a/be/src/runtime/fragment-instance-state.cc +++ b/be/src/runtime/fragment-instance-state.cc @@ -244,11 +244,17 @@ Status FragmentInstanceState::Open() { if (runtime_state_->ShouldCodegen()) { UpdateState(StateEvent::CODEGEN_START); RETURN_IF_ERROR(runtime_state_->CreateCodegen()); - exec_tree_->Codegen(runtime_state_); - // It shouldn't be fatal to fail codegen. However, until IMPALA-4233 is fixed, - // ScalarFnCall has no fall back to interpretation when codegen fails so propagates - // the error status for now. - RETURN_IF_ERROR(runtime_state_->CodegenScalarFns()); + { + SCOPED_TIMER(runtime_state_->codegen()->ir_generation_timer()); + SCOPED_TIMER(runtime_state_->codegen()->runtime_profile()->total_time_counter()); + SCOPED_THREAD_COUNTER_MEASUREMENT( + runtime_state_->codegen()->llvm_thread_counters()); + exec_tree_->Codegen(runtime_state_); + // It shouldn't be fatal to fail codegen. However, until IMPALA-4233 is fixed, + // ScalarFnCall has no fall back to interpretation when codegen fails so propagates + // the error status for now. + RETURN_IF_ERROR(runtime_state_->CodegenScalarFns()); + } LlvmCodeGen* codegen = runtime_state_->codegen(); DCHECK(codegen != nullptr); http://git-wip-us.apache.org/repos/asf/impala/blob/fef527d6/be/src/runtime/tuple.cc ---------------------------------------------------------------------- diff --git a/be/src/runtime/tuple.cc b/be/src/runtime/tuple.cc index 627c7a4..0061419 100644 --- a/be/src/runtime/tuple.cc +++ b/be/src/runtime/tuple.cc @@ -311,7 +311,6 @@ Status Tuple::CodegenMaterializeExprs(LlvmCodeGen* codegen, bool collect_string_ if (collect_string_vals) { return Status("CodegenMaterializeExprs() collect_string_vals == true NYI"); } - SCOPED_TIMER(codegen->codegen_timer()); llvm::LLVMContext& context = codegen->context(); // Codegen each compute function from slot_materialize_exprs http://git-wip-us.apache.org/repos/asf/impala/blob/fef527d6/be/src/util/tuple-row-compare.cc ---------------------------------------------------------------------- diff --git a/be/src/util/tuple-row-compare.cc b/be/src/util/tuple-row-compare.cc index 5620dae..f05a88e 100644 --- a/be/src/util/tuple-row-compare.cc +++ b/be/src/util/tuple-row-compare.cc @@ -203,7 +203,6 @@ Status TupleRowComparator::Codegen(RuntimeState* state) { // ret i32 0 // } Status TupleRowComparator::CodegenCompare(LlvmCodeGen* codegen, llvm::Function** fn) { - SCOPED_TIMER(codegen->codegen_timer()); llvm::LLVMContext& context = codegen->context(); const vector<ScalarExpr*>& ordering_exprs = ordering_exprs_; llvm::Function* key_fns[ordering_exprs.size()];