IMPALA-5164: Fix flaky benchmarks

Improve benchmarks by detecting involuntary context switches. If a
server is heavily loaded because other jobs are running, benchmarking
is not reliable. We can detect this by using getrusage() to measure
involuntary context switches. If 90% or more of the benchmark time is
lost to switching, we abort the benchmark, but catch the failure and
exit silently. We subtract the time during which we might have been
switched out from the total result time, and only count runs that were
uninterrupted.
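For illustration, a minimal sketch of the detection idea (the helper name
below is hypothetical; the real logic lives in Benchmark::Measure() in
be/src/util/benchmark.cc further down):

    #include <sys/resource.h>
    #include <chrono>
    #include <cstdint>

    // Run 'fn' once and report whether the calling thread was involuntarily
    // switched out while it ran. RUSAGE_THREAD is Linux-specific. The patch
    // below integrates this check into Benchmark::Measure() and retries with
    // smaller batch sizes instead of returning a bool.
    template <typename Fn>
    bool RanWithoutContextSwitch(Fn&& fn, int64_t* elapsed_ns) {
      struct rusage start, stop;
      getrusage(RUSAGE_THREAD, &start);
      auto t0 = std::chrono::steady_clock::now();
      fn();
      auto t1 = std::chrono::steady_clock::now();
      getrusage(RUSAGE_THREAD, &stop);
      *elapsed_ns =
          std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
      // ru_nivcsw counts involuntary context switches; if it advanced, the
      // elapsed time may include time spent scheduled out.
      return stop.ru_nivcsw == start.ru_nivcsw;
    }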
Accepting a result when only 10% of the benchmark ran uninterrupted is
barely legitimate, so we also warn whenever less than 50% of the
benchmark ran without switching.

Testing: ran the free-lists-benchmark until the first test passed and
then started "make -j 60" in an Impala directory. Got this result:

E0519 23:35:41.482010 8766 benchmark.cc:161] Benchmark failed to complete due to context switching.
W0519 23:35:41.548840 8766 benchmark.cc:162] Exiting silently from FreeLists 0 iters on 128 kb to avoid spurious test failure.

Also fixed two benchmarks that crashed at startup due to static
initialization order problems and missing pieces of initialization.
Some benchmarks are still crashing (expr-benchmark and network-perf-be),
but those will require a lot more work to fix.

Change-Id: I7e0ddfaa9bd1340990ae2b6ff946d72bbd26c274
Reviewed-on: http://gerrit.cloudera.org:8080/6935
Tested-by: Impala Public Jenkins
Reviewed-by: Tim Armstrong <[email protected]>

Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/26a3d04b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/26a3d04b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/26a3d04b

Branch: refs/heads/master
Commit: 26a3d04b86bc0d4313ca634ad658d85af021ea97
Parents: 0992a6a
Author: Zach Amsden <[email protected]>
Authored: Sat May 20 00:03:15 2017 +0000
Committer: Tim Armstrong <[email protected]>
Committed: Wed Jun 7 14:25:52 2017 +0000

----------------------------------------------------------------------
 be/src/benchmarks/free-lists-benchmark.cc |  15 ++--
 be/src/benchmarks/hash-benchmark.cc       |   4 +
 be/src/benchmarks/lock-benchmark.cc       |   2 +-
 be/src/util/benchmark-test.cc             |   2 +-
 be/src/util/benchmark.cc                  | 108 +++++++++++++++++++------
 be/src/util/benchmark.h                   |   9 ++-
 6 files changed, 105 insertions(+), 35 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/26a3d04b/be/src/benchmarks/free-lists-benchmark.cc
----------------------------------------------------------------------
diff --git a/be/src/benchmarks/free-lists-benchmark.cc b/be/src/benchmarks/free-lists-benchmark.cc
index 5c09b39..cbadc03 100644
--- a/be/src/benchmarks/free-lists-benchmark.cc
+++ b/be/src/benchmarks/free-lists-benchmark.cc
@@ -320,7 +320,9 @@ static const int MAX_LIST_ENTRIES = 64;
 static const int ALLOC_OP = 0;
 static const int FREE_OP = 1;
 
-static SystemAllocator allocator(64 * 1024);
+// Not a static allocation because SystemAllocator's constructor uses gperftool's
+// MallocExtension instance, which may not yet be defined due to constructor order
+static SystemAllocator* allocator = nullptr;
 
 // Simulate doing some work with the buffer.
 void DoWork(uint8_t* data, int64_t len) {
@@ -339,7 +341,7 @@ void DoAlloc(const BenchmarkParams& params, LockedList* free_list,
     got_buffer = free_list->list.PopFreeBuffer(&buffer);
   }
   if (!got_buffer) {
-    Status status = allocator.Allocate(params.allocation_size, &buffer);
+    Status status = allocator->Allocate(params.allocation_size, &buffer);
     if (!status.ok()) LOG(FATAL) << "Failed alloc " << status.msg().msg();
   }
   // Do some processing to simulate a vaguely realistic work pattern.
@@ -361,10 +363,10 @@ void DoFree(const BenchmarkParams& params, LockedList* free_list,
       // Discard around 1/4 of the buffers to amortise the cost of sorting.
       vector<BufferHandle> buffers =
           list->GetBuffersToFree(list->Size() - MAX_LIST_ENTRIES * 3 / 4);
-      for (BufferHandle& buffer : buffers) allocator.Free(move(buffer));
+      for (BufferHandle& buffer : buffers) allocator->Free(move(buffer));
     }
   } else {
-    allocator.Free(move(buffers->back()));
+    allocator->Free(move(buffers->back()));
   }
   buffers->pop_back();
 }
@@ -392,7 +394,7 @@ void FreeListBenchmarkThread(int thread_id, int num_operations,
     ops_done += num_ops;
   }
 
-  for (BufferHandle& buffer : buffers) allocator.Free(move(buffer));
+  for (BufferHandle& buffer : buffers) allocator->Free(move(buffer));
 }
 
 /// Execute the benchmark with the BenchmarkParams passed via 'data'.
@@ -421,7 +423,7 @@ void FreeListBenchmark(int batch_size, void* data) {
   for (LockedList* free_list : free_lists) {
     vector<BufferHandle> buffers =
         free_list->list.GetBuffersToFree(free_list->list.Size());
-    for (BufferHandle& buffer : buffers) allocator.Free(move(buffer));
+    for (BufferHandle& buffer : buffers) allocator->Free(move(buffer));
   }
 }
 
@@ -429,6 +431,7 @@ int main(int argc, char** argv) {
   CpuInfo::Init();
   cout << endl << Benchmark::GetMachineInfo() << endl << endl;
 
+  allocator = new SystemAllocator(64 * 1024);
   ObjectPool pool;
 
   for (int work_iterations : {0, 1, 2, 4}) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/26a3d04b/be/src/benchmarks/hash-benchmark.cc
----------------------------------------------------------------------
diff --git a/be/src/benchmarks/hash-benchmark.cc b/be/src/benchmarks/hash-benchmark.cc
index 5f3a8d8..1b56b7b 100644
--- a/be/src/benchmarks/hash-benchmark.cc
+++ b/be/src/benchmarks/hash-benchmark.cc
@@ -23,10 +23,12 @@
 #include <boost/functional/hash.hpp>
 
 #include "codegen/llvm-codegen.h"
+#include "common/init.h"
 #include "experiments/data-provider.h"
 #include "runtime/mem-tracker.h"
 #include "runtime/string-value.h"
 #include "runtime/test-env.h"
+#include "service/fe-support.h"
 #include "util/benchmark.h"
 #include "util/cpu-info.h"
 #include "util/hash-util.h"
@@ -416,6 +418,8 @@ Function* CodegenCrcHash(LlvmCodeGen* codegen, bool mixed) {
 int main(int argc, char **argv) {
   CpuInfo::Init();
   cout << Benchmark::GetMachineInfo() << endl;
+  impala::InitCommonRuntime(argc, argv, true, impala::TestInfo::BE_TEST);
+  impala::InitFeSupport();
   LlvmCodeGen::InitializeLlvm();
 
   const int NUM_ROWS = 1024;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/26a3d04b/be/src/benchmarks/lock-benchmark.cc
----------------------------------------------------------------------
diff --git a/be/src/benchmarks/lock-benchmark.cc b/be/src/benchmarks/lock-benchmark.cc
index f668f69..fbb948f 100644
--- a/be/src/benchmarks/lock-benchmark.cc
+++ b/be/src/benchmarks/lock-benchmark.cc
@@ -189,7 +189,7 @@ int main(int argc, char **argv) {
   int64_t N = 10000L;
   const int max_producers = 12;
-  Benchmark suite("locking");
+  Benchmark suite("locking", /* micro = */ false);
   TestData data[max_producers + 1];
 
   for (int i = 0; i <= max_producers; i += 2) {
     if (i == 0) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/26a3d04b/be/src/util/benchmark-test.cc
----------------------------------------------------------------------
diff --git a/be/src/util/benchmark-test.cc b/be/src/util/benchmark-test.cc
index 6597387..6e330db 100644
--- a/be/src/util/benchmark-test.cc
+++ b/be/src/util/benchmark-test.cc
@@ -39,7 +39,7 @@ struct MemcpyData {
 class BenchmarkTest {
  public:
   static double Measure(Benchmark::BenchmarkFunction fn, void* data) {
-    return Benchmark::Measure(fn, data, 50, 10);
+    return Benchmark::Measure(fn, data, 50, 10, true);
   }
 };

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/26a3d04b/be/src/util/benchmark.cc
----------------------------------------------------------------------
diff --git a/be/src/util/benchmark.cc b/be/src/util/benchmark.cc
index 77b046e..43570a6 100644
--- a/be/src/util/benchmark.cc
+++ b/be/src/util/benchmark.cc
@@ -19,6 +19,10 @@
 #include <iostream>
 #include <sstream>
 
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <unistd.h>
+
 #include "util/benchmark.h"
 #include "util/cpu-info.h"
 #include "util/debug-util.h"
@@ -28,43 +32,89 @@
 
 namespace impala {
 
+// Private measurement function. This function is a bit unusual in that it
+// throws exceptions; the intention is to abort the measurement when an unreliable
+// result is detected, but to also provide some useful context as to which benchmark
+// was being run. The method used to determine whether the measurement is unreliable
+// is detecting context switches using getrusage(). Pass micro_heuristics=false to
+// the class constructor if you do not want this behavior.
 double Benchmark::Measure(BenchmarkFunction function, void* args,
-    int max_time, int batch_size) {
+    int max_time, int batch_size, bool micro_heuristics) {
   int64_t target_cycles = CpuInfo::cycles_per_ms() * max_time;
   int64_t iters = 0;
+  struct rusage ru_start;
+  struct rusage ru_stop;
+  StopWatch sw;
+  int64_t lost_time = 0;
 
   // Run it with the default batch size to roughly estimate how many iterations
-  // it will take
-  StopWatch sw;
-  sw.Start();
-  function(batch_size, args);
-  sw.Stop();
-  iters += batch_size;
-
-  if (sw.ElapsedTime() < target_cycles) {
-    int64_t iters_guess = (target_cycles / sw.ElapsedTime()) * batch_size;
-    // Shoot for 110% of the guess. Going a little over is not a big deal.
-    iters_guess *= 1.1;
-    // Modify the batch size based on the guess. We ran the function a small number
-    // of times to estimate how fast the function is. Run the remaining iterations at
-    // in 20% increments.
-    // TODO: we can make this more sophisticated if need to be dynamically ramp up and
-    // ramp down the sizes.
-    batch_size = max<int>(1, (iters_guess - iters) / 5);
+  // it will take.
+  for (;;) {
+    int64 begin_time = sw.ElapsedTime();
+    if (micro_heuristics) getrusage(RUSAGE_THREAD, &ru_start);
+    sw.Start();
+    function(batch_size, args);
+    sw.Stop();
+    if (micro_heuristics) getrusage(RUSAGE_THREAD, &ru_stop);
+    if (!micro_heuristics || ru_stop.ru_nivcsw == ru_start.ru_nivcsw) {
+      iters = batch_size;
+      break;
+    }
+
+    // Taking too long and we keep getting switched out; either the machine is busy,
+    // or the benchmark takes too long to run and should not be used with this
+    // microbenchmark suite. Bail.
+    if (sw.ElapsedTime() > target_cycles) {
+      throw std::runtime_error("Benchmark failed to complete due to context switching.");
+    }
+
+    // Divide the batch size by the number of context switches until we find a size
+    // small enough that we don't switch
+    batch_size = max<int>(1, batch_size / (1+(ru_stop.ru_nivcsw - ru_start.ru_nivcsw)));
+    lost_time += sw.ElapsedTime() - begin_time;
   }
 
+  double iters_guess = (target_cycles / (sw.ElapsedTime() - lost_time)) * batch_size;
+  // Shoot for 110% of the guess. Going a little over is not a big deal.
+  iters_guess *= 1.1;
+  // Modify the batch size based on the guess. We ran the function a small number
+  // of times to estimate how fast the function is. Run the remaining iterations at
+  // in 20% increments.
+  batch_size = max<int>(1, (iters_guess - iters) / 5);
+
   while (sw.ElapsedTime() < target_cycles) {
+    int64 begin_time = sw.ElapsedTime();
+    if (micro_heuristics) getrusage(RUSAGE_THREAD, &ru_start);
     sw.Start();
     function(batch_size, args);
     sw.Stop();
-    iters += batch_size;
+    if (micro_heuristics) getrusage(RUSAGE_THREAD, &ru_stop);
+    if (!micro_heuristics || ru_stop.ru_nivcsw == ru_start.ru_nivcsw) {
+      iters += batch_size;
+    } else {
+      // We could have a vastly different estimate for batch size now and might have
+      // started context switching again. Divide down by 1 + the number of context
+      // switches as a guess of the number of iterations to perform with each batch.
+      lost_time += sw.ElapsedTime() - begin_time;
+      batch_size = max<int>(1, batch_size / (1+(ru_stop.ru_nivcsw - ru_start.ru_nivcsw)));
+    }
+  }
+
+  // Arbitrary fudge factor for throwing in the towel - we give up if > 90% of
+  // measurements were dropped.
+  if (lost_time > 10 * sw.ElapsedTime()) {
+    throw std::runtime_error("Benchmark failed to complete due to noisy measurements.");
+  }
+  if (lost_time > sw.ElapsedTime()) {
+    LOG(WARNING) << "More than 50% of benchmark time lost due to context switching.";
   }
 
-  double ms_elapsed = sw.ElapsedTime() / CpuInfo::cycles_per_ms();
+  double ms_elapsed = (sw.ElapsedTime() - lost_time) / CpuInfo::cycles_per_ms();
   return iters / ms_elapsed;
 }
 
-Benchmark::Benchmark(const string& name) : name_(name) {
+Benchmark::Benchmark(const string& name, bool micro_heuristics) : name_(name),
+    micro_heuristics_(micro_heuristics) {
 #ifndef NDEBUG
   LOG(ERROR) << "WARNING: Running benchmark in DEBUG mode.";
 #endif
@@ -114,11 +164,19 @@ string Benchmark::Measure(int max_time, int initial_batch_size) {
       3 * percentile_out_width + padding;
 
   stringstream ss;
-  for (int j = 0; j < NUM_REPS; ++j) {
-    for (int i = 0; i < benchmarks_.size(); ++i) {
-      benchmarks_[i].rates.push_back(
-          Measure(benchmarks_[i].fn, benchmarks_[i].args, max_time, initial_batch_size));
+  try {
+    for (int j = 0; j < NUM_REPS; ++j) {
+      for (int i = 0; i < benchmarks_.size(); ++i) {
+        benchmarks_[i].rates.push_back(
+            Measure(benchmarks_[i].fn, benchmarks_[i].args, max_time, initial_batch_size,
+                micro_heuristics_));
+      }
     }
+  } catch (std::exception& e) {
+    LOG(ERROR) << e.what();
+    LOG(WARNING) << "Exiting silently from " << name_ <<
+        " to avoid spurious test failure.";
+    _exit(0);
   }
 
   ss << name_ << ":"

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/26a3d04b/be/src/util/benchmark.h
----------------------------------------------------------------------
diff --git a/be/src/util/benchmark.h b/be/src/util/benchmark.h
index e7cb53c..7a8a7f2 100644
--- a/be/src/util/benchmark.h
+++ b/be/src/util/benchmark.h
@@ -34,7 +34,11 @@ namespace impala {
 class Benchmark {
  public:
   /// Name of the microbenchmark. This is outputted in the result.
-  Benchmark(const std::string& name);
+  /// micro_heuristics is a bool argument which indicates whether micro benchmark
+  /// style should be used; that is, we look for a set of runs which doesn't context
+  /// switch so that we can measure pure userland code performance, as opposed to a
+  /// more complex benchmark that might issue blocking system calls.
+  Benchmark(const std::string& name, bool micro_heuristics = true);
 
   /// Function to benchmark. The function should run iters time (to minimize function
   /// call overhead). The second argument is opaque and is whatever data the test
@@ -65,7 +69,7 @@ class Benchmark {
   /// Benchmarks the 'function' returning the result as invocations per ms.
   /// args is an opaque argument passed as the second argument to the function.
   static double Measure(BenchmarkFunction function, void* args, int max_time,
-      int initial_batch_size);
+      int initial_batch_size, bool micro);
 
   struct BenchmarkResult {
     std::string name;
@@ -77,6 +81,7 @@ class Benchmark {
 
   std::string name_;
   std::vector<BenchmarkResult> benchmarks_;
+  bool micro_heuristics_;
 };
 
 }
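For benchmarks that block by design (locks, blocking system calls), the new
micro_heuristics argument can be turned off. A hypothetical driver sketch
(names and boilerplate are illustrative; lock-benchmark.cc above shows the
real call in this change):

    #include <iostream>
    #include "util/benchmark.h"
    #include "util/cpu-info.h"

    int main(int argc, char** argv) {
      impala::CpuInfo::Init();
      // Passing false keeps the old behavior: involuntary context switches are
      // not treated as a reason to abort the measurement.
      impala::Benchmark suite("blocking-workload", /* micro_heuristics = */ false);
      // ... suite.AddBenchmark(...) calls would go here ...
      std::cout << suite.Measure() << std::endl;
      return 0;
    }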
