Commit: a9ad15a88f129a0593432e1217610ac840687497 Author: Sergey Sharybin Date: Mon Sep 20 15:49:10 2021 +0200 Branches: cycles-x https://developer.blender.org/rBa9ad15a88f129a0593432e1217610ac840687497
WIP: Cycles X: Schedule work based on occupancy Sacrifice refresh interval and increase GPU occupancy, lowering the final render time. Lowers `Blender 2.80 - Spring` demo file from predicted 30min (measured 3 min to render 1/10th of samples) to about 7.5min. It is still considerably higher than the master branch, which finishes the file in just below 3 min, but it is already a better result. The timing is from RTX 5000. The viewport and CPU rendering should stay unaffected by this change. Differential Revision: https://developer.blender.org/D12570 =================================================================== M intern/cycles/integrator/path_trace.cpp M intern/cycles/integrator/path_trace_work.h M intern/cycles/integrator/path_trace_work_cpu.cpp M intern/cycles/integrator/path_trace_work_cpu.h M intern/cycles/integrator/path_trace_work_gpu.cpp M intern/cycles/integrator/path_trace_work_gpu.h M intern/cycles/integrator/render_scheduler.cpp M intern/cycles/integrator/render_scheduler.h M intern/cycles/integrator/work_balancer.h =================================================================== diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp index 55f050d7833..5ce75464acd 100644 --- a/intern/cycles/integrator/path_trace.cpp +++ b/intern/cycles/integrator/path_trace.cpp @@ -351,14 +351,32 @@ void PathTrace::path_trace(RenderWork &render_work) const double start_time = time_dt(); const int num_works = path_trace_works_.size(); + tbb::parallel_for(0, num_works, [&](int i) { const double work_start_time = time_dt(); + const int num_samples = render_work.path_trace.num_samples; + PathTraceWork *path_trace_work = path_trace_works_[i].get(); - path_trace_work->render_samples(render_work.path_trace.start_sample, - render_work.path_trace.num_samples); - work_balance_infos_[i].time_spent += time_dt() - work_start_time; + + PathTraceWork::RenderStatistics statistics; + path_trace_work->render_samples(statistics, 
render_work.path_trace.start_sample, num_samples); + + const double work_time = time_dt() - work_start_time; + work_balance_infos_[i].time_spent += work_time; + work_balance_infos_[i].occupancy = statistics.occupancy; + + VLOG(3) << "Rendered " << num_samples << " samples in " << work_time << " seconds (" + << work_time / num_samples + << " seconds per sample), occupancy: " << statistics.occupancy; }); + float occupancy_accum = 0.0f; + for (const WorkBalanceInfo &balance_info : work_balance_infos_) { + occupancy_accum += balance_info.occupancy; + } + const float occupancy = occupancy_accum / num_works; + render_scheduler_.report_path_trace_occupancy(render_work, occupancy); + render_scheduler_.report_path_trace_time( render_work, time_dt() - start_time, is_cancel_requested()); } diff --git a/intern/cycles/integrator/path_trace_work.h b/intern/cycles/integrator/path_trace_work.h index ca64c1c2ffd..97b97f3d888 100644 --- a/intern/cycles/integrator/path_trace_work.h +++ b/intern/cycles/integrator/path_trace_work.h @@ -33,6 +33,10 @@ class RenderBuffers; class PathTraceWork { public: + struct RenderStatistics { + float occupancy = 1.0f; + }; + /* Create path trace work which fits best the device. * * The cancel request flag is used for a cheap check whether cancel is to berformed as soon as @@ -71,7 +75,7 @@ class PathTraceWork { /* Render given number of samples as a synchronous blocking call. * The samples are added to the render buffer associated with this work. */ - virtual void render_samples(int start_sample, int samples_num) = 0; + virtual void render_samples(RenderStatistics &statistics, int start_sample, int samples_num) = 0; /* Copy render result from this work to the corresponding place of the GPU display. 
* diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp index eaed0d0d636..b9a33b64051 100644 --- a/intern/cycles/integrator/path_trace_work_cpu.cpp +++ b/intern/cycles/integrator/path_trace_work_cpu.cpp @@ -67,7 +67,9 @@ void PathTraceWorkCPU::init_execution() device_->get_cpu_kernel_thread_globals(kernel_thread_globals_); } -void PathTraceWorkCPU::render_samples(int start_sample, int samples_num) +void PathTraceWorkCPU::render_samples(RenderStatistics &statistics, + int start_sample, + int samples_num) { const int64_t image_width = effective_buffer_params_.width; const int64_t image_height = effective_buffer_params_.height; @@ -106,6 +108,8 @@ void PathTraceWorkCPU::render_samples(int start_sample, int samples_num) for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) { kernel_globals.stop_profiling(); } + + statistics.occupancy = 1.0f; } void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobals *kernel_globals, diff --git a/intern/cycles/integrator/path_trace_work_cpu.h b/intern/cycles/integrator/path_trace_work_cpu.h index 0ea901e452d..ab729bbf879 100644 --- a/intern/cycles/integrator/path_trace_work_cpu.h +++ b/intern/cycles/integrator/path_trace_work_cpu.h @@ -46,7 +46,9 @@ class PathTraceWorkCPU : public PathTraceWork { virtual void init_execution() override; - virtual void render_samples(int start_sample, int samples_num) override; + virtual void render_samples(RenderStatistics &statistics, + int start_sample, + int samples_num) override; virtual void copy_to_gpu_display(GPUDisplay *gpu_display, PassMode pass_mode, diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp index 7f15237ddbf..10baf869aa6 100644 --- a/intern/cycles/integrator/path_trace_work_gpu.cpp +++ b/intern/cycles/integrator/path_trace_work_gpu.cpp @@ -180,7 +180,9 @@ void PathTraceWorkGPU::init_execution() "__integrator_state", &integrator_state_gpu_, 
sizeof(integrator_state_gpu_)); } -void PathTraceWorkGPU::render_samples(int start_sample, int samples_num) +void PathTraceWorkGPU::render_samples(RenderStatistics &statistics, + int start_sample, + int samples_num) { /* Limit number of states for the tile and rely on a greedy scheduling of tiles. This allows to * add more work (because tiles are smaller, so there is higher chance that more paths will @@ -192,6 +194,9 @@ void PathTraceWorkGPU::render_samples(int start_sample, int samples_num) enqueue_reset(); + int num_iterations = 0; + uint64_t num_busy_accum = 0; + /* TODO: set a hard limit in case of undetected kernel failures? */ while (true) { /* Enqueue work from the scheduler, on start or when there are not enough @@ -228,7 +233,12 @@ void PathTraceWorkGPU::render_samples(int start_sample, int samples_num) if (is_cancel_requested()) { break; } + + num_busy_accum += get_num_active_paths(); + ++num_iterations; } + + statistics.occupancy = static_cast<float>(num_busy_accum) / num_iterations / max_num_paths_; } DeviceKernel PathTraceWorkGPU::get_most_queued_kernel() const diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h index aee54d4a372..38788122b0d 100644 --- a/intern/cycles/integrator/path_trace_work_gpu.h +++ b/intern/cycles/integrator/path_trace_work_gpu.h @@ -44,7 +44,9 @@ class PathTraceWorkGPU : public PathTraceWork { virtual void alloc_work_memory() override; virtual void init_execution() override; - virtual void render_samples(int start_sample, int samples_num) override; + virtual void render_samples(RenderStatistics &statistics, + int start_sample, + int samples_num) override; virtual void copy_to_gpu_display(GPUDisplay *gpu_display, PassMode pass_mode, diff --git a/intern/cycles/integrator/render_scheduler.cpp b/intern/cycles/integrator/render_scheduler.cpp index 50017daca38..4eb1dd941f9 100644 --- a/intern/cycles/integrator/render_scheduler.cpp +++ 
b/intern/cycles/integrator/render_scheduler.cpp @@ -155,6 +155,9 @@ void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples) state_.end_render_time = 0.0; state_.time_limit_reached = false; + state_.occupancy_num_samples = 0; + state_.occupancy = 1.0f; + first_render_time_.path_trace_per_sample = 0.0; first_render_time_.denoise_time = 0.0; first_render_time_.display_update_time = 0.0; @@ -475,6 +478,13 @@ void RenderScheduler::report_path_trace_time(const RenderWork &render_work, VLOG(4) << "Average path tracing time: " << path_trace_time_.get_average() << " seconds."; } +void RenderScheduler::report_path_trace_occupancy(const RenderWork &render_work, float occupancy) +{ + state_.occupancy_num_samples = render_work.path_trace.num_samples; + state_.occupancy = occupancy; + VLOG(4) << "Measured path tracing occupancy: " << occupancy; +} + void RenderScheduler::report_adaptive_filter_time(const RenderWork &render_work, double time, bool is_cancelled) @@ -803,8 +813,23 @@ int RenderScheduler::get_num_samples_to_path_trace() const * more than N samples. */ const int num_samples_pot = round_num_samples_to_power_of_2(num_samples_per_update); - const int num_samples_to_render = min(num_samples_pot, - start_sample_ + num_samples_ - path_trace_start_sample); + const int max_num_samples_to_render = start_sample_ + num_samples_ - path_trace_start_sample; + + int num_samples_to_render = min(num_samples_pot, max_num_samples_to_render); + + /* When enough statistics is available and doing an offlien rendering prefer to keep device + * occupied. */ + if (state_.occupancy_num_samples && (background_ || headless_)) { + /* Keep occupancy at about 0.5 (this is more of an empirical figure which seems to match scenes + * with good performance without forcing occupancy to be higher). 
*/ + int num_samples_to_occupy = state_.occupancy_num_samples; + if (state_.occupancy < 0.5f) { + num_samples_to_occupy = lround(state_.occupancy_num_samples * 0.7f / state_.occupancy); + } + + num_samples_to_render = max(num_samples_to_render, + min(num_samples_to_occupy, max_num_samples_to_render)); + } /* If adaptive sampling is not use, render as many samples per update as possible, keeping the * device fully occupied, without much overhead of display updates. */ diff --git a/intern/cycles/integrator/render_scheduler.h b/intern/cycles/integrator/render_scheduler.h index 10fbcc52cd6..9c2d107e46d 100644 --- a/intern/cycles/integrator/render_scheduler.h +++ b/intern/cycles/integrator/render_scheduler.h @@ -186,6 +186,7 @@ class RenderScheduler { /* Report time (in seconds) which corresponding part of work took. */ void report_path_trace_time(const RenderWork &render_work, double time, bool is_cancelled); + void report_path_trace_occupancy(const Ren @@ Diff output truncated at 10240 characters. @@ _______________________________________________ Bf-blender-cvs mailing list [email protected] List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
