From: Marek Olšák <marek.ol...@amd.com>

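The shader compiler queue was limited to 3 threads (and the low-priority
queue to 2), so shader-db running on a 16-thread CPU was bottlenecked by
the queue. Scale both queues with the number of hardware threads instead,
up to 24 normal and 10 low-priority compiler threads.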
---
 src/gallium/drivers/radeonsi/si_pipe.c | 39 +++++++++++++++++---------
 src/gallium/drivers/radeonsi/si_pipe.h |  6 ++--
 2 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index f1f1e3ad890..d044b191b71 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -848,21 +848,21 @@ static void si_disk_cache_create(struct si_screen *sscreen)
                                                  shader_debug_flags);
                        free(timestamp_str);
                }
        }
 }
 
 struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws,
                                            const struct pipe_screen_config *config)
 {
        struct si_screen *sscreen = CALLOC_STRUCT(si_screen);
-       unsigned num_threads, num_compiler_threads, num_compiler_threads_lowprio, i;
+       unsigned hw_threads, num_comp_hi_threads, num_comp_lo_threads, i;
 
        if (!sscreen) {
                return NULL;
        }
 
        sscreen->ws = ws;
        ws->query_info(ws, &sscreen->info);
 
        sscreen->debug_flags = debug_get_flags_option("R600_DEBUG",
                                                        debug_options, 0);
@@ -905,40 +905,53 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws,
        (void) mtx_init(&sscreen->gpu_load_mutex, mtx_plain);
 
        if (!si_init_gs_info(sscreen) ||
            !si_init_shader_cache(sscreen)) {
                FREE(sscreen);
                return NULL;
        }
 
        si_disk_cache_create(sscreen);
 
-       /* Only enable as many threads as we have target machines, but at most
-        * the number of CPUs - 1 if there is more than one.
-        */
-       num_threads = sysconf(_SC_NPROCESSORS_ONLN);
-       num_threads = MAX2(1, num_threads - 1);
-       num_compiler_threads = MIN2(num_threads, ARRAY_SIZE(sscreen->compiler));
-       num_compiler_threads_lowprio =
-               MIN2(num_threads, ARRAY_SIZE(sscreen->compiler_lowp));
+       /* Determine the number of shader compiler threads. */
+       hw_threads = sysconf(_SC_NPROCESSORS_ONLN);
+
+       if (hw_threads >= 12) {
+               num_comp_hi_threads = hw_threads * 3 / 4;
+               num_comp_lo_threads = hw_threads / 3;
+       } else if (hw_threads >= 6) {
+               num_comp_hi_threads = hw_threads - 2;
+               num_comp_lo_threads = hw_threads / 2;
+       } else if (hw_threads >= 2) {
+               num_comp_hi_threads = hw_threads - 1;
+               num_comp_lo_threads = hw_threads / 2;
+       } else {
+               num_comp_hi_threads = 1;
+               num_comp_lo_threads = 1;
+       }
+
+       num_comp_hi_threads = MIN2(num_comp_hi_threads,
+                                  ARRAY_SIZE(sscreen->compiler));
+       num_comp_lo_threads = MIN2(num_comp_lo_threads,
+                                  ARRAY_SIZE(sscreen->compiler_lowp));
 
        if (!util_queue_init(&sscreen->shader_compiler_queue, "si_shader",
-                            32, num_compiler_threads,
+                            64, num_comp_hi_threads,
                             UTIL_QUEUE_INIT_RESIZE_IF_FULL)) {
                si_destroy_shader_cache(sscreen);
                FREE(sscreen);
                return NULL;
        }
 
        if (!util_queue_init(&sscreen->shader_compiler_queue_low_priority,
                             "si_shader_low",
-                            32, num_compiler_threads_lowprio,
+                            64, num_comp_lo_threads,
                             UTIL_QUEUE_INIT_RESIZE_IF_FULL |
                             UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY)) {
               si_destroy_shader_cache(sscreen);
               FREE(sscreen);
               return NULL;
        }
 
        si_handle_env_var_force_family(sscreen);
 
        if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false))
@@ -1075,23 +1088,23 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws,
        sscreen->barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SMEM_L1 |
                                            SI_CONTEXT_INV_VMEM_L1;
        if (sscreen->info.chip_class <= VI) {
                sscreen->barrier_flags.cp_to_L2 |= SI_CONTEXT_INV_GLOBAL_L2;
                 sscreen->barrier_flags.L2_to_cp |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
        }
 
        if (debug_get_bool_option("RADEON_DUMP_SHADERS", false))
                sscreen->debug_flags |= DBG_ALL_SHADERS;
 
-       for (i = 0; i < num_compiler_threads; i++)
+       for (i = 0; i < num_comp_hi_threads; i++)
                si_init_compiler(sscreen, &sscreen->compiler[i]);
-       for (i = 0; i < num_compiler_threads_lowprio; i++)
+       for (i = 0; i < num_comp_lo_threads; i++)
                si_init_compiler(sscreen, &sscreen->compiler_lowp[i]);
 
        /* Create the auxiliary context. This must be done last. */
        sscreen->aux_context = si_create_context(&sscreen->b, 0);
 
        if (sscreen->debug_flags & DBG(TEST_DMA))
                si_test_dma(sscreen);
 
        if (sscreen->debug_flags & (DBG(TEST_VMFAULT_CP) |
                                      DBG(TEST_VMFAULT_SDMA) |
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index a67786c84d9..27efc5099f0 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -522,27 +522,27 @@ struct si_screen {
         * - GS and CS aren't cached, but it's certainly possible to cache
         *   those as well.
         */
        mtx_t                   shader_cache_mutex;
        struct hash_table               *shader_cache;
 
        /* Shader compiler queue for multithreaded compilation. */
        struct util_queue               shader_compiler_queue;
        /* Use at most 3 normal compiler threads on quadcore and better.
         * Hyperthreaded CPUs report the number of threads, but we want
-        * the number of cores. */
-       struct si_compiler              compiler[3]; /* used by the queue only */
+        * the number of cores. We only need this many threads for shader-db. */
+       struct si_compiler              compiler[24]; /* used by the queue only */
 
        struct util_queue               shader_compiler_queue_low_priority;
        /* Use at most 2 low priority threads on quadcore and better.
         * We want to minimize the impact on multithreaded Mesa. */
-       struct si_compiler              compiler_lowp[2]; /* at most 2 threads */
+       struct si_compiler              compiler_lowp[10];
 };
 
 struct si_blend_color {
        struct pipe_blend_color         state;
        bool                            any_nonzeros;
 };
 
 struct si_sampler_view {
        struct pipe_sampler_view        base;
         /* [0..7] = image descriptor
-- 
2.17.0

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to