Hi Keith,

You are right. The main purpose of this patch is to speed up OSMesa rendering,
as there is no llvmpipe target at the moment. Also, llvmpipe is currently
missing some important features, such as AA/FSAA and anisotropic filtering,
which are available in swrast now.
So I need to stick with the old rasterizer for the moment, with some
improvements.

Andreas

-----Ursprüngliche Nachricht-----
Von: Keith Whitwell [mailto:kei...@vmware.com] 
Gesendet: Mittwoch, 10. August 2011 11:17
An: Andreas Fänger
Cc: mesa-dev@lists.freedesktop.org
Betreff: Re: [Mesa-dev] [PATCH] swrast: initial multi-threaded span rendering

I'm not sure it makes a lot of sense to be optimizing swrast at this
stage.  Take a look at llvmpipe and perhaps consider improving the
multithreading already in place in that rasterizer, which is far better
optimized than swrast already.

Keith

On Wed, 2011-08-10 at 08:07 +0000, Andreas Fänger wrote:
> Optional parallel rendering of spans using OpenMP.
> Initial implementation for aa triangles. A new option for scons is
> also provided to activate the openmp support (off by default).
> ---
>  common.py                      |    1 +
>  scons/gallium.py               |   12 +++++++
>  src/mesa/swrast/s_aatritemp.h  |   68 ++++++++++++++++++++++-----------------
>  src/mesa/swrast/s_context.c    |   26 ++++++++++++---
>  src/mesa/swrast/s_texcombine.c |    4 ++
>  src/mesa/tnl/t_pipeline.c      |   12 +++++++
>  6 files changed, 87 insertions(+), 36 deletions(-)
> 
> diff --git a/common.py b/common.py
> index 8657030..cfee1b5 100644
> --- a/common.py
> +++ b/common.py
> @@ -88,6 +88,7 @@ def AddOptions(opts):
>       opts.Add('toolchain', 'compiler toolchain', default_toolchain)
>       opts.Add(BoolOption('gles', 'EXPERIMENTAL: enable OpenGL ES support', 
> 'no'))
>       opts.Add(BoolOption('llvm', 'use LLVM', default_llvm))
> +     opts.Add(BoolOption('openmp', 'EXPERIMENTAL: compile with openmp 
> (swrast)', 'no'))
>       opts.Add(BoolOption('debug', 'DEPRECATED: debug build', 'yes'))
>       opts.Add(BoolOption('profile', 'DEPRECATED: profile build', 'no'))
>       opts.Add(BoolOption('quiet', 'DEPRECATED: profile build', 'yes'))
> diff --git a/scons/gallium.py b/scons/gallium.py
> index 8cd3bc7..7135251 100755
> --- a/scons/gallium.py
> +++ b/scons/gallium.py
> @@ -596,6 +596,18 @@ def generate(env):
>          libs += ['m', 'pthread', 'dl']
>      env.Append(LIBS = libs)
>  
> +    # OpenMP
> +    if env['openmp']:
> +        if env['msvc']:
> +            env.Append(CCFLAGS = ['/openmp'])
> +            # When building openmp release VS2008 link.exe crashes with 
> LNK1103 error.
> +            # Workaround: overwrite PDB flags with empty value as it isn't 
> required anyways
> +            if env['build'] == 'release':
> +                env['PDB'] = ''
> +        if env['gcc']:
> +            env.Append(CCFLAGS = ['-fopenmp'])
> +            env.Append(LIBS = ['gomp'])
> +
>      # Load tools
>      env.Tool('lex')
>      env.Tool('yacc')
> diff --git a/src/mesa/swrast/s_aatritemp.h b/src/mesa/swrast/s_aatritemp.h
> index 91d4f7a..005d12c 100644
> --- a/src/mesa/swrast/s_aatritemp.h
> +++ b/src/mesa/swrast/s_aatritemp.h
> @@ -181,13 +181,18 @@
>        const GLfloat *pMax = vMax->attrib[FRAG_ATTRIB_WPOS];
>        const GLfloat dxdy = majDx / majDy;
>        const GLfloat xAdj = dxdy < 0.0F ? -dxdy : 0.0F;
> -      GLfloat x = pMin[0] - (yMin - iyMin) * dxdy;
>        GLint iy;
> -      for (iy = iyMin; iy < iyMax; iy++, x += dxdy) {
> +      #pragma omp parallel for schedule(dynamic) private(iy) 
> firstprivate(span)
> +      for (iy = iyMin; iy < iyMax; iy++) {
> +         GLfloat x = pMin[0] - (yMin - iy) * dxdy;
>           GLint ix, startX = (GLint) (x - xAdj);
>           GLuint count;
>           GLfloat coverage = 0.0F;
>  
> +#ifdef _OPENMP
> +         /* each thread needs to use a different (global) SpanArrays 
> variable */
> +         span.array = SWRAST_CONTEXT(ctx)->SpanArrays + omp_get_thread_num();
> +#endif
>           /* skip over fragments with zero coverage */
>           while (startX < MAX_WIDTH) {
>              coverage = compute_coveragef(pMin, pMid, pMax, startX, iy);
> @@ -228,13 +233,12 @@
>              coverage = compute_coveragef(pMin, pMid, pMax, ix, iy);
>           }
>           
> -         if (ix <= startX)
> -            continue;
> -         
> -         span.x = startX;
> -         span.y = iy;
> -         span.end = (GLuint) ix - (GLuint) startX;
> -         _swrast_write_rgba_span(ctx, &span);
> +         if (ix > startX) {
> +            span.x = startX;
> +            span.y = iy;
> +            span.end = (GLuint) ix - (GLuint) startX;
> +            _swrast_write_rgba_span(ctx, &span);
> +         }
>        }
>     }
>     else {
> @@ -244,13 +248,18 @@
>        const GLfloat *pMax = vMax->attrib[FRAG_ATTRIB_WPOS];
>        const GLfloat dxdy = majDx / majDy;
>        const GLfloat xAdj = dxdy > 0 ? dxdy : 0.0F;
> -      GLfloat x = pMin[0] - (yMin - iyMin) * dxdy;
>        GLint iy;
> -      for (iy = iyMin; iy < iyMax; iy++, x += dxdy) {
> +      #pragma omp parallel for schedule(dynamic) private(iy) 
> firstprivate(span)
> +      for (iy = iyMin; iy < iyMax; iy++) {
> +         GLfloat x = pMin[0] - (yMin - iy) * dxdy;
>           GLint ix, left, startX = (GLint) (x + xAdj);
>           GLuint count, n;
>           GLfloat coverage = 0.0F;
>           
> +#ifdef _OPENMP
> +         /* each thread needs to use a different (global) SpanArrays 
> variable */
> +         span.array = SWRAST_CONTEXT(ctx)->SpanArrays + omp_get_thread_num();
> +#endif
>           /* make sure we're not past the window edge */
>           if (startX >= ctx->DrawBuffer->_Xmax) {
>              startX = ctx->DrawBuffer->_Xmax - 1;
> @@ -296,31 +305,30 @@
>           ATTRIB_LOOP_END
>  #endif
>  
> -         if (startX <= ix)
> -            continue;
> +         if (startX > ix) {
> +            n = (GLuint) startX - (GLuint) ix;
>  
> -         n = (GLuint) startX - (GLuint) ix;
> +            left = ix + 1;
>  
> -         left = ix + 1;
> -
> -         /* shift all values to the left */
> -         /* XXX this is temporary */
> -         {
> -            SWspanarrays *array = span.array;
> -            GLint j;
> -            for (j = 0; j < (GLint) n; j++) {
> -               array->coverage[j] = array->coverage[j + left];
> -               COPY_CHAN4(array->rgba[j], array->rgba[j + left]);
> +            /* shift all values to the left */
> +            /* XXX this is temporary */
> +            {
> +               SWspanarrays *array = span.array;
> +               GLint j;
> +               for (j = 0; j < (GLint) n; j++) {
> +                  array->coverage[j] = array->coverage[j + left];
> +                  COPY_CHAN4(array->rgba[j], array->rgba[j + left]);
>  #ifdef DO_Z
> -               array->z[j] = array->z[j + left];
> +                  array->z[j] = array->z[j + left];
>  #endif
> +               }
>              }
> -         }
>  
> -         span.x = left;
> -         span.y = iy;
> -         span.end = n;
> -         _swrast_write_rgba_span(ctx, &span);
> +            span.x = left;
> +            span.y = iy;
> +            span.end = n;
> +            _swrast_write_rgba_span(ctx, &span);
> +         }
>        }
>     }
>  }
> diff --git a/src/mesa/swrast/s_context.c b/src/mesa/swrast/s_context.c
> index def1531..4434f11 100644
> --- a/src/mesa/swrast/s_context.c
> +++ b/src/mesa/swrast/s_context.c
> @@ -772,6 +772,11 @@ _swrast_CreateContext( struct gl_context *ctx )
>  {
>     GLuint i;
>     SWcontext *swrast = (SWcontext *)CALLOC(sizeof(SWcontext));
> +#ifdef _OPENMP
> +   const GLint maxThreads = omp_get_max_threads();
> +#else
> +   const GLint maxThreads = 1;
> +#endif
>  
>     if (SWRAST_DEBUG) {
>        _mesa_debug(ctx, "_swrast_CreateContext\n");
> @@ -806,19 +811,25 @@ _swrast_CreateContext( struct gl_context *ctx )
>     for (i = 0; i < MAX_TEXTURE_IMAGE_UNITS; i++)
>        swrast->TextureSample[i] = NULL;
>  
> -   swrast->SpanArrays = MALLOC_STRUCT(sw_span_arrays);
> +   /* SpanArrays is global and shared by all SWspan instances. However, when
> +    * using multiple threads, it is necessary to have one SpanArrays instance
> +    * per thread.
> +    */
> +   swrast->SpanArrays = (SWspanarrays *) MALLOC(maxThreads * 
> sizeof(SWspanarrays));
>     if (!swrast->SpanArrays) {
>        FREE(swrast);
>        return GL_FALSE;
>     }
> -   swrast->SpanArrays->ChanType = CHAN_TYPE;
> +   for(i = 0; i < maxThreads; i++) {
> +      swrast->SpanArrays[i].ChanType = CHAN_TYPE;
>  #if CHAN_TYPE == GL_UNSIGNED_BYTE
> -   swrast->SpanArrays->rgba = swrast->SpanArrays->rgba8;
> +      swrast->SpanArrays[i].rgba = swrast->SpanArrays[i].rgba8;
>  #elif CHAN_TYPE == GL_UNSIGNED_SHORT
> -   swrast->SpanArrays->rgba = swrast->SpanArrays->rgba16;
> +      swrast->SpanArrays[i].rgba = swrast->SpanArrays[i].rgba16;
>  #else
> -   swrast->SpanArrays->rgba = swrast->SpanArrays->attribs[FRAG_ATTRIB_COL0];
> +      swrast->SpanArrays[i].rgba = 
> swrast->SpanArrays[i].attribs[FRAG_ATTRIB_COL0];
>  #endif
> +   }
>  
>     /* init point span buffer */
>     swrast->PointSpan.primitive = GL_POINT;
> @@ -826,7 +837,10 @@ _swrast_CreateContext( struct gl_context *ctx )
>     swrast->PointSpan.facing = 0;
>     swrast->PointSpan.array = swrast->SpanArrays;
>  
> -   swrast->TexelBuffer = (GLfloat *) MALLOC(ctx->Const.MaxTextureImageUnits *
> +   /* TexelBuffer is also global and normally shared by all SWspan instances;
> +    * when running with multiple threads, create one per thread.
> +    */
> +   swrast->TexelBuffer = (GLfloat *) MALLOC(ctx->Const.MaxTextureImageUnits 
> * maxThreads *
>                                             MAX_WIDTH * 4 * sizeof(GLfloat));
>     if (!swrast->TexelBuffer) {
>        FREE(swrast->SpanArrays);
> diff --git a/src/mesa/swrast/s_texcombine.c b/src/mesa/swrast/s_texcombine.c
> index 086ed0b..80b9dff 100644
> --- a/src/mesa/swrast/s_texcombine.c
> +++ b/src/mesa/swrast/s_texcombine.c
> @@ -48,7 +48,11 @@ typedef float (*float4_array)[4];
>  static INLINE float4_array
>  get_texel_array(SWcontext *swrast, GLuint unit)
>  {
> +#ifdef _OPENMP
> +   return (float4_array) (swrast->TexelBuffer + unit * MAX_WIDTH * 4 * 
> omp_get_num_threads() + (MAX_WIDTH * 4 * omp_get_thread_num()));
> +#else
>     return (float4_array) (swrast->TexelBuffer + unit * MAX_WIDTH * 4);
> +#endif
>  }
>  
> 
> diff --git a/src/mesa/tnl/t_pipeline.c b/src/mesa/tnl/t_pipeline.c
> index 18f095f..881d5d5 100644
> --- a/src/mesa/tnl/t_pipeline.c
> +++ b/src/mesa/tnl/t_pipeline.c
> @@ -146,7 +146,17 @@ void _tnl_run_pipeline( struct gl_context *ctx )
>        _tnl_notify_pipeline_output_change( ctx );
>     }
>  
> +#ifndef _OPENMP
> +   /* Don't adjust FPU precision mode in case multiple threads are to be 
> used.
> +    * This would require that the additional threads also changed the FPU 
> mode
> +    * which is quite a mess as this had to be done in all parallelized 
> sections;
> +    * otherwise the master thread and all other threads are running in 
> different
> +    * modes, producing inconsistent results.
> +    * Note that all x64 implementations don't define/use START_FAST_MATH, so
> +    * this "hack" is only used in i386 mode
> +    */
>     START_FAST_MATH(__tmp);
> +#endif
>  
>     for (i = 0; i < tnl->pipeline.nr_stages ; i++) {
>        struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i];
> @@ -154,7 +164,9 @@ void _tnl_run_pipeline( struct gl_context *ctx )
>        break;
>     }
>  
> +#ifndef _OPENMP
>     END_FAST_MATH(__tmp);
> +#endif
>  }
>  
> 



_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to