On 07/18/2011 02:02 PM, Ian Romanick wrote:
> On 07/18/2011 01:54 PM, Chad Versace wrote:
>> On 07/18/2011 11:49 AM, Ian Romanick wrote:
>>> On 07/18/2011 12:55 AM, Chad Versace wrote:
>>>> Until now, the stencil buffer was allocated as a Y tiled buffer, because
>>>> in several locations the PRM states that it is. However, it is actually
>>>> W tiled. From the PRM, 2011 Sandy Bridge, Volume 1, Part 2, Section
>>>> 4.5.2.1 W-Major Format:
>>>>     W-Major Tile Format is used for separate stencil.
>>>
>>>> The GTT is incapable of W fencing, so we allocate the stencil buffer with
>>>> I915_TILING_NONE and decode the tile's layout in software.
>>>
>>>> This fix touches the following portions of code:
>>>>     - In intel_allocate_renderbuffer_storage(), allocate the stencil
>>>>       buffer with I915_TILING_NONE.
>>>>     - In intel_verify_dri2_has_hiz(), verify that the stencil buffer is
>>>>       not tiled.
>>>>     - In the stencil buffer's span functions, the tile's layout must be
>>>>       decoded in software.
>>>
>>>> This commit mutually depends on the xf86-video-intel commit
>>>>     dri: Do not tile stencil buffer
>>>>     Author: Chad Versace <[email protected]>
>>>>     Date:   Mon Jul 18 00:38:00 2011 -0700
>>>
>>>> On Gen6 with separate stencil enabled, fixes the following Piglit tests:
>>>>     bugs/fdo23670-drawpix_stencil
>>>>     general/stencil-drawpixels
>>>>     spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX16-copypixels
>>>>     spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX16-drawpixels
>>>>     spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX16-readpixels
>>>>     spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX1-copypixels
>>>>     spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX1-drawpixels
>>>>     spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX1-readpixels
>>>>     spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX4-copypixels
>>>>     spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX4-drawpixels
>>>>     spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX4-readpixels
>>>>     spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX8-copypixels
>>>>     spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX8-drawpixels
>>>>     spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX8-readpixels
>>>>     
>>>> spec/EXT_packed_depth_stencil/fbo-stencil-GL_DEPTH24_STENCIL8-copypixels
>>>>     
>>>> spec/EXT_packed_depth_stencil/fbo-stencil-GL_DEPTH24_STENCIL8-readpixels
>>>>     spec/EXT_packed_depth_stencil/readpixels-24_8
>>>
>>>> Note: This is a candidate for the 7.11 branch.
>>>
>>>> CC: Eric Anholt <[email protected]>
>>>> CC: Kenneth Graunke <[email protected]>
>>>> Signed-off-by: Chad Versace <[email protected]>
>>>> ---
>>>>  src/mesa/drivers/dri/intel/intel_clear.c   |    6 ++
>>>>  src/mesa/drivers/dri/intel/intel_context.c |    9 ++-
>>>>  src/mesa/drivers/dri/intel/intel_fbo.c     |   13 +++--
>>>>  src/mesa/drivers/dri/intel/intel_screen.h  |    9 ++-
>>>>  src/mesa/drivers/dri/intel/intel_span.c    |   85 
>>>> ++++++++++++++++++++--------
>>>>  5 files changed, 89 insertions(+), 33 deletions(-)
>>>
>>>> diff --git a/src/mesa/drivers/dri/intel/intel_clear.c 
>>>> b/src/mesa/drivers/dri/intel/intel_clear.c
>>>> index dfca03c..5ab9873 100644
>>>> --- a/src/mesa/drivers/dri/intel/intel_clear.c
>>>> +++ b/src/mesa/drivers/dri/intel/intel_clear.c
>>>> @@ -143,6 +143,12 @@ intelClear(struct gl_context *ctx, GLbitfield mask)
>>>>         */
>>>>              tri_mask |= BUFFER_BIT_STENCIL;
>>>>           }
>>>> +   else if (intel->has_separate_stencil &&
>>>> +         stencilRegion->tiling == I915_TILING_NONE) {
>>>> +      /* The stencil buffer is actually W tiled, which the hardware
>>>> +       * cannot blit to. */
>>>> +      tri_mask |= BUFFER_BIT_STENCIL;
>>>> +   }
>>>>           else {
>>>>              /* clearing all stencil bits, use blitting */
>>>>              blit_mask |= BUFFER_BIT_STENCIL;
>>>> diff --git a/src/mesa/drivers/dri/intel/intel_context.c 
>>>> b/src/mesa/drivers/dri/intel/intel_context.c
>>>> index 2ba1363..fe8be08 100644
>>>> --- a/src/mesa/drivers/dri/intel/intel_context.c
>>>> +++ b/src/mesa/drivers/dri/intel/intel_context.c
>>>> @@ -1439,7 +1439,12 @@ intel_verify_dri2_has_hiz(struct intel_context 
>>>> *intel,
>>>>        assert(stencil_rb->Base.Format == MESA_FORMAT_S8);
>>>>        assert(depth_rb && depth_rb->Base.Format == MESA_FORMAT_X8_Z24);
>>>
>>>> -      if (stencil_rb->region->tiling == I915_TILING_Y) {
>>>> +      if (stencil_rb->region->tiling == I915_TILING_NONE) {
>>>> +   /*
>>>> +    * The stencil buffer is actually W tiled. The region's tiling is
>>>> +    * I915_TILING_NONE, however, because the GTT is incapable of W
>>>> +    * fencing.
>>>> +    */
>>>>     intel->intelScreen->dri2_has_hiz = INTEL_DRI2_HAS_HIZ_TRUE;
>>>>     return;
>>>>        } else {
>>>> @@ -1527,7 +1532,7 @@ intel_verify_dri2_has_hiz(struct intel_context 
>>>> *intel,
>>>>         * Presently, however, no verification or clean up is necessary, and
>>>>         * execution should not reach here. If the framebuffer still has a 
>>>> hiz
>>>>         * region, then we have already set dri2_has_hiz to true after
>>>> -       * confirming above that the stencil buffer is Y tiled.
>>>> +       * confirming above that the stencil buffer is W tiled.
>>>>         */
>>>>        assert(0);
>>>>     }
>>>> diff --git a/src/mesa/drivers/dri/intel/intel_fbo.c 
>>>> b/src/mesa/drivers/dri/intel/intel_fbo.c
>>>> index 1669af2..507cc33 100644
>>>> --- a/src/mesa/drivers/dri/intel/intel_fbo.c
>>>> +++ b/src/mesa/drivers/dri/intel/intel_fbo.c
>>>> @@ -173,6 +173,9 @@ intel_alloc_renderbuffer_storage(struct gl_context * 
>>>> ctx, struct gl_renderbuffer
>>>
>>>>     if (irb->Base.Format == MESA_FORMAT_S8) {
>>>>        /*
>>>> +       * The stencil buffer is W tiled. However, we request from the 
>>>> kernel a
>>>> +       * non-tiled buffer because the GTT is incapable of W fencing.
>>>> +       *
>>>>         * The stencil buffer has quirky pitch requirements.  From Vol 2a,
>>>>         * 11.5.6.2.1 3DSTATE_STENCIL_BUFFER, field "Surface Pitch":
>>>>         *    The pitch must be set to 2x the value computed based on 
>>>> width, as
>>>> @@ -180,14 +183,14 @@ intel_alloc_renderbuffer_storage(struct gl_context * 
>>>> ctx, struct gl_renderbuffer
>>>>         * To accomplish this, we resort to the nasty hack of doubling the 
>>>> drm
>>>>         * region's cpp and halving its height.
>>>>         *
>>>> -       * If we neglect to double the pitch, then 
>>>> drm_intel_gem_bo_map_gtt()
>>>> -       * maps the memory incorrectly.
>>>> +       * If we neglect to double the pitch, then render corruption occurs.
>>>>         */
>>>>        irb->region = intel_region_alloc(intel->intelScreen,
>>>> -                                 I915_TILING_Y,
>>>> +                                 I915_TILING_NONE,
>>>>                                   cpp * 2,
>>>> -                                 width,
>>>> -                                 height / 2,
>>>> +                                 ALIGN(width, 64),
>>>> +                                 /* XXX: Maybe align to 128? */
>>>> +                                 ALIGN(height / 2, 64),
>>>>                                   GL_TRUE);
>>>>        if (!irb->region)
>>>>    return false;
>>>> diff --git a/src/mesa/drivers/dri/intel/intel_screen.h 
>>>> b/src/mesa/drivers/dri/intel/intel_screen.h
>>>> index b2013af..9dd6a52 100644
>>>> --- a/src/mesa/drivers/dri/intel/intel_screen.h
>>>> +++ b/src/mesa/drivers/dri/intel/intel_screen.h
>>>> @@ -63,9 +63,12 @@
>>>>   * x8_z24 and s8).
>>>>   *
>>>>   * Eventually, intel_update_renderbuffers() makes a DRI2 request for
>>>> - * DRI2BufferStencil and DRI2BufferHiz. If the returned buffers are Y 
>>>> tiled,
>>>> - * then we joyfully set intel_screen.dri2_has_hiz to true and continue as 
>>>> if
>>>> - * nothing happend.
>>>> + * DRI2BufferStencil and DRI2BufferHiz. If the stencil buffer's tiling is
>>>> + * I915_TILING_NONE [1], then we joyfully set intel_screen.dri2_has_hiz to
>>>> + * true and continue as if nothing happend.
>>>> + *
>>>> + * [1] The stencil buffer is actually W tiled. However, we request from 
>>>> the
>>>> + *     kernel a non-tiled buffer because the GTT is incapable of W 
>>>> fencing.
>>>>   *
>>>>   * If the buffers are X tiled, however, the handshake has failed and we 
>>>> must
>>>>   * clean up.
>>>> diff --git a/src/mesa/drivers/dri/intel/intel_span.c 
>>>> b/src/mesa/drivers/dri/intel/intel_span.c
>>>> index 153803f..d306432 100644
>>>> --- a/src/mesa/drivers/dri/intel/intel_span.c
>>>> +++ b/src/mesa/drivers/dri/intel/intel_span.c
>>>> @@ -131,38 +131,77 @@ intel_set_span_functions(struct intel_context *intel,
>>>>     int miny = 0;                                                  \
>>>>     int maxx = rb->Width;                                          \
>>>>     int maxy = rb->Height;                                         \
>>>> -   int stride = rb->RowStride;                                            
>>>> \
>>>> -   uint8_t *buf = rb->Data;                                               
>>>> \
>>>> +                                                                  \
>>>> +   /*                                                                     
>>>> \
>>>> +    * Here we ignore rb->Data and rb->RowStride as set by         \
>>>> +    * intelSpanRenderStart. Since intel_offset_S8 decodes the W tile      
>>>> \
>>>> +    * manually, the region's *real* base address and stride is            
>>>> \
>>>> +    * required.                                                           
>>>> \
>>>> +    */                                                                    
>>>> \
>>>> +   struct intel_renderbuffer *irb = intel_renderbuffer(rb);               
>>>> \
>>>> +   uint8_t *buf = irb->region->buffer->virtual;                           
>>>> \
>>>> +   unsigned stride = irb->region->pitch;                          \
>>>> +   unsigned height = 2 * irb->region->height;                             
>>>> \
>>>> +   bool flip = rb->Name == 0;                                             
>>>> \
>>>
>>>> -/* Don't flip y. */
>>>>  #undef Y_FLIP
>>>> -#define Y_FLIP(y) y
>>>> +#define Y_FLIP(y) ((1 - flip) * y + flip * (height - 1 - y))
>>>
>>>>  /**
>>>>   * \brief Get pointer offset into stencil buffer.
>>>>   *
>>>> - * The stencil buffer interleaves two rows into one. Yay for crazy 
>>>> hardware.
>>>> - * The table below demonstrates how the pointer arithmetic behaves for a 
>>>> buffer
>>>> - * with positive stride (s=stride).
>>>> - *
>>>> - *     x    | y     | byte offset
>>>> - *     --------------------------
>>>> - *     0    | 0     | 0
>>>> - *     0    | 1     | 1
>>>> - *     1    | 0     | 2
>>>> - *     1    | 1     | 3
>>>> - *     ...  | ...   | ...
>>>> - *     0    | 2     | s
>>>> - *     0    | 3     | s + 1
>>>> - *     1    | 2     | s + 2
>>>> - *     1    | 3     | s + 3 
>>>> - *
>>>> + * The stencil buffer is W tiled. Since the GTT is incapable of W 
>>>> fencing, we
>>>> + * must decode the tile's layout in software.
>>>>   *
>>>> + * See
>>>> + *   - PRM, 2011 Sandy Bridge, Volume 1, Part 2, Section 4.5.2.1 W-Major 
>>>> Tile
>>>> + *     Format.
>>>> + *   - PRM, 2011 Sandy Bridge, Volume 1, Part 2, Section 4.5.3 Tiling 
>>>> Algorithm
>>>>   */
>>>> -static inline intptr_t
>>>> -intel_offset_S8(int stride, GLint x, GLint y)
>>>> +static inline uintptr_t
>>>
>>> Why the type change?
> 
>> Two reasons.
> 
>> 1. I redeclared the parameters as unsigned so to generate better code.
>> Since x, y, and stride are unsigned, the division and modulo operators
>> generate shift and bit-logic instructions rather than the slower arithmetic
>> instructions.
> 
> Is stride always unsigned now? 

intelSpanRenderStart still sets rb->RowStride to be negative for system
stencil buffers. But we ignore that and use a positive stride instead.
To quote the hunk above:

+   /*                                                                  \
+    * Here we ignore rb->Data and rb->RowStride as set by              \
+    * intelSpanRenderStart. Since intel_offset_S8 decodes the W tile   \
+    * manually, the region's *real* base address and stride is         \
+    * required.                                                        \
+    */                                                                 \
+   struct intel_renderbuffer *irb = intel_renderbuffer(rb);            \
+   uint8_t *buf = irb->region->buffer->virtual;                        \
+   unsigned stride = irb->region->pitch;

By setting the buf and stride to the *real* base address and stride,
intel_offset_S8 can implement the exact W-tiling algorithm as described
in the bspec. (PRM, 2011 Sandy Bridge, Volume 1, Part 2, Section 4.5.3 Tiling
Algorithm).

If intel_offset_S8 instead used rb->Data and rb->RowStride as the buffer's
base address and stride, then intel_offset_S8 would essentially
need two implementations, like this:
    intptr_t
    intel_offset_S8(...)
    {
        if (stride > 0) {
            // do normal stuff;
        } else {
            // do upside down stuff;
        }
    }

I'd like to avoid such a bifurcated function.

> Will it always be in the future? 

Yes.

> This is the problem that we encountered with bug #37351 (fixed by e8b1c6d).

Ah... Thanks for recalling that bug. I believe setting the return type of
intel_offset_S8 to be intptr_t will eliminate reoccurrence of this bug. Do you 
agree?

>> Below is a comparison of x % 64, signed versus unsigned, compiled with gcc 
>> -O3.
>> In f, the % generates 5 instructions. In g, only one instruction.
> 
>> int f(int x) { return x % 64; }
> 
>> f:
>>      movl    %edi, %edx
>>      sarl    $31, %edx
>>      shrl    $26, %edx
>>      leal    (%rdi,%rdx), %eax
>>      andl    $63, %eax
>>      subl    %edx, %eax
>>      ret
> 
>> unsigned g(unsigned x) { return x % 64);
> 
>> g:
>>      movl    %edi, %eax
>>      andl    $63, %eax
>>      ret
> 
>> 2. I redeclared the return type as unsigned to make it explicit that it does
>> not return a negative offset.
> 
>>>> +intel_offset_S8(uint32_t stride, uint32_t x, uint32_t y)
>>>>  {
>>>> -   return 2 * ((y / 2) * stride + x) + y % 2;
>>>> +   uint32_t tile_size = 4096;
>>>> +   uint32_t tile_width = 64;
>>>> +   uint32_t tile_height = 64;
>>>> +   uint32_t row_size = 64 * stride;
>>>> +
>>>> +   uint32_t tile_x = x / tile_width;
>>>> +   uint32_t tile_y = y / tile_height;
>>>> +
>>>> +   /* The byte's address relative to the tile's base addres. */
>>>> +   uint32_t byte_x = x % tile_width;
>>>> +   uint32_t byte_y = y % tile_height;
>>>> +
>>>> +   uintptr_t u = tile_y * row_size
>>>> +               + tile_x * tile_size
>>>> +               + 512 * (byte_x / 8)
>>>> +               +  64 * (byte_y / 8)
>>>> +               +  32 * ((byte_y / 4) % 2)
>>>> +               +  16 * ((byte_x / 4) % 2)
>>>> +               +   8 * ((byte_y / 2) % 2)
>>>> +               +   4 * ((byte_x / 2) % 2)
>>>> +               +   2 * (byte_y % 2)
>>>> +               +   1 * (byte_x % 2);
>>>> +
>>>> +   /*
>>>> +    * Errata for Gen5:
>>>> +    *
>>>> +    * An additional offset is needed which is not documented in the PRM.
>>>> +    *
>>>> +    * if ((byte_x / 8) % 2 == 1) {
>>>> +    *    if ((byte_y / 8) % 2) == 0) {
>>>> +    *       u += 64;
>>>> +    *    } else {
>>>> +    *       u -= 64;
>>>> +    *    }
>>>> +    * }
>>>> +    *
>>>> +    * The offset is expressed more tersely as
>>>> +    * u += ((int) x & 0x8) * (8 - (((int) y & 0x8) << 1));
>>>> +    */
>>>> +
>>>> +   return u;
>>>>  }
>>>
>>>>  #define WRITE_STENCIL(x, y, src)  buf[intel_offset_S8(stride, x, y)] = 
>>>> src;

-- 
Chad Versace
[email protected]

Attachment: signature.asc
Description: OpenPGP digital signature

_______________________________________________
Intel-gfx mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to