Ok, I was FINALLY able to come up with something for texture tiling which seems to work - this was very very annoying, it _almost_ worked literally within minutes, but I needed a lot of time until it finally did really work.
I needed to convert the drivers back to use a multi-byte upload scheme again (they did this years ago, until it was abandoned for a simpler method), since if the blitter auto-tiles the textures on upload, the color format and texture pitch need to match the exact texture properties (at least I was unable to make it work otherwise).
Also, there are tons of special cases for small textures in the drm, which I don't like but couldn't avoid.
I could not make it work for texture rectangles. I think microtiling for that case should work at least on r200, but it didn't. Well, I don't think it's important.
It is only tested with "normal" 16-bit and 32-bit RGB/RGBA textures; in theory all other formats should probably work too, I hope. I lack a good test: tests/manytex and redbook/mipmap are a bit too simple (and you can't see if the tiling is actually correct...). 8-bit formats should work too, if I got the math right...
Also, there seem to be some minor differences between r100 and r200 chips as far as macro tiling is concerned (didn't have much time to test on r100 though), and for the life of me I couldn't figure out what that second micro-tile bit is good for on r100.


Quake3 got about a 15% boost on a 9000pro, and 11% on a 7200 sdr, if the highest texture setting/trilinear/32bit was used. Well, compressed textures are still faster :-).

For the drm, I've also included Andreas Stenglein's cube map patch for the r100 (since this needs a version bump as well).
I've only attached the -core version. I actually plan to do a non-core version too, but I've noticed the non-core version no longer seems to get all the fixes the core version does. So am I the only one who still checks things in for that version? In that case I'll immediately stop touching it...


Roland
Index: shared-core/radeon_drm.h
===================================================================
RCS file: /cvs/dri/drm/shared-core/radeon_drm.h,v
retrieving revision 1.27
diff -u -r1.27 radeon_drm.h
--- shared-core/radeon_drm.h    26 Jan 2005 17:48:59 -0000      1.27
+++ shared-core/radeon_drm.h    8 Feb 2005 18:42:38 -0000
@@ -145,7 +145,13 @@
 #define RADEON_EMIT_PP_TEX_SIZE_2                   75
 #define R200_EMIT_RB3D_BLENDCOLOR                   76
 #define R200_EMIT_TCL_POINT_SPRITE_CNTL             77
-#define RADEON_MAX_STATE_PACKETS                    78
+#define RADEON_EMIT_PP_CUBIC_FACES_0                78
+#define RADEON_EMIT_PP_CUBIC_OFFSETS_T0             79
+#define RADEON_EMIT_PP_CUBIC_FACES_1                80
+#define RADEON_EMIT_PP_CUBIC_OFFSETS_T1             81
+#define RADEON_EMIT_PP_CUBIC_FACES_2                82
+#define RADEON_EMIT_PP_CUBIC_OFFSETS_T2             83
+#define RADEON_MAX_STATE_PACKETS                    84
 
 /* Commands understood by cmd_buffer ioctl.  More can be added but
  * obviously these can't be removed or changed:
Index: shared-core/radeon_drv.h
===================================================================
RCS file: /cvs/dri/drm/shared-core/radeon_drv.h,v
retrieving revision 1.44
diff -u -r1.44 radeon_drv.h
--- shared-core/radeon_drv.h    8 Feb 2005 04:17:14 -0000       1.44
+++ shared-core/radeon_drv.h    8 Feb 2005 18:42:39 -0000
@@ -42,7 +42,7 @@
 
 #define DRIVER_NAME            "radeon"
 #define DRIVER_DESC            "ATI Radeon"
-#define DRIVER_DATE            "20050207"
+#define DRIVER_DATE            "20050208"
 
 /* Interface history:
  *
@@ -82,10 +82,12 @@
  *     - Add hyperz support, add hyperz flags to clear ioctl.
  * 1.14- Add support for color tiling
  *     - Add R100/R200 surface allocation/free support
+ * 1.15- Add support for texture micro tiling
+ *     - Add support for r100 cube maps
  */
 
 #define DRIVER_MAJOR           1
-#define DRIVER_MINOR           14
+#define DRIVER_MINOR           15
 #define DRIVER_PATCHLEVEL      0
 
 enum radeon_family {
@@ -800,6 +802,13 @@
 #define RADEON_PP_TEX_SIZE_1                0x1d0c
 #define RADEON_PP_TEX_SIZE_2                0x1d14
 
+#define RADEON_PP_CUBIC_FACES_0             0x1d24
+#define RADEON_PP_CUBIC_FACES_1             0x1d28
+#define RADEON_PP_CUBIC_FACES_2             0x1d2c
+#define RADEON_PP_CUBIC_OFFSET_T0_0         0x1dd0     /* bits [31:5] */
+#define RADEON_PP_CUBIC_OFFSET_T1_0         0x1e00
+#define RADEON_PP_CUBIC_OFFSET_T2_0         0x1e14
+
 #define SE_VAP_CNTL__TCL_ENA_MASK                          0x00000001
 #define SE_VAP_CNTL__FORCE_W_TO_ONE_MASK                   0x00010000
 #define SE_VAP_CNTL__VF_MAX_VTX_NUM__SHIFT                 0x00000012
Index: shared-core/radeon_state.c
===================================================================
RCS file: /cvs/dri/drm/shared-core/radeon_state.c,v
retrieving revision 1.48
diff -u -r1.48 radeon_state.c
--- shared-core/radeon_state.c  8 Feb 2005 04:17:14 -0000       1.48
+++ shared-core/radeon_state.c  8 Feb 2005 18:42:41 -0000
@@ -129,6 +129,22 @@
                        break;
                }
 
+       case RADEON_EMIT_PP_CUBIC_OFFSETS_T0:
+       case RADEON_EMIT_PP_CUBIC_OFFSETS_T1:
+       case RADEON_EMIT_PP_CUBIC_OFFSETS_T2:{
+                       int i;
+                       for (i = 0; i < 5; i++) {
+                               if (radeon_check_and_fixup_offset(dev_priv,
+                                                                 filp_priv,
+                                                                 &data[i])) {
+                                       DRM_ERROR
+                                           ("Invalid R100 cubic texture 
offset\n");
+                                       return DRM_ERR(EINVAL);
+                               }
+                       }
+               }
+               break;
+
        case RADEON_EMIT_RB3D_COLORPITCH:
        case RADEON_EMIT_RE_LINE_PATTERN:
        case RADEON_EMIT_SE_LINE_WIDTH:
@@ -190,6 +206,9 @@
        case RADEON_EMIT_PP_TEX_SIZE_2:
        case R200_EMIT_RB3D_BLENDCOLOR:
        case R200_EMIT_TCL_POINT_SPRITE_CNTL:
+       case RADEON_EMIT_PP_CUBIC_FACES_0:
+       case RADEON_EMIT_PP_CUBIC_FACES_1:
+       case RADEON_EMIT_PP_CUBIC_FACES_2:
                /* These packets don't contain memory offsets */
                break;
 
@@ -556,6 +575,13 @@
        RADEON_PP_TEX_SIZE_2, 2, "RADEON_PP_TEX_SIZE_2"}, {
        R200_RB3D_BLENDCOLOR, 3, "R200_RB3D_BLENDCOLOR"}, {
        R200_SE_TCL_POINT_SPRITE_CNTL, 1, "R200_SE_TCL_POINT_SPRITE_CNTL"},
+       {
+       RADEON_PP_CUBIC_FACES_0, 1, "RADEON_PP_CUBIC_FACES_0"}, {
+       RADEON_PP_CUBIC_OFFSET_T0_0, 5, "RADEON_PP_CUBIC_OFFSET_T0_0"}, {
+       RADEON_PP_CUBIC_FACES_1, 1, "RADEON_PP_CUBIC_FACES_1"}, {
+       RADEON_PP_CUBIC_OFFSET_T1_0, 5, "RADEON_PP_CUBIC_OFFSET_T1_0"}, {
+       RADEON_PP_CUBIC_FACES_2, 1, "RADEON_PP_CUBIC_FACES_2"}, {
+       RADEON_PP_CUBIC_OFFSET_T2_0, 5, "RADEON_PP_CUBIC_OFFSET_T2_0"},
 };
 
 /* ================================================================
@@ -1461,6 +1487,7 @@
        int size, dwords, tex_width, blit_width;
        u32 height;
        int i;
+       u32 texpitch, microtile;
        RING_LOCALS;
 
        DRM_GET_PRIV_WITH_RETURN(filp_priv, filp);
@@ -1522,6 +1549,16 @@
                DRM_ERROR("invalid texture format %d\n", tex->format);
                return DRM_ERR(EINVAL);
        }
+       texpitch = tex->pitch;
+       if ((texpitch << 22) & RADEON_DST_TILE_MICRO) {
+               microtile = 1;
+               if (tex_width < 64) {
+                       texpitch &= ~(RADEON_DST_TILE_MICRO >> 22);
+                       /* we got tiled coordinates, untile them */
+                       image->x *= 2;
+               }
+       }
+       else microtile = 0;
 
        DRM_DEBUG("tex=%dx%d blit=%d\n", tex_width, tex->height, blit_width);
 
@@ -1574,7 +1611,7 @@
                             RADEON_GMC_CLR_CMP_CNTL_DIS |
                             RADEON_GMC_WR_MSK_DIS);
 
-               buffer[2] = (tex->pitch << 22) | (tex->offset >> 10);
+               buffer[2] = (texpitch << 22) | (tex->offset >> 10);
                buffer[3] = 0xffffffff;
                buffer[4] = 0xffffffff;
                buffer[5] = (image->y << 16) | image->x;
@@ -1582,29 +1619,109 @@
                buffer[7] = dwords;
                buffer += 8;
 
-               if (tex_width >= 32) {
-                       /* Texture image width is larger than the minimum, so we
-                        * can upload it directly.
-                        */
-                       if (DRM_COPY_FROM_USER(buffer, data,
-                                              dwords * sizeof(u32))) {
-                               DRM_ERROR("EFAULT on data, %d dwords\n",
-                                         dwords);
-                               return DRM_ERR(EFAULT);
+               if (microtile) {
+                       /* texture micro tiling in use, minimum texture width 
is thus 16 bytes.
+                          however, we cannot use blitter directly for texture 
width < 64 bytes,
+                          since minimum tex pitch is 64 bytes and we need this 
to match
+                          the texture width, otherwise the blitter will tile 
it wrong.
+                          Thus, tiling manually in this case. Additionally, 
need to special
+                          case tex height = 1, since our actual image will 
have height 2
+                          and we need to ensure we don't read beyond the 
texture size
+                          from user space. */
+                       if (tex->height == 1) {
+                               if (tex_width >= 64 || tex_width <= 16) {
+                                       if (DRM_COPY_FROM_USER(buffer, data,
+                                                      tex_width * 
sizeof(u32))) {
+                                               DRM_ERROR("EFAULT on pad, %d 
bytes\n",
+                                                         tex_width);
+                                               return DRM_ERR(EFAULT);
+                                       }
+                               } else if (tex_width == 32) {
+                                       if (DRM_COPY_FROM_USER(buffer, data, 
16)) {
+                                               DRM_ERROR("EFAULT on pad, %d 
bytes\n",
+                                                         tex_width);
+                                               return DRM_ERR(EFAULT);
+                                       }
+                                       if (DRM_COPY_FROM_USER(buffer + 8, data 
+ 16, 16)) {
+                                               DRM_ERROR("EFAULT on pad, %d 
bytes\n",
+                                                         tex_width);
+                                               return DRM_ERR(EFAULT);
+                                       }
+                               }
+                       } else if (tex_width >= 64 || tex_width == 16) {
+                               if (DRM_COPY_FROM_USER(buffer, data,
+                                                      dwords * sizeof(u32))) {
+                                       DRM_ERROR("EFAULT on data, %d dwords\n",
+                                                 dwords);
+                                       return DRM_ERR(EFAULT);
+                               }
+                       } else if (tex_width < 16) {
+                               for (i = 0; i < tex->height; i++) {
+                                       if (DRM_COPY_FROM_USER(buffer, data, 
tex_width)) {
+                                               DRM_ERROR("EFAULT on pad, %d 
bytes\n",
+                                                         tex_width);
+                                               return DRM_ERR(EFAULT);
+                                       }
+                                       buffer += 4;
+                                       data += tex_width;
+                               }
+                       } else if (tex_width == 32) {
+                       /* TODO: make sure this works when not fitting in one 
buffer
+                               (i.e. 32bytes x 2048...) */
+                               for (i = 0; i < tex->height; i += 2) {
+                                       if (DRM_COPY_FROM_USER(buffer, data, 
16)) {
+                                               DRM_ERROR("EFAULT on pad, %d 
bytes\n",
+                                                         tex_width);
+                                               return DRM_ERR(EFAULT);
+                                       }
+                                       data += 16;
+                                       if (DRM_COPY_FROM_USER(buffer + 8, 
data, 16)) {
+                                               DRM_ERROR("EFAULT on pad, %d 
bytes\n",
+                                                         tex_width);
+                                               return DRM_ERR(EFAULT);
+                                       }
+                                       data += 16;
+                                       if (DRM_COPY_FROM_USER(buffer + 4, 
data, 16)) {
+                                               DRM_ERROR("EFAULT on pad, %d 
bytes\n",
+                                                         tex_width);
+                                               return DRM_ERR(EFAULT);
+                                       }
+                                       data += 16;
+                                       if (DRM_COPY_FROM_USER(buffer + 12, 
data, 16)) {
+                                               DRM_ERROR("EFAULT on pad, %d 
bytes\n",
+                                                         tex_width);
+                                               return DRM_ERR(EFAULT);
+                                       }
+                                       data += 16;
+                                       buffer += 16;
+                               }
                        }
-               } else {
-                       /* Texture image width is less than the minimum, so we
-                        * need to pad out each image scanline to the minimum
-                        * width.
-                        */
-                       for (i = 0; i < tex->height; i++) {
-                               if (DRM_COPY_FROM_USER(buffer, data, 
tex_width)) {
-                                       DRM_ERROR("EFAULT on pad, %d bytes\n",
-                                                 tex_width);
+               }
+               else {
+                       if (tex_width >= 32) {
+                               /* Texture image width is larger than the 
minimum, so we
+                                * can upload it directly.
+                                */
+                               if (DRM_COPY_FROM_USER(buffer, data,
+                                                      dwords * sizeof(u32))) {
+                                       DRM_ERROR("EFAULT on data, %d dwords\n",
+                                                 dwords);
                                        return DRM_ERR(EFAULT);
                                }
-                               buffer += 8;
-                               data += tex_width;
+                       } else {
+                               /* Texture image width is less than the 
minimum, so we
+                                * need to pad out each image scanline to the 
minimum
+                                * width.
+                                */
+                               for (i = 0; i < tex->height; i++) {
+                                       if (DRM_COPY_FROM_USER(buffer, data, 
tex_width)) {
+                                               DRM_ERROR("EFAULT on pad, %d 
bytes\n",
+                                                         tex_width);
+                                               return DRM_ERR(EFAULT);
+                                       }
+                                       buffer += 8;
+                                       data += tex_width;
+                               }
                        }
                }
 
Index: r200/r200_context.c
===================================================================
RCS file: /cvs/mesa/Mesa/src/mesa/drivers/dri/r200/r200_context.c,v
retrieving revision 1.37
diff -u -r1.37 r200_context.c
--- r200/r200_context.c 8 Dec 2004 17:32:46 -0000       1.37
+++ r200/r200_context.c 8 Feb 2005 18:59:27 -0000
@@ -272,6 +272,9 @@
       else
         rmesa->using_hyperz = GL_TRUE;
    }
+ 
+   if ( sPriv->drmMinor >= 15 )
+      rmesa->texmicrotile = GL_TRUE;
 
    /* Init default driver functions then plug in our R200-specific functions
     * (the texture functions are especially important)
Index: r200/r200_context.h
===================================================================
RCS file: /cvs/mesa/Mesa/src/mesa/drivers/dri/r200/r200_context.h,v
retrieving revision 1.26
diff -u -r1.26 r200_context.h
--- r200/r200_context.h 31 Jan 2005 23:40:06 -0000      1.26
+++ r200/r200_context.h 8 Feb 2005 18:59:27 -0000
@@ -167,6 +167,8 @@
    GLuint pp_cubic_faces;              /* cube face 1,2,3,4 log2 sizes */
 
    GLboolean  border_fallback;
+
+   GLuint tile_bits;                   /* hw texture tile bits used on this 
texture */
 };
 
 
@@ -931,6 +938,7 @@
    driOptionCache optionCache;
 
    GLboolean using_hyperz;
+   GLboolean texmicrotile;
 };
 
 #define R200_CONTEXT(ctx)              ((r200ContextPtr)(ctx->DriverCtx))
Index: r200/r200_reg.h
===================================================================
RCS file: /cvs/mesa/Mesa/src/mesa/drivers/dri/r200/r200_reg.h,v
retrieving revision 1.9
diff -u -r1.9 r200_reg.h
--- r200/r200_reg.h     26 Jan 2005 18:05:03 -0000      1.9
+++ r200/r200_reg.h     8 Feb 2005 18:59:27 -0000
@@ -968,6 +968,8 @@
 #define     R200_TXO_ENDIAN_BYTE_SWAP   (1 << 0)
 #define     R200_TXO_ENDIAN_WORD_SWAP   (2 << 0)
 #define     R200_TXO_ENDIAN_HALFDW_SWAP (3 << 0)
+#define     R200_TXO_MACRO_TILE         (1 << 2)
+#define     R200_TXO_MICRO_TILE         (1 << 3)
 #define     R200_TXO_OFFSET_MASK        0xffffffe0
 #define     R200_TXO_OFFSET_SHIFT       5
 #define R200_PP_CUBIC_OFFSET_F1_0         0x2d04
Index: r200/r200_texmem.c
===================================================================
RCS file: /cvs/mesa/Mesa/src/mesa/drivers/dri/r200/r200_texmem.c,v
retrieving revision 1.10
diff -u -r1.10 r200_texmem.c
--- r200/r200_texmem.c  7 Oct 2004 23:30:30 -0000       1.10
+++ r200/r200_texmem.c  8 Feb 2005 18:59:28 -0000
@@ -43,16 +43,13 @@
 #include "context.h"
 #include "colormac.h"
 #include "macros.h"
-#include "radeon_reg.h" /* gets definition for usleep */
 #include "r200_context.h"
-#include "r200_state.h"
 #include "r200_ioctl.h"
-#include "r200_swtcl.h"
 #include "r200_tex.h"
+#include "radeon_reg.h"
 
 #include <unistd.h>  /* for usleep() */
 
-
 /**
  * Destroy any device-dependent state associated with the texture.  This may
  * include NULLing out hardware state that points to the texture.
@@ -253,12 +251,13 @@
 
         /* Blit to framebuffer
          */
-        r200EmitBlit( rmesa, 
-                      blit_format, 
-                      dstPitch, GET_START( &region ),   
-                      dstPitch, t->bufAddr,
-                      0, 0, 
-                      0, done, 
+        r200EmitBlit( rmesa,
+                      blit_format,
+                      dstPitch, GET_START( &region ),
+                      dstPitch | (t->tile_bits >> 16), /* hack */
+                      t->bufAddr,
+                      0, 0,
+                      0, done,
                       width, lines );
         
         r200EmitWait( rmesa, RADEON_WAIT_2D );
@@ -339,7 +338,7 @@
    imageWidth = texImage->Width;
    imageHeight = texImage->Height;
 
-   offset = t->bufAddr;
+   offset = t->bufAddr + t->base.totalSize / 6 * face;
 
    if ( R200_DEBUG & (DEBUG_TEXTURE|DEBUG_IOCTL) ) {
       GLint imageX = 0;
@@ -363,19 +362,47 @@
     * We used to use 1, 2 and 4-byte texels and used to use the texture
     * width to dictate the blit width - but that won't work for compressed
     * textures. (Brian)
+    * NOTE: can't do that with texture tiling. (sroland)
     */
    tex.offset = offset;
-   tex.pitch = BLIT_WIDTH_BYTES / 64;
-   tex.format = R200_TXFORMAT_I8; /* any 1-byte texel format */
+   tex.image = &tmp;
+   /* copy (x,y,width,height,data) */
+   memcpy( &tmp, &t->image[face][hwlevel], sizeof(tmp) );
+   
    if (texImage->TexFormat->TexelBytes) {
-      tex.width = imageWidth * texImage->TexFormat->TexelBytes; /* in bytes */
+      /* use multi-byte upload scheme */
       tex.height = imageHeight;
+      tex.width = imageWidth;
+      tex.format = t->pp_txformat & R200_TXFORMAT_FORMAT_MASK;
+      tex.pitch = MAX2((texImage->Width * texImage->TexFormat->TexelBytes) / 
64, 1);
+      tex.offset += tmp.x & ~1023;
+      tmp.x = tmp.x % 1024;
+      if (t->tile_bits & R200_TXO_MICRO_TILE) {
+        /* need something like "tiled coordinates" ? */
+        tmp.y = tmp.x / (tex.pitch * 128) * 2;
+        tmp.x = tmp.x % (tex.pitch * 128) / 2 / 
texImage->TexFormat->TexelBytes;
+        tex.pitch |= RADEON_DST_TILE_MICRO >> 22;
+      }
+      else {
+        tmp.x = tmp.x >> (texImage->TexFormat->TexelBytes >> 1);
+      }
+      if ((t->tile_bits & R200_TXO_MACRO_TILE) &&
+        (texImage->Width * texImage->TexFormat->TexelBytes >= 256) &&
+        ((!(t->tile_bits & R200_TXO_MICRO_TILE) && (texImage->Height >= 8)) ||
+           (texImage->Height >= 16))) {
+        /* weird: R200 disables macro tiling if mip width is smaller than 256 
bytes,
+           OR if height is smaller than 8 automatically, but if micro tiling 
is active
+           the limit is height 16 instead ? */
+        tex.pitch |= RADEON_DST_TILE_MACRO >> 22;
+      }
    }
    else {
       /* In case of for instance 8x8 texture (2x2 dxt blocks), padding after 
the first two blocks is
          needed (only with dxt1 since 2 dxt3/dxt5 blocks already use 32 Byte). 
*/
       /* set tex.height to 1/4 since 1 "macropixel" (dxt-block) has 4 real 
pixels. Needed
          so the kernel module reads the right amount of data. */
+      tex.format = R200_TXFORMAT_I8; /* any 1-byte texel format */
+      tex.pitch = (BLIT_WIDTH_BYTES / 64);
       tex.height = (imageHeight + 3) / 4;
       tex.width = (imageWidth + 3) / 4;
       switch (t->pp_txformat & R200_TXFORMAT_FORMAT_MASK) {
@@ -390,19 +431,7 @@
           fprintf(stderr, "unknown compressed tex format in uploadSubImage\n");
       }
    }
-   tex.image = &tmp;
 
-   /* copy (x,y,width,height,data) */
-   memcpy( &tmp, &t->image[face][hwlevel], sizeof(tmp) );
-
-   /* Adjust the base offset to account for the Y-offset.  This is done,
-    * instead of just letting the Y-offset automatically take care of it,
-    * because it is possible, for very large textures, for the Y-offset
-    * to exceede the [-8192,+8191] range.
-    */
-   tex.offset += tmp.y * 1024;
-   tmp.y = 0;
-    
    LOCK_HARDWARE( rmesa );
    do {
       ret = drmCommandWriteRead( rmesa->dri.fd, DRM_RADEON_TEXTURE,
@@ -473,7 +502,11 @@
       t->bufAddr = rmesa->r200Screen->texOffset[heap] 
           + t->base.memBlock->ofs;
       t->pp_txoffset = t->bufAddr;
-
+       
+      if (!(t->base.tObj->Image[0][0]->IsClientData)) {
+        /* hope it's safe to add that here... */
+        t->pp_txoffset |= t->tile_bits;
+      }
 
       /* Mark this texobj as dirty on all units:
        */
Index: r200/r200_texstate.c
===================================================================
RCS file: /cvs/mesa/Mesa/src/mesa/drivers/dri/r200/r200_texstate.c,v
retrieving revision 1.18
diff -u -r1.18 r200_texstate.c
--- r200/r200_texstate.c        18 Oct 2004 00:00:41 -0000      1.18
+++ r200/r200_texstate.c        8 Feb 2005 18:59:30 -0000
@@ -125,8 +125,8 @@
 {
    r200TexObjPtr t = (r200TexObjPtr)tObj->DriverData;
    const struct gl_texture_image *baseImage = tObj->Image[0][tObj->BaseLevel];
-   GLint curOffset;
-   GLint i;
+   GLint curOffset, blitWidth;
+   GLint i, texelBytes;
    GLint numLevels;
    GLint log2Width, log2Height, log2Depth;
 
@@ -146,6 +146,7 @@
       return;
    }
 
+   texelBytes = baseImage->TexFormat->TexelBytes;
 
    /* Compute which mipmap levels we really want to send to the hardware.
     */
@@ -164,6 +165,28 @@
     * memory organized as a rectangle of width BLIT_WIDTH_BYTES.
     */
    curOffset = 0;
+   blitWidth = BLIT_WIDTH_BYTES;
+   t->tile_bits = 0;
+
+   /* figure out if this texture is suitable for tiling. */
+   if (texelBytes) {
+      if (rmesa->texmicrotile  && (tObj->Target != GL_TEXTURE_RECTANGLE_NV) &&
+      /* texrect might be able to use micro tiling too in theory? */
+        (baseImage->Height > 1)) {
+        /* allow 32 (bytes) x 1 mip (which will use two times the space
+        the non-tiled version would use) max if base texture is large enough */
+        if ((numLevels == 1) ||
+          (((baseImage->Width * texelBytes / baseImage->Height) <= 32) &&
+              (baseImage->Width * texelBytes > 64)) ||
+           ((baseImage->Width * texelBytes / baseImage->Height) <= 16)) {
+           t->tile_bits |= R200_TXO_MICRO_TILE;
+        }
+      }
+      if (tObj->Target != GL_TEXTURE_RECTANGLE_NV) {
+        /* we can set macro tiling even for small textures, they will be 
untiled anyway */
+        t->tile_bits |= R200_TXO_MACRO_TILE;
+      }
+   }
 
    for (i = 0; i < numLevels; i++) {
       const struct gl_texture_image *texImage;
@@ -195,28 +218,41 @@
             else size = texImage->CompressedSize;
       }
       else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
-         size = ((texImage->Width * texImage->TexFormat->TexelBytes + 63)
-                 & ~63) * texImage->Height;
+        size = ((texImage->Width * texelBytes + 63) & ~63) * texImage->Height;
+      }
+      else if (t->tile_bits & R200_TXO_MICRO_TILE) {
+        /* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
+           though the actual offset may be different (if texture is less than
+           32 bytes width) to the untiled case */
+        int w = (texImage->Width * texelBytes * 2 + 31) & ~31;
+        size = (w * ((texImage->Height + 1) / 2)) * texImage->Depth;
+        blitWidth = MAX2(texImage->Width, 64 / texelBytes);
       }
       else {
-         int w = texImage->Width * texImage->TexFormat->TexelBytes;
-         if (w < 32)
-            w = 32;
-         size = w * texImage->Height * texImage->Depth;
+        int w = (texImage->Width * texelBytes + 31) & ~31;
+        size = w * texImage->Height * texImage->Depth;
+        blitWidth = MAX2(texImage->Width, 64 / texelBytes);
       }
       assert(size > 0);
 
-
       /* Align to 32-byte offset.  It is faster to do this unconditionally
        * (no branch penalty).
        */
 
       curOffset = (curOffset + 0x1f) & ~0x1f;
 
-      t->image[0][i].x = curOffset % BLIT_WIDTH_BYTES;
-      t->image[0][i].y = curOffset / BLIT_WIDTH_BYTES;
-      t->image[0][i].width  = MIN2(size, BLIT_WIDTH_BYTES);
-      t->image[0][i].height = size / t->image[0][i].width;
+      if (texelBytes) {
+        t->image[0][i].x = curOffset; /* fix x and y coords up later together 
with offset */
+        t->image[0][i].y = 0;
+        t->image[0][i].width = MIN2(size / texelBytes, blitWidth);
+        t->image[0][i].height = (size / texelBytes) / t->image[0][i].width;
+      }
+      else {
+         t->image[0][i].x = curOffset % BLIT_WIDTH_BYTES;
+         t->image[0][i].y = curOffset / BLIT_WIDTH_BYTES;
+         t->image[0][i].width  = MIN2(size, BLIT_WIDTH_BYTES);
+         t->image[0][i].height = size / t->image[0][i].width;     
+      }
 
 #if 0
       /* for debugging only and only  applicable to non-rectangle targets */
@@ -242,16 +278,13 @@
 
    /* Setup remaining cube face blits, if needed */
    if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
-      /* Round totalSize up to multiple of BLIT_WIDTH_BYTES */
-      const GLuint faceSize = (t->base.totalSize + BLIT_WIDTH_BYTES - 1)
-                              & ~(BLIT_WIDTH_BYTES-1);
-      const GLuint lines = faceSize / BLIT_WIDTH_BYTES;
+      const GLuint faceSize = t->base.totalSize;
       GLuint face;
-      /* reuse face 0 x/y/width/height - just adjust y */
+      /* reuse face 0 x/y/width/height - just update the offset when uploading 
*/
       for (face = 1; face < 6; face++) {
          for (i = 0; i < numLevels; i++) {
             t->image[face][i].x =  t->image[0][i].x;
-            t->image[face][i].y =  t->image[0][i].y + face * lines;
+            t->image[face][i].y =  t->image[0][i].y;
             t->image[face][i].width  = t->image[0][i].width;
             t->image[face][i].height = t->image[0][i].height;
          }
@@ -310,7 +343,7 @@
    if (baseImage->IsCompressed)
       t->pp_txpitch = (tObj->Image[0][t->base.firstLevel]->Width + 63) & ~(63);
    else
-      t->pp_txpitch = ((tObj->Image[0][t->base.firstLevel]->Width * 
baseImage->TexFormat->TexelBytes) + 63) & ~(63);
+      t->pp_txpitch = ((tObj->Image[0][t->base.firstLevel]->Width * 
texelBytes) + 63) & ~(63);
    t->pp_txpitch -= 32;
 
    t->dirty_state = TEX_ALL;
Index: radeon/radeon_context.c
===================================================================
RCS file: /cvs/mesa/Mesa/src/mesa/drivers/dri/radeon/radeon_context.c,v
retrieving revision 1.29
diff -u -r1.29 radeon_context.c
--- radeon/radeon_context.c     23 Jan 2005 06:27:08 -0000      1.29
+++ radeon/radeon_context.c     8 Feb 2005 18:59:30 -0000
@@ -255,6 +255,9 @@
         rmesa->using_hyperz = GL_TRUE;
    }
 
+   if ( sPriv->drmMinor >= 15 )
+      rmesa->texmicrotile = GL_TRUE;
+
    /* Init default driver functions then plug in our Radeon-specific functions
     * (the texture functions are especially important)
     */
@@ -445,6 +448,7 @@
    }
    (*rmesa->get_ust)( & rmesa->swap_ust );
 
+   if (rmesa->sarea->tiling_enabled != 0) fprintf(stderr, "color tiling 
enabled!\n");
 
 #if DO_DEBUG
    RADEON_DEBUG = driParseDebugString( getenv( "RADEON_DEBUG" ),
Index: radeon/radeon_context.h
===================================================================
RCS file: /cvs/mesa/Mesa/src/mesa/drivers/dri/radeon/radeon_context.h,v
retrieving revision 1.20
diff -u -r1.20 radeon_context.h
--- radeon/radeon_context.h     31 Jan 2005 23:40:06 -0000      1.20
+++ radeon/radeon_context.h     8 Feb 2005 18:59:31 -0000
@@ -162,6 +161,8 @@
    GLuint pp_cubic_faces;              /* cube face 1,2,3,4 log2 sizes */
 
    GLboolean  border_fallback;
+
+   GLuint tile_bits;                   /* hw texture tile bits used on this 
texture */
 };
 
 
@@ -186,7 +187,7 @@
    GLboolean dirty;                      /* dirty-mark in emit_state_list */
    GLboolean (*check)( GLcontext * );    /* is this state active? */
 };
-   
+
 
 
 /* Trying to keep these relatively short as the variables are becoming
@@ -781,6 +782,7 @@
    driOptionCache optionCache;
 
    GLboolean using_hyperz;
+   GLboolean texmicrotile;
 
    /* Performance counters
     */
Index: radeon/radeon_texmem.c
===================================================================
RCS file: /cvs/mesa/Mesa/src/mesa/drivers/dri/radeon/radeon_texmem.c,v
retrieving revision 1.10
diff -u -r1.10 radeon_texmem.c
--- radeon/radeon_texmem.c      23 Jan 2005 06:27:08 -0000      1.10
+++ radeon/radeon_texmem.c      8 Feb 2005 18:59:31 -0000
@@ -46,6 +46,7 @@
 #include "radeon_ioctl.h"
 #include "radeon_tex.h"
 
+#include <unistd.h>  /* for usleep() */
 
 /**
  * Destroy any device-dependent state associated with the texture.  This may
@@ -151,12 +152,12 @@
 
         /* Blit to framebuffer
          */
-        radeonEmitBlit( rmesa, 
-                      blit_format, 
-                      dstPitch, GET_START( &region ),    
-                      dstPitch, t->bufAddr, 
-                      0, 0, 
-                      0, done, 
+        radeonEmitBlit( rmesa,
+                      blit_format,
+                      dstPitch, GET_START( &region ),
+                      dstPitch, t->bufAddr,
+                      0, 0,
+                      0, done,
                       width, lines );
         
         radeonEmitWait( rmesa, RADEON_WAIT_2D );
@@ -248,19 +249,43 @@
     * We used to use 1, 2 and 4-byte texels and used to use the texture
     * width to dictate the blit width - but that won't work for compressed
     * textures. (Brian)
+    * NOTE: can't do that with texture tiling. (sroland)
     */
    tex.offset = offset;
-   tex.pitch = BLIT_WIDTH_BYTES / 64;
-   tex.format = RADEON_TXFORMAT_I8; /* any 1-byte texel format */
+   tex.image = &tmp;
+   /* copy (x,y,width,height,data) */
+   memcpy( &tmp, &t->image[face][hwlevel], sizeof(drm_radeon_tex_image_t) );
+
    if (texImage->TexFormat->TexelBytes) {
-      tex.width = imageWidth * texImage->TexFormat->TexelBytes; /* in bytes */
+      /* use multi-byte upload scheme */
       tex.height = imageHeight;
+      tex.width = imageWidth;
+      tex.format = t->pp_txformat & RADEON_TXFORMAT_FORMAT_MASK;
+      tex.pitch = MAX2((texImage->Width * texImage->TexFormat->TexelBytes) / 64, 1);
+      tex.offset += tmp.x & ~1023;
+      tmp.x = tmp.x % 1024;
+      if (t->tile_bits & RADEON_TXO_MICRO_TILE_X2) {
+        /* need something like "tiled coordinates" ? */
+        tmp.y = tmp.x / (tex.pitch * 128) * 2;
+        tmp.x = tmp.x % (tex.pitch * 128) / 2 / texImage->TexFormat->TexelBytes;
+        tex.pitch |= RADEON_DST_TILE_MICRO >> 22;
+      }
+      else {
+        tmp.x = tmp.x >> (texImage->TexFormat->TexelBytes >> 1);
+      }
+      if ((t->tile_bits & RADEON_TXO_MACRO_TILE) &&
+        (texImage->Width * texImage->TexFormat->TexelBytes >= 256)) {
+        /* radeon switches off macro tiling for small textures/mipmaps it seems */
+        tex.pitch |= RADEON_DST_TILE_MACRO >> 22;
+      }
    }
    else {
       /* In case of for instance 8x8 texture (2x2 dxt blocks), padding after the first two blocks is
          needed (only with dxt1 since 2 dxt3/dxt5 blocks already use 32 Byte). */
       /* set tex.height to 1/4 since 1 "macropixel" (dxt-block) has 4 real pixels. Needed
          so the kernel module reads the right amount of data. */
+      tex.format = RADEON_TXFORMAT_I8; /* any 1-byte texel format */
+      tex.pitch = (BLIT_WIDTH_BYTES / 64);
       tex.height = (imageHeight + 3) / 4;
       tex.width = (imageWidth + 3) / 4;
       switch (t->pp_txformat & RADEON_TXFORMAT_FORMAT_MASK) {
@@ -273,10 +298,6 @@
          break;
       }
    }
-   tex.image = &tmp;
-
-   /* copy (x,y,width,height,data) */
-   memcpy( &tmp, &t->image[face][hwlevel], sizeof(drm_radeon_tex_image_t) );
 
    LOCK_HARDWARE( rmesa );
    do {
@@ -344,6 +365,10 @@
           + t->base.memBlock->ofs;
       t->pp_txoffset = t->bufAddr;
 
+      if (!(t->base.tObj->Image[0][0]->IsClientData)) {
+        /* hope it's safe to add that here... */
+        t->pp_txoffset |= t->tile_bits;
+      }
 
       /* Mark this texobj as dirty on all units:
        */
Index: radeon/radeon_texstate.c
===================================================================
RCS file: /cvs/mesa/Mesa/src/mesa/drivers/dri/radeon/radeon_texstate.c,v
retrieving revision 1.13
diff -u -r1.13 radeon_texstate.c
--- radeon/radeon_texstate.c    7 Oct 2004 23:30:30 -0000       1.13
+++ radeon/radeon_texstate.c    8 Feb 2005 18:59:32 -0000
@@ -127,8 +127,8 @@
 {
    radeonTexObjPtr t = (radeonTexObjPtr)tObj->DriverData;
    const struct gl_texture_image *baseImage = tObj->Image[0][tObj->BaseLevel];
-   GLint curOffset;
-   GLint i;
+   GLint curOffset, blitWidth;
+   GLint i, texelBytes;
    GLint numLevels;
    GLint log2Width, log2Height, log2Depth;
 
@@ -148,6 +148,7 @@
       return;
    }
 
+   texelBytes = baseImage->TexFormat->TexelBytes;
 
    /* Compute which mipmap levels we really want to send to the hardware.
     */
@@ -166,6 +167,34 @@
     * memory organized as a rectangle of width BLIT_WIDTH_BYTES.
     */
    curOffset = 0;
+   blitWidth = BLIT_WIDTH_BYTES;
+   t->tile_bits = 0;
+
+   /* figure out if this texture is suitable for tiling. */
+   if (texelBytes && (tObj->Target != GL_TEXTURE_RECTANGLE_NV)) {
+      if (rmesa->texmicrotile && (baseImage->Height > 1)) {
+        /* allow 32 (bytes) x 1 mip (which will use two times the space
+           the non-tiled version would use) max if base texture is large enough */
+        if ((numLevels == 1) ||
+          (((baseImage->Width * texelBytes / baseImage->Height) <= 32) &&
+              (baseImage->Width * texelBytes > 64)) ||
+           ((baseImage->Width * texelBytes / baseImage->Height) <= 16)) {
+           /* R100 has two microtile bits (only the txoffset reg, not the blitter)
+              weird: X2 + OPT: 32bit correct, 16bit completely hosed
+                     X2: 32bit correct, 16bit correct
+                     OPT: 32bit large mips correct, small mips hosed, 16bit completely hosed */
+           t->tile_bits |= RADEON_TXO_MICRO_TILE_X2 /*| RADEON_TXO_MICRO_TILE_OPT*/;
+        }
+      }
+      if ((baseImage->Width * texelBytes >= 256) && (baseImage->Height >= 16)) {
+        /* R100 disables macro tiling only if mip width is smaller than 256 bytes, and not
+           in the case if height is smaller than 16 (not 100% sure), as does the r200,
+           so need to disable macro tiling in that case */
+        if ((numLevels == 1) || ((baseImage->Width * texelBytes / baseImage->Height) <= 4)) {
+           t->tile_bits |= RADEON_TXO_MACRO_TILE;
+        }
+      }
+   }
 
    for (i = 0; i < numLevels; i++) {
       const struct gl_texture_image *texImage;
@@ -197,28 +226,41 @@
             else size = texImage->CompressedSize;
       }
       else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
-        size = ((texImage->Width * texImage->TexFormat->TexelBytes + 63)
-                & ~63) * texImage->Height;
+        size = ((texImage->Width * texelBytes + 63) & ~63) * texImage->Height;
+      }
+      else if (t->tile_bits & RADEON_TXO_MICRO_TILE_X2) {
+        /* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
+           though the actual offset may be different (if texture is less than
+           32 bytes width) to the untiled case */
+        int w = (texImage->Width * texelBytes * 2 + 31) & ~31;
+        size = (w * ((texImage->Height + 1) / 2)) * texImage->Depth;
+        blitWidth = MAX2(texImage->Width, 64 / texelBytes);
       }
       else {
-         int w = texImage->Width * texImage->TexFormat->TexelBytes;
-         if (w < 32)
-            w = 32;
-         size = w * texImage->Height * texImage->Depth;
+        int w = (texImage->Width * texelBytes + 31) & ~31;
+        size = w * texImage->Height * texImage->Depth;
+        blitWidth = MAX2(texImage->Width, 64 / texelBytes);
       }
       assert(size > 0);
 
-
       /* Align to 32-byte offset.  It is faster to do this unconditionally
        * (no branch penalty).
        */
 
       curOffset = (curOffset + 0x1f) & ~0x1f;
 
-      t->image[0][i].x = curOffset % BLIT_WIDTH_BYTES;
-      t->image[0][i].y = curOffset / BLIT_WIDTH_BYTES;
-      t->image[0][i].width  = MIN2(size, BLIT_WIDTH_BYTES);
-      t->image[0][i].height = size / t->image[0][i].width;
+      if (texelBytes) {
+        t->image[0][i].x = curOffset; /* fix x and y coords up later together with offset */
+        t->image[0][i].y = 0;
+        t->image[0][i].width = MIN2(size / texelBytes, blitWidth);
+        t->image[0][i].height = (size / texelBytes) / t->image[0][i].width;
+      }
+      else {
+         t->image[0][i].x = curOffset % BLIT_WIDTH_BYTES;
+         t->image[0][i].y = curOffset / BLIT_WIDTH_BYTES;
+         t->image[0][i].width  = MIN2(size, BLIT_WIDTH_BYTES);
+         t->image[0][i].height = size / t->image[0][i].width;     
+      }
 
 #if 0
       /* for debugging only and only  applicable to non-rectangle targets */
@@ -263,7 +305,7 @@
    if (baseImage->IsCompressed)
       t->pp_txpitch = (tObj->Image[0][t->base.firstLevel]->Width + 63) & ~(63);
    else
-      t->pp_txpitch = ((tObj->Image[0][t->base.firstLevel]->Width * baseImage->TexFormat->TexelBytes) + 63) & ~(63);
+      t->pp_txpitch = ((tObj->Image[0][t->base.firstLevel]->Width * texelBytes) + 63) & ~(63);
    t->pp_txpitch -= 32;
 
    t->dirty_state = TEX_ALL;

Reply via email to