Optimized 32-bit character rendering with unrolled rows and pairwise
foreground/background pixel rendering.

If it weren't for the 5x8 font, I would have just assumed everything
was an even width and made the fallback path also pairwise.

In isolation, the 16x32 character case got 2x faster, but that wasn't
a huge real-world speedup, because space rendering, which was already
at memory bandwidth limits, accounted for most of the character
rendering time.  However, in combination with the previous fast
conditional console scrolling, which removes most of the space
rendering, it becomes significant.

I also found that at least the EFI and Intel framebuffers are not
currently mapped write-combining, which makes this much slower than
it should be.


Index: rasops32.c
===================================================================
RCS file: /cvs/src/sys/dev/rasops/rasops32.c,v
retrieving revision 1.10
diff -u -p -r1.10 rasops32.c
--- rasops32.c  25 May 2020 09:55:49 -0000      1.10
+++ rasops32.c  26 Jun 2020 14:34:06 -0000
@@ -65,9 +65,14 @@ rasops32_init(struct rasops_info *ri)
 int
 rasops32_putchar(void *cookie, int row, int col, u_int uc, uint32_t attr)
 {
-       int width, height, cnt, fs, fb, clr[2];
+       int width, height, step, cnt, fs, b, f;
+       uint32_t fb, clr[2];
        struct rasops_info *ri;
-       int32_t *dp, *rp;
+       int64_t *rp, q;
+       union {
+               int64_t q[4];
+               int32_t d[4][2];
+       } u;
        u_char *fr;
 
        ri = (struct rasops_info *)cookie;
@@ -81,48 +86,128 @@ rasops32_putchar(void *cookie, int row, 
                return 0;
 #endif
 
-       rp = (int32_t *)(ri->ri_bits + row*ri->ri_yscale + col*ri->ri_xscale);
+       rp = (int64_t *)(ri->ri_bits + row*ri->ri_yscale + col*ri->ri_xscale);
 
        height = ri->ri_font->fontheight;
        width = ri->ri_font->fontwidth;
+       step = ri->ri_stride >> 3;
 
-       clr[0] = ri->ri_devcmap[(attr >> 16) & 0xf];
-       clr[1] = ri->ri_devcmap[(attr >> 24) & 0xf];
+       b = ri->ri_devcmap[(attr >> 16) & 0xf];
+       f = ri->ri_devcmap[(attr >> 24) & 0xf];
+       u.d[0][0] = b; u.d[0][1] = b;
+       u.d[1][0] = b; u.d[1][1] = f;
+       u.d[2][0] = f; u.d[2][1] = b;
+       u.d[3][0] = f; u.d[3][1] = f;
 
        if (uc == ' ') {
+               q = u.q[0];
                while (height--) {
-                       dp = rp;
-                       DELTA(rp, ri->ri_stride, int32_t *);
-
-                       for (cnt = width; cnt; cnt--)
-                               *dp++ = clr[0];
+                       /* the general, pixel-at-a-time case is fast enough */
+                       for (cnt = 0; cnt < width; cnt++)
+                               ((int *)rp)[cnt] = b;
+                       rp += step;
                }
        } else {
                uc -= ri->ri_font->firstchar;
                fr = (u_char *)ri->ri_font->data + uc * ri->ri_fontscale;
                fs = ri->ri_font->stride;
-
-               while (height--) {
-                       dp = rp;
-                       fb = fr[3] | (fr[2] << 8) | (fr[1] << 16) |
-                           (fr[0] << 24);
-                       fr += fs;
-                       DELTA(rp, ri->ri_stride, int32_t *);
-
-                       for (cnt = width; cnt; cnt--) {
-                               *dp++ = clr[(fb >> 31) & 1];
-                               fb <<= 1;
-                       }
+               /* double-pixel special cases for the common widths */
+               switch (width) {
+                       case 8:
+                               while (height--) {
+                                       fb = fr[0];
+                                       rp[0] = u.q[fb >> 6];
+                                       rp[1] = u.q[(fb >> 4) & 3];
+                                       rp[2] = u.q[(fb >> 2) & 3];
+                                       rp[3] = u.q[fb & 3];
+                                       rp += step;
+                                       fr += 1;
+                               }
+                               break;
+       
+                       case 12:
+                               while (height--) {
+                                       fb = fr[0];
+                                       rp[0] = u.q[fb >> 6];
+                                       rp[1] = u.q[(fb >> 4) & 3];
+                                       rp[2] = u.q[(fb >> 2) & 3];
+                                       rp[3] = u.q[fb & 3];
+                                       fb = fr[1];
+                                       rp[4] = u.q[fb >> 6];
+                                       rp[5] = u.q[(fb >> 4) & 3];
+                                       rp += step;
+                                       fr += 2;
+                               }
+                               break;
+                               
+                       case 16:
+                               while (height--) {
+                                       fb = fr[0];
+                                       rp[0] = u.q[fb >> 6];
+                                       rp[1] = u.q[(fb >> 4) & 3];
+                                       rp[2] = u.q[(fb >> 2) & 3];
+                                       rp[3] = u.q[fb & 3];
+                                       fb = fr[1];
+                                       rp[4] = u.q[fb >> 6];
+                                       rp[5] = u.q[(fb >> 4) & 3];
+                                       rp[6] = u.q[(fb >> 2) & 3];
+                                       rp[7] = u.q[fb & 3];
+                                       rp += step;
+                                       fr += 2;
+                               }
+                               break;  
+                       case 32:
+                               while (height--) {
+                                       fb = fr[0];
+                                       rp[0] = u.q[fb >> 6];
+                                       rp[1] = u.q[(fb >> 4) & 3];
+                                       rp[2] = u.q[(fb >> 2) & 3];
+                                       rp[3] = u.q[fb & 3];
+                                       fb = fr[1];
+                                       rp[4] = u.q[fb >> 6];
+                                       rp[5] = u.q[(fb >> 4) & 3];
+                                       rp[6] = u.q[(fb >> 2) & 3];
+                                       rp[7] = u.q[fb & 3];
+                                       fb = fr[2];
+                                       rp[8] = u.q[fb >> 6];
+                                       rp[9] = u.q[(fb >> 4) & 3];
+                                       rp[10] = u.q[(fb >> 2) & 3];
+                                       rp[11] = u.q[fb & 3];
+                                       fb = fr[3];
+                                       rp[12] = u.q[fb >> 6];
+                                       rp[13] = u.q[(fb >> 4) & 3];
+                                       rp[14] = u.q[(fb >> 2) & 3];
+                                       rp[15] = u.q[fb & 3];
+                                       rp += step;
+                                       fr += 4;
+                               }
+                               break;  
+
+
+                       default: /* there is a 5x8 font, so fall back to per-pixel */
+                               clr[0] = b;
+                               clr[1] = f;
+                               while (height--) {
+                                       fb = fr[3] | (fr[2] << 8) | (fr[1] << 16) |
+                                           (fr[0] << 24);
+                                       fr += fs;
+                                       for (cnt = 0; cnt < width; cnt++) {
+                                               ((int *)rp)[cnt] = clr[fb >> 31];
+                                               fb <<= 1;
+                                       }
+                                       rp += step;
+                               }
+                               break;
                }
        }
 
-       /* Do underline */
+       /* Do underline a pixel at a time */
        if ((attr & 1) != 0) {
-               DELTA(rp, -(ri->ri_stride << 1), int32_t *);
-
-               while (width--)
-                       *rp++ = clr[1];
+               rp -= step;
+               for (cnt = 0; cnt < width ; cnt++)
+                       ((int *)rp)[cnt] = f;
        }
 
        return 0;
 }
+

Reply via email to