I believe it is mapped as normally cached right now, rather than
uncached or write combining.

Reads aren't ultra-slow, and the timings of 48 byte writes appear to
involve a cacheline read.

128 byte writes are actually slower than 64 byte writes, which I
guessed might be because of automatic prefetching kicking in and
reading the following cacheline.


-------- Original Message --------
Subject: Re: [PATCH] Optimized rasops32 putchar
From: Mark Kettenis <mark.kette...@xs4all.nl>
Date: Sat, June 27, 2020 7:56 am
To: <jo...@armadilloaerospace.com>
Cc: tech@openbsd.org

> From: <jo...@armadilloaerospace.com>
> Date: Fri, 26 Jun 2020 07:42:50 -0700
> 
> Optimized 32 bit character rendering with unrolled rows and pairwise
> foreground / background pixel rendering.
> 
> If it weren't for the 5x8 font, I would have just assumed everything
> was an even width and made the fallback path also pairwise.
> 
> In isolation, the 16x32 character case got 2x faster, but that wasn't
> a huge real world speedup where the space rendering that was already
> at memory bandwidth limits accounted for most of the character
> rendering time. However, in combination with the previous fast
> conditional console scrolling that removes most of the space rendering,
> it becomes significant.
> 
> I also found that at least the efi and intel framebuffers are not
> currently mapped write combining, which makes this much slower than
> it should be.

Hi John,

The framebuffer should be mapped write-combining. In OpenBSD this is
requested by specifying the BUS_SPACE_MAP_PREFETCHABLE flag to
bus_space_map(9) when mapping the framebuffer.

I'm fairly confident of this, since until last January the initial
mapping of the framebuffer that we used wasn't write-combining, and
things were really, really slow.

Cheers,

Mark

> Index: rasops32.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/rasops/rasops32.c,v
> retrieving revision 1.10
> diff -u -p -r1.10 rasops32.c
> --- rasops32.c 25 May 2020 09:55:49 -0000 1.10
> +++ rasops32.c 26 Jun 2020 14:34:06 -0000
> @@ -65,9 +65,14 @@ rasops32_init(struct rasops_info *ri)
> int
> rasops32_putchar(void *cookie, int row, int col, u_int uc, uint32_t
> attr)
> {
> - int width, height, cnt, fs, fb, clr[2];
> + int width, height, step, cnt, fs, b, f;
> + uint32_t fb, clr[2];
> struct rasops_info *ri;
> - int32_t *dp, *rp;
> + int64_t *rp, q;
> + union {
> + int64_t q[4];
> + int32_t d[4][2];
> + } u;
> u_char *fr;
> 
> ri = (struct rasops_info *)cookie;
> @@ -81,48 +86,128 @@ rasops32_putchar(void *cookie, int row, 
> return 0;
> #endif
> 
> - rp = (int32_t *)(ri->ri_bits + row*ri->ri_yscale + col*ri->ri_xscale);
> + rp = (int64_t *)(ri->ri_bits + row*ri->ri_yscale + col*ri->ri_xscale);
> 
> height = ri->ri_font->fontheight;
> width = ri->ri_font->fontwidth;
> + step = ri->ri_stride >> 3;
> 
> - clr[0] = ri->ri_devcmap[(attr >> 16) & 0xf];
> - clr[1] = ri->ri_devcmap[(attr >> 24) & 0xf];
> + b = ri->ri_devcmap[(attr >> 16) & 0xf];
> + f = ri->ri_devcmap[(attr >> 24) & 0xf];
> + u.d[0][0] = b; u.d[0][1] = b;
> + u.d[1][0] = b; u.d[1][1] = f;
> + u.d[2][0] = f; u.d[2][1] = b;
> + u.d[3][0] = f; u.d[3][1] = f;
> 
> if (uc == ' ') {
> + q = u.q[0];
> while (height--) {
> - dp = rp;
> - DELTA(rp, ri->ri_stride, int32_t *);
> -
> - for (cnt = width; cnt; cnt--)
> - *dp++ = clr[0];
> + /* the general, pixel-at-a-time case is fast enough */
> + for (cnt = 0; cnt < width; cnt++)
> + ((int *)rp)[cnt] = b;
> + rp += step;
> }
> } else {
> uc -= ri->ri_font->firstchar;
> fr = (u_char *)ri->ri_font->data + uc * ri->ri_fontscale;
> fs = ri->ri_font->stride;
> -
> - while (height--) {
> - dp = rp;
> - fb = fr[3] | (fr[2] << 8) | (fr[1] << 16) |
> - (fr[0] << 24);
> - fr += fs;
> - DELTA(rp, ri->ri_stride, int32_t *);
> -
> - for (cnt = width; cnt; cnt--) {
> - *dp++ = clr[(fb >> 31) & 1];
> - fb <<= 1;
> - }
> + /* double-pixel special cases for the common widths */
> + switch (width) {
> + case 8:
> + while (height--) {
> + fb = fr[0];
> + rp[0] = u.q[fb >> 6];
> + rp[1] = u.q[(fb >> 4) & 3];
> + rp[2] = u.q[(fb >> 2) & 3];
> + rp[3] = u.q[fb & 3];
> + rp += step;
> + fr += 1;
> + }
> + break;
> + 
> + case 12:
> + while (height--) {
> + fb = fr[0];
> + rp[0] = u.q[fb >> 6];
> + rp[1] = u.q[(fb >> 4) & 3];
> + rp[2] = u.q[(fb >> 2) & 3];
> + rp[3] = u.q[fb & 3];
> + fb = fr[1];
> + rp[4] = u.q[fb >> 6];
> + rp[5] = u.q[(fb >> 4) & 3];
> + rp += step;
> + fr += 2;
> + }
> + break;
> + 
> + case 16:
> + while (height--) {
> + fb = fr[0];
> + rp[0] = u.q[fb >> 6];
> + rp[1] = u.q[(fb >> 4) & 3];
> + rp[2] = u.q[(fb >> 2) & 3];
> + rp[3] = u.q[fb & 3];
> + fb = fr[1];
> + rp[4] = u.q[fb >> 6];
> + rp[5] = u.q[(fb >> 4) & 3];
> + rp[6] = u.q[(fb >> 2) & 3];
> + rp[7] = u.q[fb & 3];
> + rp += step;
> + fr += 2;
> + }
> + break; 
> + case 32:
> + while (height--) {
> + fb = fr[0];
> + rp[0] = u.q[fb >> 6];
> + rp[1] = u.q[(fb >> 4) & 3];
> + rp[2] = u.q[(fb >> 2) & 3];
> + rp[3] = u.q[fb & 3];
> + fb = fr[1];
> + rp[4] = u.q[fb >> 6];
> + rp[5] = u.q[(fb >> 4) & 3];
> + rp[6] = u.q[(fb >> 2) & 3];
> + rp[7] = u.q[fb & 3];
> + fb = fr[2];
> + rp[8] = u.q[fb >> 6];
> + rp[9] = u.q[(fb >> 4) & 3];
> + rp[10] = u.q[(fb >> 2) & 3];
> + rp[11] = u.q[fb & 3];
> + fb = fr[3];
> + rp[12] = u.q[fb >> 6];
> + rp[13] = u.q[(fb >> 4) & 3];
> + rp[14] = u.q[(fb >> 2) & 3];
> + rp[15] = u.q[fb & 3];
> + rp += step;
> + fr += 4;
> + }
> + break; 
> +
> +
> + default: /* there is a 5x8 font, so fall back to per-pixel */
> + clr[0] = b;
> + clr[1] = f;
> + while (height--) {
> + fb = fr[3] | (fr[2] << 8) | (fr[1] << 16) |
> + (fr[0] << 24);
> + fr += fs;
> + for (cnt = 0; cnt < width; cnt++) {
> + ((int *)rp)[cnt] = clr[fb >> 31];
> + fb <<= 1;
> + }
> + rp += step;
> + }
> + break;
> }
> }
> 
> - /* Do underline */
> + /* Do underline a pixel at a time */
> if ((attr & 1) != 0) {
> - DELTA(rp, -(ri->ri_stride << 1), int32_t *);
> -
> - while (width--)
> - *rp++ = clr[1];
> + rp -= step;
> + for (cnt = 0; cnt < width ; cnt++)
> + ((int *)rp)[cnt] = f;
> }
> 
> return 0;
> }
> +
> 
>

Reply via email to