I've taken the two benchmarks byte-bs----acc and space-bs-c8-acc-1 and
gradually tweaked their inner loops from something that used memory all the
time to something that used registers more and more efficiently.  I've done
this gradually, pretty much one register at a time.  Along the way, I've also
done a simple common subexpression/loop hoisting thing in which I combined the
pointer to the start of the string and the index into the string into a single
pointer.  Doing this in real life may cause bad problems with the garbage
collector.

At the end, I go a bit mad and start doing heroic optimizations (reading four
bytes at a time, using MMX registers to read 8 bytes at a time, twisted MMX
math to keep 8 space counters in an MMX register + a bit of loop unrolling).

Here follow first the two original inner loops and then the 23 hand-tweaked
versions.

I used the following shell code to isolate the inner loops:

 # For every listing, print the lines from the Main_zdwcnt_info label through
 # the next ".section .data" line, drop that final line again (head -n-1),
 # and collect everything in xx.txt.
 # The perl one-liner has to stay on ONE line: a newline inside the regex
 # (e.g. from mail line-wrapping) means the end pattern can never match a
 # single input line, so the range would run to end of file.
 (for F in hs/byte-bs----acc.s hs/space-bs-c8-acc-1.s hand/*.s ; \
        do echo "------------------------------"; \
           echo "$F:";                            \
           echo ;                                 \
           perl -ne 'print if /Main_zdwcnt_info:/ .. /\.section \.data/' "$F" | head -n-1; \
        done; \
           echo "=============================="; \
 ) > xx.txt


-Peter

------------------------------
hs/byte-bs----acc.s:

/* hs/byte-bs----acc.s -- loop with all state kept in the frame at %ebp.
   Each pass re-enters at Main_zdwcnt_info and:
      0(%ebp)   is incremented by one
     12(%ebp)   is incremented by one
     16(%ebp)   is decremented by one; the loop stops once it is <= 0
   On exit 0(%ebp) is handed over in %esi. */
Main_zdwcnt_info:
.LcYL:
        cmpl $0,16(%ebp)        /* anything left to do? */
        jle .LcYO
        movl 12(%ebp),%eax
        incl %eax
        movl (%ebp),%ecx
        incl %ecx
        subl $1,16(%ebp)
        movl %eax,12(%ebp)      /* write both incremented values back */
        movl %ecx,(%ebp)
        jmp Main_zdwcnt_info    /* next pass starts from the very top */
.LcYO:
        movl (%ebp),%esi        /* result goes out in %esi */
        addl $20,%ebp           /* drop the 20-byte frame */
        jmp *(%ebp)             /* continue at the code pointer now at (%ebp) */
------------------------------
hs/space-bs-c8-acc-1.s:

/* hs/space-bs-c8-acc-1.s -- counts bytes equal to 32 (' ').
   All state lives in the frame at %ebp and is reloaded on every pass:
      0(%ebp)   number of spaces seen so far
      4(%ebp)   base address of the bytes
     12(%ebp)   index of the next byte
     16(%ebp)   number of bytes still to look at
   On exit the space count is handed over in %esi. */
Main_zdwcnt_info:
.Lc16u:
        cmpl $0,16(%ebp)        /* any bytes left? */
        jle .Lc16x
        movl 4(%ebp),%eax
        movl 12(%ebp),%ecx
        movzbl (%eax,%ecx,1),%eax       /* eax = base[index], zero-extended */
        cmpl $32,%eax
        jne .Lc16F
        /* space: bump index and count, one byte fewer remaining */
        movl 12(%ebp),%eax
        incl %eax
        movl (%ebp),%ecx
        incl %ecx
        subl $1,16(%ebp)
        movl %eax,12(%ebp)
        movl %ecx,(%ebp)
        jmp Main_zdwcnt_info
.Lc16x:
        movl (%ebp),%esi        /* result goes out in %esi */
        addl $20,%ebp           /* drop the 20-byte frame */
        jmp *(%ebp)             /* continue at the code pointer now at (%ebp) */
.Lc16F:
        /* not a space: bump index only, one byte fewer remaining */
        movl 12(%ebp),%eax
        incl %eax
        subl $1,16(%ebp)
        movl %eax,12(%ebp)
        jmp Main_zdwcnt_info
------------------------------
hand/byte-bs----acc-a.s:

/* hand/byte-bs----acc-a.s -- one pass = load the three frame slots into
   registers, update them, store all three back, re-enter at the top.
      0(%ebp)  -> %ecx, incremented
     12(%ebp)  -> %eax, incremented
     16(%ebp)  -> %edx, decremented; loop stops once the slot is <= 0 */
Main_zdwcnt_info:
.LcYN:
        cmpl $0,16(%ebp)        /* anything left to do? */
        jle .LcYQ

        movl 00(%ebp),%ecx
        movl 12(%ebp),%eax
        movl 16(%ebp),%edx

        incl %ecx
        incl %eax
        decl %edx

        movl %ecx,00(%ebp)
        movl %eax,12(%ebp)
        movl %edx,16(%ebp)
        jmp Main_zdwcnt_info

.LcYQ:
        movl (%ebp),%esi        /* result goes out in %esi */
        addl $20,%ebp           /* drop the 20-byte frame */
        jmp *(%ebp)             /* continue at the code pointer now at (%ebp) */
------------------------------
hand/byte-bs----acc-b.s:

/* hand/byte-bs----acc-b.s -- the three frame slots are loaded once, the
   whole loop (.L_again) runs in registers with the test at its top, and
   the slots are written back only after the loop has finished.
      %ecx = 0(%ebp), %eax = 12(%ebp), %edx = 16(%ebp) */
Main_zdwcnt_info:
.LcYN:
        cmpl $0,16(%ebp)
        jle .LcYQ

        movl 00(%ebp),%ecx
        movl 12(%ebp),%eax
        movl 16(%ebp),%edx

.L_again:
        cmpl $0,%edx            /* loop test at the top */
        jle  .L_out
        incl %ecx
        incl %eax
        decl %edx
        jmp  .L_again
.L_out:
        movl %ecx,00(%ebp)
        movl %eax,12(%ebp)
        movl %edx,16(%ebp)      /* <= 0 here, so the re-entry below exits */
        jmp Main_zdwcnt_info

.LcYQ:
        movl (%ebp),%esi        /* result goes out in %esi */
        addl $20,%ebp           /* drop the 20-byte frame */
        jmp *(%ebp)             /* continue at the code pointer now at (%ebp) */
------------------------------
hand/byte-bs----acc-c.s:

/* hand/byte-bs----acc-c.s -- register loop with the test moved to the
   bottom: one guard before the loop, then a single conditional backward
   branch per iteration.
      %ecx = 0(%ebp), %eax = 12(%ebp), %edx = 16(%ebp) */
Main_zdwcnt_info:
.LcYN:
        cmpl $0,16(%ebp)
        jle .LcYQ

        movl 00(%ebp),%ecx
        movl 12(%ebp),%eax
        movl 16(%ebp),%edx

        cmpl $0,%edx            /* guard: skip the loop if nothing to do */
        jle  .L_out
.L_again:
        incl %ecx
        incl %eax
        decl %edx
        cmpl $0,%edx            /* loop test at the bottom */
        jg   .L_again

.L_out:
        movl %ecx,00(%ebp)
        movl %eax,12(%ebp)
        movl %edx,16(%ebp)      /* <= 0 here, so the re-entry below exits */
        jmp Main_zdwcnt_info

.LcYQ:
        movl (%ebp),%esi        /* result goes out in %esi */
        addl $20,%ebp           /* drop the 20-byte frame */
        jmp *(%ebp)             /* continue at the code pointer now at (%ebp) */
------------------------------
hand/byte-bs----acc-d.s:

/* hand/byte-bs----acc-d.s -- bottom-tested register loop whose head
   (.L_again) is additionally aligned to a 16-byte boundary.
      %ecx = 0(%ebp), %eax = 12(%ebp), %edx = 16(%ebp) */
Main_zdwcnt_info:
.LcYN:
        cmpl $0,16(%ebp)
        jle .LcYQ

        movl 00(%ebp),%ecx
        movl 12(%ebp),%eax
        movl 16(%ebp),%edx

        cmpl $0,%edx            /* guard: skip the loop if nothing to do */
        jle  .L_out
        .align 16               /* pad so the loop head starts on a 16-byte boundary */
.L_again:
        incl %ecx
        incl %eax
        decl %edx
        cmpl $0,%edx            /* loop test at the bottom */
        jg   .L_again

.L_out:
        movl %ecx,00(%ebp)
        movl %eax,12(%ebp)
        movl %edx,16(%ebp)      /* <= 0 here, so the re-entry below exits */
        jmp Main_zdwcnt_info

.LcYQ:
        movl (%ebp),%esi        /* result goes out in %esi */
        addl $20,%ebp           /* drop the 20-byte frame */
        jmp *(%ebp)             /* continue at the code pointer now at (%ebp) */
------------------------------
hand/space-bs-c8-acc-1-a.s:

/* hand/space-bs-c8-acc-1-a.s -- counts bytes equal to 32 (' ') with all
   state in the frame at %ebp:
      0(%ebp) space count, 4(%ebp) base address,
     12(%ebp) index,      16(%ebp) bytes remaining.
   The not-a-space block (.Lc16H) is placed before the exit block. */
Main_zdwcnt_info:

.Lc16w:
        cmpl $0,16(%ebp)        /* any bytes left? */
        jle .Lc16z
        movl 4(%ebp),%eax
        movl 12(%ebp),%ecx
        movzbl (%eax,%ecx,1),%eax       /* eax = base[index], zero-extended */
        cmpl $32,%eax
        jne .Lc16H
        /* space: bump index and count, one byte fewer remaining */
        movl 12(%ebp),%eax
        incl %eax
        movl (%ebp),%ecx
        incl %ecx
        subl $1,16(%ebp)
        movl %eax,12(%ebp)
        movl %ecx,(%ebp)
        jmp Main_zdwcnt_info
.Lc16H:
        /* not a space: bump index only */
        movl 12(%ebp),%eax
        incl %eax
        subl $1,16(%ebp)
        movl %eax,12(%ebp)
        jmp Main_zdwcnt_info
.Lc16z:
        movl (%ebp),%esi        /* result goes out in %esi */
        addl $20,%ebp           /* drop the 20-byte frame */
        jmp *(%ebp)             /* continue at the code pointer now at (%ebp) */



------------------------------
hand/space-bs-c8-acc-1-b.s:

/* hand/space-bs-c8-acc-1-b.s -- counts bytes equal to 32 (' ') with all
   state in the frame at %ebp:
      0(%ebp) space count, 4(%ebp) base address,
     12(%ebp) index,      16(%ebp) bytes remaining.
   The branch sense is inverted (je): the not-a-space case is the
   fall-through path and the space case is the taken branch. */
Main_zdwcnt_info:

.Lc16w:
        cmpl $0,16(%ebp)        /* any bytes left? */
        jle .Lc16z
        movl 4(%ebp),%eax
        movl 12(%ebp),%ecx
        movzbl (%eax,%ecx,1),%eax       /* eax = base[index], zero-extended */
        cmpl $32,%eax
        je .Lc16H

        /* not a space: bump index only */
        movl 12(%ebp),%eax
        incl %eax
        subl $1,16(%ebp)
        movl %eax,12(%ebp)
        jmp Main_zdwcnt_info

.Lc16H:
        /* space: bump index and count */
        movl 12(%ebp),%eax
        incl %eax
        movl (%ebp),%ecx
        incl %ecx
        subl $1,16(%ebp)
        movl %eax,12(%ebp)
        movl %ecx,(%ebp)
        jmp Main_zdwcnt_info

.Lc16z:
        movl (%ebp),%esi        /* result goes out in %esi */
        addl $20,%ebp           /* drop the 20-byte frame */
        jmp *(%ebp)             /* continue at the code pointer now at (%ebp) */



------------------------------
hand/space-bs-c8-acc-1-c.s:

/* hand/space-bs-c8-acc-1-c.s -- counts bytes equal to 32 (' ') with all
   state in the frame at %ebp:
      0(%ebp) space count, 4(%ebp) base address,
     12(%ebp) index,      16(%ebp) bytes remaining.
   Within each path the updates are ordered count, index, remaining, with
   the decrement of 16(%ebp) last. */
Main_zdwcnt_info:

.Lc16w:
        cmpl $0,16(%ebp)        /* any bytes left? */
        jle .Lc16z
        movl 4(%ebp),%eax
        movl 12(%ebp),%ecx
        movzbl (%eax,%ecx,1),%eax       /* eax = base[index], zero-extended */
        cmpl $32,%eax
        jne .Lc16H
        /* space: bump count and index */
        movl (%ebp),%ecx
        incl %ecx
        movl 12(%ebp),%eax
        incl %eax
        movl %ecx,(%ebp)
        movl %eax,12(%ebp)
        subl $1,16(%ebp)
        jmp Main_zdwcnt_info
.Lc16z:
        movl (%ebp),%esi        /* result goes out in %esi */
        addl $20,%ebp           /* drop the 20-byte frame */
        jmp *(%ebp)             /* continue at the code pointer now at (%ebp) */
.Lc16H:
        /* not a space: bump index only */
        movl 12(%ebp),%eax
        incl %eax
        movl %eax,12(%ebp)
        subl $1,16(%ebp)
        jmp Main_zdwcnt_info



------------------------------
hand/space-bs-c8-acc-1-d.s:

/* hand/space-bs-c8-acc-1-d.s -- counts bytes equal to 32 (' ') with all
   state in the frame at %ebp:
      0(%ebp) space count, 4(%ebp) base address,
     12(%ebp) index,      16(%ebp) bytes remaining.
   The counters are updated with read-modify-write instructions directly
   on the frame slots instead of load / inc / store sequences. */
Main_zdwcnt_info:

.Lc16w:
        cmpl $0,16(%ebp)        /* any bytes left? */
        jle .Lc16z
        movl 4(%ebp),%eax
        movl 12(%ebp),%ecx
        movzbl (%eax,%ecx,1),%eax       /* eax = base[index], zero-extended */
        cmpl $32,%eax
        jne .Lc16H

        addl $1,(%ebp)          /* space: count++ */
        addl $1,12(%ebp)        /* index++ */
        subl $1,16(%ebp)        /* remaining-- */
        jmp Main_zdwcnt_info
.Lc16z:
        movl (%ebp),%esi        /* result goes out in %esi */
        addl $20,%ebp           /* drop the 20-byte frame */
        jmp *(%ebp)             /* continue at the code pointer now at (%ebp) */
.Lc16H:
        addl $1,12(%ebp)        /* not a space: index++ only */
        subl $1,16(%ebp)
        jmp Main_zdwcnt_info



------------------------------
hand/space-bs-c8-acc-1-e.s:

/* hand/space-bs-c8-acc-1-e.s -- counts bytes equal to 32 (' ') with all
   state in the frame at %ebp:
      0(%ebp) space count, 4(%ebp) base address,
     12(%ebp) index,      16(%ebp) bytes remaining.
   The index loaded into %ecx for the byte fetch is kept and reused for
   the increment, so 12(%ebp) is read only once per pass.
   (A leftover reload of 12(%ebp) into %eax, whose value was overwritten
   before any use, has been removed from the space path.) */
Main_zdwcnt_info:

.Lc16w:
        cmpl $0,16(%ebp)        /* any bytes left? */
        jle .Lc16z
        movl 4(%ebp),%eax
        movl 12(%ebp),%ecx      /* ecx = index, stays live until stored back */
        movzbl (%eax,%ecx,1),%eax       /* eax = base[index], zero-extended */
        cmpl $32,%eax
        jne .Lc16H

        /* space: bump index (already in ecx) and count */
        incl  %ecx
        movl (%ebp),%eax
        incl %eax
        subl $1,16(%ebp)
        movl %ecx,12(%ebp)
        movl %eax,(%ebp)
        jmp Main_zdwcnt_info
.Lc16z:
        movl (%ebp),%esi        /* result goes out in %esi */
        addl $20,%ebp           /* drop the 20-byte frame */
        jmp *(%ebp)             /* continue at the code pointer now at (%ebp) */
.Lc16H:
        /* not a space: bump index only */
        incl %ecx
        subl $1,16(%ebp)
        movl %ecx,12(%ebp)
        jmp Main_zdwcnt_info



------------------------------
hand/space-bs-c8-acc-1-f.s:

/* hand/space-bs-c8-acc-1-f.s -- counts bytes equal to 32 (' ') with all
   state in the frame at %ebp:
      0(%ebp) space count, 4(%ebp) base address,
     12(%ebp) index,      16(%ebp) bytes remaining.
   The index is reused from %ecx; the space count and the remaining count
   are updated in place in the frame. */
Main_zdwcnt_info:

.Lc16w:
        cmpl $0,16(%ebp)        /* any bytes left? */
        jle .Lc16z
        movl 4(%ebp),%eax
        movl 12(%ebp),%ecx      /* ecx = index, stays live until stored back */
        movzbl (%eax,%ecx,1),%eax       /* eax = base[index], zero-extended */
        cmpl $32,%eax
        jne .Lc16H

        incl  %ecx              /* space: index++ */
        subl $1,16(%ebp)        /* remaining-- */
        addl $1,(%ebp)          /* count++ */
        movl %ecx,12(%ebp)
        jmp Main_zdwcnt_info
.Lc16z:
        movl (%ebp),%esi        /* result goes out in %esi */
        addl $20,%ebp           /* drop the 20-byte frame */
        jmp *(%ebp)             /* continue at the code pointer now at (%ebp) */
.Lc16H:
        incl %ecx               /* not a space: index++ only */
        subl $1,16(%ebp)
        movl %ecx,12(%ebp)
        jmp Main_zdwcnt_info



------------------------------
hand/space-bs-c8-acc-1-g.s:

/* hand/space-bs-c8-acc-1-g.s -- counts bytes equal to 32 (' ').
   The space count is loaded into %esi once at entry and stays there, so
   0(%ebp) is no longer written and the exit needs no reload.  The loop
   now branches back to .Lc16w, skipping that one-time load.
      4(%ebp) base address, 12(%ebp) index, 16(%ebp) bytes remaining */
Main_zdwcnt_info:
        movl (%ebp),%esi        /* esi = space count for the whole loop */

.Lc16w:
        cmpl $0,16(%ebp)        /* any bytes left? */
        jle .Lc16z
        movl 4(%ebp),%eax
        movl 12(%ebp),%ecx
        movzbl (%eax,%ecx,1),%eax       /* eax = base[index], zero-extended */
        cmpl $32,%eax
        jne .Lc16H

        incl  %ecx              /* space: index++ */
        subl $1,16(%ebp)        /* remaining-- */
        inc  %esi               /* count++ */
        movl %ecx,12(%ebp)
        jmp .Lc16w
.Lc16z:
        addl $20,%ebp           /* drop the 20-byte frame; result already in %esi */
        jmp *(%ebp)             /* continue at the code pointer now at (%ebp) */
.Lc16H:
        incl %ecx               /* not a space: index++ only */
        subl $1,16(%ebp)
        movl %ecx,12(%ebp)
        jmp .Lc16w



------------------------------
hand/space-bs-c8-acc-1-h.s:

/* hand/space-bs-c8-acc-1-h.s -- counts bytes equal to 32 (' ').
   Space count (%esi) and index (%ecx) are loaded once at entry and kept
   in registers; neither is written back to the frame.
      4(%ebp) base address (reloaded each pass), 16(%ebp) bytes remaining */
Main_zdwcnt_info:
        movl (%ebp),%esi        /* esi = space count */
        movl 12(%ebp),%ecx      /* ecx = index */

.Lc16w:
        cmpl $0,16(%ebp)        /* any bytes left? */
        jle .Lc16z
        movl 4(%ebp),%eax
        movzbl (%eax,%ecx,1),%eax       /* eax = base[index], zero-extended */
        cmpl $32,%eax
        jne .Lc16H

        incl  %ecx              /* space: index++ */
        subl $1,16(%ebp)        /* remaining-- */
        inc  %esi               /* count++ */
        jmp .Lc16w
.Lc16z:
        addl $20,%ebp           /* drop the 20-byte frame; result already in %esi */
        jmp *(%ebp)             /* continue at the code pointer now at (%ebp) */
.Lc16H:
        incl %ecx               /* not a space: index++ only */
        subl $1,16(%ebp)
        jmp .Lc16w



------------------------------
hand/space-bs-c8-acc-1-i.s:

/* hand/space-bs-c8-acc-1-i.s -- counts bytes equal to 32 (' ').
   Space count (%esi), index (%ecx) and bytes remaining (%edx) are all
   held in registers; the only per-pass frame access left is the reload
   of the base address from 4(%ebp). */
Main_zdwcnt_info:
        movl (%ebp),%esi        /* esi = space count */
        movl 12(%ebp),%ecx      /* ecx = index */
        movl 16(%ebp),%edx      /* edx = bytes remaining */

.Lc16w:
        cmpl $0,%edx            /* any bytes left? */
        jle .Lc16z
        movl 4(%ebp),%eax
        movzbl (%eax,%ecx,1),%eax       /* eax = base[index], zero-extended */
        cmpl $32,%eax
        jne .Lc16H

        incl  %ecx              /* space: index++ */
        decl %edx               /* remaining-- */
        inc  %esi               /* count++ */
        jmp .Lc16w
.Lc16z:
        addl $20,%ebp           /* drop the 20-byte frame; result already in %esi */
        jmp *(%ebp)             /* continue at the code pointer now at (%ebp) */
.Lc16H:
        incl %ecx               /* not a space: index++ only */
        decl %edx
        jmp .Lc16w



------------------------------
hand/space-bs-c8-acc-1-j.s:

/* hand/space-bs-c8-acc-1-j.s -- counts bytes equal to 32 (' ').
   Base address and index are summed once into a single running pointer
   in %ecx, so the loop body no longer touches the frame at all.
      %esi = space count, %ecx = address of next byte, %edx = bytes remaining */
Main_zdwcnt_info:
        movl (%ebp),%esi        /* esi = space count */
        movl 4(%ebp),%ecx       /* ecx = base address ... */
        addl 12(%ebp),%ecx      /* ... + index */
        movl 16(%ebp),%edx      /* edx = bytes remaining */

.Lc16w:
        cmpl $0,%edx            /* any bytes left? */
        jle .Lc16z
        movzbl (%ecx),%eax      /* eax = next byte, zero-extended */
        cmpl $32,%eax
        jne .Lc16H

        incl %ecx               /* space: advance pointer */
        decl %edx               /* remaining-- */
        inc  %esi               /* count++ */
        jmp .Lc16w
.Lc16z:
        addl $20,%ebp           /* drop the 20-byte frame; result already in %esi */
        jmp *(%ebp)             /* continue at the code pointer now at (%ebp) */
.Lc16H:
        incl %ecx               /* not a space: advance pointer only */
        decl %edx
        jmp .Lc16w



------------------------------
hand/space-bs-c8-acc-1-k.s:

/* hand/space-bs-c8-acc-1-k.s -- counts bytes equal to 32 (' ').
   All loop state is in registers; the exit block (.Lc16z) is laid out
   after both loop paths, so the loop code is contiguous.
      %esi = space count, %ecx = address of next byte, %edx = bytes remaining */
Main_zdwcnt_info:
        movl (%ebp),%esi        /* esi = space count */
        movl 4(%ebp),%ecx       /* ecx = base address ... */
        addl 12(%ebp),%ecx      /* ... + index */
        movl 16(%ebp),%edx      /* edx = bytes remaining */

.Lc16w:
        cmpl $0,%edx            /* any bytes left? */
        jle .Lc16z
        movzbl (%ecx),%eax      /* eax = next byte, zero-extended */
        cmpl $32,%eax
        jne .Lc16H

        incl %ecx               /* space: advance pointer */
        decl %edx               /* remaining-- */
        inc  %esi               /* count++ */
        jmp .Lc16w
.Lc16H:
        incl %ecx               /* not a space: advance pointer only */
        decl %edx
        jmp .Lc16w
.Lc16z:
        addl $20,%ebp           /* drop the 20-byte frame; result already in %esi */
        jmp *(%ebp)             /* continue at the code pointer now at (%ebp) */



------------------------------
hand/space-bs-c8-acc-1-l.s:

/* hand/space-bs-c8-acc-1-l.s -- counts bytes equal to 32 (' ').
   The pointer advance and remaining-count decrement, common to both
   paths, are done once right after the load and before the compare.
   That leaves .Lc16H as nothing but a jump back to the loop head.
      %esi = space count, %ecx = address of next byte, %edx = bytes remaining */
Main_zdwcnt_info:
        movl (%ebp),%esi        /* esi = space count */
        movl 4(%ebp),%ecx       /* ecx = base address ... */
        addl 12(%ebp),%ecx      /* ... + index */
        movl 16(%ebp),%edx      /* edx = bytes remaining */

.Lc16w:
        cmpl $0,%edx            /* any bytes left? */
        jle .Lc16z
        movzbl (%ecx),%eax      /* eax = next byte, zero-extended */
        incl %ecx               /* advance pointer (both paths) */
        decl %edx               /* remaining-- (both paths) */
        cmpl $32,%eax
        jne .Lc16H

        inc  %esi               /* space: count++ */
        jmp .Lc16w
.Lc16H:
        jmp .Lc16w              /* not a space: nothing else to do */
.Lc16z:
        addl $20,%ebp           /* drop the 20-byte frame; result already in %esi */
        jmp *(%ebp)             /* continue at the code pointer now at (%ebp) */



------------------------------
hand/space-bs-c8-acc-1-m.s:

/* hand/space-bs-c8-acc-1-m.s -- counts bytes equal to 32 (' ').
   The not-a-space case branches straight back to the loop head; there is
   no separate block for it.
      %esi = space count, %ecx = address of next byte, %edx = bytes remaining */
Main_zdwcnt_info:
        movl (%ebp),%esi        /* esi = space count */
        movl 4(%ebp),%ecx       /* ecx = base address ... */
        addl 12(%ebp),%ecx      /* ... + index */
        movl 16(%ebp),%edx      /* edx = bytes remaining */

.Lc16w:
        cmpl $0,%edx            /* any bytes left? */
        jle .Lc16z
        movzbl (%ecx),%eax      /* eax = next byte, zero-extended */
        incl %ecx               /* advance pointer */
        decl %edx               /* remaining-- */
        cmpl $32,%eax
        jne .Lc16w              /* not a space: straight to the next byte */

        inc  %esi               /* space: count++ */
        jmp .Lc16w

.Lc16z:
        addl $20,%ebp           /* drop the 20-byte frame; result already in %esi */
        jmp *(%ebp)             /* continue at the code pointer now at (%ebp) */



------------------------------
hand/space-bs-c8-acc-1-n.s:

/* hand/space-bs-c8-acc-1-n.s -- counts bytes equal to 32 (' ').
   After a space the remaining-count test is repeated at the bottom and
   branches to .Lc16xx (just past the top test), so the space path needs
   one conditional branch instead of an unconditional jump plus the top
   test.  On exhaustion it falls through into the exit block.
      %esi = space count, %ecx = address of next byte, %edx = bytes remaining */
Main_zdwcnt_info:
        movl (%ebp),%esi        /* esi = space count */
        movl 4(%ebp),%ecx       /* ecx = base address ... */
        addl 12(%ebp),%ecx      /* ... + index */
        movl 16(%ebp),%edx      /* edx = bytes remaining */

.Lc16w:
        cmpl $0,%edx            /* any bytes left? */
        jle .Lc16z
.Lc16xx:
        movzbl (%ecx),%eax      /* eax = next byte, zero-extended */
        incl %ecx               /* advance pointer */
        decl %edx               /* remaining-- */
        cmpl $32,%eax
        jne .Lc16w              /* not a space: back to the top test */

        inc  %esi               /* space: count++ */
        cmpl $0,%edx            /* bottom test: more bytes? */
        jg  .Lc16xx

.Lc16z:
        addl $20,%ebp           /* drop the 20-byte frame; result already in %esi */
        jmp *(%ebp)             /* continue at the code pointer now at (%ebp) */



------------------------------
hand/space-bs-c8-acc-1-o.s:

/* hand/space-bs-c8-acc-1-o.s -- counts bytes equal to 32 (' ').
   The loop body is written out twice in a row.  A run of spaces flows
   through both copies before branching back; any non-space byte returns
   to the top test at .Lc16w.
      %esi = space count, %ecx = address of next byte, %edx = bytes remaining */
Main_zdwcnt_info:
        movl (%ebp),%esi        /* esi = space count */
        movl 4(%ebp),%ecx       /* ecx = base address ... */
        addl 12(%ebp),%ecx      /* ... + index */
        movl 16(%ebp),%edx      /* edx = bytes remaining */

.Lc16w:
        cmpl $0,%edx            /* any bytes left? */
        jle .Lc16z
.Lc16xx:
        /* first copy of the body */
        movzbl (%ecx),%eax
        incl %ecx
        decl %edx
        cmpl $32,%eax
        jne .Lc16w

        inc  %esi
        cmpl $0,%edx
        jle  .Lc16z

        /* second copy of the body */
        movzbl (%ecx),%eax
        incl %ecx
        decl %edx
        cmpl $32,%eax
        jne .Lc16w

        inc  %esi
        cmpl $0,%edx
        jg  .Lc16xx

.Lc16z:
        addl $20,%ebp           /* drop the 20-byte frame; result already in %esi */
        jmp *(%ebp)             /* continue at the code pointer now at (%ebp) */



------------------------------
hand/space-bs-c8-acc-1-p.s:

/* hand/space-bs-c8-acc-1-p.s -- counts bytes equal to 32 (' ').
   Main loop (.Lc16w4): while at least 4 bytes remain, fetch them with a
   single 32-bit load and test each byte via %al / %ah, shifting the
   upper half down in between.
   Tail loop (.Lc16w1): handles the final 0..3 bytes one at a time.
      %esi = space count, %ecx = address of next byte, %edx = bytes remaining

   FIX: the main loop used to leave through a label placed after the tail
   loop's "remaining <= 0" test.  With 0 bytes remaining (empty input, or
   a length that is a multiple of 4) that read one byte past the end of
   the data and could count it as a space.  It now enters the tail loop
   at its guarded head. */
Main_zdwcnt_info:
        movl (%ebp),%esi        /* esi = space count */
        movl 4(%ebp),%ecx       /* ecx = base address ... */
        addl 12(%ebp),%ecx      /* ... + index */
        movl 16(%ebp),%edx      /* edx = bytes remaining */

.Lc16w4:
        cmpl $4,%edx
        jl  .Lc16w1             /* fewer than 4 left: guarded byte loop */
        movl (%ecx),%eax        /* eax = next 4 bytes, lowest address in %al */

        addl $4,%ecx
        subl $4,%edx

        cmpb $32,%al            /* byte 0 */
        jne .Lc16wa
        incl %esi
.Lc16wa:
        cmpb $32,%ah            /* byte 1 */
        jne .Lc16wb
        incl %esi
.Lc16wb:
        shrl $16,%eax           /* bring bytes 2 and 3 down into %al / %ah */

        cmpb $32,%al            /* byte 2 */
        jne .Lc16wc
        incl %esi
.Lc16wc:
        cmpb $32,%ah            /* byte 3 */
        jne .Lc16w4
        incl %esi
        jmp .Lc16w4



.Lc16w1:
        cmpl $0,%edx            /* any bytes left? */
        jle .Lc16z
        movzbl (%ecx),%eax      /* eax = next byte, zero-extended */
        incl %ecx
        decl %edx
        cmpl $32,%eax
        jne .Lc16w1

        inc  %esi               /* space: count++ */
        jmp .Lc16w1


.Lc16z:
        addl $20,%ebp           /* drop the 20-byte frame; result already in %esi */
        jmp *(%ebp)             /* continue at the code pointer now at (%ebp) */



------------------------------
hand/space-bs-c8-acc-1-q.s:

/* hand/space-bs-c8-acc-1-q.s -- counts bytes equal to 32 (' '), 8 bytes per
   pass using MMX.
      %mm1 = 0x20 in every byte (comparison pattern)
      %mm2 = 0x01 in every byte (mask turning FF flags into 1s)
   Per 8-byte group the compare result is masked to 0/1 per byte, then
   summed two bytes at a time through %al/%ah.  Each pair sum is 0..2,
   hence the "and $0x03".  push/pop is used to keep a copy of the 32-bit
   half while its low pair is being summed. */
Main_zdwcnt_info:
        movl (%ebp),%esi        /* #spaces found          */
        movl 4(%ebp),%ecx       /* ptr                    */
        addl 12(%ebp),%ecx      /* ... + idx              */
        movl 16(%ebp),%edx      /* cnt of remaining bytes */

        emms                    /* clear fp tags so we can use mmx instrs */

        mov   $0x20202020,%eax
        movd  %eax,%mm1         /* mm1: 0000000020202020 */
        movq  %mm1,%mm0         /* mm0: 0000000020202020 */
        psllq $32,%mm1          /* mm1: 2020202000000000 */
        por   %mm0,%mm1         /* mm1: 2020202020202020 */

        mov   $0x01010101,%eax
        movd  %eax,%mm2         /* mm2: 0000000001010101 */
        movq  %mm2,%mm0         /* mm0: 0000000001010101 */
        psllq $32,%mm2          /* mm2: 0101010100000000 */
        por   %mm0,%mm2         /* mm2: 0101010101010101 */

        /* MMX loads can use any alignment (potentially at a speed-hit) */

        /* this loop looks at 8 bytes at a time */
.Lc16w8:
        cmpl $8,%edx
        jl  .Lc16w1
        movq (%ecx),%mm0        /* mm0 holds 8 characters */
        addl $8,%ecx
        subl $8,%edx
        pcmpeqb %mm1,%mm0       /* cmp byte for byte with ' ' */
                                /* the result flag is 00 or FF */
        pand  %mm2,%mm0         /* turn FF into 01, which is actually useful */
        
        /* if we could just add the bytes up horizontally in %mm0, sigh.. .*/
        /* low 4 flags: bytes 0+1, then bytes 2+3 */
        movd  %mm0,%eax
        push  %eax
        add   %ah, %al
        and   $0x03,%eax
        add   %eax,%esi
        pop   %eax
        shr   $16,%eax
        add   %ah,%al
        and   $0x03,%eax
        add   %eax,%esi

        /* high 4 flags: bytes 4+5, then bytes 6+7 */
        psrlq $32,%mm0
        movd  %mm0,%eax
        push  %eax
        add   %ah, %al
        and   $0x03,%eax
        add   %eax,%esi
        pop   %eax
        shr   $16,%eax
        add   %ah,%al
        and   $0x03,%eax
        add   %eax,%esi

        jmp .Lc16w8


        /* this loop looks at one byte at a time to handle the remainder */
.Lc16w1:
        cmpl $0,%edx
        jle .Lc16z
        movzbl (%ecx),%eax
        incl %ecx
        decl %edx
        cmpl $32,%eax
        jne .Lc16w1

        inc  %esi
        jmp .Lc16w1


        /* done, remember to clear fp/mmx tags with emms */
.Lc16z:
        emms
        addl $20,%ebp           /* drop the 20-byte frame; result already in %esi */
        jmp *(%ebp)             /* continue at the code pointer now at (%ebp) */



------------------------------
hand/space-bs-c8-acc-1-r.s:

/* hand/space-bs-c8-acc-1-r.s -- counts bytes equal to 32 (' ') using MMX,
   with the horizontal summation hoisted out of the 8-byte loop.
      %mm1 = 0x20 in every byte, %mm2 = 0x01 in every byte
      %mm3 = eight per-lane byte counters, cleared per batch
   Outer loop (.Lc16_mainloop): picks a batch of min(remaining/8, 127)
   groups and subtracts its byte size from %edx up front.
   Inner loop (.Lc16w8): adds the 0/1 flags of each group into %mm3.
   After the batch the eight lanes are summed pairwise into %esi.  With
   at most 127 groups per batch each lane is <= 127, so a pair sum is
   <= 254 and still fits in %al. */
Main_zdwcnt_info:
        movl (%ebp),%esi        /* #spaces found          */
        movl 4(%ebp),%ecx       /* ptr                    */
        addl 12(%ebp),%ecx      /* ... + idx              */
        movl 16(%ebp),%edx      /* cnt of remaining bytes */

        emms                    /* clear fp tags so we can use mmx instrs */

        mov   $0x20202020,%eax
        movd  %eax,%mm1         /* mm1: 0000000020202020 */
        movq  %mm1,%mm0         /* mm0: 0000000020202020 */
        psllq $32,%mm1          /* mm1: 2020202000000000 */
        por   %mm0,%mm1         /* mm1: 2020202020202020 */

        mov   $0x01010101,%eax
        movd  %eax,%mm2         /* mm2: 0000000001010101 */
        movq  %mm2,%mm0         /* mm0: 0000000001010101 */
        psllq $32,%mm2          /* mm2: 0101010100000000 */
        por   %mm0,%mm2         /* mm2: 0101010101010101 */

        /* MMX loads can use any alignment (potentially at a speed-hit)   */
        /* therefore we don't have to try to read 1-7 bytes one at a time */
        /* first in order to end up with an aligned %ecx.                 */

.Lc16_mainloop:
        cmpl $8,%edx
        jl   .Lc16w1
        movl %edx,%eax
        shr  $3,%eax            /* eax = number of whole 8-byte groups left */
        cmpl $127,%eax
        jle  .Lc16_127
        movl $127,%eax          /* cap the batch at 127 groups */
.Lc16_127:

        shl  $3,%eax            /* eax = batch size in bytes */
        sub  %eax,%edx          /* account for the whole batch now */
        shr  $3,%eax            /* eax = batch size in groups again */
        
        pxor %mm3,%mm3          /* clear block of space counters */

        /* loop up to 127 times in a loop that looks at 8 bytes at a time. */
        /* Going above 255 could overflow the 8 counters in mm3. */
        /* Going above 127 could overflow the horizontal summation code. */

.Lc16w8:
        cmpl $0,%eax
        jle  .Lc16w8end
        movq (%ecx),%mm0        /* mm0 holds 8 characters */
        addl $8,%ecx
        decl %eax
        pcmpeqb %mm1,%mm0       /* cmp byte for byte with ' ' */
                                /* the result flag is 00 or FF */
        pand  %mm2,%mm0         /* turn FF into 01, which is actually useful */
        paddb %mm0,%mm3         /* add to the 8 space counters */
        jmp   .Lc16w8

.Lc16w8end:
        /* sum the 8 space counters in mm3 and add to %esi */

        /* if only MMX had horizontal byte adds... */
        /* low 4 lanes: lanes 0+1, then lanes 2+3 */
        movd  %mm3,%eax
        push  %eax
        add   %ah, %al          /* NOTE!  potential overflow! */
        and   $0xFF,%eax
        add   %eax,%esi
        pop   %eax
        shr   $16,%eax
        add   %ah,%al           /* NOTE!  potential overflow! */
        and   $0xFF,%eax
        add   %eax,%esi

        /* high 4 lanes: lanes 4+5, then lanes 6+7 */
        psrlq $32,%mm3
        movd  %mm3,%eax
        push  %eax
        add   %ah, %al          /* NOTE!  potential overflow! */
        and   $0xFF,%eax
        add   %eax,%esi
        pop   %eax
        shr   $16,%eax
        add   %ah,%al           /* NOTE!  potential overflow! */
        and   $0xFF,%eax
        add   %eax,%esi

        jmp .Lc16_mainloop


        /* this loop looks at one byte at a time to handle the remainder */
.Lc16w1:
        cmpl $0,%edx
        jle .Lc16z
        movzbl (%ecx),%eax
        incl %ecx
        decl %edx
        cmpl $32,%eax
        jne .Lc16w1

        inc  %esi
        jmp .Lc16w1


        /* done, remember to clear fp/mmx tags with emms */
.Lc16z:
        emms
        addl $20,%ebp           /* drop the 20-byte frame; result already in %esi */
        jmp *(%ebp)             /* continue at the code pointer now at (%ebp) */



------------------------------
hand/space-bs-c8-acc-1-s.s:

/* hand/space-bs-c8-acc-1-s.s -- counts bytes equal to 32 (' ') using MMX,
   batched like a 127-group accumulator loop, with the inner loop
   unrolled twice.
      %mm1 = 0x20 in every byte, %mm2 = 0x01 in every byte
      %mm3 = eight per-lane byte counters, cleared per batch
   Unrolling: the loop body processes two groups and subtracts 2 from the
   group count.  An odd count is rounded up by one and the loop is entered
   at its second half (.Lc16w8x), so the first trip handles a single group
   and the count is even from then on. */
Main_zdwcnt_info:
        movl (%ebp),%esi        /* #spaces found          */
        movl 4(%ebp),%ecx       /* ptr                    */
        addl 12(%ebp),%ecx      /* ... + idx              */
        movl 16(%ebp),%edx      /* cnt of remaining bytes */

        emms                    /* clear fp tags so we can use mmx instrs */

        mov   $0x20202020,%eax
        movd  %eax,%mm1         /* mm1: 0000000020202020 */
        movq  %mm1,%mm0         /* mm0: 0000000020202020 */
        psllq $32,%mm1          /* mm1: 2020202000000000 */
        por   %mm0,%mm1         /* mm1: 2020202020202020 */

        mov   $0x01010101,%eax
        movd  %eax,%mm2         /* mm2: 0000000001010101 */
        movq  %mm2,%mm0         /* mm0: 0000000001010101 */
        psllq $32,%mm2          /* mm2: 0101010100000000 */
        por   %mm0,%mm2         /* mm2: 0101010101010101 */

        /* MMX loads can use any alignment (potentially at a speed-hit)   */
        /* therefore we don't have to try to read 1-7 bytes one at a time */
        /* first in order to end up with an aligned %ecx.                 */

.Lc16_mainloop:
        cmpl $8,%edx
        jl   .Lc16w1
        movl %edx,%eax
        shr  $3,%eax            /* eax = number of whole 8-byte groups left */
        cmpl $127,%eax
        jle  .Lc16_127
        movl $127,%eax          /* cap the batch at 127 groups */
.Lc16_127:

        shl  $3,%eax            /* eax = batch size in bytes */
        sub  %eax,%edx          /* account for the whole batch now */
        shr  $3,%eax            /* eax = batch size in groups again */
        
        pxor %mm3,%mm3          /* clear block of space counters */

        /* loop up to 127 times in a loop that looks at 8 bytes at a time. */
        /* Going above 255 could overflow the 8 counters in mm3. */
        /* Going above 127 could overflow the horizontal summation code. */
        

        cmpl $0,%eax
        jle  .Lc16w8end

        /* this is an unspeakably ugly and sloppy loop unrolling.  Doesn't  */
        /* seem to help much on an Athlon64 3000+.                          */
        test $1,%eax
        jz   .Lc16w8            /* even count: start with a full double step */
        incl %eax               /* odd count: round up ...                   */
        jmp  .Lc16w8x           /* ... and start with the second half only   */
        
.Lc16w8:
        movq (%ecx),%mm0        /* mm0 holds 8 characters */
        addl $8,%ecx
        pcmpeqb %mm1,%mm0       /* cmp byte for byte with ' ' */
                                /* the result flag is 00 or FF */
        pand  %mm2,%mm0         /* turn FF into 01, which is actually useful */
        paddb %mm0,%mm3         /* add to the 8 space counters */

.Lc16w8x:
        movq (%ecx),%mm0        /* mm0 holds 8 characters */
        addl $8,%ecx
        pcmpeqb %mm1,%mm0       /* cmp byte for byte with ' ' */
                                /* the result flag is 00 or FF */
        pand  %mm2,%mm0         /* turn FF into 01, which is actually useful */
        paddb %mm0,%mm3         /* add to the 8 space counters */

        subl  $2,%eax           /* two groups accounted for per trip */
        jnz  .Lc16w8

.Lc16w8end:
        /* sum the 8 space counters in mm3 and add to %esi */

        /* if only MMX had horizontal byte adds... */
        /* low 4 lanes: lanes 0+1, then lanes 2+3 */
        movd  %mm3,%eax
        push  %eax
        add   %ah, %al          /* NOTE!  potential overflow! */
        and   $0xFF,%eax
        add   %eax,%esi
        pop   %eax
        shr   $16,%eax
        add   %ah,%al           /* NOTE!  potential overflow! */
        and   $0xFF,%eax
        add   %eax,%esi

        /* high 4 lanes: lanes 4+5, then lanes 6+7 */
        psrlq $32,%mm3
        movd  %mm3,%eax
        push  %eax
        add   %ah, %al          /* NOTE!  potential overflow! */
        and   $0xFF,%eax
        add   %eax,%esi
        pop   %eax
        shr   $16,%eax
        add   %ah,%al           /* NOTE!  potential overflow! */
        and   $0xFF,%eax
        add   %eax,%esi

        jmp .Lc16_mainloop


        /* this loop looks at one byte at a time to handle the remainder */
.Lc16w1:
        cmpl $0,%edx
        jle .Lc16z
        movzbl (%ecx),%eax
        incl %ecx
        decl %edx
        cmpl $32,%eax
        jne .Lc16w1

        inc  %esi
        jmp .Lc16w1


        /* done, remember to clear fp/mmx tags with emms */
.Lc16z:
        emms
        addl $20,%ebp           /* drop the 20-byte frame; result already in %esi */
        jmp *(%ebp)             /* continue at the code pointer now at (%ebp) */



==============================

_______________________________________________
Haskell-Cafe mailing list
[email protected]
http://www.haskell.org/mailman/listinfo/haskell-cafe

Reply via email to