> Your code still can be simplified a lot. I'm just not quite sure whether it > would > be more practical to commit something first and then refactor it with the > follow > up commits. Or attempt to make a "perfect" patch before committing. [Ma Ling] Yes, I agree with you, let us commit it first, then strengthen it, such as appending non-temporal instructions for large data copies that exceed the L1 cache size.
Best Regards Ling > > > pixman/pixman-access-ssse3_x86-64.S | 96 ++++------------------------------ > 1 files changed, 12 insertions(+), 84 deletions(-) > > diff --git a/pixman/pixman-access-ssse3_x86-64.S b/pixman/pixman-access- > ssse3_x86-64.S index e7cf21f..0946d20 100755 > --- a/pixman/pixman-access-ssse3_x86-64.S > +++ b/pixman/pixman-access-ssse3_x86-64.S > @@ -248,116 +248,44 @@ L(shl_0_cache_less_16bytes): > add %rdx, %rdi > BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %rdx, 4) > > -L(shl_4): > +.irp shift, 4, 8, 12 > +L(shl_\shift): > lea -32(%rdx), %rdx > ALIGN (4) > -L(shl_4_loop): > +L(shl_\shift\()_loop): > movaps 16(%rsi), %xmm2 > sub $32, %rdx > movaps 32(%rsi), %xmm3 > lea 32(%rsi), %rsi > movdqa %xmm3, %xmm4 > - palignr $4, %xmm2, %xmm3 > + palignr $\shift, %xmm2, %xmm3 > lea 32(%rdi), %rdi > - palignr $4, %xmm1, %xmm2 > + palignr $\shift, %xmm1, %xmm2 > por %xmm6, %xmm2 > movaps %xmm2, -32(%rdi) > por %xmm6, %xmm3 > movaps %xmm3, -16(%rdi) > - jb L(shl_4_end) > + jb L(shl_\shift\()_end) > > movaps 16(%rsi), %xmm2 > sub $32, %rdx > movaps 32(%rsi), %xmm3 > lea 32(%rsi), %rsi > movdqa %xmm3, %xmm1 > - palignr $4, %xmm2, %xmm3 > + palignr $\shift, %xmm2, %xmm3 > lea 32(%rdi), %rdi > - palignr $4, %xmm4, %xmm2 > + palignr $\shift, %xmm4, %xmm2 > por %xmm6, %xmm2 > movaps %xmm2, -32(%rdi) > por %xmm6, %xmm3 > movaps %xmm3, -16(%rdi) > - jae L(shl_4_loop) > -L(shl_4_end): > + jae L(shl_\shift\()_loop) > +L(shl_\shift\()_end): > lea 32(%rdx), %rdx > - lea 4(%rsi, %rdx), %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %rdx, 4) > - > -L(shl_8): > - lea -32(%rdx), %rdx > - ALIGN (4) > -L(shl_8_loop): > - movaps 16(%rsi), %xmm2 > - sub $32, %rdx > - movaps 32(%rsi), %xmm3 > - lea 32(%rsi), %rsi > - movdqa %xmm3, %xmm4 > - palignr $8, %xmm2, %xmm3 > - lea 32(%rdi), %rdi > - palignr $8, %xmm1, %xmm2 > - por %xmm6, %xmm2 > - movaps %xmm2, -32(%rdi) > - por %xmm6, %xmm3 > - movaps %xmm3, -16(%rdi) > - jb L(shl_8_end) > - > - movaps 
16(%rsi), %xmm2 > - sub $32, %rdx > - movaps 32(%rsi), %xmm3 > - lea 32(%rsi), %rsi > - movdqa %xmm3, %xmm1 > - palignr $8, %xmm2, %xmm3 > - lea 32(%rdi), %rdi > - palignr $8, %xmm4, %xmm2 > - por %xmm6, %xmm2 > - movaps %xmm2, -32(%rdi) > - por %xmm6, %xmm3 > - movaps %xmm3, -16(%rdi) > - jae L(shl_8_loop) > -L(shl_8_end): > - lea 32(%rdx), %rdx > - lea 8(%rsi, %rdx), %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %rdx, 4) > - > -L(shl_12): > - lea -32(%rdx), %rdx > - ALIGN (4) > -L(shl_12_loop): > - movaps 16(%rsi), %xmm2 > - sub $32, %rdx > - movaps 32(%rsi), %xmm3 > - lea 32(%rsi), %rsi > - movdqa %xmm3, %xmm4 > - palignr $12, %xmm2, %xmm3 > - lea 32(%rdi), %rdi > - palignr $12, %xmm1, %xmm2 > - por %xmm6, %xmm2 > - movaps %xmm2, -32(%rdi) > - por %xmm6, %xmm3 > - movaps %xmm3, -16(%rdi) > - jb L(shl_12_end) > - > - movaps 16(%rsi), %xmm2 > - sub $32, %rdx > - movaps 32(%rsi), %xmm3 > - lea 32(%rsi), %rsi > - movdqa %xmm3, %xmm1 > - palignr $12, %xmm2, %xmm3 > - lea 32(%rdi), %rdi > - palignr $12, %xmm4, %xmm2 > - por %xmm6, %xmm2 > - movaps %xmm2, -32(%rdi) > - por %xmm6, %xmm3 > - movaps %xmm3, -16(%rdi) > - jae L(shl_12_loop) > -L(shl_12_end): > - lea 32(%rdx), %rdx > - lea 12(%rsi, %rdx), %rsi > + lea \shift\()(%rsi, %rdx), %rsi > add %rdx, %rdi > BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %rdx, 4) > +.endr > > ALIGN (4) > L(fwd_write_44bytes): _______________________________________________ Pixman mailing list Pixman@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/pixman