Hi,

I've been looking at IPsec performance on UltraSPARC T2 processors. These processors provide HW crypto accelerators, so the normally compute intensive crypto operations become fairly light-weight. After the crypto ops are offloaded, the IP checksum operation becomes pretty hot.

Looking over the code in Solaris (ip_ocsum.s), it looks like there might be some optimizations that would improve IP checksum performance on single-issue processors such as T2. Currently, for each 8-bytes processed, the following is performed in the main loop:

    srlx    %l0, 32, %o0            ! hi32 to %o0
    and     %l0, %g1, %l0           ! lo32 to %l0
    ldx     [%i0+0], %l0
    add     %l1, %o1, %o1           ! adds and loads
    add     %l2, %o2, %o2

I think it is possible to eliminate the mask operation (the and), and instead use add-with-carry operations:

addcc   %l0, %g0, %o0
srlx    %l0, 32, %o1
addc    %o1, %i1, %i1
ldx     [%i0], %l0

The reason this works is that once we are outside the main loop the upper and lower 32-bits of the result are folded together, so a 32-bit carryout ends up having the same effect on the final result as a carryin.

This seems to provide a ~20% reduction in instruction count in the hot loop. I undertook some initial tests, and the results from the main loop appear to be identical to those of the current masking approach. I have attached example code for interest. Does this seem like a reasonable approach?

Comments welcomed.

Regards,

lawrence
--

Lawrence Spracklen
Architecture Technology Group
http://blogs.sun.com/sprack


        
                ! Simple test for main loop of ip_ocsum_long
                ! test assumes that i) data 8-byte aligned
                ! ii) buf is a multiple of 64-bytes
                !
                ! Register roles (as actually used by the code below):
                ! %i0 contains buffer address
                ! %i1 contains sum (checksum accumulator; result ends here)
                ! %i2 contains buf length in bytes (decremented 64/iteration)
                !
                ! NOTE(review): the loop preloads the NEXT 64-byte chunk
                ! before testing the exit condition, so the final iteration
                ! loads 64 bytes past the end of the buffer -- confirm the
                ! caller guarantees that read-ahead slack is mapped.

                .section        ".text",#alloc,#execinstr
                .global test
test:

        save    %sp, -408, %sp          ! new register window + stack frame

        ! Prime the software pipeline: preload the first 64-byte chunk
        ! into %l0-%l7 so the loop can overlap loads with the adds.
        ldx     [%i0], %l0
        ldx     [%i0 + 0x08], %l1
        ldx     [%i0 + 0x10], %l2
        ldx     [%i0 + 0x18], %l3
        ldx     [%i0 + 0x20], %l4
        ldx     [%i0 + 0x28], %l5
        ldx     [%i0 + 0x30], %l6
        ldx     [%i0 + 0x38], %l7

        ! Build %o3 = 0x00000000ffffffff, the lo-32 mask used at the end
        ! of each chunk and in the final fold.
        mov     1, %o3
        sllx    %o3, 32, %o3
        sub     %o3, 1, %o3
inner_loop:

        add     %i0, 64, %i0            ! advance to next 64-byte chunk

        ! Per 64-bit word: accumulate the word into %o0 (addcc sets
        ! icc.c = carry out of bit 31 of the low-half addition), then
        ! addc folds that 32-bit carry-out PLUS the word's high half
        ! into %i1, and ldx preloads the next chunk's word.  A 32-bit
        ! carry-out has the same effect on the folded result as a
        ! carry-in, so no per-word mask is needed.
        addcc   %l0, %g0, %o0           ! %o0 = word0; icc.c = 0
        srlx    %l0, 32, %o1            ! %o1 = hi32 of word0
        addc    %o1, %i1, %i1           ! %i1 += hi32 + carry
        ldx     [%i0], %l0              ! preload next word0

        addcc   %l1, %o0, %o0           ! word 1
        srlx    %l1, 32, %o1
        addc    %o1, %i1, %i1
        ldx     [%i0 + 0x08], %l1

        addcc   %l2, %o0, %o0           ! word 2
        srlx    %l2, 32, %o1
        addc    %o1, %i1, %i1
        ldx     [%i0 + 0x10], %l2

        addcc   %l3, %o0, %o0           ! word 3
        srlx    %l3, 32, %o1
        addc    %o1, %i1, %i1
        ldx     [%i0 + 0x18], %l3

        addcc   %l4, %o0, %o0           ! word 4
        srlx    %l4, 32, %o1
        addc    %o1, %i1, %i1
        ldx     [%i0 + 0x20], %l4

        addcc   %l5, %o0, %o0           ! word 5
        srlx    %l5, 32, %o1
        addc    %o1, %i1, %i1
        ldx     [%i0 + 0x28], %l5

        addcc   %l6, %o0, %o0           ! word 6
        srlx    %l6, 32, %o1
        addc    %o1, %i1, %i1
        ldx     [%i0 + 0x30], %l6

        addcc   %l7, %o0, %o0           ! word 7
        srlx    %l7, 32, %o1
        addc    %o1, %i1, %i1
        ldx     [%i0 + 0x38], %l7

        ! End of chunk: the upper half of %o0 holds high-halves already
        ! counted via the srlx/addc pairs above, so keep only the low
        ! 32 bits and add them into the accumulator.
        and     %o0, %o3, %o0
        add     %o0, %i1, %i1

        subcc           %i2, 64, %i2    ! 64 bytes consumed
        bnz             inner_loop      ! loop while bytes remain
        nop                             ! branch delay slot

        ! fold results

        ! Fold the 64-bit accumulator: result = lo32(%i1) + hi32(%i1).
        srlx    %i1, 32, %l0
        and     %i1, %o3, %i1

        add     %i1, %l0, %i1           ! NOTE(review): this add can itself
                                        ! carry into bit 32; presumably the
                                        ! caller performs the final 16-bit
                                        ! folds -- confirm.

        mov     %i1, %i0                ! return value (caller's %o0)

        ret
        restore         %g0, 0, %g0     ! restore window in delay slot
        .size   test,(.-test)
_______________________________________________
networking-discuss mailing list
[email protected]

Reply via email to