Module Name:	src
Committed By:	matt
Date:		Sat Dec 22 18:58:29 UTC 2012
Modified Files:
	src/sys/arch/arm/cortex: cpu_in_cksum_asm_neon.S

Log Message:
Rework considerably.  Use alternating sets of registers.
(Still not faster than normal ARM code).


To generate a diff of this commit:
cvs rdiff -u -r1.2 -r1.3 src/sys/arch/arm/cortex/cpu_in_cksum_asm_neon.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
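For readers of the diff below: cpu_in_cksum_neon accumulates an Internet
(ones-complement) partial checksum over dlen bytes starting at dptr, widening
16-bit words into a vector accumulator and folding the carries at the end.
A minimal C sketch of that arithmetic, with illustrative names only (this is
not the kernel's own helper, and byte-order handling is simplified):

	#include <stddef.h>
	#include <stdint.h>

	/*
	 * Illustrative only: sum the data as 16-bit words into a wide
	 * accumulator, then fold the carries back into 16 bits, which is
	 * what the NEON code does with its vector accumulator and the
	 * final fold.  The real routine also handles unaligned leading
	 * and trailing bytes and big-endian byte order.
	 */
	static uint32_t
	cksum_partial_sketch(const uint8_t *dptr, size_t dlen)
	{
		uint64_t sum = 0;

		while (dlen >= 2) {
			sum += dptr[0] | (dptr[1] << 8);	/* LE word */
			dptr += 2;
			dlen -= 2;
		}
		if (dlen != 0)
			sum += dptr[0];			/* trailing odd byte */

		while (sum >> 16)			/* fold carries */
			sum = (sum & 0xffff) + (sum >> 16);
		return (uint32_t)sum;
	}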
Modified files:

Index: src/sys/arch/arm/cortex/cpu_in_cksum_asm_neon.S
diff -u src/sys/arch/arm/cortex/cpu_in_cksum_asm_neon.S:1.2 src/sys/arch/arm/cortex/cpu_in_cksum_asm_neon.S:1.3
--- src/sys/arch/arm/cortex/cpu_in_cksum_asm_neon.S:1.2	Tue Dec 18 06:05:56 2012
+++ src/sys/arch/arm/cortex/cpu_in_cksum_asm_neon.S	Sat Dec 22 18:58:29 2012
@@ -29,7 +29,7 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: cpu_in_cksum_asm_neon.S,v 1.2 2012/12/18 06:05:56 matt Exp $")
+RCSID("$NetBSD: cpu_in_cksum_asm_neon.S,v 1.3 2012/12/22 18:58:29 matt Exp $")
 
 /*
  * uint32_t
@@ -39,102 +39,144 @@ RCSID("$NetBSD: cpu_in_cksum_asm_neon.S,
  * r1 = dlen
  */
 ENTRY(cpu_in_cksum_neon)
-	str	lr, [sp, #-8]!		/* save lr */
 	mov	ip, r0			/* leave r0 as temp */
 	add	r3, r1, ip		/* get end pointer */
-	ands	r1, ip, #15		/* get qword offset */
-	bic	ip, ip, #15		/* start on a qword boundary */
-	veor	q3, q3, q3		/* clear accumulator */
-	beq	.Lpre_main_loop		/* ya, qword boundary start */
-
-	sub	r0, r3, ip		/* get length to qword start */
-	cmp	r0, #16			/* do we have at least a qword? */
-	andlt	r2, r3, #15		/* no, factor in trailing bytes */
-	blt	.Ltrailing_bytes	/* and do the last partial qword */
-	mov	r2, #0			/* yes, no trailing bytes */
-	bl	partial_qword		/* do the partial initial qword */
-	mov	r1, #0			/* no more leading bytes */
+	and	r1, ip, #7		/* get start offset (leading btyes) */
+	and	r2, r3, #7		/* get end offset (trailing bytes) */
+	bic	ip, ip, #7		/* start on a dword boundary */
+	add	r3, r3, #7		/* round up to a dword boundary */
+	bic	r3, r3, #7		/* end on a dword boundary */
+	veor	q2, q2, q2		/* clear accumulator */
+	vmvn.u64 q1, q2			/* create leading/trailing masks */
+	/*
+	 * Normally the lower addressed is in d6 but in this case we want to
+	 * reverse it since we might only have a single dword and the final
+	 * fold will want the dword to trim in d7 so put the first dword in
+	 * d7 until we know we are going to read more than one.
+	 */
+	veor	d6, d6, d6		/* clear second dword */
+	vld1.64	{d7}, [ip:64]!		/* load first dword */
+	orrs	r0, r1, r2		/* do we have any offsets */
+	beq	.Lpre_main_loop		/* no, proceed to main loop. */
+	mov	r1, r1, lsl #3		/* leading bytes -> bits */
+	movs	r2, r2, lsl #3		/* trailing bytes -> bits */
+#ifdef __ARMEL__
+	subne	r2, r2, #64		/* trim trailing MSBs */
+#else
+	rsb	r1, r1, #0		/* trim leading MSBs */
+	rsbne	r2, r2, #64		/* trim trailing LSBs */
+#endif
+	vmov	d0, r1, r2		/* move shifts */
+	vmovl.u32 q0, d0		/* 2 U32 -> 2 U64 */
+	vshl.u64 q1, q1, q0		/* apply shifts to masks */
+	vand.u32 d7, d7, d2		/* apply leading mask to 1st dword */
+	tst	r1, #8			/* was the starting address odd? */
+	beq	.Lpre_main_loop		/* no, go to pre_main_loop */
+	veor	d2, d2, d2		/* clear d2 (indicate odd addr) */
 
 .Lpre_main_loop:
-	and	r2, r3, #15		/* trailing bytes */
-	bic	r3, r3, #15		/* last partial or empty qword */
-	cmp	ip, r3			/* at or past the end? */
-	bge	.Ltrailing_bytes	/* yes, deal with any trailing bytes */
+	cmp	ip, r3			/* do we just have a single dword? */
+	beq	.Lfinish_up		/* yes, let finish up! */
+	vmov	d6, d7			/* move 1st dword to loaddr reg */
+	vld1.64	{d7}, [ip:64]!		/* read rest of initial qword */
 
 .Lmain_loop:
-	vld1.64 {d4-d5}, [ip:128]!
-	vmovl.u16 q0, d4		/* 4 U16 -> 4 U32 */
-	vadd.u32 q3, q3, q0		/* add 4 U32 to accumulator */
-	vmovl.u16 q0, d5		/* 4 U16 -> 4 U32 */
-	vadd.u32 q3, q3, q0		/* add 4 U32 to accumulator */
-	cmp	ip, r3
-	blt	.Lmain_loop
-
-.Ltrailing_bytes:
-	cmp	r2, #0			/* any trailing bytes? */
-	blne	partial_qword		/* yes, do final qword */
-	ldr	lr, [sp], #8		/* fetch LR */
+	subs	r1, r3, ip		/* how much left to do? */
+	beq	.Lfinish_up		/* = 0? we are done. */
+
+	bics	r0, r1, #31		/* we deal with octawords only */
+	beq	.Lloop_end		/* no octawords? exit loop */
+	rsbs	r0, r0, #128		/* subtract from 128 */
+	ble	.Lloop128		/* <= 0?, do 128 at a time. */
+	add	r0, r0, r0, lsr #2	/* multiple by 1.25 */
+	add	pc, pc, r0		/* and jump! */
+	nop
+
+.Lloop128:
+	vld1.64 {d8-d9}, [ip:64]!	/* 128 left */
+	vmovl.u16 q0, d6		/* 4 U16 -> 4 U32 */
+	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
+	vmovl.u16 q0, d7		/* 4 U16 -> 4 U32 */
+	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
+	vld1.64 {d6-d7}, [ip:64]!
+	vmovl.u16 q0, d8		/* 4 U16 -> 4 U32 */
+	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
+	vmovl.u16 q0, d9		/* 4 U16 -> 4 U32 */
+	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
+
+	vld1.64 {d8-d9}, [ip:64]!	/* 96 left */
+	vmovl.u16 q0, d6		/* 4 U16 -> 4 U32 */
+	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
+	vmovl.u16 q0, d7		/* 4 U16 -> 4 U32 */
+	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
+	vld1.64 {d6-d7}, [ip:64]!
+	vmovl.u16 q0, d8		/* 4 U16 -> 4 U32 */
+	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
+	vmovl.u16 q0, d9		/* 4 U16 -> 4 U32 */
+	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
+
+	vld1.64 {d8-d9}, [ip:64]!	/* 64 left */
+	vmovl.u16 q0, d6		/* 4 U16 -> 4 U32 */
+	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
+	vmovl.u16 q0, d7		/* 4 U16 -> 4 U32 */
+	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
+	vld1.64 {d6-d7}, [ip:64]!
+	vmovl.u16 q0, d8		/* 4 U16 -> 4 U32 */
+	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
+	vmovl.u16 q0, d9		/* 4 U16 -> 4 U32 */
+	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
+
+	vld1.64 {d8-d9}, [ip:64]!	/* 32 left */
+	vmovl.u16 q0, d6		/* 4 U16 -> 4 U32 */
+	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
+	vmovl.u16 q0, d7		/* 4 U16 -> 4 U32 */
+	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
+	vld1.64 {d6-d7}, [ip:64]!
+	vmovl.u16 q0, d8		/* 4 U16 -> 4 U32 */
+	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
+	vmovl.u16 q0, d9		/* 4 U16 -> 4 U32 */
+	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
+
+	b	.Lmain_loop
 
-.Lfold_csum:
+.Lloop_end:
 	/*
-	 * We now have 4 32-bit sums in q3 (each is 20-bits or less).
-	 * Now to get to 1 I32 bit sum.
+	 * We have one to 3 more dwords to process
 	 */
-	vadd.u32 d6, d6, d7		/* 4 I32 -> 2 I32 */
-	vmovl.u32 q3, d6		/* split two I32 into two I64 */
-	vadd.u32 d6, d6, d7		/* 2 I32 -> 1 I32 */
-	vmovl.u16 q3, d6		/* split two I16 into two I32 */
-	vmovl.u32 q3, d6		/* split two I32 into two I64 */
-	vadd.u32 d6, d6, d7		/* 2 I16 -> 1 I32 */
-	vmov	r0, s12			/* fetch csum from d6/q3 */
+	rsb	r0, r1, #24
+	add	r0, r0, r0, lsr #1
+	add	pc, pc, r0
+	nop
+	vmovl.u16 q0, d6		/* 4 U16 -> 4 U32 */
+	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
+	vld1.64 {d6}, [ip:64]!
+	vmovl.u16 q0, d6		/* 4 U16 -> 4 U32 */
+	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
+	vld1.64 {d6}, [ip:64]!
+	vmovl.u16 q0, d7		/* 4 U16 -> 4 U32 */
+	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
+	vld1.64 {d7}, [ip:64]!
+
+.Lfinish_up:
 	/*
-	 * The result could be 0x10000 but we expect the caller to deal
-	 * with it
+	 * Apply remaining data in d6 and d7
 	 */
-	RET
-END(cpu_in_cksum_neon)
-
-/*
- * Handling partial qwords is tricky.
- */
-	.type partial_qword, %function
-partial_qword:
-	str	lr, [sp, #-8]!		/* save LR */
-	vld1.64 {d4-d5}, [ip:128]!	/* fetch data */
-#ifdef __ARMEB__
-	vswp	d5, d4			/* on BE, MSW should be in d5 */
-#endif
-	veor	q0, q0, q0		/* create a null mask */
-	movs	r0, r1, lsl #3		/* any leading bytes? */
-	blne	_C_LABEL(__neon_leading_qword_bitmask)
-	vmvn.u64 q0, q0			/* invert leading mask to trailing */
-	vand.u32 q2, q2, q0		/* preserve them */
-	vmvn.u64 q0, #0			/* create mask */
-	movs	r0, r2, lsl #3		/* if equal, no trailing bytes */
-	blne	_C_LABEL(__neon_leading_qword_bitmask)
-	vand.u32 q2, q2, q0		/* preserve them */
-	ldr	lr, [sp], #8		/* Fetch LR */
-	vmovl.u16 q0, d4		/* 4 U16 -> 4 U32 */
-	vadd.u32 q3, q3, q0		/* add 4 U32 to accumulator */
-	vmovl.u16 q0, d5		/* 4 U16 -> 4 U32 */
-	vadd.u32 q3, q3, q0		/* add 4 U32 to accumulator */
-	RET
-	.size partial_qword, . - partial_qword
+	vmovl.u16 q0, d6		/* 4 U16 -> 4 U32 */
+	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
+	vand	d7, d7, d3		/* apply trailing mask */
+	vmovl.u16 q0, d7		/* 4 U16 -> 4 U32 */
+	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
 
-/*
- * uint32_t cpu_in_cksum_neon_v4hdr(void *dptr)
- */
-ENTRY(cpu_in_cksum_neon_v4hdr)
-	bic	ip, r0, #7
-	vld1.32	{d0-d2},[ip]		/* it must be in 24 bytes */
-	tst	r0, #4			/* depending on 64-bit alignment */
-	beq	1f
-	vmov	s0, s5			/* move last U32 to first U32 */
-1:	vmovl.u32 q1, d2		/* move s5 to d3 and clear s5 */
-	vmovl.u16 q3, d0		/* 4 U16 -> 4 U32 */
-	vmovl.u16 q2, d1		/* 4 U16 -> 4 U32 */
-	vadd.u32 q3, q3, q2		/* add 4 U32 to accumulator */
-	vmovl.u16 q2, d2		/* 4 U16 -> 4 U32 */
-	vadd.u32 q3, q3, q2		/* add 4 U32 to accumulator */
-	b	.Lfold_csum
-END(cpu_in_cksum_neon_v4hdr)
+	/*
+	 * We now have 4 32-bit sums in q2 (each is 20-bits or less).
+	 * Now to get to 1 I32 bit sum.
	 */
+	vadd.u32 d4, d4, d5		/* 4 I32 -> 2 I32 */
+	vmov	r2, s4			/* get flag for odd start */
+	teq	r2, #0			/* was start addr even? */
+	vmov	r0, r1, d4		/* extract two I32 */
+	rev16eq	r0, r0			/* byte swap if start was odd */
+	rev16eq	r1, r1			/* byte swap if start was odd */
+	adds	ip, r0, r1		/* add them producing carry */
+#include "arm/arm/cpu_in_cksum_fold.S"
 END(cpu_in_cksum_neon)
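A note on the structure of the new code above: the "add pc, pc, r0" sequences
compute a branch part-way into an unrolled run, so a residue shorter than a
full 128-byte unrolling still executes straight-line code, while d6/d7 and
d8/d9 alternate as load targets so one pair can be widened and accumulated
while the other is being loaded.  A rough C analogue of the jump-into-the-
middle idea, with hypothetical names and no claim to match the kernel code,
is the classic switch entry into an unrolled loop:

	#include <stddef.h>
	#include <stdint.h>

	/*
	 * Illustrative only: handle the leftover words by entering the
	 * unrolled body part-way through, then run the 4-way unrolled
	 * main loop on the remaining multiple of four.
	 */
	static uint64_t
	sum_words_unrolled(const uint16_t *p, size_t n)
	{
		uint64_t sum = 0;
		size_t rem = n & 3;

		switch (rem) {		/* computed entry for the residue */
		case 3: sum += *p++;	/* FALLTHROUGH */
		case 2: sum += *p++;	/* FALLTHROUGH */
		case 1: sum += *p++;	/* FALLTHROUGH */
		case 0: break;
		}
		for (n -= rem; n != 0; n -= 4) {	/* 4-way unrolled */
			sum += (uint64_t)p[0] + p[1] + p[2] + p[3];
			p += 4;
		}
		return sum;
	}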