Module Name:    src
Committed By:   matt
Date:           Mon Dec 17 00:44:04 UTC 2012

Modified Files:
        src/sys/arch/arm/cortex: files.cortex
Added Files:
        src/sys/arch/arm/cortex: cpu_in_cksum_asm_neon.S cpu_in_cksum_neon.c

Log Message:
Add preliminary version of a NEON based in_cksum routine.


To generate a diff of this commit:
cvs rdiff -u -r0 -r1.1 src/sys/arch/arm/cortex/cpu_in_cksum_asm_neon.S \
    src/sys/arch/arm/cortex/cpu_in_cksum_neon.c
cvs rdiff -u -r1.2 -r1.3 src/sys/arch/arm/cortex/files.cortex

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/arch/arm/cortex/files.cortex
diff -u src/sys/arch/arm/cortex/files.cortex:1.2 src/sys/arch/arm/cortex/files.cortex:1.3
--- src/sys/arch/arm/cortex/files.cortex:1.2	Sun Sep  2 16:55:10 2012
+++ src/sys/arch/arm/cortex/files.cortex	Mon Dec 17 00:44:03 2012
@@ -1,4 +1,9 @@
-# $NetBSD: files.cortex,v 1.2 2012/09/02 16:55:10 matt Exp $
+# $NetBSD: files.cortex,v 1.3 2012/12/17 00:44:03 matt Exp $
+
+defflag opt_cpu_in_cksum.h			NEON_IN_CKSUM
+
+file	arch/arm/cortex/cpu_in_cksum_neon.c	(inet | inet6) & neon_in_cksum
+file	arch/arm/cortex/cpu_in_cksum_asm_neon.S	(inet | inet6) & neon_in_cksum
 
 device	armperiph {}
 attach	armperiph at mainbus

Added files:

Index: src/sys/arch/arm/cortex/cpu_in_cksum_asm_neon.S
diff -u /dev/null src/sys/arch/arm/cortex/cpu_in_cksum_asm_neon.S:1.1
--- /dev/null	Mon Dec 17 00:44:04 2012
+++ src/sys/arch/arm/cortex/cpu_in_cksum_asm_neon.S	Mon Dec 17 00:44:03 2012
@@ -0,0 +1,141 @@
+/*-
+ * Copyright (c) 2012 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Matt Thomas of 3am Software Foundry.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+#include "assym.h"
+
+RCSID("$NetBSD: cpu_in_cksum_asm_neon.S,v 1.1 2012/12/17 00:44:03 matt Exp $")
+
+/*
+ * uint32_t cpu_in_cksum_neon(const void *dptr, size_t dlen)
+ *
+ *	r0 = dptr, r1 = dlen; the sum (at most 17 bits) is returned in r0.
+ *	Whole aligned qwords are summed here; partial_qword handles the
+ *	partial first and last ones.  Uses r0-r3, ip, q0, q2 and q3.
+ */
+ENTRY(cpu_in_cksum_neon)
+	str		lr, [sp, #-8]!	/* save lr, keep sp 8-byte aligned */
+	mov		ip, r0		/* leave r0 as temp */
+	add		r3, r1, ip	/* get end pointer */
+	ands		r1, ip, #15	/* get qword offset (leading bytes) */
+	bic		ip, ip, #15	/* start on a qword boundary */
+	veor		q3, q3, q3	/* clear accumulator */
+	beq		.Lpre_main_loop	/* ya, qword boundary start */
+
+	sub		r0, r3, ip	/* get length from qword start to end */
+	cmp		r0, #16		/* do we have at least a qword? */
+	andlo		r2, r3, #15	/* no, factor in trailing bytes */
+	blo		.Ltrailing_bytes /*   and do the last partial qword */
+	mov		r2, #0		/* yes, no trailing bytes */
+	bl		partial_qword	/* do the partial initial qword */
+	mov		r1, #0		/* no more leading bytes */
+
+.Lpre_main_loop:
+	and		r2, r3, #15	/* trailing bytes */
+	bic		r3, r3, #15	/* last partial or empty qword */
+	cmp		ip, r3		/* at or past the end? (unsigned) */
+	bhs		.Ltrailing_bytes /* yes, deal with any trailing bytes */
+
+.Lmain_loop:
+	vld1.64		{d4-d5}, [ip:128]!	/* load a qword, ip += 16 */
+	vmovl.u16	q0, d4		/* 4 U16 -> 4 U32 */
+	vadd.u32	q3, q3, q0	/* add 4 U32 to accumulator */
+	vmovl.u16	q0, d5		/* 4 U16 -> 4 U32 */
+	vadd.u32	q3, q3, q0	/* add 4 U32 to accumulator */
+	cmp		ip, r3		/* addresses compare unsigned */
+	blo		.Lmain_loop
+
+.Ltrailing_bytes:
+	cmp		r2, #0		/* any trailing bytes? */
+	blne		partial_qword	/* yes, do final qword */
+	ldr		lr, [sp], #8	/* fetch LR */
+
+.Lfold_csum:
+	/*
+	 * q3 now holds four 32-bit partial sums.  Add them into a single
+	 * 32-bit sum, then add its two 16-bit halves together.
+	 */
+	vadd.u32	d6, d6, d7	/* 4 I32 -> 2 I32 */
+	vmovl.u32	q3, d6		/* split two I32 into two I64 */
+	vadd.u32	d6, d6, d7	/* 2 I32 -> 1 I32 */
+	vmovl.u16	q3, d6		/* split two I16 into two I32 */
+	vmovl.u32	q3, d6		/* low half -> d6, high half -> d7 */
+	vadd.u32	d6, d6, d7	/* low half + high half */
+	vmov		r0, s12		/* fetch csum from d6/q3 */
+	/*
+	 * r0 = low half + high half, which can be as large as 0x1fffe;
+	 * the caller is expected to fold the remaining carry.
+	 */
+	RET
+END(cpu_in_cksum_neon)
+
+/* Sum the qword at [ip] (ip += 16) into q3.  r1 = leading and r2 = trailing
+ * byte counts used to build masks via __neon_leading_qword_bitmask, which is
+ * presumed to return its mask in q0 and to preserve r1-r3/ip (TODO confirm). */
+	.type		partial_qword, %function
+partial_qword:
+	str		lr, [sp, #-8]!	/* save LR, the calls below overwrite it */
+	vld1.64		{d4-d5}, [ip:128]!	/* fetch data into q2 */
+	veor		q0, q0, q0	/* null mask: nothing to drop by default */
+	movs		r0, r1, lsl #3	/* r0 = leading bits; any leading bytes? */
+	blne		_C_LABEL(__neon_leading_qword_bitmask) /* q0 = mask (presumed) */
+	vmvn.u64	q0, q0		/* invert leading mask to trailing */
+	vand.u32	q2, q2, q0	/* keep only what follows the leading bytes */
+	vmvn.u64	q0, #0		/* all-ones mask: keep everything by default */
+	movs		r0, r2, lsl #3	/* r0 = trailing bits; zero means none */
+	blne		_C_LABEL(__neon_leading_qword_bitmask) /* q0 = mask (presumed) */
+	vand.u32	q2, q2, q0	/* keep only the bytes selected by the mask */
+	ldr		lr, [sp], #8	/* restore LR */
+	vmovl.u16	q0, d4		/* 4 U16 -> 4 U32 */
+	vadd.u32	q3, q3, q0	/* add 4 U32 to accumulator */
+	vmovl.u16	q0, d5		/* 4 U16 -> 4 U32 */
+	vadd.u32	q3, q3, q0	/* add 4 U32 to accumulator */
+	RET
+	.size		partial_qword, . - partial_qword
+
+/*
+ * uint32_t cpu_in_cksum_neon_v4hdr(const void *dptr)
+ *
+ *	r0 = dptr, a 20-byte header aligned to at least 4 bytes; 24 bytes
+ *	are read from the 8-byte aligned address at or below it.  Sum in r0.
+ */
+ENTRY(cpu_in_cksum_neon_v4hdr)
+	bic		ip, r0, #7	/* round down to a 64-bit boundary */
+	vld1.32		{d0-d2},[ip]	/* it must be in 24 bytes */
+	tst		r0, #4		/* depending on 64-bit alignment */
+	beq		1f		/* aligned: header is s0-s4 */
+	vmov		s0, s5		/* move last U32 to first U32 */
+1:	vmovl.u32	q1, d2		/* move s5 to d3 and clear s5 */
+	vmovl.u16	q3, d0		/* 4 U16 -> 4 U32, starts accumulator */
+	vmovl.u16	q2, d1		/* 4 U16 -> 4 U32 */
+	vadd.u32	q3, q3, q2	/* add 4 U32 to accumulator */
+	vmovl.u16	q2, d2		/* 4 U16 -> 4 U32 (upper two are 0) */
+	vadd.u32	q3, q3, q2	/* add 4 U32 to accumulator */
+	b		.Lfold_csum	/* shared fold, returns to our caller */
+END(cpu_in_cksum_neon_v4hdr)
Index: src/sys/arch/arm/cortex/cpu_in_cksum_neon.c
diff -u /dev/null src/sys/arch/arm/cortex/cpu_in_cksum_neon.c:1.1
--- /dev/null	Mon Dec 17 00:44:04 2012
+++ src/sys/arch/arm/cortex/cpu_in_cksum_neon.c	Mon Dec 17 00:44:03 2012
@@ -0,0 +1,124 @@
+/*-
+ * Copyright (c) 2012 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Matt Thomas of 3am Software Foundry.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+
+__KERNEL_RCSID(0, "$NetBSD: cpu_in_cksum_neon.c,v 1.1 2012/12/17 00:44:03 matt Exp $");
+
+#include <sys/param.h>
+#include <sys/cpu.h>
+#include <sys/mbuf.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+
+uint32_t cpu_in_cksum_neon(const void *, size_t);
+uint32_t cpu_in_cksum_neon_v4hdr(const void *);
+
+/*
+ * Compute the Internet checksum sum over len bytes of the mbuf chain m,
+ * starting off bytes into the chain and seeded with initial_sum, using
+ * the NEON helpers cpu_in_cksum_neon() and cpu_in_cksum_neon_v4hdr().
+ *
+ * Returns the sum folded to 16 bits, with 0 mapped to 0xffff.
+ * NOTE(review): the sum is returned uncomplemented; confirm that this
+ * is what the callers of cpu_in_cksum() expect.
+ */
+int
+cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
+{
+	uint32_t csum = initial_sum;
+	int odd = 0;
+
+	/* Take control of the NEON unit. */
+	vfp_hijack();
+
+	/*
+	 * Fast path for the normal ip_header
+	 */
+	if (off == 0
+	    && csum == 0
+	    && len == sizeof(struct ip)
+	    && ((uintptr_t)m->m_data & 3) == 0
+	    && m->m_len >= len) {
+		csum = cpu_in_cksum_neon_v4hdr(m->m_data);
+
+		/* We are now done with NEON. */
+		vfp_surrender();
+
+		/* The assembly sum can be 17 bits wide; fold the carry in. */
+		csum = (csum >> 16) + (csum & 0xffff);
+		return csum == 0 ? 0xffff : csum;	/* never return 0. */
+	}
+
+	/*
+	 * Skip the mbufs that lie entirely before the starting offset,
+	 * consuming each one's length before moving to the next.
+	 */
+	while (len > 0 && off >= m->m_len) {
+		off -= m->m_len;
+		m = m->m_next;
+		KASSERT(m != NULL);
+	}
+
+	for (; len > 0; m = m->m_next, off = 0) {
+		KASSERT(m != NULL);
+		int dlen = MIN(m->m_len - off, len);
+		const void *dptr = m->m_data + off;
+		/*
+		 * This routine will add based on the memory layout so
+		 * if the previous len was odd or this buffer starts
+		 * on an odd address, shift the csum by 8 so it's properly
+		 * aligned.  It will be taken care of when we do the final
+		 * checksum fold.
+		 */
+		uint32_t tmpsum = cpu_in_cksum_neon(dptr, dlen);
+		if (odd ^ ((uintptr_t)dptr & 1))
+			tmpsum <<= 8;
+		/*
+		 * Accumulate checksum, folding will be done later
+		 */
+		csum += tmpsum;
+		odd ^= dlen & 1;
+		len -= dlen;
+	}
+
+	/* We are now done with NEON. */
+	vfp_surrender();
+
+	/*
+	 * Fold the 32-bit sum twice: the first fold can leave a carry in
+	 * bit 16, the second absorbs it, so the result fits in 16 bits.
+	 */
+	csum = (csum >> 16) + (csum & 0xffff);
+	csum = (csum >> 16) + (csum & 0xffff);
+
+	KASSERT(csum <= 0xffff);
+	return csum == 0 ? 0xffff : csum;	/* never return 0. */
+}

Reply via email to