Module Name:    src
Committed By:   christos
Date:           Sun Mar 17 00:42:32 UTC 2013
Added Files:
        src/common/lib/libc/arch/sparc64/string: memcpy.S memset.S strmacros.h

Log Message:
Use a single copy of the source.


To generate a diff of this commit:
cvs rdiff -u -r0 -r1.1 src/common/lib/libc/arch/sparc64/string/memcpy.S \
    src/common/lib/libc/arch/sparc64/string/memset.S \
    src/common/lib/libc/arch/sparc64/string/strmacros.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Added files: Index: src/common/lib/libc/arch/sparc64/string/memcpy.S diff -u /dev/null src/common/lib/libc/arch/sparc64/string/memcpy.S:1.1 --- /dev/null Sat Mar 16 20:42:32 2013 +++ src/common/lib/libc/arch/sparc64/string/memcpy.S Sat Mar 16 20:42:31 2013 @@ -0,0 +1,1624 @@ +/* $NetBSD: memcpy.S,v 1.1 2013/03/17 00:42:31 christos Exp $ */ + +/* + * Copyright (c) 1996-2002 Eduardo Horvath + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +#include "strmacros.h" + +/* + * kernel memcpy + * Assumes regions do not overlap; has no useful return value. + * + * Must not use %g7 (see copyin/copyout above). + */ +ENTRY(memcpy) /* dest, src, size */ + /* + * Swap args for bcopy. Gcc generates calls to memcpy for + * structure assignments. + */ + mov %o0, %o3 + mov %o1, %o0 + mov %o3, %o1 +#if !defined(_KERNEL) || defined(_RUMPKERNEL) +ENTRY(bcopy) /* src, dest, size */ +#endif +#ifdef DEBUG +#if defined(_KERNEL) && !defined(_RUMPKERNEL) + set pmapdebug, %o4 + ld [%o4], %o4 + btst 0x80, %o4 ! PDB_COPY + bz,pt %icc, 3f + nop +#endif + save %sp, -CC64FSZ, %sp + mov %i0, %o1 + set 2f, %o0 + mov %i1, %o2 + call printf + mov %i2, %o3 +! ta 1; nop + restore + .data +2: .asciz "memcpy(%p<-%p,%x)\n" + _ALIGN + .text +3: +#endif + + cmp %o2, BCOPY_SMALL + +Lmemcpy_start: + bge,pt CCCR, 2f ! if >= this many, go be fancy. + cmp %o2, 256 + + mov %o1, %o5 ! Save memcpy return value + /* + * Not much to copy, just do it a byte at a time. + */ + deccc %o2 ! while (--len >= 0) + bl 1f + .empty +0: + inc %o0 + ldsb [%o0 - 1], %o4 ! (++dst)[-1] = *src++; + stb %o4, [%o1] + deccc %o2 + bge 0b + inc %o1 +1: + retl + mov %o5, %o0 + NOTREACHED + + /* + * Plenty of data to copy, so try to do it optimally. + */ +2: +#ifdef USE_BLOCK_STORE_LOAD + ! If it is big enough, use VIS instructions + bge Lmemcpy_block + nop +#endif /* USE_BLOCK_STORE_LOAD */ +Lmemcpy_fancy: + + !! + !! First align the output to a 8-byte entity + !! + + save %sp, -CC64FSZ, %sp + + mov %i0, %l0 + mov %i1, %l1 + + mov %i2, %l2 + btst 1, %l1 + + bz,pt %icc, 4f + btst 2, %l1 + ldub [%l0], %l4 ! Load 1st byte + + deccc 1, %l2 + ble,pn CCCR, Lmemcpy_finish ! XXXX + inc 1, %l0 + + stb %l4, [%l1] ! Store 1st byte + inc 1, %l1 ! Update address + btst 2, %l1 +4: + bz,pt %icc, 4f + + btst 1, %l0 + bz,a 1f + lduh [%l0], %l4 ! Load short + + ldub [%l0], %l4 ! Load bytes + + ldub [%l0+1], %l3 + sllx %l4, 8, %l4 + or %l3, %l4, %l4 + +1: + deccc 2, %l2 + ble,pn CCCR, Lmemcpy_finish ! XXXX + inc 2, %l0 + sth %l4, [%l1] ! 
Store 1st short + + inc 2, %l1 +4: + btst 4, %l1 + bz,pt CCCR, 4f + + btst 3, %l0 + bz,a,pt CCCR, 1f + lduw [%l0], %l4 ! Load word -1 + + btst 1, %l0 + bz,a,pt %icc, 2f + lduh [%l0], %l4 + + ldub [%l0], %l4 + + lduh [%l0+1], %l3 + sllx %l4, 16, %l4 + or %l4, %l3, %l4 + + ldub [%l0+3], %l3 + sllx %l4, 8, %l4 + ba,pt %icc, 1f + or %l4, %l3, %l4 + +2: + lduh [%l0+2], %l3 + sllx %l4, 16, %l4 + or %l4, %l3, %l4 + +1: + deccc 4, %l2 + ble,pn CCCR, Lmemcpy_finish ! XXXX + inc 4, %l0 + + st %l4, [%l1] ! Store word + inc 4, %l1 +4: + !! + !! We are now 32-bit aligned in the dest. + !! +Lmemcpy_common: + + and %l0, 7, %l4 ! Shift amount + andn %l0, 7, %l0 ! Source addr + + brz,pt %l4, Lmemcpy_noshift8 ! No shift version... + + sllx %l4, 3, %l4 ! In bits + mov 8<<3, %l3 + + ldx [%l0], %o0 ! Load word -1 + sub %l3, %l4, %l3 ! Reverse shift + deccc 12*8, %l2 ! Have enough room? + + sllx %o0, %l4, %o0 + bl,pn CCCR, 2f + and %l3, 0x38, %l3 +Lmemcpy_unrolled8: + + /* + * This is about as close to optimal as you can get, since + * the shifts require EU0 and cannot be paired, and you have + * 3 dependent operations on the data. + */ + +! ldx [%l0+0*8], %o0 ! Already done +! sllx %o0, %l4, %o0 ! Already done + ldx [%l0+1*8], %o1 + ldx [%l0+2*8], %o2 + ldx [%l0+3*8], %o3 + ldx [%l0+4*8], %o4 + ba,pt %icc, 1f + ldx [%l0+5*8], %o5 + .align 8 +1: + srlx %o1, %l3, %g1 + inc 6*8, %l0 + + sllx %o1, %l4, %o1 + or %g1, %o0, %g6 + ldx [%l0+0*8], %o0 + + stx %g6, [%l1+0*8] + srlx %o2, %l3, %g1 + + sllx %o2, %l4, %o2 + or %g1, %o1, %g6 + ldx [%l0+1*8], %o1 + + stx %g6, [%l1+1*8] + srlx %o3, %l3, %g1 + + sllx %o3, %l4, %o3 + or %g1, %o2, %g6 + ldx [%l0+2*8], %o2 + + stx %g6, [%l1+2*8] + srlx %o4, %l3, %g1 + + sllx %o4, %l4, %o4 + or %g1, %o3, %g6 + ldx [%l0+3*8], %o3 + + stx %g6, [%l1+3*8] + srlx %o5, %l3, %g1 + + sllx %o5, %l4, %o5 + or %g1, %o4, %g6 + ldx [%l0+4*8], %o4 + + stx %g6, [%l1+4*8] + srlx %o0, %l3, %g1 + deccc 6*8, %l2 ! Have enough room? + + sllx %o0, %l4, %o0 ! Next loop + or %g1, %o5, %g6 + ldx [%l0+5*8], %o5 + + stx %g6, [%l1+5*8] + bge,pt CCCR, 1b + inc 6*8, %l1 + +Lmemcpy_unrolled8_cleanup: + !! + !! Finished 8 byte block, unload the regs. + !! + srlx %o1, %l3, %g1 + inc 5*8, %l0 + + sllx %o1, %l4, %o1 + or %g1, %o0, %g6 + + stx %g6, [%l1+0*8] + srlx %o2, %l3, %g1 + + sllx %o2, %l4, %o2 + or %g1, %o1, %g6 + + stx %g6, [%l1+1*8] + srlx %o3, %l3, %g1 + + sllx %o3, %l4, %o3 + or %g1, %o2, %g6 + + stx %g6, [%l1+2*8] + srlx %o4, %l3, %g1 + + sllx %o4, %l4, %o4 + or %g1, %o3, %g6 + + stx %g6, [%l1+3*8] + srlx %o5, %l3, %g1 + + sllx %o5, %l4, %o5 + or %g1, %o4, %g6 + + stx %g6, [%l1+4*8] + inc 5*8, %l1 + + mov %o5, %o0 ! Save our unused data + dec 5*8, %l2 +2: + inccc 12*8, %l2 + bz,pn %icc, Lmemcpy_complete + + !! Unrolled 8 times +Lmemcpy_aligned8: +! ldx [%l0], %o0 ! Already done +! sllx %o0, %l4, %o0 ! Shift high word + + deccc 8, %l2 ! Pre-decrement + bl,pn CCCR, Lmemcpy_finish +1: + ldx [%l0+8], %o1 ! Load word 0 + inc 8, %l0 + + srlx %o1, %l3, %g6 + or %g6, %o0, %g6 ! Combine + + stx %g6, [%l1] ! Store result + inc 8, %l1 + + deccc 8, %l2 + bge,pn CCCR, 1b + sllx %o1, %l4, %o0 + + btst 7, %l2 ! Done? + bz,pt CCCR, Lmemcpy_complete + + !! + !! Loadup the last dregs into %o0 and shift it into place + !! + srlx %l3, 3, %g6 ! # bytes in %o0 + dec 8, %g6 ! - 8 + !! n-8 - (by - 8) -> n - by + subcc %l2, %g6, %g0 ! # bytes we need + ble,pt %icc, Lmemcpy_finish + nop + ldx [%l0+8], %o1 ! Need another word + srlx %o1, %l3, %o1 + ba,pt %icc, Lmemcpy_finish + or %o0, %o1, %o0 ! All loaded up. 
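
[For readers not fluent in SPARC assembly: the unrolled loop above amounts to the
shift-and-merge idea sketched in the C below. This is only a rough sketch of mine,
not part of the commit. It assumes big-endian byte order (as on sparc64), an
8-byte-aligned destination, and a non-zero source misalignment (the aligned case
branches to Lmemcpy_noshift8 instead); copy_shifted and its variable names are
invented, and unlike the real cleanup path it always reads one aligned doubleword
beyond the bytes it stores.]

    #include <stddef.h>
    #include <stdint.h>

    static void
    copy_shifted(uint64_t *dst, const unsigned char *src, size_t nwords)
    {
            /* Aligned-down view of the source, like "andn %l0, 7, %l0". */
            const uint64_t *s = (const uint64_t *)((uintptr_t)src & ~(uintptr_t)7);
            unsigned lshift = (unsigned)((uintptr_t)src & 7) * 8;  /* like %l4 */
            unsigned rshift = 64 - lshift;                         /* like %l3 */
            uint64_t prev = s[0] << lshift;   /* first partial word, pre-shifted */
            size_t i;

            for (i = 0; i < nwords; i++) {
                    uint64_t next = s[i + 1];
                    /* Big-endian: the earlier source bytes sit in the high bits,
                       so OR the tail of the previous word with the head of the
                       next one to form one aligned destination doubleword. */
                    dst[i] = prev | (next >> rshift);
                    prev = next << lshift;
            }
    }
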
+ +Lmemcpy_noshift8: + deccc 6*8, %l2 ! Have enough room? + bl,pn CCCR, 2f + nop + ba,pt %icc, 1f + nop + .align 32 +1: + ldx [%l0+0*8], %o0 + ldx [%l0+1*8], %o1 + ldx [%l0+2*8], %o2 + stx %o0, [%l1+0*8] + stx %o1, [%l1+1*8] + stx %o2, [%l1+2*8] + + + ldx [%l0+3*8], %o3 + ldx [%l0+4*8], %o4 + ldx [%l0+5*8], %o5 + inc 6*8, %l0 + stx %o3, [%l1+3*8] + deccc 6*8, %l2 + stx %o4, [%l1+4*8] + stx %o5, [%l1+5*8] + bge,pt CCCR, 1b + inc 6*8, %l1 +2: + inc 6*8, %l2 +1: + deccc 8, %l2 + bl,pn %icc, 1f ! < 0 --> sub word + nop + ldx [%l0], %g6 + inc 8, %l0 + stx %g6, [%l1] + bg,pt %icc, 1b ! Exactly 0 --> done + inc 8, %l1 +1: + btst 7, %l2 ! Done? + bz,pt CCCR, Lmemcpy_complete + clr %l4 + ldx [%l0], %o0 +Lmemcpy_finish: + + brz,pn %l2, 2f ! 100% complete? + cmp %l2, 8 ! Exactly 8 bytes? + bz,a,pn CCCR, 2f + stx %o0, [%l1] + + btst 4, %l2 ! Word store? + bz CCCR, 1f + srlx %o0, 32, %g6 ! Shift high word down + stw %g6, [%l1] + inc 4, %l1 + mov %o0, %g6 ! Operate on the low bits +1: + btst 2, %l2 + mov %g6, %o0 + bz 1f + srlx %o0, 16, %g6 + + sth %g6, [%l1] ! Store short + inc 2, %l1 + mov %o0, %g6 ! Operate on low bytes +1: + mov %g6, %o0 + btst 1, %l2 ! Byte aligned? + bz 2f + srlx %o0, 8, %g6 + + stb %g6, [%l1] ! Store last byte + inc 1, %l1 ! Update address +2: +Lmemcpy_complete: +#if 0 + !! + !! verify copy success. + !! + + mov %i0, %o2 + mov %i1, %o4 + mov %i2, %l4 +0: + ldub [%o2], %o1 + inc %o2 + ldub [%o4], %o3 + inc %o4 + cmp %o3, %o1 + bnz 1f + dec %l4 + brnz %l4, 0b + nop + ba 2f + nop + +1: + set 0f, %o0 + call printf + sub %i2, %l4, %o5 + set 1f, %o0 + mov %i0, %o2 + mov %i1, %o1 + call printf + mov %i2, %o3 + ta 1 + .data +0: .asciz "memcpy failed: %x@%p != %x@%p byte %d\n" +1: .asciz "memcpy(%p, %p, %lx)\n" + .align 8 + .text +2: +#endif + ret + restore %i1, %g0, %o0 + +#ifdef USE_BLOCK_STORE_LOAD + +/* + * Block copy. Useful for >256 byte copies. + * + * Benchmarking has shown this always seems to be slower than + * the integer version, so this is disabled. Maybe someone will + * figure out why sometime. + */ + +Lmemcpy_block: + sethi %hi(block_disable), %o3 + ldx [ %o3 + %lo(block_disable) ], %o3 + brnz,pn %o3, Lmemcpy_fancy + !! Make sure our trap table is installed + set _C_LABEL(trapbase), %o5 + rdpr %tba, %o3 + sub %o3, %o5, %o3 + brnz,pn %o3, Lmemcpy_fancy ! No, then don't use block load/store + nop +#if defined(_KERNEL) && !defined(_RUMPKERNEL) +/* + * Kernel: + * + * Here we use VIS instructions to do a block clear of a page. + * But before we can do that we need to save and enable the FPU. + * The last owner of the FPU registers is fplwp, and + * fplwp->l_md.md_fpstate is the current fpstate. If that's not + * null, call savefpstate() with it to store our current fp state. + * + * Next, allocate an aligned fpstate on the stack. We will properly + * nest calls on a particular stack so this should not be a problem. + * + * Now we grab either curlwp (or if we're on the interrupt stack + * lwp0). We stash its existing fpstate in a local register and + * put our new fpstate in curlwp->p_md.md_fpstate. We point + * fplwp at curlwp (or lwp0) and enable the FPU. + * + * If we are ever preempted, our FPU state will be saved in our + * fpstate. Then, when we're resumed and we take an FPDISABLED + * trap, the trap handler will be able to fish our FPU state out + * of curlwp (or lwp0). + * + * On exiting this routine we undo the damage: restore the original + * pointer to curlwp->p_md.md_fpstate, clear our fplwp, and disable + * the MMU. 
+ * + * + * Register usage, Kernel only (after save): + * + * %i0 src + * %i1 dest + * %i2 size + * + * %l0 XXXX DEBUG old fpstate + * %l1 fplwp (hi bits only) + * %l2 orig fplwp + * %l3 orig fpstate + * %l5 curlwp + * %l6 old fpstate + * + * Register ussage, Kernel and user: + * + * %g1 src (retval for memcpy) + * + * %o0 src + * %o1 dest + * %o2 end dest + * %o5 last safe fetchable address + */ + + ENABLE_FPU(0) + + mov %i0, %o0 ! Src addr. + mov %i1, %o1 ! Store our dest ptr here. + mov %i2, %o2 ! Len counter +#endif /* _KERNEL */ + + !! + !! First align the output to a 64-bit entity + !! + + mov %o1, %g1 ! memcpy retval + add %o0, %o2, %o5 ! End of source block + + andn %o0, 7, %o3 ! Start of block + dec %o5 + fzero %f0 + + andn %o5, BLOCK_ALIGN, %o5 ! Last safe addr. + ldd [%o3], %f2 ! Load 1st word + + dec 8, %o3 ! Move %o3 1 word back + btst 1, %o1 + bz 4f + + mov -7, %o4 ! Lowest src addr possible + alignaddr %o0, %o4, %o4 ! Base addr for load. + + cmp %o3, %o4 + be,pt CCCR, 1f ! Already loaded? + mov %o4, %o3 + fmovd %f2, %f0 ! No. Shift + ldd [%o3+8], %f2 ! And load +1: + + faligndata %f0, %f2, %f4 ! Isolate 1st byte + + stda %f4, [%o1] ASI_FL8_P ! Store 1st byte + inc 1, %o1 ! Update address + inc 1, %o0 + dec 1, %o2 +4: + btst 2, %o1 + bz 4f + + mov -6, %o4 ! Calculate src - 6 + alignaddr %o0, %o4, %o4 ! calculate shift mask and dest. + + cmp %o3, %o4 ! Addresses same? + be,pt CCCR, 1f + mov %o4, %o3 + fmovd %f2, %f0 ! Shuffle data + ldd [%o3+8], %f2 ! Load word 0 +1: + faligndata %f0, %f2, %f4 ! Move 1st short low part of f8 + + stda %f4, [%o1] ASI_FL16_P ! Store 1st short + dec 2, %o2 + inc 2, %o1 + inc 2, %o0 +4: + brz,pn %o2, Lmemcpy_blockfinish ! XXXX + + btst 4, %o1 + bz 4f + + mov -4, %o4 + alignaddr %o0, %o4, %o4 ! calculate shift mask and dest. + + cmp %o3, %o4 ! Addresses same? + beq,pt CCCR, 1f + mov %o4, %o3 + fmovd %f2, %f0 ! Shuffle data + ldd [%o3+8], %f2 ! Load word 0 +1: + faligndata %f0, %f2, %f4 ! Move 1st short low part of f8 + + st %f5, [%o1] ! Store word + dec 4, %o2 + inc 4, %o1 + inc 4, %o0 +4: + brz,pn %o2, Lmemcpy_blockfinish ! XXXX + !! + !! We are now 32-bit aligned in the dest. + !! +Lmemcpy_block_common: + + mov -0, %o4 + alignaddr %o0, %o4, %o4 ! base - shift + + cmp %o3, %o4 ! Addresses same? + beq,pt CCCR, 1f + mov %o4, %o3 + fmovd %f2, %f0 ! Shuffle data + ldd [%o3+8], %f2 ! Load word 0 +1: + add %o3, 8, %o0 ! now use %o0 for src + + !! + !! Continue until our dest is block aligned + !! +Lmemcpy_block_aligned8: +1: + brz %o2, Lmemcpy_blockfinish + btst BLOCK_ALIGN, %o1 ! Block aligned? + bz 1f + + faligndata %f0, %f2, %f4 ! Generate result + deccc 8, %o2 + ble,pn %icc, Lmemcpy_blockfinish ! Should never happen + fmovd %f4, %f48 + + std %f4, [%o1] ! Store result + inc 8, %o1 + + fmovd %f2, %f0 + inc 8, %o0 + ba,pt %xcc, 1b ! Not yet. + ldd [%o0], %f2 ! Load next part +Lmemcpy_block_aligned64: +1: + +/* + * 64-byte aligned -- ready for block operations. + * + * Here we have the destination block aligned, but the + * source pointer may not be. Sub-word alignment will + * be handled by faligndata instructions. But the source + * can still be potentially aligned to 8 different words + * in our 64-bit block, so we have 8 different copy routines. + * + * Once we figure out our source alignment, we branch + * to the appropriate copy routine, which sets up the + * alignment for faligndata and loads (sets) the values + * into the source registers and does the copy loop. 
+ * + * When were down to less than 1 block to store, we + * exit the copy loop and execute cleanup code. + * + * Block loads and stores are not properly interlocked. + * Stores save one reg/cycle, so you can start overwriting + * registers the cycle after the store is issued. + * + * Block loads require a block load to a different register + * block or a membar #Sync before accessing the loaded + * data. + * + * Since the faligndata instructions may be offset as far + * as 7 registers into a block (if you are shifting source + * 7 -> dest 0), you need 3 source register blocks for full + * performance: one you are copying, one you are loading, + * and one for interlocking. Otherwise, we would need to + * sprinkle the code with membar #Sync and lose the advantage + * of running faligndata in parallel with block stores. This + * means we are fetching a full 128 bytes ahead of the stores. + * We need to make sure the prefetch does not inadvertently + * cross a page boundary and fault on data that we will never + * store. + * + */ +#if 1 + and %o0, BLOCK_ALIGN, %o3 + srax %o3, 3, %o3 ! Isolate the offset + + brz %o3, L100 ! 0->0 + btst 4, %o3 + bnz %xcc, 4f + btst 2, %o3 + bnz %xcc, 2f + btst 1, %o3 + ba,pt %xcc, L101 ! 0->1 + nop /* XXX spitfire bug */ +2: + bz %xcc, L102 ! 0->2 + nop + ba,pt %xcc, L103 ! 0->3 + nop /* XXX spitfire bug */ +4: + bnz %xcc, 2f + btst 1, %o3 + bz %xcc, L104 ! 0->4 + nop + ba,pt %xcc, L105 ! 0->5 + nop /* XXX spitfire bug */ +2: + bz %xcc, L106 ! 0->6 + nop + ba,pt %xcc, L107 ! 0->7 + nop /* XXX spitfire bug */ +#else + + !! + !! Isolate the word offset, which just happens to be + !! the slot in our jump table. + !! + !! This is 6 insns, most of which cannot be paired, + !! which is about the same as the above version. + !! + rd %pc, %o4 +1: + and %o0, 0x31, %o3 + add %o3, (Lmemcpy_block_jmp - 1b), %o3 + jmpl %o4 + %o3, %g0 + nop + + !! + !! Jump table + !! + +Lmemcpy_block_jmp: + ba,a,pt %xcc, L100 + nop + ba,a,pt %xcc, L101 + nop + ba,a,pt %xcc, L102 + nop + ba,a,pt %xcc, L103 + nop + ba,a,pt %xcc, L104 + nop + ba,a,pt %xcc, L105 + nop + ba,a,pt %xcc, L106 + nop + ba,a,pt %xcc, L107 + nop +#endif + + !! + !! Source is block aligned. + !! + !! Just load a block and go. + !! +L100: +#ifdef RETURN_NAME + sethi %hi(1f), %g1 + ba,pt %icc, 2f + or %g1, %lo(1f), %g1 +1: + .asciz "L100" + .align 8 +2: +#endif + fmovd %f0 , %f62 + ldda [%o0] ASI_BLK_P, %f0 + inc BLOCK_SIZE, %o0 + cmp %o0, %o5 + bleu,a,pn %icc, 3f + ldda [%o0] ASI_BLK_P, %f16 + ba,pt %icc, 3f + membar #Sync + + .align 32 ! ICache align. 
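
[A plain-C illustration, mine and not part of the commit, of the dispatch that the
tests under "#if 1" above implement: once the destination is block aligned, only
the source's doubleword slot within its 64-byte block matters. Slot 0 corresponds
to L100 and slot 7 to L107, and the routine for slot k > 0 first loads 8 - k
doublewords individually before it can start block loads. The function name is
invented.]

    #include <stdint.h>

    static inline unsigned
    blkcopy_slot(uintptr_t src)
    {
            /* Same value as "and %o0, BLOCK_ALIGN, %o3; srax %o3, 3, %o3":
               the doubleword index (0..7) of the source within its block. */
            return (unsigned)((src & 63) >> 3);
    }
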
+3: + faligndata %f62, %f0, %f32 + inc BLOCK_SIZE, %o0 + faligndata %f0, %f2, %f34 + dec BLOCK_SIZE, %o2 + faligndata %f2, %f4, %f36 + cmp %o0, %o5 + faligndata %f4, %f6, %f38 + faligndata %f6, %f8, %f40 + faligndata %f8, %f10, %f42 + faligndata %f10, %f12, %f44 + brlez,pn %o2, Lmemcpy_blockdone + faligndata %f12, %f14, %f46 + + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f48 + membar #Sync +2: + stda %f32, [%o1] ASI_STORE + faligndata %f14, %f16, %f32 + inc BLOCK_SIZE, %o0 + faligndata %f16, %f18, %f34 + inc BLOCK_SIZE, %o1 + faligndata %f18, %f20, %f36 + dec BLOCK_SIZE, %o2 + faligndata %f20, %f22, %f38 + cmp %o0, %o5 + faligndata %f22, %f24, %f40 + faligndata %f24, %f26, %f42 + faligndata %f26, %f28, %f44 + brlez,pn %o2, Lmemcpy_blockdone + faligndata %f28, %f30, %f46 + + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f0 + membar #Sync +2: + stda %f32, [%o1] ASI_STORE + faligndata %f30, %f48, %f32 + inc BLOCK_SIZE, %o0 + faligndata %f48, %f50, %f34 + inc BLOCK_SIZE, %o1 + faligndata %f50, %f52, %f36 + dec BLOCK_SIZE, %o2 + faligndata %f52, %f54, %f38 + cmp %o0, %o5 + faligndata %f54, %f56, %f40 + faligndata %f56, %f58, %f42 + faligndata %f58, %f60, %f44 + brlez,pn %o2, Lmemcpy_blockdone + faligndata %f60, %f62, %f46 + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f16 ! Increment is at top + membar #Sync +2: + stda %f32, [%o1] ASI_STORE + ba 3b + inc BLOCK_SIZE, %o1 + + !! + !! Source at BLOCK_ALIGN+8 + !! + !! We need to load almost 1 complete block by hand. + !! +L101: +#ifdef RETURN_NAME + sethi %hi(1f), %g1 + ba,pt %icc, 2f + or %g1, %lo(1f), %g1 +1: + .asciz "L101" + .align 8 +2: +#endif +! fmovd %f0, %f0 ! Hoist fmovd + ldd [%o0], %f2 + inc 8, %o0 + ldd [%o0], %f4 + inc 8, %o0 + ldd [%o0], %f6 + inc 8, %o0 + ldd [%o0], %f8 + inc 8, %o0 + ldd [%o0], %f10 + inc 8, %o0 + ldd [%o0], %f12 + inc 8, %o0 + ldd [%o0], %f14 + inc 8, %o0 + + cmp %o0, %o5 + bleu,a,pn %icc, 3f + ldda [%o0] ASI_BLK_P, %f16 + membar #Sync +3: + faligndata %f0, %f2, %f32 + inc BLOCK_SIZE, %o0 + faligndata %f2, %f4, %f34 + cmp %o0, %o5 + faligndata %f4, %f6, %f36 + dec BLOCK_SIZE, %o2 + faligndata %f6, %f8, %f38 + faligndata %f8, %f10, %f40 + faligndata %f10, %f12, %f42 + faligndata %f12, %f14, %f44 + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f48 + membar #Sync +2: + brlez,pn %o2, Lmemcpy_blockdone + faligndata %f14, %f16, %f46 + + stda %f32, [%o1] ASI_STORE + + faligndata %f16, %f18, %f32 + inc BLOCK_SIZE, %o0 + faligndata %f18, %f20, %f34 + inc BLOCK_SIZE, %o1 + faligndata %f20, %f22, %f36 + cmp %o0, %o5 + faligndata %f22, %f24, %f38 + dec BLOCK_SIZE, %o2 + faligndata %f24, %f26, %f40 + faligndata %f26, %f28, %f42 + faligndata %f28, %f30, %f44 + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f0 + membar #Sync +2: + brlez,pn %o2, Lmemcpy_blockdone + faligndata %f30, %f48, %f46 + + stda %f32, [%o1] ASI_STORE + + faligndata %f48, %f50, %f32 + inc BLOCK_SIZE, %o0 + faligndata %f50, %f52, %f34 + inc BLOCK_SIZE, %o1 + faligndata %f52, %f54, %f36 + cmp %o0, %o5 + faligndata %f54, %f56, %f38 + dec BLOCK_SIZE, %o2 + faligndata %f56, %f58, %f40 + faligndata %f58, %f60, %f42 + faligndata %f60, %f62, %f44 + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f16 + membar #Sync +2: + brlez,pn %o2, Lmemcpy_blockdone + faligndata %f62, %f0, %f46 + + stda %f32, [%o1] ASI_STORE + ba 3b + inc BLOCK_SIZE, %o1 + + !! + !! Source at BLOCK_ALIGN+16 + !! + !! We need to load 6 doubles by hand. + !! 
+L102: +#ifdef RETURN_NAME + sethi %hi(1f), %g1 + ba,pt %icc, 2f + or %g1, %lo(1f), %g1 +1: + .asciz "L102" + .align 8 +2: +#endif + ldd [%o0], %f4 + inc 8, %o0 + fmovd %f0, %f2 ! Hoist fmovd + ldd [%o0], %f6 + inc 8, %o0 + + ldd [%o0], %f8 + inc 8, %o0 + ldd [%o0], %f10 + inc 8, %o0 + ldd [%o0], %f12 + inc 8, %o0 + ldd [%o0], %f14 + inc 8, %o0 + + cmp %o0, %o5 + bleu,a,pn %icc, 3f + ldda [%o0] ASI_BLK_P, %f16 + membar #Sync +3: + faligndata %f2, %f4, %f32 + inc BLOCK_SIZE, %o0 + faligndata %f4, %f6, %f34 + cmp %o0, %o5 + faligndata %f6, %f8, %f36 + dec BLOCK_SIZE, %o2 + faligndata %f8, %f10, %f38 + faligndata %f10, %f12, %f40 + faligndata %f12, %f14, %f42 + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f48 + membar #Sync +2: + faligndata %f14, %f16, %f44 + + brlez,pn %o2, Lmemcpy_blockdone + faligndata %f16, %f18, %f46 + + stda %f32, [%o1] ASI_STORE + + faligndata %f18, %f20, %f32 + inc BLOCK_SIZE, %o0 + faligndata %f20, %f22, %f34 + inc BLOCK_SIZE, %o1 + faligndata %f22, %f24, %f36 + cmp %o0, %o5 + faligndata %f24, %f26, %f38 + dec BLOCK_SIZE, %o2 + faligndata %f26, %f28, %f40 + faligndata %f28, %f30, %f42 + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f0 + membar #Sync +2: + faligndata %f30, %f48, %f44 + brlez,pn %o2, Lmemcpy_blockdone + faligndata %f48, %f50, %f46 + + stda %f32, [%o1] ASI_STORE + + faligndata %f50, %f52, %f32 + inc BLOCK_SIZE, %o0 + faligndata %f52, %f54, %f34 + inc BLOCK_SIZE, %o1 + faligndata %f54, %f56, %f36 + cmp %o0, %o5 + faligndata %f56, %f58, %f38 + dec BLOCK_SIZE, %o2 + faligndata %f58, %f60, %f40 + faligndata %f60, %f62, %f42 + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f16 + membar #Sync +2: + faligndata %f62, %f0, %f44 + brlez,pn %o2, Lmemcpy_blockdone + faligndata %f0, %f2, %f46 + + stda %f32, [%o1] ASI_STORE + ba 3b + inc BLOCK_SIZE, %o1 + + !! + !! Source at BLOCK_ALIGN+24 + !! + !! We need to load 5 doubles by hand. + !! 
+L103: +#ifdef RETURN_NAME + sethi %hi(1f), %g1 + ba,pt %icc, 2f + or %g1, %lo(1f), %g1 +1: + .asciz "L103" + .align 8 +2: +#endif + fmovd %f0, %f4 + ldd [%o0], %f6 + inc 8, %o0 + ldd [%o0], %f8 + inc 8, %o0 + ldd [%o0], %f10 + inc 8, %o0 + ldd [%o0], %f12 + inc 8, %o0 + ldd [%o0], %f14 + inc 8, %o0 + + cmp %o0, %o5 + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f16 + membar #Sync +2: + inc BLOCK_SIZE, %o0 +3: + faligndata %f4, %f6, %f32 + cmp %o0, %o5 + faligndata %f6, %f8, %f34 + dec BLOCK_SIZE, %o2 + faligndata %f8, %f10, %f36 + faligndata %f10, %f12, %f38 + faligndata %f12, %f14, %f40 + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f48 + membar #Sync +2: + faligndata %f14, %f16, %f42 + inc BLOCK_SIZE, %o0 + faligndata %f16, %f18, %f44 + brlez,pn %o2, Lmemcpy_blockdone + faligndata %f18, %f20, %f46 + + stda %f32, [%o1] ASI_STORE + + faligndata %f20, %f22, %f32 + cmp %o0, %o5 + faligndata %f22, %f24, %f34 + dec BLOCK_SIZE, %o2 + faligndata %f24, %f26, %f36 + inc BLOCK_SIZE, %o1 + faligndata %f26, %f28, %f38 + faligndata %f28, %f30, %f40 + ble,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f0 + membar #Sync +2: + faligndata %f30, %f48, %f42 + inc BLOCK_SIZE, %o0 + faligndata %f48, %f50, %f44 + brlez,pn %o2, Lmemcpy_blockdone + faligndata %f50, %f52, %f46 + + stda %f32, [%o1] ASI_STORE + + faligndata %f52, %f54, %f32 + cmp %o0, %o5 + faligndata %f54, %f56, %f34 + dec BLOCK_SIZE, %o2 + faligndata %f56, %f58, %f36 + faligndata %f58, %f60, %f38 + inc BLOCK_SIZE, %o1 + faligndata %f60, %f62, %f40 + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f16 + membar #Sync +2: + faligndata %f62, %f0, %f42 + inc BLOCK_SIZE, %o0 + faligndata %f0, %f2, %f44 + brlez,pn %o2, Lmemcpy_blockdone + faligndata %f2, %f4, %f46 + + stda %f32, [%o1] ASI_STORE + ba 3b + inc BLOCK_SIZE, %o1 + + !! + !! Source at BLOCK_ALIGN+32 + !! + !! We need to load 4 doubles by hand. + !! 
+L104: +#ifdef RETURN_NAME + sethi %hi(1f), %g1 + ba,pt %icc, 2f + or %g1, %lo(1f), %g1 +1: + .asciz "L104" + .align 8 +2: +#endif + fmovd %f0, %f6 + ldd [%o0], %f8 + inc 8, %o0 + ldd [%o0], %f10 + inc 8, %o0 + ldd [%o0], %f12 + inc 8, %o0 + ldd [%o0], %f14 + inc 8, %o0 + + cmp %o0, %o5 + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f16 + membar #Sync +2: + inc BLOCK_SIZE, %o0 +3: + faligndata %f6, %f8, %f32 + cmp %o0, %o5 + faligndata %f8, %f10, %f34 + dec BLOCK_SIZE, %o2 + faligndata %f10, %f12, %f36 + faligndata %f12, %f14, %f38 + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f48 + membar #Sync +2: + faligndata %f14, %f16, %f40 + faligndata %f16, %f18, %f42 + inc BLOCK_SIZE, %o0 + faligndata %f18, %f20, %f44 + brlez,pn %o2, Lmemcpy_blockdone + faligndata %f20, %f22, %f46 + + stda %f32, [%o1] ASI_STORE + + faligndata %f22, %f24, %f32 + cmp %o0, %o5 + faligndata %f24, %f26, %f34 + faligndata %f26, %f28, %f36 + inc BLOCK_SIZE, %o1 + faligndata %f28, %f30, %f38 + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f0 + membar #Sync +2: + faligndata %f30, %f48, %f40 + dec BLOCK_SIZE, %o2 + faligndata %f48, %f50, %f42 + inc BLOCK_SIZE, %o0 + faligndata %f50, %f52, %f44 + brlez,pn %o2, Lmemcpy_blockdone + faligndata %f52, %f54, %f46 + + stda %f32, [%o1] ASI_STORE + + faligndata %f54, %f56, %f32 + cmp %o0, %o5 + faligndata %f56, %f58, %f34 + faligndata %f58, %f60, %f36 + inc BLOCK_SIZE, %o1 + faligndata %f60, %f62, %f38 + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f16 + membar #Sync +2: + faligndata %f62, %f0, %f40 + dec BLOCK_SIZE, %o2 + faligndata %f0, %f2, %f42 + inc BLOCK_SIZE, %o0 + faligndata %f2, %f4, %f44 + brlez,pn %o2, Lmemcpy_blockdone + faligndata %f4, %f6, %f46 + + stda %f32, [%o1] ASI_STORE + ba 3b + inc BLOCK_SIZE, %o1 + + !! + !! Source at BLOCK_ALIGN+40 + !! + !! We need to load 3 doubles by hand. + !! 
+L105: +#ifdef RETURN_NAME + sethi %hi(1f), %g1 + ba,pt %icc, 2f + or %g1, %lo(1f), %g1 +1: + .asciz "L105" + .align 8 +2: +#endif + fmovd %f0, %f8 + ldd [%o0], %f10 + inc 8, %o0 + ldd [%o0], %f12 + inc 8, %o0 + ldd [%o0], %f14 + inc 8, %o0 + + cmp %o0, %o5 + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f16 + membar #Sync +2: + inc BLOCK_SIZE, %o0 +3: + faligndata %f8, %f10, %f32 + cmp %o0, %o5 + faligndata %f10, %f12, %f34 + faligndata %f12, %f14, %f36 + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f48 + membar #Sync +2: + faligndata %f14, %f16, %f38 + dec BLOCK_SIZE, %o2 + faligndata %f16, %f18, %f40 + inc BLOCK_SIZE, %o0 + faligndata %f18, %f20, %f42 + faligndata %f20, %f22, %f44 + brlez,pn %o2, Lmemcpy_blockdone + faligndata %f22, %f24, %f46 + + stda %f32, [%o1] ASI_STORE + + faligndata %f24, %f26, %f32 + cmp %o0, %o5 + faligndata %f26, %f28, %f34 + dec BLOCK_SIZE, %o2 + faligndata %f28, %f30, %f36 + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f0 + membar #Sync +2: + faligndata %f30, %f48, %f38 + inc BLOCK_SIZE, %o1 + faligndata %f48, %f50, %f40 + inc BLOCK_SIZE, %o0 + faligndata %f50, %f52, %f42 + faligndata %f52, %f54, %f44 + brlez,pn %o2, Lmemcpy_blockdone + faligndata %f54, %f56, %f46 + + stda %f32, [%o1] ASI_STORE + + faligndata %f56, %f58, %f32 + cmp %o0, %o5 + faligndata %f58, %f60, %f34 + dec BLOCK_SIZE, %o2 + faligndata %f60, %f62, %f36 + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f16 + membar #Sync +2: + faligndata %f62, %f0, %f38 + inc BLOCK_SIZE, %o1 + faligndata %f0, %f2, %f40 + inc BLOCK_SIZE, %o0 + faligndata %f2, %f4, %f42 + faligndata %f4, %f6, %f44 + brlez,pn %o2, Lmemcpy_blockdone + faligndata %f6, %f8, %f46 + + stda %f32, [%o1] ASI_STORE + ba 3b + inc BLOCK_SIZE, %o1 + + + !! + !! Source at BLOCK_ALIGN+48 + !! + !! We need to load 2 doubles by hand. + !! +L106: +#ifdef RETURN_NAME + sethi %hi(1f), %g1 + ba,pt %icc, 2f + or %g1, %lo(1f), %g1 +1: + .asciz "L106" + .align 8 +2: +#endif + fmovd %f0, %f10 + ldd [%o0], %f12 + inc 8, %o0 + ldd [%o0], %f14 + inc 8, %o0 + + cmp %o0, %o5 + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f16 + membar #Sync +2: + inc BLOCK_SIZE, %o0 +3: + faligndata %f10, %f12, %f32 + cmp %o0, %o5 + faligndata %f12, %f14, %f34 + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f48 + membar #Sync +2: + faligndata %f14, %f16, %f36 + dec BLOCK_SIZE, %o2 + faligndata %f16, %f18, %f38 + inc BLOCK_SIZE, %o0 + faligndata %f18, %f20, %f40 + faligndata %f20, %f22, %f42 + faligndata %f22, %f24, %f44 + brlez,pn %o2, Lmemcpy_blockdone + faligndata %f24, %f26, %f46 + + stda %f32, [%o1] ASI_STORE + + faligndata %f26, %f28, %f32 + cmp %o0, %o5 + faligndata %f28, %f30, %f34 + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f0 + membar #Sync +2: + faligndata %f30, %f48, %f36 + dec BLOCK_SIZE, %o2 + faligndata %f48, %f50, %f38 + inc BLOCK_SIZE, %o1 + faligndata %f50, %f52, %f40 + faligndata %f52, %f54, %f42 + inc BLOCK_SIZE, %o0 + faligndata %f54, %f56, %f44 + brlez,pn %o2, Lmemcpy_blockdone + faligndata %f56, %f58, %f46 + + stda %f32, [%o1] ASI_STORE + + faligndata %f58, %f60, %f32 + cmp %o0, %o5 + faligndata %f60, %f62, %f34 + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f16 + membar #Sync +2: + faligndata %f62, %f0, %f36 + dec BLOCK_SIZE, %o2 + faligndata %f0, %f2, %f38 + inc BLOCK_SIZE, %o1 + faligndata %f2, %f4, %f40 + faligndata %f4, %f6, %f42 + inc BLOCK_SIZE, %o0 + faligndata %f6, %f8, %f44 + brlez,pn %o2, Lmemcpy_blockdone + faligndata %f8, %f10, %f46 + + stda %f32, [%o1] ASI_STORE + ba 3b + inc BLOCK_SIZE, %o1 + + + !! + !! Source at BLOCK_ALIGN+56 + !! + !! 
We need to load 1 double by hand. + !! +L107: +#ifdef RETURN_NAME + sethi %hi(1f), %g1 + ba,pt %icc, 2f + or %g1, %lo(1f), %g1 +1: + .asciz "L107" + .align 8 +2: +#endif + fmovd %f0, %f12 + ldd [%o0], %f14 + inc 8, %o0 + + cmp %o0, %o5 + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f16 + membar #Sync +2: + inc BLOCK_SIZE, %o0 +3: + faligndata %f12, %f14, %f32 + cmp %o0, %o5 + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f48 + membar #Sync +2: + faligndata %f14, %f16, %f34 + dec BLOCK_SIZE, %o2 + faligndata %f16, %f18, %f36 + inc BLOCK_SIZE, %o0 + faligndata %f18, %f20, %f38 + faligndata %f20, %f22, %f40 + faligndata %f22, %f24, %f42 + faligndata %f24, %f26, %f44 + brlez,pn %o2, Lmemcpy_blockdone + faligndata %f26, %f28, %f46 + + stda %f32, [%o1] ASI_STORE + + faligndata %f28, %f30, %f32 + cmp %o0, %o5 + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f0 + membar #Sync +2: + faligndata %f30, %f48, %f34 + dec BLOCK_SIZE, %o2 + faligndata %f48, %f50, %f36 + inc BLOCK_SIZE, %o1 + faligndata %f50, %f52, %f38 + faligndata %f52, %f54, %f40 + inc BLOCK_SIZE, %o0 + faligndata %f54, %f56, %f42 + faligndata %f56, %f58, %f44 + brlez,pn %o2, Lmemcpy_blockdone + faligndata %f58, %f60, %f46 + + stda %f32, [%o1] ASI_STORE + + faligndata %f60, %f62, %f32 + cmp %o0, %o5 + bleu,a,pn %icc, 2f + ldda [%o0] ASI_BLK_P, %f16 + membar #Sync +2: + faligndata %f62, %f0, %f34 + dec BLOCK_SIZE, %o2 + faligndata %f0, %f2, %f36 + inc BLOCK_SIZE, %o1 + faligndata %f2, %f4, %f38 + faligndata %f4, %f6, %f40 + inc BLOCK_SIZE, %o0 + faligndata %f6, %f8, %f42 + faligndata %f8, %f10, %f44 + + brlez,pn %o2, Lmemcpy_blockdone + faligndata %f10, %f12, %f46 + + stda %f32, [%o1] ASI_STORE + ba 3b + inc BLOCK_SIZE, %o1 + +Lmemcpy_blockdone: + inc BLOCK_SIZE, %o2 ! Fixup our overcommit + membar #Sync ! Finish any pending loads +#define FINISH_REG(f) \ + deccc 8, %o2; \ + bl,a Lmemcpy_blockfinish; \ + fmovd f, %f48; \ + std f, [%o1]; \ + inc 8, %o1 + + FINISH_REG(%f32) + FINISH_REG(%f34) + FINISH_REG(%f36) + FINISH_REG(%f38) + FINISH_REG(%f40) + FINISH_REG(%f42) + FINISH_REG(%f44) + FINISH_REG(%f46) + FINISH_REG(%f48) +#undef FINISH_REG + !! + !! The low 3 bits have the sub-word bits needed to be + !! stored [because (x-8)&0x7 == x]. + !! +Lmemcpy_blockfinish: + brz,pn %o2, 2f ! 100% complete? + fmovd %f48, %f4 + cmp %o2, 8 ! Exactly 8 bytes? + bz,a,pn CCCR, 2f + std %f4, [%o1] + + btst 4, %o2 ! Word store? + bz CCCR, 1f + nop + st %f4, [%o1] + inc 4, %o1 +1: + btst 2, %o2 + fzero %f0 + bz 1f + + mov -6, %o4 + alignaddr %o1, %o4, %g0 + + faligndata %f0, %f4, %f8 + + stda %f8, [%o1] ASI_FL16_P ! Store short + inc 2, %o1 +1: + btst 1, %o2 ! Byte aligned? + bz 2f + + mov -7, %o0 ! Calculate dest - 7 + alignaddr %o1, %o0, %g0 ! Calculate shift mask and dest. + + faligndata %f0, %f4, %f8 ! Move 1st byte to low part of f8 + + stda %f8, [%o1] ASI_FL8_P ! Store 1st byte + inc 1, %o1 ! Update address +2: + membar #Sync +#if 0 + !! + !! verify copy success. + !! 
+ + mov %i0, %o2 + mov %i1, %o4 + mov %i2, %l4 +0: + ldub [%o2], %o1 + inc %o2 + ldub [%o4], %o3 + inc %o4 + cmp %o3, %o1 + bnz 1f + dec %l4 + brnz %l4, 0b + nop + ba 2f + nop + +1: + set block_disable, %o0 + stx %o0, [%o0] + + set 0f, %o0 + call prom_printf + sub %i2, %l4, %o5 + set 1f, %o0 + mov %i0, %o2 + mov %i1, %o1 + call prom_printf + mov %i2, %o3 + ta 1 + .data + _ALIGN +0: .asciz "block memcpy failed: %x@%p != %x@%p byte %d\r\n" +1: .asciz "memcpy(%p, %p, %lx)\r\n" + _ALIGN + .text +2: +#endif +#if defined(_KERNEL) && !defined(_RUMPKERNEL) + +/* + * Weve saved our possible fpstate, now disable the fpu + * and continue with life. + */ + RESTORE_FPU + ret + restore %g1, 0, %o0 ! Return DEST for memcpy +#endif + retl + mov %g1, %o0 +/* + * Use block_disable to turn off block insns for + * memcpy/memset + */ + .data + .align 8 + .globl block_disable +block_disable: .xword 1 + .text +#endif /* USE_BLOCK_STORE_LOAD */ Index: src/common/lib/libc/arch/sparc64/string/memset.S diff -u /dev/null src/common/lib/libc/arch/sparc64/string/memset.S:1.1 --- /dev/null Sat Mar 16 20:42:32 2013 +++ src/common/lib/libc/arch/sparc64/string/memset.S Sat Mar 16 20:42:32 2013 @@ -0,0 +1,214 @@ +/* $NetBSD: memset.S,v 1.1 2013/03/17 00:42:32 christos Exp $ */ + +/* + * Copyright (c) 1996-2002 Eduardo Horvath + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +#include "strmacros.h" + +/* + * XXXXXXXXXXXXXXXXXXXX + * We need to make sure that this doesn't use floating point + * before our trap handlers are installed or we could panic + * XXXXXXXXXXXXXXXXXXXX + */ +/* + * memset(addr, c, len) + * + * We want to use VIS instructions if we're clearing out more than + * 256 bytes, but to do that we need to properly save and restore the + * FP registers. Unfortunately the code to do that in the kernel needs + * to keep track of the current owner of the FPU, hence the different + * code. + * + * XXXXX To produce more efficient code, we do not allow lengths + * greater than 0x80000000000000000, which are negative numbers. + * This should not really be an issue since the VA hole should + * cause any such ranges to fail anyway. + */ +#if !defined(_KERNEL) || defined(_RUMPKERNEL) +ENTRY(bzero) + ! %o0 = addr, %o1 = len + mov %o1, %o2 + mov 0, %o1 +#endif +ENTRY(memset) + ! %o0 = addr, %o1 = pattern, %o2 = len + mov %o0, %o4 ! Save original pointer + +Lmemset_internal: + btst 7, %o0 ! Word aligned? + bz,pn %xcc, 0f + nop + inc %o0 + deccc %o2 ! 
Store up to 7 bytes + bge,a,pt CCCR, Lmemset_internal + stb %o1, [%o0 - 1] + + retl ! Duplicate Lmemset_done + mov %o4, %o0 +0: + /* + * Duplicate the pattern so it fills 64-bits. + */ + andcc %o1, 0x0ff, %o1 ! No need to extend zero + bz,pt %icc, 1f + sllx %o1, 8, %o3 ! sigh. all dependent insns. + or %o1, %o3, %o1 + sllx %o1, 16, %o3 + or %o1, %o3, %o1 + sllx %o1, 32, %o3 + or %o1, %o3, %o1 +1: +#ifdef USE_BLOCK_STORE_LOAD + !! Now we are 64-bit aligned + cmp %o2, 256 ! Use block clear if len > 256 + bge,pt CCCR, Lmemset_block ! use block store insns +#endif /* USE_BLOCK_STORE_LOAD */ + deccc 8, %o2 +Lmemset_longs: + bl,pn CCCR, Lmemset_cleanup ! Less than 8 bytes left + nop +3: + inc 8, %o0 + deccc 8, %o2 + bge,pt CCCR, 3b + stx %o1, [%o0 - 8] ! Do 1 longword at a time + + /* + * Len is in [-8..-1] where -8 => done, -7 => 1 byte to zero, + * -6 => two bytes, etc. Mop up this remainder, if any. + */ +Lmemset_cleanup: + btst 4, %o2 + bz,pt CCCR, 5f ! if (len & 4) { + nop + stw %o1, [%o0] ! *(int *)addr = 0; + inc 4, %o0 ! addr += 4; +5: + btst 2, %o2 + bz,pt CCCR, 7f ! if (len & 2) { + nop + sth %o1, [%o0] ! *(short *)addr = 0; + inc 2, %o0 ! addr += 2; +7: + btst 1, %o2 + bnz,a %icc, Lmemset_done ! if (len & 1) + stb %o1, [%o0] ! *addr = 0; +Lmemset_done: + retl + mov %o4, %o0 ! Restore ponter for memset (ugh) + +#ifdef USE_BLOCK_STORE_LOAD +Lmemset_block: + sethi %hi(block_disable), %o3 + ldx [ %o3 + %lo(block_disable) ], %o3 + brnz,pn %o3, Lmemset_longs + !! Make sure our trap table is installed + set _C_LABEL(trapbase), %o5 + rdpr %tba, %o3 + sub %o3, %o5, %o3 + brnz,pn %o3, Lmemset_longs ! No, then don't use block load/store + nop +/* + * Kernel: + * + * Here we use VIS instructions to do a block clear of a page. + * But before we can do that we need to save and enable the FPU. + * The last owner of the FPU registers is fplwp, and + * fplwp->l_md.md_fpstate is the current fpstate. If that's not + * null, call savefpstate() with it to store our current fp state. + * + * Next, allocate an aligned fpstate on the stack. We will properly + * nest calls on a particular stack so this should not be a problem. + * + * Now we grab either curlwp (or if we're on the interrupt stack + * lwp0). We stash its existing fpstate in a local register and + * put our new fpstate in curlwp->p_md.md_fpstate. We point + * fplwp at curlwp (or lwp0) and enable the FPU. + * + * If we are ever preempted, our FPU state will be saved in our + * fpstate. Then, when we're resumed and we take an FPDISABLED + * trap, the trap handler will be able to fish our FPU state out + * of curlwp (or lwp0). + * + * On exiting this routine we undo the damage: restore the original + * pointer to curlwp->p_md.md_fpstate, clear our fplwp, and disable + * the MMU. + * + */ + + ENABLE_FPU(0) + + !! We are now 8-byte aligned. We need to become 64-byte aligned. + btst 63, %i0 + bz,pt CCCR, 2f + nop +1: + stx %i1, [%i0] + inc 8, %i0 + btst 63, %i0 + bnz,pt %xcc, 1b + dec 8, %i2 + +2: + brz %i1, 3f ! Skip the memory op + fzero %f0 ! if pattern is 0 + +#ifdef _LP64 + stx %i1, [%i0] ! Flush this puppy to RAM + membar #StoreLoad + ldd [%i0], %f0 +#else + stw %i1, [%i0] ! Flush this puppy to RAM + membar #StoreLoad + ld [%i0], %f0 + fmovsa %icc, %f0, %f1 +#endif + +3: + fmovd %f0, %f2 ! Duplicate the pattern + fmovd %f0, %f4 + fmovd %f0, %f6 + fmovd %f0, %f8 + fmovd %f0, %f10 + fmovd %f0, %f12 + fmovd %f0, %f14 + + !! Remember: we were 8 bytes too far + dec 56, %i2 ! Go one iteration too far +5: + stda %f0, [%i0] ASI_STORE ! 
Store 64 bytes + deccc BLOCK_SIZE, %i2 + bg,pt %icc, 5b + inc BLOCK_SIZE, %i0 + + membar #Sync +/* + * We've saved our possible fpstate, now disable the fpu + * and continue with life. + */ + RESTORE_FPU + addcc %i2, 56, %i2 ! Restore the count + ba,pt %xcc, Lmemset_longs ! Finish up the remainder + restore +#endif /* USE_BLOCK_STORE_LOAD */ Index: src/common/lib/libc/arch/sparc64/string/strmacros.h diff -u /dev/null src/common/lib/libc/arch/sparc64/string/strmacros.h:1.1 --- /dev/null Sat Mar 16 20:42:32 2013 +++ src/common/lib/libc/arch/sparc64/string/strmacros.h Sat Mar 16 20:42:32 2013 @@ -0,0 +1,119 @@ +/* $NetBSD: strmacros.h,v 1.1 2013/03/17 00:42:32 christos Exp $ */ + +/* + * Copyright (c) 1996-2002 Eduardo Horvath + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *notice, this list of conditions and the following disclaimer. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include <machine/asm.h> +#if defined(_KERNEL) && !defined(_RUMPKERNEL) +#define USE_BLOCK_STORE_LOAD /* enable block load/store ops */ +#include "assym.h" +#include <machine/param.h> +#include <machine/ctlreg.h> +#include <machine/psl.h> +#include <machine/frame.h> +#include <machine/intr.h> +#include <machine/locore.h> + +#ifdef USE_BLOCK_STORE_LOAD + +#define BLOCK_SIZE SPARC64_BLOCK_SIZE +#define BLOCK_ALIGN SPARC64_BLOCK_ALIGN + +/* + * The following routines allow fpu use in the kernel. + * + * They allocate a stack frame and use all local regs. Extra + * local storage can be requested by setting the siz parameter, + * and can be accessed at %sp+CC64FSZ. + */ + +#define ENABLE_FPU(siz) \ + save %sp, -(CC64FSZ), %sp; /* Allocate a stack frame */ \ + sethi %hi(FPLWP), %l1; \ + add %fp, STKB-FS_SIZE, %l0; /* Allocate a fpstate */\ + LDPTR [%l1 + %lo(FPLWP)], %l2; /* Load fplwp */ \ + andn %l0, BLOCK_ALIGN, %l0; /* Align it */ \ + clr %l3; /* NULL fpstate */ \ + brz,pt %l2, 1f; /* fplwp == NULL? */ \ + add %l0, -STKB-CC64FSZ-(siz), %sp; /* Set proper %sp */ \ + LDPTR [%l2 + L_FPSTATE], %l3; \ + brz,pn %l3, 1f; /* Make sure we have an fpstate */ \ + mov %l3, %o0; \ + call _C_LABEL(savefpstate); /* Save the old fpstate */ \ +1: \ + set EINTSTACK-STKB, %l4; /* Are we on intr stack? 
*/ \ + cmp %sp, %l4; \ + bgu,pt %xcc, 1f; \ + set INTSTACK-STKB, %l4; \ + cmp %sp, %l4; \ + blu %xcc, 1f; \ +0: \ + sethi %hi(_C_LABEL(lwp0)), %l4; /* Yes, use lpw0 */ \ + ba,pt %xcc, 2f; /* XXXX needs to change to CPUs idle proc */ \ + or %l4, %lo(_C_LABEL(lwp0)), %l5; \ +1: \ + sethi %hi(CURLWP), %l4; /* Use curlwp */ \ + LDPTR [%l4 + %lo(CURLWP)], %l5; \ + brz,pn %l5, 0b; nop; /* If curlwp is NULL need to use lwp0 */\ +2: \ + LDPTR [%l5 + L_FPSTATE], %l6; /* Save old fpstate */ \ + STPTR %l0, [%l5 + L_FPSTATE]; /* Insert new fpstate */\ + STPTR %l5, [%l1 + %lo(FPLWP)]; /* Set new fplwp */ \ + wr %g0, FPRS_FEF, %fprs /* Enable FPU */ + +/* + * Weve saved our possible fpstate, now disable the fpu + * and continue with life. + */ +#ifdef DEBUG +#define __CHECK_FPU \ + LDPTR [%l5 + L_FPSTATE], %l7; \ + cmp %l7, %l0; \ + tnz 1; +#else +#define __CHECK_FPU +#endif + +#define RESTORE_FPU \ + __CHECK_FPU \ + STPTR %l2, [%l1 + %lo(FPLWP)]; /* Restore old fproc */ \ + wr %g0, 0, %fprs; /* Disable fpu */ \ + brz,pt %l3, 1f; /* Skip if no fpstate */ \ + STPTR %l6, [%l5 + L_FPSTATE]; /* Restore old fpstate */\ + \ + mov %l3, %o0; \ + call _C_LABEL(loadfpstate); /* Reload orig fpstate */\ +1: \ + membar #Sync; /* Finish all FP ops */ + +#endif /* USE_BLOCK_STORE_LOAD */ + +#ifdef USE_BLOCK_STORE_LOAD +#if 0 +#define ASI_STORE ASI_BLK_COMMIT_P +#else +#define ASI_STORE ASI_BLK_P +#endif +#endif /* USE_BLOCK_STORE_LOAD */ +#endif /* _KERNEL && !_RUMPKERNEL */
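
[The ENABLE_FPU/RESTORE_FPU pair above is fairly dense, so here is a schematic C
sketch of mine, not part of the commit, of the FPU-borrowing protocol it
implements. The stand-in types, the stub helpers and the with_borrowed_fpu wrapper
are illustrative only; the real definitions live in the sparc64 kernel headers,
and the real code is assembly precisely because it must run while the FPU is
still disabled.]

    #include <stddef.h>

    /* Stand-in declarations so the sketch is self-contained. */
    struct fpstate64 { char regs[512]; };
    struct lwp { struct { struct fpstate64 *md_fpstate; } l_md; };

    static struct lwp *fplwp, *curlwp, lwp0;

    static void savefpstate(struct fpstate64 *fs) { (void)fs; /* stub */ }
    static void loadfpstate(struct fpstate64 *fs) { (void)fs; /* stub */ }
    static void fpu_enable(void)  { /* wr %g0, FPRS_FEF, %fprs */ }
    static void fpu_disable(void) { /* wr %g0, 0, %fprs */ }

    static void
    with_borrowed_fpu(void (*body)(void))
    {
            struct fpstate64 scratch;       /* the macro carves this, block-aligned,
                                               out of its own stack frame */
            struct lwp *l = (curlwp != NULL) ? curlwp : &lwp0;  /* intr stack -> lwp0 */
            struct lwp *prev_owner = fplwp;
            struct fpstate64 *prev_owner_state =
                (prev_owner != NULL) ? prev_owner->l_md.md_fpstate : NULL;
            struct fpstate64 *my_old_state = l->l_md.md_fpstate;

            if (prev_owner_state != NULL)
                    savefpstate(prev_owner_state);  /* park the previous owner's regs */

            l->l_md.md_fpstate = &scratch;  /* where our state lands if preempted */
            fplwp = l;
            fpu_enable();

            body();                         /* the VIS block copy or clear */

            /* RESTORE_FPU */
            fplwp = prev_owner;
            fpu_disable();
            l->l_md.md_fpstate = my_old_state;
            if (prev_owner_state != NULL)
                    loadfpstate(prev_owner_state);  /* give the FPU contents back */
    }
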