On 10/30/2011 01:29 PM, Ronald S. Bultje wrote:

> From: Loren Merritt <[email protected]>
> 
> But keep INIT_AVX (for backwards compatibility).
> ---
>  libavutil/x86/x86inc.asm |  182 +++++++++++++++++++++++++++++++++++++++-------
>  1 files changed, 154 insertions(+), 28 deletions(-)
> 
> diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
> index c84d556..d7a3b3c 100644
> --- a/libavutil/x86/x86inc.asm
> +++ b/libavutil/x86/x86inc.asm
> @@ -1,5 +1,5 @@
>  ;*****************************************************************************
> -;* x86inc.asm
> +;* x86inc.asm: x264asm abstraction layer
>  ;*****************************************************************************
>  ;* Copyright (C) 2005-2011 x264 project
>  ;*


OK, I suppose, if we want a cleaner diff.

> @@ -112,7 +112,7 @@
>  ; we need more flexible macro.
>  
>  ; RET:
> -; Pops anything that was pushed by PROLOGUE
> +; Pops anything that was pushed by PROLOGUE, and returns.
>  
>  ; REP_RET:
>  ; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons


ok.

> @@ -297,6 +297,9 @@ DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 56]
>  
>  %macro WIN64_SPILL_XMM 1
>      %assign xmm_regs_used %1
> +    %if mmsize == 8
> +        %assign xmm_regs_used 0
> +    %endif
>      ASSERT xmm_regs_used <= 16
>      %if xmm_regs_used > 6
>          sub rsp, (xmm_regs_used-6)*16+16


Great. No need to have a separate xmm_reg to avoid spilling XMM registers
in MMX functions.

> @@ -459,10 +462,24 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
>  
>  %assign function_align 16
>  
> -; Symbol prefix for C linkage
> -%macro cglobal 1-2+
> -    %xdefine %1 mangle(program_name %+ _ %+ %1)
> -    %xdefine %1.skip_prologue %1 %+ .skip_prologue
> +; Begin a function.
> +; Applies any symbol mangling needed for C linkage, and sets up a define such that
> +; subsequent uses of the function name automatically refer to the mangled version.
> +; Appends cpuflags to the function name if cpuflags has been specified.
> +%macro cglobal 1-2+ ; name, [PROLOGUE args]
> +%if %0 == 1
> +    cglobal_internal %1 %+ SUFFIX
> +%else
> +    cglobal_internal %1 %+ SUFFIX, %2
> +%endif
> +%endmacro
> +%macro cglobal_internal 1-2+
> +    %ifndef cglobaled_%1
> +        %xdefine %1 mangle(program_name %+ _ %+ %1)
> +        %xdefine %1.skip_prologue %1 %+ .skip_prologue
> +        CAT_XDEFINE cglobaled_, %1, 1
> +    %endif
> +    %xdefine current_function %1
>      %ifidn __OUTPUT_FORMAT__,elf
>          global %1:function hidden
>      %else
> @@ -479,12 +496,14 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
>  
>  %macro cextern 1
>      %xdefine %1 mangle(program_name %+ _ %+ %1)
> +    CAT_XDEFINE cglobaled_, %1, 1
>      extern %1
>  %endmacro
>  
> -;like cextern, but without the prefix
> +; like cextern, but without the prefix
>  %macro cextern_naked 1
>      %xdefine %1 mangle(%1)
> +    CAT_XDEFINE cglobaled_, %1, 1
>      extern %1
>  %endmacro

Do the above changes do anything functional other than adding the
cpuflags suffix?

> @@ -500,6 +519,58 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
>  SECTION .note.GNU-stack noalloc noexec nowrite progbits
>  %endif
>  
> +; cpuflags
> +
> +%assign cpuflags_mmx      (1<<0)
> +%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
> +%assign cpuflags_sse      (1<<2) | cpuflags_mmx2
> +%assign cpuflags_sse2     (1<<3) | cpuflags_sse
> +%assign cpuflags_sse2slow (1<<4) | cpuflags_sse2
> +%assign cpuflags_sse3     (1<<5) | cpuflags_sse2
> +%assign cpuflags_ssse3    (1<<6) | cpuflags_sse3
> +%assign cpuflags_sse4     (1<<7) | cpuflags_ssse3
> +%assign cpuflags_sse42    (1<<8) | cpuflags_sse4
> +%assign cpuflags_avx      (1<<9) | cpuflags_sse42
> +%assign cpuflags_xop      (1<<10)| cpuflags_avx
> +%assign cpuflags_fma4     (1<<11)| cpuflags_avx
> +
> +%assign cpuflags_cache32  (1<<16)
> +%assign cpuflags_cache64  (1<<17)
> +%assign cpuflags_slowctz  (1<<18)
> +%assign cpuflags_lzcnt    (1<<19)
> +%assign cpuflags_misalign (1<<20)
> +%assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant


We also have 3dnow, 3dnow2, and atom.

> +
> +%define    cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
> +%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
> +
> +; Takes up to 2 cpuflags from the above list.
> +; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu.
> +; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
> +%macro INIT_CPUFLAGS 0-2
> +    %if %0 >= 1
> +        %xdefine cpuname %1
> +        %assign cpuflags cpuflags_%1
> +        %if %0 >= 2
> +            %xdefine cpuname %1_%2
> +            %assign cpuflags cpuflags | cpuflags_%2
> +        %endif
> +        %xdefine SUFFIX _ %+ cpuname
> +        %if cpuflag(avx)
> +            %assign avx_enabled 1
> +        %endif
> +        %if cpuflag(aligned)
> +            %define movu mova
> +        %elifidn %1, sse3
> +            %define movu lddqu
> +        %endif
> +    %else
> +        %xdefine SUFFIX
> +        %undef cpuname
> +        %undef cpuflags
> +    %endif
> +%endmacro
> +
>  ; merge mmx and sse*
>  
>  %macro CAT_XDEFINE 3
> @@ -510,9 +581,9 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
>      %undef %1%2
>  %endmacro
>  
> -%macro INIT_MMX 0
> +%macro INIT_MMX 0-1+
>      %assign avx_enabled 0
> -    %define RESET_MM_PERMUTATION INIT_MMX
> +    %define RESET_MM_PERMUTATION INIT_MMX %1
>      %define mmsize 8
>      %define num_mmregs 8
>      %define mova movq
> @@ -530,11 +601,12 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
>      CAT_UNDEF nmm, %%i
>      %assign %%i %%i+1
>      %endrep
> +    INIT_CPUFLAGS %1
>  %endmacro
>  
> -%macro INIT_XMM 0
> +%macro INIT_XMM 0-1+
>      %assign avx_enabled 0
> -    %define RESET_MM_PERMUTATION INIT_XMM
> +    %define RESET_MM_PERMUTATION INIT_XMM %1
>      %define mmsize 16
>      %define num_mmregs 8
>      %ifdef ARCH_X86_64
> @@ -550,6 +622,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
>      CAT_XDEFINE nxmm, %%i, %%i
>      %assign %%i %%i+1
>      %endrep
> +    INIT_CPUFLAGS %1
>  %endmacro
>  
>  %macro INIT_AVX 0
> @@ -559,9 +632,9 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
>      %define RESET_MM_PERMUTATION INIT_AVX
>  %endmacro
>  
> -%macro INIT_YMM 0
> +%macro INIT_YMM 0-1+
>      %assign avx_enabled 1
> -    %define RESET_MM_PERMUTATION INIT_YMM
> +    %define RESET_MM_PERMUTATION INIT_YMM %1
>      %define mmsize 32
>      %define num_mmregs 8
>      %ifdef ARCH_X86_64
> @@ -569,15 +642,18 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
>      %endif
>      %define mova vmovaps
>      %define movu vmovups
> +    %undef movh
> +    %undef movnta


What is this about? Why are movh and movnta undefined in INIT_YMM?

>      %assign %%i 0
>      %rep num_mmregs
>      CAT_XDEFINE m, %%i, ymm %+ %%i
>      CAT_XDEFINE nymm, %%i, %%i
>      %assign %%i %%i+1
>      %endrep
> +    INIT_CPUFLAGS %1
>  %endmacro
>  
> -INIT_MMX
> +INIT_XMM


Do we have any code that assumes INIT_MMX as the default?

>  
>  ; I often want to use macros that permute their arguments. e.g. there's no
>  ; efficient way to implement butterfly or transpose or dct without swapping some
> @@ -633,31 +709,46 @@ INIT_MMX
>  %endrep
>  %endmacro
>  
> -; If SAVE_MM_PERMUTATION is placed at the end of a function and given the
> -; function name, then any later calls to that function will automatically
> -; load the permutation, so values can be returned in mmregs.
> -%macro SAVE_MM_PERMUTATION 1 ; name to save as
> +; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
> +; calls to that function will automatically load the permutation, so values can
> +; be returned in mmregs.
> +%macro SAVE_MM_PERMUTATION 0-1
> +    %if %0
> +        %xdefine %%f %1_m
> +    %else
> +        %xdefine %%f current_function %+ _m
> +    %endif
>      %assign %%i 0
>      %rep num_mmregs
> -    CAT_XDEFINE %1_m, %%i, m %+ %%i
> +        CAT_XDEFINE %%f, %%i, m %+ %%i
>      %assign %%i %%i+1
>      %endrep
>  %endmacro
>  
>  %macro LOAD_MM_PERMUTATION 1 ; name to load from
> -    %assign %%i 0
> -    %rep num_mmregs
> -    CAT_XDEFINE m, %%i, %1_m %+ %%i
> -    CAT_XDEFINE n, m %+ %%i, %%i
> -    %assign %%i %%i+1
> -    %endrep
> +    %ifdef %1_m0
> +        %assign %%i 0
> +        %rep num_mmregs
> +            CAT_XDEFINE m, %%i, %1_m %+ %%i
> +            CAT_XDEFINE n, m %+ %%i, %%i
> +        %assign %%i %%i+1
> +        %endrep
> +    %endif
>  %endmacro


ok

>  
> +; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
>  %macro call 1
> -    call %1
> -    %ifdef %1_m0
> -        LOAD_MM_PERMUTATION %1
> +    call_internal %1, %1 %+ SUFFIX
> +%endmacro
> +%macro call_internal 2
> +    %xdefine %%i %1
> +    %ifndef cglobaled_%1
> +        %ifdef cglobaled_%2
> +            %xdefine %%i %2
> +        %endif
>      %endif
> +    call %%i
> +    LOAD_MM_PERMUTATION %%i
>  %endmacro


looks ok

>  
>  ; Substitutions that reduce instruction size but are functionally equivalent
> @@ -789,6 +880,8 @@ AVX_INSTR minpd, 1, 0
>  AVX_INSTR minps, 1, 0
>  AVX_INSTR minsd, 1, 0
>  AVX_INSTR minss, 1, 0
> +AVX_INSTR movsd, 1, 0
> +AVX_INSTR movss, 1, 0
>  AVX_INSTR mpsadbw, 0, 1
>  AVX_INSTR mulpd, 1, 0
>  AVX_INSTR mulps, 1, 0


ok

> @@ -903,3 +996,36 @@ AVX_INSTR xorps, 1, 0
>  AVX_INSTR pfadd, 1, 0
>  AVX_INSTR pfsub, 1, 0
>  AVX_INSTR pfmul, 1, 0
> +
> +; base-4 constants for shuffles
> +%assign i 0
> +%rep 256
> +    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
> +    %if j < 10
> +        CAT_XDEFINE q000, j, i
> +    %elif j < 100
> +        CAT_XDEFINE q00, j, i
> +    %elif j < 1000
> +        CAT_XDEFINE q0, j, i
> +    %else
> +        CAT_XDEFINE q, j, i
> +    %endif
> +%assign i i+1
> +%endrep
> +%undef i
> +%undef j


Interesting. How does one use these q0000..q3333 constants?

> +
> +%macro FMA_INSTR 3
> +    %macro %1 4-7 %1, %2, %3
> +        %if cpuflag(xop)
> +            v%5 %1, %2, %3, %4
> +        %else
> +            %6 %1, %2, %3
> +            %7 %1, %4
> +        %endif
> +    %endmacro
> +%endmacro
> +
> +FMA_INSTR  pmacsdd,  pmulld, paddd
> +FMA_INSTR  pmacsww,  pmullw, paddw
> +FMA_INSTR pmadcswd, pmaddwd, paddd


FMA emulation. Neat.

Thanks,
Justin
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to