---
 libavcodec/x86/hevc_idct.asm  | 710 ++++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/hevcdsp_init.c |  21 ++
 2 files changed, 731 insertions(+)

diff --git a/libavcodec/x86/hevc_idct.asm b/libavcodec/x86/hevc_idct.asm
index d662aa9..848812b 100644
--- a/libavcodec/x86/hevc_idct.asm
+++ b/libavcodec/x86/hevc_idct.asm
@@ -2,6 +2,7 @@
 ;* SIMD-optimized IDCT functions for HEVC decoding
 ;* Copyright (c) 2014 Pierre-Edouard LEPERE
 ;* Copyright (c) 2014 James Almer
+;* Copyright (c) 2016 Alexandra Hájková
 ;*
 ;* This file is part of Libav.
 ;*
@@ -22,6 +23,217 @@
 
 %include "libavutil/x86/x86util.asm"
 
+SECTION_RODATA
+
+pd_64: times 4 dd 64
+pd_2048: times 4 dd 2048
+pd_512: times 4 dd 512
+
+; 4x4 transform coeffs
+pw_64: times 8 dw 64
+pw_64_m64: times 4 dw 64, -64
+pw_83_36: times 4 dw 83, 36
+pw_36_m83: times 4 dw 36, -83
+
+; 8x8 transform coeffs
+pw_89_75: times 4 dw 89, 75
+pw_50_18: times 4 dw 50, 18
+
+pw_75_m18: times 4 dw 75, -18
+pw_m89_m50: times 4 dw -89, -50
+
+pw_50_m89: times 4 dw 50, -89
+pw_18_75: times 4 dw 18, 75
+
+pw_18_m50: times 4 dw 18, -50
+pw_75_m89: times 4 dw 75, -89
+
+; 16x16 transformation coeffs
+pw_90_87: times 4 dw 90, 87
+pw_80_70: times 4 dw 80, 70
+pw_57_43: times 4 dw 57, 43
+pw_25_9: times 4 dw 25, 9
+
+pw_87_57: times 4 dw 87, 57
+pw_9_m43: times 4 dw 9, -43
+pw_m80_m90: times 4 dw -80, -90
+pw_m70_m25: times 4 dw -70, -25
+
+pw_80_9: times 4 dw 80, 9
+pw_m70_m87: times 4 dw -70, -87
+pw_m25_57: times 4 dw -25, 57
+pw_90_43: times 4 dw 90, 43
+
+pw_70_m43: times 4 dw 70, -43
+pw_m87_9: times 4 dw -87, 9
+pw_90_25: times 4 dw 90, 25
+pw_m80_m57: times 4 dw -80, -57
+
+pw_57_m80: times 4 dw 57, -80
+pw_m25_90: times 4 dw -25, 90
+pw_m9_m87: times 4 dw -9, -87
+pw_43_70: times 4 dw 43, 70
+
+pw_43_m90: times 4 dw 43, -90
+pw_57_25: times 4 dw 57, 25
+pw_m87_70: times 4 dw -87, 70
+pw_9_m80: times 4 dw 9, -80
+
+pw_25_m70: times 4 dw 25, -70
+pw_90_m80: times 4 dw 90, -80
+pw_43_9: times 4 dw 43, 9
+pw_m57_87: times 4 dw -57, 87
+
+pw_9_m25: times 4 dw 9, -25
+pw_43_m57: times 4 dw 43, -57
+pw_70_m80: times 4 dw 70, -80
+pw_87_m90: times 4 dw 87, -90
+
+; 32x32 transform coeffs
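+; (each table pairs two constants from the odd rows of the 32-point
+; transform matrix per dword, so one pmaddwd gives c0*a + c1*b per lane)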
+trans_coeff32_0: times 8 dw 90
+times 4 dw 88, 85
+times 4 dw 82, 78
+times 4 dw 73, 67
+times 4 dw 61, 54
+times 4 dw 46, 38
+times 4 dw 31, 22
+times 4 dw 13, 4
+
+trans_coeff32_1: times 4 dw 90, 82
+times 4 dw 67, 46
+times 4 dw 22, -4
+times 4 dw -31, -54
+times 4 dw -73, -85
+times 4 dw -90, -88
+times 4 dw -78, -61
+times 4 dw -38, -13
+
+trans_coeff32_2: times 4 dw 88, 67
+times 4 dw 31, -13
+times 4 dw -54, -82
+times 4 dw -90, -78
+times 4 dw -46, -4
+times 4 dw 38, 73
+times 4 dw 90, 85
+times 4 dw 61, 22
+
+trans_coeff32_3: times 4 dw 85, 46
+times 4 dw -13, -67
+times 4 dw -90, -73
+times 4 dw -22, 38
+times 4 dw 82, 88
+times 4 dw 54, -4
+times 4 dw -61, -90
+times 4 dw -78, -31
+
+trans_coeff32_4: times 4 dw 82, 22
+times 4 dw -54, -90
+times 4 dw -61, 13
+times 4 dw 78, 85
+times 4 dw 31, -46
+times 4 dw -90, -67
+times 4 dw 4, 73
+times 4 dw 88, 38
+
+trans_coeff32_5: times 4 dw 78, -4
+times 4 dw -82, -73
+times 4 dw 13, 85
+times 4 dw 67, -22
+times 4 dw -88, -61
+times 4 dw 31, 90
+times 4 dw 54, -38
+times 4 dw -90, -46
+
+trans_coeff32_6: times 4 dw 73, -31
+times 4 dw -90, -22
+times 4 dw 78, 67
+times 4 dw -38, -90
+times 4 dw -13, 82
+times 4 dw 61, -46
+times 4 dw -88, -4
+times 4 dw 85, 54
+
+trans_coeff32_7: times 4 dw 67, -54
+times 4 dw -78, 38
+times 4 dw 85, -22
+times 4 dw -90, 4
+times 4 dw 90, 13
+times 4 dw -88, -31
+times 4 dw 82, 46
+times 4 dw -73, -61
+
+trans_coeff32_8: times 4 dw 61, -73
+times 4 dw -46, 82
+times 4 dw 31, -88
+times 4 dw -13, 90
+times 4 dw -4, -90
+times 4 dw 22, 85
+times 4 dw -38, -78
+times 4 dw 54, 67
+
+trans_coeff32_9: times 4 dw 54, -85
+times 4 dw -4, 88
+times 4 dw -46, -61
+times 4 dw 82, 13
+times 4 dw -90, 38
+times 4 dw 67, -78
+times 4 dw -22, 90
+times 4 dw -31, -73
+
+trans_coeff32_10: times 4 dw 46, -90
+times 4 dw 38, 54
+times 4 dw -90, 31
+times 4 dw 61, -88
+times 4 dw 22, 67
+times 4 dw -85, 13
+times 4 dw 73, -82
+times 4 dw 4, 78
+
+trans_coeff32_11: times 4 dw 38, -88
+times 4 dw 73, -4
+times 4 dw -67, 90
+times 4 dw -46, -31
+times 4 dw 85, -78
+times 4 dw 13, 61
+times 4 dw -90, 54
+times 4 dw 22, -82
+
+trans_coeff32_12: times 4 dw 31, -78
+times 4 dw 90, -61
+times 4 dw 4, 54
+times 4 dw -88, 82
+times 4 dw -38, -22
+times 4 dw 73, -90
+times 4 dw 67, -13
+times 4 dw -46, 85
+
+trans_coeff32_13: times 4 dw 22, -61
+times 4 dw 85, -90
+times 4 dw 73, -38
+times 4 dw -4, 46
+times 4 dw -78, 90
+times 4 dw -82, 54
+times 4 dw -13, -31
+times 4 dw 67, -88
+
+trans_coeff32_14: times 4 dw 13, -38
+times 4 dw 61, -78
+times 4 dw 88, -90
+times 4 dw 85, -73
+times 4 dw 54, -31
+times 4 dw 4, 22
+times 4 dw -46, 67
+times 4 dw -82, 90
+
+trans_coeff32_15: times 4 dw 4, -13
+times 4 dw 22, -31
+times 4 dw 38, -46
+times 4 dw 54, -61
+times 4 dw 67, -73
+times 4 dw 78, -82
+times 4 dw 85, -88
+times 4 dw 90, -90
+
 section .text
 
 ; void ff_hevc_idctHxW_dc_{8,10}_<opt>(int16_t *coeffs)
@@ -74,6 +286,492 @@ cglobal hevc_idct_%1x%1_dc_%2, 1, 2, 1, coeff, tmp
     RET
 %endmacro
 
+; IDCT 4x4, expects input in m0, m1
+; %1 - shift
+; %2 - 1: scale, clip to int16 and transpose the result; 0: keep dword sums in m0-m3
+%macro TR_4x4 2
+    ; interleaves src0 with src2 into m0
+    ;         and src1 with src3 into m1
+    ; src0: 00 01 02 03     m0: 00 20 01 21 02 22 03 23
+    ; src1: 10 11 12 13 -->
+    ; src2: 20 21 22 23     m1: 10 30 11 31 12 32 13 33
+    ; src3: 30 31 32 33
+
+    SBUTTERFLY wd, 0, 1, 2
+
+    pmaddwd m2, m0, [pw_64] ; e0
+    pmaddwd m3, m1, [pw_83_36] ; o0
+    pmaddwd m0, [pw_64_m64] ; e1
+    pmaddwd m1, [pw_36_m83] ; o1
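+    ; per column, in scalar terms:
+    ;   e0 = 64 * (s0 + s2)    o0 = 83 * s1 + 36 * s3
+    ;   e1 = 64 * (s0 - s2)    o1 = 36 * s1 - 83 * s3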
+
+%if %2 == 1
+    %assign %%add 1 << (%1 - 1)
+    mova m4, [pd_ %+ %%add]
+    paddd m2, m4
+    paddd m0, m4
+%endif
+
+    SUMSUB_BADC d, 3, 2, 1, 0, 4
+
+%if %2 == 1
+    psrad m3, %1 ; e0 + o0
+    psrad m1, %1 ; e1 + o1
+    psrad m2, %1 ; e0 - o0
+    psrad m0, %1 ; e1 - o1
+    ; clip16
+    packssdw m3, m1
+    packssdw m0, m2
+    ; Transpose
+    SBUTTERFLY wd, 3, 0, 1
+    SBUTTERFLY wd, 3, 0, 1
+    SWAP 3, 1, 0
+%else
+    SWAP 3, 0
+    SWAP 3, 2
+%endif
+%endmacro
+
+%macro DEFINE_BIAS 1
+    %assign shift (20 - %1)
+    %assign c_add (1 << (shift - 1))
+    %define arr_add pd_ %+ c_add
+%endmacro
+
+; %1 - bit_depth
+; %2 - register the add constant is loaded into
+; shift = 20 - bit_depth
+%macro LOAD_BIAS 2
+    DEFINE_BIAS %1
+    mova %2, [arr_add]
+%endmacro
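+
+; e.g. bit depth 8:  shift = 12, add = 2048 (pd_2048)
+;      bit depth 10: shift = 10, add = 512  (pd_512)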
+
+; %1, %2 - registers to load packed 16 bit values to
+; %3, %4, %5, %6 - vertical offsets
+; %7 - horizontal offset
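+; rows %3/%5 land in the low/high qwords of %1,
+; rows %4/%6 in the low/high qwords of %2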
+%macro LOAD_BLOCK 7
+    movq   %1, [coeffsq + %3 + %7]
+    movhps %1, [coeffsq + %5 + %7]
+    movq   %2, [coeffsq + %4 + %7]
+    movhps %2, [coeffsq + %6 + %7]
+%endmacro
+
+; void ff_hevc_idct_4x4_{8,10}_<opt>(int16_t *coeffs, int col_limit)
+; %1 = bitdepth
+%macro IDCT_4x4 1
+cglobal hevc_idct_4x4_ %+ %1, 1, 14, 14, coeffs
+    mova m0, [coeffsq]
+    mova m1, [coeffsq + 16]
+
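+    ; two passes: shift 7, then 20 - bit depth (with a transpose in between)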
+    TR_4x4 7, 1
+    TR_4x4 20 - %1, 1
+
+    mova [coeffsq], m0
+    mova [coeffsq + 16], m1
+    RET
+%endmacro
+
+; store intermediate e16 coeffs on stack
+; as 8x4 matrix - writes 128 bytes to stack
+; from %5: e8 + o8, with %1 offset
+; and  %3:  e8 - o8, with %2 offset
+; %4 - shift, unused here
+%macro STORE_16 5
+    movu    [rsp + %1], %5
+    movu    [rsp + %2], %3
+%endmacro
+
+; scale, pack (clip16) and store the residuals     0 e8[0] + o8[0] --> + %1
+; 4 at one time (4 columns)                        1 e8[1] + o8[1]
+; from %5: e8/16 + o8/16, with %1 offset                  ...
+; and  %3: e8/16 - o8/16, with %2 offset           6 e8[1] - o8[1]
+; %4 - shift                                       7 e8[0] - o8[0] --> + %2
+%macro STORE_8 5
+    psrad    %5, %4
+    psrad    %3, %4
+    packssdw  %5, %3
+    movq      [coeffsq + %1], %5
+    movhps    [coeffsq + %2], %5
+%endmacro
+
+; %1 - horizontal offset
+; %2 - shift
+; %3, %4 - transform coeffs
+; %5 - vertical offset for e8 + o8
+; %6 - vertical offset for e8 - o8
+; %7 - register with e8 inside
+; %8 - block_size
+%macro E8_O8 8
+    pmaddwd m6, m4, %3
+    pmaddwd m7, m5, %4
+    paddd m6, m7
+
+%if %8 == 8
+    paddd %7, m8
+%endif
+
+    paddd m7, m6, %7 ; o8 + e8
+    psubd %7, m6     ; e8 - o8
+    STORE_%8 %5 + %1, %6 + %1, %7, %2, m7
+%endmacro
+
+; 8x4 residuals are processed and stored
+; %1 - horizontal offset
+; %2 - shift
+; %3 - offset of the even row
+; %4 - step: 1 for 8x8, 2 for 16x16, 4 for 32x32
+; %5 - offset of the odd row
+; %6 - block size
+%macro TR_8x4 6
+    ; load 4 columns of even rows
+    LOAD_BLOCK  m0, m1, 0, 2 * %4 * %3, %4 * %3, 3 * %4 * %3, %1
+
+    TR_4x4 7, 0 ; e8: m0, m1, m2, m3, for 4 columns only
+
+    ; load 4 columns of odd rows
+    LOAD_BLOCK m4, m5, %4 * %5, 3 * %4 * %5, 5 * %4 * %5, 7 * %4 * %5, %1
+
+    ; 00 01 02 03
+    ; 10 11 12 13      m4: 10 30 11 31 12 32 13 33
+
+    ; ...        -- >
+    ;                  m5: 50 70 51 71 52 72 53 73
+    ; 70 71 72 73
+    SBUTTERFLY wd, 4, 5, 6
+
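+    ; each E8_O8 call below yields one (e8 + o8, e8 - o8) pair,
+    ; i.e. output rows i and 7 - i of the 8-point transform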
+    E8_O8 %1, %2, [pw_89_75],  [pw_50_18],   0,      %5 * 7, m0, %6
+    E8_O8 %1, %2, [pw_75_m18], [pw_m89_m50], %5,     %5 * 6, m1, %6
+    E8_O8 %1, %2, [pw_50_m89], [pw_18_75],   %5 * 2, %5 * 5, m2, %6
+    E8_O8 %1, %2, [pw_18_m50], [pw_75_m89],  %5 * 3, %5 * 4, m3, %6
+%endmacro
+
+%macro STORE_PACKED 7
+    movq     [coeffsq + %3 + %7], %1
+    movhps   [coeffsq + %4 + %7], %1
+    movq     [coeffsq + %5 + %7], %2
+    movhps   [coeffsq + %6 + %7], %2
+%endmacro
+
+; transpose a 4x4 block of words packed in m4, m5;
+; the result stays in m4, m5
+%macro TRANSPOSE 0
+    SBUTTERFLY wd, 4, 5, 8
+    SBUTTERFLY dq, 4, 5, 8
+%endmacro
+
+; %1 - horizontal offset of the block i
+; %2 - vertical offset of the block i
+; %3 - width in bytes
+; %4 - vertical offset for the block j
+; %5 - horizontal offset for the block j
+%macro SWAP_BLOCKS 5
+    ; M_i
+    LOAD_BLOCK m6, m7, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
+
+    ; M_j
+    LOAD_BLOCK m4, m5, %4, %4 + %3, %4 + 2 * %3, %4 + 3 * %3, %5
+    TRANSPOSE
+    STORE_PACKED m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
+
+    ; transpose and store M_i
+    SWAP m6, m4
+    SWAP m7, m5
+    TRANSPOSE
+    STORE_PACKED m4, m5, %4, %4 + %3, %4 + 2 * %3, %4 + 3 * %3, %5
+%endmacro
+
+; %1 - horizontal offset
+; %2 - vertical offset of the block
+; %3 - width in bytes
+%macro TRANSPOSE_BLOCK 3
+    LOAD_BLOCK m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
+    TRANSPOSE
+    STORE_PACKED m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
+%endmacro
+
+%macro TRANSPOSE_8x8 0
+    ; M1 M2 ^T = M1^t M3^t
+    ; M3 M4      M2^t M4^t
+
+    ; M1 4x4 block
+    TRANSPOSE_BLOCK 0, 0, 16
+
+    ; M2 and M3
+    SWAP_BLOCKS 0, 64, 16, 0, 8
+
+    ; M4
+    TRANSPOSE_BLOCK 8, 64, 16
+%endmacro
+
+; void ff_hevc_idct_8x8_{8,10}_<opt>(int16_t *coeffs, int col_limit)
+; %1 = bitdepth
+%macro IDCT_8x8 1
+cglobal hevc_idct_8x8_ %+ %1, 1, 14, 14, coeffs
+    mova m8, [pd_64]
+    TR_8x4 0, 7, 32, 1, 16, 8
+    TR_8x4 8, 7, 32, 1, 16, 8
+
+    TRANSPOSE_8x8
+
+    LOAD_BIAS %1, m8
+    TR_8x4 0, shift, 32, 1, 16, 8
+    TR_8x4 8, shift, 32, 1, 16, 8
+
+    TRANSPOSE_8x8
+
+    RET
+%endmacro
+
+; %1, %2 - transform constants
+; %3, %4 - regs with interleaved coeffs
+; accumulates both pmaddwd results into m10
+%macro ADD 4
+    pmaddwd m8, %3, %1
+    pmaddwd m9, %4, %2
+    paddd   m8, m9
+    paddd   m10, m8
+%endmacro
+
+; %1 ... %4 transform coeffs
+; %5, %6 offsets for storing e+o/e-o back to coeffsq
+; %7 - shift
+; %8 - add
+; %9 - block_size
+%macro E16_O16 9
+    pxor m10, m10
+    ADD %1, %2, m0, m1
+    ADD %3, %4, m2, m3
+
+    movu m4, [rsp + %5]
+%if %9 == 8
+    paddd m4, %8
+%endif
+
+    paddd m5, m10, m4 ; o16 + e16
+    psubd m4, m10      ; e16 - o16
+    STORE_%9 %5, %6, m4, %7, m5
+%endmacro
+
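+; parameters, as used by the 16x16 and 32x32 call sites:
+; %1 - horizontal offset
+; %2 - shift
+; %3 - rounding bias (memory operand)
+; %4 - even-row offset and %5 - step, both passed on to TR_8x4
+; %6 - row stride in bytes (also the TR_8x4 odd-row offset)
+; %7 - E16_O16 store size: 8 (scale and store residuals) or 16 (keep on stack)
+; %8 - TR_8x4 store size
+; %9 - odd-row step: 1 for 16x16, 2 for 32x32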
+%macro TR_16x4 9
+    mova m12, [pd_64]
+
+    ; produce 8x4 matrix of e16 coeffs
+    ; for 4 first rows and store it on stack (128 bytes)
+    TR_8x4 %1, 7, %4, %5, %6, %8
+
+    ; load 8 even rows
+    LOAD_BLOCK m0, m1, %9 * %6, %9 * 3 * %6, %9 * 5 * %6, %9 * 7 * %6, %1
+    LOAD_BLOCK m2, m3, %9 * 9 * %6, %9 * 11 * %6, %9 * 13 * %6, %9 * 15 * %6, %1
+
+    SBUTTERFLY wd, 0, 1, 4
+    SBUTTERFLY wd, 2, 3, 4
+
+    mova m7, %3
+
+    E16_O16 [pw_90_87], [pw_80_70], [pw_57_43], [pw_25_9],  0 + %1, 15 * %6 + %1, %2, m7, %7
+    E16_O16 [pw_87_57], [pw_9_m43], [pw_m80_m90], [pw_m70_m25], %6 + %1, 14 * %6 + %1, %2, m7, %7
+    E16_O16 [pw_80_9], [pw_m70_m87], [pw_m25_57], [pw_90_43], 2 * %6 + %1, 13 * %6 + %1, %2, m7, %7
+    E16_O16 [pw_70_m43], [pw_m87_9], [pw_90_25], [pw_m80_m57], 3 * %6 + %1, 12 * %6 + %1, %2, m7, %7
+    E16_O16 [pw_57_m80], [pw_m25_90], [pw_m9_m87], [pw_43_70], 4 * %6 + %1, 11 * %6 + %1, %2, m7, %7
+    E16_O16 [pw_43_m90], [pw_57_25], [pw_m87_70], [pw_9_m80], 5 * %6 + %1, 10 * %6 + %1, %2, m7, %7
+    E16_O16 [pw_25_m70], [pw_90_m80], [pw_43_9], [pw_m57_87], 6 * %6 + %1, 9 * %6 + %1, %2, m7, %7
+    E16_O16 [pw_9_m25], [pw_43_m57], [pw_70_m80], [pw_87_m90], 7 * %6 + %1, 8 * %6 + %1, %2, m7, %7
+%endmacro
+
+%macro TRANSPOSE_16x16 0
+    ; M1  M2  M3  M4 ^T      m1 m5 m9  m13   M_i^T = m_i
+    ; M5  M6  M7  M8    -->  m2 m6 m10 m14
+    ; M9  M10 M11 M12        m3 m7 m11 m15
+    ; M13 M14 M15 M16        m4 m8 m12 m16
+
+    ; M1 4x4 block
+    TRANSPOSE_BLOCK 0, 0, 32
+
+    ; M5, M2
+    SWAP_BLOCKS 0, 128, 32, 0, 8
+    ; M9, M3
+    SWAP_BLOCKS 0, 256, 32, 0, 16
+    ; M13, M4
+    SWAP_BLOCKS 0, 384, 32, 0, 24
+
+    ; M6
+    TRANSPOSE_BLOCK 8, 128, 32
+
+    ; M10, M7
+    SWAP_BLOCKS 8, 256, 32, 128, 16
+    ; M14, M8
+    SWAP_BLOCKS 8, 384, 32, 128, 24
+
+    ; M11
+    TRANSPOSE_BLOCK 16, 256, 32
+
+    ; M15, M12
+    SWAP_BLOCKS 16, 384, 32, 256, 24
+
+    ; M16
+    TRANSPOSE_BLOCK 24, 384, 32
+%endmacro
+
+; void ff_hevc_idct_16x16_{8,10}_<opt>(int16_t *coeffs, int col_limit)
+; %1 = bitdepth
+%macro IDCT_16x16 1
+cglobal hevc_idct_16x16_ %+ %1, 1, 1, 15, 1024, coeffs
+    TR_16x4 0, 7, [pd_64], 64, 2, 32, 8, 16, 1
+    TR_16x4 8, 7, [pd_64], 64, 2, 32, 8, 16, 1
+    TR_16x4 16, 7, [pd_64], 64, 2, 32, 8, 16, 1
+    TR_16x4 24, 7, [pd_64], 64, 2, 32, 8, 16, 1
+    TRANSPOSE_16x16
+
+    DEFINE_BIAS %1
+    TR_16x4 0, shift, [arr_add], 64, 2, 32, 8, 16, 1
+    TR_16x4 8, shift, [arr_add], 64, 2, 32, 8, 16, 1
+    TR_16x4 16, shift, [arr_add], 64, 2, 32, 8, 16, 1
+    TR_16x4 24, shift, [arr_add], 64, 2, 32, 8, 16, 1
+    TRANSPOSE_16x16
+
+    RET
+%endmacro
+
+; %1 - transform coeffs base label
+; %2 - stack offset of e32; also the e+o store offset
+; %3 - e-o store offset
+; %4 - shift
+%macro E32_O32 4
+    pxor m10, m10
+    ADD [%1], [%1 + 16], m0, m1
+    ADD [%1 + 2 * 16], [%1 + 3 * 16], m2, m3
+    ADD [%1 + 4 * 16], [%1 + 5 * 16], m4, m5
+    ADD [%1 + 6 * 16], [%1 + 7 * 16], m6, m7
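+    ; m10 = o32 for this output row: a 16-term dot product of the odd
+    ; input rows with the constants at %1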
+
+    movu m11, [rsp + %2]
+    paddd m11, m14
+    paddd m12, m10, m11 ; o32 + e32
+    psubd m11, m10      ; e32 - o32
+    STORE_8 %2, %3, m11, %4, m12
+%endmacro
+
+; %1 - horizontal offset
+; %2 - bitdepth
+; %3 - 1 for the first pass (shift 7), 0 for the second (shift 20 - bitdepth)
+%macro TR_32x4 3
+    TR_16x4 %1, 7, [pd_64], 128, 4, 64, 16, 16, 2
+
+    LOAD_BLOCK m0, m1,      64,  3 * 64,  5 * 64,  7 * 64, %1
+    LOAD_BLOCK m2, m3,  9 * 64, 11 * 64, 13 * 64, 15 * 64, %1
+    LOAD_BLOCK m4, m5, 17 * 64, 19 * 64, 21 * 64, 23 * 64, %1
+    LOAD_BLOCK m6, m7, 25 * 64, 27 * 64, 29 * 64, 31 * 64, %1
+
+    SBUTTERFLY wd, 0, 1, 8
+    SBUTTERFLY wd, 2, 3, 8
+    SBUTTERFLY wd, 4, 5, 8
+    SBUTTERFLY wd, 6, 7, 8
+
+%if %3 == 1
+    %assign shift 7
+    mova m14, [pd_64]
+%else
+    LOAD_BIAS %2, m14
+%endif
+
+    E32_O32 trans_coeff32_0,           %1, 31 * 64 + %1, shift
+    E32_O32 trans_coeff32_1,      64 + %1, 30 * 64 + %1, shift
+    E32_O32 trans_coeff32_2,  2 * 64 + %1, 29 * 64 + %1, shift
+    E32_O32 trans_coeff32_3,  3 * 64 + %1, 28 * 64 + %1, shift
+    E32_O32 trans_coeff32_4,  4 * 64 + %1, 27 * 64 + %1, shift
+    E32_O32 trans_coeff32_5,  5 * 64 + %1, 26 * 64 + %1, shift
+    E32_O32 trans_coeff32_6,  6 * 64 + %1, 25 * 64 + %1, shift
+    E32_O32 trans_coeff32_7,  7 * 64 + %1, 24 * 64 + %1, shift
+    E32_O32 trans_coeff32_8,  8 * 64 + %1, 23 * 64 + %1, shift
+    E32_O32 trans_coeff32_9,  9 * 64 + %1, 22 * 64 + %1, shift
+    E32_O32 trans_coeff32_10, 10 * 64 + %1, 21 * 64 + %1, shift
+    E32_O32 trans_coeff32_11, 11 * 64 + %1, 20 * 64 + %1, shift
+    E32_O32 trans_coeff32_12, 12 * 64 + %1, 19 * 64 + %1, shift
+    E32_O32 trans_coeff32_13, 13 * 64 + %1, 18 * 64 + %1, shift
+    E32_O32 trans_coeff32_14, 14 * 64 + %1, 17 * 64 + %1, shift
+    E32_O32 trans_coeff32_15, 15 * 64 + %1, 16 * 64 + %1, shift
+%endmacro
+
+%macro TRANSPOSE_32x32 0
+    ; M0  M1 ... M7
+    ; M8         M15
+    ;
+    ; ...
+    ;
+    ; M56        M63
+
+    TRANSPOSE_BLOCK 0, 0, 64 ; M0
+
+    SWAP_BLOCKS 0,     256, 64, 0,     8 ; M8,  M1
+    SWAP_BLOCKS 0, 2 * 256, 64, 0, 2 * 8 ; M16, M2
+    SWAP_BLOCKS 0, 3 * 256, 64, 0, 3 * 8 ; M24, M3
+    SWAP_BLOCKS 0, 4 * 256, 64, 0, 4 * 8
+    SWAP_BLOCKS 0, 5 * 256, 64, 0, 5 * 8
+    SWAP_BLOCKS 0, 6 * 256, 64, 0, 6 * 8
+    SWAP_BLOCKS 0, 7 * 256, 64, 0, 7 * 8
+
+    TRANSPOSE_BLOCK 8, 256, 64 ; M9
+    SWAP_BLOCKS 8, 2 * 256, 64, 256, 2 * 8 ; M17, M10
+    SWAP_BLOCKS 8, 3 * 256, 64, 256, 3 * 8
+    SWAP_BLOCKS 8, 4 * 256, 64, 256, 4 * 8
+    SWAP_BLOCKS 8, 5 * 256, 64, 256, 5 * 8
+    SWAP_BLOCKS 8, 6 * 256, 64, 256, 6 * 8
+    SWAP_BLOCKS 8, 7 * 256, 64, 256, 7 * 8
+
+    TRANSPOSE_BLOCK 2 * 8, 2 * 256, 64 ; M18
+    SWAP_BLOCKS 2 * 8, 3 * 256, 64, 2 * 256, 3 * 8
+    SWAP_BLOCKS 2 * 8, 4 * 256, 64, 2 * 256, 4 * 8
+    SWAP_BLOCKS 2 * 8, 5 * 256, 64, 2 * 256, 5 * 8
+    SWAP_BLOCKS 2 * 8, 6 * 256, 64, 2 * 256, 6 * 8
+    SWAP_BLOCKS 2 * 8, 7 * 256, 64, 2 * 256, 7 * 8
+
+    TRANSPOSE_BLOCK 3 * 8, 3 * 256, 64 ; M27
+    SWAP_BLOCKS 3 * 8, 4 * 256, 64, 3 * 256, 4 * 8
+    SWAP_BLOCKS 3 * 8, 5 * 256, 64, 3 * 256, 5 * 8
+    SWAP_BLOCKS 3 * 8, 6 * 256, 64, 3 * 256, 6 * 8
+    SWAP_BLOCKS 3 * 8, 7 * 256, 64, 3 * 256, 7 * 8
+
+    TRANSPOSE_BLOCK 4 * 8, 4 * 256, 64 ; M36
+    SWAP_BLOCKS 4 * 8, 5 * 256, 64, 4 * 256, 5 * 8
+    SWAP_BLOCKS 4 * 8, 6 * 256, 64, 4 * 256, 6 * 8
+    SWAP_BLOCKS 4 * 8, 7 * 256, 64, 4 * 256, 7 * 8
+
+    TRANSPOSE_BLOCK 5 * 8, 5 * 256, 64 ; M45
+    SWAP_BLOCKS 5 * 8, 6 * 256, 64, 5 * 256, 6 * 8
+    SWAP_BLOCKS 5 * 8, 7 * 256, 64, 5 * 256, 7 * 8
+
+    TRANSPOSE_BLOCK 6 * 8, 6 * 256, 64 ; M54
+    SWAP_BLOCKS 6 * 8, 7 * 256, 64, 6 * 256, 7 * 8
+
+    TRANSPOSE_BLOCK 7 * 8, 7 * 256, 64 ; M63
+%endmacro
+
+; void ff_hevc_idct_32x32_{8,10}_<opt>(int16_t *coeffs, int col_limit)
+; %1 = bitdepth
+%macro IDCT_32x32 1
+cglobal hevc_idct_32x32_ %+ %1, 1, 1, 15, 4096, coeffs
+    TR_32x4 0, %1, 1
+    TR_32x4 8, %1, 1
+    TR_32x4 16, %1, 1
+    TR_32x4 24, %1, 1
+    TR_32x4 32, %1, 1
+    TR_32x4 40, %1, 1
+    TR_32x4 48, %1, 1
+    TR_32x4 56, %1, 1
+
+    TRANSPOSE_32x32
+
+    TR_32x4 0, %1, 0
+    TR_32x4 8, %1, 0
+    TR_32x4 16, %1, 0
+    TR_32x4 24, %1, 0
+    TR_32x4 32, %1, 0
+    TR_32x4 40, %1, 0
+    TR_32x4 48, %1, 0
+    TR_32x4 56, %1, 0
+
+    TRANSPOSE_32x32
+
+    RET
+%endmacro
+
 ; 8-bit
 INIT_MMX mmxext
 IDCT_DC_NL  4,      8
@@ -84,6 +782,12 @@ IDCT_DC_NL  8,      8
 IDCT_DC    16,  4,  8
 IDCT_DC    32, 16,  8
 
+INIT_XMM avx
+IDCT_4x4 8
+IDCT_8x8 8
+IDCT_16x16 8
+IDCT_32x32 8
+
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
 IDCT_DC    16,  2,  8
@@ -100,6 +804,12 @@ IDCT_DC_NL  8,     10
 IDCT_DC    16,  4, 10
 IDCT_DC    32, 16, 10
 
+INIT_XMM avx
+IDCT_4x4 10
+IDCT_8x8 10
+IDCT_16x16 10
+IDCT_32x32 10
+
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
 IDCT_DC    16,  2, 10
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 1a675ab..2b107ab 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -78,6 +78,15 @@ IDCT_FUNCS(32x32, sse2);
 IDCT_FUNCS(16x16, avx2);
 IDCT_FUNCS(32x32, avx2);
 
+void ff_hevc_idct_4x4_8_avx(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_4x4_10_avx(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_8x8_8_avx(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_8x8_10_avx(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_16x16_8_avx(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_16x16_10_avx(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_32x32_8_avx(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_32x32_10_avx(int16_t *coeffs, int col_limit);
+
 #define GET_PIXELS(width, depth, cf)                                                                     \
 void ff_hevc_get_pixels_ ## width ## _ ## depth ## _ ## cf(int16_t *dst, ptrdiff_t dststride,            \
                                                            uint8_t *src, ptrdiff_t srcstride,            \
@@ -270,6 +279,7 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
             c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
 
+
             c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_sse2;
             c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_sse2;
             c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_sse2;
@@ -329,6 +339,11 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
 #if HAVE_AVX_EXTERNAL
             SET_QPEL_FUNCS(1, 1, 8, avx, hevc_qpel_hv);
             SET_EPEL_FUNCS(1, 1, 8, avx, hevc_epel_hv);
+
+            c->idct[0]         = ff_hevc_idct_4x4_8_avx;
+            c->idct[1]         = ff_hevc_idct_8x8_8_avx;
+            c->idct[2]         = ff_hevc_idct_16x16_8_avx;
+            c->idct[3]         = ff_hevc_idct_32x32_8_avx;
 #endif /* HAVE_AVX_EXTERNAL */
         }
         if (EXTERNAL_AVX2(cpu_flags)) {
@@ -354,6 +369,12 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             SET_EPEL_FUNCS(0, 1, 10, avx, ff_hevc_epel_h);
             SET_EPEL_FUNCS(1, 0, 10, avx, ff_hevc_epel_v);
             SET_EPEL_FUNCS(1, 1, 10, avx, hevc_epel_hv);
+
+            c->idct[0] = ff_hevc_idct_4x4_10_avx;
+            c->idct[1] = ff_hevc_idct_8x8_10_avx;
+            c->idct[2] = ff_hevc_idct_16x16_10_avx;
+            c->idct[3] = ff_hevc_idct_32x32_10_avx;
+
 #endif /* HAVE_AVX_EXTERNAL */
         }
         if (EXTERNAL_AVX2(cpu_flags)) {
-- 
2.1.4
