# HG changeset patch # User Gopi Satykrishna Akisetty <gopi.satykris...@multicorewareinc.com> # Date 1513930936 -19800 # Fri Dec 22 13:52:16 2017 +0530 # Node ID a2224f4d257cf5f5cd391f455aae3117b7fe65ab # Parent e883724b1af9f60e9d91be3aa6fe7b949e782684 [x265-avx512]x86: AVX512 optimize idct16x16 AVX2 Performance : 11.63x AVX512 Performance (old) : 13.07x AVX512 Performance (opt) : 13.72x
Overall 15.23% gains over avx2 diff -r e883724b1af9 -r a2224f4d257c source/common/x86/dct8.asm --- a/source/common/x86/dct8.asm Tue Dec 26 15:35:24 2017 +0530 +++ b/source/common/x86/dct8.asm Fri Dec 22 13:52:16 2017 +0530 @@ -288,6 +288,10 @@ idct16_shuff: dd 0, 4, 2, 6, 1, 5, 3, 7 idct16_shuff1: dd 2, 6, 0, 4, 3, 7, 1, 5 +idct16_shuff2: dw 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 +idct16_shuff3: dw 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 +idct16_shuff4: dd 0, 8, 2, 10, 4, 12, 6, 14 +idct16_shuff5: dd 1, 9, 3, 11, 5, 13, 7, 15 tab_AVX512_idct16_1: dw 90, 87, 80, 70, 57, 43, 25, 9, 90, 87, 80, 70, 57, 43, 25, 9, 80, 9, -70, -87, -25, 57, 90, 43, 80, 9, -70, -87, -25, 57, 90, 43 @@ -4797,8 +4801,8 @@ %macro IDCT16_AVX512_PASS1 3 movu m5, [tab_AVX512_idct16_2 + %1 * 64] - pmaddwd m9, m0, m5 - pmaddwd m10, m7, m5 + pmaddwd m9, m4, m5 + pmaddwd m10, m6, m5 vpsrldq m16, m9, 4 paddd m9, m16 @@ -4806,7 +4810,7 @@ paddd m10, m17 vmovdqu32 m9 {k1}, m10 - pmaddwd m10, m6, m5 + pmaddwd m10, m7, m5 pmaddwd m11, m8, m5 vpsrldq m16, m10, 4 @@ -4822,8 +4826,8 @@ vmovdqu32 m9 {k2}, m10 mova m5, [tab_AVX512_idct16_1 + %1 * 64] - pmaddwd m10, m1, m5 - pmaddwd m11, m3, m5 + pmaddwd m10, m28, m5 + pmaddwd m11, m29, m5 vpsrldq m16, m10, 4 paddd m10, m16 @@ -4831,8 +4835,8 @@ paddd m11, m17 vmovdqu32 m10 {k1}, m11 - pmaddwd m11, m4, m5 - pmaddwd m12, m2, m5 + pmaddwd m11, m30, m5 + pmaddwd m12, m31, m5 vpsrldq m16, m11, 4 paddd m11, m16 @@ -4855,8 +4859,8 @@ psrad m9, IDCT_SHIFT1 mova m5, [tab_AVX512_idct16_2 + %1 * 64 + 64] - pmaddwd m10, m0, m5 - pmaddwd m12, m7, m5 + pmaddwd m10, m4, m5 + pmaddwd m12, m6, m5 vpsrldq m16, m10, 4 @@ -4865,7 +4869,7 @@ paddd m12, m17 vmovdqu32 m10 {k1}, m12 - pmaddwd m12, m6, m5 + pmaddwd m12, m7, m5 pmaddwd m13, m8, m5 @@ -4885,8 +4889,8 @@ mova m5, [tab_AVX512_idct16_1 + %1 * 64 + 64] - pmaddwd m12, m1, m5 - pmaddwd m13, m3, m5 + pmaddwd m12, m28, m5 + pmaddwd m13, m29, m5 vpsrldq m16, m12, 4 @@ -4895,8 +4899,8 @@ paddd m13, m17 vmovdqu32 m12 {k1}, m13 - pmaddwd m13, m4, m5 - pmaddwd m5, m2 + pmaddwd m13, m30, m5 + pmaddwd m5, m31 vpsrldq m16, m13, 4 @@ -5094,199 +5098,174 @@ kmovd k1, r7d mov r7d, 0xCCCC kmovd k2, r7d + mova ym2, [idct16_shuff2] + mova ym3, [idct16_shuff3] + mova ym26, [idct16_shuff4] + mova ym27, [idct16_shuff5] .pass1: - movu xm0, [r0 + 0 * 32] - movu xm1, [r0 + 8 * 32] - punpckhqdq xm2, xm0, xm1 - punpcklqdq xm0, xm1 - vinserti128 ym0, ym0, xm2, 1 - - movu xm1, [r0 + 1 * 32] - movu xm2, [r0 + 9 * 32] - punpckhqdq xm3, xm1, xm2 - punpcklqdq xm1, xm2 - vinserti128 ym1, ym1, xm3, 1 - - movu xm2, [r0 + 2 * 32] - movu xm3, [r0 + 10 * 32] - punpckhqdq xm4, xm2, xm3 - punpcklqdq xm2, xm3 - vinserti128 ym2, ym2, xm4, 1 - - movu xm3, [r0 + 3 * 32] - movu xm4, [r0 + 11 * 32] - punpckhqdq xm5, xm3, xm4 - punpcklqdq xm3, xm4 - vinserti128 ym3, ym3, xm5, 1 - - movu xm4, [r0 + 4 * 32] - movu xm5, [r0 + 12 * 32] - punpckhqdq xm6, xm4, xm5 - punpcklqdq xm4, xm5 - vinserti128 ym4, ym4, xm6, 1 - - movu xm5, [r0 + 5 * 32] - movu xm6, [r0 + 13 * 32] - punpckhqdq xm7, xm5, xm6 - punpcklqdq xm5, xm6 - vinserti128 ym5, ym5, xm7, 1 - - movu xm6, [r0 + 6 * 32] - movu xm7, [r0 + 14 * 32] - punpckhqdq xm8, xm6, xm7 - punpcklqdq xm6, xm7 - vinserti128 ym6, ym6, xm8, 1 - - movu xm7, [r0 + 7 * 32] - movu xm8, [r0 + 15 * 32] - punpckhqdq xm9, xm7, xm8 - punpcklqdq xm7, xm8 - vinserti128 ym7, ym7, xm9, 1 - - punpckhwd ym8, ym0, ym2 ;[8 10] - punpcklwd ym0, ym2 ;[0 2] - - punpckhwd ym2, ym1, ym3 ;[9 11] - punpcklwd ym1, ym3 ;[1 3] - - punpckhwd ym3, ym4, ym6 ;[12 14] - punpcklwd ym4, ym6 ;[4 6] - - punpckhwd ym6, ym5, ym7 ;[13 15] - punpcklwd ym5, ym7 ;[5 7] - - punpckhdq ym7, ym0, ym4 ;[02 22 42 62 03 23 43 63 06 26 46 66 07 27 47 67] - punpckldq ym0, ym4 ;[00 20 40 60 01 21 41 61 04 24 44 64 05 25 45 65] - - punpckhdq ym4, ym8, ym3 ;[82 102 122 142 83 103 123 143 86 106 126 146 87 107 127 147] - punpckldq ym8, ym3 ;[80 100 120 140 81 101 121 141 84 104 124 144 85 105 125 145] - - punpckhdq ym3, ym1, ym5 ;[12 32 52 72 13 33 53 73 16 36 56 76 17 37 57 77] - punpckldq ym1, ym5 ;[10 30 50 70 11 31 51 71 14 34 54 74 15 35 55 75] - - punpckhdq ym5, ym2, ym6 ;[92 112 132 152 93 113 133 153 96 116 136 156 97 117 137 157] - punpckldq ym2, ym6 ;[90 110 130 150 91 111 131 151 94 114 134 154 95 115 135 155] - - punpckhqdq ym6, ym0, ym8 ;[01 21 41 61 81 101 121 141 05 25 45 65 85 105 125 145] - punpcklqdq ym0, ym8 ;[00 20 40 60 80 100 120 140 04 24 44 64 84 104 124 144] - - punpckhqdq ym8, ym7, ym4 ;[03 23 43 63 43 103 123 143 07 27 47 67 87 107 127 147] - punpcklqdq ym7, ym4 ;[02 22 42 62 82 102 122 142 06 26 46 66 86 106 126 146] - - punpckhqdq ym4, ym1, ym2 ;[11 31 51 71 91 111 131 151 15 35 55 75 95 115 135 155] - punpcklqdq ym1, ym2 ;[10 30 50 70 90 110 130 150 14 34 54 74 94 114 134 154] - - punpckhqdq ym2, ym3, ym5 ;[13 33 53 73 93 113 133 153 17 37 57 77 97 117 137 157] - punpcklqdq ym3, ym5 ;[12 32 52 72 92 112 132 152 16 36 56 76 96 116 136 156] - - vinserti64x4 m6, m6, ym6, 1 - vinserti64x4 m0, m0, ym0, 1 - vinserti64x4 m8, m8, ym8, 1 - vinserti64x4 m7, m7, ym7, 1 - vinserti64x4 m4, m4, ym4, 1 - vinserti64x4 m1, m1, ym1, 1 - vinserti64x4 m2, m2, ym2, 1 - vinserti64x4 m3, m3, ym3, 1 - + movu xm0, [r0 + 0 * 32] + vinserti128 ym0, ym0, [r0 + 8 * 32], 1 + movu xm1, [r0 + 2 * 32] + vinserti128 ym1, ym1, [r0 + 10 * 32], 1 + + mova ym9, ym2 + mova ym10, ym3 + vpermi2w ym9, ym0, ym1 + vpermi2w ym10, ym0, ym1 + + movu xm0, [r0 + 4 * 32] + vinserti128 ym0, ym0, [r0 + 12 * 32], 1 + movu xm1, [r0 + 6 * 32] + vinserti128 ym1, ym1, [r0 + 14 * 32], 1 + + mova ym11, ym2 + mova ym12, ym3 + vpermi2w ym11, ym0, ym1 + vpermi2w ym12, ym0, ym1 + + mova ym4, ym26 + mova ym6, ym27 + vpermi2d ym4, ym9, ym11 + vpermi2d ym6, ym9, ym11 + + mova ym7, ym26 + mova ym8, ym27 + vpermi2d ym7, ym10, ym12 + vpermi2d ym8, ym10, ym12 + + vpermq ym4, ym4, q3120 + vpermq ym6, ym6, q3120 + vpermq ym7, ym7, q3120 + vpermq ym8, ym8, q3120 + + movu xm0, [r0 + 1 * 32] + vinserti128 ym0, ym0, [r0 + 9 * 32], 1 + movu xm1, [r0 + 3 * 32] + vinserti128 ym1, ym1, [r0 + 11 * 32], 1 + + mova ym9, ym2 + mova ym10, ym3 + vpermi2w ym9, ym0, ym1 + vpermi2w ym10, ym0, ym1 + + movu xm0, [r0 + 5 * 32] + vinserti128 ym0, ym0, [r0 + 13 * 32], 1 + movu xm1, [r0 + 7 * 32] + vinserti128 ym1, ym1, [r0 + 15 * 32], 1 + + mova ym11, ym2 + mova ym12, ym3 + vpermi2w ym11, ym0, ym1 + vpermi2w ym12, ym0, ym1 + + mova ym28, ym26 + mova ym29, ym27 + vpermi2d ym28, ym9, ym11 + vpermi2d ym29, ym9, ym11 + + mova ym30, ym26 + mova ym31, ym27 + vpermi2d ym30, ym10, ym12 + vpermi2d ym31, ym10, ym12 + + vpermq ym28, ym28, q3120 + vpermq ym29, ym29, q3120 + vpermq ym30, ym30, q3120 + vpermq ym31, ym31, q3120 + + vinserti64x4 m4, m4, ym4, 1 + vinserti64x4 m6, m6, ym6, 1 + vinserti64x4 m7, m7, ym7, 1 + vinserti64x4 m8, m8, ym8, 1 + vinserti64x4 m28, m28, ym28, 1 + vinserti64x4 m29, m29, ym29, 1 + vinserti64x4 m30, m30, ym30, 1 + vinserti64x4 m31, m31, ym31, 1 IDCT16_AVX512_PASS1 0, 18, 19 IDCT16_AVX512_PASS1 2, 20, 21 add r0, 16 - movu xm0, [r0 + 0 * 32] - movu xm1, [r0 + 8 * 32] - punpckhqdq xm2, xm0, xm1 - punpcklqdq xm0, xm1 - vinserti128 ym0, ym0, xm2, 1 - - movu xm1, [r0 + 1 * 32] - movu xm2, [r0 + 9 * 32] - punpckhqdq xm3, xm1, xm2 - punpcklqdq xm1, xm2 - vinserti128 ym1, ym1, xm3, 1 - - movu xm2, [r0 + 2 * 32] - movu xm3, [r0 + 10 * 32] - punpckhqdq xm4, xm2, xm3 - punpcklqdq xm2, xm3 - vinserti128 ym2, ym2, xm4, 1 - - movu xm3, [r0 + 3 * 32] - movu xm4, [r0 + 11 * 32] - punpckhqdq xm5, xm3, xm4 - punpcklqdq xm3, xm4 - vinserti128 ym3, ym3, xm5, 1 - - movu xm4, [r0 + 4 * 32] - movu xm5, [r0 + 12 * 32] - punpckhqdq xm6, xm4, xm5 - punpcklqdq xm4, xm5 - vinserti128 ym4, ym4, xm6, 1 - - movu xm5, [r0 + 5 * 32] - movu xm6, [r0 + 13 * 32] - punpckhqdq xm7, xm5, xm6 - punpcklqdq xm5, xm6 - vinserti128 ym5, ym5, xm7, 1 - - movu xm6, [r0 + 6 * 32] - movu xm7, [r0 + 14 * 32] - punpckhqdq xm8, xm6, xm7 - punpcklqdq xm6, xm7 - vinserti128 ym6, ym6, xm8, 1 - - movu xm7, [r0 + 7 * 32] - movu xm8, [r0 + 15 * 32] - punpckhqdq xm9, xm7, xm8 - punpcklqdq xm7, xm8 - vinserti128 ym7, ym7, xm9, 1 - - punpckhwd ym8, ym0, ym2 ;[8 10] - punpcklwd ym0, ym2 ;[0 2] - - punpckhwd ym2, ym1, ym3 ;[9 11] - punpcklwd ym1, ym3 ;[1 3] - - punpckhwd ym3, ym4, ym6 ;[12 14] - punpcklwd ym4, ym6 ;[4 6] - - punpckhwd ym6, ym5, ym7 ;[13 15] - punpcklwd ym5, ym7 ;[5 7] - - punpckhdq ym7, ym0, ym4 ;[02 22 42 62 03 23 43 63 06 26 46 66 07 27 47 67] - punpckldq ym0, ym4 ;[00 20 40 60 01 21 41 61 04 24 44 64 05 25 45 65] - - punpckhdq ym4, ym8, ym3 ;[82 102 122 142 83 103 123 143 86 106 126 146 87 107 127 147] - punpckldq ym8, ym3 ;[80 100 120 140 81 101 121 141 84 104 124 144 85 105 125 145] - - punpckhdq ym3, ym1, ym5 ;[12 32 52 72 13 33 53 73 16 36 56 76 17 37 57 77] - punpckldq ym1, ym5 ;[10 30 50 70 11 31 51 71 14 34 54 74 15 35 55 75] - - punpckhdq ym5, ym2, ym6 ;[92 112 132 152 93 113 133 153 96 116 136 156 97 117 137 157] - punpckldq ym2, ym6 ;[90 110 130 150 91 111 131 151 94 114 134 154 95 115 135 155] - - punpckhqdq ym6, ym0, ym8 ;[01 21 41 61 81 101 121 141 05 25 45 65 85 105 125 145] - punpcklqdq ym0, ym8 ;[00 20 40 60 80 100 120 140 04 24 44 64 84 104 124 144] - - punpckhqdq ym8, ym7, ym4 ;[03 23 43 63 43 103 123 143 07 27 47 67 87 107 127 147] - punpcklqdq ym7, ym4 ;[02 22 42 62 82 102 122 142 06 26 46 66 86 106 126 146] - - punpckhqdq ym4, ym1, ym2 ;[11 31 51 71 91 111 131 151 15 35 55 75 95 115 135 155] - punpcklqdq ym1, ym2 ;[10 30 50 70 90 110 130 150 14 34 54 74 94 114 134 154] - - punpckhqdq ym2, ym3, ym5 ;[13 33 53 73 93 113 133 153 17 37 57 77 97 117 137 157] - punpcklqdq ym3, ym5 ;[12 32 52 72 92 112 132 152 16 36 56 76 96 116 136 156] - - vinserti64x4 m6, m6, ym6, 1 - vinserti64x4 m0, m0, ym0, 1 - vinserti64x4 m8, m8, ym8, 1 - vinserti64x4 m7, m7, ym7, 1 - vinserti64x4 m4, m4, ym4, 1 - vinserti64x4 m1, m1, ym1, 1 - vinserti64x4 m2, m2, ym2, 1 - vinserti64x4 m3, m3, ym3, 1 + movu xm0, [r0 + 0 * 32] + vinserti128 ym0, ym0, [r0 + 8 * 32], 1 + movu xm1, [r0 + 2 * 32] + vinserti128 ym1, ym1, [r0 + 10 * 32], 1 + + mova ym9, ym2 + mova ym10, ym3 + vpermi2w ym9, ym0, ym1 + vpermi2w ym10, ym0, ym1 + + movu xm0, [r0 + 4 * 32] + vinserti128 ym0, ym0, [r0 + 12 * 32], 1 + movu xm1, [r0 + 6 * 32] + vinserti128 ym1, ym1, [r0 + 14 * 32], 1 + + mova ym11, ym2 + mova ym12, ym3 + vpermi2w ym11, ym0, ym1 + vpermi2w ym12, ym0, ym1 + + mova ym4, ym26 + mova ym6, ym27 + vpermi2d ym4, ym9, ym11 + vpermi2d ym6, ym9, ym11 + + mova ym7, ym26 + mova ym8, ym27 + vpermi2d ym7, ym10, ym12 + vpermi2d ym8, ym10, ym12 + + vpermq ym4, ym4, q3120 + vpermq ym6, ym6, q3120 + vpermq ym7, ym7, q3120 + vpermq ym8, ym8, q3120 + + movu xm0, [r0 + 1 * 32] + vinserti128 ym0, ym0, [r0 + 9 * 32], 1 + movu xm1, [r0 + 3 * 32] + vinserti128 ym1, ym1, [r0 + 11 * 32], 1 + + mova ym9, ym2 + mova ym10, ym3 + vpermi2w ym9, ym0, ym1 + vpermi2w ym10, ym0, ym1 + + movu xm0, [r0 + 5 * 32] + vinserti128 ym0, ym0, [r0 + 13 * 32], 1 + movu xm1, [r0 + 7 * 32] + vinserti128 ym1, ym1, [r0 + 15 * 32], 1 + + mova ym11, ym2 + mova ym12, ym3 + vpermi2w ym11, ym0, ym1 + vpermi2w ym12, ym0, ym1 + + mova ym28, ym26 + mova ym29, ym27 + vpermi2d ym28, ym9, ym11 + vpermi2d ym29, ym9, ym11 + + mova ym30, ym26 + mova ym31, ym27 + vpermi2d ym30, ym10, ym12 + vpermi2d ym31, ym10, ym12 + + vpermq ym28, ym28, q3120 + vpermq ym29, ym29, q3120 + vpermq ym30, ym30, q3120 + vpermq ym31, ym31, q3120 + + vinserti64x4 m4, m4, ym4, 1 + vinserti64x4 m6, m6, ym6, 1 + vinserti64x4 m7, m7, ym7, 1 + vinserti64x4 m8, m8, ym8, 1 + vinserti64x4 m28, m28, ym28, 1 + vinserti64x4 m29, m29, ym29, 1 + vinserti64x4 m30, m30, ym30, 1 + vinserti64x4 m31, m31, ym31, 1 IDCT16_AVX512_PASS1 0, 22, 23 _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel