Review for the asm part. Search for %%% to locate each review comment inline in the diff. Regards, Laurent
%%% Use ga to reduce temp count. %%% %%% All loads of the form [some_label+reg] are not PIC, i.e. they are not legal. %%% RIP-relative code: [rip + constant]. [rip + reg] doesn't exist. %%% The linker patches the non-PIC loads at run time. %%% Fix all such loads by first loading the label with lea reg, [label] %%% followed by the load you want to do with the label. diff --git a/f265/asm/avx2/intra.asm b/f265/asm/avx2/intra.asm new file mode 100644 index 0000000..d3ad1b9 --- /dev/null +++ b/f265/asm/avx2/intra.asm @@ -0,0 +1,1068 @@ %%% inverted or inversed, pick one and stick to it. +; Copyright (c) 2014, VANTRIX CORPORATION. All rights reserved. See LICENSE.txt +; for the full license text. + +%include "x86inc.asm" + +section .data %%% You violate your alignment directives. %%% Divide your patterns by alignment under the proper directives. +align 32 + %%% vertical. %%% The comment is unclear. Replace by %%% Shuffle pattern to regroup the left and top-right pixels together %%% for rows 0/2, 1/3, 4/6, 5/7. %%% planar_8_left. +planar_8: db 14,15, 12,13, 6,7, 4,5, 0,0,0,0,0,0,0,0, ; Row index, shuffled to do 2 rows at the time. + db 10,11, 8, 9, 2,3, 0,1, 0,0,0,0,0,0,0,0, ; Used to get the vertival value in planar. + +angle_mul_ver: dw 1,2,5,6, 0,0,0,0, ; Row index, shuffled to do 2 rows at the time. %%% weight. + dw 3,4,7,8, 0,0,0,0, ; Used to get the weigh and offset of each row on vertical angles. + %%% weight. +angle_mul_hor: dw 1,2,3,4,5,6,7,8, ; Row index. Used to get the weigh and offset of each row on + ; horizontal angles. + %%% multiplier. +angle_inv_mul_hor: dw 0, 1, 2, 3, 4, 5, 6, 7, ; Mutiplayer for inv_angle_8 on horizontal angles. +angle_inv_mul_ver: dw 7, 6, 5, 4, 3, 2, 1, 0, ; Mutiplayer for inv_angle_8 on vertical angles. + +dia_bot_left_8: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ; Invert byte order. + +; Repeat values on a whole 8x8 row. Inversed for use in pure horizontal. %%% hor. 
+ang_hoz_8: db 3, 3, 3, 3, 3, 3, 3, 3, + db 2, 2, 2, 2, 2, 2, 2, 2, + db 1, 1, 1, 1, 1, 1, 1, 1, + db 0, 0, 0, 0, 0, 0, 0, 0, + %%% neighbour pairs. +; Pshufb pattern to generate neighbours pair. +pair_low: db 0,1, 1,2, 2,3, 3,4, 4,5, 5,6, 6,7, 7,8, +pair_high: db 7,8, 8,9, 9,10, 10,11, 11,12, 12,13, 13,14, 14,15 + +; Multiply high lane by 3 while keeping the low lane as-is. %%% Garbage ', ;'. +triple_last_lane: db 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, ; + db 3,0, 3,0, 3,0, 3,0, 3,0, 3,0, 3,0, 3,0, ; + +planar_wgt_hor: db 7, 1, 6, 2, 5, 3, 4, 4, ; Weight pair, used for planar row weighting. + db 3, 5, 2, 6, 1, 7, 0, 8, + +; Manage neighbour filtering edge case. %%% Remove this stale stuff. +intra_neigh_ab: db 15, 0, 1, 15, 6, 7, 8, 9, 15, 15, 15, 15, 15, 15, 15, 15 ; Re-order pixel to fit all + ; data in 8 bytes. +intra_neigh_rev_ab: db 0, 15, 15, 15, 15, 15, 15, 4, 5, 15, 15, 15, 15, 15, 15, 15 ; Re-order result to align them to + ; their real location. +intra_blend_ab: db 255, 0, 0, 0, 0, 0, 0, 255, 255, 0, 0, 0, 0, 0, 0, 0 ; Blend edge case with common data. + %%% pixels. +intra_neigh_cd: db 0, 15, 14, 0, 9, 8, 7, 6, 0, 0, 0, 0, 0, 0, 0, 0, ; Re-order pixel to fit all data in 8 bytes. %%% results. +intra_neigh_rev_cd: db 0, 0, 0, 0, 0, 0, 0, 5, 4, 0, 0, 0, 0, 0, 0, 0 ; Re-order result to align them to their real + ; location. +intra_blend_cd: db 0, 0, 0, 0, 0, 0, 0, 255, 255, 0, 0, 0, 0, 0, 0, 255 ; Blend edge case with common data. + + +; Seed on which the the neighbours offset of inverted angles are calculated. %%% words, times. +; As word (repeated 4 time) for speed-ups. +inv_angle_8: db 16, 16, 16, 16 + db 19, 19, 19, 19 + db 24, 24, 24, 24 + db 30, 30, 30, 30 + db 39, 39, 39, 39 + db 57, 57, 57, 57 + db 102, 102, 102, 102 + db 255, 255, 255, 255 + +; Seed on which the angles weights and offsets are calculated. %%% words, times. +; As word (repeated 4 time) for speed-ups. 
+intra_angle: db 32, 32, 32, 32 + db 26, 26, 26, 26, + db 21, 21, 21, 21, + db 17, 17, 17, 17, + db 13, 13, 13, 13, + db 9, 9, 9, 9, + db 5, 5, 5, 5, + db 2, 2, 2, 2, + db 0, 0, 0, 0, + db 2, 2, 2, 2, + db 5, 5, 5, 5, + db 9, 9, 9, 9, + db 13, 13, 13, 13, + db 17, 17, 17, 17, + db 21, 21, 21, 21, + db 26, 26, 26, 26, ; TODO: If we want to be more agressive on memory saving, + db 32, 32, 32, 32, ; this could be cut in half. %%% The first and last entries are not used. Adjust. + + +; Blend mask. Will be loaded with an offset, allowing partial merging of 2 registers. %% 0xff lower case (convention). +neigh_avail_msk: dq 0, 0, 0, 0, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF + + +neig_bl_unav_8: db 0,0,0,0,0,0,0,0, 0, 1, 2, 3, 4, 5, 6, 7 +neigh_1_of_2: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, + db 8, 0, 9, 1, 10, 2, 11, 3, 12, 4, 13, 5, 14, 6, 15, 7, +blend_extremity: db 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 255, +; Pattern used as mask, bias, offset, ... +; As double to use the more efficient vpbroadcastd. +neigh_last_b_of_d: db 3, 7, 11, 15, %%% Do it like 0_1b below, this is hard to read. +pat_1_0b: dw 256, 256, +pat_32w: dw 32, 32, +pat_31w: dw 31, 31, +pat_16w: dw 16, 16, +pat_15b: db 15, 15, 15, 15, +pat_128b: db 128, 128, 128, 128 +pat_14_15b: db 14,15, 14,15, +pat_7_8b: db 7,8, 7,8, +pat_7b: db 7,7,7,7 +pat_8w: dw 8, 8, +pat_0_1b: db 0,1, 0,1, +pat_1b: db 1,1, 1,1, %%% lower case (convention). +pat_255q: dq 0xFF +pat_2w: dw 2, 2,
%%% Remove that junk. +; Intra 4x4 pattern. +intra_copy_4: dd 0x00000000, 0x01010101, 0x00000000,0x00000000, + dd 0x02020202, 0x03030303, 0x00000000,0x00000000, + + +ang_hoz_16: db 1, 1, 1, 1, 1, 1, 1, 1, ; Repeat values on a whole 16x16 row. + db 1, 1, 1, 1, 1, 1, 1, 1, ; Inversed for use in pure horizontal. + db 0, 0, 0, 0, 0, 0, 0, 0, + db 0, 0, 0, 0, 0, 0, 0, 0, + +intra_hor_4: db 3, 1, 2, 2, 1, 3, 0, 4, ; Horizontal weight for 4x4 planar. %%% Vertical. +intra_vert_4: db 3, 1, 3, 1, 3, 1, 3, 1, ; Vertival weight for 4x4 planar. + +pat_4w: dw 4, 4, ; Rounding bias. + +section .text + + +; 8x8 intra prediction functions. %%% There are, intra prediction modes. +; There is 11 assembly function to cover all 8x8 intra prediction. +; - Planar and DC. %%% pure. +; - 3 Pure diagonals: +; - dia_bot_left. +; - dia_top_left. +; - dia_top_right. +; - Pure vertical and horizontal. +; - 4 diagonals: +; - hor_bot. +; - hor_top. +; - vert_left. +; - vert_right. +; %%% parameters. +; They all have the same input parameters, although some input parameter may be ignored. +; - g0: Destination. %%% top-right. +; - g1: Neighbours. 0 is the bottom-left, 64 is the top left, 65 is the top, 123 is the top right. %%% Fix that doc, it's wrong. %%% Mode. +; - g2: Angle. %%% edge. +; - g3: Filter egde flag. + + +; Intra DC 8x8. +DEFFUN f265_lbd_predict_intra_dc_8_avx2, ia=4, at=8844, ti=2, tv=5, ym=1 + ; Logic: + ; Sum all neighbours, except the corners. %%% samples. + ; Divide with bias by the number of sample. + + vpmovzxbw x1, [g1+56] ; Load all data. + vpmovzxbw x2, [g1+64] + + vpaddw y1, y2 ; Add them together. + + vpalignr y2, y1, 8 ; At each step, fold the register in 2... + vpaddw y1, y2 ; ... then add each value together. + + vpalignr y2, y1, 4 + vpaddw y1, y2 + + vpalignr y2, y1, 2 + vpaddw y1, y2 + + vpaddw y1, [pat_8w] ; Load the rounding bias. + vpsrlw y1, y1, 4 ; Divide by 16. + + vpackuswb y1, y1 ; Word to byte. + + vpbroadcastb y1, x1 ; Replicate the value. 
+ + vmovdqa y0, y1 + + and g3, 1 + jz .SKIP_FILTER + + ; 3 cases: + ; - Top left = 2*base + top + left. + ; - Top = 3*base + top. + ; - Left = 3*base + left. + %%% You already did that above, this is wasteful. + vpmovzxbw x2, [g1+64] ; Load top row. %%% neighbours. + vpmovzxbw x3, [g1+56] ; Load left neigbours. + + vpextrb g2d, x1, 1 ; Extract base. + + mov g3, g2 ; Copy base. %%% LEA. + shl g2, 1 ; Base * 2. + add g2, 2 ; Add rounding bias. + add g3, g2 ; Base * 3 + rounding bias. + %%% MOVD. + vpinsrw x4, g3d, 0 + vpbroadcastw x4, x4 ; Broadcast base * 3 + rounding bias. + + movzx g4, byte [g1+64] ; Load the first top and left value. + movzx g5, byte [g1+63] + %%% Suboptimal. Pack into 1 ymm first. + vpaddw x2, x4 ; 3 * Base + top row + rounding bias. + vpaddw x3, x4 ; + + vpsrlw x2, 2 ; Divide by 4. + vpsrlw x3, 2 ; + %%% Pack once. Low/top will be already at the correct location. %%% Use vpermq for left. + vpackuswb x2, x2 ; Word to byte. + vpackuswb x3, x3 ; + vinserti128 y3, x3, 1 + + vpblendd y0, y2, y0, 0xfc ; Save in top row. + + vmovdqu y5, [ang_hoz_8] ; + vpbroadcastq y2, [pat_255q] ; + + vpshufb y6, y3, y5 ; Replicate 8x the 4 lower value. + vpblendvb y1, y1, y6, y2 ; Blend only the first value of each row. + %%% Do actually shift, don't align. + vpalignr y3, y3, 4 ; Shift by 4 to do the 4 last rows. + + vpshufb y6, y3, y5 ; Replicate 8x the 4 lower value. + vpblendvb y0, y0, y6, y2 ; Blend only the first value of each row. + + %%% top-left. + ; Do top right. + add g4, g5 ; Top + left. + add g2, g4 ; Top + left + 2*base + bias. + shr g2, 2 ; Get the average. + + vmovdqa y2, y0 + vpinsrb x2, g2b, 0 + vinserti128 y0, y0, x2, 0 + + .SKIP_FILTER: + + vmovdqu [g0], y0 + vmovdqu [g0+0x20], y1 ; Save the value. + + RET + + +; Intra planar 8x8. %%% Bad declaration. You actually use 9 ymm registers. BUT, you don't need to, check below. +DEFFUN f265_lbd_predict_intra_planar_8_avx2, ia=4, at=8844, ti=0, tv=8, ym=1 %%% Wrong formula. It's 8-x-1 and co. 
Wrong neighbour names. %%% value = ((8-x-1)*left + (8-y-1)*top + (x+1)*top_right + (y+1)*bottom_left + 8) >> 4); + ; value = ((8-x)*right_neigh + (8-y)*top_neigh + (x+1)*top_right + (y+1)*bottom_left + 8) >> 4); + + vpbroadcastb x6, [g1+56-1] ; Load & broadcast bottom left. + vpmovzxbw x1, [g1+64] ; Load top neighbours. + vpmovzxbw x6, x6 + %%% Use vmovq+vpbroadcastb here. %%% vmovq mem: p23, latency 3. %%% vpbroadcastb reg: p5, latency 3 since cross-lane. %%% vpboradcastb mem: p01 p23 p5, latency 7. + vpbroadcastb y7, [g1+72] ; Load & broadcast top right. + + vpbroadcastd y0, [pat_0_1b] ; Weight distribution pattern. + + vpsllw y2, y1, 3 ; Top row * 8. + vpsubw y1, y6 ; Row delta (top neighbour - bottom left). + + vpsubw y2, y1 ; Top row * 7 + bottom left. + %%% y8 is a bad temp, use y3. + vpsllw y8, y1, 1 ; + vpsubw y6, y2, y8 ; Top row * 5 + 3*bottom left. + vinserti128 y2, y2, x6, 1 ; Get row 2 values. %%% apply. Comment isn't clear. %%% Double the vertical delta removed at each line. + vinserti128 y1, y1, x1, 1 ; Double offset to aply at each line. + + ; Register usage: + ; - y1: row delta. + ; - y2: row sum. + %%% Unaligned load, should be vpbroadcastq. + vbroadcasti128 y3, [g1+64-8] ; Load left column. + vpunpcklbw y3, y7 ; Merge top right with left col. + vpshufb y3, [planar_8] ; Shuffle to do 2 columns at a time. + %%% weights. + vbroadcasti128 y4, [planar_wgt_hor] ; Load weight. + vpbroadcastd y5, [pat_8w] ; Load rounding bias. + + ; Register usage: %%% lower case as you used above. + ; - y0: Weight distribution pattern. %%% row vertical delta. + ; - y1: Row delta. %%% row vertical sum. + ; - y2: Row sum. + ; - y3: Column values. %%% weights. + ; - y4: Column weight. %%% Rounding. + ; - y5: Rouding bias. + %%% Document your arguments. %%% %macro DO_ROW 2 ; %1: alignment offset, %2: destination register. + %macro DO_ROW 2 + %if %1 != 0 %%% Add delta to row sum. + vpsubw y2, y1 ; Add offset to row value. + vpalignr y%2, y3, %1*2 ; Offset column. 
%%% Repeat the column. + vpshufb y%2, y%2, y0 ; Repeat the pattern. + %else %%% Repeat the column. + vpshufb y%2, y3, y0 ; Repeat the pattern. + %endif + %%% factors. + vpmaddubsw y%2, y4 ; Get the sum of all factor. + vpaddusw y%2, y2 ; Add vertical. + vpaddusw y%2, y5 ; Add rounding bias. + vpsrlw y%2, y%2, 4 + %endmacro + + DO_ROW 0, 6 ; Do row 0 and 2. + DO_ROW 1, 7 ; Do row 1 and 3. + + vpackuswb y6, y7 + vmovdqu [g0], y6 + + vpsubw y2, y1 ; Add offset to row value. + vpsubw y2, y1 ; + + DO_ROW 2, 6 ; Do row 4 and 6. + DO_ROW 3, 7 ; Do row 5 and 7. + + vpackuswb y6, y7 + vmovdqu [g0+0x20], y6 %%% Align. +%unmacro DO_ROW 2 + RET + + +; Intra pure diagonal bottom left 8x8. +DEFFUN f265_lbd_predict_intra_dia_bot_left_8_avx2, ia=4, at=8844, ti=0, tv=3, ym=1 + + vmovdqu x1, [dia_bot_left_8] ; Load pattern. %%% Unnecessary unaligned load. Fix the pattern instead. + vmovdqu x0, [g1+48-1] ; Load all data. %%% Use memory operand. + vpshufb y0, y1 ; Re-order it. + %%% Offset the pixels in the high lane to build rows 2 and 3. + vpalignr y1, y0, 2 ; Offset value: will build rows 2 and 3. + vinserti128 y0, y0, x1, 1 ; + + vpalignr y1, y0, 1 ; Create row 1 and 3. %%% Them. %%% Use plural for 'row' everywhere. + vpunpcklqdq y2, y0, y1 ; Merge then with row 0 and 2. %%% 0 to 3. + vmovdqu [g0], y2 ; Save row 1 to 4. + + vpalignr y1, y0, 5 ; Offset to generate row 4 to 7. + + vpalignr y0, y0, 4 ; Repeat operation above for row 4 to 7. + vpunpcklqdq y2, y0, y1 ; + vmovdqu [g0+0x20], y2 ; + RET + + +; Intra angular horizontal bottom 8x8. %%% Unify code with horizontal-top with macro. %%% Apply comments from vertical top-right. +DEFFUN f265_lbd_predict_intra_hor_bot_8_avx2, ia=4, at=8844, ti=0, tv=10, ym=1 + vbroadcasti128 y9, [g1+48] ; Load left column. + + mov g3, 18 ; Fix angle. + sub g3, g2 + + ; Generate weight and offset. + vpbroadcastd y1, [intra_angle+g3*4] ; Load angle factor. + vbroadcasti128 y2, [angle_mul_hor] ; Load multiplication table. 
+ vpmaddubsw y1, y1, y2 ; Result in offset and weight for each column. + + ; Generate pairs. + vpbroadcastd y2, [pat_31w] ; Load weight mask. + vpand y2, y2, y1 ; Extract weight. + vpbroadcastd y0, [pat_32w] ; Load weight complement base. + vpsubw y0, y2 ; Get weight complement. + vpackuswb y2, y2, y2 ; Word to byte. + vpackuswb y0, y0, y0 ; Word to byte. + vpunpcklbw y0, y2, y0 ; Make the pair. Final Weight. + + ; Generate neighbours offset pattern. + vpbroadcastd y3, [pat_14_15b] ; Load base offset pattern. + vpsrlw y1, y1, 5 ; Get the angle offset. %%% Reduce port 5 pressure. %%% vpsllw y2, y1, 8 %%% vpor y1, y2 + vpackuswb y1, y1, y1 ; Word to byte. + vpunpcklbw y1, y1, y1 ; Double the offset (twice for each pair). + vpsubw y1, y3, y1 ; Add the angle offset to the base offset. + + ; Load patterns. + vpbroadcastd y2, [pat_16w] ; Load rounding bias. + vbroadcasti128 y6, [pair_low] ; Load the neighbour pair generating pattern. + + ; Calculate the offset for the high lane. %%% neighbour position offsets. + vpbroadcastd y7, [pat_1b] ; Load neighbours positions offset. + vpsubb y8, y1, y7 ; Pre-offset by 2 the neighbour position. %%% rows. + vpsubb y8, y8, y7 ; Will be used to calculate 2 row at once. %%% offsetted + vinserti128 y1, y1, x8, 1 ; Put the offseted load pattern on the high lane. + + ; Predict 2 rows. %%% row. + %macro DO_ROW 2 ; %1: Row offset, %2: register in which to put the value. + + %if %1 != 0 + vpsubb y1, y7 ; Update neighbours offset. + %endif + + vpshufb y%2, y9, y1 ; Generate neighbour pair. + %%% row. + ; Calculate rows values. + vpmaddubsw y%2, y%2, y0 ; Multiply with weight. %%% Extra comma. + vpaddw y%2, y2, ; Add rounding bias. + vpsrlw y%2, y%2, 5 ; Weighted average. + vpackuswb y%2, y%2 ; Word to byte. + %endmacro + + DO_ROW 0, 3 ; Do row 0 and 2. + DO_ROW 1, 4 ; Do row 1 and 3. + + vpunpcklqdq y3, y4 ; Merge value. + vmovdqu [g0+0x00], y3 ; Save result. + %%% row 0. + vpsubb y1, y7 ; Update neighbours offset from row 1 to row 4. 
+ vpsubb y1, y7 + + DO_ROW 4, 3 ; Do row 4 and 6. + DO_ROW 5, 4 ; Do row 5 and 7. + + vpunpcklqdq y3, y4 ; Merge value. + vmovdqu [g0+0x20], y3 ; Save result. + + %unmacro DO_ROW 2 + RET + + %%% Invalid declaration, you don't use temporary integer. +; Intra pure horizontal 8x8. +DEFFUN f265_lbd_predict_intra_hor_8_avx2, ia=4, at=8844, ti=3, tv=4, ym=1 + vmovdqu y0, [ang_hoz_8] ; Load shuffle mask. + + vpbroadcastd y1, [g1+63-3] ; Load the first 4 rows. + vpbroadcastd y2, [g1+63-7] ; Load the second 4 rows. + %%% times. + vpshufb y1, y1, y0 ; Replicate 8 timae each value. + vpshufb y2, y2, y0 ; + + and g3, 1 + jz .SKIP_FILTER + + vpmovzxbw x0, [g1+64] ; Load top row. %%% Add movq. + vpbroadcastb x3, [g1+128] ; Load top left. + + vpmovzxbw x3, x3 ; Byte to word. + + vpsubw x0, x3 ; top - top left. + vpsraw x0, 1 ; (top - top left)/2. + + vpbroadcastb x3, [g1+63] ; Load left. + vpmovzxbw x3, x3 ; Byte to word. + vpaddw x0, x3 ; Left + (top - top left)/2. + + vpxor x3, x3 ; Replace negative values by 0. + vpmaxsw x0, x3 ; + + vpackuswb x0, x0 ; Word to byte with unsigned saturation. + + vpblendd y1, y0, y1, 0xfc ; Update the first 8 bytes. + + .SKIP_FILTER: + vmovdqu [g0], y1 ; Save it. + vmovdqu [g0+0x20], y2 ; + + RET + + +; Intra angular horizontal top 8x8. +DEFFUN f265_lbd_predict_intra_hor_top_8_avx2, ia=4, at=8844, ti=0, tv=10, ym=1 + ; Re-order the top neighbour. %%% No need, factor in the constant in the load below instead (-18*4). + mov g3, 18 ; Get the angle's inverted offset. + sub g3, g2 ; %%% No need, factor in the constant in the load below instead (-2*4). + sub g2, 2 ; Get the angle's offset. %%% Unaligned load. + vmovdqu x5, [g1+63] ; Load top neighbour. + vpinsrb x5, [g1+128], 0 ; Insert the top left neighbour. + + ; Generate weight and offset. %%% Invalid load. + vpbroadcastd y1, [intra_angle+g2*4] ; Load angle factor. + vbroadcasti128 y2, [angle_mul_hor] ; Load multiplication table. + vpmaddubsw y1, y1, y2 ; Result in offset and weight. 
+ vpbroadcastd y2, [pat_31w] ; Load weight mask. + vpand y2, y2, y1 ; Get the weight. + + ; Generate weight pairs. + vpbroadcastd y0, [pat_32w] ; Load weight complement base. + vpsubw y0, y2 ; Get weight complements. + vpackuswb y2, y2, y2 ; Word to byte. + vpackuswb y0, y0, y0 ; Word to byte. %%% weight. + vpunpcklbw y0, y0, y2 ; Make the pair. Final Weight. + + ; Generate offset pattern. %%% Period. + vpbroadcastd y3, [pat_7_8b] ; Load the base position pattern + vpsrlw y1, y1, 5 ; Extract neighbour offset. %%% See the bottom function. + vpackuswb y1, y1, y1 ; To byte. + vpunpcklbw y1, y1, y1 ; Double each value (twice for each pair). + vpaddw y1, y3, y1 ; Add offset with base. Result in actual neighbour position. + + ; Import top neighbour with the left ones. + vpbroadcastd y4, [inv_angle_8+g3*4] ; Load the inversed angle values. + vmovdqu x3, [angle_inv_mul_hor] ; Load the weight values. %%% have an invalid offset. + vpmaddubsw y4, y4, y3 ; Get the weight. Some neighbour will give invalid offset. + ; Since we never read them, it's ok. + vpbroadcastd y3, [pat_8w] ; Load inversed angle bias. + vpaddw y4, y3 ; Add inversed angle bias. + vpsraw y4, 4 ; Get inversed neighbour offset. + vpackuswb y4, y4 ; Word to byte. + vpshufb y5, y4 ; Re-order left neighbours. + + ; Load patterns. + vmovdqu x4, [g1+56] ; Load left data. + vpblendd y5, y4, y5, 0xfc ; Blend left neighbours with top neighbours. + vinserti128 y9, y5, x5, 1 ; Double data. + + ; Update offset for the high lane. + vpbroadcastd y2, [pat_16w] ; Load rounding bias. + vpbroadcastd y7, [pat_1b] ; Load neighbours positions offset. + vpsubb y8, y1, y7 ; Pre-offset by 2 the neighbour position. + vpsubb y8, y8, y7 ; Will be used to calculate 2 row at once. %%% offsetted. + vinserti128 y1, y1, x8, 1 ; Put the offseted load pattern on the high lane. + + %%% lower case. + %macro DO_ROW 2 ; %1: Row offset. %2: Register in which to put the value. + + %if %1 != 0 + vpsubb y1, y7 ; Update pair pattern. 
+ %endif + + vpshufb y%2, y9, y1 ; Select needed variables. + + ; Calcultate rows prediction. + vpmaddubsw y%2, y%2, y0 ; Multiply with weight. + vpaddw y%2, y2, ; Add rounding bias. + vpsrlw y%2, y%2, 5 ; Weighted average. + vpackuswb y%2, y%2 ; Word to byte. + %endmacro + + DO_ROW 0, 3 ; Do row 0 and 2. + DO_ROW 1, 4 ; Do row 1 and 3. + + vpunpcklqdq y3, y4 ; Merge value. + vmovdqu [g0+0x00], y3 ; Save result. + + vpsubb y1, y7 ; Update pair pattern from row 1 to row 4. + vpsubb y1, y7 + + DO_ROW 4, 3 ; Do row 4 and 6. + DO_ROW 5, 4 ; Do row 5 and 7. + + vpunpcklqdq y3, y4 ; Merge value. + vmovdqu [g0+0x20], y3 ; Save result. + + %unmacro DO_ROW 2 + RET + + +; Intra pure diagonal top left 8x8. +DEFFUN f265_lbd_predict_intra_dia_top_left_8_avx2, ia=4, at=8844, ti=0, tv=3, ym=1 %%% Unaligned loads. That's not needed. %%% 1) vmovq top offsetted by 1 (unaligned load but OK since it's 8 bytes). %%% 2) vpinsrb to insert top-left. %%% 3) vmovhps to insert left (aligned load). + vmovdqu x0, [g1+64-7] ; Load all data. + vmovdqu x1, [g1+64-8] ; Load top row. + vpinsrb x0, [g1+128], 7 ; Load & insert top left. + %%% Remove. + vpblendd x0, x1, 0xfc ; Blend left with top. + %%% Your row numbers are wrong. %%% Fix all the comments flagged in bottom-left. + vpalignr y1, y0, 2 ; Offset value: will build rows 6 and 5. + vinserti128 y0, y1, x0, 1 ; + + vpalignr y1, y0, 1 ; Create row 6 and 4. + vpunpcklqdq y2, y1, y0 ; Merge then with row 0 and 2. + vmovdqu [g0+0x20], y2 ; + + vpalignr y0, y0, 4 ; Offset to generate row 0 to 3. + + vpalignr y1, y0, 1 ; Repeat operation above for row 0 to 3. + vpunpcklqdq y2, y1, y0 ; + vmovdqu [g0], y2 ; Save row 0 to 3. + RET + + %%% Vertical. %%% Invalid declaration, you don't use temporary integer. +; Intra angular vertival left 8x8. +DEFFUN f265_lbd_predict_intra_vert_left_8_avx2, ia=4, at=8844, ti=2, tv=11, ym=1 %%% Duplicated code with horizontal-top. + sub g2, 18 %%% Unaligned load. + vmovdqu x0, [g1+64-8] ; Load top left data. 
+ vmovdqu x4, x0 ; Keep a copy. %%% top-left. + vpinsrb x0, [g1+128], 8 ; Load top right. + %%% neighbours. + ; Re-order the left neighbour. %%% Illegal load. + vpbroadcastd y2, [inv_angle_8+g2*4] ; Load the inversed angle values. + vmovdqu x3, [angle_inv_mul_ver] ; Load the inversed weight values. %%% have an invalid offset. + vpmaddubsw y2, y2, y3 ; Get the weight. Some neighbour will give invalid offset. + ; Since we never use them, it's ok. + vpbroadcastd y3, [pat_8w] ; Load inversed angle bias. + vpaddw y2, y3 ; Add inversed angle bias. + vpsraw y2, 4 ; Get inversed neighbour offset. + vpsubb y2, y3, y2 ; Invert the index. + vpackuswb y2, y2 ; Word to byte. + vpshufb y0, y2 ; Re-order left neighbours. + %%% re-ordered. + ; Blend re-ordened neighbours with the top neighbours. %%% Don't load top above. Use vmovhps here (will have wrong order) + vpermq to dupe and reorder. + vpblendd y0, y0, y4, 0xfc ; Merge left neighbours with top neighbours. + vinserti128 y10, y0, x0, 1 ; Double top row. + %%% ==== Begin copy&paste === %%% All the following code is the same as vertical-right, except for %%% vpunpcklbw register order, pair_low/high, vpaddb/vpsubb. %%% I think you lost the optimization for angle_mul_ver in vert_right. %%% The comments mismatch. %%% Use a macro with vertical-right as the "correct" version. + ; Calculate the angle offset base. %%% Illegal load. + vpbroadcastd y2, [intra_angle+g2*4] ; Load angle factor. + vmovdqa y8, y2 ; Keep a copy. Will be used to increment the offset. + vpmaddubsw y7, y2, [triple_last_lane] ; Offset the high lane. + vpackuswb y7, y7 ; Word to byte. + + ; Calculate the weight. + vmovdqu y3, [angle_mul_ver] ; Load multiplication table. + vpmaddubsw y2, y3, y2 ; Result in offset|weight. + vpbroadcastd y3, [pat_31w] ; Load mask. + vpand y3, y3, y2 ; Weight. + vpbroadcastd y4, [pat_32w] ; Load weight complement base. + vpsubw y4, y3 ; Get the weight complement. %%% Final weight. + vpunpcklbw y2, y3, y4 ; Make the pair. Final Weight. 
+ + ; Load patterns. + vpbroadcastd y0, [pat_16w] ; Load rounding offet. + vpbroadcastd y1, [pat_1_0b] ; Load weight distribution pattern. + vbroadcasti128 y6, [pair_high] ; Load pair making pattern. + vpbroadcastd y9, [pat_7b] ; Load "word shift as byte shift" mask pattern. + + ; Register usage : + ; - g0: Result array. + ; - y0: Rounding bias. + ; - y1: Word replication pattern. + ; - y2: Weights, distributed to do 2 rows at a time. + ; - y3: 2 rows of results ([0|2] then [4|5]). + ; - y4: 2 rows of results ([1|3] then [6|7]) . + ; - y5: Temp. + ; - y6: Generate pair. + ; - y7: Angle sum. Used to generate the offset. + ; - y8: Angle value. Add it to the sum at each row. + ; - y9: "Word shift as byte shift" mask pattern. + ; - y10: Top row. Replicated. + %%% Register. Lower case, comma. + %macro DO_ROW 2 ; %1: Row offset. %2: Eegister in which to put the value. + + %if %1 != 0 + vpaddb y7, y8 ; Add the angle to the sum. Generate the offset. + %endif + + ; Generate the neighbours pairs. + vpsrlw y%2, y7, 5 ; Get the offset. + vpand y%2, y9 ; Mask to simulate byte shift. + vpsubb y%2, y6, y%2, ; Generate pair offset. + vpshufb y%2, y10, y%2 ; Shuffle data into pair. + + ; Broadcast the current weights. + %if %1 != 0 + vpalignr y5, y2, %1*2 ; Get weights. + vpshufb y5, y1 ; Broadcast weights. + %else + vpshufb y5, y2, y1 ; Broadcast weights. + %endif + %%% row. + ; Calculates rows predictions. + vpmaddubsw y%2, y%2, y5 ; Multiply values with weights. + vpaddw y%2, y0, ; Add rounding bias. + vpsrlw y%2, y%2, 5 ; Weighted average. + vpackuswb y%2, y%2 ; Word to byte. + %endmacro + + DO_ROW 0, 3 ; Do row 0 and 2. + DO_ROW 2, 4 ; Do row 1 and 3. + %%% Same error as vertical-right. + vpunpcklqdq y3, y4 ; Merge value. + vmovdqu [g0+0x00], y3 ; Save result. + + vpaddb y7, y8 ; Offset from row 1 to row 4. + vpaddb y7, y8 + + DO_ROW 4, 3 ; Do row 4 and 6. + DO_ROW 6, 4 ; Do row 5 and 7. + + vpunpcklqdq y3, y4 ; Merge value. + vmovdqu [g0+0x20], y3 ; Save result. 
+ + %unmacro DO_ROW 2 + RET %%% ==== End copy&paste === + + +; Intra pure vertical 8x8. %%% Invalid declaration, you don't use temporary integer. %%% You use 'ver' elsewhere, stick to one abbreviation. +DEFFUN f265_lbd_predict_intra_vert_8_avx2, ia=4, at=8844, ti=2, tv=7, ym=1 %%% Load overflow (32 bytes when 8 are needed) and useless operation. %%% vpbroadcastq x0, [g1+64] + vmovdqu y0, [g1+64] ; Load all data. + vpbroadcastq y0, x0 ; Copy the row 4 times. Holds row 0 to 3. + vmovdqa y6, y0 ; Copy it. Holds row 4 to 7. + + and g3, 1 + jz .SKIP_FILTER + %%% Add movq. + vpbroadcastb x3, [g1+128] ; Load top left. %%% You already loaded [g1+64] above. + vpbroadcastb x1, [g1+64] ; Load top. %%% neighbours. + vpmovzxbw x2, [g1+64-8] ; Load left neigh. + + vpmovzxbw x3, x3 ; Word to byte. + vpmovzxbw x1, x1 + + vpsubw x2, x3 ; Left - top left. + vpsraw x2, 1 ; Signed divide by 2. + vpaddw x2, x1 ; Top + (left - top left)/2. + + vpxor x3, x3 + vpmaxsw x2, x3 ; Clip negative value to 0. + vpackuswb x2, x2 ; Word to byte with unsigned saturation. + vinserti128 y2, x2, 1 ; Double the data. + %%% replication. + vmovdqu y3, [ang_hoz_8] ; Load replicatino pattern. %%% that, blends. + vpbroadcastq y4, [pat_255q] ; Pattern taht blend in a word out of 8. %%% See comments in DC function. + %%% values. + vpshufb y5, y2, y3 ; Replicate 8x the 4 lower value. + + vpalignr y2, y2, 4 ; Shift to get the last 4 rows. + + vpblendvb y6, y5, y4 ; Blend only the first value of each row. + + vpshufb y5, y2, y3 ; Replicate 8x the 4 lower value. + vpblendvb y0, y5, y4 ; Blend only the first value of each row. + + .SKIP_FILTER: + vmovdqu [g0+0x00], y0 ; Save it. + vmovdqu [g0+0x20], y6 ; + + RET + + +; Intra angular vertical right 8x8. %%% Invalid declaration, you use y10. %%% Invalid declaration, you don't use temporary integer. +DEFFUN f265_lbd_predict_intra_vert_right_8_avx2, ia=4, at=8844, ti=2, tv=10, ym=1 + + vbroadcasti128 y10, [g1+64] ; Load top row. %%% Broken math. 18 + 14 = 32. 
Effective angle range is 0-34. %%% See comments in horizontal case. + sub g2, 18 ; Bring the angle within the range 0 to 14. %%% cases. + ; 0,7 and 14 are special case with their own function. + %%% Illegal load. + vpbroadcastd y2, [intra_angle+g2*4] ; Load angle factor. %%% to increment the sum at every row. %%% Suboptimal. Make vpbroadcastd destination y8 directly. + vmovdqa y8, y2 ; Keep a copy by which the sum is incremented at every row. + %%% rows. + vpmaddubsw y7, y2, [triple_last_lane] ; Multiply high lane by 3. Offset required to do 2 row at the time. + vpackuswb y7, y7 ; This is the angle sum for each row. + %%% Offset and weight for all rows. + vpmaddubsw y2, y2, [angle_mul_ver] ; Result in offset and weight for all rows. + vpbroadcastd y3, [pat_31w] ; Load mask. + vpand y3, y3, y2 ; Keep only weight. + + vpbroadcastd y4, [pat_32w] ; Load weight complement base. + vpsubw y4, y3 ; Get weight complement. %%% weight. + vpunpcklbw y2, y4, y3 ; Make the pair. Final Weight. + + ; Load all masks. + vpbroadcastd y0, [pat_16w] ; Load rounding bias. + vpbroadcastd y1, [pat_1_0b] ; Load weight distribution pattern. + vbroadcasti128 y6, [pair_low] ; Load pair making pattern. + vpbroadcastd y9, [pat_7b] ; Load "word shift as byte shift" mask pattern. + + ; Register usage. + ; - g0: Result array. + ; - y0: Rounding bias. + ; - y1: Word replication pattern. + ; - y2: Weights, distributed to do 2 rows at a time. + ; - y3: 2 rows of results [0|2]. + ; - y4: 2 rows of results [1|3]. + ; - y5: Temp. + ; - y6: Generate pair. + ; - y7: Angle sum. Used to generate the offset. + ; - y8: Angle value. Add it to the sum at each row. + ; - y9: "Word shift as byte shift" mask pattern. + ; - y10: Top row. Replicated. + %%% lower case, comma. + %macro DO_ROW 2 ; %1: Row offset. %2: Register in which to put the value. + + %if %1 != 0 + vpaddb y7, y8 ; Add the angle to the current angle sum. + %endif + + vpsrlw y%2, y7, 5 ; Generate neighbour offset. 
+ vpand y%2, y9 ; Shift can only be on word or greater value. Mask to simulate byte shift. + vpaddb y%2, y6 ; Add offset to pairing mask. + vpshufb y%2, y10, y%2 ; Generate pair. + + %if %1 != 0 + vpalignr y5, y2, %1*2 ; Get weight. + vpshufb y5, y1 ; Broadcast weight. + %else + vpshufb y5, y2, y1 ; Broadcast weight. + %endif + + vpmaddubsw y%2, y%2, y5 ; Multiply values with weight. + vpaddw y%2, y0, ; Add rounding bias. + vpsrlw y%2, y%2, 5 ; Weighted average. + vpackuswb y%2, y%2 ; Word to byte. + %endmacro + + DO_ROW 0, 3 ; Do row 0 and 2. + DO_ROW 2, 4 ; Do row 1 and 3. + %%% Suboptimal. Do that using the last pack from DO_ROW. + vpunpcklqdq y3, y4 ; Merge value. + vmovdqu [g0+0x00], y3 ; Save result. + %%% row 0? + vpaddb y7, y8 ; Skip from row 1 to row 4. + vpaddb y7, y8 ; + + DO_ROW 4, 3 ; Do row 4 and 6. + DO_ROW 6, 4 ; Do row 5 and 7. + + vpunpcklqdq y3, y4 ; Merge value. + vmovdqu [g0+0x20], y3 ; Save result. + + %unmacro DO_ROW 2 + RET + + +; Intra angular top right 8x8. +DEFFUN f265_lbd_predict_intra_dia_top_right_8_avx2, ia=4, at=8844, ti=0, tv=3, ym=1 %%% Note: unaligned load. To be investigated for performance when neighbour function is fixed. %%% Apply comments from the other functions. + vmovdqu x0, [g1+65] ; Load all data. + + vpalignr y1, y0, 2 ; Offset value: will build rows 2 and 3. %%% offsetted. + vinserti128 y0, y0, x1, 1 ; Push offseted value in high lane. + + vpalignr y1, y0, 1 ; Create rows 1 and 3. + vpunpcklqdq y2, y0, y1 ; Merge them with rows 0 and 2. + vmovdqu [g0], y2 ; Save rows 1 to 4. + %%% 0 to 3. + vpalignr y1, y0, 5 ; Offset to generate rows 4 to 7. + + vpalignr y0, y0, 4 ; Repeat operation above for rows 4 to 7. + vpunpcklqdq y2, y0, y1 ; + vmovdqu [g0+0x20], y2 ; + + RET + + %%% Remove the first line, not needed. +; Intra Extract neighbours. +; Extract and filter neighbours for intra prediction. +; +; Input format: +; EAABB +; C +; C +; D +; D +; +; Output format: +; padding [48] [64] padding [128] +; [ ... DDCC AACC ... 
E]
+;
+; Input parameters:
+; - g0: nbuf[2][160].
+; - g1: pred.
+; - g2: pred_stride.
+; - g3: avail[2].
+; - g4: filter_flag.
%%% Missing last argument.
%%% Invalid declaration, bad temporary integer count.
%%%
%%% The register usage needs optimization. I suggest
%%% 1) figure out all the instructions.
%%% 2) do the register coloring.
%%% 3) import the code in a new function.
+DEFFUN f265_lbd_extract_intra_neigh_8_avx2, ia=6, at=884844, ti=6, tv=9, ym=1
+
+    ; Test for special case: no left neighbours.
+    cmp            dword [g3+4], 0
+    jz             .LEFT_NOT_AVAILABLE
+
+    ; Left neighbours are available.
+
+    ; Get C and D from the prediction buffer.
+    ; Pseudo-code:
+    ; - Load & broadcast as dword the left neighbour of each row.
+    ; - Blend the rows together.
+    ; - Keep in mind the order needs to be inverted.
+
+    ; Get left neighbours (C) from the prediction buffer.
+
+    ; Load 2 left neighbours and blend them together.
+    ; Update the offset for continuous calling.
+    ; Require the following setup:
+    ; g1: Prediction source, aligned with the top-left CB pixel.
+    ; g6: Even row offset. Start at 0 for the first left neighbour.
+    ; g7: Odd row offset. Start at pred_stride for the second neighbour.
+    ; g8: The offset between 2 rows. Should be 2*pred_stride.
%%% Suboptimal.
%%% Useless adds for the last 2 rows.
%%% Do lea gX, [g1-4] once.
%%% Do lea gY, [pred_stride*3] once.
%%% Do your loads as follow:
%%% - [gX]
%%% - [gX+pred_stride]
%%% - [gX+2*pred_stride]
%%% - [gX+gY]
%%% - (conditionally to pass to next 4-pixel block) lea gX, [gX+4*pred_stride]
+    %macro load 2                      ; %1: the xmm register in which to save the value, %2: a work register.
+    vpbroadcastd   %1, [g1-4+g6]       ; Load & broadcast the left neighbour.
+    vpbroadcastd   %2, [g1-4+g7]       ; Load & broadcast the next left neighbour.
+    add            g6, g8              ; Update the offset of even row.
+    add            g7, g8              ; Update the offset of odd row.
+
+    vpblendd       %1, %1, %2, 0b0101_0101 ; Mix even and odd row: result 1 0 1 0.
+    %endmacro
+
%%% Load that just once.
+    vpbroadcastd   x8, [neigh_last_b_of_d] ; Load shuffle mask.
%%% Delete this.
+    xor            g6, g6
+    mov            g7, g2
+    mov            g8, g2
+    add            g8, g2
+
+    load x0, x5                        ; Load C0 and C1. Result 1 0 1 0.
+    load x1, x5                        ; Load C2 and C3. Result 3 2 3 2.
+    vpblendd       x0, x0, x1, 0b0011_0011 ; Mix 1 0 and 3 2. Result 3 2 1 0.
+    vpshufb        x0, x8              ; Keep the last byte of each dword.
+
+    load x3, x5                        ; Load C4 and C5. Result 5 4 5 4.
+    load x4, x5                        ; Load C6 and C7. Result 7 6 7 6.
+    vpblendd       x3, x3, x4, 0b0011_0011 ; Mix 5 4 and 7 6. Result 7 6 5 4.
+    vpshufb        x3, x8              ; Keep the last byte of each dword.
+
+    vpblendw       x0, x0, x3, 0b0011_0011 ; [7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0].
+
+    ; Special case: no top neighbours.
+    cmp            dword [g3], 0
+    jz             .TOP_NOT_AVAILABLE
+
+    ; Load top (A and B) neighbours from pred.
+    neg            g2                  ; Move up 1 row (negative pred_stride).
+    vmovdqu        x1, [g1+g2]         ; Load A|B from prediction.
%%% vmovd.
+    vpinsrb        x2, [g1+g2-1], 0    ; Load top-left (E).
+    neg            g2                  ; Return it to a positive value.
+
+    .LEFT_AND_TOP_FETCHED:
%%% Why? You re-do it below.
+    ; Save current values.
+    vmovdqu        [g0+48], x0
+    vmovdqu        [g0+64], x1
+    vmovd          [g0+128], x2
+
+    ; Test if bottom-left is available.
+    cmp            dword [g3+4], 8
+    jae            .BOTTOM_AVAILABLE
+
+    ; Bottom-left not available.
+    vpshufb        x0, [neig_bl_unav_8] ; Expand the last value.
+
+    .BOTTOM_FETCHED:
%%% Dupe store with above (this one is correct).
+    vmovdqu        [g0+48], x0         ; Save partial top and left to allow easy byte extraction.
+    vmovdqu        [g0+64], x1
%%% Do that right at function entry.
%%% Declare your registers so you track where goes where.
+    movzx          g5, byte [g3]       ; Load availx.
+    movzx          g6, byte [g3+4]     ; Load -availy.
+    neg            g6
+
+    vpbroadcastb   x4, [g0+63+g5]      ; Broadcast the last available block.
%%% Add movq.
+    vpbroadcastb   x3, [g0+64+g6]      ;
%%% Can be made faster.
%%% movd ytmp, g5
%%% vpbroadcastb ytmp, ytmp.            ; Broadcast the number of pixels available.
%%% ypcmpgtb ytmp, [0 1 2 3 4 5 6 7 ... ] ; 0xff for pixels that need replacement.
%%% vpblendvb                           ; Replace (figure out the register order).
+    mov            g7, 16              ; Clip number of valid data to 16.
+    cmp            g5, g7
+    cmovg          g5, g7
+    neg            g5                  ; Invert the number available to match the blend mask.
%%% The case availx=12 almost never happens in practice. This wastes time.
%%% Handle with BOTTOM_4_AVAILABLE branch from BOTTOM_AVAILABLE below.
+    neg            g6
+    mov            g7, 16              ; Clip number of valid data to 16.
+    cmp            g6, g7
+    cmova          g6, g7
+
+    lea            g7, [neigh_avail_msk] ; Load the label address first (RIP-relative, PIC-safe).
+    vmovdqu        x5, [g7+32+g5]      ; Load blend mask. A negative offset will determine
+                                       ; the number of valid neighbours.
+    vpblendvb      x1, x1, x4, x5      ; Replace (blend) invalid values with the broadcasted last valid values.
+
+    vmovdqu        x5, [g7+16+g6]      ; Load blend mask. A positive offset will determine
+                                       ; the number of valid neighbours.
+    vpblendvb      x0, x3, x0, x5      ; Replace (blend) invalid values with the broadcasted last valid values.
%%% Alas, no. Not aligned.
+    vinserti128    y0, y0, x1, 1       ; Save value as a single register to maximize write forwarding.
+    vmovdqu        [g0+48], y0
+
+    ; Filter only if required (test filter_flag).
+    cmp            g4, 0
+    je             .END
+
+    ; Pseudo-code:
+    ; Register ordering: D7, D6 ... D0, C7, ... C0, E, A0, ..., A7, B0, ... B6, B7.
+    ; V[i] = (V[i-1] + 2*V[i] + V[i+1] + 2) >> 2
+    ; D7 = D7, B7 = B7
+    vpbroadcastd   y8, [pat_1b]        ; Load pmadd pattern (actually, just an add and zero extend).
+    vpbroadcastd   y7, [pat_2w]        ; Load rounding bias.
+    vmovdqu        y6, [neigh_1_of_2]  ; Load unpack pattern (select 1 byte from each register).
%%% Get rid of that.
+    vmovdqu        y5, [blend_extremity] ; Load exception blend mask.
+
+    vpslldq        x4, x2, 15          ; Move the top-left (E) to the last byte of the xmm register.
+    vpalignr       x3, x2, x0, 1       ; Remove D7 and insert E next to C0.
+                                       ; All bytes are shifted by one. Named D|C*.
+    vpalignr       x4, x1, x4, 15      ; Remove B7 and insert E next to A0.
+                                       ; All bytes are shifted by one. Named A|B*.
+
+    vinserti128    y0, y0, x1, 1       ; Pack D|C with A|B.
+    vinserti128    y3, y3, x4, 1       ; Pack D|C* with A|B*.
+
+    vpmaddubsw     y0, y0, y8          ; Add the neighbours together.
+    vpmaddubsw     y3, y3, y8          ; As D|C|A|B* is D|C|A|B offsetted by one byte, this will generate all
+                                       ; D|C and A|B pairs. The innermost values of D|C|A|B* will be C0+E and E+A0.
+
+    vpaddw         y1, y0, y3          ; Add D|C|A|B to D|C|A|B*.
+    vpaddw         y1, y1, y7          ; Add rounding bias.
+    vpsrlw         y1, 2               ; Round.
%%% Use a 32-byte pattern instead, this is too expensive.
+    vpalignr       y2, y3, y3, 14      ; Shift the words to add adjacent pairs together.
+    vpalignr       y3, y3, y3, 2       ; The high and low lanes must be shifted in different directions,
+                                       ; since E was added at the start of the first lane and the end of the second lane.
+    vpblendd       y3, y3, y2, 0x0F    ; Merge the shifted results together.
+    vpaddw         y0, y3, y0          ; Generate the other half of each quartet.
+    vpaddw         y0, y0, y7          ; Add rounding bias.
+    vpsrlw         y0, 2               ; Round.
+
+    vpackuswb      y0, y0, y1          ; Word to byte.
+    vpshufb        y0, y6              ; Interleave the result.
+
+    vpinsrb        x1, [g0+48], 0      ; Manage D7.
+    vpinsrb        x1, [g0+79], 15     ; Manage B7.
%%% Won't be needed anymore since you'll use two stores.
+    vinserti128    y1, y1, x1, 1       ; Copy to the high lane.
+    vpblendvb      y0, y1, y5          ; Replace invalid results by the valid D7 and B7.
%%% Unaligned store.
+    vmovdqu        [g0+160+48], y0     ; Save it.
+
+    ; Filter top-left.
+    movzx          g2, byte [g0+128]   ; Load top-left.
+    movzx          g3, byte [g0+63]    ; Load top.
+    movzx          g4, byte [g0+64]    ; Load left.
%%% LEA (top_left*2+right+bias).
+    add            g2, g2              ; Top-left * 2.
+    add            g3, g4              ; A0 + C0.
+    add            g2, g3              ; A0 + top-left * 2 + C0.
+    add            g2, 2               ; Add rounding bias.
+    shr            g2, 2               ; Round.
+    mov            [g0+160+128], g2b   ; Save filtered top-left.
%%% Use RET.
+    jmp            .END
+
+    .LEFT_NOT_AVAILABLE:
+
+    ; Test if top is available.
+    cmp            dword [g3], 0
+    jz             .NOTHING_AVAILABLE
%%% Those double negs should be avoided.
+    neg            g2
+    vmovdqu        x1, [g1+g2]         ; Load top value.
+    neg            g2
%%% vpbroadcastb does the job.
+    vpxor          x2, x2              ;
+    vpshufb        x0, x1, x2          ; Broadcast the first byte as the left value.
+    vmovdqa        x2, x1              ; Set top-left.
+    jmp            .LEFT_AND_TOP_FETCHED
+
+    .TOP_NOT_AVAILABLE:
+    vpbroadcastd   x2, [pat_15b]
+    vpshufb        x1, x0, x2          ; Replicate C0 as the top neighbours.
+    vmovdqa        x2, x1              ; Set top-left.
+    jmp            .LEFT_AND_TOP_FETCHED
+
+
+    .BOTTOM_AVAILABLE:
+    ; Get D from the prediction buffer.
%%% Apply fixes as above. Don't recompute variables that you already set up before.
+    vpbroadcastd   x8, [neigh_last_b_of_d] ; Load shuffle mask.
+
+    ; Init macro registers.
+    mov            g6, g2              ; Copy pred_stride.
+    mov            g7, g2
+    mov            g8, g2              ;
+    shl            g6, 3               ; Start at the 8th row.
+    shl            g7, 3
+    add            g7, g2              ; Add the odd row offset.
+    add            g8, g2              ; Increment 2 rows at a time.
+
+    load x6, x5                        ; Load D0 and D1. Result 9 8 9 8.
+    load x3, x5                        ; Load D2 and D3. Result b a b a.
+    vpblendd       x6, x6, x3, 0b0011_0011 ; Mix 9 8 and b a. Result 3 2 1 0.
+    vpshufb        x6, x8              ; Keep the last byte of each dword.
+
+    load x3, x5                        ; Load D4 and D5. Result d c d c.
+    load x4, x5                        ; Load D6 and D7. Result f e f e.
+    vpblendd       x3, x3, x4, 0b0011_0011 ; Mix d c and f e. Result f e d c.
+    vpshufb        x3, x8              ;
+
+    vpblendw       x6, x6, x3, 0b0011_0011 ; [f e d c b a 9 8 f e d c b a 9 8].
+
+    ; Merge C and D.
+    vpblendd       x0, x0, x6, 0b0011_0011 ; [f e d c b a 9 8 7 6 5 4 3 2 1 0].
+
+    jmp            .BOTTOM_FETCHED
+
+
+    .NOTHING_AVAILABLE:
+    vpbroadcastd   y0, [pat_128b]      ; Store 128 everywhere.
+    vmovdqa        x1, x0              ;
+    vmovdqa        x2, x0              ;
+
+    vmovdqu        [g0+48], y0         ; Save it.
+    vmovd          [g0+128], x0        ;
+    vmovdqu        [g0+160+48], y0     ; Save the filtered version.
+    vmovd          [g0+160+128], x0    ;
+
+    .END:
+    %unmacro load 2
+    RET
