Re: [f265 dev team] Intra prediction 8x8 asembly

Laurent Birtz Mon, 28 Jul 2014 10:23:47 -0700

Review for the asm part. Search for %%%.

Regards,
Laurent

%%% Use ga to reduce temp count.
%%%
%%% All loads of the form [some_label+reg] are not PIC, i.e. they are not legal.
%%% RIP-relative code: [rip + constant]. [rip + reg] doesn't exist.
%%% The linker patches the non-PIC loads at run time.
%%% Fix all such loads by first loading the label with lea reg, [label] 
%%% followed by the load you want to do with the label.
diff --git a/f265/asm/avx2/intra.asm b/f265/asm/avx2/intra.asm
new file mode 100644
index 0000000..d3ad1b9
--- /dev/null
+++ b/f265/asm/avx2/intra.asm
@@ -0,0 +1,1068 @@
%%% inverted or inversed, pick one and stick to it.
+; Copyright (c) 2014, VANTRIX CORPORATION. All rights reserved. See LICENSE.txt
+; for the full license text.
+
+%include "x86inc.asm"
+
+section .data
%%% You violate your alignment directives.
%%% Divide your patterns by alignment under the proper directives.
+align 32
+
%%% vertical.
%%% The comment is unclear. Replace by
%%% Shuffle pattern to regroup the left and top-right pixels together
%%% for rows 0/2, 1/3, 4/6, 5/7.
%%% planar_8_left.
+planar_8:           db  14,15, 12,13, 6,7, 4,5, 0,0,0,0,0,0,0,0,  ; Row index, shuffled to do 2 rows at the time.
+                    db  10,11,  8, 9, 2,3, 0,1, 0,0,0,0,0,0,0,0,  ; Used to get the vertival value in planar.
+
+angle_mul_ver:      dw  1,2,5,6, 0,0,0,0,       ; Row index, shuffled to do 2 rows at the time.
%%% weight.
+                    dw  3,4,7,8, 0,0,0,0,       ; Used to get the weigh and offset of each row on vertical angles.
+
%%% weight.
+angle_mul_hor:      dw  1,2,3,4,5,6,7,8,        ; Row index. Used to get the weigh and offset of each row on
+                                                ; horizontal angles.
+
%%% multiplier.
+angle_inv_mul_hor:  dw  0, 1, 2, 3, 4, 5, 6, 7, ; Mutiplayer for inv_angle_8 on horizontal angles.
+angle_inv_mul_ver:  dw  7, 6, 5, 4, 3, 2, 1, 0, ; Mutiplayer for inv_angle_8 on vertical angles.
+
+dia_bot_left_8:     db  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ; Invert byte order.
+
+; Repeat values on a whole 8x8 row. Inversed for use in pure horizontal.
%%% hor.
+ang_hoz_8:          db  3, 3, 3, 3, 3, 3, 3, 3,
+                    db  2, 2, 2, 2, 2, 2, 2, 2,
+                    db  1, 1, 1, 1, 1, 1, 1, 1,
+                    db  0, 0, 0, 0, 0, 0, 0, 0,
+
%%% neighbour pairs.
+; Pshufb pattern to generate neighbours pair.
+pair_low:           db  0,1, 1,2, 2,3, 3,4, 4,5, 5,6, 6,7, 7,8,
+pair_high:          db  7,8, 8,9, 9,10, 10,11, 11,12, 12,13, 13,14, 14,15
+
+; Multiply high lane by 3 while keeping the low lane as-is.
%%% Garbage ', ;'.
+triple_last_lane:   db  1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, ;
+                    db  3,0, 3,0, 3,0, 3,0, 3,0, 3,0, 3,0, 3,0, ;
+
+planar_wgt_hor:     db  7, 1,  6, 2,  5, 3,  4, 4,  ; Weight pair, used for planar row weighting.
+                    db  3, 5,  2, 6,  1, 7,  0, 8,
+
+; Manage neighbour filtering edge case.
%%% Remove this stale stuff.
+intra_neigh_ab:     db  15, 0, 1, 15, 6, 7, 8, 9, 15, 15, 15, 15, 15, 15, 15, 15 ; Re-order pixel to fit all
+                                                                                 ; data in 8 bytes.
+intra_neigh_rev_ab: db  0, 15, 15, 15, 15, 15, 15, 4, 5, 15, 15, 15, 15, 15, 15, 15 ; Re-order result to align them to
+                                                                                    ; their real location.
+intra_blend_ab:     db  255, 0, 0, 0, 0, 0, 0, 255, 255, 0, 0, 0, 0, 0, 0, 0 ; Blend edge case with common data.
+
%%% pixels.
+intra_neigh_cd:     db  0, 15, 14, 0, 9, 8, 7, 6, 0, 0, 0, 0, 0, 0, 0, 0, ; Re-order pixel to fit all data in 8 bytes.
%%% results.
+intra_neigh_rev_cd: db  0, 0, 0, 0, 0, 0, 0, 5, 4, 0, 0, 0, 0, 0, 0, 0 ; Re-order result to align them to their real
+                                                                       ; location.
+intra_blend_cd:     db  0, 0, 0, 0, 0, 0, 0, 255, 255, 0, 0, 0, 0, 0, 0, 255    ; Blend edge case with common data.
+
+
+; Seed on which the the neighbours offset of inverted angles are calculated.
%%% words, times.
+; As word (repeated 4 time) for speed-ups.
+inv_angle_8:        db  16, 16, 16, 16
+                    db  19, 19, 19, 19
+                    db  24, 24, 24, 24
+                    db  30, 30, 30, 30
+                    db  39, 39, 39, 39
+                    db  57, 57, 57, 57
+                    db  102, 102, 102, 102
+                    db  255, 255, 255, 255
+
+; Seed on which the angles weights and offsets are calculated.
%%% words, times.
+; As word (repeated 4 time) for speed-ups.
+intra_angle:        db  32, 32, 32, 32
+                    db  26, 26, 26, 26,
+                    db  21, 21, 21, 21,
+                    db  17, 17, 17, 17,
+                    db  13, 13, 13, 13,
+                    db  9, 9, 9, 9,
+                    db  5, 5, 5, 5,
+                    db  2, 2, 2, 2,
+                    db  0, 0, 0, 0,
+                    db  2, 2, 2, 2,
+                    db  5, 5, 5, 5,
+                    db  9, 9, 9, 9,
+                    db  13, 13, 13, 13,
+                    db  17, 17, 17, 17,
+                    db  21, 21, 21, 21,
+                    db  26, 26, 26, 26,         ; TODO: If we want to be more agressive on memory saving,
+                    db  32, 32, 32, 32,         ; this could be cut in half.
%%% The first and last entries are not used. Adjust.
+
+
+; Blend mask. Will be loaded with an offset, allowing partial merging of 2 registers.
%% 0xff lower case (convention).
+neigh_avail_msk:    dq  0, 0, 0, 0, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
+
+
+neig_bl_unav_8:     db  0,0,0,0,0,0,0,0, 0, 1, 2, 3, 4, 5, 6, 7
+neigh_1_of_2:       db  0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15,
+                    db  8, 0, 9, 1, 10, 2, 11, 3, 12, 4, 13, 5, 14, 6, 15, 7,
+blend_extremity:    db  255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    db  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 255,
+; Pattern used as mask, bias, offset, ...
+; As double to use the more efficient vpbroadcastd.
+neigh_last_b_of_d:  db  3, 7, 11, 15,
%%% Do it like 0_1b below, this is hard to read.
+pat_1_0b:           dw  256, 256,
+pat_32w:            dw  32, 32,
+pat_31w:            dw  31, 31,
+pat_16w:            dw  16, 16,
+pat_15b:            db  15, 15, 15, 15,
+pat_128b:           db  128, 128, 128, 128
+pat_14_15b:         db  14,15, 14,15,
+pat_7_8b:           db  7,8, 7,8,
+pat_7b:             db  7,7,7,7
+pat_8w:             dw  8, 8,
+pat_0_1b:           db  0,1, 0,1,
+pat_1b:             db  1,1, 1,1,
%%% lower case (convention).
+pat_255q:           dq  0xFF
+pat_2w:             dw  2, 2,


%%% Remove that junk.
+; Intra 4x4 pattern.
+intra_copy_4:       dd  0x00000000, 0x01010101, 0x00000000,0x00000000,
+                    dd  0x02020202, 0x03030303, 0x00000000,0x00000000,
+
+
+ang_hoz_16:         db  1, 1, 1, 1, 1, 1, 1, 1, ; Repeat values on a whole 16x16 row.
+                    db  1, 1, 1, 1, 1, 1, 1, 1, ; Inversed for use in pure horizontal.
+                    db  0, 0, 0, 0, 0, 0, 0, 0,
+                    db  0, 0, 0, 0, 0, 0, 0, 0,
+
+intra_hor_4:        db  3, 1, 2, 2, 1, 3, 0, 4, ; Horizontal weight for 4x4 planar.
%%% Vertical.
+intra_vert_4:       db  3, 1, 3, 1, 3, 1, 3, 1, ; Vertival weight for 4x4 planar.
+
+pat_4w:             dw  4, 4,                   ; Rounding bias.
+
+section .text
+
+
+; 8x8 intra prediction functions.
%%% There are, intra prediction modes.
+; There is 11 assembly function to cover all 8x8 intra prediction.
+; - Planar and DC.
%%% pure.
+; - 3 Pure diagonals:
+;   - dia_bot_left.
+;   - dia_top_left.
+;   - dia_top_right.
+; - Pure vertical and horizontal.
+; - 4 diagonals:
+;   - hor_bot.
+;   - hor_top.
+;   - vert_left.
+;   - vert_right.
+;
%%% parameters.
+; They all have the same input parameters, although some input parameter may be ignored.
+; - g0:     Destination.
%%% top-right.
+; - g1:     Neighbours. 0 is the bottom-left, 64 is the top left, 65 is the top, 123 is the top right.
%%% Fix that doc, it's wrong.
%%% Mode.
+; - g2:     Angle.
%%% edge.
+; - g3:     Filter egde flag.
+
+
+; Intra DC 8x8.
+DEFFUN f265_lbd_predict_intra_dc_8_avx2, ia=4, at=8844, ti=2, tv=5, ym=1
+    ; Logic:
+    ; Sum all neighbours, except the corners.
%%% samples.
+    ; Divide with bias by the number of sample.
+
+    vpmovzxbw       x1, [g1+56]             ; Load all data.
+    vpmovzxbw       x2, [g1+64]
+
+    vpaddw          y1, y2                  ; Add them together.
+
+    vpalignr        y2, y1, 8               ; At each step, fold the register in 2...
+    vpaddw          y1, y2                  ; ... then add each value together.
+
+    vpalignr        y2, y1, 4
+    vpaddw          y1, y2
+
+    vpalignr        y2, y1, 2
+    vpaddw          y1, y2
+
+    vpaddw          y1, [pat_8w]            ; Load the rounding bias.
+    vpsrlw          y1, y1, 4               ; Divide by 16.
+
+    vpackuswb       y1, y1                  ; Word to byte.
+
+    vpbroadcastb    y1, x1                  ; Replicate the value.
+
+    vmovdqa         y0, y1
+
+    and             g3, 1
+    jz              .SKIP_FILTER
+
+    ; 3 cases:
+    ; - Top left = 2*base + top + left.
+    ; - Top =  3*base + top.
+    ; - Left = 3*base + left.
+
%%% You already did that above, this is wasteful.
+    vpmovzxbw       x2, [g1+64]             ; Load top row.
%%% neighbours.
+    vpmovzxbw       x3, [g1+56]             ; Load left neigbours.
+
+    vpextrb         g2d, x1, 1              ; Extract base.
+
+    mov             g3, g2                  ; Copy base.
%%% LEA.
+    shl             g2, 1                   ; Base * 2.
+    add             g2, 2                   ; Add rounding bias.
+    add             g3, g2                  ; Base * 3 + rounding bias.
+
%%% MOVD.
+    vpinsrw         x4, g3d, 0
+    vpbroadcastw    x4, x4                  ; Broadcast base * 3 + rounding bias.
+
+    movzx           g4, byte [g1+64]        ; Load the first top and left value.
+    movzx           g5, byte [g1+63]
+
%%% Suboptimal. Pack into 1 ymm first.
+    vpaddw          x2, x4                  ; 3 * Base + top row + rounding bias.
+    vpaddw          x3, x4                  ;
+
+    vpsrlw          x2, 2                   ; Divide by 4.
+    vpsrlw          x3, 2                   ;
+
%%% Pack once. Low/top will be already at the correct location.
%%% Use vpermq for left.
+    vpackuswb       x2, x2                  ; Word to byte.
+    vpackuswb       x3, x3                  ;
+    vinserti128     y3, x3, 1
+
+    vpblendd        y0, y2, y0, 0xfc        ; Save in top row.
+
+    vmovdqu         y5, [ang_hoz_8]         ;
+    vpbroadcastq    y2, [pat_255q]          ;
+
+    vpshufb         y6, y3, y5              ; Replicate 8x the 4 lower value.
+    vpblendvb       y1, y1, y6, y2          ; Blend only the first value of each row.
+
%%% Do actually shift, don't align.
+    vpalignr        y3, y3, 4               ; Shift by 4 to do the 4 last rows.
+
+    vpshufb         y6, y3, y5              ; Replicate 8x the 4 lower value.
+    vpblendvb       y0, y0, y6, y2          ; Blend only the first value of each row.
+
+
%%% top-left.
+    ; Do top right.
+    add             g4, g5                  ; Top + left.
+    add             g2, g4                  ; Top + left + 2*base + bias.
+    shr             g2, 2                   ; Get the average.
+
+    vmovdqa         y2, y0
+    vpinsrb         x2, g2b, 0
+    vinserti128     y0, y0, x2, 0
+
+    .SKIP_FILTER:
+
+    vmovdqu         [g0], y0
+    vmovdqu         [g0+0x20], y1           ; Save the value.
+
+    RET
+
+
+; Intra planar 8x8.
%%% Bad declaration. You actually use 9 ymm registers. BUT, you don't need to, check below.
+DEFFUN f265_lbd_predict_intra_planar_8_avx2, ia=4, at=8844, ti=0, tv=8, ym=1
%%% Wrong formula. It's 8-x-1 and co. Wrong neighbour names.
%%% value = ((8-x-1)*left + (8-y-1)*top + (x+1)*top_right + (y+1)*bottom_left + 8) >> 4);
+    ; value = ((8-x)*right_neigh + (8-y)*top_neigh + (x+1)*top_right + (y+1)*bottom_left + 8) >> 4);
+
+    vpbroadcastb    x6, [g1+56-1]           ; Load & broadcast bottom left.
+    vpmovzxbw       x1, [g1+64]             ; Load top neighbours.
+    vpmovzxbw       x6, x6
+
%%% Use vmovq+vpbroadcastb here.
%%% vmovq mem:        p23, latency 3.
%%% vpbroadcastb reg: p5, latency 3 since cross-lane.
%%% vpboradcastb mem: p01 p23 p5, latency 7.
+    vpbroadcastb    y7, [g1+72]             ; Load & broadcast top right.
+
+    vpbroadcastd    y0, [pat_0_1b]          ; Weight distribution pattern.
+
+    vpsllw          y2, y1, 3               ; Top row * 8.
+    vpsubw          y1, y6                  ; Row delta (top neighbour - bottom left).
+
+    vpsubw          y2, y1                  ; Top row * 7 + bottom left.
+
%%% y8 is a bad temp, use y3.
+    vpsllw          y8, y1, 1               ;
+    vpsubw          y6, y2, y8              ; Top row * 5 + 3*bottom left.
+    vinserti128     y2, y2, x6, 1           ; Get row 2 values.
%%% apply. Comment isn't clear.
%%% Double the vertical delta removed at each line.
+    vinserti128     y1, y1, x1, 1           ; Double offset to aply at each line.
+
+    ; Register usage:
+    ; - y1: row delta.
+    ; - y2: row sum.
+
%%% Unaligned load, should be vpbroadcastq.
+    vbroadcasti128  y3, [g1+64-8]           ; Load left column.
+    vpunpcklbw      y3, y7                  ; Merge top right with left col.
+    vpshufb         y3, [planar_8]          ; Shuffle to do 2 columns at a time.
+
%%% weights.
+    vbroadcasti128  y4, [planar_wgt_hor]    ; Load weight.
+    vpbroadcastd    y5, [pat_8w]            ; Load rounding bias.
+
+    ; Register usage:
%%% lower case as you used above.
+    ; - y0: Weight distribution pattern.
%%% row vertical delta.
+    ; - y1: Row delta.
%%% row vertical sum.
+    ; - y2: Row sum.
+    ; - y3: Column values.
%%% weights.
+    ; - y4: Column weight.
%%% Rounding.
+    ; - y5: Rouding bias.
+
%%% Document your arguments.
%%% %macro DO_ROW 2			     ; %1: alignment offset, %2: destination register.
+    %macro DO_ROW 2
+    %if %1 != 0
%%% Add delta to row sum.
+    vpsubw          y2, y1                  ; Add offset to row value.
+    vpalignr        y%2, y3, %1*2           ; Offset column.
%%% Repeat the column.
+    vpshufb         y%2, y%2, y0            ; Repeat the pattern.
+    %else
%%% Repeat the column.
+    vpshufb         y%2, y3, y0             ; Repeat the pattern.
+    %endif
+
%%% factors.
+    vpmaddubsw      y%2, y4                 ; Get the sum of all factor.
+    vpaddusw        y%2, y2                 ; Add vertical.
+    vpaddusw        y%2, y5                 ; Add rounding bias.
+    vpsrlw          y%2, y%2, 4
+    %endmacro
+
+    DO_ROW          0, 6                    ; Do row 0 and 2.
+    DO_ROW          1, 7                    ; Do row 1 and 3.
+
+    vpackuswb       y6, y7
+    vmovdqu         [g0], y6
+
+    vpsubw          y2, y1                  ; Add offset to row value.
+    vpsubw          y2, y1                  ;
+
+    DO_ROW          2, 6                    ; Do row 4 and 6.
+    DO_ROW          3, 7                    ; Do row 5 and 7.
+
+    vpackuswb       y6, y7
+    vmovdqu         [g0+0x20], y6
%%% Align.
+%unmacro DO_ROW 2
+    RET
+
+
+; Intra pure diagonal bottom left 8x8.
+DEFFUN f265_lbd_predict_intra_dia_bot_left_8_avx2, ia=4, at=8844, ti=0, tv=3, ym=1
+
+    vmovdqu         x1, [dia_bot_left_8]    ; Load pattern.
%%% Unnecessary unaligned load. Fix the pattern instead.
+    vmovdqu         x0, [g1+48-1]           ; Load all data.
%%% Use memory operand.
+    vpshufb         y0, y1                  ; Re-order it.
+
%%% Offset the pixels in the high lane to build rows 2 and 3.
+    vpalignr        y1, y0, 2               ; Offset value: will build rows 2 and 3.
+    vinserti128     y0, y0, x1, 1           ;
+
+    vpalignr        y1, y0, 1               ; Create row 1 and 3.
%%% Them.
%%% Use plural for 'row' everywhere.
+    vpunpcklqdq     y2, y0, y1              ; Merge then with row 0 and 2.
%%% 0 to 3.
+    vmovdqu         [g0], y2                ; Save row 1 to 4.
+
+    vpalignr        y1, y0, 5               ; Offset to generate row 4 to 7.
+
+    vpalignr        y0, y0, 4               ; Repeat operation above for row 4 to 7.
+    vpunpcklqdq     y2, y0, y1              ;
+    vmovdqu         [g0+0x20], y2           ;
+    RET
+
+
+; Intra angular horizontal bottom 8x8.
%%% Unify code with horizontal-top with macro.
%%% Apply comments from vertical top-right.
+DEFFUN f265_lbd_predict_intra_hor_bot_8_avx2, ia=4, at=8844, ti=0, tv=10, ym=1
+    vbroadcasti128  y9, [g1+48]             ; Load left column.
+
+    mov             g3, 18                  ; Fix angle.
+    sub             g3, g2
+
+    ; Generate weight and offset.
+    vpbroadcastd    y1, [intra_angle+g3*4]  ; Load angle factor.
+    vbroadcasti128  y2, [angle_mul_hor]     ; Load multiplication table.
+    vpmaddubsw      y1, y1, y2              ; Result in offset and weight for each column.
+
+    ; Generate pairs.
+    vpbroadcastd    y2, [pat_31w]           ; Load weight mask.
+    vpand           y2, y2, y1              ; Extract weight.
+    vpbroadcastd    y0, [pat_32w]           ; Load weight complement base.
+    vpsubw          y0, y2                  ; Get weight complement.
+    vpackuswb       y2, y2, y2              ; Word to byte.
+    vpackuswb       y0, y0, y0              ; Word to byte.
+    vpunpcklbw      y0, y2, y0              ; Make the pair. Final Weight.
+
+    ; Generate neighbours offset pattern.
+    vpbroadcastd    y3, [pat_14_15b]        ; Load base offset pattern.
+    vpsrlw          y1, y1, 5               ; Get the angle offset.

%%% Reduce port 5 pressure.
%%% vpsllw y2, y1, 8
%%% vpor y1, y2
+    vpackuswb       y1, y1, y1              ; Word to byte.
+    vpunpcklbw      y1, y1, y1              ; Double the offset (twice for each pair).

+    vpsubw          y1, y3, y1              ; Add the angle offset to the base offset.
+
+    ; Load patterns.
+    vpbroadcastd    y2, [pat_16w]           ; Load rounding bias.
+    vbroadcasti128  y6, [pair_low]          ; Load the neighbour pair generating pattern.
+
+    ; Calculate the offset for the high lane.
%%% neighbour position offsets.
+    vpbroadcastd    y7, [pat_1b]            ; Load neighbours positions offset.
+    vpsubb          y8, y1, y7              ; Pre-offset by 2 the neighbour position.
%%% rows.
+    vpsubb          y8, y8, y7              ; Will be used to calculate 2 row at once.
%%% offsetted
+    vinserti128     y1, y1, x8, 1           ; Put the offseted load pattern on the high lane.
+
+    ; Predict 2 rows.
%%% row.
+    %macro DO_ROW   2                       ; %1: Row offset, %2: register in which to put the value.
+
+    %if %1 != 0
+    vpsubb          y1, y7                  ; Update neighbours offset.
+    %endif
+
+    vpshufb         y%2, y9, y1             ; Generate neighbour pair.
+
%%% row.
+    ; Calculate rows values.
+    vpmaddubsw      y%2, y%2, y0            ; Multiply with weight.
%%% Extra comma.
+    vpaddw          y%2, y2,                ; Add rounding bias.
+    vpsrlw          y%2, y%2, 5             ; Weighted average.
+    vpackuswb       y%2, y%2                ; Word to byte.
+    %endmacro
+
+    DO_ROW          0, 3                    ; Do row 0 and 2.
+    DO_ROW          1, 4                    ; Do row 1 and 3.
+
+    vpunpcklqdq     y3, y4                  ; Merge value.
+    vmovdqu         [g0+0x00], y3           ; Save result.
+
%%% row 0.
+    vpsubb          y1, y7                  ; Update neighbours offset from row 1 to row 4.
+    vpsubb          y1, y7
+
+    DO_ROW          4, 3                    ; Do row 4 and 6.
+    DO_ROW          5, 4                    ; Do row 5 and 7.
+
+    vpunpcklqdq     y3, y4                  ; Merge value.
+    vmovdqu         [g0+0x20], y3           ; Save result.
+
+    %unmacro DO_ROW 2
+    RET
+
+
%%% Invalid declaration, you don't use temporary integer.
+; Intra pure horizontal 8x8.
+DEFFUN f265_lbd_predict_intra_hor_8_avx2, ia=4, at=8844, ti=3, tv=4, ym=1
+    vmovdqu         y0, [ang_hoz_8]         ; Load shuffle mask.
+
+    vpbroadcastd    y1, [g1+63-3]           ; Load the first 4 rows.
+    vpbroadcastd    y2, [g1+63-7]           ; Load the second 4 rows.
+
%%% times.
+    vpshufb         y1, y1, y0              ; Replicate 8 timae each value.
+    vpshufb         y2, y2, y0              ;
+
+    and             g3, 1
+    jz              .SKIP_FILTER
+
+    vpmovzxbw       x0, [g1+64]             ; Load top row.
%%% Add movq.
+    vpbroadcastb    x3, [g1+128]            ; Load top left.
+
+    vpmovzxbw       x3, x3                  ; Byte to word.
+
+    vpsubw          x0, x3                  ; top - top left.
+    vpsraw          x0, 1                   ; (top - top left)/2.
+
+    vpbroadcastb    x3, [g1+63]             ; Load left.
+    vpmovzxbw       x3, x3                  ; Byte to word.
+    vpaddw          x0, x3                  ; Left + (top - top left)/2.
+
+    vpxor           x3, x3                  ; Replace negative values by 0.
+    vpmaxsw         x0, x3                  ;
+
+    vpackuswb       x0, x0                  ; Word to byte with unsigned saturation.
+
+    vpblendd        y1, y0, y1, 0xfc        ; Update the first 8 bytes.
+
+    .SKIP_FILTER:
+    vmovdqu         [g0], y1                ; Save it.
+    vmovdqu         [g0+0x20], y2           ;
+
+    RET
+
+
+; Intra angular horizontal top 8x8.
+DEFFUN f265_lbd_predict_intra_hor_top_8_avx2, ia=4, at=8844, ti=0, tv=10, ym=1
+    ; Re-order the top neighbour.
%%% No need, factor in the constant in the load below instead (-18*4).
+    mov             g3, 18                  ; Get the angle's inverted offset.
+    sub             g3, g2                  ;

%%% No need, factor in the constant in the load below instead (-2*4).
+    sub             g2, 2                   ; Get the angle's offset.

%%% Unaligned load.
+    vmovdqu         x5, [g1+63]             ; Load top neighbour.
+    vpinsrb         x5, [g1+128], 0         ; Insert the top left neighbour.
+
+    ; Generate weight and offset.
%%% Invalid load.
+    vpbroadcastd    y1, [intra_angle+g2*4]  ; Load angle factor.
+    vbroadcasti128  y2, [angle_mul_hor]     ; Load multiplication table.
+    vpmaddubsw      y1, y1, y2              ; Result in offset and weight.
+    vpbroadcastd    y2, [pat_31w]           ; Load weight mask.
+    vpand           y2, y2, y1              ; Get the weight.
+
+    ; Generate weight pairs.
+    vpbroadcastd    y0, [pat_32w]           ; Load weight complement base.
+    vpsubw          y0, y2                  ; Get weight complements.
+    vpackuswb       y2, y2, y2              ; Word to byte.
+    vpackuswb       y0, y0, y0              ; Word to byte.
%%% weight.
+    vpunpcklbw      y0, y0, y2              ; Make the pair. Final Weight.
+
+    ; Generate offset pattern.
%%% Period.
+    vpbroadcastd    y3, [pat_7_8b]          ; Load the base position pattern
+    vpsrlw          y1, y1, 5               ; Extract neighbour offset.
%%% See the bottom function.
+    vpackuswb       y1, y1, y1              ; To byte.
+    vpunpcklbw      y1, y1, y1              ; Double each value (twice for each pair).
+    vpaddw          y1, y3, y1              ; Add offset with base. Result in actual neighbour position.
+
+    ; Import top neighbour with the left ones.
+    vpbroadcastd    y4, [inv_angle_8+g3*4]  ; Load the inversed angle values.
+    vmovdqu         x3, [angle_inv_mul_hor] ; Load the weight values.
%%% have an invalid offset.
+    vpmaddubsw      y4, y4, y3              ; Get the weight. Some neighbour will give invalid offset.
+                                            ; Since we never read them, it's ok.
+    vpbroadcastd    y3, [pat_8w]            ; Load inversed angle bias.
+    vpaddw          y4, y3                  ; Add inversed angle bias.
+    vpsraw          y4, 4                   ; Get inversed neighbour offset.
+    vpackuswb       y4, y4                  ; Word to byte.
+    vpshufb         y5, y4                  ; Re-order left neighbours.
+
+    ; Load patterns.
+    vmovdqu         x4, [g1+56]             ; Load left data.
+    vpblendd        y5, y4, y5, 0xfc        ; Blend left neighbours with top neighbours.
+    vinserti128     y9, y5, x5, 1           ; Double data.
+
+    ; Update offset for the high lane.
+    vpbroadcastd    y2, [pat_16w]           ; Load rounding bias.
+    vpbroadcastd    y7, [pat_1b]            ; Load neighbours positions offset.
+    vpsubb          y8, y1, y7              ; Pre-offset by 2 the neighbour position.
+    vpsubb          y8, y8, y7              ; Will be used to calculate 2 row at once.
%%% offsetted.
+    vinserti128     y1, y1, x8, 1           ; Put the offseted load pattern on the high lane.
+
+
%%% lower case.
+    %macro DO_ROW   2                       ; %1: Row offset. %2: Register in which to put the value.
+
+    %if %1 != 0
+    vpsubb          y1, y7                  ; Update pair pattern.
+    %endif
+
+    vpshufb         y%2, y9, y1             ; Select needed variables.
+
+    ; Calcultate rows prediction.
+    vpmaddubsw      y%2, y%2, y0            ; Multiply with weight.
+    vpaddw          y%2, y2,                ; Add rounding bias.
+    vpsrlw          y%2, y%2, 5             ; Weighted average.
+    vpackuswb       y%2, y%2                ; Word to byte.
+    %endmacro
+
+    DO_ROW          0, 3                    ; Do row 0 and 2.
+    DO_ROW          1, 4                    ; Do row 1 and 3.
+
+    vpunpcklqdq     y3, y4                  ; Merge value.
+    vmovdqu         [g0+0x00], y3           ; Save result.
+
+    vpsubb          y1, y7                  ; Update pair pattern from row 1 to row 4.
+    vpsubb          y1, y7
+
+    DO_ROW          4, 3                    ; Do row 4 and 6.
+    DO_ROW          5, 4                    ; Do row 5 and 7.
+
+    vpunpcklqdq     y3, y4                  ; Merge value.
+    vmovdqu         [g0+0x20], y3           ; Save result.
+
+    %unmacro DO_ROW 2
+    RET
+
+
+; Intra pure diagonal top left 8x8.
+DEFFUN f265_lbd_predict_intra_dia_top_left_8_avx2, ia=4, at=8844, ti=0, tv=3, ym=1
%%% Unaligned loads. That's not needed.
%%% 1) vmovq top offsetted by 1 (unaligned load but OK since it's 8 bytes).
%%% 2) vpinsrb to insert top-left.
%%% 3) vmovhps to insert left (aligned load).
+    vmovdqu         x0, [g1+64-7]           ; Load all data.
+    vmovdqu         x1, [g1+64-8]           ; Load top row.
+    vpinsrb         x0, [g1+128], 7         ; Load & insert top left.
+
%%% Remove.
+    vpblendd        x0, x1, 0xfc            ; Blend left with top.
+
%%% Your row numbers are wrong.
%%% Fix all the comments flagged in bottom-left.
+    vpalignr        y1, y0, 2               ; Offset value: will build rows 6 and 5.
+    vinserti128     y0, y1, x0, 1           ;
+
+    vpalignr        y1, y0, 1               ; Create row 6 and 4.
+    vpunpcklqdq     y2, y1, y0              ; Merge then with row 0 and 2.
+    vmovdqu         [g0+0x20], y2           ;
+
+    vpalignr        y0, y0, 4               ; Offset to generate row 0 to 3.
+
+    vpalignr        y1, y0, 1               ; Repeat operation above for row 0 to 3.
+    vpunpcklqdq     y2, y1, y0              ;
+    vmovdqu         [g0], y2                ; Save row 0 to 3.
+    RET
+
+
%%% Vertical.
%%% Invalid declaration, you don't use temporary integer.
+; Intra angular vertival left 8x8.
+DEFFUN f265_lbd_predict_intra_vert_left_8_avx2, ia=4, at=8844, ti=2, tv=11, ym=1
%%% Duplicated code with horizontal-top.
+    sub             g2, 18
%%% Unaligned load.
+    vmovdqu         x0, [g1+64-8]           ; Load top left data.
+    vmovdqu         x4, x0                  ; Keep a copy.
%%% top-left.
+    vpinsrb         x0, [g1+128], 8         ; Load top right.
+
%%% neighbours.
+    ; Re-order the left neighbour.
%%% Illegal load.
+    vpbroadcastd    y2, [inv_angle_8+g2*4]  ; Load the inversed angle values.
+    vmovdqu         x3, [angle_inv_mul_ver] ; Load the inversed weight values.
%%% have an invalid offset.
+    vpmaddubsw      y2, y2, y3              ; Get the weight. Some neighbour will give invalid offset.
+                                            ; Since we never use them, it's ok.
+    vpbroadcastd    y3, [pat_8w]            ; Load inversed angle bias.
+    vpaddw          y2, y3                  ; Add inversed angle bias.
+    vpsraw          y2, 4                   ; Get inversed neighbour offset.
+    vpsubb          y2, y3, y2              ; Invert the index.
+    vpackuswb       y2, y2                  ; Word to byte.
+    vpshufb         y0, y2                  ; Re-order left neighbours.
+
%%% re-ordered.
+    ; Blend re-ordened neighbours with the top neighbours.
%%% Don't load top above. Use vmovhps here (will have wrong order) + vpermq to dupe and reorder.
+    vpblendd        y0, y0, y4, 0xfc        ; Merge left neighbours with top neighbours.
+    vinserti128     y10, y0, x0, 1          ; Double top row.
+
%%% ==== Begin copy&paste ===
%%% All the following code is the same as vertical-right, except for
%%% vpunpcklbw register order, pair_low/high, vpaddb/vpsubb.
%%% I think you lost the optimization for angle_mul_ver in vert_right.
%%% The comments mismatch.
%%% Use a macro with vertical-right as the "correct" version.
+    ; Calculate the angle offset base.
%%% Illegal load.
+    vpbroadcastd    y2, [intra_angle+g2*4]  ; Load angle factor.
+    vmovdqa         y8, y2                  ; Keep a copy. Will be used to increment the offset.
+    vpmaddubsw      y7, y2, [triple_last_lane] ; Offset the high lane.
+    vpackuswb       y7, y7                  ; Word to byte.
+
+    ; Calculate the weight.
+    vmovdqu         y3, [angle_mul_ver]     ; Load multiplication table.
+    vpmaddubsw      y2, y3, y2              ; Result in offset|weight.
+    vpbroadcastd    y3, [pat_31w]           ; Load mask.
+    vpand           y3, y3, y2              ; Weight.
+    vpbroadcastd    y4, [pat_32w]           ; Load weight complement base.
+    vpsubw          y4, y3                  ; Get the weight complement.
%%% Final weight.
+    vpunpcklbw      y2, y3, y4              ; Make the pair. Final Weight.
+
+    ; Load patterns.
+    vpbroadcastd    y0, [pat_16w]           ; Load rounding offet.
+    vpbroadcastd    y1, [pat_1_0b]          ; Load weight distribution pattern.
+    vbroadcasti128  y6, [pair_high]         ; Load pair making pattern.
+    vpbroadcastd    y9, [pat_7b]            ; Load "word shift as byte shift" mask pattern.
+
+    ; Register usage :
+    ; - g0: Result array.
+    ; - y0: Rounding bias.
+    ; - y1: Word replication pattern.
+    ; - y2: Weights, distributed to do 2 rows at a time.
+    ; - y3: 2 rows of results ([0|2] then [4|5]).
+    ; - y4: 2 rows of results ([1|3] then [6|7]) .
+    ; - y5: Temp.
+    ; - y6: Generate pair.
+    ; - y7: Angle sum. Used to generate the offset.
+    ; - y8: Angle value. Add it to the sum at each row.
+    ; - y9: "Word shift as byte shift" mask pattern.
+    ; - y10: Top row. Replicated.
+
%%% Register. Lower case, comma.
+    %macro DO_ROW 2                         ; %1: Row offset. %2: Eegister in which to put the value.
+
+    %if %1 != 0
+    vpaddb          y7, y8                  ; Add the angle to the sum. Generate the offset.
+    %endif
+
+    ; Generate the neighbours pairs.
+    vpsrlw          y%2, y7, 5              ; Get the offset.
+    vpand           y%2, y9                 ; Mask to simulate byte shift.
+    vpsubb          y%2, y6, y%2,           ; Generate pair offset.
+    vpshufb         y%2, y10, y%2           ; Shuffle data into pair.
+
+    ; Broadcast the current weights.
+    %if %1 != 0
+    vpalignr        y5, y2, %1*2            ; Get weights.
+    vpshufb         y5, y1                  ; Broadcast weights.
+    %else
+    vpshufb         y5, y2, y1              ; Broadcast weights.
+    %endif
+
%%% row.
+    ; Calculates rows predictions.
+    vpmaddubsw      y%2, y%2, y5            ; Multiply values with weights.
+    vpaddw          y%2, y0,                ; Add rounding bias.
+    vpsrlw          y%2, y%2, 5             ; Weighted average.
+    vpackuswb       y%2, y%2                ; Word to byte.
+    %endmacro
+
+    DO_ROW          0, 3                    ; Do row 0 and 2.
+    DO_ROW          2, 4                    ; Do row 1 and 3.
+
%%% Same error as vertical-right.
+    vpunpcklqdq     y3, y4                  ; Merge value.
+    vmovdqu         [g0+0x00], y3           ; Save result.
+
+    vpaddb          y7, y8                  ; Offset from row 1 to row 4.
+    vpaddb          y7, y8
+
+    DO_ROW          4, 3                    ; Do row 4 and 6.
+    DO_ROW          6, 4                    ; Do row 5 and 7.
+
+    vpunpcklqdq     y3, y4                  ; Merge value.
+    vmovdqu         [g0+0x20], y3           ; Save result.
+
+    %unmacro DO_ROW 2
+    RET
%%% ==== End copy&paste ===
+
+
+; Intra pure vertical 8x8.
%%% Invalid declaration, you don't use temporary integer.
%%% You use 'ver' elsewhere, stick to one abbreviation.
+DEFFUN f265_lbd_predict_intra_vert_8_avx2, ia=4, at=8844, ti=2, tv=7, ym=1
%%% Load overflow (32 bytes when 8 are needed) and useless operation.
%%%  vpbroadcastq    x0, [g1+64]
+    vmovdqu         y0, [g1+64]             ; Load all data.
+    vpbroadcastq    y0, x0                  ; Copy the row 4 times. Holds row 0 to 3.
+    vmovdqa         y6, y0                  ; Copy it. Holds row 4 to 7.
+
+    and             g3, 1
+    jz              .SKIP_FILTER
+
%%% Add movq.
+    vpbroadcastb    x3, [g1+128]            ; Load top left.
%%% You already loaded [g1+64] above.
+    vpbroadcastb    x1, [g1+64]             ; Load top.
%%% neighbours.
+    vpmovzxbw       x2, [g1+64-8]           ; Load left neigh.
+
+    vpmovzxbw       x3, x3                  ; Word to byte.
+    vpmovzxbw       x1, x1
+
+    vpsubw          x2, x3                  ; Left - top left.
+    vpsraw          x2, 1                   ; Signed divide by 2.
+    vpaddw          x2, x1                  ; Top + (left - top left)/2.
+
+    vpxor           x3, x3
+    vpmaxsw         x2, x3                  ; Clip negative value to 0.
+    vpackuswb       x2, x2                  ; Word to byte with unsigned saturation.
+    vinserti128     y2, x2, 1               ; Double the data.
+
%%% replication.
+    vmovdqu         y3, [ang_hoz_8]         ; Load replicatino pattern.
%%% that, blends.
+    vpbroadcastq    y4, [pat_255q]          ; Pattern taht blend in a word out of 8.
%%% See comments in DC function.
+
%%% values.
+    vpshufb         y5, y2, y3              ; Replicate 8x the 4 lower value.
+
+    vpalignr        y2, y2, 4               ; Shift to get the last 4 rows.
+
+    vpblendvb       y6, y5, y4              ; Blend only the first value of each row.
+
+    vpshufb         y5, y2, y3              ; Replicate 8x the 4 lower value.
+    vpblendvb       y0, y5, y4              ; Blend only the first value of each row.
+
+    .SKIP_FILTER:
+    vmovdqu         [g0+0x00], y0           ; Save it.
+    vmovdqu         [g0+0x20], y6           ;
+
+    RET
+
+
+; Intra angular vertical right 8x8.
%%% Invalid declaration, you use y10.
%%% Invalid declaration, you don't use temporary integer.
+DEFFUN f265_lbd_predict_intra_vert_right_8_avx2, ia=4, at=8844, ti=2, tv=10, ym=1
+
+    vbroadcasti128  y10, [g1+64]            ; Load top row.
%%% Broken math. 18 + 14 = 32. Effective angle range is 0-34.
%%% See comments in horizontal case.
+    sub             g2, 18                  ; Bring the angle within the range 0 to 14.
%%% cases.
+                                            ; 0,7 and 14 are special case with their own function.
+
%%% Illegal load.
+    vpbroadcastd    y2, [intra_angle+g2*4]  ; Load angle factor.
%%% to increment the sum at every row.
%%% Suboptimal. Make vpbroadcastd destination y8 directly.
+    vmovdqa         y8, y2                  ; Keep a copy by which the sum is incremented at every row.
+
%%% rows.
+    vpmaddubsw      y7, y2, [triple_last_lane]  ; Multiply high lane by 3. Offset required to do 2 row at the time.
+    vpackuswb       y7, y7                  ; This is the angle sum for each row.
+
%%% Offset and weight for all rows.
+    vpmaddubsw      y2, y2, [angle_mul_ver] ; Result in offset and weight for all rows.
+    vpbroadcastd    y3, [pat_31w]           ; Load mask.
+    vpand           y3, y3, y2              ; Keep only weight.
+
+    vpbroadcastd    y4, [pat_32w]           ; Load weight complement base.
+    vpsubw          y4, y3                  ; Get weight complement.
%%% weight.
+    vpunpcklbw      y2, y4, y3              ; Make the pair. Final Weight.
+
+    ; Load all masks.
+    vpbroadcastd    y0, [pat_16w]           ; Load rounding bias.
+    vpbroadcastd    y1, [pat_1_0b]          ; Load weight distribution pattern.
+    vbroadcasti128  y6, [pair_low]          ; Load pair making pattern.
+    vpbroadcastd    y9, [pat_7b]            ; Load "word shift as byte shift" mask pattern.
+
+    ; Register usage.
+    ; - g0: Result array.
+    ; - y0: Rounding bias.
+    ; - y1: Word replication pattern.
+    ; - y2: Weights, distributed to do 2 rows at a time.
+    ; - y3: 2 rows of results [0|2].
+    ; - y4: 2 rows of results [1|3].
+    ; - y5: Temp.
+    ; - y6: Generate pair.
+    ; - y7: Angle sum. Used to generate the offset.
+    ; - y8: Angle value. Add it to the sum at each row.
+    ; - y9: "Word shift as byte shift" mask pattern.
+    ; - y10: Top row. Replicated.
+
%%% lower case, comma.
+    %macro DO_ROW 2                         ; %1: Row offset. %2: Register in which to put the value.
+
+    %if %1 != 0
+    vpaddb          y7, y8                  ; Add the angle to the current angle sum.
+    %endif
+
+    vpsrlw          y%2, y7, 5              ; Generate neighbour offset.
+    vpand           y%2, y9                 ; Shift can only be on word or greater value. Mask to simulate byte shift.
+    vpaddb          y%2, y6                 ; Add offset to pairing mask.
+    vpshufb         y%2, y10, y%2           ; Generate pair.
+
+    %if %1 != 0
+    vpalignr        y5, y2, %1*2            ; Get weight.
+    vpshufb         y5, y1                  ; Broadcast weight.
+    %else
+    vpshufb         y5, y2, y1              ; Broadcast weight.
+    %endif
+
+    vpmaddubsw      y%2, y%2, y5            ; Multiply values with weight.
+    vpaddw          y%2, y0,                ; Add rounding bias.
+    vpsrlw          y%2, y%2, 5             ; Weighted average.
+    vpackuswb       y%2, y%2                ; Word to byte.
+    %endmacro
+
+    DO_ROW          0, 3                    ; Do row 0 and 2.
+    DO_ROW          2, 4                    ; Do row 1 and 3.
+
%%% Suboptimal. Do that using the last pack from DO_ROW.
+    vpunpcklqdq     y3, y4                  ; Merge value.
+    vmovdqu         [g0+0x00], y3           ; Save result.
+
%%% row 0?
+    vpaddb          y7, y8                  ; Skip from row 1 to row 4.
+    vpaddb          y7, y8                  ;
+
+    DO_ROW          4, 3                    ; Do row 4 and 6.
+    DO_ROW          6, 4                    ; Do row 5 and 7.
+
+    vpunpcklqdq     y3, y4                  ; Merge value.
+    vmovdqu         [g0+0x20], y3           ; Save result.
+
+    %unmacro DO_ROW 2
+    RET
+
+
+; Intra angular top right 8x8.
+DEFFUN f265_lbd_predict_intra_dia_top_right_8_avx2, ia=4, at=8844, ti=0, tv=3, ym=1
%%% Note: unaligned load. To be investigated for performance when neighbour function is fixed.
%%% Apply comments from the other functions.
+    vmovdqu         x0, [g1+65]             ; Load all data.
+
+    vpalignr        y1, y0, 2               ; Offset value: will build rows 2 and 3.
%%% offsetted.
+    vinserti128     y0, y0, x1, 1           ; Push offseted value in high lane.
+
+    vpalignr        y1, y0, 1               ; Create rows 1 and 3.
+    vpunpcklqdq     y2, y0, y1              ; Merge them with rows 0 and 2.
+    vmovdqu         [g0], y2                ; Save rows 1 to 4.
+
%%% 0 to 3.
+    vpalignr        y1, y0, 5               ; Offset to generate rows 4 to 7.
+
+    vpalignr        y0, y0, 4               ; Repeat operation above for rows 4 to 7.
+    vpunpcklqdq     y2, y0, y1              ;
+    vmovdqu         [g0+0x20], y2           ;
+
+    RET
+
+
%%% Remove the first line, not needed.
+; Intra Extract neighbours.
+; Extract and filter neighbours for intra prediction.
+;
+; Input format:
+; EAABB
+; C
+; C
+; D
+; D
+;
+; Output format:
+;   padding   [48]  [64]  padding [128]
+; [ ...       DDCC  AACC  ...     E]
+;
+; Input parameters:
+; - g0: nbuf[2][160].
%%% pred.
+; - g1: *pred.
+; - g2: pred_stride.
+; - g3: avail[2].
+; - g4: filter_flag.
%%% Missing last argument.

%%% Invalid declaration, bad temporary integer count.
%%% Bad ymm names, I didn't check for Y count. Verify.
%%%
%%% The rgister usage needs optimization. I suggest
%%% 1) figure out all the instructions.
%%% 2) do the register coloring.
%%% 3) import the code in a new function.
+DEFFUN f265_lbd_extract_intra_neigh_8_avx2, ia=6, at=884844, ti=6, tv=9, ym=1
+
+    ; Test for special case: no left neighbours.
+    cmp             dword [g3+4], 0
+    jz              .LEFT_NOT_AVAILABLE
+
%%% neighbours, available.
+    ; Left neighbour are availables.
+
+    ; Get C and D from the prediction buffer.
+    ; Pseudo-code:
%%% double -> dword.
+    ; - Load & broadcast as double the left neighbour of each row.
%%% Blend the rows together.
+    ; - Blend each row together.
%%% needs
+    ; - Keep in mind the order need to be inversed.
+
%%% prediction
+    ; Get left neighbours (C) from the pred buffer.
+
+    ; Load 2 left neighbours and blend them together.
+    ; Update the offset for continuous calling.
+    ; Require the following setup:
%%% CB.
+    ; g1: Prediction source, aligned with the top left cb pixel.
%%% neighbour.
+    ; g6: Even row offset. Start at 0 for the first left neighbours.
%%% neighbour.
%%% at
+    ; g7: Odd row offset. Start ar pred_stride for the second neighbours.
+    ; g8: The offset between 2 rows. Should be 2*pred_stride.
%%% lower case, comma, period.
+    %macro load 2                           ; %1: the xmm register in which to save the value. %2: A work register
%%% Suboptimal.
%%% Useless adds for the last 2 rows.
%%% Do lea gX, [g1-4] once.
%%% Do lea gY, [pred_stride*3] once.
%%% Do your loads as follow:
%%% - [gX]
%%% - [gX+pred_stride]
%%% - [gX+2*pred_stride]
%%% - [gX+gY]
%%% - (conditionally to pass to next 4-pixel block) lea gX, [gx+4*pred_stride]
+    vpbroadcastd    %1, [g1-4+g6]           ; Load & broadcast the left neighbour.
%%% period.
+    vpbroadcastd    %2, [g1-4+g7]           ; Load & broadcast the next left neighbour,
+    add             g6, g8                  ; Update the offset of even row.
+    add             g7, g8                  ; Update the offset of odd row.
+
+    vpblendd        %1, %1, %2, 0b0101_0101 ; Mix even and odd row: result 1 0 1 0.
+    %endmacro
+
%%% Load that just once.
+    vpbroadcastd    x8, [neigh_last_b_of_d] ; Load suffle mask.

%%% Delete this.
+    xor             g6, g6
+    mov             g7, g2
+    mov             g8, g2
+    add             g8, g2

+
+    load            x0, x5                  ; Load C0 and C1. Result 1 0 1 0.
+    load            x1, x5                  ; Load C2 and C3. Result 3 2 3 2.
+    vpblendd        x0, x0, x1, 0b0011_0011 ; Mix 1 0 and 3 2. Result 3 2 1 0.
+    vpshufb         x0, x8                  ; Keep the last byte of each dword.
+
+    load            x3, x5                  ; Load C4 and C5. Result 5 4 5 4.
+    load            x4, x5                  ; Load C6 and C7. Result 7 6 7 6.
+    vpblendd        x3, x3, x4, 0b0011_0011 ; Mix 5 4 and 7 6. Result 7 6 5 4.
+    vpshufb         x3, x8                  ; Keep the last byte of each dword.
+
+    vpblendw        x0, x0, x3, 0b0011_0011 ; [7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0].
+
+    ; Special case: no top neighbours.
+    cmp             dword [g3], 0
+    jz              .TOP_NOT_AVAILABLE
+
+    ; Load top (A and B) neighbour from pred.
+    neg             g2                      ; Move up 1 row (negative pred_stride).
+    vmovdqu         x1, [g1+g2]             ; Load A|B from prediction.
%%% vmovd.
+    vpinsrb         x2, [g1+g2-1], 0        ; Load top left (E).
+    neg             g2                      ; Return it to a positive value.
+
+    .LEFT_AND_TOP_FETCHED:
+
%%% values.
%%% Why? You re-do it below.
+    ; Save current value.
+    vmovdqu         [g0+48], x0
+    vmovdqu         [g0+64], x1
+    vmovd           [g0+128], x2
+
+    ; Test if bottom left is available.
+    cmp             dword [g3+4], 8
+    jae             .BOTTOM_AVAILABLE
+
%%% Bottom-left.
+    ; Bottom left not available.
+    vpshufb         x0, [neig_bl_unav_8]    ; Expand the last value.
+
+    .BOTTOM_FETCHED:
+
%%% Dupe store with above (this one is correct).
+    vmovdqu         [g0+48], x0             ; Save partial top and left to allow easy byte extraction.
+    vmovdqu         [g0+64], x1
+
%%% Do that right at function entry.
%%% Declare your registers so you track where goes where.
+    movzx           g5, byte [g3]           ; Load availx.
+    movzx           g6, byte [g3+4]         ; Load -availy.
+    neg             g6
+
+    vpbroadcastb    x4, [g0+63+g5]          ; Broadcast the last available block.
%%% Add movq.
+    vpbroadcastb    x3, [g0+64+g6]          ;
+
%%% number, replace everywhere.
%%% Can be made faster.
%%% movd ytmp, g5
%%% vpbroadcastb ytmp, ytmp.		     ; Broadcast the number of pixels available.
%%% ypcmpgtb ytmp, [0 1 2 3 4 5 6 7 ... ]    ; 0xff for pixels that need replacement.
%%% vpblendvb           		     ; Replace (figure out the register order).

+    mov             g7, 16                  ; Clip nb of valid data to 16.
+    cmp             g5, g7
+    cmovg           g5, g7
+    neg             g5                      ; Invert nb avail to match blend mask.
+
%%% The case availx=12 almost never happens in practice. This wastes time.
%%% Handle with BOTTOM_4_AVAILABLE branch from BOTTOM_AVAILABLE below.
+    neg             g6
+    mov             g7, 16                  ; Clip nb of valid data to 16.
+    cmp             g6, g7
+    cmova           g6, g7
+
%%% Invalid load.
+    vmovdqu         x5, [neigh_avail_msk+32+g5] ; Load blend mask. A negative offset will determine
+                                                ; the nb of valid neighbours,
+    vpblendvb       x1, x1, x4, x5          ; Replace (blend) invalid value with the broadcasted last valid values.
+
+    vmovdqu         x5, [neigh_avail_msk+16+g6] ; Load blend mask. A positive offset will determine
+                                                ; the nb of valid neighbours,
+    vpblendvb       x0, x3, x0, x5          ; Replace (blend) invalid value with the broadcasted last valid values.
+
%%% Alas, no. Not aligned.
+    vinserti128     y0, y0, x1, 1           ; Save value has a single register to maximize write forwarding.
+    vmovdqu         [g0+48], y0
+
+    ; Filter only if required (Test )
+    cmp             g4, 0
+    je              .END
+
+    ; Pseudo code:
+    ; Register ordering : D7, D6 ... D0, C7, ... C0, E, A0, ..., A7, B0, ... B6, B7.
+    ; V[i] = (V[i-1] + 2*V[i] + V[i+1] + 2) >> 2
+    ; D7 = D7, B7 = B7
+
%%% Wrong register names.
+    vpbroadcastd    ymm8, [pat_1b]          ; Load pmadd pattern (actually, just an add and zero extend).
+    vpbroadcastd    ymm7, [pat_2w]          ; Load rounding bias.
+    vmovdqu         ymm6, [neigh_1_of_2]    ; Load unpack pattern (select 1 byte from each register).
%%% Get rid of that.
+    vmovdqu         ymm5, [blend_extremity] ; Load exception blend mask.
+
+    vpslldq         xmm4, xmm2, 15          ; Move the top left (e) to the last byte of the xmm register.
+    vpalignr        xmm3, xmm2, xmm0, 1     ; Remove D7 and insert E next to C0.
+                                            ; All bytes are shifted by one. Named D|C*.
+    vpalignr        xmm4, xmm1, xmm4, 15    ; Remove B7 and insert E next to A0.
%%% Typo.
+                                            ; All byte sare shifted by one. Named A|B*.
+
+    vinserti128     ymm0, ymm0, xmm1, 1     ; Pack D|C with A|B.
+    vinserti128     ymm3, ymm3, xmm4, 1     ; Pack D|C* with A|B*.
+
%%% Add the neighbours together.
+    vpmaddubsw      ymm0, ymm0, ymm8        ; Add each neighbour together.
%%% Re-read, typos, no period, pairs, innermost, will be.
+    vpmaddubsw      ymm3, ymm3, ymm8        ; As D|C|AB* is DC|A|B offsetted by one byte, this will generate all.
+                                            ; D|C and A|B peer. The inner most value of D|C|A|B* will by C0+E and E+A0.
+
+    vpaddw          ymm1, ymm0, ymm3        ; Add D|C|A|B to D|C|A|B*.
+    vpaddw          ymm1, ymm1, ymm7        ; Add rounding bias.
+    vpsrlw          ymm1, 2                 ; Round.
+

%%% Use a 32-byte pattern instead, this is too expensive.
%%% words, adjacent pairs together.
+    vpalignr        ymm2, ymm3, ymm3, 14    ; Shift the word to add adjacent pair to together.
%%% lanes, different directions, since, first lane, second lane.
+    vpalignr        ymm3, ymm3, ymm3, 2     ; The high and low lane must be shifted in defferent direction
+                                            ; Since E was added at the start of the first and end of the second.
%%% Typo.
+    vpblendd        ymm3, ymm3, ymm2, 0x0F  ; Merge togethershifted result.

%%% Typo / I don't understand.
+    vpaddw          ymm0, ymm3, ymm0        ; Generate the other half of quatuor.
+    vpaddw          ymm0, ymm0, ymm7        ; Add rounding bias.
+    vpsrlw          ymm0, 2                 ; Round.
+
+    vpackuswb       ymm0, ymm0, ymm1        ; Word to byte.
+    vpshufb         ymm0, ymm6              ; Interleave the result.
+
+    vpinsrb         xmm1, [g0+48], 0        ; Manage D7.
+    vpinsrb         xmm1, [g0+79], 15       ; Manage B7.
%%% Won't be needed anymore since you'll use two stores.
+    vinserti128     ymm1, ymm1, xmm1, 1     ; Copy to the high lane.
%%% Period.
+    vpblendvb       ymm0, ymm1, ymm5        ; Replace invalid result by the valid D7 and B7
+
%%% Unaligned store.
+    vmovdqu         [g0+160+48], ymm0       ; Save it.
+
%% top-left.
+    ; Filter top left.
+    movzx           g2, byte [g0+128]       ; Load top left.
+    movzx           g3, byte [g0+63]        ; Load top.
+    movzx           g4, byte [g0+64]        ; Load left.
%%% LEA (top_left*2+right+bias).
+    add             g2, g2                  ; Top left * 2.
%%% Period.
+    add             g3, g4                  ; A0 + C0
%%% Period.
+    add             g2, g3                  ; A0 + Top left * 2 + C0
+    add             g2, 2                   ; Add rounding bias.
+    shr             g2, 2                   ; Round.
%% top-left.
+    mov             [g0+160+128], g2b       ; Save filtered top left.
+
%%% Use RET.
+    jmp .END
+
+    .LEFT_NOT_AVAILABLE:
+
+    ; Test if top is available.
+    cmp             dword [g3], 0
+    jz              .NOTHING_AVAILABLE
+
%%% Those double negs should be avoided.
+    neg             g2
+    vmovdqu         xmm1, [g1+g2]           ; Load top value
+    neg             g2
%%% vpbroadcastb does the job.
+    vpxor           xmm2, xmm2              ;
+    vpshufb         xmm0, xmm1, xmm2        ; Broadcast the first byte as the left value.
+    vmovdqa         xmm2, xmm1              ; Set top left.
+    jmp             .LEFT_AND_TOP_FETCHED
+
+    .TOP_NOT_AVAILABLE:
+    vpbroadcastd    xmm2, [pat_15b]
+    vpshufb         xmm1, xmm0, xmm2        ; Replicate C0 as the top neighbours.
+    vmovdqa         xmm2, xmm1              ; Set top left.
+    jmp             .LEFT_AND_TOP_FETCHED
+
+
+    .BOTTOM_AVAILABLE:
+    ; Get D from the pred buffer.
%%% shuffle.
%%% Apply fixes as above. Don't recompute variables that you already set up before.
+    vpbroadcastd    x8, [neigh_last_b_of_d] ; Load suffle mask.
+
+    ; Init macro register.
+    mov             g6, g2                  ; Copy pred_stride.
+    mov             g7, g2
+    mov             g8, g2                  ;
+    shl             g6, 3                   ; Start at the 8th row.
+    shl             g7, 3
+    add             g7, g2                  ; Add the odd row offset.
%%% Increment.
+    add             g8, g2                  ; Incremetn 2 row at the time.
+
+    load            x6, x5                  ; Load D0 and D1. Result 9 8 9 8.
+    load            x3, x5                  ; Load D2 and D3. Result b a b a.
+    vpblendd        x6, x6, x3, 0b0011_0011 ; Mix 9 8 and b a. Result 3 2 1 0.
+    vpshufb         x6, x8                  ; Keep the last byte of each word.
+
+    load            x3, x5                  ; Load D4 and D5. Result d c d c.
+    load            x4, x5                  ; Load D6 and D7. Result f e f e.
+    vpblendd        x3, x3, x4, 0b0011_0011 ; Mix d c and f e. Result f e d c.
+    vpshufb         x3, x8                  ;
+
+    vpblendw        x6, x6, x3, 0b0011_0011 ; [f e d c b a 9 8 f e d c b a 9 8].
+
+    ; Merge C and D.
+    vpblendd        x0, x0, x6, 0b0011_0011 ; [f e d c b a 9 8 7 6 5 4 3 2 1 0].
+
+    jmp             .BOTTOM_FETCHED
+
+
+    .NOTHING_AVAILABLE:
+    vpbroadcastd    ymm0, [pat_128b]        ; Store 128 everywhere.
+    vmovdqa         xmm1, xmm0              ;
+    vmovdqa         xmm2, xmm0              ;
+
+    vmovdqu         [g0+48], ymm0           ; Save it.
+    vmovd           [g0+128], xmm0          ;
%%% Double plus.
+    vmovdqu         [g0++160+48], ymm0      ; Save the filtered version.
+    vmovd           [g0+160+128], xmm0      ;
+
+    .END:
%%% Bad argument count.
+    %unmacro load 1
+    RET
%%% Add your newline.
\ No newline at end of file

Re: [f265 dev team] Intra prediction 8x8 asembly

Reply via email to