Hi,

here is the new version of the patch.

Daniel
On 09/05/2014 11:22 AM, Laurent Birtz wrote:
Hi,

here's the commented review.

Regards,
Laurent

diff --git a/f265/asm.c b/f265/asm.c
index 014d3ea..ba663f8 100644
--- a/f265/asm.c
+++ b/f265/asm.c
@@ -198,8 +198,19 @@ void f265_lbd_scale_qpel_48_avx2(int16_t *dst, int dst_stride, uint8_t *src, int
 void f265_hbd_scale_qpel_c(int16_t *dst, int dst_stride, int16_t *src, int src_stride, int packed_dims);
 
 void f265_lbd_predict_intra_planar_c(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_planar_4_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
 void f265_lbd_predict_intra_dc_c(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_dc_4_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
 void f265_lbd_predict_intra_angular_c(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_dia_bot_left_4_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_hor_bot_4_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_hor_4_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_hor_top_4_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_dia_top_left_4_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_ver_left_4_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_ver_4_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_ver_right_4_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_dia_top_right_4_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
 void f265_lbd_predict_intra_planar_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
 void f265_lbd_predict_intra_dc_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
 void f265_lbd_predict_intra_dia_bot_left_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
@@ -1073,6 +1084,17 @@ static void f265_link_asm(int avx2_flag)
         f265_lbd_scale_qpel[7] = f265_lbd_scale_qpel_12_avx2;
         f265_lbd_scale_qpel[8] = f265_lbd_scale_qpel_24_avx2;
         f265_lbd_scale_qpel[9] = f265_lbd_scale_qpel_48_avx2;
+        f265_lbd_predict_intra[0] = f265_lbd_predict_intra_planar_4_avx2;
+        f265_lbd_predict_intra[1] = f265_lbd_predict_intra_dc_4_avx2;
+        f265_lbd_predict_intra[2] = f265_lbd_predict_intra_dia_bot_left_4_avx2;
+        f265_lbd_predict_intra[3] = f265_lbd_predict_intra_hor_bot_4_avx2;
+        f265_lbd_predict_intra[4] = f265_lbd_predict_intra_hor_4_avx2;
+        f265_lbd_predict_intra[5] = f265_lbd_predict_intra_hor_top_4_avx2;
+        f265_lbd_predict_intra[6] = f265_lbd_predict_intra_dia_top_left_4_avx2;
+        f265_lbd_predict_intra[7] = f265_lbd_predict_intra_ver_left_4_avx2;
+        f265_lbd_predict_intra[8] = f265_lbd_predict_intra_ver_4_avx2;
+        f265_lbd_predict_intra[9] = f265_lbd_predict_intra_ver_right_4_avx2;
+        f265_lbd_predict_intra[10] = f265_lbd_predict_intra_dia_top_right_4_avx2;
         f265_lbd_predict_intra[11] = f265_lbd_predict_intra_planar_8_avx2;
         f265_lbd_predict_intra[12] = f265_lbd_predict_intra_dc_8_avx2;
         f265_lbd_predict_intra[13] = f265_lbd_predict_intra_dia_bot_left_8_avx2;
diff --git a/f265/asm/avx2/intra.asm b/f265/asm/avx2/intra.asm
index 9262a2a..d87a35d 100644
--- a/f265/asm/avx2/intra.asm
+++ b/f265/asm/avx2/intra.asm
@@ -21,6 +21,14 @@ neigh_1_of_2:       db  0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15,
 neigh_shift_pair:   db  14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,
                     db  2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,
 
+intra_ox_4:         dw  0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3, ; Effect of row.
+intra_inv_ox_4:     dw  3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0, ; Inverted effect.
+intra_ver_4:        dw  1,1,1,1, 2,2,2,2, 3,3,3,3, 4,4,4,4, ; Vertical angle multipliers.
+intra_hor_4:        db  3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0, ; Horizontal angle multipliers.
+                    db  1,1,1,1, 0,0,0,0, 3,3,3,3, 2,2,2,2
+intra_p_hor_4:      db  3,1,3,1, 3,1,3,1, 2,2,2,2, 2,2,2,2, ; Horizontal angle multipliers for planar.
+                    db  1,3,1,3, 1,3,1,3, 0,4,0,4, 0,4,0,4,
+
 
 align 16
 
@@ -28,14 +36,14 @@ align 16
 ang_hor_8:          db  3, 3, 3, 3, 3, 3, 3, 3,  2, 2, 2, 2, 2, 2, 2, 2,
                     db  1, 1, 1, 1, 1, 1, 1, 1,  0, 0, 0, 0, 0, 0, 0, 0,
 
-; Pshufb pattern to generate neighbour pairs.
+; Repeat values on a whole 8x8 row. Inversed for use in pure horizontal.
 pair_low:           db  0,1, 1,2, 2,3, 3,4, 4,5, 5,6, 6,7, 7,8,
 pair_high:          db  7,8, 8,9, 9,10, 10,11,  11,12, 12,13, 13,14, 14,15
 
 angle_mul_hor:      dw  1, 2, 3, 4, 5, 6, 7, 8,     ; Row index. Used to get the weight and offset of each row on
                                                     ; horizontal angles.
-angle_inv_mul_hor:  dw  0, 1, 2, 3, 4, 5, 6, 7,     ; Multiplier for inv_angle_8 on horizontal angles.
-angle_inv_mul_ver:  dw  7, 6, 5, 4, 3, 2, 1, 0,     ; Multiplier for inv_angle_8 on vertical angles.
+angle_inv_mul_hor:  dw  0, 1, 2, 3, 4, 5, 6, 7,     ; Multiplier for inv_angle on horizontal angles.
+angle_inv_mul_ver:  dw  7, 6, 5, 4, 3, 2, 1, 0,     ; Multiplier for inv_angle on vertical angles.
 
 dia_bot_left_8:     db  14, 13, 12, 11, 10, 9, 8, 7 ; Invert byte order.
                     db  6, 5, 4, 3, 2, 1, 0, 15
@@ -48,17 +56,22 @@ neig_bl_unav_8:     db  0,0,0,0,0,0,0,0, 0, 1, 2, 3, 4, 5, 6, 7
 
 pat_b_m1_to_14:     db  -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
 
+intra_dia_bl_4:     db  6,5,4,3, 5,4,3,2, 4,3,2,1, 3,2,1,0  ; Dia bottom left re-ordering pattern.
+intra_dia_tl_4:     db  0,8,9,10, 7,0,8,9, 6,7,0,8, 5,6,7,0 ; Dia top-left re-ordering pattern.
+intra_dia_tr_4:     db  1,2,3,4, 2,3,4,5, 3,4,5,6, 4,5,6,7, ; Dia top right re-ordering pattern.
+
+
 align 4
 
 ; Seed on which the neighbours offset of inversed angles are calculated.
 ; As words (repeated 4 times) for speed-ups.
-inv_angle_8:        db  16, 16, 16, 16
-                    db  19, 19, 19, 19
-                    db  24, 24, 24, 24
-                    db  30, 30, 30, 30
-                    db  39, 39, 39, 39
-                    db  57, 57, 57, 57
-                    db  102, 102, 102, 102
+inv_angle:          dw  256, 256
+                    dw  315, 315
+                    dw  390, 390
+                    dw  482, 482
+                    dw  630, 630
+                    dw  910, 910
+                    dw  1638, 1638
 
 ; Seed on which the angles weights and offsets are calculated.
 ; As words (repeated 4 times) for speed-ups.
@@ -72,16 +85,21 @@ intra_angle:        db  2, 2, 2, 2,
 
 ; Pattern used as mask, bias, offset, ...
 ; As double to use the more efficient vpbroadcastd.
+intra_p_ver_4:      db  3,1, 2,2, 1,3, 0,4,
+intra_tl_4:         db  3,4, 4,5, 5,6, 6,7
 neigh_last_b_of_d:  db  3, 7, 11, 15,
 pat_q_255:          dq  0xff
 pat_w_8192:         dw  8192, 8192,
+pat_w_4096:         dw  4096, 4096,
 pat_w_2048:         dw  2048, 2048,
 pat_w_1024:         dw  1024, 1024,
+pat_w_128:          dw  128, 128
 pat_w_32:           dw  32, 32,
 pat_w_31:           dw  31, 31,
 pat_w_8:            dw  8, 8,
 pat_b_14_15:        db  14,15, 14,15,
 pat_b_7_8:          db  7,8, 7,8,
+pat_b_4_5:          db  4,5, 4,5,
 pat_b_0_1:          db  0,1, 0,1,
 pat_b_128:          db  128, 128, 128, 128
 pat_b_15:           db  15, 15, 15, 15,
@@ -149,17 +167,17 @@ DEFFUN f265_lbd_predict_intra_dc_8_avx2, ia=4, at=8844, ti=0, tv=6, ym=1
     ; - Top =  3*base + top.
     ; - Left = 3*base + left.
 
-    movd            g2d, x1                 ; Extract base.
+    vmovd           g2d, x1                 ; Extract base.
     and             g2, 0xff
 
     lea             g3, [3*g2+2]            ; Base * 3 + rounding bias.
-    movd            x3, g3d
+    vmovd           x3, g3d
     vpbroadcastw    y3, x3                  ; Broadcast base * 3 + rounding bias.
 
     movzx           g3, byte [g1+64]        ; Load the first top and left value.
     movzx           ga, byte [g1+63]
 
-    vpaddw          y2, y3                  ; 3 * Base + neighbours + rounding bias.
+    vpaddw          y2, y3                  ; 3 * base + neighbours + rounding bias.
     vpsrlw          y2, 2                   ; Divide by 4.
 
     vpackuswb       y2, y2                  ; Word to byte.
@@ -431,7 +449,7 @@ DEFFUN f265_lbd_predict_intra_hor_8_avx2, ia=4, at=8844, ti=0, tv=4, ym=1
 
     vpmovzxbw       x3, x3                  ; Byte to word.
 
-    vpsubw          x0, x3                  ; top - top-left.
+    vpsubw          x0, x3                  ; Top - top-left.
     vpsraw          x0, 1                   ; (top - top-left)/2.
 
     vmovd           x3, [g1+63]             ; Load left.
@@ -462,14 +480,14 @@ DEFFUN f265_lbd_predict_intra_hor_top_8_avx2, ia=4, at=8844, ti=0, tv=7, ym=1
     vpinsrb         x5, [g1+128], 0         ; Insert the top-left neighbour.
 
     ; Import top neighbour with the left ones.
-    lea             g3, [inv_angle_8]
+    lea             g3, [inv_angle]
     vpbroadcastd    y4, [g3+g2*4+18*4]      ; Load the inversed angle values.
     vmovdqu         x3, [angle_inv_mul_hor] ; Load the weight values.
-    vpmaddubsw      y4, y4, y3              ; Get the weight. Some neighbour will have an invalid offset.
+    vpmullw         y4, y4, y3              ; Get the weight. Some neighbour will have an invalid offset.
                                             ; Since we never read them, it's ok.
-    vpbroadcastd    y3, [pat_w_8]           ; Load inversed angle bias.
+    vpbroadcastd    y3, [pat_w_128]         ; Load inversed angle bias.
     vpaddw          y4, y3                  ; Add inversed angle bias.
-    vpsraw          y4, 4                   ; Get inversed neighbour offset.
+    vpsraw          y4, 8                   ; Get inversed neighbour offset.
     vpackuswb       y4, y4                  ; Word to byte.
     vpshufb         y5, y4                  ; Re-order left neighbours.
 
@@ -486,7 +504,7 @@ DEFFUN f265_lbd_predict_intra_hor_top_8_avx2, ia=4, at=8844, ti=0, tv=7, ym=1
 %unmacro DO_ROW 2
 
 
-; Intra pure diagonal top left 8x8.
+; Intra pure diagonal top-left 8x8.
 DEFFUN f265_lbd_predict_intra_dia_top_left_8_avx2, ia=4, at=8844, ti=0, tv=3, ym=1
     vmovq           x0, [g1+64-7]           ; Load top row.
     vmovhps         x0, [g1+64]             ; Load left row.
@@ -610,7 +628,7 @@ DEFFUN f265_lbd_predict_intra_dia_top_left_8_avx2, ia=4, at=8844, ti=0, tv=3, ym
     DO_ROW          2, 4, %1                ; Do row 1 and 3.
 
     vpackuswb       y3, y4                  ; Merge value.
-    vmovdqu         [g0+0x00], y3           ; Save result.
+    vmovdqu         [g0+0x00], y3           ; Save the result.
 
     vpaddb          y7, y8                  ; Skip from rows 1|3 to rows 4|6.
     vpaddb          y7, y8
@@ -619,7 +637,7 @@ DEFFUN f265_lbd_predict_intra_dia_top_left_8_avx2, ia=4, at=8844, ti=0, tv=3, ym
     DO_ROW          6, 4, %1                ; Do row 5 and 7.
 
     vpackuswb       y3, y4                  ; Merge value.
-    vmovdqu         [g0+0x20], y3           ; Save result.
+    vmovdqu         [g0+0x20], y3           ; Save the result.
 %endmacro
 
 
@@ -629,17 +647,18 @@ DEFFUN f265_lbd_predict_intra_ver_left_8_avx2, ia=4, at=8844, ti=0, tv=11, ym=1
     vpinsrb         x0, [g1+128], 8         ; Load top-left.
 
     ; Re-order the left neighbours.
-    lea             g3, [inv_angle_8]
-    vpbroadcastd    y2, [g3+g2*4-18*4]      ; Load the inversed angle values.
+    lea             g3, [inv_angle]
+    vpbroadcastd    x2, [g3+g2*4-18*4]      ; Load the inversed angle values.
     vmovdqu         x3, [angle_inv_mul_ver] ; Load the inversed weight values.
-    vpmaddubsw      y2, y2, y3              ; Get the weight. Some neighbour will have an invalid offset.
+    vpmullw         x2, x2, x3              ; Get the weight. Some neighbour will have an invalid offset.
                                             ; Since we never use them, it's ok.
-    vpbroadcastd    y3, [pat_w_8]           ; Load inversed angle bias.
-    vpaddw          y2, y3                  ; Add inversed angle bias.
-    vpsraw          y2, 4                   ; Get inversed neighbour offset.
-    vpsubb          y2, y3, y2              ; Invert the index.
-    vpackuswb       y2, y2                  ; Word to byte.
-    vpshufb         y0, y2                  ; Re-order left neighbours.
+    vpbroadcastd    x3, [pat_w_128]         ; Load inversed angle bias.
+    vpaddw          x2, x3                  ; Add inversed angle bias.
+    vpsraw          x2, 8                   ; Get inversed neighbour offset.
+    vpbroadcastd    x3, [pat_w_8]           ; Load inversed angle offset.
+    vpsubb          x2, x3, x2              ; Invert the index.
+    vpackuswb       x2, x2                  ; Word to byte.
+    vpshufb         x0, x2                  ; Re-order left neighbours.
 
     ; Blend re-ordered neighbours with the top neighbours.
     vmovhps         x0, [g1+64]
@@ -745,7 +764,7 @@ DEFFUN f265_lbd_predict_intra_dia_top_right_8_avx2, ia=4, at=8844, ti=0, tv=3, y
 ; - g2: pred_stride.
 ; - g3: avail[2].
 ; - g4: filter_flag.
-; - g5: packed (Ignored).
+; - g5: packed (ignored).
 DEFFUN f265_lbd_extract_intra_neigh_8_avx2, ia=6, at=884844, ti=1, tv=8, ym=1
     ; Load availability.
     movzx           g5, byte [g3]           ; Load availx.
@@ -765,7 +784,7 @@ DEFFUN f265_lbd_extract_intra_neigh_8_avx2, ia=6, at=884844, ti=1, tv=8, ym=1
 
     ; Get 4 left neighbours.
     ; Input:
-    ; - %1: the xmm register in which to save the value,
+    ; - %1: the xmm register in which to save the value.
     ; - %2: temp.
     ; - %3: temp.
     ; - ga: first row address. Must be aligned on the dword left of the row.
@@ -784,7 +803,7 @@ DEFFUN f265_lbd_extract_intra_neigh_8_avx2, ia=6, at=884844, ti=1, tv=8, ym=1
     vpshufb         %1, x7                  ; Keep the last byte of each dword.
     %endmacro
 
-    vpbroadcastd    x7, [neigh_last_b_of_d] ; Load suffle mask.
+    vpbroadcastd    x7, [neigh_last_b_of_d] ; Load shuffle mask.
 
     lea             ga, [g1-4]
     lea             g3, [g2*3]
@@ -917,6 +936,7 @@ DEFFUN f265_lbd_extract_intra_neigh_8_avx2, ia=6, at=884844, ti=1, tv=8, ym=1
 
     lea             ga, [ga+g2*4]
     load            x4, x5, x6
+    %unmacro load 3
 
     vpblendd        x3, x3, x4, 0b0101
 
@@ -946,4 +966,618 @@ DEFFUN f265_lbd_extract_intra_neigh_8_avx2, ia=6, at=884844, ti=1, tv=8, ym=1
     vmovd           [g0+160+128], x0
     RET
 
-    %unmacro load 2
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Duplicate above functions for 4x4 block size.
+
+
+; Intra DC 4x4.
+DEFFUN f265_lbd_predict_intra_dc_4_avx2, ia=4, at=8844, ti=0, tv=4, ym=0
+    ; Logic:
+    ; Sum all direct neighbours.
+    ; Divide with bias by the number of samples.
+
+    vmovd           x2, [g1+60]             ; Load left neighbours.
+    vmovd           x3, [g1+64]             ; Load top neighbours.
+
+    vpmovzxbw       x2, x2                   ; Byte to word.
+    vpmovzxbw       x3, x3
+
+    ; Add top and left neighbours.
+    vpaddw          x0, x2, x3
+
+    ; Fold the partial sum in half.
+    vpalignr        x1, x0, x0, 4
+    vpaddw          x0, x0, x1
+
+    ; Get a single sum.
+    vphaddw         x0, x0, x0
+
+    ; Round.
+    vmovd           x1, [pat_w_4096]
+    vpmulhrsw       x0, x0, x1              ; Round.
+    vpbroadcastb    x0, x0
+
+    ; Should it be filtered?
+    test            g3, 1
+    jz              .SKIP_FILTER
+
+    ; 3 cases:
+    ; - Top-left = 2*base + top + left.
+    ; - Top =  3*base + top.
+    ; - Left = 3*base + left.
+
+    ; Blend top and left neighbours.
+    vpalignr        x3, x3, x3, 8
+    vpblendd        x2, x3, x2, 0b0011
+
+    movd            g2d, x0                 ; Extract base.
+    and             g2, 0xff
+
+    lea             g3, [3*g2+2]            ; Base * 3 + rounding bias.
+    movd            x1, g3d
+    vpbroadcastw    x1, x1                  ; Broadcast base * 3 + rounding bias.
+
+    vpaddw          x2, x1                  ; 3 * base + neighbours + rounding bias.
+    vpsrlw          x2, 2                   ; Divide by 4.
+    vpackuswb       x2, x2                  ; Word to byte.
+
+    vpalignr        x1, x2, 4
+    vpblendd        x0, x1, x0, 0xfe        ; Blend in the top row.
+
+    vpshufb         x2, x2, [intra_hor_4]
+    vpbroadcastd    x1, [pat_q_255]
+    vpblendvb       x0, x0, x2, x1
+
+    ; Do top-left.
+    movzx           g3, byte [g1+64]        ; Load the first top and left value.
+    movzx           ga, byte [g1+63]
+    add             g3, ga                  ; Top + left.
+    lea             g2, [2*g2+g3+2]         ; Top + left + 2*base + bias.
+    shr             g2, 2                   ; Get the average.
+    vpinsrb         x0, g2b, 0              ; Insert the result.
+
+    .SKIP_FILTER:
+
+    vmovdqu         [g0], x0                ; Save the results.
+
+    RET
+
+
+; Intra planar 4x4.
+DEFFUN f265_lbd_predict_intra_planar_4_avx2, ia=4, at=8844, ti=0, tv=4, ym=1
+    ; Value[x,y] = ((3-x)*left + (3-y)*top + (x+1)*top_right + (y+1)*bottom_left + 4) >> 3.
+
+    ; Get top neighbour weighted values.
+    vpbroadcastd    y1, [g1+64]             ; Load & broadcast top neighbours.
+    vmovd           x2, [g1+59]             ; Load & broadcast bottom-left.
+    vpbroadcastb    y2, x2
+    vpunpcklbw      y1, y1, y2              ; Mix top with bottom-left neighbours.
+    vpmaddubsw      y1, y1, [intra_p_hor_4] ; Get the value of each column.
+
+    ; Get left neighbour weighted values.
+    vpbroadcastd    y3, [g1+60]             ; Load left neighbours.
+    vpshufb         y3, [intra_hor_4]       ; Re-order them.
+    vmovd           x2, [g1+68]             ; Load & broadcast top-right.
+    vpbroadcastb    y2, x2
+    vpunpcklbw      y3, y3, y2              ; Mix top right with left neighbours.
+    vpbroadcastq    y0, [intra_p_ver_4]     ; Load & broadcast weight.
+    vpmaddubsw      y3, y3, y0              ; Get the value of each row.
+
+    vpaddw          y1, y3                  ; Get the final sums.
+    vpbroadcastd    y0, [pat_w_4096]        ; Round the sums.
+    vpmulhrsw       y1, y1, y0
+    vpackuswb       y1, y1
+
+    vmovq           [g0], x1                ; Save the result.
+    vextracti128    x1, y1, 1
+    vmovq           [g0+8], x1
+    RET
+
+
+; Intra pure diagonal bottom-left 4x4.
+DEFFUN f265_lbd_predict_intra_dia_bot_left_4_avx2, ia=4, at=8844, ti=0, tv=1, ym=0
+    vmovq           x0, [g1+56]             ; Load left neighbours.
+    vpshufb         x0, [intra_dia_bl_4]    ; Re-order them.
+    vmovdqu         [g0], x0                ; Save the results.
+    RET
+
+
+; Calculate weight and offset.
+; Register usage:
+; - y0: angle values. Output as the offset.
+; - y1: weight (output).
+; - y2: tmp.
+%macro GEN_WEIGHT_OFF 2                     ; %1: 1 if vertical prediction, %2: 1 if we need to import neighbour
+                                            ; from the other side.
+    %assign IS_VERT %1
+    %assign IS_INVERSED %2
+
+    ; Generate weights.
+    vpbroadcastd    y2, [pat_w_31]          ; Load weight mask.
+    vpand           y1, y0, y2              ; Extract weight.
+    vpbroadcastd    y2, [pat_w_32]          ; Load weight sum.
+    vpsubw          y2, y2, y1              ; Get weight complement.
+
+    %if (IS_VERT && IS_INVERSED) || (!IS_VERT && !IS_INVERSED)
+    vpsllw          y2, y2, 8
+    vpor            y1, y1, y2
+    %elif (IS_VERT && !IS_INVERSED) || (!IS_VERT && IS_INVERSED)
+    vpsllw          y1, y1, 8
+    vpor            y1, y1, y2
+    %endif
+
+    ; Generate offset.
+    vpsrlw          y0, y0, 5               ; Get the offset (word).
+
+    %if !IS_VERT && IS_INVERSED
+    vpaddw          y0, [intra_inv_ox_4]    ; Add the effect of rows.
+    %elif !IS_VERT && !IS_INVERSED
+    vpaddw          y0, [intra_ox_4]        ; Add the effect of rows.
+    %elif IS_VERT && !IS_INVERSED
+    vpbroadcastq    y2, [angle_inv_mul_hor]
+    vpaddw          y0, y2                  ; Add the effect of columns.
+    %elif IS_VERT && IS_INVERSED
+    ; No processing in this case.
+    %endif
+
+    ; Double word value (once per byte).
+    vpsllw          y2, y0, 8
+    vpor            y0, y2, y0
+
+    %if !IS_VERT && !IS_INVERSED
+    vpbroadcastd    y2, [pat_b_14_15]       ; Generate offset pairs.
+    vpsubw          y0, y2, y0              ; Final offset.
+
+    %elif !IS_VERT && IS_INVERSED
+    vpbroadcastd    y2, [pat_b_4_5]         ; Generate offset pairs.
+    vpaddw          y0, y2, y0              ; Final offset.
+
+    %elif IS_VERT && !IS_INVERSED
+    vpbroadcastd    y2, [pat_b_0_1]         ; Generate offset pairs.
+    vpaddw          y0, y2, y0              ; Final offset.
+
+    %elif IS_VERT && IS_INVERSED
+    vpbroadcastq    y2, [intra_tl_4]        ; Generate offset pairs.
+    vpsubw          y0, y2, y0              ; Final offset.
+    %endif
+%endmacro
+
+
+; Calculate prediction and save it.
+; Register usage:
+; - y0: offset.
+; - y1: weight.
+; - y%1: neighbours.
+%macro DO_PRED 1
+    vpshufb         y%1, y0                 ; Position neighbours pairs.
+    vpmaddubsw      y%1, y1                 ; Multiply by weight.
+    vpbroadcastw    y1, [pat_w_1024]
+    vpmulhrsw       y%1, y1                 ; Round.
+
+    vpackuswb       y%1, y%1, y%1           ; Word to byte.
+    vmovq           [g0], x%1               ; Save the results.
+    vextracti128    x%1, y%1, 1
+    vmovq           [g0+8], x%1
+%endmacro
+
+
+; Calculate inverted neighbours offset.
+%macro INV_RESHUF 3                         ; %1: 1 if we are doing vertical prediction, %2: output register, %3: tmp.
+    %assign IS_VERT %1
+
+    ; Load inversed angle.
+    lea             g3, [inv_angle]
+    %if !IS_VERT
+    neg             g2
+    vpbroadcastd    %2, [g3+g2*4+18*4]
+    ;vpbroadcastq    %3, [angle_mul_hor]     ; Load offset multiplier.
+    %else
+    vpbroadcastd    %2, [g3+g2*4-18*4]      ; Load the inversed angle values.
+    vmovq           %3, [angle_inv_mul_ver+8] ; Load the inversed weight values.
+    %endif
+    vpmullw         %2, %2, %3              ; Get the weight. Some neighbour will have an invalid offset.
+
+    vpbroadcastd    %3, [pat_w_128]         ; Load offset rounding bias.
+    %if IS_VERT
+    vpaddw          %2, %3                  ; Add rounding.
+    %else
+    vpsubw          %2, %3                  ; Reuse an existing pattern that is "one off".
+    %endif                                  ; Subtract bias to compensate.
+    vpsraw          %2, 8                   ; Round.
+
+    %if IS_VERT
+    vpbroadcastd    %3, [pat_w_8]           ; Load inversed angle offset.
+    vpsubb          %2, %3, %2              ; Invert the index.
+    %endif
+
+    vpackuswb       %2, %2, %2              ; Word to byte.
+%endmacro
+
+
+; Intra angular horizontal bottom 4x4.
+DEFFUN f265_lbd_predict_intra_hor_bot_4_avx2, ia=4, at=8844, ti=0, tv=3, ym=1
+    ; Get angle values.
+    neg             g2
+    lea             g3, [intra_angle]
+    vpbroadcastd    y0, [g3+g2*4+9*4]
+
+    vpbroadcastq    y2, [angle_mul_hor]     ; Load weight mask.
+    vpmaddubsw      y0, y2                  ; Multiply to match lane offsets.
+
+    GEN_WEIGHT_OFF  0, 0
+
+    ; Calculate prediction.
+    vbroadcasti128  y2, [g1+48]             ; Load neighbours.
+
+    DO_PRED         2
+    RET
+
+
+; Intra angular horizontal top 4x4.
+DEFFUN f265_lbd_predict_intra_hor_top_4_avx2, ia=4, at=8844, ti=0, tv=4, ym=1
+    ; Get angle values.
+    lea             g3, [intra_angle]
+    vpbroadcastd    y0, [g3+g2*4-11*4]
+
+    vpbroadcastq    y3, [angle_mul_hor]     ; Load weight mask.
+    vpmaddubsw      y0, y3                  ; Multiply to match lane offsets.
+
+    GEN_WEIGHT_OFF  0, 1
+
+    ; Load inversed angle.
+    INV_RESHUF      0, x2, x3
+
+    vmovq           x3, [g1+64]             ; Load top data.
+    vpshufb         x2, x3, x2              ; Re-order top neighbours.
+    vpalignr        x3, x2, x2, 15
+
+    ; Re-order neighbours.
+    vmovq           x2, [g1+56]             ; Load left data.
+    vpblendd        x2, x3, x2, 0b0011      ; Blend left neighbours with top neighbours.
+    vpinsrb         x2, [g1+128], 8         ; Insert top-left neighbour.
+    vinserti128     y2, y2, x2, 1
+
+    DO_PRED         2
+    RET
+
+
+; Intra angular vertical left 4x4.
+DEFFUN f265_lbd_predict_intra_ver_left_4_avx2, ia=4, at=8844, ti=0, tv=4, ym=1
+
+    vmovq           x0, [g1+64-8]           ; Load left neighbours.
+    vpinsrb         x0, [g1+128], 8         ; Load & insert top-left neighbour.
+
+    ; Re-order the left neighbours.
+    INV_RESHUF      1, x2, x1
+
+    ; Blend re-ordered neighbours with the top neighbours.
+    vpshufb         x2, x0, x2              ; Re-order left neighbours.
+    vpbroadcastd    x3, [g1+64]             ; Load top neighbours with an offset in the register.
+    vpblendd        x3, x2, x3, 0b1110      ; Blend with re-ordered left neighbours.
+    vinserti128     y3, y3, x3, 1           ; Duplicate.
+
+    ; Get angle values.
+    neg             g2
+    lea             g3, [intra_angle]
+    vpbroadcastd    y0, [g3+g2*4+25*4]
+
+    vpmaddubsw      y0, [intra_ver_4]       ; Multiply to match lane offsets.
+
+    GEN_WEIGHT_OFF  1, 1
+    DO_PRED         3
+    RET
+
+
+; Intra angular vertical right 4x4.
+DEFFUN f265_lbd_predict_intra_ver_right_4_avx2, ia=4, at=8844, ti=0, tv=3, ym=1
+    ; Get angle values.
+    lea             g3, [intra_angle]
+    vpbroadcastd    y0, [g3+g2*4-27*4]
+
+    vpmaddubsw      y0, [intra_ver_4]       ; Multiply to match lane offsets.
+
+    GEN_WEIGHT_OFF  1, 0
+
+    ; Calculate prediction.
+    vbroadcasti128  y2, [g1+64]             ; Load neighbours.
+
+    DO_PRED         2
+    RET
+
+%unmacro GEN_WEIGHT_OFF 2
+%unmacro DO_PRED 1
+%unmacro INV_RESHUF 3
+
+
+; Intra pure horizontal 4x4.
+DEFFUN f265_lbd_predict_intra_hor_4_avx2, ia=4, at=8844, ti=0, tv=3, ym=0
+    vmovd           x0, [g1+60]
+    vpshufb         x0, [intra_hor_4]
+
+    and             g3, 1
+    jz              .SKIP_FILTER
+
+    vpmovzxbw       x1, [g1+64]             ; Load top neighbours.
+    vmovd           x2, [g1+128]            ; Load & broadcast top-left neighbour.
+    vpbroadcastb    x2, x2
+    vpmovzxbw       x2, x2                  ; Byte to word.
+    vpsubw          x1, x2                  ; Top - top-left.
+    vpsraw          x1, 1                   ; (top - top-left)/2.
+
+    vmovd           x2, [g1+63]             ; Load & broadcast topmost left neighbour.
+    vpbroadcastb    x2, x2
+    vpmovzxbw       x2, x2                  ; Byte to word.
+    vpaddw          x1, x2                  ; Left + (top - top-left)/2.
+
+    vpxor           x2, x2                  ; Replace negative values by 0.
+    vpmaxsw         x1, x2
+
+    vpackuswb       x1, x1                  ; Word to byte with unsigned saturation.
+
+    vpblendd        x0, x1, x0, 0xfe        ; Update the first row.
+
+    .SKIP_FILTER:
+    vmovdqu         [g0], x0                ; Save the results.
+    RET
+
+
+; Intra pure diagonal top-left 4x4.
+DEFFUN f265_lbd_predict_intra_dia_top_left_4_avx2, ia=4, at=8844, ti=0, tv=1, ym=0
+    vmovq           x0, [g1+56]             ; Load top row.
+    vmovhps         x0, [g1+64]             ; Load left row.
+    vpinsrb         x0, [g1+128], 0         ; Load & insert top-left.
+    vpshufb         x0, [intra_dia_tl_4]    ; Shuffle.
+    vmovdqu         [g0], x0                ; Save the results.
+    RET
+
+
+; Intra pure vertical 4x4.
+DEFFUN f265_lbd_predict_intra_ver_4_avx2, ia=4, at=8844, ti=0, tv=3, ym=0
+    vpbroadcastd    x0, [g1+64]             ; Copy the top neighbours 4 times.
+
+    and             g3, 1
+    jz              .SKIP_FILTER
+
+    vpmovzxbw       x1, [g1+60]             ; Load left column.
+    vmovd           x2, [g1+128]            ; Load & broadcast top-left.
+    vpbroadcastb    x2, x2
+    vpmovzxbw       x2, x2                  ; Byte to word.
+    vpsubw          x1, x2                  ; Left - top-left.
+    vpsraw          x1, 1                   ; (left - top-left)/2.
+
+    vmovd           x2, [g1+64]             ; Load top.
+    vpbroadcastb    x2, x2
+    vpmovzxbw       x2, x2                  ; Byte to word.
+    vpaddw          x1, x2                  ; Top + (left - top-left)/2.
+
+    vpxor           x2, x2                  ; Replace negative values by 0.
+    vpmaxsw         x1, x2
+
+    vpackuswb       x1, x1                  ; Word to byte with unsigned saturation.
+    vpshufb         x1, [intra_hor_4]       ; Re-order them.
+
+    vpbroadcastd    x2, [pat_q_255]
+    vpblendvb       x0, x0, x1, x2          ; Update the first byte of every row.
+
+    .SKIP_FILTER:
+    vmovdqu         [g0], x0                ; Save the results.
+    RET
+
+
+; Intra angular top right 4x4.
+DEFFUN f265_lbd_predict_intra_dia_top_right_4_avx2, ia=4, at=8844, ti=0, tv=1, ym=0
+    vmovq           x0, [g1+64]             ; Load all data.
+    vpshufb         x0, [intra_dia_tr_4]    ; Shuffle data.
+    vmovdqu         [g0], x0                ; Save it.
+    RET
+
+
+; Extract and filter neighbours for intra prediction.
+;
+; Input format:
+; EAB
+; C
+; D
+;
+; Output format:
+;   padding   [56] [64]  padding [128]
+; [ ...       DC   AB    ...     E]
+;
+; Input parameters:
+; - g0: nbuf[2][160].
+; - g1: pred.
+; - g2: pred_stride.
+; - g3: avail[2].
+; - g4: filter_flag.
+; - g5: packed (ignored).
+DEFFUN f265_lbd_extract_intra_neigh_4_avx2, ia=6, at=884844, ti=1, tv=5, ym=1
+    ; Extract the intra prediction neighbours of a 4x4 block (unfiltered, and
+    ; filtered when requested), emulating the neighbours that are missing.
+    ; NOTE(review): register roles inferred from the code below -- confirm
+    ; against the C prototype:
+    ; - g0: output neighbour buffer (unfiltered at +56/+64/+128, filtered at
+    ;       the same offsets plus 160).
+    ; - g1: address of the first row of the block.
+    ; - g2: pred_stride.
+    ; - g3: availability data (byte [g3] = availx, byte [g3+4] = availy).
+    ; - g4: non-zero if the filtered neighbours must also be computed.
+
+    ; Get 4 left neighbours.
+    ; Input:
+    ; - %1: the xmm register in which to save the value.
+    ; - %2: temp.
+    ; - %3: temp.
+    ; - ga: first row address. Must be aligned on the dword left of the row.
+    ; - g2: pred_stride.
+    ; - g3: 3*pred_stride.
+    %macro load     3
+    vpbroadcastd    %1, [ga]                ; Load & broadcast the left neighbour.
+    vpbroadcastd    %2, [ga+g2]             ; Load & broadcast the next left neighbour.
+    vpblendd        %1, %1, %2, 0b0101_0101 ; Mix even and odd row: result 1 0 1 0.
+
+    vpbroadcastd    %2, [ga+g2*2]           ; Load & broadcast the next left neighbour.
+    vpbroadcastd    %3, [ga+g3]             ; Load & broadcast the next left neighbour.
+    vpblendd        %2, %2, %3, 0b0101_0101 ; Mix even and odd row: result 3 2 3 2.
+
+    vpblendd        %1, %1, %2, 0b0011_0011 ; Mix 1 0 and 3 2. Result 3 2 1 0.
+    vpshufb         %1, x4                  ; Keep the last byte of each dword.
+    %endmacro
+
+
+    ; Load availability.
+    movzx           g5, byte [g3]           ; Load availx (number of available top neighbours).
+    movzx           g6, byte [g3+4]         ; Load availy (number of available left neighbours).
+
+    ; Test if left neighbours are available.
+    cmp             g6, 0
+    jz              .LEFT_NOT_AVAILABLE
+
+    ; Left neighbours are available.
+    vpbroadcastd    x4, [neigh_last_b_of_d] ; Load shuffle mask.
+
+    lea             ga, [g1-4]              ; Align to load the left neighbours as a double.
+    lea             g3, [g2*3]              ; 3*pred_stride (the availability pointer in g3 is dead past this point).
+    load            x0, x1, x2              ; Load C0 to C3.
+
+    ; Test if bottom-left neighbours are available.
+    cmp             g6, 4
+    jg              .BOTTOM_AVAIL
+
+    ; Bottom-left neighbours are unavailable. Broadcast the bottommost left neighbour.
+    vpalignr        x1, x0, x0, 4           ; Rotate the bottommost available left neighbour into byte 0.
+    vpbroadcastb    x1, x1                  ; Broadcast it to every byte.
+    vpblendd        x0, x0, x1, 0b0001      ; Substitute it for the missing D row (low dword).
+
+    ; Left present and loaded. Bottom-left loaded or emulated.
+    .LEFT_LOADED:
+
+    ; Test if the top neighbours are present.
+    cmp             g5, 0
+    jz              .TOP_NOT_AVAILABLE
+
+    ; Load top and top-left neighbours.
+    mov             ga, g2
+    neg             ga                      ; ga = -pred_stride (row above the block).
+    vmovhps         x0, [g1+ga]             ; Load top neighbours. Also eagerly load top-left neighbour.
+    vmovd           x2, [g1+ga-1]           ; Load top-left.
+
+    .TOP_LOADED:
+
+    ; Top-right neighbours might be missing.
+    vpalignr        x1, x0, x0, 11          ; Broadcast rightmost top neighbours.
+    vpbroadcastb    x1, x1
+
+    sub             g5, 1                   ; Remove 1 from the number of available top neighbours (availx).
+    movd            x3, g5d                 ; Required because vpcmpgtb is signed and 128 is a possible value.
+    vpbroadcastb    x3, x3                  ; Broadcast availx - 1.
+    vmovdqu         x4, [neig_bl_unav_8]    ; Neighbour i is present if availx - 1 >= i. NOTE(review): table name says 'bl' (bottom-left) but it is used here to mask missing top-right bytes -- confirm the reuse is intended.
+    vpcmpgtb        x4,  x3                 ; Mask set where the neighbour index exceeds availx - 1 (neighbour missing).
+    vpblendvb       x0, x0, x1, x4          ; Blend broadcasted neighbours if required.
+
+    .LEFT_AND_TOP_LOADED:
+
+    ; Save unfiltered neighbours.
+    vmovq           [g0+56], x0             ; Left and bottom-left (low qword).
+    vmovhps         [g0+64], x0             ; Top and top-right (high qword).
+    vmovd           [g0+128], x2            ; Top-left.
+
+    ; Test if filtering is required.
+    cmp             g4, 0
+    je              .SKIP_FILTER
+
+    ; Filter.
+    ; Pseudo code:
+    ; Register ordering : D3, D2, D1, D0, C3, ..., C0, E, A0, ..., A3, B0, ... B3.
+    ; V*[i] = (V[i-1] + 2*V[i] + V[i+1] + 2) >> 2.
+    ; D*3 = D3, B*3 = B3.
+
+    vpalignr        x3, x0, x0, 7           ; Offset top neighbours, keeping space for top-left.
+
+    vmovd           g4d, x2                 ; Push top-left to register.
+    vmovd           g3d, x0                 ; Keep a copy of D3.
+    vpextrb         g2d, x0, 15             ; Keep a copy of B3.
+
+    vpextrb         g1d, x0, 8              ; Extract A0.
+    vpinsrb         x0, g1d, 9              ; Insert the A0 that will filter E.
+    vpinsrb         x0, g4d, 8              ; Insert E.
+    vpinsrb         x3, g4d, 0              ; Insert the E that will filter A0.
+    vinserti128     y0, y0, x3, 1           ; Merge top row with left row.
+
+    vpbroadcastd    y4, [pat_b_1]
+    vpmaddubsw      y1, y0, y4              ; Add pairs of neighbours. Generate odd-even pairs (C3+C2, ...).
+    vpalignr        y0, y0, 1               ; Offset pairs by one byte.
+    vpmaddubsw      y0, y0, y4              ; Add pairs of neighbours. Generate even-odd pairs (C2+C1, ...).
+    vpaddw          y2, y0, y1              ; Add both pairs together. Generate half of the sum (C3+2*C2+C1).
+
+    vpalignr        y3, y0, y0, 14          ; Offset low pairs.
+    vpblendd        y0, y0, y3, 0x0f        ; Keep high lane intact.
+
+    vpalignr        y3, y1, y1, 2           ; Offset high pairs.
+    vpblendd        y1, y1, y3, 0xf0        ; Keep low lane intact.
+
+    vpaddw          y0, y0, y1              ; Generate remaining values (C2+2*C1+C0).
+
+    ; Round: vpmulhrsw by 8192 computes (x*2^14 + 2^15) >> 16 = (x + 2) >> 2.
+    vpbroadcastd    y4, [pat_w_8192]
+    vpmulhrsw       y0, y0, y4
+    vpmulhrsw       y2, y2, y4
+
+    ; Pack to byte.
+    vpackuswb       y0, y0, y0
+    vpackuswb       y2, y2, y2
+
+    ; Get filtered left neighbours.
+    vpunpcklbw      y3, y0, y2              ; Intermix result.
+    vpinsrb         x3, g3d, 0              ; Insert not-filtered D3.
+    vmovq           [g0+160+56], x3         ; Save filtered left neighbours.
+    vmovhps         [g0+160+128], x3        ; Save filtered top-left. NOTE(review): stores 8 bytes where the unfiltered path stores 4 (vmovd) -- confirm the buffer layout allows it.
+
+    ; Get filtered top neighbours.
+    vpunpcklbw      y3, y2, y0              ; Intermix result.
+    vextracti128    x3, y3, 1               ; Extract high lane.
+    vpinsrb         x3, g2d, 7              ; Insert not-filtered B3.
+    vmovq           [g0+160+64], x3         ; Save filtered top neighbours.
+
+    .SKIP_FILTER:
+    RET
+
+
+    .TOP_NOT_AVAILABLE:
+    ; Left neighbours are present and loaded. Emulate the top row from the topmost left neighbour.
+    vpalignr        x2, x0, x0, 7           ; C0 as E.
+    vpbroadcastb    x2, x2                  ; Broadcast C0.
+    vpblendd        x0, x0, x2, 0b1100      ; Blend broadcasted value as top neighbours.
+    jmp             .LEFT_AND_TOP_LOADED
+
+
+    .BOTTOM_AVAIL:
+    ; Left neighbours are loaded.
+    lea             ga, [ga+g2*4]           ; Advance 4 rows down to the bottom-left (D) rows.
+    load            x1, x2, x3              ; Load D0 to D3.
+    vpblendd        x0, x0, x1, 0b0001      ; Blend bottom with left.
+    jmp             .LEFT_LOADED
+
+
+    .LEFT_NOT_AVAILABLE:
+    ; Left is not available.
+
+    ; Check if the top is available.
+    cmp             g5, 0
+    je              .NOTHING_AVAILABLE
+
+    ; Top is available. Load and broadcast on left.
+    mov             ga, g2
+    neg             ga                      ; ga = -pred_stride (row above the block).
+    vmovq           x2, [g1+ga]             ; Load top neighbours. Preemptively load top-right.
+    vpalignr        x0, x2, x2, 8           ; Place it within the register to the top neighbour offset.
+    vpbroadcastb    x2, x2                  ; Broadcast A0 (the leftmost top neighbour) to emulate the left column.
+    vpblendd        x0, x0, x2, 0b0011      ; Blend left with top.
+    jmp             .TOP_LOADED
+
+
+    .NOTHING_AVAILABLE:
+    ; No neighbours present. Use the default value.
+    vpbroadcastd    x0, [pat_b_128]         ; Broadcast default value (half-range for 8-bit samples).
+
+    vmovq           [g0+56], x0             ; Save unfiltered.
+    vmovq           [g0+64], x0
+    vmovq           [g0+128], x0
+
+    vmovq           [g0+160+56], x0         ; Save filtered.
+    vmovq           [g0+160+64], x0
+    vmovq           [g0+160+128], x0
+
+    %unmacro load 3
+    RET
+
diff --git a/snippets/asm.py b/snippets/asm.py
index 0a7003d..d638575 100755
--- a/snippets/asm.py
+++ b/snippets/asm.py
@@ -149,7 +149,7 @@ def declare_all():
             luma_qpel_indices_avx2.append("X" if index == "X" else "%s_%s" % (index, frac))
 
     intra_pred_indices_seed = ["4", "8", "16", "32"]
-    intra_pred_indices_avx2_seed = ["X", "8", "X", "X"]
+    intra_pred_indices_avx2_seed = ["4", "8", "X", "X"]
     intra_pred_indices = []
     intra_pred_indices_avx2 = []
     for index in intra_pred_indices_seed:

Reply via email to