Hi,
here is the new version of the patch.
Daniel
On 09/05/2014 11:22 AM, Laurent Birtz wrote:
Hi,
here's the commented review.
Regards,
Laurent
diff --git a/f265/asm.c b/f265/asm.c
index 014d3ea..ba663f8 100644
--- a/f265/asm.c
+++ b/f265/asm.c
@@ -198,8 +198,19 @@ void f265_lbd_scale_qpel_48_avx2(int16_t *dst, int dst_stride, uint8_t *src, int
void f265_hbd_scale_qpel_c(int16_t *dst, int dst_stride, int16_t *src, int src_stride, int packed_dims);
void f265_lbd_predict_intra_planar_c(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_planar_4_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
void f265_lbd_predict_intra_dc_c(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_dc_4_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
void f265_lbd_predict_intra_angular_c(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_dia_bot_left_4_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_hor_bot_4_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_hor_4_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_hor_top_4_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_dia_top_left_4_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_ver_left_4_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_ver_4_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_ver_right_4_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_dia_top_right_4_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
void f265_lbd_predict_intra_planar_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
void f265_lbd_predict_intra_dc_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
void f265_lbd_predict_intra_dia_bot_left_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
@@ -1073,6 +1084,17 @@ static void f265_link_asm(int avx2_flag)
f265_lbd_scale_qpel[7] = f265_lbd_scale_qpel_12_avx2;
f265_lbd_scale_qpel[8] = f265_lbd_scale_qpel_24_avx2;
f265_lbd_scale_qpel[9] = f265_lbd_scale_qpel_48_avx2;
+ f265_lbd_predict_intra[0] = f265_lbd_predict_intra_planar_4_avx2;
+ f265_lbd_predict_intra[1] = f265_lbd_predict_intra_dc_4_avx2;
+ f265_lbd_predict_intra[2] = f265_lbd_predict_intra_dia_bot_left_4_avx2;
+ f265_lbd_predict_intra[3] = f265_lbd_predict_intra_hor_bot_4_avx2;
+ f265_lbd_predict_intra[4] = f265_lbd_predict_intra_hor_4_avx2;
+ f265_lbd_predict_intra[5] = f265_lbd_predict_intra_hor_top_4_avx2;
+ f265_lbd_predict_intra[6] = f265_lbd_predict_intra_dia_top_left_4_avx2;
+ f265_lbd_predict_intra[7] = f265_lbd_predict_intra_ver_left_4_avx2;
+ f265_lbd_predict_intra[8] = f265_lbd_predict_intra_ver_4_avx2;
+ f265_lbd_predict_intra[9] = f265_lbd_predict_intra_ver_right_4_avx2;
+ f265_lbd_predict_intra[10] = f265_lbd_predict_intra_dia_top_right_4_avx2;
f265_lbd_predict_intra[11] = f265_lbd_predict_intra_planar_8_avx2;
f265_lbd_predict_intra[12] = f265_lbd_predict_intra_dc_8_avx2;
f265_lbd_predict_intra[13] = f265_lbd_predict_intra_dia_bot_left_8_avx2;
diff --git a/f265/asm/avx2/intra.asm b/f265/asm/avx2/intra.asm
index 9262a2a..d87a35d 100644
--- a/f265/asm/avx2/intra.asm
+++ b/f265/asm/avx2/intra.asm
@@ -21,6 +21,14 @@ neigh_1_of_2: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15,
neigh_shift_pair: db 14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,
db 2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,
+intra_ox_4: dw 0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3, ; Effect of row.
+intra_inv_ox_4: dw 3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0, ; Inverted effect.
+intra_ver_4: dw 1,1,1,1, 2,2,2,2, 3,3,3,3, 4,4,4,4, ; Vertical angle multipliers.
+intra_hor_4: db 3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0, ; Horizontal angle multipliers.
+ db 1,1,1,1, 0,0,0,0, 3,3,3,3, 2,2,2,2
+intra_p_hor_4: db 3,1,3,1, 3,1,3,1, 2,2,2,2, 2,2,2,2, ; Horizontal angle multipliers for planar.
+ db 1,3,1,3, 1,3,1,3, 0,4,0,4, 0,4,0,4,
+
align 16
@@ -28,14 +36,14 @@ align 16
ang_hor_8: db 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2,
db 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
-; Pshufb pattern to generate neighbour pairs.
+; Repeat values on a whole 8x8 row. Inversed for use in pure horizontal.
pair_low: db 0,1, 1,2, 2,3, 3,4, 4,5, 5,6, 6,7, 7,8,
pair_high: db 7,8, 8,9, 9,10, 10,11, 11,12, 12,13, 13,14, 14,15
angle_mul_hor: dw 1, 2, 3, 4, 5, 6, 7, 8, ; Row index. Used to get the weight and offset of each row on
; horizontal angles.
-angle_inv_mul_hor: dw 0, 1, 2, 3, 4, 5, 6, 7, ; Multiplier for inv_angle_8 on horizontal angles.
-angle_inv_mul_ver: dw 7, 6, 5, 4, 3, 2, 1, 0, ; Multiplier for inv_angle_8 on vertical angles.
+angle_inv_mul_hor: dw 0, 1, 2, 3, 4, 5, 6, 7, ; Multiplier for inv_angle on horizontal angles.
+angle_inv_mul_ver: dw 7, 6, 5, 4, 3, 2, 1, 0, ; Multiplier for inv_angle on vertical angles.
dia_bot_left_8: db 14, 13, 12, 11, 10, 9, 8, 7 ; Invert byte order.
db 6, 5, 4, 3, 2, 1, 0, 15
@@ -48,17 +56,22 @@ neig_bl_unav_8: db 0,0,0,0,0,0,0,0, 0, 1, 2, 3, 4, 5, 6, 7
pat_b_m1_to_14: db -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
+intra_dia_bl_4: db 6,5,4,3, 5,4,3,2, 4,3,2,1, 3,2,1,0 ; Dia bottom left re-ordering pattern.
+intra_dia_tl_4: db 0,8,9,10, 7,0,8,9, 6,7,0,8, 5,6,7,0 ; Dia top-left re-ordering pattern.
+intra_dia_tr_4: db 1,2,3,4, 2,3,4,5, 3,4,5,6, 4,5,6,7, ; Dia top right re-ordering pattern.
+
+
align 4
; Seed on which the the neighbours offset of inversed angles are calculated.
; As words (repeated 4 times) for speed-ups.
-inv_angle_8: db 16, 16, 16, 16
- db 19, 19, 19, 19
- db 24, 24, 24, 24
- db 30, 30, 30, 30
- db 39, 39, 39, 39
- db 57, 57, 57, 57
- db 102, 102, 102, 102
+inv_angle: dw 256, 256
+ dw 315, 315
+ dw 390, 390
+ dw 482, 482
+ dw 630, 630
+ dw 910, 910
+ dw 1638, 1638
; Seed on which the angles weights and offsets are calculated.
; As words (repeated 4 times) for speed-ups.
@@ -72,16 +85,21 @@ intra_angle: db 2, 2, 2, 2,
; Pattern used as mask, bias, offset, ...
; As double to use the more efficient vpbroadcastd.
+intra_p_ver_4: db 3,1, 2,2, 1,3, 0,4,
+intra_tl_4: db 3,4, 4,5, 5,6, 6,7
neigh_last_b_of_d: db 3, 7, 11, 15,
pat_q_255: dq 0xff
pat_w_8192: dw 8192, 8192,
+pat_w_4096: dw 4096, 4096,
pat_w_2048: dw 2048, 2048,
pat_w_1024: dw 1024, 1024,
+pat_w_128: dw 128, 128
pat_w_32: dw 32, 32,
pat_w_31: dw 31, 31,
pat_w_8: dw 8, 8,
pat_b_14_15: db 14,15, 14,15,
pat_b_7_8: db 7,8, 7,8,
+pat_b_4_5: db 4,5, 4,5,
pat_b_0_1: db 0,1, 0,1,
pat_b_128: db 128, 128, 128, 128
pat_b_15: db 15, 15, 15, 15,
@@ -149,17 +167,17 @@ DEFFUN f265_lbd_predict_intra_dc_8_avx2, ia=4, at=8844, ti=0, tv=6, ym=1
; - Top = 3*base + top.
; - Left = 3*base + left.
- movd g2d, x1 ; Extract base.
+ vmovd g2d, x1 ; Extract base.
and g2, 0xff
lea g3, [3*g2+2] ; Base * 3 + rounding bias.
- movd x3, g3d
+ vmovd x3, g3d
vpbroadcastw y3, x3 ; Broadcast base * 3 + rounding bias.
movzx g3, byte [g1+64] ; Load the first top and left value.
movzx ga, byte [g1+63]
- vpaddw y2, y3 ; 3 * Base + neighbours + rounding bias.
+ vpaddw y2, y3 ; 3 * base + neighbours + rounding bias.
vpsrlw y2, 2 ; Divide by 4.
vpackuswb y2, y2 ; Word to byte.
@@ -431,7 +449,7 @@ DEFFUN f265_lbd_predict_intra_hor_8_avx2, ia=4, at=8844, ti=0, tv=4, ym=1
vpmovzxbw x3, x3 ; Byte to word.
- vpsubw x0, x3 ; top - top-left.
+ vpsubw x0, x3 ; Top - top-left.
vpsraw x0, 1 ; (top - top-left)/2.
vmovd x3, [g1+63] ; Load left.
@@ -462,14 +480,14 @@ DEFFUN f265_lbd_predict_intra_hor_top_8_avx2, ia=4, at=8844, ti=0, tv=7, ym=1
vpinsrb x5, [g1+128], 0 ; Insert the top-left neighbour.
; Import top neighbour with the left ones.
- lea g3, [inv_angle_8]
+ lea g3, [inv_angle]
vpbroadcastd y4, [g3+g2*4+18*4] ; Load the inversed angle values.
vmovdqu x3, [angle_inv_mul_hor] ; Load the weight values.
- vpmaddubsw y4, y4, y3 ; Get the weight. Some neighbour will have an invalid offset.
+ vpmullw y4, y4, y3 ; Get the weight. Some neighbour will have an invalid offset.
; Since we never read them, it's ok.
- vpbroadcastd y3, [pat_w_8] ; Load inversed angle bias.
+ vpbroadcastd y3, [pat_w_128] ; Load inversed angle bias.
vpaddw y4, y3 ; Add inversed angle bias.
- vpsraw y4, 4 ; Get inversed neighbour offset.
+ vpsraw y4, 8 ; Get inversed neighbour offset.
vpackuswb y4, y4 ; Word to byte.
vpshufb y5, y4 ; Re-order left neighbours.
@@ -486,7 +504,7 @@ DEFFUN f265_lbd_predict_intra_hor_top_8_avx2, ia=4, at=8844, ti=0, tv=7, ym=1
%unmacro DO_ROW 2
-; Intra pure diagonal top left 8x8.
+; Intra pure diagonal top-left 8x8.
DEFFUN f265_lbd_predict_intra_dia_top_left_8_avx2, ia=4, at=8844, ti=0, tv=3, ym=1
vmovq x0, [g1+64-7] ; Load top row.
vmovhps x0, [g1+64] ; Load left row.
@@ -610,7 +628,7 @@ DEFFUN f265_lbd_predict_intra_dia_top_left_8_avx2, ia=4, at=8844, ti=0, tv=3, ym
DO_ROW 2, 4, %1 ; Do row 1 and 3.
vpackuswb y3, y4 ; Merge value.
- vmovdqu [g0+0x00], y3 ; Save result.
+ vmovdqu [g0+0x00], y3 ; Save the result.
vpaddb y7, y8 ; Skip from rows 1|3 to rows 4|6.
vpaddb y7, y8
@@ -619,7 +637,7 @@ DEFFUN f265_lbd_predict_intra_dia_top_left_8_avx2, ia=4, at=8844, ti=0, tv=3, ym
DO_ROW 6, 4, %1 ; Do row 5 and 7.
vpackuswb y3, y4 ; Merge value.
- vmovdqu [g0+0x20], y3 ; Save result.
+ vmovdqu [g0+0x20], y3 ; Save the result.
%endmacro
@@ -629,17 +647,18 @@ DEFFUN f265_lbd_predict_intra_ver_left_8_avx2, ia=4, at=8844, ti=0, tv=11, ym=1
vpinsrb x0, [g1+128], 8 ; Load top-left.
; Re-order the left neighbours.
- lea g3, [inv_angle_8]
- vpbroadcastd y2, [g3+g2*4-18*4] ; Load the inversed angle values.
+ lea g3, [inv_angle]
+ vpbroadcastd x2, [g3+g2*4-18*4] ; Load the inversed angle values.
vmovdqu x3, [angle_inv_mul_ver] ; Load the inversed weight values.
- vpmaddubsw y2, y2, y3 ; Get the weight. Some neighbour will have an invalid offset.
+ vpmullw x2, x2, x3 ; Get the weight. Some neighbour will have an invalid offset.
; Since we never use them, it's ok.
- vpbroadcastd y3, [pat_w_8] ; Load inversed angle bias.
- vpaddw y2, y3 ; Add inversed angle bias.
- vpsraw y2, 4 ; Get inversed neighbour offset.
- vpsubb y2, y3, y2 ; Invert the index.
- vpackuswb y2, y2 ; Word to byte.
- vpshufb y0, y2 ; Re-order left neighbours.
+ vpbroadcastd x3, [pat_w_128] ; Load inversed angle bias.
+ vpaddw x2, x3 ; Add inversed angle bias.
+ vpsraw x2, 8 ; Get inversed neighbour offset.
+ vpbroadcastd x3, [pat_w_8] ; Load inversed angle offset.
+ vpsubb x2, x3, x2 ; Invert the index.
+ vpackuswb x2, x2 ; Word to byte.
+ vpshufb x0, x2 ; Re-order left neighbours.
; Blend re-ordered neighbours with the top neighbours.
vmovhps x0, [g1+64]
@@ -745,7 +764,7 @@ DEFFUN f265_lbd_predict_intra_dia_top_right_8_avx2, ia=4, at=8844, ti=0, tv=3, y
; - g2: pred_stride.
; - g3: avail[2].
; - g4: filter_flag.
-; - g5: packed (Ignored).
+; - g5: packed (ignored).
DEFFUN f265_lbd_extract_intra_neigh_8_avx2, ia=6, at=884844, ti=1, tv=8, ym=1
; Load availability.
movzx g5, byte [g3] ; Load availx.
@@ -765,7 +784,7 @@ DEFFUN f265_lbd_extract_intra_neigh_8_avx2, ia=6, at=884844, ti=1, tv=8, ym=1
; Get 4 left neighbours.
; Input:
- ; - %1: the xmm register in which to save the value,
+ ; - %1: the xmm register in which to save the value.
; - %2: temp.
; - %3: temp.
; - ga: first row address. Must be aligned on the dword left of the row.
@@ -784,7 +803,7 @@ DEFFUN f265_lbd_extract_intra_neigh_8_avx2, ia=6, at=884844, ti=1, tv=8, ym=1
vpshufb %1, x7 ; Keep the last byte of each dword.
%endmacro
- vpbroadcastd x7, [neigh_last_b_of_d] ; Load suffle mask.
+ vpbroadcastd x7, [neigh_last_b_of_d] ; Load shuffle mask.
lea ga, [g1-4]
lea g3, [g2*3]
@@ -917,6 +936,7 @@ DEFFUN f265_lbd_extract_intra_neigh_8_avx2, ia=6, at=884844, ti=1, tv=8, ym=1
lea ga, [ga+g2*4]
load x4, x5, x6
+ %unmacro load 3
vpblendd x3, x3, x4, 0b0101
@@ -946,4 +966,618 @@ DEFFUN f265_lbd_extract_intra_neigh_8_avx2, ia=6, at=884844, ti=1, tv=8, ym=1
vmovd [g0+160+128], x0
RET
- %unmacro load 2
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Duplicate above functions for 4x4 block size.
+
+
+; Intra DC 4x4.
+DEFFUN f265_lbd_predict_intra_dc_4_avx2, ia=4, at=8844, ti=0, tv=4, ym=0
+ ; Logic:
+ ; Sum all direct neighbours.
+ ; Divide with bias by the number of samples.
+
+ vmovd x2, [g1+60] ; Load left neighbours.
+ vmovd x3, [g1+64] ; Load top neighbours.
+
+ vpmovzxbw x2, x2 ; Byte to word.
+ vpmovzxbw x3, x3
+
+ ; Add top and left neighbours.
+ vpaddw x0, x2, x3
+
+    ; Fold the partial sum in half.
+ vpalignr x1, x0, x0, 4
+ vpaddw x0, x0, x1
+
+ ; Get a single sum.
+ vphaddw x0, x0, x0
+
+ ; Round.
+ vmovd x1, [pat_w_4096]
+ vpmulhrsw x0, x0, x1 ; Round.
+ vpbroadcastb x0, x0
+
+ ; Should it be filtered?
+ test g3, 1
+ jz .SKIP_FILTER
+
+ ; 3 cases:
+ ; - Top-left = 2*base + top + left.
+ ; - Top = 3*base + top.
+ ; - Left = 3*base + left.
+
+ ; Blend top and left neighbours.
+ vpalignr x3, x3, x3, 8
+ vpblendd x2, x3, x2, 0b0011
+
+ movd g2d, x0 ; Extract base.
+ and g2, 0xff
+
+ lea g3, [3*g2+2] ; Base * 3 + rounding bias.
+ movd x1, g3d
+ vpbroadcastw x1, x1 ; Broadcast base * 3 + rounding bias.
+
+ vpaddw x2, x1 ; 3 * base + neighbours + rounding bias.
+ vpsrlw x2, 2 ; Divide by 4.
+ vpackuswb x2, x2 ; Word to byte.
+
+ vpalignr x1, x2, 4
+ vpblendd x0, x1, x0, 0xfe ; Blend in the top row.
+
+ vpshufb x2, x2, [intra_hor_4]
+ vpbroadcastd x1, [pat_q_255]
+ vpblendvb x0, x0, x2, x1
+
+ ; Do top-left.
+ movzx g3, byte [g1+64] ; Load the first top and left value.
+ movzx ga, byte [g1+63]
+ add g3, ga ; Top + left.
+ lea g2, [2*g2+g3+2] ; Top + left + 2*base + bias.
+ shr g2, 2 ; Get the average.
+ vpinsrb x0, g2b, 0 ; Insert the result.
+
+ .SKIP_FILTER:
+
+ vmovdqu [g0], x0 ; Save the results.
+
+ RET
+
+
+; Intra planar 4x4.
+DEFFUN f265_lbd_predict_intra_planar_4_avx2, ia=4, at=8844, ti=0, tv=4, ym=1
+    ; Value[x,y] = ((3-x)*left + (3-y)*top + (x+1)*top_right + (y+1)*bottom_left + 4) >> 3.
+
+ ; Get top neighbour weighted values.
+ vpbroadcastd y1, [g1+64] ; Load & broadcast top neighbours.
+ vmovd x2, [g1+59] ; Load & broadcast bottom-left.
+ vpbroadcastb y2, x2
+ vpunpcklbw y1, y1, y2 ; Mix top with bottom-left neighbours.
+ vpmaddubsw y1, y1, [intra_p_hor_4] ; Get the value of each column.
+
+ ; Get left neighbour weighted values.
+ vpbroadcastd y3, [g1+60] ; Load left neighbours.
+ vpshufb y3, [intra_hor_4] ; Re-order them.
+ vmovd x2, [g1+68] ; Load & broadcast top-right.
+ vpbroadcastb y2, x2
+ vpunpcklbw y3, y3, y2 ; Mix top right with left neighbours.
+ vpbroadcastq y0, [intra_p_ver_4] ; Load & broadcast weight.
+ vpmaddubsw y3, y3, y0 ; Get the value of each row.
+
+ vpaddw y1, y3 ; Get the final sums.
+ vpbroadcastd y0, [pat_w_4096] ; Round the sums.
+ vpmulhrsw y1, y1, y0
+ vpackuswb y1, y1
+
+ vmovq [g0], x1 ; Save the result.
+ vextracti128 x1, y1, 1
+ vmovq [g0+8], x1
+ RET
+
+
+; Intra pure diagonal bottom-left 4x4.
+DEFFUN f265_lbd_predict_intra_dia_bot_left_4_avx2, ia=4, at=8844, ti=0, tv=1, ym=0
+ vmovq x0, [g1+56] ; Load left neighbours.
+ vpshufb x0, [intra_dia_bl_4] ; Re-order them.
+ vmovdqu [g0], x0 ; Save the results.
+ RET
+
+
+; Calculate weight and offset.
+; Register usage:
+; - y0: angle values. Output as the offset.
+; - y1: weight (output).
+; - y2: tmp.
+%macro GEN_WEIGHT_OFF 2 ; %1: 1 if vertical prediction, %2: 1 if we need to import neighbour
+ ; from the other side.
+ %assign IS_VERT %1
+ %assign IS_INVERSED %2
+
+ ; Generate weights.
+ vpbroadcastd y2, [pat_w_31] ; Load weight mask.
+ vpand y1, y0, y2 ; Extract weight.
+ vpbroadcastd y2, [pat_w_32] ; Load weight sum.
+ vpsubw y2, y2, y1 ; Get weight complement.
+
+ %if (IS_VERT && IS_INVERSED) || (!IS_VERT && !IS_INVERSED)
+ vpsllw y2, y2, 8
+ vpor y1, y1, y2
+ %elif (IS_VERT && !IS_INVERSED) || (!IS_VERT && IS_INVERSED)
+ vpsllw y1, y1, 8
+ vpor y1, y1, y2
+ %endif
+
+ ; Generate offset.
+ vpsrlw y0, y0, 5 ; Get the offset (word).
+
+ %if !IS_VERT && IS_INVERSED
+ vpaddw y0, [intra_inv_ox_4] ; Add the effect of rows.
+ %elif !IS_VERT && !IS_INVERSED
+ vpaddw y0, [intra_ox_4] ; Add the effect of rows.
+ %elif IS_VERT && !IS_INVERSED
+ vpbroadcastq y2, [angle_inv_mul_hor]
+ vpaddw y0, y2 ; Add the effect of columns.
+ %elif IS_VERT && IS_INVERSED
+ ; No processing in this case.
+ %endif
+
+ ; Double word value (once per byte).
+ vpsllw y2, y0, 8
+ vpor y0, y2, y0
+
+ %if !IS_VERT && !IS_INVERSED
+ vpbroadcastd y2, [pat_b_14_15] ; Generate offset pairs.
+ vpsubw y0, y2, y0 ; Final offset.
+
+ %elif !IS_VERT && IS_INVERSED
+ vpbroadcastd y2, [pat_b_4_5] ; Generate offset pairs.
+ vpaddw y0, y2, y0 ; Final offset.
+
+ %elif IS_VERT && !IS_INVERSED
+ vpbroadcastd y2, [pat_b_0_1] ; Generate offset pairs.
+ vpaddw y0, y2, y0 ; Final offset.
+
+ %elif IS_VERT && IS_INVERSED
+ vpbroadcastq y2, [intra_tl_4] ; Generate offset pairs.
+ vpsubw y0, y2, y0 ; Final offset.
+ %endif
+%endmacro
+
+
+; Calculate prediction and save it.
+; Register usage:
+; - y0: offset.
+; - y1: weight.
+; - y%1: neighbours.
+%macro DO_PRED 1
+ vpshufb y%1, y0 ; Position neighbours pairs.
+ vpmaddubsw y%1, y1 ; Multiply by weight.
+ vpbroadcastw y1, [pat_w_1024]
+ vpmulhrsw y%1, y1 ; Round.
+
+ vpackuswb y%1, y%1, y%1 ; Word to byte.
+ vmovq [g0], x%1 ; Save the results.
+ vextracti128 x%1, y%1, 1
+ vmovq [g0+8], x%1
+%endmacro
+
+
+; Calculate inverted neighbours offset.
+%macro INV_RESHUF 3 ; %1: 1 if we are doing vertical prediction, %2: output register, %3: tmp.
+ %assign IS_VERT %1
+
+ ; Load inversed angle.
+ lea g3, [inv_angle]
+ %if !IS_VERT
+ neg g2
+ vpbroadcastd %2, [g3+g2*4+18*4]
+ ;vpbroadcastq %3, [angle_mul_hor] ; Load offset multiplier.
+ %else
+ vpbroadcastd %2, [g3+g2*4-18*4] ; Load the inversed angle values.
+ vmovq %3, [angle_inv_mul_ver+8] ; Load the inversed weight values.
+ %endif
+ vpmullw %2, %2, %3 ; Get the weight. Some neighbour will have an invalid offset.
+
+ vpbroadcastd %3, [pat_w_128] ; Load offset rounding bias.
+ %if IS_VERT
+ vpaddw %2, %3 ; Add rounding.
+ %else
+ vpsubw %2, %3 ; Reuse an existing pattern that is "one off".
+ %endif ; Subtract bias to compensate.
+ vpsraw %2, 8 ; Round.
+
+ %if IS_VERT
+ vpbroadcastd %3, [pat_w_8] ; Load inversed angle offset.
+ vpsubb %2, %3, %2 ; Invert the index.
+ %endif
+
+ vpackuswb %2, %2, %2 ; Word to byte.
+%endmacro
+
+
+; Intra angular horizontal bottom 4x4.
+DEFFUN f265_lbd_predict_intra_hor_bot_4_avx2, ia=4, at=8844, ti=0, tv=3, ym=1
+ ; Get angle values.
+ neg g2
+ lea g3, [intra_angle]
+ vpbroadcastd y0, [g3+g2*4+9*4]
+
+ vpbroadcastq y2, [angle_mul_hor] ; Load weight mask.
+ vpmaddubsw y0, y2 ; Multiply to match lane offsets.
+
+ GEN_WEIGHT_OFF 0, 0
+
+ ; Calculate prediction.
+ vbroadcasti128 y2, [g1+48] ; Load neighbours.
+
+ DO_PRED 2
+ RET
+
+
+; Intra angular horizontal top 4x4.
+DEFFUN f265_lbd_predict_intra_hor_top_4_avx2, ia=4, at=8844, ti=0, tv=4, ym=1
+ ; Get angle values.
+ lea g3, [intra_angle]
+ vpbroadcastd y0, [g3+g2*4-11*4]
+
+ vpbroadcastq y3, [angle_mul_hor] ; Load weight mask.
+ vpmaddubsw y0, y3 ; Multiply to match lane offsets.
+
+ GEN_WEIGHT_OFF 0, 1
+
+ ; Load inversed angle.
+ INV_RESHUF 0, x2, x3
+
+ vmovq x3, [g1+64] ; Load top data.
+ vpshufb x2, x3, x2 ; Re-order top neighbours.
+ vpalignr x3, x2, x2, 15
+
+ ; Re-order neighbours.
+ vmovq x2, [g1+56] ; Load left data.
+ vpblendd x2, x3, x2, 0b0011 ; Blend left neighbours with top neighbours.
+ vpinsrb x2, [g1+128], 8 ; Insert top-left neighbour.
+ vinserti128 y2, y2, x2, 1
+
+ DO_PRED 2
+ RET
+
+
+; Intra angular vertical left 4x4.
+DEFFUN f265_lbd_predict_intra_ver_left_4_avx2, ia=4, at=8844, ti=0, tv=4, ym=1
+
+ vmovq x0, [g1+64-8] ; Load left neighbours.
+ vpinsrb x0, [g1+128], 8 ; Load & insert top-left neighbour.
+
+ ; Re-order the left neighbours.
+ INV_RESHUF 1, x2, x1
+
+ ; Blend re-ordered neighbours with the top neighbours.
+ vpshufb x2, x0, x2 ; Re-order left neighbours.
+ vpbroadcastd x3, [g1+64] ; Load top neighbours with an offset in the register.
+ vpblendd x3, x2, x3, 0b1110 ; Blend with re-ordered left neighbours.
+ vinserti128 y3, y3, x3, 1 ; Duplicate.
+
+ ; Get angle values.
+ neg g2
+ lea g3, [intra_angle]
+ vpbroadcastd y0, [g3+g2*4+25*4]
+
+ vpmaddubsw y0, [intra_ver_4] ; Multiply to match lane offsets.
+
+ GEN_WEIGHT_OFF 1, 1
+ DO_PRED 3
+ RET
+
+
+; Intra angular vertical right 4x4.
+DEFFUN f265_lbd_predict_intra_ver_right_4_avx2, ia=4, at=8844, ti=0, tv=3, ym=1
+ ; Get angle values.
+ lea g3, [intra_angle]
+ vpbroadcastd y0, [g3+g2*4-27*4]
+
+ vpmaddubsw y0, [intra_ver_4] ; Multiply to match lane offsets.
+
+ GEN_WEIGHT_OFF 1, 0
+
+ ; Calculate prediction.
+ vbroadcasti128 y2, [g1+64] ; Load neighbours.
+
+ DO_PRED 2
+ RET
+
+%unmacro GEN_WEIGHT_OFF 2
+%unmacro DO_PRED 1
+%unmacro INV_RESHUF 3
+
+
+; Intra pure horizontal 4x4.
+DEFFUN f265_lbd_predict_intra_hor_4_avx2, ia=4, at=8844, ti=0, tv=3, ym=0
+ vmovd x0, [g1+60]
+ vpshufb x0, [intra_hor_4]
+
+ and g3, 1
+ jz .SKIP_FILTER
+
+ vpmovzxbw x1, [g1+64] ; Load top neighbours.
+ vmovd x2, [g1+128] ; Load & broadcast top-left neighbour.
+ vpbroadcastb x2, x2
+ vpmovzxbw x2, x2 ; Byte to word.
+ vpsubw x1, x2 ; Top - top-left.
+ vpsraw x1, 1 ; (top - top-left)/2.
+
+ vmovd x2, [g1+63] ; Load & broadcast topmost left neighbour.
+ vpbroadcastb x2, x2
+ vpmovzxbw x2, x2 ; Byte to word.
+ vpaddw x1, x2 ; Left + (top - top-left)/2.
+
+ vpxor x2, x2 ; Replace negative values by 0.
+ vpmaxsw x1, x2
+
+ vpackuswb x1, x1 ; Word to byte with unsigned saturation.
+
+ vpblendd x0, x1, x0, 0xfe ; Update the first row.
+
+ .SKIP_FILTER:
+ vmovdqu [g0], x0 ; Save the results.
+ RET
+
+
+; Intra pure diagonal top-left 4x4.
+DEFFUN f265_lbd_predict_intra_dia_top_left_4_avx2, ia=4, at=8844, ti=0, tv=1, ym=0
+ vmovq x0, [g1+56] ; Load top row.
+ vmovhps x0, [g1+64] ; Load left row.
+ vpinsrb x0, [g1+128], 0 ; Load & insert top-left.
+ vpshufb x0, [intra_dia_tl_4] ; Shuffle.
+ vmovdqu [g0], x0 ; Save the results.
+ RET
+
+
+; Intra pure vertical 4x4.
+DEFFUN f265_lbd_predict_intra_ver_4_avx2, ia=4, at=8844, ti=0, tv=3, ym=0
+ vpbroadcastd x0, [g1+64] ; Copy the top neighbours 4 times.
+
+ and g3, 1
+ jz .SKIP_FILTER
+
+ vpmovzxbw x1, [g1+60] ; Load left column.
+ vmovd x2, [g1+128] ; Load & broadcast top-left.
+ vpbroadcastb x2, x2
+ vpmovzxbw x2, x2 ; Byte to word.
+ vpsubw x1, x2 ; Left - top-left.
+ vpsraw x1, 1 ; (left - top-left)/2.
+
+ vmovd x2, [g1+64] ; Load top.
+ vpbroadcastb x2, x2
+ vpmovzxbw x2, x2 ; Byte to word.
+ vpaddw x1, x2 ; Top + (left - top-left)/2.
+
+ vpxor x2, x2 ; Replace negative values by 0.
+ vpmaxsw x1, x2
+
+ vpackuswb x1, x1 ; Word to byte with unsigned saturation.
+ vpshufb x1, [intra_hor_4] ; Re-order them.
+
+ vpbroadcastd x2, [pat_q_255]
+ vpblendvb x0, x0, x1, x2 ; Update the first byte of every rows.
+
+ .SKIP_FILTER:
+ vmovdqu [g0], x0 ; Save the results.
+ RET
+
+
+; Intra pure diagonal top-right 4x4.
+DEFFUN f265_lbd_predict_intra_dia_top_right_4_avx2, ia=4, at=8844, ti=0, tv=1, ym=0
+ vmovq x0, [g1+64] ; Load all data.
+ vpshufb x0, [intra_dia_tr_4] ; Shuffle data.
+ vmovdqu [g0], x0 ; Save it.
+ RET
+
+
+; Extract and filter neighbours for intra prediction.
+;
+; Input format:
+; EAB
+; C
+; D
+;
+; Output format:
+; padding [56] [64] padding [128]
+; [ ... DC AB ... E]
+;
+; Input parameters:
+; - g0: nbuf[2][160].
+; - g1: pred.
+; - g2: pred_stride.
+; - g3: avail[2].
+; - g4: filter_flag.
+; - g5: packed (ignored).
+DEFFUN f265_lbd_extract_intra_neigh_4_avx2, ia=6, at=884844, ti=1, tv=5, ym=1
+ ; Get 4 left neighbours.
+ ; Input:
+ ; - %1: the xmm register in which to save the value.
+ ; - %2: temp.
+ ; - %3: temp.
+ ; - ga: first row address. Must be aligned on the dword left of the row.
+ ; - g2: pred_stride.
+ ; - g3: 3*pred_stride.
+ %macro load 3
+ vpbroadcastd %1, [ga] ; Load & broadcast the left neighbour.
+ vpbroadcastd %2, [ga+g2] ; Load & broadcast the next left neighbour.
+ vpblendd %1, %1, %2, 0b0101_0101 ; Mix even and odd row: result 1 0 1 0.
+
+ vpbroadcastd %2, [ga+g2*2] ; Load & broadcast the next left neighbour.
+ vpbroadcastd %3, [ga+g3] ; Load & broadcast the next left neighbour.
+ vpblendd %2, %2, %3, 0b0101_0101 ; Mix even and odd row: result 3 2 3 2.
+
+ vpblendd %1, %1, %2, 0b0011_0011 ; Mix 1 0 and 3 2. Result 3 2 1 0.
+ vpshufb %1, x4 ; Keep the last byte of each dword.
+ %endmacro
+
+
+ ; Load availability.
+ movzx g5, byte [g3] ; Load availx.
+ movzx g6, byte [g3+4] ; Load availy.
+
+ ; Test if left neighbours are available.
+ cmp g6, 0
+ jz .LEFT_NOT_AVAILABLE
+
+ ; Left neighbours are available.
+ vpbroadcastd x4, [neigh_last_b_of_d] ; Load shuffle mask.
+
+ lea ga, [g1-4] ; Align to load the left neighbours as a double.
+ lea g3, [g2*3] ; 3* pred_stride.
+ load x0, x1, x2 ; Load C0 to C3.
+
+ ; Test if bottom-left neighbours are available.
+ cmp g6, 4
+ jg .BOTTOM_AVAIL
+
+ ; Bottom-left neighbours are unavailable. Broadcast the bottommost left neighbour.
+ vpalignr x1, x0, x0, 4
+ vpbroadcastb x1, x1
+ vpblendd x0, x0, x1, 0b0001
+
+ ; Left present and loaded. Bottom-left loaded or emulated.
+ .LEFT_LOADED:
+
+ ; Test if the top neighbours are present.
+ cmp g5, 0
+ jz .TOP_NOT_AVAILABLE
+
+ ; Load top and top-left neighbours.
+ mov ga, g2
+ neg ga
+ vmovhps x0, [g1+ga] ; Load top neighbours. Also eagerly load top-left neighbour.
+ vmovd x2, [g1+ga-1] ; Load top-left.
+
+ .TOP_LOADED:
+
+ ; Top-right neighbours might be missing.
+ vpalignr x1, x0, x0, 11 ; Broadcast rightmost top neighbours.
+ vpbroadcastb x1, x1
+
+    sub g5, 1                      ; Remove 1 from the number of available top neighbours (nb_availx).
+    movd x3, g5d                   ; Required because vpcmpgtb is signed and 128 is a possible value.
+    vpbroadcastb x3, x3            ; Broadcast nb_availx.
+    vmovdqu x4, [neig_bl_unav_8]   ; Neighbour is present if nb_availx is at least the index of the neighbour.
+ vpcmpgtb x4, x3 ; Generate mask from availability,
+ vpblendvb x0, x0, x1, x4 ; Blend broadcasted neighbours if required.
+
+ .LEFT_AND_TOP_LOADED:
+
+ ; Save unfiltered neighbours.
+ vmovq [g0+56], x0
+ vmovhps [g0+64], x0
+ vmovd [g0+128], x2
+
+ ; Test if filtering is required.
+ cmp g4, 0
+ je .SKIP_FILTER
+
+ ; Filter.
+ ; Pseudo code:
+ ; Register ordering : D3, D2, D1, D0, C3, ..., C0, E, A0, ..., A3, B0, ... B3.
+ ; V*[i] = (V[i-1] + 2*V[i] + V[i+1] + 2) >> 2.
+ ; D*3 = D3, B*3 = B3.
+
+ vpalignr x3, x0, x0, 7 ; Offset top neighbours, keeping space for top-left.
+
+ vmovd g4d, x2 ; Push top-left to register.
+ vmovd g3d, x0 ; Keep a copy of D3.
+ vpextrb g2d, x0, 15 ; Keep a copy of B3.
+
+ vpextrb g1d, x0, 8 ; Extract A0.
+ vpinsrb x0, g1d, 9 ; Insert the A0 that will filter E.
+ vpinsrb x0, g4d, 8 ; Insert E.
+ vpinsrb x3, g4d, 0 ; Insert the E that will filter A0.
+ vinserti128 y0, y0, x3, 1 ; Merge top row with left row.
+
+ vpbroadcastd y4, [pat_b_1]
+ vpmaddubsw y1, y0, y4 ; Add pairs of neighbours. Generate odd-even pairs (C3+C2, ...).
+ vpalignr y0, y0, 1 ; Offset pairs by one byte.
+ vpmaddubsw y0, y0, y4 ; Add pairs of neighbours. Generate even-odd pairs (C2+C1, ...).
+ vpaddw y2, y0, y1 ; Add both pairs together. Generate half of the sum (C3+2*C2+C1).
+
+ vpalignr y3, y0, y0, 14 ; Offset low pairs.
+ vpblendd y0, y0, y3, 0x0f ; Keep high lane intact.
+
+ vpalignr y3, y1, y1, 2 ; Offset high pairs.
+ vpblendd y1, y1, y3, 0xf0 ; Keep low lane intact.
+
+ vpaddw y0, y0, y1 ; Generate remaining values (C2+2*C1+C0).
+
+ ; Round.
+ vpbroadcastd y4, [pat_w_8192]
+ vpmulhrsw y0, y0, y4
+ vpmulhrsw y2, y2, y4
+
+ ; Pack to byte.
+ vpackuswb y0, y0, y0
+ vpackuswb y2, y2, y2
+
+ ; Get filtered left neighbours.
+ vpunpcklbw y3, y0, y2 ; Intermix result.
+ vpinsrb x3, g3d, 0 ; Insert not-filtered D3.
+ vmovq [g0+160+56], x3 ; Save filtered left neighbours.
+ vmovhps [g0+160+128], x3 ; Save filtered top-left.
+
+ ; Get filtered top neighbours.
+ vpunpcklbw y3, y2, y0 ; Intermix result.
+ vextracti128 x3, y3, 1 ; Extract high lane.
+ vpinsrb x3, g2d, 7 ; Insert not-filtered B3.
+    vmovq [g0+160+64], x3          ; Save filtered top neighbours.
+
+ .SKIP_FILTER:
+ RET
+
+
+ .TOP_NOT_AVAILABLE:
+ ; Left neighbours are present and loaded.
+ vpalignr x2, x0, x0, 7 ; C0 as E.
+ vpbroadcastb x2, x2 ; Broadcast C0.
+ vpblendd x0, x0, x2, 0b1100 ; Blend broadcasted value as top neighbours.
+ jmp .LEFT_AND_TOP_LOADED
+
+
+ .BOTTOM_AVAIL:
+ ; Left neighbours are loaded.
+ lea ga, [ga+g2*4]
+ load x1, x2, x3 ; Load D0 to D3.
+ vpblendd x0, x0, x1, 0b0001 ; Blend bottom with left.
+ jmp .LEFT_LOADED
+
+
+ .LEFT_NOT_AVAILABLE:
+ ; Left is not available.
+
+ ; Check if the top is available.
+ cmp g5, 0
+ je .NOTHING_AVAILABLE
+
+ ; Top is available. Load and broadcast on left.
+ mov ga, g2
+ neg ga
+ vmovq x2, [g1+ga] ; Load top neighbours. Preemptively load top-right.
+ vpalignr x0, x2, x2, 8 ; Place it within the register to the top neighbour offset.
+    vpbroadcastb x2, x2            ; Broadcast A0 (the first top neighbour).
+ vpblendd x0, x0, x2, 0b0011 ; Blend left with top.
+ jmp .TOP_LOADED
+
+
+ .NOTHING_AVAILABLE:
+ ; No neighbours present. Use the default value.
+ vpbroadcastd x0, [pat_b_128] ; Broadcast default value.
+
+ vmovq [g0+56], x0 ; Save unfiltered.
+ vmovq [g0+64], x0
+ vmovq [g0+128], x0
+
+ vmovq [g0+160+56], x0 ; Save filtered.
+ vmovq [g0+160+64], x0
+ vmovq [g0+160+128], x0
+
+ %unmacro load 3
+ RET
+
diff --git a/snippets/asm.py b/snippets/asm.py
index 0a7003d..d638575 100755
--- a/snippets/asm.py
+++ b/snippets/asm.py
@@ -149,7 +149,7 @@ def declare_all():
luma_qpel_indices_avx2.append("X" if index == "X" else "%s_%s" % (index, frac))
intra_pred_indices_seed = ["4", "8", "16", "32"]
- intra_pred_indices_avx2_seed = ["X", "8", "X", "X"]
+ intra_pred_indices_avx2_seed = ["4", "8", "X", "X"]
intra_pred_indices = []
intra_pred_indices_avx2 = []
for index in intra_pred_indices_seed: