diff --git a/f265/asm.c b/f265/asm.c
index ffee220..5c7ed57 100644
--- a/f265/asm.c
+++ b/f265/asm.c
@@ -55,6 +55,10 @@ void f265_lbd_get_sb_flags_16_avx2(f265_tb_enc *tb, int16_t *qc);
void f265_lbd_get_sb_flags_32_avx2(f265_tb_enc *tb, int16_t *qc);
void f265_hbd_get_sb_flags_c(f265_tb_enc *tb, int16_t *qc);
+void f265_lbd_preprocess_tb_c(f265_tt_enc *tt, int16_t *qc, uint8_t *spill);
+void f265_lbd_preprocess_tb_avx2(f265_tt_enc *tt, int16_t *qc, uint8_t *spill);
+void f265_hbd_preprocess_tb_c(f265_tt_enc *tt, int16_t *qc, uint8_t *spill);
+
int f265_lbd_fsad_c(uint8_t *src, int src_stride, uint8_t *ref, int ref_stride, int packed_dims);
int f265_lbd_fsad_4_avx2(uint8_t *src, int src_stride, uint8_t *ref, int ref_stride, int packed_dims);
int f265_lbd_fsad_8_avx2(uint8_t *src, int src_stride, uint8_t *ref, int ref_stride, int packed_dims);
@@ -341,6 +345,7 @@ f265_lbd_idct_func f265_lbd_idct[5];
f265_lbd_quant_func f265_lbd_quant[4];
f265_lbd_dequant_func f265_lbd_dequant[4];
f265_lbd_get_sb_flags_func f265_lbd_get_sb_flags[4];
+f265_lbd_preprocess_tb_func f265_lbd_preprocess_tb;
f265_lbd_fsad_func f265_lbd_fsad[10];
f265_lbd_sad3_func f265_lbd_sad3[10];
f265_lbd_sad4_func f265_lbd_sad4[10];
@@ -352,6 +357,7 @@ f265_hbd_idct_func f265_hbd_idct[5];
f265_hbd_quant_func f265_hbd_quant[4];
f265_hbd_dequant_func f265_hbd_dequant[4];
f265_hbd_get_sb_flags_func f265_hbd_get_sb_flags[4];
+f265_hbd_preprocess_tb_func f265_hbd_preprocess_tb;
f265_hbd_fsad_func f265_hbd_fsad[10];
f265_hbd_sad3_func f265_hbd_sad3[10];
f265_hbd_sad4_func f265_hbd_sad4[10];
@@ -405,6 +411,8 @@ static void f265_link_asm(int avx2_flag)
f265_hbd_get_sb_flags[1] = f265_hbd_get_sb_flags_c;
f265_hbd_get_sb_flags[2] = f265_hbd_get_sb_flags_c;
f265_hbd_get_sb_flags[3] = f265_hbd_get_sb_flags_c;
+ f265_lbd_preprocess_tb = f265_lbd_preprocess_tb_c;
+ f265_hbd_preprocess_tb = f265_hbd_preprocess_tb_c;
f265_lbd_fsad[0] = f265_lbd_fsad_c;
f265_lbd_fsad[1] = f265_lbd_fsad_c;
f265_lbd_fsad[2] = f265_lbd_fsad_c;
@@ -547,6 +555,7 @@ static void f265_link_asm(int avx2_flag)
f265_lbd_get_sb_flags[1] = f265_lbd_get_sb_flags_8_avx2;
f265_lbd_get_sb_flags[2] = f265_lbd_get_sb_flags_16_avx2;
f265_lbd_get_sb_flags[3] = f265_lbd_get_sb_flags_32_avx2;
+ f265_lbd_preprocess_tb = f265_lbd_preprocess_tb_avx2;
f265_lbd_fsad[1] = f265_lbd_fsad_4_avx2;
f265_lbd_fsad[2] = f265_lbd_fsad_8_avx2;
f265_lbd_fsad[3] = f265_lbd_fsad_16_avx2;
diff --git a/f265/asm.h b/f265/asm.h
index 62bbbb4..d48addf 100644
--- a/f265/asm.h
+++ b/f265/asm.h
@@ -3,6 +3,7 @@
// Special code.
typedef struct f265_tb_enc f265_tb_enc;
+typedef struct f265_tt_enc f265_tt_enc;
// Typedefs.
typedef void(*f265_lbd_dct_func)(int16_t *dst, uint8_t *src, int src_stride, uint8_t *pred, int pred_stride, uint8_t *spill);
@@ -15,6 +16,8 @@ typedef void(*f265_lbd_dequant_func)(int16_t *dst, int16_t *src, int bs, int mul
typedef void(*f265_hbd_dequant_func)(int16_t *dst, int16_t *src, int bs, int mult, int add, int shift);
typedef void(*f265_lbd_get_sb_flags_func)(f265_tb_enc *tb, int16_t *qc);
typedef void(*f265_hbd_get_sb_flags_func)(f265_tb_enc *tb, int16_t *qc);
+typedef void(*f265_lbd_preprocess_tb_func)(f265_tt_enc *tt, int16_t *qc, uint8_t *spill);
+typedef void(*f265_hbd_preprocess_tb_func)(f265_tt_enc *tt, int16_t *qc, uint8_t *spill);
typedef int(*f265_lbd_fsad_func)(uint8_t *src, int src_stride, uint8_t *ref, int ref_stride, int packed_dims);
typedef int(*f265_hbd_fsad_func)(int16_t *src, int src_stride, int16_t *ref, int ref_stride, int packed_dims);
typedef void(*f265_lbd_sad3_func)(int *costs, uint8_t *src, int src_stride, uint8_t **refs, int ref_stride, int packed_dims);
@@ -58,6 +61,10 @@ extern f265_lbd_get_sb_flags_func f265_lbd_get_sb_flags[4];
// Indices: 4, 8, 16, 32.
extern f265_hbd_get_sb_flags_func f265_hbd_get_sb_flags[4];
+extern f265_lbd_preprocess_tb_func f265_lbd_preprocess_tb;
+
+extern f265_hbd_preprocess_tb_func f265_hbd_preprocess_tb;
+
// Indices: 2, 4, 8, 16, 32, 64, 6, 12, 24, 48.
extern f265_lbd_fsad_func f265_lbd_fsad[10];
diff --git a/f265/asm/avx2/encode.asm b/f265/asm/avx2/encode.asm
index 68e5026..d727371 100644
--- a/f265/asm/avx2/encode.asm
+++ b/f265/asm/avx2/encode.asm
@@ -5,11 +5,14 @@
section .data
+extern f265_scan_map_data
+extern f265_scan_map_idx
+
align 4
pat_quant_dw_1: dd 1
pat_dequant_dw_1: dw 1,1
pat_sb_shuf_8: db 3,1,2,0, 3,2,1,0, 3,1,2,0
-
+pat_b_127: times 4 db 127
align 8
pat_sb_neigh_32: dq 0x7f7f7f7f7f7f7f7f
@@ -39,6 +42,20 @@ pat_sb_shuf_32: db 0x0f,0x07,0x0e,0x06,0x0d,0x05,0x0c,0x04,0x0b,0x03,0x0a,0x
dq 0x3030303030303030
dq 0x0c0c0c0c0c0c0c0c
dq 0x0303030303030303
+pat_pp_null_sb: dq 0x1, 0x8, 0x8000, 0x8000000000000000
+pat_pp_reorder: dw 0xffff,0xffff,0xffff,0x0f0e,0xffff,0xffff,0x0706,0x0d0c
+ dw 0x0302,0x0908,0xffff,0xffff,0x0100,0xffff,0xffff,0xffff
+ dw 0xffff,0xffff,0x0504,0x0b0a,0xffff,0x0302,0x0908,0x0100
+ dw 0x0f0e,0x0706,0x0d0c,0xffff,0x0504,0x0b0a,0xffff,0xffff
+ dw 0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff
+ dw 0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff
+ dw 0x0f0e,0x0d0c,0x0b0a,0x0908,0x0706,0x0504,0x0302,0x0100
+ dw 0x0f0e,0x0d0c,0x0b0a,0x0908,0x0706,0x0504,0x0302,0x0100
+ dw 0xffff,0xffff,0x0f0e,0x0706,0xffff,0xffff,0x0d0c,0x0504
+ dw 0x0b0a,0x0302,0xffff,0xffff,0x0908,0x0100,0xffff,0xffff
+ dw 0xffff,0xffff,0x0b0a,0x0302,0xffff,0xffff,0x0908,0x0100
+ dw 0x0f0e,0x0706,0xffff,0xffff,0x0d0c,0x0504,0xffff,0xffff
+
section .text
; ---------------------- QUANT/DEQUANT macros ---------------------
@@ -377,3 +394,172 @@ DEFFUN f265_lbd_get_sb_flags_32_avx2, ia=2, at=88, ti=3, tv=6, ym=1
vpacksswb y0, y1
ret
+
+; void f265_lbd_preprocess_tb_avx2(f265_tt_enc *tt, int16_t *qc, uint8_t *spill)
+; Input parameters:
+; - g0: tt.
+; - g1: qc.
+; - g2: spill buffer.
+;
+; We use two passes to minimize latency. The first pass stores the reordered
+; coefficients. The second pass processes them.
+;
+; Register usage, pass 1.
+; - ga: tmp.
+; - g0: tt.
+; - g1: qc.
+; - g2: iteration counter preserved for pass 2.
+; - g3: tmp.
+; - g4: tb->nz_flags[0].
+; - g5: offset map.
+; - g6: stride.
+; - g7: stride*3.
+; - g8: iteration counter.
+; - g9: null subblock flag.
+; - y0-1: tmp.
+; - y2-3: patterns.
+;
+; Register usage, pass 2.
+; - ga: tmp.
+; - g0: tt.
+; - g1: tmp.
+; - g2: iteration counter.
+; - g3: nz.
+; - g4: gt1.
+; - g5: gt2.
+; - g6: tt->sb.
+; - g7: tt->levels.
+; - g8: iteration end.
+; - y0-2: tmp.
+; - y3-8: patterns.
+;
+; TODO: add special case for 4x4, possibly from the function dispatcher to avoid a branch.
+DEFFUN f265_lbd_preprocess_tb_avx2, ia=3, at=888, ti=7, tv=9, ym=1
+
+ ; Initialize first pass.
+ mov g3, [g0+8] ; tt->tb.
+ mov g4, [g3+8] ; tb->nz_flags[0].
+ movzx ga, byte [g3+0] ; tb->lg_bs.
+ movzx g3, byte [g3+1] ; tb->order.
+ lea g5, [pat_pp_null_sb] ; Remember whether there is a null subblock.
+ xor g9, g9
+ test g4, [g5+8*ga-16]
+ setz g9b
+ mov g6, 2 ; Stride.
+ shlx g6, g6, ga
+ lea g7, [3*g6] ; 3*stride.
+ lea g5, [f265_scan_map_idx] ; Coefficient offset map.
+ lea ga, [3*ga-3*2] ; 3*(lg_bs-2).
+ add ga, g5
+ movzx ga, byte [ga+g3]
+ lea g5, [f265_scan_map_data + 256]
+ add g5, ga
+ lea ga, [pat_pp_reorder] ; Reorder patterns.
+ shl g3, 6
+ vmovdqu y2, [ga+g3]
+ vmovdqu y3, [ga+g3+32]
+ mov g8, g2 ; Iteration counter.
+
+ ; First pass.
+ tzcnt ga, g4 ; Position of the next subblock.
+ .loop_pass1:
+ movzx g3, byte [g5+ga] ; Subblock offset.
+ blsi ga, g4 ; Update the non-zero bitfield.
+ xor g4, ga
+
+ lea ga, [g1 + 8*g3] ; Load the coefficients.
+ vmovdqu y0, [ga]
+ vpunpcklqdq y0, [ga + g6]
+ vmovdqu y1, [ga + 2*g6 - 16] ; Offset to avoid cross-lane merges.
+ vpunpcklqdq y1, [ga + g7 - 16]
+ vpblendd y0, y0, y1, 0xf0
+
+ vpshufb y1, y0, y3 ; Reorder within lanes, leaving holes for missing values.
+ vpshufb y0, y0, y2
+ vpermq y1, y1, 0x4e
+ vpor y0, y1
+
+ vpacksswb y1, y0, y0 ; Pack to 8-bit.
+ vpermq y1, y1, 8
+ vpabsw y0, y0 ; Make the 16-bit levels absolute.
+ vmovdqu [g8], y0 ; Store the 16-bit levels and 8-bit levels aligned (64 bytes).
+ vmovdqu [g8+32], y1
+ add g8, 64
+
+ tzcnt ga, g4 ; Pass to the next subblock.
+ jnc .loop_pass1
+
+ ; Initialize second pass.
+ mov g6, [g0+16] ; tt->sb.
+ mov g7, [g0+24] ; tt->levels.
+ vpbroadcastd y8, [pat_b_127] ; 127.
+ vmovdqu y7, [pat_sb_shuf_16+16] ; Reverse order (borrowed from horizontal order).
+ vpcmpeqb y3, y3 ; -1.
+ vpxor y4, y4 ; 0.
+ vpabsb y5, y3 ; 1.
+ vpaddb y6, y5, y5 ; 2.
+ add qword [g0+8], 40 ; tb++.
+
+ ; Second pass.
+ .loop_pass2:
+ vmovdqu y0, [g2] ; 16-bit absolute levels.
+ vmovdqu y1, [g2+32] ; 8-bit signed levels.
+ vpabsb y2, y1 ; 8-bit absolute levels.
+ vpminub y2, y8 ; Convert -128 to 127 to avoid signed issues below.
+ add g2, 64
+
+ vpshufb y1, y7 ; sign reverse.
+ vpcmpgtb y1, y1, y3
+ vpmovmskb ga, x1
+ not gaw
+ vpcmpgtb y1, y2, y4 ; nz.
+ vpmovmskb g3, x1
+ vpshufb y1, y7 ; nz reverse.
+ vpmovmskb g1, x1
+ vpcmpgtb y1, y2, y5 ; gt1.
+ vpmovmskb g4, x1
+ vpcmpgtb y1, y2, y6 ; gt2.
+ vpmovmskb g5, x1
+
+ pext ga, ga, g1 ; Extract the signs.
+ mov [g6+0], g3w ; Store the non-zero flags and the signs.
+ mov [g6+2], gaw
+
+ pext g4, g4, g3 ; Extract the 8 gt1 flags.
+ and g4, 0xff
+ mov [g6+6], g4b ; Store the gt1 flags.
+
+ pext g5, g5, g3 ; Extract all gt2 flags.
+ blsi ga, g4 ; Extract the first gt1 flag set, or 0 if none.
+ and g5, ga ; Extract the first gt2 flag, or 0 if none.
+ setnz gab ; True if the gt2 flag is 1.
+ popcnt g1, g3 ; Count the number of non-zero flags.
+ shl ga, 5 ; Store packed_data.
+ or ga, g1
+ mov [g6+7], gab
+
+ blsi ga, g4 ; Extract the first gt1 flag set, or 0 if none.
+ xor ga, g4 ; Clear the gt2 position in the gt1 flags.
+ or ga, g5 ; Set the gt2 position in the gt1 flags if the gt2 flag is set.
+ or ga, 0xff00 ; Set the bit of all coefficients that haven't been inferred.
+ pdep ga, ga, g3
+ mov [g6+4], gaw ; Store the remaining flags.
+
+ vmovdqu [g7], y0 ; Store the levels tentatively.
+ xor g1, g1 ; levels++ if there are uninferred coefficients.
+ test ga, ga
+ setnz g1b
+ shl g1, 5
+ add g7, g1
+
+ add g6, 8 ; Pass to the next subblock.
+ cmp g2, g8
+ jnz .loop_pass2
+
+ ; Finish.
+ mov qword [g6], 0 ; Add the null subblock as needed.
+ lea g6, [g6+8*g9]
+ mov [g0+16], g6 ; tt->sb.
+ mov [g0+24], g7 ; tt->levels.
+ RET
+
diff --git a/f265/asm/x86inc.asm b/f265/asm/x86inc.asm
index 4135a90..f8427a4 100644
--- a/f265/asm/x86inc.asm
+++ b/f265/asm/x86inc.asm
@@ -42,8 +42,8 @@
; The general-purpose registers (GPR) are named gN, where N is an integer
; argument position. For example, g0 and g1 are the registers that contain the
; first and the second integer argument of a function respectively. gN is
-; 64-bit, gNd is 32-bit, gNb is 8-bit. As a special case, ga is the rax register
-; and gs is the rsp register.
+; 64-bit, gNd is 32-bit, gNw is 16-bit, gNb is 8-bit. As a special case, ga is
+; the rax register and gs is the rsp register.
;
; The vector registers (VEC) are named xN and yN, where N is a floating point
; argument position. For example, x0 and x1 are the registers that contain the
@@ -106,40 +106,41 @@
%endmacro
; Declare the general-purpose registers.
-%macro DECLARE_GPR 4 ; %1: register name, %2: "q" name, %3, "d" name, %4: "b" name.
+%macro DECLARE_GPR 5 ; %1: register name, %2: "q" name, %3: "d" name, %4: "w" name, %5: "b" name.
%define g%1 %2
%define g%1d %3
- %define g%1b %4
+ %define g%1w %4
+ %define g%1b %5
%endmacro
%ifdef ARCH_AMD64
-DECLARE_GPR 0, rdi, edi, dil
-DECLARE_GPR 1, rsi, esi, sil
-DECLARE_GPR 2, rdx, edx, dl
-DECLARE_GPR 3, rcx, ecx, cl
-DECLARE_GPR 4, r8, r8d, r8b
-DECLARE_GPR 5, r9, r9d, r9b
-DECLARE_GPR 6, r10, r10d, r10b
-DECLARE_GPR 7, r11, r11d, r11b
+DECLARE_GPR 0, rdi, edi, di, dil
+DECLARE_GPR 1, rsi, esi, si, sil
+DECLARE_GPR 2, rdx, edx, dx, dl
+DECLARE_GPR 3, rcx, ecx, cx, cl
+DECLARE_GPR 4, r8, r8d, r8w, r8b
+DECLARE_GPR 5, r9, r9d, r9w, r9b
+DECLARE_GPR 6, r10, r10d, r10w, r10b
+DECLARE_GPR 7, r11, r11d, r11w, r11b
%else
-DECLARE_GPR 0, rcx, ecx, cl
-DECLARE_GPR 1, rdx, edx, dl
-DECLARE_GPR 2, r8, r8d, r8b
-DECLARE_GPR 3, r9, r9d, r9b
-DECLARE_GPR 4, r10, r10d, r10b
-DECLARE_GPR 5, r11, r11d, r11b
-DECLARE_GPR 6, rdi, edi, dil
-DECLARE_GPR 7, rsi, esi, sil
+DECLARE_GPR 0, rcx, ecx, cx, cl
+DECLARE_GPR 1, rdx, edx, dx, dl
+DECLARE_GPR 2, r8, r8d, r8w, r8b
+DECLARE_GPR 3, r9, r9d, r9w, r9b
+DECLARE_GPR 4, r10, r10d, r10w, r10b
+DECLARE_GPR 5, r11, r11d, r11w, r11b
+DECLARE_GPR 6, rdi, edi, di, dil
+DECLARE_GPR 7, rsi, esi, si, sil
%endif
-DECLARE_GPR 8, rbx, ebx, bl
-DECLARE_GPR 9, rbp, ebp, bpl
-DECLARE_GPR 10, r12, r12d, r12b
-DECLARE_GPR 11, r13, r13d, r13b
-DECLARE_GPR 12, r14, r14d, r14b
-DECLARE_GPR 13, r15, r15d, r15b
-DECLARE_GPR a, rax, eax, al
-DECLARE_GPR s, rsp, esp, spl
+DECLARE_GPR 8, rbx, ebx, bx, bl
+DECLARE_GPR 9, rbp, ebp, bp, bpl
+DECLARE_GPR 10, r12, r12d, r12w, r12b
+DECLARE_GPR 11, r13, r13d, r13w, r13b
+DECLARE_GPR 12, r14, r14d, r14w, r14b
+DECLARE_GPR 13, r15, r15d, r15w, r15b
+DECLARE_GPR a, rax, eax, ax, al
+DECLARE_GPR s, rsp, esp, sp, spl
; Declare the vector registers.
%assign _count 0
@@ -245,6 +246,17 @@ DECLARE_GPR s, rsp, esp, spl
; Define the function prologue.
%macro PROLOGUE 0-*
+ ; Clean up the previous invocation.
+ %undef stack_off
+ %undef _ia
+ %undef _at
+ %undef _fa
+ %undef _ti
+ %undef _tv
+ %undef _ym
+ %undef _x64_xmm_save_size
+
+
; Extract the parameters as _key tokens.
; Number of arguments that matched.
@@ -391,15 +403,18 @@ DECLARE_GPR s, rsp, esp, spl
%endmacro
-; Define the function epilog.
+; Define the function epilog. A function may have multiple return statements,
+; so the epilog can be present multiple times.
%macro EPILOG 0
+ ; Back-up stack_off.
+ %assign stack_off_bak stack_off
+
; Clear the high YMM registers.
%if _ym
vzeroupper
%endif
-
; Restore the XMM registers.
%ifdef ARCH_X64
%assign _reg_count _fa + _tv - 6
@@ -428,19 +443,13 @@ DECLARE_GPR s, rsp, esp, spl
ASSERT(stack_off == 0)
+ ; Restore stack_off.
+ %assign stack_off stack_off_bak
; Clean up.
- %undef stack_off
- %undef _ia
- %undef _at
- %undef _fa
- %undef _ti
- %undef _tv
- %undef _ym
%undef _iter
%undef _reg_idx
%undef _reg_count
- %undef _x64_xmm_save_size
%endmacro
diff --git a/f265/enc.h b/f265/enc.h
index d1334ca..98c818a 100644
--- a/f265/enc.h
+++ b/f265/enc.h
@@ -1166,14 +1166,14 @@ struct f265_tb_enc
// Pointers in the transform tree encoding data: current transform node,
// transform block, transform subblock and transform coefficient level.
-typedef struct f265_tt_enc
+// The typedef is in asm.h, do not redefine it.
+struct f265_tt_enc
{
uint8_t *tn;
f265_tb_enc *tb;
f265_sb_enc *sb;
int16_t *levels;
-
-} f265_tt_enc;
+};
// Parameters for fenc_rec_block(). This is a stub.
typedef struct f265_rec_params
@@ -1875,8 +1875,8 @@ struct f265_enc_thread
f265_tb_enc tb[3*256];
// Subblock data in encoding order. A subblock only uses an entry if it is
- // non-zero.
- f265_sb_enc sb[3*256];
+ // non-zero. One extra entry used for unification.
+ f265_sb_enc sb[3*256+1];
// Subblock coefficient levels in encoding order. A subblock only uses an
// entry if it has remaining levels that must be encoded.
@@ -2766,7 +2766,7 @@ int fenc_quant_c(int16_t *dst, int16_t *src, int bs, int mult, int add, int shif
void fenc_dequant_c(int16_t *dst, int16_t *src, int bs, int mult, int add, int shift);
void fenc_init_transform_tree(f265_enc_thread *t);
void fenc_get_sb_flags_c(f265_tb_enc *tb, int16_t *qc);
-void fenc_preprocess_tb(f265_tt_enc *tt, int16_t *qc);
+void fenc_preprocess_tb_c(f265_tt_enc *tt, int16_t *qc, uint8_t *spill);
int fenc_rec_block(f265_rec_params *rp);
int fenc_rec_tb(f265_enc_thread *t, f265_pix *pred, int pred_stride, int comp, int lg_bs, int dst_flag, int order,
int zero_flag, int ct_ox, int ct_oy, int depth, int intra_flag, int final_enc_flag);
diff --git a/f265/hm.c b/f265/hm.c
index 17013c4..0dd5516 100644
--- a/f265/hm.c
+++ b/f265/hm.c
@@ -676,148 +676,3 @@ void fenc_hm_set_intra(f265_enc_thread *t)
}
#endif
-// Future implementation notes.
-
-// Assembly for intra angular:
-// - Consider sharing functions between block sizes.
-// - Consider doing a function that will do -135, -90, -45, 0, 45 degrees in one
-// shot (fast estimation). May also do planar + DC. Handle or ignore filtering
-// issues.
-// - Consider computing the SAD instead of storing rows (fast estimation).
-// - Consider using pshufb based on the inverse angle to project neighbours.
-// - If horizontal, translate the horizontal case to the vertical case (flip
-// the neighbours).
-// - Call the appropriate dispatch function for the projection angle (3 or 4
-// cases).
-// - If the projection is vertical:
-// - Broadcast the pixels.
-// - When filtering:
-// - Compute column filtering in a register.
-// - Replace the first byte of every row using palignr and vpblendvb.
-// - Else if the projection is -45 or 45 degrees:
-// - Find a way to use palignr to avoid loads.
-// - Else:
-// - Go load heavy. Load neigbhours from cache for every row at the computed
-// offset to avoid branches.
-// - Pack neighbours as ABBCCDDE. Use pmaddubsw to multiply by the fractions
-// (32-iFact, iFact). Shift and pack two rows at a time.
-// - Assume the fractional case even when iFact==0 for a row to avoid
-// branches.
-// - If horizontal, flip using punpck.
-
-// Reconst:8 -> unfiltered/filtered:tmp@16,out@8 -> prediction:tmp@16,out@8 ->
-// (src-pred):tmp@16,out@16 -> DCT1D&clip:tmp@32,out@16 -> DCT1D&clip:tmp@32,out@16 ->
-// quantization:tmp@32,out@16 -> dequantization:same ->
-// DCT1D&clip:tmp@32,out@16 -> DCT1D&add&clip:tmp@32,out@8
-//
-// Assuming high bit depth:
-// Same, but:
-// - unfiltered/filtered/prediction: double "tmp" sizes.
-// - input/output: double size.
-
-// Coefficient encoding:
-//
-// One quantization function per transform block size.
-// Special case:
-// - May use a function to process 4 4x4 blocks together.
-// - May split between a function that does pure quant and a function that does
-// quant + nz_flags stuff.
-// Quantization function assembly:
-// - Quantize every row in raster scan.
-// - PACKSS columns together to get 1 byte per 4-coeff group.
-// - OR rows to get 1 byte per subblock.
-// - PCMPEQ to get 1 flag per subblock.
-// - PSHUFB/reorder to get the raster order and the encode order (nz_flags[0..1]).
-// - PMOVMSKB to get the flags in general purpose.
-// - Compute tb->nz_flags[2..3]:
-// - Use nz_flags[0] as base.
-// - Below: PDEP all but the bottom row.
-// - Right: PEXT to remove first column, PDEP to add zeroes to right column.
-//
-// One preprocessing function for all block sizes.
-// Preprocessing function assembly:
-// - Get the coefficient stride from lg_bs.
-// - Set up vpgatherqq index register.
-// - Set up function pointer to reorder the coefficients.
-// - BSF over the non-zero subblocks:
-// - first_coeff_pos = coeff_off[pos].
-// - Load coefficients using vpgatherqq (clone the index register first).
-// - Call func to reorder the 16 coefficients in YMM in encode order:
-// - PSHUFB to reorder each lane.
-// - Swap lanes in new register.
-// - PALIGNR to the merge the high-low lanes together.
-// - PSHUFB to reorder each lane.
-// - (Copy coeffs so we don't lose their content as needed here).
-// - PACKSSWD to have one byte per coeff.
-// - PCMPGTB -1 to get mask of signs.
-// - PABSB to get absolute coeff values (bytes).
-// - PCMPGTB 0 to get non-zero coeff flags.
-// - PCMPGTB 1 to get greater-than-1 flags.
-// - PCMPGTB 2 to get greater-than-2 flags.
-// - PMOVMSKB to nz_flags.
-// - PMOVMSKB to signs.
-// - PMOVMSKB to gt1.
-// - PMOVMSKB to gt2.
-// - PEXT signs using nz_flags.
-// - PEXT gt1 using nz_flags.
-// - PEXT gt2 using nz_flags.
-// - AND gt1, 0xff (keep only first 8 bits).
-// - BLSI TMP, gt1 (extract first gt1 bit set, if any).
-// - AND gt2, TMP (extract first gt2 bit).
-// - SETNE gt2 (set gt2 bit value, 0 if it doesn't exist).
-// - POPCNT nb_nz, nz_flags.
-// (There are remaining levels if nb_gt1 + gt2 > 1 || nb_nz > 8).
-// (Alternatively, check if remain_flags is non-zero).
-// - POPCNT nb_gt1, gt1
-// - ADD nb_gt1, gt2
-// - CMP nb_gt1, 1
-// - SETGT gt1_overflow
-// - CMP nb_nz, 8
-// - SETGT nb_nz_overflow
-// - OR remain_flag, gt1_overflow, nb_nz_overflow
-// - Possibly branch to compute coefficient base levels below.
-// - Else, just store the absolute coeff in order:
-// - PABSW 16-bit coefficients.
-// - MOVDQA 16-bit coefficients in store location.
-// - remain_flag << 5 (update store location if the store was required).
-// - ADD store pointer, remain_flag.
-// - SB pointer += sizeof(SB).
-// - How to reverse the 16 signs:
-// - BSWAP.
-// - PDEP to insert 4 zeros per group of 4 bits.
-// - MOV to YMM.
-// - PHUFB.
-// - MOV to GPR.
-// - PEXT to extract the reordered bits.
-// - Shift right to align with bit 0.
-// - Other possibility:
-// - Broadcast in YMM.
-// - AND with bit identification mask.
-// - Mov high lane to low.
-// - PACKSSWB.
-// - PSHUFB.
-// - PCMPEQ
-// - PMOVMSKB.
-// - Other possibility:
-// - Process in group of four in C loop (map).
-// - How to put back the gt1 flags in place:
-// - PDEP using the coefficient non-zero flags.
-// - Can be done for gt2 directly.
-// // Branch to compute exactly the base levels, slow so don't do unless
-// // required. NOT VERY USEFUL, the base levels must be computed anyway for the
-// // rice param update.
-// - PEXT A, 0xff, nz_flags (remove 1 for the 8 first non-zero coefficients).
-// - VBROADCASTW A (move coefficient bits back to ymm).
-// - PAND A, [word_mask] (keep only the bit corresponding to the coeff).
-// - PCMPEQW A, 0 (-1 if the bit is not set).
-// - PCMPEQW A, 0 (-1 if the bit is set).
-// - PADDW coeffs, A (remove 1 for the 8 first non-zero coefficients).
-// - PEXT A, gt2, nz_flags (put back gt2 bit at its coeff location).
-// - (same instructions as above to remove 1 for the gt2 coefficient).
-// - PACKSSWD B, A to have one byte per coeff.
-// - PCMPEQB B, 0 to get mask of remaining coeffs (2x).
-// - PMOVMSKB B to remain_flags.
-// - PADDW A, -1 (remove 1 for each non-zero coefficient).
-// - MOVDQA A, [remain] (store remaining coefficient level).
-// - Update store pointer, etc.
-
diff --git a/f265/rec.c b/f265/rec.c
index 65f8f01..5d8fc91 100644
--- a/f265/rec.c
+++ b/f265/rec.c
@@ -674,7 +674,7 @@ void fenc_get_sb_flags_c(f265_tb_enc *tb, int16_t *qc)
// Preprocess the quantized coefficients in a non-empty transform block for
// encoding.
-void fenc_preprocess_tb(f265_tt_enc *tt, int16_t *qc)
+void fenc_preprocess_tb_c(f265_tt_enc *tt, int16_t *qc, uint8_t *spill)
{
f265_tb_enc *tb = tt->tb;
f265_sb_enc *sb = tt->sb;
@@ -839,9 +839,8 @@ int fenc_rec_block(f265_rec_params *rp)
// Get the subblock flags.
fenc_get_sb_flags[lg_bs-2](tb, quant);
- // Preprocess the coefficients. Should be done with one assembly
- // function.
- fenc_preprocess_tb(tt, quant);
+ // Preprocess the coefficients.
+ fenc_preprocess_tb(tt, quant, t->store);
}
return 1;
diff --git a/snippets/asm.py b/snippets/asm.py
index b6d34db..557dd51 100755
--- a/snippets/asm.py
+++ b/snippets/asm.py
@@ -164,6 +164,10 @@ def declare_all():
ret = "void", args="f265_tb_enc *tb, int16_t *qc",
indices=["4", "8", "16", "32"], avx2_lbd=1)
+ df("preprocess_tb", bd=1,
+ ret = "void", args="f265_tt_enc *tt, int16_t *qc, uint8_t *spill",
+ avx2_lbd=1)
+
df("fsad", bd=1,
ret = "int", args="f265_pix *src, int src_stride, f265_pix *ref, int ref_stride, int packed_dims",
indices=amp_indices,
@@ -261,6 +265,7 @@ def get_c_special_code():
def get_h_special_code():
s = ""
s += "typedef struct f265_tb_enc f265_tb_enc;\n"
+ s += "typedef struct f265_tt_enc f265_tt_enc;\n"
return s
# Generate the output text for the C/header file.
diff --git a/snippets/scan_map.c b/snippets/scan_map.c
index 9801574..7723262 100644
--- a/snippets/scan_map.c
+++ b/snippets/scan_map.c
@@ -168,6 +168,62 @@ void gen_pdep_map()
printf(" dq 0x%016llx\n", (unsigned long long int)pdep[order][lane]);
}
+// Generate the coefficient shuffle map used in assembly for coefficient
+// preprocessing.
+void gen_pp_map()
+{
+ // Pshufb pattern per order/selector for same/complement lane.
+ uint8_t pshufb[3][2][16];
+
+ for (int order = 0; order < 3; order++)
+ {
+ // Get the raster-to-encoding map.
+ uint8_t *enc_to_raster = fenc_scan_map_data + fenc_scan_map_idx[2][order];
+ uint8_t raster_to_enc[16];
+ for (int enc_pos = 0; enc_pos < 16; enc_pos++) raster_to_enc[enc_to_raster[enc_pos]] = enc_pos;
+
+ // Pass each source position.
+ for (int64_t src_pos = 0; src_pos < 16; src_pos++)
+ {
+ int64_t dst_pos = raster_to_enc[src_pos];
+ int src_lane = src_pos >= 8, dst_lane = dst_pos >= 8;
+
+ // The pattern selector is 0 if src_lane == dst_lane.
+ int select = src_lane != dst_lane;
+
+ // Effective source location in the pshufb pattern.
+ int pshufb_src = src_pos%8;
+
+ // Effective destination where the coefficient goes in the
+ // lanes.
+ int pshufb_dst0 = src_lane*8 + (dst_pos%8);
+ int pshufb_dst1 = !src_lane*8 + (dst_pos%8);
+
+ // Put the coefficient in the effective destination lane.
+ pshufb[order][select][pshufb_dst0] = pshufb_src;
+
+ // Put a hole in the complement lane.
+ pshufb[order][!select][pshufb_dst1] = 0xff;
+ }
+ }
+
+ printf("ASM preprocessor patterns:\n");
+ for (int order = 0; order < 3; order++)
+ for (int reg = 0; reg < 2; reg++)
+ for (int lane = 0; lane < 2; lane++)
+ {
+ printf(" dw ");
+ for (int i = 0; i < 8; i++)
+ {
+ int raw = pshufb[order][reg][lane*8 + i];
+ int val1 = (raw == 0xff) ? 0xff : raw*2;
+ int val2 = (raw == 0xff) ? 0xff : raw*2 + 1;
+ printf("0x%02x%02x%s", val2, val1, i == 7 ? "" : ",");
+ }
+ printf("\n");
+ }
+}
+
void gen_last_coeff_table()
{
int t[5*16];
@@ -204,6 +260,7 @@ int main()
{
gen_scan_map();
gen_pdep_map();
+ gen_pp_map();
gen_last_coeff_table();
return 0;
}