Patch for review: AVX2 assembly implementation of get_sb_flags().
Global speed-up: 8%.
Thanks,
Laurent
diff --git a/f265/asm.c b/f265/asm.c
index 16ee29f..ffee220 100644
--- a/f265/asm.c
+++ b/f265/asm.c
@@ -49,6 +49,10 @@ void f265_lbd_dequant_32_avx2(int16_t *dst, int16_t *src, int bs, int mult, int
void f265_hbd_dequant_c(int16_t *dst, int16_t *src, int bs, int mult, int add, int shift);
void f265_lbd_get_sb_flags_c(f265_tb_enc *tb, int16_t *qc);
+void f265_lbd_get_sb_flags_4_avx2(f265_tb_enc *tb, int16_t *qc);
+void f265_lbd_get_sb_flags_8_avx2(f265_tb_enc *tb, int16_t *qc);
+void f265_lbd_get_sb_flags_16_avx2(f265_tb_enc *tb, int16_t *qc);
+void f265_lbd_get_sb_flags_32_avx2(f265_tb_enc *tb, int16_t *qc);
void f265_hbd_get_sb_flags_c(f265_tb_enc *tb, int16_t *qc);
int f265_lbd_fsad_c(uint8_t *src, int src_stride, uint8_t *ref, int ref_stride, int packed_dims);
@@ -539,6 +543,10 @@ static void f265_link_asm(int avx2_flag)
f265_lbd_dequant[1] = f265_lbd_dequant_8_avx2;
f265_lbd_dequant[2] = f265_lbd_dequant_16_avx2;
f265_lbd_dequant[3] = f265_lbd_dequant_32_avx2;
+ f265_lbd_get_sb_flags[0] = f265_lbd_get_sb_flags_4_avx2;
+ f265_lbd_get_sb_flags[1] = f265_lbd_get_sb_flags_8_avx2;
+ f265_lbd_get_sb_flags[2] = f265_lbd_get_sb_flags_16_avx2;
+ f265_lbd_get_sb_flags[3] = f265_lbd_get_sb_flags_32_avx2;
f265_lbd_fsad[1] = f265_lbd_fsad_4_avx2;
f265_lbd_fsad[2] = f265_lbd_fsad_8_avx2;
f265_lbd_fsad[3] = f265_lbd_fsad_16_avx2;
diff --git a/f265/asm/avx2/encode.asm b/f265/asm/avx2/encode.asm
index c4733a9..0c13b25 100644
--- a/f265/asm/avx2/encode.asm
+++ b/f265/asm/avx2/encode.asm
@@ -4,9 +4,45 @@
%include "x86inc.asm"
section .data
+
align 4
pat_quant_dw_1: dd 1
pat_dequant_dw_1: dw 1,1
+; One 4-byte pshufb control per scan order (tb byte 1), reordering the four
+; raster-order subblock flag bytes of an 8x8 TB in encoding order.
+pat_sb_shuf_8: db 3,1,2,0, 3,2,1,0, 3,1,2,0
+
+
+align 8
+; Per-byte mask 0x7f: each byte holds one row of the 8x8 subblock grid of a
+; 32x32 TB; clearing bit 7 removes the non-existing right-neighbour column.
+pat_sb_neigh_32: dq 0x7f7f7f7f7f7f7f7f
+
+align 16
+; One 16-byte pshufb control per scan order for the 16x16 TB (4x4 subblocks).
+pat_sb_shuf_16: db 15, 11, 14, 7, 10, 13, 3, 6, 9, 12, 2, 5, 8, 1, 4, 0
+ db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+ db 15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0
+; Vpermd control interleaving the dwords of the two 128-bit halves after
+; vpacksswb (32x32 case).
+pat_sb_merge_32: dd 0, 4, 1, 5, 2, 6, 3, 7
+; 32x32 tables generated by gen_pdep_map() in snippets/scan_map.c:
+; 3 scan orders x 2 registers of per-lane pshufb controls (12 rows of 16
+; bytes), followed by 3 scan orders x 4 pdep deposit masks (one 64-bit mask
+; per 16-subblock lane).
+pat_sb_shuf_32: db 0x0f,0x07,0x0e,0x06,0x0d,0x05,0x0c,0x04,0x0b,0x03,0x0a,0x02,0x09,0x01,0x08,0x00 ; pshufb
+ db 0x0f,0x07,0x0e,0x06,0x0d,0x05,0x0c,0x04,0x0b,0x03,0x0a,0x02,0x09,0x01,0x08,0x00
+ db 0x0f,0x07,0x0e,0x06,0x0d,0x05,0x0c,0x04,0x0b,0x03,0x0a,0x02,0x09,0x01,0x08,0x00
+ db 0x0f,0x07,0x0e,0x06,0x0d,0x05,0x0c,0x04,0x0b,0x03,0x0a,0x02,0x09,0x01,0x08,0x00
+ db 0x0f,0x0e,0x0d,0x0c,0x0b,0x0a,0x09,0x08,0x07,0x06,0x05,0x04,0x03,0x02,0x01,0x00
+ db 0x0f,0x0e,0x0d,0x0c,0x0b,0x0a,0x09,0x08,0x07,0x06,0x05,0x04,0x03,0x02,0x01,0x00
+ db 0x0f,0x0e,0x0d,0x0c,0x0b,0x0a,0x09,0x08,0x07,0x06,0x05,0x04,0x03,0x02,0x01,0x00
+ db 0x0f,0x0e,0x0d,0x0c,0x0b,0x0a,0x09,0x08,0x07,0x06,0x05,0x04,0x03,0x02,0x01,0x00
+ db 0x0f,0x07,0x0e,0x06,0x0d,0x05,0x0c,0x04,0x0b,0x03,0x0a,0x02,0x09,0x01,0x08,0x00
+ db 0x0f,0x07,0x0e,0x06,0x0d,0x05,0x0c,0x04,0x0b,0x03,0x0a,0x02,0x09,0x01,0x08,0x00
+ db 0x0f,0x07,0x0e,0x06,0x0d,0x05,0x0c,0x04,0x0b,0x03,0x0a,0x02,0x09,0x01,0x08,0x00
+ db 0x0f,0x07,0x0e,0x06,0x0d,0x05,0x0c,0x04,0x0b,0x03,0x0a,0x02,0x09,0x01,0x08,0x00
+ dq 0xecc6183030200000 ; pdep
+ dq 0x131860c0c0c18400
+ dq 0x00218303030618c8
+ dq 0x0000040c0c186337
+ dq 0xffff000000000000
+ dq 0x0000ffff00000000
+ dq 0x00000000ffff0000
+ dq 0x000000000000ffff
+ dq 0xc0c0c0c0c0c0c0c0
+ dq 0x3030303030303030
+ dq 0x0c0c0c0c0c0c0c0c
+ dq 0x0303030303030303
section .text
; ---------------------- QUANT/DEQUANT macros ---------------------
@@ -174,3 +210,174 @@ DEFFUN f265_lbd_dequant_8_avx2, ia=6, at=884444, ti=0, tv=6, ym=1
%unmacro CALC_LOOP_ITER 1
%unmacro MULTIPLY 3
+
+; void fenc_get_sb_flags(f265_tb_enc *tb, int16_t *qc)
+; Compute the non-zero subblock flags of a transform block from its quantized
+; coefficients and store them in the tb object:
+; - [tb+8]: subblock flags in encoding order.
+; - [tb+16]: per-subblock "right neighbour is non-zero" flags (raster order).
+; - [tb+24]: per-subblock "bottom neighbour is non-zero" flags (raster order).
+; Input parameters:
+; - g0: tb.
+; - g1: qc.
+DEFFUN f265_lbd_get_sb_flags_4_avx2, ia=2, at=88, ti=0, tv=0, ym=0
+
+ ; A 4x4 TB contains a single subblock with no neighbours. Assume there is
+ ; one non-zero subblock. NOTE(review): presumably the function is only
+ ; called when the TB has at least one non-zero coefficient -- confirm in
+ ; the caller.
+ mov qword [g0 + 8], 1
+ mov qword [g0 + 16], 0
+ mov qword [g0 + 24], 0
+ RET
+
+; Compute the subblock flags of an 8x8 TB (2x2 grid of 4x4 subblocks).
+DEFFUN f265_lbd_get_sb_flags_8_avx2, ia=2, at=88, ti=1, tv=2, ym=1
+
+ ; Get one byte per subblock.
+ vmovdqu y0, [g1+0] ; OR rows together (0-2, 1-3, 4-6, 5-7).
+ vpor y0, [g1+32]
+ vmovdqu y1, [g1+64]
+ vpor y1, [g1+96]
+ vpacksswb y0, y1 ; Collapse 8-byte subblock to 1-byte subblock.
+ vpacksswb y0, y0 ; (Signed saturation maps non-zero words to
+ vpacksswb y0, y0 ; non-zero bytes, so non-zeroness is preserved.)
+ vextracti128 x1, y0, 1 ; OR with high lane.
+ vpor y0, y1
+ vpxor y1, y1 ; Load zero.
+ vpcmpeqb y0, y1 ; 0xff if subblock is non-zero.
+ vpcmpeqb y0, y1 ; (Comparing the 0/0xff mask with zero again inverts it.)
+
+ ; Shuffle the subblocks in encoding order.
+ movzx g1, byte [g0+1] ; Load order.
+ lea g2, [pat_sb_shuf_8] ; Shuffle.
+ vpshufb y1, y0, [g2 + 4*g1] ; 4-byte control per scan order.
+
+ ; Get the subblock flags.
+ vpmovmskb g1d, y0 ; Raster order.
+ and g1, 0xf ; Keep the 4 subblock bits.
+ vpmovmskb g2d, y1 ; Encoding order.
+ and g2, 0xf
+ mov [g0+8], g2 ; Store in encoding order.
+
+ ; Get the neighbour flags.
+ mov g2, g1 ; Preserve.
+ shr g1, 1 ; Move the columns left by 1.
+ and g1, 5 ; Remove the non-existing column.
+ mov [g0+16], g1 ; Store right.
+ shr g2, 2 ; Move the rows up by 1.
+ mov [g0+24], g2 ; Store bottom.
+ RET
+
+; Compute the subblock flags of a 16x16 TB (4x4 grid of 4x4 subblocks).
+DEFFUN f265_lbd_get_sb_flags_16_avx2, ia=2, at=88, ti=1, tv=3, ym=1
+
+ ; Get one byte per subblock.
+ %macro LOAD_ROW 2 ; %1: output, %2: true if g1 must be incremented.
+ vmovdqu %1, [g1+0] ; OR rows together.
+ vpor %1, [g1+32]
+ vpor %1, [g1+64]
+ vpor %1, [g1+96]
+ %if %2
+ sub g1, -128 ; Increment g1 to minimize overall code size.
+ %endif
+ %endmacro
+ LOAD_ROW y0, 1
+ LOAD_ROW y1, 1
+ vpacksswb y0, y1 ; Collapse.
+ LOAD_ROW y1, 1
+ LOAD_ROW y2, 0
+ vpacksswb y1, y2
+ vpacksswb y0, y1
+ vpacksswb y0, y0
+ vextracti128 x1, y0, 1 ; Unpack with high lane.
+ vpunpcklwd y0, y1 ; All 16 subblock bytes end up in the low lane.
+ vpxor y1, y1 ; Load zero.
+ vpcmpeqb y0, y1 ; 0xff if subblock is non-zero.
+ vpcmpeqb y0, y1 ; (Second compare-with-zero inverts the mask.)
+ %unmacro LOAD_ROW 2
+
+ ; Shuffle the subblocks in encoding order.
+ movzx g1, byte [g0+1] ; Load order.
+ shl g1, 4 ; 16-byte control per scan order.
+ lea g2, [pat_sb_shuf_16]
+ vpshufb y1, y0, [g2+g1]
+
+ ; Get the subblock flags.
+ vpmovmskb g1d, x0 ; Raster order.
+ vpmovmskb g2d, x1 ; Encoding order (x1 aliases the low lane of y1).
+ mov [g0+8], g2 ; Store in encoding order.
+
+ ; Get the neighbour flags.
+ mov g2, g1 ; Preserve.
+ shr g1, 1 ; Move the columns left by 1.
+ and g1, 0x7777 ; Remove the non-existing column.
+ mov [g0+16], g1 ; Store right.
+ shr g2, 4 ; Move the rows up by 1.
+ mov [g0+24], g2 ; Store bottom.
+ RET
+
+; Compute the subblock flags of a 32x32 TB (8x8 grid of 4x4 subblocks).
+DEFFUN f265_lbd_get_sb_flags_32_avx2, ia=2, at=88, ti=3, tv=6, ym=1
+
+ ; Get one byte per subblock. Final output in y2 and y3.
+ vmovdqu y5, [pat_sb_merge_32] ; Load row merge pattern.
+ %macro LOAD_REG 2 ; %1: output, %2: tmp.
+ call .load_sb_row ; Load and merge every row.
+ vmovdqu %1, y0 ; Preserve.
+ call .load_sb_row
+ vpacksswb %1, %1, y0 ; Pack and reorder.
+ vpermd %1, y5, %1 ; Interleave the dwords of the two halves.
+ call .load_sb_row
+ vmovdqu %2, y0
+ call .load_sb_row
+ vpacksswb %2, %2, y0
+ vpermd %2, y5, %2
+ vpacksswb %1, %1, %2
+ vpermq %1, %1, 0xd8
+ %endmacro
+ LOAD_REG y2, y3
+ LOAD_REG y3, y4
+ %unmacro LOAD_REG 2
+ vpxor y0, y0 ; 0xff if subblock is non-zero.
+ vpcmpeqb y2, y0 ; (Double compare-with-zero inverts the mask.)
+ vpcmpeqb y2, y0
+ vpcmpeqb y3, y0
+ vpcmpeqb y3, y0
+
+ ; Shuffle the subblocks in encoding order. The pshufb/pdep tables come
+ ; from gen_pdep_map() in snippets/scan_map.c.
+ movzx g1, byte [g0+1] ; Load order.
+ shl g1, 5
+ lea g3, [pat_sb_shuf_32] ; Pshufb base.
+ lea g2, [g3+g1+64*3] ; Pdep pointer (masks start 192 bytes in).
+ vpshufb y0, y2, [g3+2*g1] ; Reorder within each lane.
+ vpshufb y1, y3, [g3+2*g1+32]
+ vpmovmskb g1, y0 ; Reorder using the lane bitfields.
+ pdep g4, g1, [g2] ; Low 16-bit, first register.
+ shr g1, 16 ; High 16-bit.
+ pdep g3, g1, [g2+8]
+ or g4, g3
+ vpmovmskb g1, y1 ; Second register.
+ pdep g3, g1, [g2+16]
+ or g4, g3
+ shr g1, 16
+ pdep g3, g1, [g2+24]
+ or g4, g3
+ mov [g0+8], g4 ; Store in encoding order.
+
+ ; Get the neighbour flags.
+ vpmovmskb g1, y2 ; Combine into a single bitfield.
+ vpmovmskb g2, y3
+ shl g2, 32
+ or g1, g2
+ mov g2, g1 ; Preserve.
+ shr g1, 1 ; Move the columns left by 1.
+ and g1, [pat_sb_neigh_32] ; Remove the non-existing column.
+ mov [g0+16], g1 ; Store right.
+ shr g2, 8 ; Move the rows up by 1.
+ mov [g0+24], g2 ; Store bottom.
+ RET
+
+ ; Load one row of subblocks. Output register y0, tmp register y1.
+ .load_sb_row:
+ vmovdqu y0, [g1+0*32] ; OR the row's eight 32-byte chunks into y0/y1.
+ vmovdqu y1, [g1+1*32]
+ vpor y0, [g1+2*32]
+ vpor y1, [g1+3*32]
+ vpor y0, [g1+4*32]
+ vpor y1, [g1+5*32]
+ vpor y0, [g1+6*32]
+ vpor y1, [g1+7*32]
+ add g1, 8*32 ; Advance to the next subblock row.
+ vpacksswb y0, y1
+ ret
+
diff --git a/f265/enc.h b/f265/enc.h
index 73e34dc..73c11b4 100644
--- a/f265/enc.h
+++ b/f265/enc.h
@@ -2761,17 +2761,23 @@ int8_t fenc_rc_frame_start(f265_enc_thread *t, f265_frame *prev);
void fenc_rc_frame_end(f265_enc_thread *t, int32_t actual_bits, float avg_qp);
// rec.c
+void fenc_do_dct_1d(int16_t *dst, int16_t *src, int lg_bs, int dst_flag, int idx_map[2][2], int shift, int clip_flag);
+int fenc_quant_c(int16_t *dst, int16_t *src, int bs, int mult, int add, int shift);
+void fenc_dequant_c(int16_t *dst, int16_t *src, int bs, int mult, int add, int shift);
void fenc_init_transform_tree(f265_enc_thread *t);
+void fenc_get_sb_flags_c(f265_tb_enc *tb, int16_t *qc);
void fenc_preprocess_tb(f265_tt_enc *tt, int16_t *qc);
int fenc_rec_block(f265_rec_params *rp);
int fenc_rec_tb(f265_enc_thread *t, f265_pix *pred, int pred_stride, int comp, int lg_bs, int dst_flag, int order,
- int zero_flag, int ct_ox, int ct_oy, int depth, int intra_cb, int final);
-int fenc_rec_intra_tb(f265_enc_thread *t, int comp, int lg_bs, int mode, int zero_flag, int ct_ox, int ct_oy,
- int depth);
+ int zero_flag, int ct_ox, int ct_oy, int depth, int intra_flag, int final_enc_flag);
+int fenc_rec_intra_tb(f265_enc_thread *t, int comp, int lg_bs, int mode, int zero_flag, int ct_ox, int ct_oy, int depth);
int fenc_rec_intra_tt(f265_enc_thread *t, f265_cb *cb, int split_part_flag, int part_idx, int lg_bs,
int ct_ox, int ct_oy);
-void fenc_rec_inter_cb(f265_enc_thread *t, f265_cb *cb);
+int fenc_rec_inter_tb(f265_enc_thread *t, f265_pix *pred, int pred_stride, int comp, int lg_bs, int dst_flag,
+ int order, int zero_flag, int ct_ox, int ct_oy, int depth);
int fenc_rec_inter_tt(f265_enc_thread *t, f265_cb *cb, f265_pix pred[3][64*64], int lg_bs, int cb_ox, int cb_oy);
+void fenc_rec_inter_cb(f265_enc_thread *t, f265_cb *cb);
+void fenc_set_tt0_blueprint(f265_enc_thread *t, f265_cb *cb);
void fenc_rec_cb(f265_enc_thread *t, f265_cb *cb);
void fenc_rec_ctb(f265_enc_thread *t);
int fenc_do_rdoq(f265_rec_params *rp, int16_t *dst, int16_t *src);
diff --git a/snippets/asm.py b/snippets/asm.py
index 3ef8b82..b6d34db 100755
--- a/snippets/asm.py
+++ b/snippets/asm.py
@@ -162,7 +162,7 @@ def declare_all():
df("get_sb_flags", bd=1,
ret = "void", args="f265_tb_enc *tb, int16_t *qc",
- indices=["4", "8", "16", "32"], avx2_lbd=[])
+ indices=["4", "8", "16", "32"], avx2_lbd=1)
df("fsad", bd=1,
ret = "int", args="f265_pix *src, int src_stride, f265_pix *ref, int ref_stride, int packed_dims",
diff --git a/snippets/scan_map.c b/snippets/scan_map.c
index 8e3cc74..9801574 100644
--- a/snippets/scan_map.c
+++ b/snippets/scan_map.c
@@ -5,6 +5,7 @@
#include <stdint.h>
#include <string.h>
#include <ctype.h>
+#include <stdlib.h>
// Block count: 1 + 4 + 16 + 64 = 85.
// Size required: 85 * 3 = 255. Round to 256 for the coefficient map.
@@ -92,6 +93,81 @@ void gen_scan_map()
printf("\n};\n");
}
+// qsort() comparator ordering bytes ascending.
+int qsort_helper(const void *a, const void *b)
+{
+ int c = *(const uint8_t*)a; // Const-qualified casts: do not cast away const.
+ int d = *(const uint8_t*)b;
+ if (c < d) return -1;
+ if (c > d) return 1;
+ return 0;
+}
+
+// Generate the pshufb/parallel deposit map used in assembly for 32x32
+// coefficient scanning (the pat_sb_shuf_32 tables in avx2/encode.asm).
+void gen_pdep_map()
+{
+ // Pshufb pattern per order/lane. Fully written by the loops below (the 16
+ // destination positions of a lane are distinct), so no initializer needed.
+ uint8_t pshufb[3][4][16];
+
+ // Parallel deposit bitfield per order/lane.
+ uint64_t pdep[3][4];
+
+ for (int order = 0; order < 3; order++)
+ {
+ // Get the raster-to-encoding map.
+ uint8_t *enc_to_raster = fenc_scan_map_data + fenc_scan_map_idx[3][order];
+ uint8_t raster_to_enc[64];
+ for (int enc_pos = 0; enc_pos < 64; enc_pos++) raster_to_enc[enc_to_raster[enc_pos]] = enc_pos;
+
+ for (int lane = 0; lane < 4; lane++)
+ {
+ pdep[order][lane] = 0;
+
+ // Destination for each coefficient.
+ uint8_t *dst_array = raster_to_enc + lane*16;
+
+ // Sorted destination positions.
+ uint8_t lane_dst_pos[16];
+ memcpy(lane_dst_pos, dst_array, 16);
+ qsort(lane_dst_pos, 16, 1, qsort_helper);
+
+ // Pass each source position.
+ for (int64_t src_pos = 0; src_pos < 16; src_pos++)
+ {
+ int64_t dst_pos = dst_array[src_pos];
+
+ // Find the destination offset for pshufb. Since raster_to_enc is a
+ // permutation of 0..63, dst_pos is always present in lane_dst_pos and
+ // pshufb_off is always set.
+ int pshufb_off = -1;
+ for (int i = 0; i < 16; i++)
+ {
+ if (dst_pos == lane_dst_pos[i])
+ {
+ pshufb_off = i;
+ break;
+ }
+ }
+
+ // Update the patterns.
+ pshufb[order][lane][pshufb_off] = src_pos;
+ pdep[order][lane] += (1llu<<dst_pos);
+ }
+ }
+ }
+
+ printf("ASM 32x32 patterns:\n");
+ for (int order = 0; order < 3; order++)
+ for (int lane = 0; lane < 4; lane++)
+ {
+ printf(" db ");
+ for (int i = 0; i < 16; i++) printf("0x%02x%s", pshufb[order][lane][i], i == 15 ? "" : ",");
+ printf("\n");
+ }
+
+ for (int order = 0; order < 3; order++)
+ for (int lane = 0; lane < 4; lane++)
+ printf(" dq 0x%016llx\n", (unsigned long long int)pdep[order][lane]);
+}
+
void gen_last_coeff_table()
{
int t[5*16];
@@ -127,6 +203,7 @@ void gen_last_coeff_table()
int main()
{
gen_scan_map();
+ gen_pdep_map();
gen_last_coeff_table();
return 0;
}