Patch for review: AVX2 assembly implementation of get_sb_flags().
Overall encoder speed-up: 8%.

Thanks,
Laurent
diff --git a/f265/asm.c b/f265/asm.c
index 16ee29f..ffee220 100644
--- a/f265/asm.c
+++ b/f265/asm.c
@@ -49,6 +49,10 @@ void f265_lbd_dequant_32_avx2(int16_t *dst, int16_t *src, int bs, int mult, int
 void f265_hbd_dequant_c(int16_t *dst, int16_t *src, int bs, int mult, int add, int shift);
 
 void f265_lbd_get_sb_flags_c(f265_tb_enc *tb, int16_t *qc);
+void f265_lbd_get_sb_flags_4_avx2(f265_tb_enc *tb, int16_t *qc);
+void f265_lbd_get_sb_flags_8_avx2(f265_tb_enc *tb, int16_t *qc);
+void f265_lbd_get_sb_flags_16_avx2(f265_tb_enc *tb, int16_t *qc);
+void f265_lbd_get_sb_flags_32_avx2(f265_tb_enc *tb, int16_t *qc);
 void f265_hbd_get_sb_flags_c(f265_tb_enc *tb, int16_t *qc);
 
 int f265_lbd_fsad_c(uint8_t *src, int src_stride, uint8_t *ref, int ref_stride, int packed_dims);
@@ -539,6 +543,10 @@ static void f265_link_asm(int avx2_flag)
         f265_lbd_dequant[1] = f265_lbd_dequant_8_avx2;
         f265_lbd_dequant[2] = f265_lbd_dequant_16_avx2;
         f265_lbd_dequant[3] = f265_lbd_dequant_32_avx2;
+        f265_lbd_get_sb_flags[0] = f265_lbd_get_sb_flags_4_avx2;
+        f265_lbd_get_sb_flags[1] = f265_lbd_get_sb_flags_8_avx2;
+        f265_lbd_get_sb_flags[2] = f265_lbd_get_sb_flags_16_avx2;
+        f265_lbd_get_sb_flags[3] = f265_lbd_get_sb_flags_32_avx2;
         f265_lbd_fsad[1] = f265_lbd_fsad_4_avx2;
         f265_lbd_fsad[2] = f265_lbd_fsad_8_avx2;
         f265_lbd_fsad[3] = f265_lbd_fsad_16_avx2;
diff --git a/f265/asm/avx2/encode.asm b/f265/asm/avx2/encode.asm
index c4733a9..0c13b25 100644
--- a/f265/asm/avx2/encode.asm
+++ b/f265/asm/avx2/encode.asm
@@ -4,9 +4,45 @@
 %include "x86inc.asm"
 
 section .data
+
 align 4
 pat_quant_dw_1:     dd 1
 pat_dequant_dw_1:   dw 1,1
+pat_sb_shuf_8:      db 3,1,2,0, 3,2,1,0, 3,1,2,0
+
+
+align 8
+pat_sb_neigh_32:    dq 0x7f7f7f7f7f7f7f7f
+
+align 16
+pat_sb_shuf_16:     db 15, 11, 14, 7, 10, 13, 3, 6, 9, 12, 2, 5, 8, 1, 4, 0
+                    db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+                    db 15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0
+pat_sb_merge_32:    dd 0, 4, 1, 5, 2, 6, 3, 7
+pat_sb_shuf_32:     db 0x0f,0x07,0x0e,0x06,0x0d,0x05,0x0c,0x04,0x0b,0x03,0x0a,0x02,0x09,0x01,0x08,0x00 ; pshufb
+                    db 0x0f,0x07,0x0e,0x06,0x0d,0x05,0x0c,0x04,0x0b,0x03,0x0a,0x02,0x09,0x01,0x08,0x00
+                    db 0x0f,0x07,0x0e,0x06,0x0d,0x05,0x0c,0x04,0x0b,0x03,0x0a,0x02,0x09,0x01,0x08,0x00
+                    db 0x0f,0x07,0x0e,0x06,0x0d,0x05,0x0c,0x04,0x0b,0x03,0x0a,0x02,0x09,0x01,0x08,0x00
+                    db 0x0f,0x0e,0x0d,0x0c,0x0b,0x0a,0x09,0x08,0x07,0x06,0x05,0x04,0x03,0x02,0x01,0x00
+                    db 0x0f,0x0e,0x0d,0x0c,0x0b,0x0a,0x09,0x08,0x07,0x06,0x05,0x04,0x03,0x02,0x01,0x00
+                    db 0x0f,0x0e,0x0d,0x0c,0x0b,0x0a,0x09,0x08,0x07,0x06,0x05,0x04,0x03,0x02,0x01,0x00
+                    db 0x0f,0x0e,0x0d,0x0c,0x0b,0x0a,0x09,0x08,0x07,0x06,0x05,0x04,0x03,0x02,0x01,0x00
+                    db 0x0f,0x07,0x0e,0x06,0x0d,0x05,0x0c,0x04,0x0b,0x03,0x0a,0x02,0x09,0x01,0x08,0x00
+                    db 0x0f,0x07,0x0e,0x06,0x0d,0x05,0x0c,0x04,0x0b,0x03,0x0a,0x02,0x09,0x01,0x08,0x00
+                    db 0x0f,0x07,0x0e,0x06,0x0d,0x05,0x0c,0x04,0x0b,0x03,0x0a,0x02,0x09,0x01,0x08,0x00
+                    db 0x0f,0x07,0x0e,0x06,0x0d,0x05,0x0c,0x04,0x0b,0x03,0x0a,0x02,0x09,0x01,0x08,0x00
+                    dq 0xecc6183030200000 ; pdep
+                    dq 0x131860c0c0c18400
+                    dq 0x00218303030618c8
+                    dq 0x0000040c0c186337
+                    dq 0xffff000000000000
+                    dq 0x0000ffff00000000
+                    dq 0x00000000ffff0000
+                    dq 0x000000000000ffff
+                    dq 0xc0c0c0c0c0c0c0c0
+                    dq 0x3030303030303030
+                    dq 0x0c0c0c0c0c0c0c0c
+                    dq 0x0303030303030303
 
 section .text
 ; ---------------------- QUANT/DEQUANT macros ---------------------
@@ -174,3 +210,174 @@ DEFFUN f265_lbd_dequant_8_avx2, ia=6, at=884444, ti=0, tv=6, ym=1
 %unmacro CALC_LOOP_ITER 1
 %unmacro MULTIPLY 3
 
+
+; void fenc_get_sb_flags(f265_tb_enc *tb, int16_t *qc)
+; Input parameters:
+; - g0:     tb.
+; - g1:     qc.
+DEFFUN f265_lbd_get_sb_flags_4_avx2, ia=2, at=88, ti=0, tv=0, ym=0
+
+    ; Assume there is one non-zero subblock.
+    mov             qword [g0 + 8], 1
+    mov             qword [g0 + 16], 0
+    mov             qword [g0 + 24], 0
+    RET
+
+DEFFUN f265_lbd_get_sb_flags_8_avx2, ia=2, at=88, ti=1, tv=2, ym=1
+
+    ; Get one byte per subblock.
+    vmovdqu         y0, [g1+0]              ; OR rows together (0-2, 1-3, 4-6, 5-7).
+    vpor            y0, [g1+32]
+    vmovdqu         y1, [g1+64]
+    vpor            y1, [g1+96]
+    vpacksswb       y0, y1                  ; Collapse 8-byte subblock to 1-byte subblock.
+    vpacksswb       y0, y0
+    vpacksswb       y0, y0
+    vextracti128    x1, y0, 1               ; OR with high lane.
+    vpor            y0, y1
+    vpxor           y1, y1                  ; Load zero.
+    vpcmpeqb        y0, y1                  ; 0xff if subblock is non-zero.
+    vpcmpeqb        y0, y1
+
+    ; Shuffle the subblocks in encoding order.
+    movzx           g1, byte [g0+1]         ; Load order.
+    lea             g2, [pat_sb_shuf_8]     ; Shuffle.
+    vpshufb         y1, y0, [g2 + 4*g1]
+
+    ; Get the subblock flags.
+    vpmovmskb       g1d, y0                 ; Raster order.
+    and             g1, 0xf
+    vpmovmskb       g2d, y1                 ; Encoding order.
+    and             g2, 0xf
+    mov             [g0+8], g2              ; Store in encoding order.
+
+    ; Get the neighbour flags.
+    mov             g2, g1                  ; Preserve.
+    shr             g1, 1                   ; Move the columns left by 1.
+    and             g1, 5                   ; Remove the non-existing column.
+    mov             [g0+16], g1             ; Store right.
+    shr             g2, 2                   ; Move the rows up by 1.
+    mov             [g0+24], g2             ; Store bottom.
+    RET
+
+DEFFUN f265_lbd_get_sb_flags_16_avx2, ia=2, at=88, ti=1, tv=3, ym=1
+
+    ; Get one byte per subblock.
+    %macro LOAD_ROW 2                       ; %1: output, %2: true if g1 must be incremented.
+    vmovdqu         %1, [g1+0]              ; OR rows together.
+    vpor            %1, [g1+32]
+    vpor            %1, [g1+64]
+    vpor            %1, [g1+96]
+    %if %2
+    sub             g1, -128                ; Increment g1 to minimize overall code size.
+    %endif
+    %endmacro
+    LOAD_ROW        y0, 1
+    LOAD_ROW        y1, 1
+    vpacksswb       y0, y1                  ; Collapse.
+    LOAD_ROW        y1, 1
+    LOAD_ROW        y2, 0
+    vpacksswb       y1, y2
+    vpacksswb       y0, y1
+    vpacksswb       y0, y0
+    vextracti128    x1, y0, 1               ; Unpack with high lane.
+    vpunpcklwd      y0, y1
+    vpxor           y1, y1                  ; Load zero.
+    vpcmpeqb        y0, y1                  ; 0xff if subblock is non-zero.
+    vpcmpeqb        y0, y1
+    %unmacro LOAD_ROW 2
+
+    ; Shuffle the subblocks in encoding order.
+    movzx           g1, byte [g0+1]
+    shl             g1, 4
+    lea             g2, [pat_sb_shuf_16]
+    vpshufb         y1, y0, [g2+g1]
+
+    ; Get the subblock flags.
+    vpmovmskb       g1d, x0
+    vpmovmskb       g2d, x1
+    mov             [g0+8], g2
+
+    ; Get the neighbour flags.
+    mov             g2, g1
+    shr             g1, 1
+    and             g1, 0x7777
+    mov             [g0+16], g1
+    shr             g2, 4
+    mov             [g0+24], g2
+    RET
+
+DEFFUN f265_lbd_get_sb_flags_32_avx2, ia=2, at=88, ti=3, tv=6, ym=1
+
+    ; Get one byte per subblock. Final output in y2 and y3.
+    vmovdqu         y5, [pat_sb_merge_32]   ; Load row merge pattern.
+    %macro LOAD_REG 2                       ; %1: output, %2: tmp.
+    call .load_sb_row                       ; Load and merge every row.
+    vmovdqu         %1, y0                  ; Preserve.
+    call .load_sb_row
+    vpacksswb       %1, %1, y0              ; Pack and reorder.
+    vpermd          %1, y5, %1
+    call .load_sb_row
+    vmovdqu         %2, y0
+    call .load_sb_row
+    vpacksswb       %2, %2, y0
+    vpermd          %2, y5, %2
+    vpacksswb       %1, %1, %2
+    vpermq          %1, %1, 0xd8
+    %endmacro
+    LOAD_REG        y2, y3
+    LOAD_REG        y3, y4
+    %unmacro LOAD_REG 2
+    vpxor           y0, y0                  ; 0xff if subblock is non-zero.
+    vpcmpeqb        y2, y0
+    vpcmpeqb        y2, y0
+    vpcmpeqb        y3, y0
+    vpcmpeqb        y3, y0
+
+    ; Shuffle the subblocks in encoding order.
+    movzx           g1, byte [g0+1]
+    shl             g1, 5
+    lea             g3, [pat_sb_shuf_32]    ; Pshufb base.
+    lea             g2, [g3+g1+64*3]        ; Pdep pointer.
+    vpshufb         y0, y2, [g3+2*g1]       ; Reorder within each lane.
+    vpshufb         y1, y3, [g3+2*g1+32]
+    vpmovmskb       g1, y0                  ; Reorder using the lane bitfields.
+    pdep            g4, g1, [g2]            ; Low 16-bit, first register.
+    shr             g1, 16                  ; High 16-bit.
+    pdep            g3, g1, [g2+8]
+    or              g4, g3
+    vpmovmskb       g1, y1                  ; Second register.
+    pdep            g3, g1, [g2+16]
+    or              g4, g3
+    shr             g1, 16
+    pdep            g3, g1, [g2+24]
+    or              g4, g3
+    mov             [g0+8], g4              ; Store in encoding order.
+
+    ; Get the neighbour flags.
+    vpmovmskb       g1, y2                  ; Combine into a single bitfield.
+    vpmovmskb       g2, y3
+    shl             g2, 32
+    or              g1, g2
+    mov             g2, g1
+    shr             g1, 1
+    and             g1, [pat_sb_neigh_32]
+    mov             [g0+16], g1
+    shr             g2, 8
+    mov             [g0+24], g2
+    RET
+
+    ; Load one row of subblocks. Output register y0, tmp register y1.
+    .load_sb_row:
+    vmovdqu         y0, [g1+0*32]
+    vmovdqu         y1, [g1+1*32]
+    vpor            y0, [g1+2*32]
+    vpor            y1, [g1+3*32]
+    vpor            y0, [g1+4*32]
+    vpor            y1, [g1+5*32]
+    vpor            y0, [g1+6*32]
+    vpor            y1, [g1+7*32]
+    add             g1, 8*32
+    vpacksswb       y0, y1
+    ret
+
diff --git a/f265/enc.h b/f265/enc.h
index 73e34dc..73c11b4 100644
--- a/f265/enc.h
+++ b/f265/enc.h
@@ -2761,17 +2761,23 @@ int8_t fenc_rc_frame_start(f265_enc_thread *t, f265_frame *prev);
 void fenc_rc_frame_end(f265_enc_thread *t, int32_t actual_bits, float avg_qp);
 
 // rec.c
+void fenc_do_dct_1d(int16_t *dst, int16_t *src, int lg_bs, int dst_flag, int idx_map[2][2], int shift, int clip_flag);
+int fenc_quant_c(int16_t *dst, int16_t *src, int bs, int mult, int add, int shift);
+void fenc_dequant_c(int16_t *dst, int16_t *src, int bs, int mult, int add, int shift);
 void fenc_init_transform_tree(f265_enc_thread *t);
+void fenc_get_sb_flags_c(f265_tb_enc *tb, int16_t *qc);
 void fenc_preprocess_tb(f265_tt_enc *tt, int16_t *qc);
 int fenc_rec_block(f265_rec_params *rp);
 int fenc_rec_tb(f265_enc_thread *t, f265_pix *pred, int pred_stride, int comp, int lg_bs, int dst_flag, int order,
-                int zero_flag, int ct_ox, int ct_oy, int depth, int intra_cb, int final);
-int fenc_rec_intra_tb(f265_enc_thread *t, int comp, int lg_bs, int mode, int zero_flag, int ct_ox, int ct_oy,
-                      int depth);
+                int zero_flag, int ct_ox, int ct_oy, int depth, int intra_flag, int final_enc_flag);
+int fenc_rec_intra_tb(f265_enc_thread *t, int comp, int lg_bs, int mode, int zero_flag, int ct_ox, int ct_oy, int depth);
 int fenc_rec_intra_tt(f265_enc_thread *t, f265_cb *cb, int split_part_flag, int part_idx, int lg_bs,
                       int ct_ox, int ct_oy);
-void fenc_rec_inter_cb(f265_enc_thread *t, f265_cb *cb);
+int fenc_rec_inter_tb(f265_enc_thread *t, f265_pix *pred, int pred_stride, int comp, int lg_bs, int dst_flag,
+                      int order, int zero_flag, int ct_ox, int ct_oy, int depth);
 int fenc_rec_inter_tt(f265_enc_thread *t, f265_cb *cb, f265_pix pred[3][64*64], int lg_bs, int cb_ox, int cb_oy);
+void fenc_rec_inter_cb(f265_enc_thread *t, f265_cb *cb);
+void fenc_set_tt0_blueprint(f265_enc_thread *t, f265_cb *cb);
 void fenc_rec_cb(f265_enc_thread *t, f265_cb *cb);
 void fenc_rec_ctb(f265_enc_thread *t);
 int fenc_do_rdoq(f265_rec_params *rp, int16_t *dst, int16_t *src);
diff --git a/snippets/asm.py b/snippets/asm.py
index 3ef8b82..b6d34db 100755
--- a/snippets/asm.py
+++ b/snippets/asm.py
@@ -162,7 +162,7 @@ def declare_all():
 
     df("get_sb_flags", bd=1,
        ret = "void", args="f265_tb_enc *tb, int16_t *qc",
-       indices=["4", "8", "16", "32"], avx2_lbd=[])
+       indices=["4", "8", "16", "32"], avx2_lbd=1)
 
     df("fsad", bd=1,
        ret = "int", args="f265_pix *src, int src_stride, f265_pix *ref, int ref_stride, int packed_dims",
diff --git a/snippets/scan_map.c b/snippets/scan_map.c
index 8e3cc74..9801574 100644
--- a/snippets/scan_map.c
+++ b/snippets/scan_map.c
@@ -5,6 +5,7 @@
 #include <stdint.h>
 #include <string.h>
 #include <ctype.h>
+#include <stdlib.h>
 
 // Block count: 1 + 4 + 16 + 64 = 85.
 // Size required: 85 * 3 = 255. Round to 256 for the coefficient map.
@@ -92,6 +93,81 @@ void gen_scan_map()
     printf("\n};\n");
 }
 
+int qsort_helper(const void *a, const void *b)
+{
+    int c = *(uint8_t*)a;
+    int d = *(uint8_t*)b;
+    if (c < d) return -1;
+    if (c > d) return 1;
+    return 0;
+}
+
+// Generate the pshufb/parallel deposit map used in assembly for 32x32
+// coefficient scanning.
+void gen_pdep_map()
+{
+    // Pshufb pattern per order/lane.
+    uint8_t pshufb[3][4][16];
+
+    // Parallel deposit bitfield per order/lane.
+    uint64_t pdep[3][4];
+
+    for (int order = 0; order < 3; order++)
+    {
+        // Get the raster-to-encoding map.
+        uint8_t *enc_to_raster = fenc_scan_map_data + fenc_scan_map_idx[3][order];
+        uint8_t raster_to_enc[64];
+        for (int enc_pos = 0; enc_pos < 64; enc_pos++) raster_to_enc[enc_to_raster[enc_pos]] = enc_pos;
+
+        for (int lane = 0; lane < 4; lane++)
+        {
+            pdep[order][lane] = 0;
+
+            // Destination for each coefficient.
+            uint8_t *dst_array = raster_to_enc + lane*16;
+
+            // Sorted destination positions.
+            uint8_t lane_dst_pos[16];
+            memcpy(lane_dst_pos, dst_array, 16);
+            qsort(lane_dst_pos, 16, 1, qsort_helper);
+
+            // Pass each source position.
+            for (int64_t src_pos = 0; src_pos < 16; src_pos++)
+            {
+                int64_t dst_pos = dst_array[src_pos];
+
+                // Find the destination offset for pshufb.
+                int pshufb_off = -1;
+                for (int i = 0; i < 16; i++)
+                {
+                    if (dst_pos == lane_dst_pos[i])
+                    {
+                        pshufb_off = i;
+                        break;
+                    }
+                }
+
+                // Update the patterns.
+                pshufb[order][lane][pshufb_off] = src_pos;
+                pdep[order][lane] += (1llu<<dst_pos);
+            }
+        }
+    }
+
+    printf("ASM 32x32 patterns:\n");
+    for (int order = 0; order < 3; order++)
+        for (int lane = 0; lane < 4; lane++)
+        {
+            printf("                    db ");
+            for (int i = 0; i < 16; i++) printf("0x%02x%s", pshufb[order][lane][i], i == 15 ? "" : ",");
+            printf("\n");
+        }
+
+    for (int order = 0; order < 3; order++)
+        for (int lane = 0; lane < 4; lane++)
+            printf("                    dq 0x%016llx\n", (unsigned long long int)pdep[order][lane]);
+}
+
 void gen_last_coeff_table()
 {
     int t[5*16];
@@ -127,6 +203,7 @@ void gen_last_coeff_table()
 int main()
 {
     gen_scan_map();
+    gen_pdep_map();
     gen_last_coeff_table();
     return 0;
 }

Reply via email to