diff --git a/f265/asm.c b/f265/asm.c
index ffee220..5c7ed57 100644
--- a/f265/asm.c
+++ b/f265/asm.c
@@ -55,6 +55,10 @@ void f265_lbd_get_sb_flags_16_avx2(f265_tb_enc *tb, int16_t *qc);
 void f265_lbd_get_sb_flags_32_avx2(f265_tb_enc *tb, int16_t *qc);
 void f265_hbd_get_sb_flags_c(f265_tb_enc *tb, int16_t *qc);
 
+void f265_lbd_preprocess_tb_c(f265_tt_enc *tt, int16_t *qc, uint8_t *spill);
+void f265_lbd_preprocess_tb_avx2(f265_tt_enc *tt, int16_t *qc, uint8_t *spill);
+void f265_hbd_preprocess_tb_c(f265_tt_enc *tt, int16_t *qc, uint8_t *spill);
+
 int f265_lbd_fsad_c(uint8_t *src, int src_stride, uint8_t *ref, int ref_stride, int packed_dims);
 int f265_lbd_fsad_4_avx2(uint8_t *src, int src_stride, uint8_t *ref, int ref_stride, int packed_dims);
 int f265_lbd_fsad_8_avx2(uint8_t *src, int src_stride, uint8_t *ref, int ref_stride, int packed_dims);
@@ -341,6 +345,7 @@ f265_lbd_idct_func f265_lbd_idct[5];
 f265_lbd_quant_func f265_lbd_quant[4];
 f265_lbd_dequant_func f265_lbd_dequant[4];
 f265_lbd_get_sb_flags_func f265_lbd_get_sb_flags[4];
+f265_lbd_preprocess_tb_func f265_lbd_preprocess_tb;
 f265_lbd_fsad_func f265_lbd_fsad[10];
 f265_lbd_sad3_func f265_lbd_sad3[10];
 f265_lbd_sad4_func f265_lbd_sad4[10];
@@ -352,6 +357,7 @@ f265_hbd_idct_func f265_hbd_idct[5];
 f265_hbd_quant_func f265_hbd_quant[4];
 f265_hbd_dequant_func f265_hbd_dequant[4];
 f265_hbd_get_sb_flags_func f265_hbd_get_sb_flags[4];
+f265_hbd_preprocess_tb_func f265_hbd_preprocess_tb;
 f265_hbd_fsad_func f265_hbd_fsad[10];
 f265_hbd_sad3_func f265_hbd_sad3[10];
 f265_hbd_sad4_func f265_hbd_sad4[10];
@@ -405,6 +411,8 @@ static void f265_link_asm(int avx2_flag)
     f265_hbd_get_sb_flags[1] = f265_hbd_get_sb_flags_c;
     f265_hbd_get_sb_flags[2] = f265_hbd_get_sb_flags_c;
     f265_hbd_get_sb_flags[3] = f265_hbd_get_sb_flags_c;
+    f265_lbd_preprocess_tb = f265_lbd_preprocess_tb_c;
+    f265_hbd_preprocess_tb = f265_hbd_preprocess_tb_c;
     f265_lbd_fsad[0] = f265_lbd_fsad_c;
     f265_lbd_fsad[1] = f265_lbd_fsad_c;
     f265_lbd_fsad[2] = f265_lbd_fsad_c;
@@ -547,6 +555,7 @@ static void f265_link_asm(int avx2_flag)
         f265_lbd_get_sb_flags[1] = f265_lbd_get_sb_flags_8_avx2;
         f265_lbd_get_sb_flags[2] = f265_lbd_get_sb_flags_16_avx2;
         f265_lbd_get_sb_flags[3] = f265_lbd_get_sb_flags_32_avx2;
+        f265_lbd_preprocess_tb = f265_lbd_preprocess_tb_avx2;
         f265_lbd_fsad[1] = f265_lbd_fsad_4_avx2;
         f265_lbd_fsad[2] = f265_lbd_fsad_8_avx2;
         f265_lbd_fsad[3] = f265_lbd_fsad_16_avx2;
diff --git a/f265/asm.h b/f265/asm.h
index 62bbbb4..d48addf 100644
--- a/f265/asm.h
+++ b/f265/asm.h
@@ -3,6 +3,7 @@
 
 // Special code.
 typedef struct f265_tb_enc f265_tb_enc;
+typedef struct f265_tt_enc f265_tt_enc;
 
 // Typedefs.
 typedef void(*f265_lbd_dct_func)(int16_t *dst, uint8_t *src, int src_stride, uint8_t *pred, int pred_stride, uint8_t *spill);
@@ -15,6 +16,8 @@ typedef void(*f265_lbd_dequant_func)(int16_t *dst, int16_t *src, int bs, int mul
 typedef void(*f265_hbd_dequant_func)(int16_t *dst, int16_t *src, int bs, int mult, int add, int shift);
 typedef void(*f265_lbd_get_sb_flags_func)(f265_tb_enc *tb, int16_t *qc);
 typedef void(*f265_hbd_get_sb_flags_func)(f265_tb_enc *tb, int16_t *qc);
+typedef void(*f265_lbd_preprocess_tb_func)(f265_tt_enc *tt, int16_t *qc, uint8_t *spill);
+typedef void(*f265_hbd_preprocess_tb_func)(f265_tt_enc *tt, int16_t *qc, uint8_t *spill);
 typedef int(*f265_lbd_fsad_func)(uint8_t *src, int src_stride, uint8_t *ref, int ref_stride, int packed_dims);
 typedef int(*f265_hbd_fsad_func)(int16_t *src, int src_stride, int16_t *ref, int ref_stride, int packed_dims);
 typedef void(*f265_lbd_sad3_func)(int *costs, uint8_t *src, int src_stride, uint8_t **refs, int ref_stride, int packed_dims);
@@ -58,6 +61,10 @@ extern f265_lbd_get_sb_flags_func f265_lbd_get_sb_flags[4];
 // Indices: 4, 8, 16, 32.
 extern f265_hbd_get_sb_flags_func f265_hbd_get_sb_flags[4];
 
+extern f265_lbd_preprocess_tb_func f265_lbd_preprocess_tb;
+
+extern f265_hbd_preprocess_tb_func f265_hbd_preprocess_tb;
+
 // Indices: 2, 4, 8, 16, 32, 64, 6, 12, 24, 48.
 extern f265_lbd_fsad_func f265_lbd_fsad[10];
 
diff --git a/f265/asm/avx2/encode.asm b/f265/asm/avx2/encode.asm
index 68e5026..d727371 100644
--- a/f265/asm/avx2/encode.asm
+++ b/f265/asm/avx2/encode.asm
@@ -5,11 +5,14 @@
 
 section .data
 
+extern f265_scan_map_data
+extern f265_scan_map_idx
+
 align 4
 pat_quant_dw_1:     dd 1
 pat_dequant_dw_1:   dw 1,1
 pat_sb_shuf_8:      db 3,1,2,0, 3,2,1,0, 3,1,2,0
-
+pat_b_127:          times 4 db 127
 
 align 8
 pat_sb_neigh_32:    dq 0x7f7f7f7f7f7f7f7f
@@ -39,6 +42,20 @@ pat_sb_shuf_32:     db 0x0f,0x07,0x0e,0x06,0x0d,0x05,0x0c,0x04,0x0b,0x03,0x0a,0x
                     dq 0x3030303030303030
                     dq 0x0c0c0c0c0c0c0c0c
                     dq 0x0303030303030303
+pat_pp_null_sb:     dq 0x1, 0x8, 0x8000, 0x8000000000000000
+pat_pp_reorder:     dw 0xffff,0xffff,0xffff,0x0f0e,0xffff,0xffff,0x0706,0x0d0c
+                    dw 0x0302,0x0908,0xffff,0xffff,0x0100,0xffff,0xffff,0xffff
+                    dw 0xffff,0xffff,0x0504,0x0b0a,0xffff,0x0302,0x0908,0x0100
+                    dw 0x0f0e,0x0706,0x0d0c,0xffff,0x0504,0x0b0a,0xffff,0xffff
+                    dw 0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff
+                    dw 0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff
+                    dw 0x0f0e,0x0d0c,0x0b0a,0x0908,0x0706,0x0504,0x0302,0x0100
+                    dw 0x0f0e,0x0d0c,0x0b0a,0x0908,0x0706,0x0504,0x0302,0x0100
+                    dw 0xffff,0xffff,0x0f0e,0x0706,0xffff,0xffff,0x0d0c,0x0504
+                    dw 0x0b0a,0x0302,0xffff,0xffff,0x0908,0x0100,0xffff,0xffff
+                    dw 0xffff,0xffff,0x0b0a,0x0302,0xffff,0xffff,0x0908,0x0100
+                    dw 0x0f0e,0x0706,0xffff,0xffff,0x0d0c,0x0504,0xffff,0xffff
+
 
 section .text
 ; ---------------------- QUANT/DEQUANT macros ---------------------
@@ -377,3 +394,172 @@ DEFFUN f265_lbd_get_sb_flags_32_avx2, ia=2, at=88, ti=3, tv=6, ym=1
     vpacksswb       y0, y1
     ret
 
+
+; void fenc_preprocess_tb(f265_tt_enc *tt, int16_t *qc, uint8_t *spill)
+; Input parameters:
+; - g0:     tt.
+; - g1:     qc.
+; - g2:     spill buffer.
+;
+; We use two passes to minimize latency. The first pass stores the reordered
+; coefficients. The second pass processes them.
+;
+; Register usage, pass 1.
+; - ga:     tmp.
+; - g0:     tt.
+; - g1:     qc.
+; - g2:     iteration counter preserved for pass 2.
+; - g3:     tmp.
+; - g4:     tt->nz_flags[0].
+; - g5:     offset map.
+; - g6:     stride.
+; - g7:     stride*3.
+; - g8:     iteration counter.
+; - g9:     null subblock flag.
+; - y0-1:   tmp.
+; - y2-3:   patterns.
+;
+; Register usage, pass 2.
+; - ga:     tmp.
+; - g0:     tt.
+; - g1:     tmp.
+; - g2:     iteration counter.
+; - g3:     nz.
+; - g4:     gt1.
+; - g5:     gt2.
+; - g6:     tt->sb.
+; - g7:     tt->levels.
+; - g8:     iteration end.
+; - y0-2:   tmp.
+; - y3-8:   patterns.
+;
+; TODO: add special case for 4x4, possibly from the function dispatcher to avoid a branch.
+DEFFUN f265_lbd_preprocess_tb_avx2, ia=3, at=888, ti=7, tv=9, ym=1
+
+    ; Initialize first pass.
+    mov             g3, [g0+8]              ; tt->tb.
+    mov             g4, [g3+8]              ; tb->nz_flags[0].
+    movzx           ga, byte [g3+0]         ; tb->lg_bs.
+    movzx           g3, byte [g3+1]         ; tb->order.
+    lea             g5, [pat_pp_null_sb]    ; Remember whether there is a null subblock.
+    xor             g9, g9
+    test            g4, [g5+8*ga-16]
+    setz            g9b
+    mov             g6, 2                   ; Stride.
+    shlx            g6, g6, ga
+    lea             g7, [3*g6]              ; 3*stride.
+    lea             g5, [f265_scan_map_idx] ; Coefficient offset map.
+    lea             ga, [3*ga-3*2]          ; 3*(lg_bs-2).
+    add             ga, g5
+    movzx           ga, byte [ga+g3]
+    lea             g5, [f265_scan_map_data + 256]
+    add             g5, ga
+    lea             ga, [pat_pp_reorder]    ; Reorder patterns.
+    shl             g3, 6
+    vmovdqu         y2, [ga+g3]
+    vmovdqu         y3, [ga+g3+32]
+    mov             g8, g2                  ; Iteration counter.
+
+    ; First pass.
+    tzcnt           ga, g4                  ; Position of the next subblock.
+    .loop_pass1:
+    movzx           g3, byte [g5+ga]        ; Subblock offset.
+    blsi            ga, g4                  ; Update the non-zero bitfield.
+    xor             g4, ga
+
+    lea             ga, [g1 + 8*g3]         ; Load the coefficients.
+    vmovdqu         y0, [ga]
+    vpunpcklqdq     y0, [ga + g6]
+    vmovdqu         y1, [ga + 2*g6 - 16]    ; Offset to avoid cross-lane merges.
+    vpunpcklqdq     y1, [ga + g7 - 16]
+    vpblendd        y0, y0, y1, 0xf0
+
+    vpshufb         y1, y0, y3              ; Reorder within lanes, leaving holes for missing values.
+    vpshufb         y0, y0, y2
+    vpermq          y1, y1, 0x4e
+    vpor            y0, y1
+
+    vpacksswb       y1, y0, y0              ; Pack to 8-bit.
+    vpermq          y1, y1, 8
+    vpabsw          y0, y0                  ; Make the 16-bit levels absolute.
+    vmovdqu         [g8], y0                ; Store the 16-bit levels and 8-bit levels aligned (64 bytes).
+    vmovdqu         [g8+32], y1
+    add             g8, 64
+
+    tzcnt           ga, g4                  ; Pass to the next subblock.
+    jnc             .loop_pass1
+
+    ; Initialize second pass.
+    mov             g6, [g0+16]             ; tt->sb.
+    mov             g7, [g0+24]             ; tt->levels.
+    vpbroadcastd    y8, [pat_b_127]         ; 127.
+    vmovdqu         y7, [pat_sb_shuf_16+16] ; Reverse order (borrowed from horizontal order).
+    vpcmpeqb        y3, y3                  ; -1.
+    vpxor           y4, y4                  ; 0.
+    vpabsb          y5, y3                  ; 1.
+    vpaddb          y6, y5, y5              ; 2.
+    add             qword [g0+8], 40        ; tb++.
+
+    ; Second pass.
+    .loop_pass2:
+    vmovdqu         y0, [g2]                ; 16-bit absolute levels.
+    vmovdqu         y1, [g2+32]             ; 8-bit signed levels.
+    vpabsb          y2, y1                  ; 8-bit absolute levels.
+    vpminub         y2, y8                  ; Convert -128 to 127 to avoid signed issues below.
+    add             g2, 64
+
+    vpshufb         y1, y7                  ; sign reverse.
+    vpcmpgtb        y1, y1, y3
+    vpmovmskb       ga, x1
+    not             gaw
+    vpcmpgtb        y1, y2, y4              ; nz.
+    vpmovmskb       g3, x1
+    vpshufb         y1, y7                  ; nz reverse.
+    vpmovmskb       g1, x1
+    vpcmpgtb        y1, y2, y5              ; gt1.
+    vpmovmskb       g4, x1
+    vpcmpgtb        y1, y2, y6              ; gt2.
+    vpmovmskb       g5, x1
+
+    pext            ga, ga, g1              ; Extract the signs.
+    mov             [g6+0], g3w             ; Store the non-zero flags and the signs.
+    mov             [g6+2], gaw
+
+    pext            g4, g4, g3              ; Extract the 8 gt1 flags.
+    and             g4, 0xff
+    mov             [g6+6], g4b             ; Store the gt1 flags.
+
+    pext            g5, g5, g3              ; Extract all gt2 flags.
+    blsi            ga, g4                  ; Extract the first gt1 flag set, or 0 if none.
+    and             g5, ga                  ; Extract the first gt2 flag, or 0 if none.
+    setnz           gab                     ; True if the gt2 flag is 1.
+    popcnt          g1, g3                  ; Count the number of non-zero flags.
+    shl             ga, 5                   ; Store packed_data.
+    or              ga, g1
+    mov             [g6+7], gab
+
+    blsi            ga, g4                  ; Extract the first gt1 flag set, or 0 if none.
+    xor             ga, g4                  ; Clear the gt2 position in the gt1 flags.
+    or              ga, g5                  ; Set the gt2 position in the gt1 flags if the gt2 flag is set.
+    or              ga, 0xff00              ; Set the bit of all coefficients that haven't been inferred.
+    pdep            ga, ga, g3
+    mov             [g6+4], gaw             ; Store the remaining flags.
+
+    vmovdqu         [g7], y0                ; Store the levels tentatively.
+    xor             g1, g1                  ; levels++ if there are uninferred coefficients.
+    test            ga, ga
+    setnz           g1b
+    shl             g1, 5
+    add             g7, g1
+
+    add             g6, 8                   ; Pass to the next subblock.
+    cmp             g2, g8
+    jnz             .loop_pass2
+
+    ; Finish.
+    mov             qword [g6], 0           ; Add the null subblock as needed.
+    lea             g6, [g6+8*g9]
+    mov             [g0+16], g6             ; tt->sb.
+    mov             [g0+24], g7             ; tt->levels.
+    RET
+
diff --git a/f265/asm/x86inc.asm b/f265/asm/x86inc.asm
index 4135a90..f8427a4 100644
--- a/f265/asm/x86inc.asm
+++ b/f265/asm/x86inc.asm
@@ -42,8 +42,8 @@
 ; The general-purpose registers (GPR) are named gN, where N is an integer
 ; argument position. For example, g0 and g1 are the registers that contain the
 ; first and the second integer argument of a function respectively. gN is
-; 64-bit, gNd is 32-bit, gNb is 8-bit. As a special case, ga is the rax register
-; and gs is the rsp register.
+; 64-bit, gNd is 32-bit, gNw is 16-bit, gNb is 8-bit. As a special case, ga is
+; the rax register and gs is the rsp register.
 ;
 ; The vector registers (VEC) are named xN and yN, where N is a floating point
 ; argument position. For example, x0 and x1 are the registers that contain the
@@ -106,40 +106,41 @@
 %endmacro
 
 ; Declare the general-purpose registers.
-%macro DECLARE_GPR 4        ; %1: register name, %2: "q" name, %3, "d" name, %4: "b" name.
+%macro DECLARE_GPR 5        ; %1: register name, %2: "q" name, %3: "d" name, %4: "w" name, %5: "b" name.
     %define g%1  %2
     %define g%1d %3
-    %define g%1b %4
+    %define g%1w %4
+    %define g%1b %5
 %endmacro
 
 %ifdef ARCH_AMD64
-DECLARE_GPR 0,  rdi, edi,  dil
-DECLARE_GPR 1,  rsi, esi,  sil
-DECLARE_GPR 2,  rdx, edx,  dl
-DECLARE_GPR 3,  rcx, ecx,  cl
-DECLARE_GPR 4,  r8,  r8d,  r8b
-DECLARE_GPR 5,  r9,  r9d,  r9b
-DECLARE_GPR 6,  r10, r10d, r10b
-DECLARE_GPR 7,  r11, r11d, r11b
+DECLARE_GPR 0,  rdi, edi,  di, dil
+DECLARE_GPR 1,  rsi, esi,  si, sil
+DECLARE_GPR 2,  rdx, edx,  dx, dl
+DECLARE_GPR 3,  rcx, ecx,  cx, cl
+DECLARE_GPR 4,  r8,  r8d,  r8w, r8b
+DECLARE_GPR 5,  r9,  r9d,  r9w, r9b
+DECLARE_GPR 6,  r10, r10d, r10w, r10b
+DECLARE_GPR 7,  r11, r11d, r11w, r11b
 %else
-DECLARE_GPR 0,  rcx, ecx,  cl
-DECLARE_GPR 1,  rdx, edx,  dl
-DECLARE_GPR 2,  r8,  r8d,  r8b
-DECLARE_GPR 3,  r9,  r9d,  r9b
-DECLARE_GPR 4,  r10, r10d, r10b
-DECLARE_GPR 5,  r11, r11d, r11b
-DECLARE_GPR 6,  rdi, edi,  dil
-DECLARE_GPR 7,  rsi, esi,  sil
+DECLARE_GPR 0,  rcx, ecx,  cx, cl
+DECLARE_GPR 1,  rdx, edx,  dx, dl
+DECLARE_GPR 2,  r8,  r8d,  r8w, r8b
+DECLARE_GPR 3,  r9,  r9d,  r9w, r9b
+DECLARE_GPR 4,  r10, r10d, r10w, r10b
+DECLARE_GPR 5,  r11, r11d, r11w, r11b
+DECLARE_GPR 6,  rdi, edi,  di, dil
+DECLARE_GPR 7,  rsi, esi,  si, sil
 %endif
 
-DECLARE_GPR 8,  rbx, ebx,  bl
-DECLARE_GPR 9,  rbp, ebp,  bpl
-DECLARE_GPR 10, r12, r12d, r12b
-DECLARE_GPR 11, r13, r13d, r13b
-DECLARE_GPR 12, r14, r14d, r14b
-DECLARE_GPR 13, r15, r15d, r15b
-DECLARE_GPR a,  rax, eax,  al
-DECLARE_GPR s,  rsp, esp,  spl
+DECLARE_GPR 8,  rbx, ebx,  bx, bl
+DECLARE_GPR 9,  rbp, ebp,  bp, bpl
+DECLARE_GPR 10, r12, r12d, r12w, r12b
+DECLARE_GPR 11, r13, r13d, r13w, r13b
+DECLARE_GPR 12, r14, r14d, r14w, r14b
+DECLARE_GPR 13, r15, r15d, r15w, r15b
+DECLARE_GPR a,  rax, eax,  ax, al
+DECLARE_GPR s,  rsp, esp,  sp, spl
 
 ; Declare the vector registers.
 %assign _count 0
@@ -245,6 +246,17 @@ DECLARE_GPR s,  rsp, esp,  spl
 ; Define the function prologue.
 %macro PROLOGUE 0-*
 
+    ; Clean up the previous invocation.
+    %undef stack_off
+    %undef _ia
+    %undef _at
+    %undef _fa
+    %undef _ti
+    %undef _tv
+    %undef _ym
+    %undef _x64_xmm_save_size
+
+
     ; Extract the parameters as _key tokens.
 
     ; Number of arguments that matched.
@@ -391,15 +403,18 @@ DECLARE_GPR s,  rsp, esp,  spl
 
 %endmacro
 
-; Define the function epilog.
+; Define the function epilog. A function may have multiple return statements,
+; so the epilog can be present multiple times.
 %macro EPILOG 0
 
+    ; Back-up stack_off.
+    %assign stack_off_bak stack_off
+
     ; Clear the high YMM registers.
     %if _ym
         vzeroupper
     %endif
 
-
     ; Restore the XMM registers.
     %ifdef ARCH_X64
     %assign _reg_count _fa + _tv - 6
@@ -428,19 +443,13 @@ DECLARE_GPR s,  rsp, esp,  spl
 
     ASSERT(stack_off == 0)
 
+    ; Restore stack_off.
+    %assign stack_off stack_off_bak
 
     ; Clean up.
-    %undef stack_off
-    %undef _ia
-    %undef _at
-    %undef _fa
-    %undef _ti
-    %undef _tv
-    %undef _ym
     %undef _iter
     %undef _reg_idx
     %undef _reg_count
-    %undef _x64_xmm_save_size
 
 %endmacro
 
diff --git a/f265/enc.h b/f265/enc.h
index d1334ca..98c818a 100644
--- a/f265/enc.h
+++ b/f265/enc.h
@@ -1166,14 +1166,14 @@ struct f265_tb_enc
 
 // Pointers in the transform tree encoding data: current transform node,
 // transform block, transform subblock and transform coefficient level.
-typedef struct f265_tt_enc
+// The typedef is in asm.h, do not redefine it.
+struct f265_tt_enc
 {
     uint8_t *tn;
     f265_tb_enc *tb;
     f265_sb_enc *sb;
     int16_t *levels;
-
-} f265_tt_enc;
+};
 
 // Parameters for fenc_rec_block(). This is a stub.
 typedef struct f265_rec_params
@@ -1875,8 +1875,8 @@ struct f265_enc_thread
     f265_tb_enc tb[3*256];
 
     // Subblock data in encoding order. A subblock only uses an entry if it is
-    // non-zero.
-    f265_sb_enc sb[3*256];
+    // non-zero. One extra entry is used for unification.
+    f265_sb_enc sb[3*256+1];
 
     // Subblock coefficient levels in encoding order. A subblock only uses an
     // entry if it has remaining levels that must be encoded.
@@ -2766,7 +2766,7 @@ int fenc_quant_c(int16_t *dst, int16_t *src, int bs, int mult, int add, int shif
 void fenc_dequant_c(int16_t *dst, int16_t *src, int bs, int mult, int add, int shift);
 void fenc_init_transform_tree(f265_enc_thread *t);
 void fenc_get_sb_flags_c(f265_tb_enc *tb, int16_t *qc);
-void fenc_preprocess_tb(f265_tt_enc *tt, int16_t *qc);
+void fenc_preprocess_tb_c(f265_tt_enc *tt, int16_t *qc, uint8_t *spill);
 int fenc_rec_block(f265_rec_params *rp);
 int fenc_rec_tb(f265_enc_thread *t, f265_pix *pred, int pred_stride, int comp, int lg_bs, int dst_flag, int order,
                 int zero_flag, int ct_ox, int ct_oy, int depth, int intra_flag, int final_enc_flag);
diff --git a/f265/hm.c b/f265/hm.c
index 17013c4..0dd5516 100644
--- a/f265/hm.c
+++ b/f265/hm.c
@@ -676,148 +676,3 @@ void fenc_hm_set_intra(f265_enc_thread *t)
 }
 #endif
 
-// Future implementation notes.
-
-// Assembly for intra angular:
-// - Consider sharing functions between block sizes.
-// - Consider doing a function that will do -135, -90, -45, 0, 45 degrees in one
-//   shot (fast estimation). May also do planar + DC. Handle or ignore filtering
-//   issues.
-// - Consider computing the SAD instead of storing rows (fast estimation).
-// - Consider using pshufb based on the inverse angle to project neighbours.
-// - If horizontal, translate the horizontal case to the vertical case (flip
-//                  the neighbours).
-// - Call the appropriate dispatch function for the projection angle (3 or 4
-//   cases).
-//   - If the projection is vertical:
-//     - Broadcast the pixels.
-//     - When filtering:
-//       - Compute column filtering in a register.
-//       - Replace the first byte of every row using palignr and vpblendvb.
-//   - Else if the projection is -45 or 45 degrees:
-//     - Find a way to use palignr to avoid loads.
-//   - Else:
-//     - Go load heavy. Load neigbhours from cache for every row at the computed
-//       offset to avoid branches.
-//     - Pack neighbours as ABBCCDDE. Use pmaddubsw to multiply by the fractions
-//       (32-iFact, iFact). Shift and pack two rows at a time.
-//     - Assume the fractional case even when iFact==0 for a row to avoid
-//       branches.
-// - If horizontal, flip using punpck.
-
-// Reconst:8 -> unfiltered/filtered:tmp@16,out@8 -> prediction:tmp@16,out@8 ->
-// (src-pred):tmp@16,out@16 -> DCT1D&clip:tmp@32,out@16 -> DCT1D&clip:tmp@32,out@16 ->
-// quantization:tmp@32,out@16 -> dequantization:same ->
-// DCT1D&clip:tmp@32,out@16 -> DCT1D&add&clip:tmp@32,out@8
-//
-// Assuming high bit depth:
-// Same, but:
-// - unfiltered/filtered/prediction: double "tmp" sizes.
-// - input/output: double size.
-
-// Coefficient encoding:
-//
-// One quantization function per transform block size.
-// Special case:
-// - May use a function to process 4 4x4 blocks together.
-// - May split between a function that does pure quant and a function that does
-//   quant + nz_flags stuff.
-// Quantization function assembly:
-// - Quantize every row in raster scan.
-// - PACKSS columns together to get 1 byte per 4-coeff group.
-// - OR rows to get 1 byte per subblock.
-// - PCMPEQ to get 1 flag per subblock.
-// - PSHUFB/reorder to get the raster order and the encode order (nz_flags[0..1]).
-// - PMOVMSKB to get the flags in general purpose.
-// - Compute tb->nz_flags[2..3]:
-//   - Use nz_flags[0] as base.
-//   - Below: PDEP all but the bottom row.
-//   - Right: PEXT to remove first column, PDEP to add zeroes to right column.
-//
-// One preprocessing function for all block sizes.
-// Preprocessing function assembly:
-// - Get the coefficient stride from lg_bs.
-// - Set up vpgatherqq index register.
-// - Set up function pointer to reorder the coefficients.
-// - BSF over the non-zero subblocks:
-//   - first_coeff_pos = coeff_off[pos].
-//   - Load coefficients using vpgatherqq (clone the index register first).
-//   - Call func to reorder the 16 coefficients in YMM in encode order:
-//     - PSHUFB to reorder each lane.
-//     - Swap lanes in new register.
-//     - PALIGNR to the merge the high-low lanes together.
-//     - PSHUFB to reorder each lane.
-//   - (Copy coeffs so we don't lose their content as needed here).
-//   - PACKSSWD to have one byte per coeff.
-//   - PCMPGTB -1 to get mask of signs.
-//   - PABSB to get absolute coeff values (bytes).
-//   - PCMPGTB 0 to get non-zero coeff flags.
-//   - PCMPGTB 1 to get greater-than-1 flags.
-//   - PCMPGTB 2 to get greater-than-2 flags.
-//   - PMOVMSKB to nz_flags.
-//   - PMOVMSKB to signs.
-//   - PMOVMSKB to gt1.
-//   - PMOVMSKB to gt2.
-//   - PEXT signs using nz_flags.
-//   - PEXT gt1 using nz_flags.
-//   - PEXT gt2 using nz_flags.
-//   - AND gt1, 0xff (keep only first 8 bits).
-//   - BLSI TMP, gt1 (extract first gt1 bit set, if any).
-//   - AND gt2, TMP (extract first gt2 bit).
-//   - SETNE gt2 (set gt2 bit value, 0 if it doesn't exist).
-//   - POPCNT nb_nz, nz_flags.
-//     (There are remaining levels if nb_gt1 + gt2 > 1 || nb_nz > 8).
-//     (Alternatively, check if remain_flags is non-zero).
-//   - POPCNT nb_gt1, gt1
-//   - ADD nb_gt1, gt2
-//   - CMP nb_gt1, 1
-//   - SETGT gt1_overflow
-//   - CMP nb_nz, 8
-//   - SETGT nb_nz_overflow
-//   - OR remain_flag, gt1_overflow, nb_nz_overflow
-//   - Possibly branch to compute coefficient base levels below.
-//   - Else, just store the absolute coeff in order:
-//     - PABSW 16-bit coefficients.
-//     - MOVDQA 16-bit coefficients in store location.
-//     - remain_flag << 5 (update store location if the store was required).
-//     - ADD store pointer, remain_flag.
-//   - SB pointer += sizeof(SB).
-//   - How to reverse the 16 signs:
-//     - BSWAP.
-//     - PDEP to insert 4 zeros per group of 4 bits.
-//     - MOV to YMM.
-//     - PHUFB.
-//     - MOV to GPR.
-//     - PEXT to extract the reordered bits.
-//     - Shift right to align with bit 0.
-//     - Other possibility:
-//       - Broadcast in YMM.
-//       - AND with bit identification mask.
-//       - Mov high lane to low.
-//       - PACKSSWB.
-//       - PSHUFB.
-//       - PCMPEQ
-//       - PMOVMSKB.
-//     - Other possibility:
-//       - Process in group of four in C loop (map).
-//    - How to put back the gt1 flags in place:
-//       - PDEP using the coefficient non-zero flags.
-//       - Can be done for gt2 directly.
-//   // Branch to compute exactly the base levels, slow so don't do unless
-//   // required. NOT VERY USEFUL, the base levels must be computed anyway for the
-//   // rice param update.
-//   - PEXT A, 0xff, nz_flags (remove 1 for the 8 first non-zero coefficients).
-//   - VBROADCASTW A       (move coefficient bits back to ymm).
-//   - PAND A, [word_mask] (keep only the bit corresponding to the coeff).
-//   - PCMPEQW A, 0        (-1 if the bit is not set).
-//   - PCMPEQW A, 0        (-1 if the bit is set).
-//   - PADDW coeffs, A     (remove 1 for the 8 first non-zero coefficients).
-//   - PEXT  A, gt2, nz_flags (put back gt2 bit at its coeff location).
-//   - (same instructions as above to remove 1 for the gt2 coefficient).
-//   - PACKSSWD B, A to have one byte per coeff.
-//   - PCMPEQB B, 0 to get mask of remaining coeffs (2x).
-//   - PMOVMSKB B to remain_flags.
-//   - PADDW A, -1                (remove 1 for each non-zero coefficient).
-//   - MOVDQA A, [remain]         (store remaining coefficient level).
-//   - Update store pointer, etc.
-
diff --git a/f265/rec.c b/f265/rec.c
index 65f8f01..5d8fc91 100644
--- a/f265/rec.c
+++ b/f265/rec.c
@@ -674,7 +674,7 @@ void fenc_get_sb_flags_c(f265_tb_enc *tb, int16_t *qc)
 
 // Preprocess the quantized coefficients in a non-empty transform block for
 // encoding.
-void fenc_preprocess_tb(f265_tt_enc *tt, int16_t *qc)
+void fenc_preprocess_tb_c(f265_tt_enc *tt, int16_t *qc, uint8_t *spill)
 {
     f265_tb_enc *tb = tt->tb;
     f265_sb_enc *sb = tt->sb;
@@ -839,9 +839,8 @@ int fenc_rec_block(f265_rec_params *rp)
         // Get the subblock flags.
         fenc_get_sb_flags[lg_bs-2](tb, quant);
 
-        // Preprocess the coefficients. Should be done with one assembly
-        // function.
-        fenc_preprocess_tb(tt, quant);
+        // Preprocess the coefficients.
+        fenc_preprocess_tb(tt, quant, t->store);
     }
 
     return 1;
diff --git a/snippets/asm.py b/snippets/asm.py
index b6d34db..557dd51 100755
--- a/snippets/asm.py
+++ b/snippets/asm.py
@@ -164,6 +164,10 @@ def declare_all():
        ret = "void", args="f265_tb_enc *tb, int16_t *qc",
        indices=["4", "8", "16", "32"], avx2_lbd=1)
 
+    df("preprocess_tb", bd=1,
+       ret = "void", args="f265_tt_enc *tt, int16_t *qc, uint8_t *spill",
+       avx2_lbd=1)
+
     df("fsad", bd=1,
        ret = "int", args="f265_pix *src, int src_stride, f265_pix *ref, int ref_stride, int packed_dims",
        indices=amp_indices,
@@ -261,6 +265,7 @@ def get_c_special_code():
 def get_h_special_code():
     s = ""
     s += "typedef struct f265_tb_enc f265_tb_enc;\n"
+    s += "typedef struct f265_tt_enc f265_tt_enc;\n"
     return s
 
 # Generate the output text for the C/header file.
diff --git a/snippets/scan_map.c b/snippets/scan_map.c
index 9801574..7723262 100644
--- a/snippets/scan_map.c
+++ b/snippets/scan_map.c
@@ -168,6 +168,62 @@ void gen_pdep_map()
             printf("                    dq 0x%016llx\n", (unsigned long long int)pdep[order][lane]);
 }
 
+// Generate the coefficient shuffle map used in assembly for coefficient
+// preprocessing.
+void gen_pp_map()
+{
+    // Pshufb pattern per order/selector for same/complement lane.
+    uint8_t pshufb[3][2][16];
+
+    for (int order = 0; order < 3; order++)
+    {
+        // Get the raster-to-encoding map.
+        uint8_t *enc_to_raster = fenc_scan_map_data + fenc_scan_map_idx[2][order];
+        uint8_t raster_to_enc[16];
+        for (int enc_pos = 0; enc_pos < 16; enc_pos++) raster_to_enc[enc_to_raster[enc_pos]] = enc_pos;
+
+        // Pass each source position.
+        for (int64_t src_pos = 0; src_pos < 16; src_pos++)
+        {
+            int64_t dst_pos = raster_to_enc[src_pos];
+            int src_lane = src_pos >= 8, dst_lane = dst_pos >= 8;
+
+            // The pattern selector is 0 if src_lane == dst_lane.
+            int select = src_lane != dst_lane;
+
+            // Effective source location in the pshufb pattern.
+            int pshufb_src = src_pos%8;
+
+            // Effective destination where the coefficient goes in the
+            // lanes.
+            int pshufb_dst0 = src_lane*8 + (dst_pos%8);
+            int pshufb_dst1 = !src_lane*8 + (dst_pos%8);
+
+            // Put the coefficient in the effective destination lane.
+            pshufb[order][select][pshufb_dst0] = pshufb_src;
+
+            // Put a hole in the complement lane.
+            pshufb[order][!select][pshufb_dst1] = 0xff;
+        }
+    }
+
+    printf("ASM preprocessor patterns:\n");
+    for (int order = 0; order < 3; order++)
+        for (int reg = 0; reg < 2; reg++)
+            for (int lane = 0; lane < 2; lane++)
+            {
+                printf("                    dw ");
+                for (int i = 0; i < 8; i++)
+                {
+                    int raw = pshufb[order][reg][lane*8 + i];
+                    int val1 = (raw == 0xff) ? 0xff : raw*2;
+                    int val2 = (raw == 0xff) ? 0xff : raw*2 + 1;
+                    printf("0x%02x%02x%s", val2, val1, i == 7 ? "" : ",");
+                }
+                printf("\n");
+            }
+}
+
 void gen_last_coeff_table()
 {
     int t[5*16];
@@ -204,6 +260,7 @@ int main()
 {
     gen_scan_map();
     gen_pdep_map();
+    gen_pp_map();
     gen_last_coeff_table();
     return 0;
 }

Reply via email to