Thanks,
Laurent
diff --git a/SConstruct b/SConstruct
index 5a593e8..aad54a3 100644
--- a/SConstruct
+++ b/SConstruct
@@ -30,6 +30,7 @@ vars.AddVariables(
                                       '-Wno-uninitialized'
                                       % ('' if mingw else '-fPIC'))),
     (BoolVariable('asm', 'Assembly support', 1)),
+    (BoolVariable('fastc', 'Make C code faster but non-bit-exact with assembly', 0)),
     (EnumVariable('bd', 'Bit depth targets', 'all',
                   allowed_values=('low', 'high', 'all'))),
     (EnumVariable('libav', 'Version of libavcodec/libavformat', 'new',
@@ -86,8 +87,8 @@ lbd_str = hbd_str = comment
 if f265_cfg['bd'] == 'all' or f265_cfg['bd'] == 'low': lbd_str = ""
 if f265_cfg['bd'] == 'all' or f265_cfg['bd'] == 'high': hbd_str = ""
 
-asm_str = comment
-if f265_cfg['asm']: asm_str = ""
+asm_str = "" if f265_cfg['asm'] else comment
+fast_c_str = "" if f265_cfg['fastc'] else comment
 
 no_str = old_str = new_str = comment
 if f265_cfg['libav'] == 'none': no_str = ""
@@ -116,6 +117,9 @@ s = ("#ifndef F265_CONFIG_H\n"
      "// Defined if assembly support is enabled.\n"
      "%s#define F265_HAVE_ASM\n"
      "\n"
+     "// Defined if non-bit-exact C code is enabled.\n"
+     "%s#define F265_FAST_C\n"
+     "\n"
      "// Defined if debugging is enabled.\n"
      "#define F265_DEBUG\n"
      "\n"
@@ -132,7 +136,7 @@ s = ("#ifndef F265_CONFIG_H\n"
      "// Version number.\n"
      "#define F265_VERSION \"%s\"\n"
      "\n"
-     "#endif") % (lbd_str, hbd_str, asm_str, perf_str, no_str, old_str, new_str, F265_VERSION)
+     "#endif") % (lbd_str, hbd_str, asm_str, fast_c_str, perf_str, no_str, old_str, new_str, F265_VERSION)
 if not os.path.isdir('build/f265'): Execute(Mkdir('build/f265'))
 write_file('build/f265/f265_config.h.tmp', s)
 f265_cfg.Command('build/f265/f265_config.h', 'build/f265/f265_config.h.tmp',
diff --git a/f265/analyze.c b/f265/analyze.c
index 2f40485..6df6789 100644
--- a/f265/analyze.c
+++ b/f265/analyze.c
@@ -342,10 +342,10 @@ static void fenc_get_inter_part_depth_range(f265_enc_thread *t, int inter_part,
 
 // Return the SSD of the block specified.
 static int64_t fenc_an_block_ssd(f265_enc_thread *t, f265_pix *src0, int src0_stride, f265_pix *src1, int src1_stride,
-                                 int bs, int comp)
+                                 int lg_bs, int comp)
 {
     // Compute the reconstruction distortion.
-    int64_t dist = fenc_ssd(src0, src0_stride, src1, src1_stride, bs, bs, 0, 8);
+    int64_t dist = fenc_fssd[lg_bs-2](src0, src0_stride, src1, src1_stride, 1<<lg_bs);
     #ifdef VAN_SCALE_UP_PRECISION
     dist <<= 5;
     #endif
@@ -407,7 +407,11 @@ static int64_t fenc_analyze_intra_tb(f265_enc_thread *t, f265_cb *cb, int *nz_fl
     {
         // Compute the prediction distortion.
         f265_pix *src = t->src_frame->src_planes[comp] + plane_off;
-        cost = t->me.dist[t->an.mode_metric](src, ref_stride, pred, bs, bs, bs, 8);
+
+        if (likely(!t->an.mode_metric))
+            cost = fenc_fsad[lg_bs-1](src, ref_stride, pred, bs, bs<<8|bs);
+        else
+            cost = t->me.dist[t->an.mode_metric](src, ref_stride, pred, bs, bs, bs, 8);
 
         // Avoid Valgrind errors. FIXME.
         *nz_flag = 1;
@@ -463,7 +467,7 @@ printf("sad=%d, threshold=%d\n", sad, threshold);
         // Compute the reconstruction distortion.
         f265_pix *src = t->src_frame->src_planes[comp] + plane_off;
         f265_pix *rec = t->src_frame->rec_planes[comp ? 3+comp : 0] + plane_off;
-        int64_t dist = fenc_an_block_ssd(t, rec, ref_stride, src, ref_stride, bs, comp);
+        int64_t dist = fenc_an_block_ssd(t, rec, ref_stride, src, ref_stride, lg_bs, comp);
 
         cost = coeff_cost + dist;
         #ifdef VAN_TRACE_ANALYSIS
@@ -1895,7 +1899,7 @@ static int64_t fenc_analyze_inter_tb(f265_enc_thread *t, f265_cb *cb, int *nz_fl
     }
 
     // Compute the reconstruction distortion.
-    int64_t dist = fenc_an_block_ssd(t, dist_rec, dist_rec_stride, src, ref_stride, bs, comp);
+    int64_t dist = fenc_an_block_ssd(t, dist_rec, dist_rec_stride, src, ref_stride, lg_bs, comp);
 
     int64_t cost = coeff_cost + dist;
     #ifdef VAN_TRACE_ANALYSIS
diff --git a/f265/asm.c b/f265/asm.c
index 5c7ed57..7edecc3 100644
--- a/f265/asm.c
+++ b/f265/asm.c
@@ -92,6 +92,14 @@ void f265_lbd_sad4_24_avx2(int *costs, uint8_t *src, int src_stride, uint8_t **r
 void f265_lbd_sad4_48_avx2(int *costs, uint8_t *src, int src_stride, uint8_t **refs, int ref_stride, int packed_dims);
 void f265_hbd_sad4_c(int *costs, int16_t *src, int src_stride, int16_t **refs, int ref_stride, int packed_dims);
 
+int f265_lbd_fssd_c(uint8_t *src0, int stride0, uint8_t *src1, int stride1, int width);
+int f265_lbd_fssd_4_avx2(uint8_t *src0, int stride0, uint8_t *src1, int stride1, int width);
+int f265_lbd_fssd_8_avx2(uint8_t *src0, int stride0, uint8_t *src1, int stride1, int width);
+int f265_lbd_fssd_16_avx2(uint8_t *src0, int stride0, uint8_t *src1, int stride1, int width);
+int f265_lbd_fssd_32_avx2(uint8_t *src0, int stride0, uint8_t *src1, int stride1, int width);
+int f265_lbd_fssd_64_avx2(uint8_t *src0, int stride0, uint8_t *src1, int stride1, int width);
+int f265_hbd_fssd_c(int16_t *src0, int stride0, int16_t *src1, int stride1, int width);
+
 void f265_lbd_avg_pix_c(uint8_t *dst, uint8_t *src0, int src0_stride, uint8_t *src1, int src1_stride, int packed_dims);
 void f265_lbd_avg_pix_4_avx2(uint8_t *dst, uint8_t *src0, int src0_stride, uint8_t *src1, int src1_stride, int packed_dims);
 void f265_lbd_avg_pix_8_avx2(uint8_t *dst, uint8_t *src0, int src0_stride, uint8_t *src1, int src1_stride, int packed_dims);
@@ -349,6 +357,7 @@ f265_lbd_preprocess_tb_func f265_lbd_preprocess_tb;
 f265_lbd_fsad_func f265_lbd_fsad[10];
 f265_lbd_sad3_func f265_lbd_sad3[10];
 f265_lbd_sad4_func f265_lbd_sad4[10];
+f265_lbd_fssd_func f265_lbd_fssd[5];
 f265_lbd_avg_pix_func f265_lbd_avg_pix[10];
 f265_lbd_interpol_luma_qpel_pix_func f265_lbd_interpol_luma_qpel_pix[30];
 
@@ -361,6 +370,7 @@ f265_hbd_preprocess_tb_func f265_hbd_preprocess_tb;
 f265_hbd_fsad_func f265_hbd_fsad[10];
 f265_hbd_sad3_func f265_hbd_sad3[10];
 f265_hbd_sad4_func f265_hbd_sad4[10];
+f265_hbd_fssd_func f265_hbd_fssd[5];
 f265_hbd_avg_pix_func f265_hbd_avg_pix[10];
 f265_hbd_interpol_luma_qpel_pix_func f265_hbd_interpol_luma_qpel_pix[30];
 
@@ -465,6 +475,16 @@ static void f265_link_asm(int avx2_flag)
     f265_hbd_sad4[7] = f265_hbd_sad4_c;
     f265_hbd_sad4[8] = f265_hbd_sad4_c;
     f265_hbd_sad4[9] = f265_hbd_sad4_c;
+    f265_lbd_fssd[0] = f265_lbd_fssd_c;
+    f265_lbd_fssd[1] = f265_lbd_fssd_c;
+    f265_lbd_fssd[2] = f265_lbd_fssd_c;
+    f265_lbd_fssd[3] = f265_lbd_fssd_c;
+    f265_lbd_fssd[4] = f265_lbd_fssd_c;
+    f265_hbd_fssd[0] = f265_hbd_fssd_c;
+    f265_hbd_fssd[1] = f265_hbd_fssd_c;
+    f265_hbd_fssd[2] = f265_hbd_fssd_c;
+    f265_hbd_fssd[3] = f265_hbd_fssd_c;
+    f265_hbd_fssd[4] = f265_hbd_fssd_c;
     f265_lbd_avg_pix[1] = f265_lbd_avg_pix_c;
     f265_lbd_avg_pix[2] = f265_lbd_avg_pix_c;
     f265_lbd_avg_pix[3] = f265_lbd_avg_pix_c;
@@ -580,6 +600,11 @@ static void f265_link_asm(int avx2_flag)
         f265_lbd_sad4[7] = f265_lbd_sad4_12_avx2;
         f265_lbd_sad4[8] = f265_lbd_sad4_24_avx2;
         f265_lbd_sad4[9] = f265_lbd_sad4_48_avx2;
+        f265_lbd_fssd[0] = f265_lbd_fssd_4_avx2;
+        f265_lbd_fssd[1] = f265_lbd_fssd_8_avx2;
+        f265_lbd_fssd[2] = f265_lbd_fssd_16_avx2;
+        f265_lbd_fssd[3] = f265_lbd_fssd_32_avx2;
+        f265_lbd_fssd[4] = f265_lbd_fssd_64_avx2;
         f265_lbd_avg_pix[1] = f265_lbd_avg_pix_4_avx2;
         f265_lbd_avg_pix[2] = f265_lbd_avg_pix_8_avx2;
         f265_lbd_avg_pix[3] = f265_lbd_avg_pix_16_avx2;
diff --git a/f265/asm.h b/f265/asm.h
index d48addf..6402d1f 100644
--- a/f265/asm.h
+++ b/f265/asm.h
@@ -24,6 +24,8 @@ typedef void(*f265_lbd_sad3_func)(int *costs, uint8_t *src, int src_stride, uint
 typedef void(*f265_hbd_sad3_func)(int *costs, int16_t *src, int src_stride, int16_t **refs, int ref_stride, int packed_dims);
 typedef void(*f265_lbd_sad4_func)(int *costs, uint8_t *src, int src_stride, uint8_t **refs, int ref_stride, int packed_dims);
 typedef void(*f265_hbd_sad4_func)(int *costs, int16_t *src, int src_stride, int16_t **refs, int ref_stride, int packed_dims);
+typedef int(*f265_lbd_fssd_func)(uint8_t *src0, int stride0, uint8_t *src1, int stride1, int width);
+typedef int(*f265_hbd_fssd_func)(int16_t *src0, int stride0, int16_t *src1, int stride1, int width);
 typedef void(*f265_lbd_avg_pix_func)(uint8_t *dst, uint8_t *src0, int src0_stride, uint8_t *src1, int src1_stride, int packed_dims);
 typedef void(*f265_hbd_avg_pix_func)(int16_t *dst, int16_t *src0, int src0_stride, int16_t *src1, int src1_stride, int packed_dims);
 typedef void(*f265_lbd_interpol_luma_qpel_pix_func)(uint8_t *dst, int dst_stride, uint8_t *src, int src_stride, int frac, int packed_dims, uint8_t *spill);
@@ -83,6 +85,12 @@ extern f265_lbd_sad4_func f265_lbd_sad4[10];
 // Indices: X, 4, 8, 16, 32, 64, X, 12, 24, 48.
 extern f265_hbd_sad4_func f265_hbd_sad4[10];
 
+// Indices: 4, 8, 16, 32, 64.
+extern f265_lbd_fssd_func f265_lbd_fssd[5];
+
+// Indices: 4, 8, 16, 32, 64.
+extern f265_hbd_fssd_func f265_hbd_fssd[5];
+
 // Indices: X, 4, 8, 16, 32, 64, X, 12, 24, 48.
 extern f265_lbd_avg_pix_func f265_lbd_avg_pix[10];
 
diff --git a/f265/asm/avx2/pixel.asm b/f265/asm/avx2/pixel.asm
index 4ade0af..58c9e76 100644
--- a/f265/asm/avx2/pixel.asm
+++ b/f265/asm/avx2/pixel.asm
@@ -8,6 +8,8 @@ section .data
 align 4
 pat_w_32:           dw 32,32                ; We duplicate the factors to broadcast them faster.
 pat_dw_2048:        dd 2048
+pat_b_127:          times 4 db 127
+pat_w_1:            dw 1, 1
 
 align 16
 pat_if_luma_8bit:   db -1,4,-1,4, -10,58,-10,58, 17,-5,17,-5, 1,0,1,0
@@ -392,6 +394,143 @@ DEFINE_SAD 64, 4
 %unmacro DEFINE_SAD 2
 
 
+; int fenc_fssd(f265_pix *src0, int stride0, f265_pix *src1, int stride1, int width)
+; Returns the sum of square differences of a square block, differences capped to 127.
+; - g0:     source 0.
+; - g1:     source 0 stride.
+; - g2:     source 1.
+; - g3:     source 1 stride.
+; - g4:     width (not read below; each function is specialized for one width).
+%macro DEFINE_SSD 1                         ; %1: width.
+
+DEFFUN f265_lbd_fssd_%1_avx2, ia=4, at=8484, ti=0, tv=6, ym=1
+
+    ; Initialize.
+    vpbroadcastd    y4, [pat_b_127]         ; Cap for the per-pixel absolute difference.
+    vpbroadcastd    y5, [pat_w_1]           ; Ones, used to add pairs of 16-bit values with vpmaddwd.
+    %if %1 >= 16
+    vpxor           y0, y0                  ; Accumulator (the width 4 and 8 paths write y0 directly).
+    %endif
+
+    %macro SSD_REG 6                        ; %1: acc, %2: src0, %3: src1, %4: tmp, %5: byte cap pat, %6: word ones pat.
+    vpsubusb        y%4, y%2, y%3           ; unsigned_saturate(A-B)|unsigned_saturate(B-A).
+    vpsubusb        y%2, y%3, y%2           ; One of the two results is zero, so the OR is abs(A-B).
+    vpor            y%2, y%4
+    vpminub         y%2, y%5                ; Cap the differences (avoid overflows in vpmaddubsw).
+    vpmaddubsw      y%2, y%2                ; Compute 16-bit SSDs (sum of two signed 14-bit values fit in 16-bit).
+    vpmaddwd        y%2, y%6                ; Add pairs of 16-bit SSDs into 32-bit SSDs.
+    %if %1 != %2
+    vpaddd          y%1, y%2                ; Accumulate (skipped when src0 is already the accumulator).
+    %endif
+    %endmacro
+
+    ; Width 4. NOTE(review): the 32-byte loads read beyond the block (and 4 bytes before row 2); presumably padded, confirm.
+    %if %1 == 4
+    vmovdqu         y0, [g0]                ; Source 0, row 0 (only the first 4 bytes end up being used).
+    vpunpckldq      y0, [g0+g1]             ; Dwords 0-1: source 0 rows 0, 1.
+    vmovdqu         y2, [g2]                ; Source 1, row 0.
+    vpunpckldq      y2, [g2+g3]             ; Dwords 0-1: source 1 rows 0, 1.
+    lea             g0, [g0+2*g1-4]         ; Rows 2-3, backed up 4 bytes so they unpack into dwords 2-3.
+    lea             g2, [g2+2*g3]           ; Rows 2-3, no offset: they unpack into dwords 0-1.
+    vmovdqu         y1, [g0]
+    vpunpckldq      y1, [g0+g1]             ; Dwords 2-3: source 0 rows 2, 3.
+    vmovdqu         y3, [g2]
+    vpunpckldq      y3, [g2+g3]             ; Dwords 0-1: source 1 rows 2, 3.
+    vpblendd        y0, y1, 0x0c            ; Source 0 rows 0-3 in the low 16 bytes.
+    vpunpcklqdq     y2, y3                  ; Source 1 rows 0-3 in the low 16 bytes.
+    SSD_REG         0, 0, 2, 1, 4, 5
+
+    ; Width 8. Two passes of four rows each; the same over-read remark as width 4 applies.
+    %elif %1 == 8
+    %macro ONE_PASS 3                       ; %1: src0, %2: src1, %3: tmp.
+    vmovdqu         y%1, [g0]
+    vpunpcklqdq     y%1, [g0+g1]            ; Low lane: source 0 rows 0, 1.
+    vmovdqu         y%2, [g2]
+    vpunpcklqdq     y%2, [g2+g3]            ; Low lane: source 1 rows 0, 1.
+    lea             g0, [g0+2*g1-16]        ; Rows 2-3, backed up 16 bytes so they load into the high lane.
+    lea             g2, [g2+2*g3-16]
+    vmovdqu         y%3, [g0]
+    vpunpcklqdq     y%3, [g0+g1]            ; High lane: source 0 rows 2, 3.
+    vpblendd        y%1, y%3, 0xf0          ; Source 0 rows 0-3.
+    vmovdqu         y%3, [g2]
+    vpunpcklqdq     y%3, [g2+g3]            ; High lane: source 1 rows 2, 3.
+    vpblendd        y%2, y%3, 0xf0          ; Source 1 rows 0-3.
+    SSD_REG         %1, %1, %2, %3, 4, 5
+    %endmacro
+    ONE_PASS        0, 1, 2
+    lea             g0, [g0+2*g1+16]        ; Advance to row 4 and undo the 16-byte offset.
+    lea             g2, [g2+2*g3+16]
+    ONE_PASS        1, 2, 3
+    vpaddd          y0, y1                  ; Add the SSDs of the two passes.
+    %unmacro ONE_PASS 3
+
+    ; Width 16. Two rows per iteration.
+    %elif %1 == 16
+    mov             ga, 8                   ; Loop counter.
+    .loop:
+    vmovdqu         y1, [g0]                ; Low lane: source 0, first row.
+    vpblendd        y1, [g0+g1-16], 0xf0    ; High lane: source 0, second row (the -16 offset aligns it with the high lane).
+    vmovdqu         y2, [g2]
+    vpblendd        y2, [g2+g3-16], 0xf0    ; Same layout for source 1.
+    lea             g0, [g0+2*g1]
+    lea             g2, [g2+2*g3]
+    SSD_REG         0, 1, 2, 3, 4, 5
+    sub             ga, 1
+    jnz             .loop
+
+    ; Width 32. Two rows per iteration, one register per row.
+    %elif %1 == 32
+    mov             ga, 16                  ; Loop counter.
+    .loop:
+    vmovdqu         y1, [g0]
+    vmovdqu         y2, [g2]
+    SSD_REG         0, 1, 2, 3, 4, 5
+    vmovdqu         y1, [g0+g1]
+    vmovdqu         y2, [g2+g3]
+    SSD_REG         0, 1, 2, 3, 4, 5
+    lea             g0, [g0+2*g1]
+    lea             g2, [g2+2*g3]
+    sub             ga, 1
+    jnz             .loop
+
+    ; Width 64. One row per iteration, two registers per row.
+    %elif %1 == 64
+    mov             ga, 64                  ; Loop counter.
+    .loop:
+    vmovdqu         y1, [g0]
+    vmovdqu         y2, [g2]
+    SSD_REG         0, 1, 2, 3, 4, 5
+    vmovdqu         y1, [g0+32]
+    vmovdqu         y2, [g2+32]
+    SSD_REG         0, 1, 2, 3, 4, 5
+    add             g0, g1
+    add             g2, g3
+    sub             ga, 1
+    jnz             .loop
+    %endif
+
+    ; Combine. Horizontal sum of the 32-bit SSDs held in y0.
+    vpshufd         y1, y0, 0xe             ; Add dwords 2-3 to dwords 0-1 within each lane.
+    vpaddd          y0, y1
+    vpshufd         y1, y0, 1               ; Add dword 1 to dword 0 within each lane.
+    vpaddd          y0, y1
+    %if %1 != 4
+    vpermq          y1, y0, 2               ; Add the high lane total (width 4 only fills the low lane).
+    vpaddd          y0, y1
+    %endif
+    vmovd           gad, x0                 ; Dword 0 holds the block total.
+    RET
+    %unmacro SSD_REG 6
+
+%endmacro
+DEFINE_SSD 4
+DEFINE_SSD 8
+DEFINE_SSD 16
+DEFINE_SSD 32
+DEFINE_SSD 64
+%unmacro DEFINE_SSD 1
+
+
 ; void fenc_avg_pix(f265_pix *dst, f265_pix *src0, int src0_stride, f265_pix *src1, int src1_stride, int packed_dims)
 ; Input parameters:
 ; - g0:     destination.
diff --git a/f265/pixel.c b/f265/pixel.c
index 9586434..78842eb 100644
--- a/f265/pixel.c
+++ b/f265/pixel.c
@@ -1100,6 +1100,28 @@ int32_t fenc_ssd(f265_pix *src0, int32_t stride0, f265_pix *src1, int32_t stride
     return ssd;
 }
 
+// Fast sum of square differences (SSD) over a square block of 'width' pixels.
+// Unless F265_FAST_C is defined, each pixel difference is capped to 127 so the
+// result stays bit-exact with the low bit depth assembly, which caps it to go
+// faster. Rename once the symbol above no longer conflicts.
+int fenc_fssd_c(f265_pix *src0, int stride0, f265_pix *src1, int stride1, int width)
+{
+    int sum = 0;
+    for (int row = 0; row < width; row++)
+    {
+        f265_pix *p0 = src0 + row*stride0, *p1 = src1 + row*stride1;
+        for (int col = 0; col < width; col++)
+        {
+            int diff = p0[col] - p1[col];
+            #ifndef F265_FAST_C
+            diff = F265_MIN(127, F265_ABS(diff));
+            #endif
+            sum += diff*diff;
+        }
+    }
+    return sum;
+}
+
 // Same as the SAD function, but compute the sum of square differences (SSD).
 int32_t fenc_ssd16(int16_t *src0, int32_t stride0, int16_t *src1, int32_t stride1,
                    int32_t width, int32_t height, int32_t subshift, int32_t bitdepth)
diff --git a/snippets/asm.py b/snippets/asm.py
index 557dd51..83a6d6d 100755
--- a/snippets/asm.py
+++ b/snippets/asm.py
@@ -183,6 +183,10 @@ def declare_all():
        indices=luma_amp_indices_x,
        avx2_lbd=1)
 
+    df("fssd", bd=1,
+       ret = "int", args="f265_pix *src0, int stride0, f265_pix *src1, int stride1, int width",
+       indices=["4", "8", "16", "32", "64"], avx2_lbd=1)
+
     df("avg_pix", bd=1,
        args="f265_pix *dst, f265_pix *src0, int src0_stride, f265_pix *src1, int src1_stride, int packed_dims",
        indices=luma_amp_indices_x,

Reply via email to