Thanks,
Laurent
diff --git a/SConstruct b/SConstruct
index 5a593e8..aad54a3 100644
--- a/SConstruct
+++ b/SConstruct
@@ -30,6 +30,7 @@ vars.AddVariables(
'-Wno-uninitialized'
% ('' if mingw else '-fPIC'))),
(BoolVariable('asm', 'Assembly support', 1)),
+ (BoolVariable('fastc', 'Make C code faster but non-bit-exact with assembly', 0)),
(EnumVariable('bd', 'Bit depth targets', 'all',
allowed_values=('low', 'high', 'all'))),
(EnumVariable('libav', 'Version of libavcodec/libavformat', 'new',
@@ -86,8 +87,8 @@ lbd_str = hbd_str = comment
if f265_cfg['bd'] == 'all' or f265_cfg['bd'] == 'low': lbd_str = ""
if f265_cfg['bd'] == 'all' or f265_cfg['bd'] == 'high': hbd_str = ""
-asm_str = comment
-if f265_cfg['asm']: asm_str = ""
+asm_str = "" if f265_cfg['asm'] else comment
+fast_c_str = "" if f265_cfg['fastc'] else comment
no_str = old_str = new_str = comment
if f265_cfg['libav'] == 'none': no_str = ""
@@ -116,6 +117,9 @@ s = ("#ifndef F265_CONFIG_H\n"
"// Defined if assembly support is enabled.\n"
"%s#define F265_HAVE_ASM\n"
"\n"
+ "// Defined if non-bit-exact C code is enabled.\n"
+ "%s#define F265_FAST_C\n"
+ "\n"
"// Defined if debugging is enabled.\n"
"#define F265_DEBUG\n"
"\n"
@@ -132,7 +136,7 @@ s = ("#ifndef F265_CONFIG_H\n"
"// Version number.\n"
"#define F265_VERSION \"%s\"\n"
"\n"
- "#endif") % (lbd_str, hbd_str, asm_str, perf_str, no_str, old_str, new_str, F265_VERSION)
+ "#endif") % (lbd_str, hbd_str, asm_str, fast_c_str, perf_str, no_str, old_str, new_str, F265_VERSION)
if not os.path.isdir('build/f265'): Execute(Mkdir('build/f265'))
write_file('build/f265/f265_config.h.tmp', s)
f265_cfg.Command('build/f265/f265_config.h', 'build/f265/f265_config.h.tmp',
diff --git a/f265/analyze.c b/f265/analyze.c
index 2f40485..6df6789 100644
--- a/f265/analyze.c
+++ b/f265/analyze.c
@@ -342,10 +342,10 @@ static void fenc_get_inter_part_depth_range(f265_enc_thread *t, int inter_part,
// Return the SSD of the block specified.
static int64_t fenc_an_block_ssd(f265_enc_thread *t, f265_pix *src0, int src0_stride, f265_pix *src1, int src1_stride,
- int bs, int comp)
+ int lg_bs, int comp)
{
// Compute the reconstruction distortion.
- int64_t dist = fenc_ssd(src0, src0_stride, src1, src1_stride, bs, bs, 0, 8);
+ int64_t dist = fenc_fssd[lg_bs-2](src0, src0_stride, src1, src1_stride, 1<<lg_bs);
#ifdef VAN_SCALE_UP_PRECISION
dist <<= 5;
#endif
@@ -407,7 +407,11 @@ static int64_t fenc_analyze_intra_tb(f265_enc_thread *t, f265_cb *cb, int *nz_fl
{
// Compute the prediction distortion.
f265_pix *src = t->src_frame->src_planes[comp] + plane_off;
- cost = t->me.dist[t->an.mode_metric](src, ref_stride, pred, bs, bs, bs, 8);
+
+ if (likely(!t->an.mode_metric))
+ cost = fenc_fsad[lg_bs-1](src, ref_stride, pred, bs, bs<<8|bs);
+ else
+ cost = t->me.dist[t->an.mode_metric](src, ref_stride, pred, bs, bs, bs, 8);
// Avoid Valgrind errors. FIXME.
*nz_flag = 1;
@@ -463,7 +467,7 @@ printf("sad=%d, threshold=%d\n", sad, threshold);
// Compute the reconstruction distortion.
f265_pix *src = t->src_frame->src_planes[comp] + plane_off;
f265_pix *rec = t->src_frame->rec_planes[comp ? 3+comp : 0] + plane_off;
- int64_t dist = fenc_an_block_ssd(t, rec, ref_stride, src, ref_stride, bs, comp);
+ int64_t dist = fenc_an_block_ssd(t, rec, ref_stride, src, ref_stride, lg_bs, comp);
cost = coeff_cost + dist;
#ifdef VAN_TRACE_ANALYSIS
@@ -1895,7 +1899,7 @@ static int64_t fenc_analyze_inter_tb(f265_enc_thread *t, f265_cb *cb, int *nz_fl
}
// Compute the reconstruction distortion.
- int64_t dist = fenc_an_block_ssd(t, dist_rec, dist_rec_stride, src, ref_stride, bs, comp);
+ int64_t dist = fenc_an_block_ssd(t, dist_rec, dist_rec_stride, src, ref_stride, lg_bs, comp);
int64_t cost = coeff_cost + dist;
#ifdef VAN_TRACE_ANALYSIS
diff --git a/f265/asm.c b/f265/asm.c
index 5c7ed57..7edecc3 100644
--- a/f265/asm.c
+++ b/f265/asm.c
@@ -92,6 +92,14 @@ void f265_lbd_sad4_24_avx2(int *costs, uint8_t *src, int src_stride, uint8_t **r
void f265_lbd_sad4_48_avx2(int *costs, uint8_t *src, int src_stride, uint8_t **refs, int ref_stride, int packed_dims);
void f265_hbd_sad4_c(int *costs, int16_t *src, int src_stride, int16_t **refs, int ref_stride, int packed_dims);
+int f265_lbd_fssd_c(uint8_t *src0, int stride0, uint8_t *src1, int stride1, int width);
+int f265_lbd_fssd_4_avx2(uint8_t *src0, int stride0, uint8_t *src1, int stride1, int width);
+int f265_lbd_fssd_8_avx2(uint8_t *src0, int stride0, uint8_t *src1, int stride1, int width);
+int f265_lbd_fssd_16_avx2(uint8_t *src0, int stride0, uint8_t *src1, int stride1, int width);
+int f265_lbd_fssd_32_avx2(uint8_t *src0, int stride0, uint8_t *src1, int stride1, int width);
+int f265_lbd_fssd_64_avx2(uint8_t *src0, int stride0, uint8_t *src1, int stride1, int width);
+int f265_hbd_fssd_c(int16_t *src0, int stride0, int16_t *src1, int stride1, int width);
+
void f265_lbd_avg_pix_c(uint8_t *dst, uint8_t *src0, int src0_stride, uint8_t *src1, int src1_stride, int packed_dims);
void f265_lbd_avg_pix_4_avx2(uint8_t *dst, uint8_t *src0, int src0_stride, uint8_t *src1, int src1_stride, int packed_dims);
void f265_lbd_avg_pix_8_avx2(uint8_t *dst, uint8_t *src0, int src0_stride, uint8_t *src1, int src1_stride, int packed_dims);
@@ -349,6 +357,7 @@ f265_lbd_preprocess_tb_func f265_lbd_preprocess_tb;
f265_lbd_fsad_func f265_lbd_fsad[10];
f265_lbd_sad3_func f265_lbd_sad3[10];
f265_lbd_sad4_func f265_lbd_sad4[10];
+f265_lbd_fssd_func f265_lbd_fssd[5];
f265_lbd_avg_pix_func f265_lbd_avg_pix[10];
f265_lbd_interpol_luma_qpel_pix_func f265_lbd_interpol_luma_qpel_pix[30];
@@ -361,6 +370,7 @@ f265_hbd_preprocess_tb_func f265_hbd_preprocess_tb;
f265_hbd_fsad_func f265_hbd_fsad[10];
f265_hbd_sad3_func f265_hbd_sad3[10];
f265_hbd_sad4_func f265_hbd_sad4[10];
+f265_hbd_fssd_func f265_hbd_fssd[5];
f265_hbd_avg_pix_func f265_hbd_avg_pix[10];
f265_hbd_interpol_luma_qpel_pix_func f265_hbd_interpol_luma_qpel_pix[30];
@@ -465,6 +475,16 @@ static void f265_link_asm(int avx2_flag)
f265_hbd_sad4[7] = f265_hbd_sad4_c;
f265_hbd_sad4[8] = f265_hbd_sad4_c;
f265_hbd_sad4[9] = f265_hbd_sad4_c;
+ f265_lbd_fssd[0] = f265_lbd_fssd_c;
+ f265_lbd_fssd[1] = f265_lbd_fssd_c;
+ f265_lbd_fssd[2] = f265_lbd_fssd_c;
+ f265_lbd_fssd[3] = f265_lbd_fssd_c;
+ f265_lbd_fssd[4] = f265_lbd_fssd_c;
+ f265_hbd_fssd[0] = f265_hbd_fssd_c;
+ f265_hbd_fssd[1] = f265_hbd_fssd_c;
+ f265_hbd_fssd[2] = f265_hbd_fssd_c;
+ f265_hbd_fssd[3] = f265_hbd_fssd_c;
+ f265_hbd_fssd[4] = f265_hbd_fssd_c;
f265_lbd_avg_pix[1] = f265_lbd_avg_pix_c;
f265_lbd_avg_pix[2] = f265_lbd_avg_pix_c;
f265_lbd_avg_pix[3] = f265_lbd_avg_pix_c;
@@ -580,6 +600,11 @@ static void f265_link_asm(int avx2_flag)
f265_lbd_sad4[7] = f265_lbd_sad4_12_avx2;
f265_lbd_sad4[8] = f265_lbd_sad4_24_avx2;
f265_lbd_sad4[9] = f265_lbd_sad4_48_avx2;
+ f265_lbd_fssd[0] = f265_lbd_fssd_4_avx2;
+ f265_lbd_fssd[1] = f265_lbd_fssd_8_avx2;
+ f265_lbd_fssd[2] = f265_lbd_fssd_16_avx2;
+ f265_lbd_fssd[3] = f265_lbd_fssd_32_avx2;
+ f265_lbd_fssd[4] = f265_lbd_fssd_64_avx2;
f265_lbd_avg_pix[1] = f265_lbd_avg_pix_4_avx2;
f265_lbd_avg_pix[2] = f265_lbd_avg_pix_8_avx2;
f265_lbd_avg_pix[3] = f265_lbd_avg_pix_16_avx2;
diff --git a/f265/asm.h b/f265/asm.h
index d48addf..6402d1f 100644
--- a/f265/asm.h
+++ b/f265/asm.h
@@ -24,6 +24,8 @@ typedef void(*f265_lbd_sad3_func)(int *costs, uint8_t *src, int src_stride, uint
typedef void(*f265_hbd_sad3_func)(int *costs, int16_t *src, int src_stride, int16_t **refs, int ref_stride, int packed_dims);
typedef void(*f265_lbd_sad4_func)(int *costs, uint8_t *src, int src_stride, uint8_t **refs, int ref_stride, int packed_dims);
typedef void(*f265_hbd_sad4_func)(int *costs, int16_t *src, int src_stride, int16_t **refs, int ref_stride, int packed_dims);
+typedef int(*f265_lbd_fssd_func)(uint8_t *src0, int stride0, uint8_t *src1, int stride1, int width);
+typedef int(*f265_hbd_fssd_func)(int16_t *src0, int stride0, int16_t *src1, int stride1, int width);
typedef void(*f265_lbd_avg_pix_func)(uint8_t *dst, uint8_t *src0, int src0_stride, uint8_t *src1, int src1_stride, int packed_dims);
typedef void(*f265_hbd_avg_pix_func)(int16_t *dst, int16_t *src0, int src0_stride, int16_t *src1, int src1_stride, int packed_dims);
typedef void(*f265_lbd_interpol_luma_qpel_pix_func)(uint8_t *dst, int dst_stride, uint8_t *src, int src_stride, int frac, int packed_dims, uint8_t *spill);
@@ -83,6 +85,12 @@ extern f265_lbd_sad4_func f265_lbd_sad4[10];
// Indices: X, 4, 8, 16, 32, 64, X, 12, 24, 48.
extern f265_hbd_sad4_func f265_hbd_sad4[10];
+// Indices: 4, 8, 16, 32, 64.
+extern f265_lbd_fssd_func f265_lbd_fssd[5];
+
+// Indices: 4, 8, 16, 32, 64.
+extern f265_hbd_fssd_func f265_hbd_fssd[5];
+
// Indices: X, 4, 8, 16, 32, 64, X, 12, 24, 48.
extern f265_lbd_avg_pix_func f265_lbd_avg_pix[10];
diff --git a/f265/asm/avx2/pixel.asm b/f265/asm/avx2/pixel.asm
index 4ade0af..58c9e76 100644
--- a/f265/asm/avx2/pixel.asm
+++ b/f265/asm/avx2/pixel.asm
@@ -8,6 +8,8 @@ section .data
align 4
pat_w_32: dw 32,32 ; We duplicate the factors to broadcast them faster.
pat_dw_2048: dd 2048
+pat_b_127: times 4 db 127
+pat_w_1: dw 1, 1
align 16
pat_if_luma_8bit: db -1,4,-1,4, -10,58,-10,58, 17,-5,17,-5, 1,0,1,0
@@ -392,6 +394,143 @@ DEFINE_SAD 64, 4
%unmacro DEFINE_SAD 2
+; int fenc_fssd(f265_pix *src0, int stride0, f265_pix *src1, int stride1, int width)
+; Input parameters:
+; - g0: source 0.
+; - g1: source 0 stride.
+; - g2: source 1.
+; - g3: source 1 stride.
+; - g4: width.
+%macro DEFINE_SSD 1 ; %1: width.
+
+DEFFUN f265_lbd_fssd_%1_avx2, ia=4, at=8484, ti=0, tv=6, ym=1
+
+ ; Initialize.
+ vpbroadcastd y4, [pat_b_127] ; Maximum pixel value.
+ vpbroadcastd y5, [pat_w_1] ; Multiply-add two 16-bit values.
+ %if %1 >= 16
+ vpxor y0, y0 ; Accumulator.
+ %endif
+
+ %macro SSD_REG 6 ; %1: acc, %2: src0, %3: src1, %4: tmp, %5: 16-bit pat, %6: 32-bit pat.
+ vpsubusb y%4, y%2, y%3 ; unsigned_saturate(A-B)|unsigned_saturate(B-A).
+ vpsubusb y%2, y%3, y%2
+ vpor y%2, y%4
+ vpminub y%2, y%5 ; Avoid overflows.
+        vpmaddubsw  y%2, y%2            ; Compute 16-bit SSDs (sum of two signed 14-bit values fits in 16 bits).
+ vpmaddwd y%2, y%6 ; Sum the 32-bit SSDs.
+ %if %1 != %2
+ vpaddd y%1, y%2 ; Accumulate.
+ %endif
+ %endmacro
+
+ ; Width 4.
+ %if %1 == 4
+ vmovdqu y0, [g0]
+ vpunpckldq y0, [g0+g1]
+ vmovdqu y2, [g2]
+ vpunpckldq y2, [g2+g3]
+ lea g0, [g0+2*g1-4]
+ lea g2, [g2+2*g3]
+ vmovdqu y1, [g0]
+ vpunpckldq y1, [g0+g1]
+ vmovdqu y3, [g2]
+ vpunpckldq y3, [g2+g3]
+ vpblendd y0, y1, 0x0c
+ vpunpcklqdq y2, y3
+ SSD_REG 0, 0, 2, 1, 4, 5
+
+ ; Width 8.
+ %elif %1 == 8
+ %macro ONE_PASS 3 ; %1: src0, %2: src1, %3: tmp.
+ vmovdqu y%1, [g0]
+ vpunpcklqdq y%1, [g0+g1]
+ vmovdqu y%2, [g2]
+ vpunpcklqdq y%2, [g2+g3]
+ lea g0, [g0+2*g1-16]
+ lea g2, [g2+2*g3-16]
+ vmovdqu y%3, [g0]
+ vpunpcklqdq y%3, [g0+g1]
+ vpblendd y%1, y%3, 0xf0
+ vmovdqu y%3, [g2]
+ vpunpcklqdq y%3, [g2+g3]
+ vpblendd y%2, y%3, 0xf0
+ SSD_REG %1, %1, %2, %3, 4, 5
+ %endmacro
+ ONE_PASS 0, 1, 2
+ lea g0, [g0+2*g1+16]
+ lea g2, [g2+2*g3+16]
+ ONE_PASS 1, 2, 3
+ vpaddd y0, y1
+ %unmacro ONE_PASS 3
+
+ ; Width 16.
+ %elif %1 == 16
+ mov ga, 8 ; Loop counter.
+ .loop:
+ vmovdqu y1, [g0]
+ vpblendd y1, [g0+g1-16], 0xf0
+ vmovdqu y2, [g2]
+ vpblendd y2, [g2+g3-16], 0xf0
+ lea g0, [g0+2*g1]
+ lea g2, [g2+2*g3]
+ SSD_REG 0, 1, 2, 3, 4, 5
+ sub ga, 1
+ jnz .loop
+
+ ; Width 32.
+ %elif %1 == 32
+ mov ga, 16 ; Loop counter.
+ .loop:
+ vmovdqu y1, [g0]
+ vmovdqu y2, [g2]
+ SSD_REG 0, 1, 2, 3, 4, 5
+ vmovdqu y1, [g0+g1]
+ vmovdqu y2, [g2+g3]
+ SSD_REG 0, 1, 2, 3, 4, 5
+ lea g0, [g0+2*g1]
+ lea g2, [g2+2*g3]
+ sub ga, 1
+ jnz .loop
+
+ ; Width 64.
+ %elif %1 == 64
+ mov ga, 64 ; Loop counter.
+ .loop:
+ vmovdqu y1, [g0]
+ vmovdqu y2, [g2]
+ SSD_REG 0, 1, 2, 3, 4, 5
+ vmovdqu y1, [g0+32]
+ vmovdqu y2, [g2+32]
+ SSD_REG 0, 1, 2, 3, 4, 5
+ add g0, g1
+ add g2, g3
+ sub ga, 1
+ jnz .loop
+ %endif
+
+ ; Combine.
+ vpshufd y1, y0, 0xe
+ vpaddd y0, y1
+ vpshufd y1, y0, 1
+ vpaddd y0, y1
+ %if %1 != 4
+ vpermq y1, y0, 2
+ vpaddd y0, y1
+ %endif
+ vmovd gad, x0
+ RET
+ %unmacro SSD_REG 6
+
+%endmacro
+DEFINE_SSD 4
+DEFINE_SSD 8
+DEFINE_SSD 16
+DEFINE_SSD 32
+DEFINE_SSD 64
+%unmacro DEFINE_SSD 1
+
+
; void fenc_avg_pix(f265_pix *dst, f265_pix *src0, int src0_stride, f265_pix *src1, int src1_stride, int packed_dims)
; Input parameters:
; - g0: destination.
diff --git a/f265/pixel.c b/f265/pixel.c
index 9586434..78842eb 100644
--- a/f265/pixel.c
+++ b/f265/pixel.c
@@ -1100,6 +1100,28 @@ int32_t fenc_ssd(f265_pix *src0, int32_t stride0, f265_pix *src1, int32_t stride
return ssd;
}
+// Fast sum of square differences (SSD). The assembly implementation caps the
+// per-pixel difference to 127 at low bit depth to make the computations faster.
+// Only square blocks are supported. Rename once the symbol above no longer
+// conflicts.
+int fenc_fssd_c(f265_pix *src0, int stride0, f265_pix *src1, int stride1, int width)
+{
+ int ssd = 0;
+ for (int y = 0; y < width; y++, src0 += stride0, src1 += stride1)
+ for (int x = 0; x < width; x++)
+ {
+ #ifdef F265_FAST_C
+ int tmp = src0[x] - src1[x];
+ ssd += tmp*tmp;
+ #else
+ int tmp = F265_ABS(src0[x] - src1[x]);
+ tmp = F265_MIN(127, tmp);
+ ssd += tmp*tmp;
+ #endif
+ }
+ return ssd;
+}
+
// Same as the SAD function, but compute the sum of square differences (SSD).
int32_t fenc_ssd16(int16_t *src0, int32_t stride0, int16_t *src1, int32_t stride1,
int32_t width, int32_t height, int32_t subshift, int32_t bitdepth)
diff --git a/snippets/asm.py b/snippets/asm.py
index 557dd51..83a6d6d 100755
--- a/snippets/asm.py
+++ b/snippets/asm.py
@@ -183,6 +183,10 @@ def declare_all():
indices=luma_amp_indices_x,
avx2_lbd=1)
+ df("fssd", bd=1,
+ ret = "int", args="f265_pix *src0, int stride0, f265_pix *src1, int stride1, int width",
+ indices=["4", "8", "16", "32", "64"], avx2_lbd=1)
+
df("avg_pix", bd=1,
args="f265_pix *dst, f265_pix *src0, int src0_stride, f265_pix *src1, int src1_stride, int packed_dims",
indices=luma_amp_indices_x,