Here is the patch that adds the 8x8 intra prediction assembly.
diff --git a/SConstruct b/SConstruct
index 744e05d..3141dbc 100644
--- a/SConstruct
+++ b/SConstruct
@@ -158,7 +158,7 @@ bdi_env = ref_env.Clone(CPPPATH = ['build', '.', 'f265/ktools'])
for c_file in bdi_c_files:
obj_files += bdi_env.Object('build/f265/' + c_file[:-2], 'f265/' + c_file)
-bdi_a_files = ['pixel.asm', 'dct.asm', 'encode.asm']
+bdi_a_files = ['pixel.asm', 'dct.asm', 'encode.asm', 'intra.asm']
if f265_cfg['asm']:
asm_dir = 'f265/asm/'
asm_arch = "ARCH_X64" if mingw else "ARCH_AMD64"
@@ -214,7 +214,9 @@ if env['cli'] == 1:
lib_to_link_static= ['build/cli/cli.o', 'build/libf265.a']
if env['libav'] != 'none':
if f265_cfg['static']:
- for static_dep in ['libavformat', 'libavcodec','libavdevice', 'libavfilter', 'libavutil', 'libswscale' ]:
+ static_lib = [ 'libavformat', 'libavcodec','libavdevice', 'libavfilter', 'libavutil', 'libswscale' ]
+ if mingw: static_lib.extend([ 'libpthread', 'libz' ])
+ for static_dep in static_lib:
lib_to_link_static.append(os.path.join(f265_cfg['static'], static_dep + '.a'));
cli_env.Append(LIBS = f265_cfg['static_dep'].split())
else:
@@ -231,7 +233,9 @@ if env['yuvdiff'] == 1:
lib_to_link_static= ['build/yuvdiff_d/yuvdiff.o']
if env['libav'] != 'none':
if f265_cfg['static']:
- for static_dep in ['libavformat', 'libavcodec','libavdevice', 'libavfilter', 'libavutil', 'libswscale' ]:
+ static_lib = [ 'libavformat', 'libavcodec','libavdevice', 'libavfilter', 'libavutil', 'libswscale' ]
+ if mingw: static_lib.extend([ 'libpthread', 'libz' ])
+ for static_dep in static_lib:
lib_to_link_static.append(os.path.join(f265_cfg['static'], static_dep + '.a'));
cli_env.Append(LIBS = f265_cfg['static_dep'].split())
else:
diff --git a/doc/compile.txt b/doc/compile.txt
index e498ea3..41ef3f9 100644
--- a/doc/compile.txt
+++ b/doc/compile.txt
@@ -317,9 +317,10 @@ libraries are missing, use the static_dep to add/remove then.
$ scons static_dep="z pthread ${LIB_TO_ADD}"
-For windows, you will need to add "ws2_32".
-
- $ scons static_dep="z pthread ws2_32"
+For Windows, you need to copy libpthread.a and libz.a into the static folder.
+The only dynamic library you need is "ws2_32".
+
+ $ scons static_dep="ws2_32"
Step 1. Get and compile libav
@@ -367,7 +368,12 @@ the libraries.
Copy the libraries. You must replace "${PATH_TO_LIBAV}" with the path to the
libav root directory.
- & cp ${PATH_TO_LIBAV}/lib*/*.a static_libav/
+ $ cp ${PATH_TO_LIBAV}/lib*/*.a static_libav/
+
+On Windows, you also need to copy libpthread.a and libz.a into static_libav:
+
+ $ cp /c/mingw/mingw64/x86_64-w64-mingw32/lib/libpthread.a static_libav/
+ $ cp /c/mingw/mingw64/x86_64-w64-mingw32/lib/libz.a static_libav/
Step 3. Configure and recompile f265
@@ -380,9 +386,9 @@ Start by displaying the configuration
Find the line starting with "cflags" and copy the actual value. Then configure
scons.
-On Windows, you must first add "ws2_32" to static_dep.
+On Windows, you only need "ws2_32" in static_dep.
- $ scons -h static_dep="z pthread ws2_32"
+ $ scons -h static_dep="ws2_32"
Depending on your setup, libav might have linked against additional libraries.
You must add them to the "static_dep" configuration. For example, if libav
diff --git a/f265/analyze.c b/f265/analyze.c
index c415cbf..c828145 100644
--- a/f265/analyze.c
+++ b/f265/analyze.c
@@ -523,7 +523,7 @@ static int64_t fenc_analyze_intra_tb(f265_enc_thread *t, f265_cb *cb, int *nz_fl
int bs = 1<<lg_bs;
int ref_stride = t->me.ref_stride;
int plane_off = fenc_get_ctb_block_plane_off(t, comp, ct_ox, ct_oy);
- int filter_edge_flag, filter_neighbour_flag, neighbour_bilinear_flag;
+ int filter_edge_flag, filter_neighbour_flag;
int64_t cost;
f265_pix *neighbours;
@@ -531,20 +531,20 @@ static int64_t fenc_analyze_intra_tb(f265_enc_thread *t, f265_cb *cb, int *nz_fl
if (likely(ib->intra_neighbour_mode == 2))
{
// FIXME, optimize this.
- fenc_get_intra_filter_flags(&filter_edge_flag, &filter_neighbour_flag, &neighbour_bilinear_flag,
- comp, lg_bs, mode, 0);
+ fenc_get_intra_filter_flags(&filter_edge_flag, &filter_neighbour_flag,
+ comp, lg_bs, mode);
neighbours = ib->neighbours[filter_neighbour_flag];
}
// Predict the reconstructed neighbours.
else if (ib->intra_neighbour_mode == 1)
{
- int smooth_intra_flag = F265_GET_FLAG(t->enc->gd.eflags, F265_PF_SMOOTH_INTRA);
- fenc_get_intra_filter_flags(&filter_edge_flag, &filter_neighbour_flag, &neighbour_bilinear_flag,
- comp, lg_bs, mode, smooth_intra_flag);
- fenc_predict_intra_neighbours(t, ib->neighbours[0], 1, comp, bs, ct_ox, ct_oy);
- if (filter_neighbour_flag) fenc_filter_intra_neighbours(ib->neighbours[1], ib->neighbours[0], bs, 8,
- neighbour_bilinear_flag);
+ fenc_get_intra_filter_flags(&filter_edge_flag, &filter_neighbour_flag,
+ comp, lg_bs, mode);
+
+ int ct_o[2] = {ct_ox, ct_oy};
+ fenc_extract_intra_neighbours(t, ib->neighbours, ct_o, filter_neighbour_flag, 1<<8 | comp, lg_bs);
+
neighbours = ib->neighbours[filter_neighbour_flag];
}
@@ -552,7 +552,9 @@ static int64_t fenc_analyze_intra_tb(f265_enc_thread *t, f265_cb *cb, int *nz_fl
else
{
// Do not filter anything when using the source pixels.
- fenc_predict_intra_neighbours(t, ib->neighbours[0], 0, comp, bs, ct_ox, ct_oy);
+ int ct_o[2] = {ct_ox, ct_oy};
+ fenc_extract_intra_neighbours(t, ib->neighbours, ct_o, 0, 0<<8 | comp, lg_bs);
+
filter_edge_flag = 0;
neighbours = ib->neighbours[0];
}
@@ -1801,7 +1803,6 @@ static int64_t fenc_analyze_intra_part_luma(f265_enc_thread *t, f265_cb *cb, int
{
f265_analysis *an = &t->an;
f265_intra_block *ib = &t->intra_block;
- int bs = 1<<lg_bs;
int rdo_restore_flag = 1;
int64_t best_cost;
@@ -1888,10 +1889,8 @@ static int64_t fenc_analyze_intra_part_luma(f265_enc_thread *t, f265_cb *cb, int
// neighbours, then the previous passes also use cached neighbours.
if (ib->cache_neighbour_flags[0]|ib->cache_neighbour_flags[1]|ib->cache_neighbour_flags[2])
{
- int bilinear_flag = bs == 32 && F265_GET_FLAG(t->enc->gd.eflags, F265_PF_SMOOTH_INTRA);
- fenc_predict_intra_neighbours(t, ib->neighbours[0], 1, 0, bs,
- cb->cb_off[0] + cb_ox, cb->cb_off[1] + cb_oy);
- fenc_filter_intra_neighbours(ib->neighbours[1], ib->neighbours[0], bs, 8, bilinear_flag);
+ int ct_o[2] = {cb->cb_off[0] + cb_ox, cb->cb_off[1] + cb_oy};
+ fenc_extract_intra_neighbours(t, ib->neighbours, ct_o, 1, 1<<8 | 0, lg_bs);
}
// Set the partition data.
diff --git a/f265/asm.c b/f265/asm.c
index 7edecc3..99ec15a 100644
--- a/f265/asm.c
+++ b/f265/asm.c
@@ -136,6 +136,28 @@ void f265_hbd_interpol_luma_qpel_pix_h_c(int16_t *dst, int dst_stride, int16_t *
void f265_hbd_interpol_luma_qpel_pix_v_c(int16_t *dst, int dst_stride, int16_t *src, int src_stride, int frac, int packed_dims, uint8_t *spill);
void f265_hbd_interpol_luma_qpel_pix_d_c(int16_t *dst, int dst_stride, int16_t *src, int src_stride, int frac, int packed_dims, uint8_t *spill);
+void f265_lbd_predict_intra_planar_c(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_dc_c(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_angular_c(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_planar_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_dc_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_dia_bot_left_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_hor_bot_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_hor_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_hor_top_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_dia_top_left_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_vert_left_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_vert_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_vert_right_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_dia_top_right_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_hbd_predict_intra_planar_c(int16_t *dst, int16_t *neighbours, int mode, int packed);
+void f265_hbd_predict_intra_dc_c(int16_t *dst, int16_t *neighbours, int mode, int packed);
+void f265_hbd_predict_intra_angular_c(int16_t *dst, int16_t *neighbours, int mode, int packed);
+
+void f265_lbd_extract_intra_neigh_c(uint8_t nbuf[2][160], uint8_t *pred, int pred_stride, int avail[2], int filter, int packed);
+void f265_lbd_extract_intra_neigh_8_avx2(uint8_t nbuf[2][160], uint8_t *pred, int pred_stride, int avail[2], int filter, int packed);
+void f265_hbd_extract_intra_neigh_c(int16_t nbuf[2][160], int16_t *pred, int pred_stride, int avail[2], int filter, int packed);
+
// Special code.
#ifdef F265_HAVE_ASM
int f265_lbd_fsad_12_avx2(uint8_t *src, int src_stride, uint8_t *ref, int ref_stride, int packed_dims)
@@ -360,6 +382,8 @@ f265_lbd_sad4_func f265_lbd_sad4[10];
f265_lbd_fssd_func f265_lbd_fssd[5];
f265_lbd_avg_pix_func f265_lbd_avg_pix[10];
f265_lbd_interpol_luma_qpel_pix_func f265_lbd_interpol_luma_qpel_pix[30];
+f265_lbd_predict_intra_func f265_lbd_predict_intra[44];
+f265_lbd_extract_intra_neigh_func f265_lbd_extract_intra_neigh[4];
f265_hbd_dct_func f265_hbd_dct[5];
f265_hbd_idct_func f265_hbd_idct[5];
@@ -373,6 +397,8 @@ f265_hbd_sad4_func f265_hbd_sad4[10];
f265_hbd_fssd_func f265_hbd_fssd[5];
f265_hbd_avg_pix_func f265_hbd_avg_pix[10];
f265_hbd_interpol_luma_qpel_pix_func f265_hbd_interpol_luma_qpel_pix[30];
+f265_hbd_predict_intra_func f265_hbd_predict_intra[44];
+f265_hbd_extract_intra_neigh_func f265_hbd_extract_intra_neigh[4];
// Linkage at runtime.
static void f265_link_asm(int avx2_flag)
@@ -549,6 +575,102 @@ static void f265_link_asm(int avx2_flag)
f265_hbd_interpol_luma_qpel_pix[27] = f265_hbd_interpol_luma_qpel_pix_h_c;
f265_hbd_interpol_luma_qpel_pix[28] = f265_hbd_interpol_luma_qpel_pix_v_c;
f265_hbd_interpol_luma_qpel_pix[29] = f265_hbd_interpol_luma_qpel_pix_d_c;
+ f265_lbd_predict_intra[0] = f265_lbd_predict_intra_planar_c;
+ f265_lbd_predict_intra[1] = f265_lbd_predict_intra_dc_c;
+ f265_lbd_predict_intra[2] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[3] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[4] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[5] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[6] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[7] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[8] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[9] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[10] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[11] = f265_lbd_predict_intra_planar_c;
+ f265_lbd_predict_intra[12] = f265_lbd_predict_intra_dc_c;
+ f265_lbd_predict_intra[13] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[14] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[15] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[16] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[17] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[18] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[19] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[20] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[21] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[22] = f265_lbd_predict_intra_planar_c;
+ f265_lbd_predict_intra[23] = f265_lbd_predict_intra_dc_c;
+ f265_lbd_predict_intra[24] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[25] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[26] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[27] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[28] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[29] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[30] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[31] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[32] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[33] = f265_lbd_predict_intra_planar_c;
+ f265_lbd_predict_intra[34] = f265_lbd_predict_intra_dc_c;
+ f265_lbd_predict_intra[35] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[36] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[37] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[38] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[39] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[40] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[41] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[42] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[43] = f265_lbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[0] = f265_hbd_predict_intra_planar_c;
+ f265_hbd_predict_intra[1] = f265_hbd_predict_intra_dc_c;
+ f265_hbd_predict_intra[2] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[3] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[4] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[5] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[6] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[7] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[8] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[9] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[10] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[11] = f265_hbd_predict_intra_planar_c;
+ f265_hbd_predict_intra[12] = f265_hbd_predict_intra_dc_c;
+ f265_hbd_predict_intra[13] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[14] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[15] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[16] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[17] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[18] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[19] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[20] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[21] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[22] = f265_hbd_predict_intra_planar_c;
+ f265_hbd_predict_intra[23] = f265_hbd_predict_intra_dc_c;
+ f265_hbd_predict_intra[24] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[25] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[26] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[27] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[28] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[29] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[30] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[31] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[32] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[33] = f265_hbd_predict_intra_planar_c;
+ f265_hbd_predict_intra[34] = f265_hbd_predict_intra_dc_c;
+ f265_hbd_predict_intra[35] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[36] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[37] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[38] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[39] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[40] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[41] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[42] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[43] = f265_hbd_predict_intra_angular_c;
+ f265_lbd_extract_intra_neigh[0] = f265_lbd_extract_intra_neigh_c;
+ f265_lbd_extract_intra_neigh[1] = f265_lbd_extract_intra_neigh_c;
+ f265_lbd_extract_intra_neigh[2] = f265_lbd_extract_intra_neigh_c;
+ f265_lbd_extract_intra_neigh[3] = f265_lbd_extract_intra_neigh_c;
+ f265_hbd_extract_intra_neigh[0] = f265_hbd_extract_intra_neigh_c;
+ f265_hbd_extract_intra_neigh[1] = f265_hbd_extract_intra_neigh_c;
+ f265_hbd_extract_intra_neigh[2] = f265_hbd_extract_intra_neigh_c;
+ f265_hbd_extract_intra_neigh[3] = f265_hbd_extract_intra_neigh_c;
#ifdef F265_HAVE_ASM
if (avx2_flag)
@@ -631,6 +753,18 @@ static void f265_link_asm(int avx2_flag)
f265_lbd_interpol_luma_qpel_pix[27] = f265_lbd_interpol_luma_qpel_pix_48_h_avx2;
f265_lbd_interpol_luma_qpel_pix[28] = f265_lbd_interpol_luma_qpel_pix_48_v_avx2;
f265_lbd_interpol_luma_qpel_pix[29] = f265_lbd_interpol_luma_qpel_pix_48_d_avx2;
+ f265_lbd_predict_intra[11] = f265_lbd_predict_intra_planar_8_avx2;
+ f265_lbd_predict_intra[12] = f265_lbd_predict_intra_dc_8_avx2;
+ f265_lbd_predict_intra[13] = f265_lbd_predict_intra_dia_bot_left_8_avx2;
+ f265_lbd_predict_intra[14] = f265_lbd_predict_intra_hor_bot_8_avx2;
+ f265_lbd_predict_intra[15] = f265_lbd_predict_intra_hor_8_avx2;
+ f265_lbd_predict_intra[16] = f265_lbd_predict_intra_hor_top_8_avx2;
+ f265_lbd_predict_intra[17] = f265_lbd_predict_intra_dia_top_left_8_avx2;
+ f265_lbd_predict_intra[18] = f265_lbd_predict_intra_vert_left_8_avx2;
+ f265_lbd_predict_intra[19] = f265_lbd_predict_intra_vert_8_avx2;
+ f265_lbd_predict_intra[20] = f265_lbd_predict_intra_vert_right_8_avx2;
+ f265_lbd_predict_intra[21] = f265_lbd_predict_intra_dia_top_right_8_avx2;
+ f265_lbd_extract_intra_neigh[1] = f265_lbd_extract_intra_neigh_8_avx2;
}
#endif
}
diff --git a/f265/asm.h b/f265/asm.h
index 6402d1f..98e2f6d 100644
--- a/f265/asm.h
+++ b/f265/asm.h
@@ -30,6 +30,10 @@ typedef void(*f265_lbd_avg_pix_func)(uint8_t *dst, uint8_t *src0, int src0_strid
typedef void(*f265_hbd_avg_pix_func)(int16_t *dst, int16_t *src0, int src0_stride, int16_t *src1, int src1_stride, int packed_dims);
typedef void(*f265_lbd_interpol_luma_qpel_pix_func)(uint8_t *dst, int dst_stride, uint8_t *src, int src_stride, int frac, int packed_dims, uint8_t *spill);
typedef void(*f265_hbd_interpol_luma_qpel_pix_func)(int16_t *dst, int dst_stride, int16_t *src, int src_stride, int frac, int packed_dims, uint8_t *spill);
+typedef void(*f265_lbd_predict_intra_func)(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+typedef void(*f265_hbd_predict_intra_func)(int16_t *dst, int16_t *neighbours, int mode, int packed);
+typedef void(*f265_lbd_extract_intra_neigh_func)(uint8_t nbuf[2][160], uint8_t *pred, int pred_stride, int avail[2], int filter, int packed);
+typedef void(*f265_hbd_extract_intra_neigh_func)(int16_t nbuf[2][160], int16_t *pred, int pred_stride, int avail[2], int filter, int packed);
// Globals.
@@ -103,4 +107,16 @@ extern f265_lbd_interpol_luma_qpel_pix_func f265_lbd_interpol_luma_qpel_pix[30];
// Indices: X, X, X, 4_h, 4_v, 4_d, 8_h, 8_v, 8_d, 16_h, 16_v, 16_d, 32_h, 32_v, 32_d, 64_h, 64_v, 64_d, X, X, X, 12_h, 12_v, 12_d, 24_h, 24_v, 24_d, 48_h, 48_v, 48_d.
extern f265_hbd_interpol_luma_qpel_pix_func f265_hbd_interpol_luma_qpel_pix[30];
+// Indices: planar, dc, angular, angular, angular, angular, angular, angular, angular, angular, angular, planar, dc, angular, angular, angular, angular, angular, angular, angular, angular, angular, planar, dc, angular, angular, angular, angular, angular, angular, angular, angular, angular, planar, dc, angular, angular, angular, angular, angular, angular, angular, angular, angular.
+extern f265_lbd_predict_intra_func f265_lbd_predict_intra[44];
+
+// Indices: planar, dc, angular, angular, angular, angular, angular, angular, angular, angular, angular, planar, dc, angular, angular, angular, angular, angular, angular, angular, angular, angular, planar, dc, angular, angular, angular, angular, angular, angular, angular, angular, angular, planar, dc, angular, angular, angular, angular, angular, angular, angular, angular, angular.
+extern f265_hbd_predict_intra_func f265_hbd_predict_intra[44];
+
+// Indices: 4, 8, 16, 32.
+extern f265_lbd_extract_intra_neigh_func f265_lbd_extract_intra_neigh[4];
+
+// Indices: 4, 8, 16, 32.
+extern f265_hbd_extract_intra_neigh_func f265_hbd_extract_intra_neigh[4];
+
diff --git a/f265/bdi.h b/f265/bdi.h
index 9d2e4ce..c8fab2c 100644
--- a/f265/bdi.h
+++ b/f265/bdi.h
@@ -502,6 +502,7 @@ extern const int16_t f265_lambdas[52];
extern uint16_t f265_mv_costs[52][F265_NB_MV_COSTS];
extern const int8_t f265_hpel_src0[16];
extern const int8_t f265_hpel_src1[16];
+extern const int8_t f265_mode_to_intra_pred[35];
///////////////////////////////////////////////////////////////////////////////
diff --git a/f265/bdi_ro.c b/f265/bdi_ro.c
index 05972f0..32e2ba7 100644
--- a/f265/bdi_ro.c
+++ b/f265/bdi_ro.c
@@ -608,3 +608,5 @@ const int16_t f265_lambdas[52] =
const int8_t f265_hpel_src0[16] = { 0, 0, 1, 1, 0, 1, 1, 1, 2, 2, 3, 3, 0, 1, 1, 1 };
const int8_t f265_hpel_src1[16] = { 9, 1, 9, 0, 2, 2, 3, 2, 9, 3, 9, 2, 2, 2, 3, 2 };
+// Map intra prediction mode to optimized assembly function.
+const int8_t f265_mode_to_intra_pred[35] = {0, 1, 2, 3,3,3,3,3,3,3, 4, 5,5,5,5,5,5,5, 6, 7,7,7,7,7,7,7, 8, 9,9,9,9,9,9,9, 10};
diff --git a/f265/enc.h b/f265/enc.h
index 3af6647..79fbf64 100644
--- a/f265/enc.h
+++ b/f265/enc.h
@@ -798,7 +798,14 @@ typedef struct f265_intra_block
int8_t intra_dist_mode;
// Unfiltered/filtered neighbours of the current partition.
- F265_ALIGN64 f265_pix neighbours[2][129];
+ // Layout:
+ // - [0-63] : Bottom left and left, packed near index 63
+ // (index 63 is always the top-most left neighbour).
+ // - [64-127] : Top and top right, packed near index 64.
+ // (index 64 is always the left-most top neighbour).
+ // - [128] : Top-left neighbour.
+ // - [129-159] : Alignment padding.
+ F265_ALIGN64 f265_pix neighbours[2][160];
} f265_intra_block;
@@ -2831,17 +2838,13 @@ void fenc_mc_chroma_b(f265_enc_thread *t, f265_pix *dst, int dst_stride, f265_re
int packed_dims, int plane_off, int comp);
// intra.c
-void fenc_get_intra_filter_flags(int *filter_edge_flag, int *filter_neighbour_flag, int *neighbour_bilinear_flag,
- int comp, int lg_bs, int mode, int smooth_intra_flag);
+void fenc_get_intra_filter_flags(int *filter_edge_flag, int *filter_neighbour_flag,
+ int comp, int lg_bs, int mode);
void fenc_get_intra_encode_flags(int *dst_flag, int *order, int comp, int lg_bs, int mode);
-void fenc_predict_intra_neighbours(f265_enc_thread *t, f265_pix dst[129], int rec_flag,
- int comp, int bs, int ct_ox, int ct_oy);
-void fenc_filter_intra_neighbours(f265_pix *dst, f265_pix *src, int bs, int bd, int bilinear_flag);
-void fenc_predict_intra_planar(f265_pix *dst, f265_pix *nbuf, int lg_bs);
-void fenc_predict_intra_dc(f265_pix *dst, f265_pix *nbuf, int lg_bs, int filter_edge_flag);
-void fenc_predict_intra_angular(f265_pix *dst, f265_pix *nbuf, int lg_bs, int bd, int filter_edge_flag, int mode);
+void fenc_extract_intra_neighbours(f265_enc_thread *t, f265_pix dst[2][160], int ct_o[2],
+ int packed, int filter, int lg_bs);
void fenc_predict_intra_mode(f265_pix *dst, f265_pix *neighbours, int lg_bs, int bd, int mode, int filter_edge_flag);
-void fenc_predict_intra(f265_enc_thread *t, f265_pix *dst, int comp, int lg_bs, int mode, int ct_ox, int ct_oy);
+void fenc_predict_intra_block(f265_enc_thread *t, f265_pix *dst, int comp, int lg_bs, int mode, int ct_ox, int ct_oy);
void fenc_get_intra_pred_mode(f265_enc_thread *t, f265_cb *cb, int partition_idx, int *mpm_list);
// inter.h
diff --git a/f265/intra.c b/f265/intra.c
index d512e2b..f97560c 100644
--- a/f265/intra.c
+++ b/f265/intra.c
@@ -45,13 +45,12 @@ static finline void fenc_get_intra_nb_avail(f265_enc_thread *t, int avail[2], in
// - filter_edge_flag: true if the edges are filtered for DC/vertical/horizontal.
// - filter_neighbour_flag: true if the neighbours are filtered.
// - neighbour_bilinear_flag: true if the neighbours are filtered bilinearly.
-void fenc_get_intra_filter_flags(int *filter_edge_flag, int *filter_neighbour_flag, int *neighbour_bilinear_flag,
- int comp, int lg_bs, int mode, int smooth_intra_flag)
+void fenc_get_intra_filter_flags(int *filter_edge_flag, int *filter_neighbour_flag,
+ int comp, int lg_bs, int mode)
{
int bs = 1<<lg_bs;
*filter_edge_flag = !comp && bs != 32;
*filter_neighbour_flag = !comp && f265_intra_mode_dist[mode] > f265_intra_dist_thresholds[lg_bs-2];
- *neighbour_bilinear_flag = bs == 32 && smooth_intra_flag;
}
// Get the intra encoding flags.
@@ -69,160 +68,169 @@ void fenc_get_intra_encode_flags(int *dst_flag, int *order, int comp, int lg_bs,
// image component. 'src' points to the top-left neighbour pixel of the block,
// 'nx' and 'ny' are the number of pixels to predict in each direction. 'bd' is
// the bit depth.
-static finline void fenc_extract_intra_neighbours(f265_pix dst[129], f265_pix *src, int src_stride,
- int availx, int availy, int nx, int ny, int bd)
+static finline void fenc_extract_unfiltered_intra_neigh(f265_pix dst[129], f265_pix *src, int src_stride,
+ int avail[2], int packed)
{
// The following logic relies on the slice layout restrictions.
+ int nb_pix = (packed&255)*2;
+ int availx = avail[0];
+ int availy = avail[1];
// Copy top-left tentatively.
- dst[0] = src[0];
+ dst[128] = src[-1-src_stride];
// Left is fully available, copy.
- if (likely(availy >= ny))
+ if (likely(availy >= nb_pix))
{
- for (int i = 0; i < ny; i++) dst[65+i] = src[(1+i)*src_stride];
+ for (int i = 0; i < nb_pix; i++) dst[63-i] = src[i*src_stride-1];
}
// Left is partially available, copy and broadcast.
else if (likely(availy > 0))
{
- for (int i = 0; i < availy; i++) dst[65+i] = src[(1+i)*src_stride];
- f265_pix p = dst[64+availy];
- for (int i = availy; i < ny; i++) dst[65+i] = p;
+ for (int i = 0; i < availy; i++) dst[63-i] = src[i*src_stride-1];
+ f265_pix p = dst[63-availy+1];
+ for (int i = availy; i < nb_pix; i++) dst[63-i] = p;
}
// Left and top-left are not available but top is. Broadcast the first
// pixel directly above the block.
else if (likely(availx > 0))
{
- f265_pix p = src[1];
- dst[0] = p;
- for (int i = 0; i < ny; i++) dst[65+i] = p;
+ f265_pix p = src[-src_stride];
+ dst[128] = p;
+ for (int i = 0; i < nb_pix; i++) dst[63-i] = p;
}
// Nothing is available, perform DC prediction.
else
{
- f265_pix p = 1<<(bd-1);
- for (int i = 0; i < nx+1; i++) dst[i] = p;
- for (int i = 0; i < ny; i++) dst[65+i] = p;
+ f265_pix p = 1<<((packed>>8)-1);
+ dst[128] = p;
+ for (int i = 0; i < nb_pix; i++) dst[64+i] = p;
+ for (int i = 0; i < nb_pix; i++) dst[63-i] = p;
return;
}
// Top is fully available, copy.
- if (likely(availx >= nx))
+ if (likely(availx >= nb_pix))
{
- for (int i = 0; i < nx; i++) dst[1+i] = src[1+i];
+ for (int i = 0; i < nb_pix; i++) dst[64+i] = src[i-src_stride];
}
// Top is partially available, copy and broadcast.
else if (likely(availx > 0))
{
- for (int i = 0; i < availx; i++) dst[1+i] = src[1+i];
- f265_pix p = dst[availx];
- for (int i = availx; i < nx; i++) dst[1+i] = p;
+ for (int i = 0; i < availx; i++) dst[64+i] = src[i-src_stride];
+ f265_pix p = dst[64+availx-1];
+ for (int i = availx; i < nb_pix; i++) dst[64+i] = p;
}
// Top-left, top, top-right are not available. Broadcast the first pixel
// directly left of the block.
else
{
- f265_pix p = dst[65];
- for (int i = 0; i < nx+1; i++) dst[i] = p;
+ f265_pix p = dst[63];
+ dst[128] = p;
+ for (int i = 0; i < nb_pix; i++) dst[64+i] = p;
}
}
-// Predict the unfiltered neighbour pixels of the specified intra block at
-// pixel offset (ct_ox, ct_oy) in the CTB with block size 'bs'. 'rec_flag' is
-// true if the reconstructed pixels are used for the prediction, false if the
-// source pixels are used as approximation. This function assumes that
-// constrained intra prediction is not used.
-//
-// Layout of the destination array, by offset: 0 (top-left), 1 (top and
-// top-right), 65 (left and bottom left).
-void fenc_predict_intra_neighbours(f265_enc_thread *t, f265_pix dst[129], int rec_flag,
- int comp, int bs, int ct_ox, int ct_oy)
-{
- int avail[2];
- fenc_get_intra_nb_avail(t, avail, comp, ct_ox, ct_oy);
- int plane_off = fenc_get_ctb_block_plane_off(t, comp, ct_ox-1, ct_oy-1);
- f265_pix *src = rec_flag ? t->src_frame->rec_planes[comp ? 3+comp : 0] + plane_off :
- t->src_frame->src_planes[comp] + plane_off;
- fenc_extract_intra_neighbours(dst, src, t->me.ref_stride, avail[0], avail[1], bs<<1, bs<<1,
- t->enc->gd.bit_depth[!!comp]);
-}
-
// Filter the neighbour pixels of the block specified.
-void fenc_filter_intra_neighbours(f265_pix *dst, f265_pix *src, int bs, int bd, int bilinear_flag)
+static void fenc_filter_intra_neighbours(f265_pix *dst, f265_pix *src, int bs, int bd, int bilinear_flag)
{
int bs2 = bs<<1;
- int top_left = src[0], top_last = src[bs2], left_last = src[64+bs2];
+ int top_left = src[128], top_last = src[64+bs2-1], left_last = src[63-bs2+1];
// Check for bilinear filtering.
if (bilinear_flag)
{
- int top_middle = src[32], left_middle = src[64+32];
+ int top_middle = src[63+32], left_middle = src[64-32];
int threshold = 1<<(bd-5);
bilinear_flag = F265_ABS(top_left + top_last - (top_middle<<1)) < threshold &&
F265_ABS(top_left + left_last - (left_middle<<1)) < threshold;
if (bilinear_flag)
{
- dst[0] = top_left;
- dst[64] = top_last;
- dst[64+64] = left_last;
+ dst[128] = top_left;
+ dst[127] = top_last;
+ dst[0] = left_last;
for (int i = 0; i < 63; i++)
{
- dst[1+i] = ((63-i)*top_left + (i+1)*top_last + 32)>>6;
- dst[65+i] = ((63-i)*top_left + (i+1)*left_last + 32)>>6;
+ dst[64+i] = ((63-i)*top_left + (i+1)*top_last + 32)>>6;
+ dst[63-i] = ((63-i)*top_left + (i+1)*left_last + 32)>>6;
}
return;
}
}
// Regular filtering.
- dst[0] = ((top_left<<1) + src[1] + src[65] + 2)>>2;
- dst[bs2] = top_last;
- dst[64+bs2] = left_last;
- for (int i = 1; i < bs2; i++) dst[i] = ((src[i]<<1) + src[i-1] + src[i+1] + 2)>>2;
- dst[65] = ((src[65]<<1) + top_left + src[66] + 2)>>2;
- for (int i = 66; i < 64+bs2; i++) dst[i] = ((src[i]<<1) + src[i-1] + src[i+1] + 2)>>2;
+ dst[128] = ((top_left<<1) + src[64] + src[63] + 2)>>2;
+ dst[63+bs2] = top_last;
+ dst[64-bs2] = left_last;
+ dst[64] = ((src[64]<<1) + top_left + src[65] + 2)>>2;
+ dst[63] = ((src[63]<<1) + top_left + src[62] + 2)>>2;
+
+ for (int i = 1; i < bs2-1; i++)
+ {
+ dst[64+i] = ((src[64+i]<<1) + src[64+i-1] + src[64+i+1] + 2)>>2;
+ dst[63-i] = ((src[63-i]<<1) + src[63-i-1] + src[63-i+1] + 2)>>2;
+ }
}
// Intra planar prediction.
-void fenc_predict_intra_planar(f265_pix *dst, f265_pix *nbuf, int lg_bs)
+// Packed:
+// - Bit 0 : filter_edge_flag
+// - Bit 1-7 : lg_bs
+// - Bit 8-15 : bd
+void fenc_predict_intra_planar_c(f265_pix *dst, f265_pix *nbuf, int mode, int packed)
{
+ int lg_bs = (packed>>1) & 127;
int bs = 1<<lg_bs;
- int top_right = nbuf[1+bs];
- int bottom_left = nbuf[65+bs];
+ int top_right = nbuf[64+bs];
+ int bottom_left = nbuf[63-bs];
for (int y = 0; y < bs; y++)
for (int x = 0; x < bs; x++)
- dst[y*bs+x] = ((bs-1-x)*nbuf[65+y] + (bs-1-y)*nbuf[1+x] + (x+1)*top_right + (y+1)*bottom_left + bs)
+ dst[y*bs+x] = ((bs-1-x)*nbuf[63-y] + (bs-1-y)*nbuf[64+x] + (x+1)*top_right + (y+1)*bottom_left + bs)
>>(lg_bs+1);
}
// Intra DC prediction.
-void fenc_predict_intra_dc(f265_pix *dst, f265_pix *nbuf, int lg_bs, int filter_edge_flag)
+// Packed:
+// - Bit 0 : filter_edge_flag
+// - Bit 1-7 : lg_bs
+// - Bit 8-15 : bd
+void fenc_predict_intra_dc_c(f265_pix *dst, f265_pix *nbuf, int mode, int packed)
{
+ int lg_bs = (packed>>1) & 127;
+ int filter_edge_flag = packed&1;
int bs = 1<<lg_bs;
int dc_val = bs;
- for (int i = 0; i < bs; i++) dc_val += nbuf[1+i] + nbuf[65+i];
+
+ for (int i = 0; i < bs; i++) dc_val += nbuf[63-i] + nbuf[64+i];
dc_val = dc_val>>(lg_bs+1);
for (int i = 0; i < bs*bs; i++) dst[i] = dc_val;
if (filter_edge_flag)
{
- dst[0] = ((dc_val<<1) + nbuf[1] + nbuf[65] + 2)>>2;
+ dst[0] = ((dc_val<<1) + nbuf[64] + nbuf[63] + 2)>>2;
for (int i = 1; i < bs; i++)
{
- dst[i] = (nbuf[i+1] + 3*dc_val + 2)>>2;
- dst[i*bs] = (nbuf[65+i] + 3*dc_val + 2)>>2;
+ dst[i] = (nbuf[64+i] + 3*dc_val + 2)>>2;
+ dst[i*bs] = (nbuf[63-i] + 3*dc_val + 2)>>2;
}
}
}
// Intra angular prediction.
-void fenc_predict_intra_angular(f265_pix *dst, f265_pix *nbuf, int lg_bs, int bd, int filter_edge_flag, int mode)
+// Packed:
+// - Bit 0 : filter_edge_flag
+// - Bit 1-7 : lg_bs
+// - Bit 8-15 : bd
+void fenc_predict_intra_angular_c(f265_pix *dst, f265_pix *nbuf, int mode, int packed)
{
+ int bd = packed>>8;
+ int lg_bs = (packed>>1) & 127;
+ int filter_edge_flag = packed&1;
int bs = 1<<lg_bs;
// Flip the neighbours in the horizontal case.
@@ -230,8 +238,8 @@ void fenc_predict_intra_angular(f265_pix *dst, f265_pix *nbuf, int lg_bs, int bd
f265_pix ntmp[129];
if (hor_flag)
{
- ntmp[0] = nbuf[0];
- for (int i = 0; i < (bs<<1); i++) { ntmp[1+i] = nbuf[65+i]; ntmp[65+i] = nbuf[1+i]; }
+ ntmp[128] = nbuf[128];
+ for (int i = 0; i < (bs<<1); i++) { ntmp[63-i] = nbuf[64+i]; ntmp[64+i] = nbuf[63-i]; }
nbuf = ntmp;
}
@@ -244,13 +252,13 @@ void fenc_predict_intra_angular(f265_pix *dst, f265_pix *nbuf, int lg_bs, int bd
{
for (int y = 0; y < bs; y++)
for (int x = 0; x < bs; x++)
- dst[y*bs+x] = nbuf[1+x];
+ dst[y*bs+x] = nbuf[64+x];
if (filter_edge_flag)
{
- int top_left = nbuf[0], top = nbuf[1];
+ int top_left = nbuf[128], top = nbuf[64];
for (int y = 0; y < bs; y++)
- dst[y*bs] = F265_CLAMP(top + ((nbuf[65+y] - top_left)>>1), 0, (1<<bd)-1);
+ dst[y*bs] = F265_CLAMP(top + ((nbuf[63-y] - top_left)>>1), 0, (1<<bd)-1);
}
}
@@ -276,15 +284,16 @@ void fenc_predict_intra_angular(f265_pix *dst, f265_pix *nbuf, int lg_bs, int bd
for (int i = 0; i < nb_projected; i++)
{
inv_angle_sum += inv_angle;
- ref[-2-i] = nbuf[64+(inv_angle_sum>>8)];
+ ref[-2-i] = nbuf[64-(inv_angle_sum>>8)];
}
// Copy the top-left and top pixels.
- for (int i = 0; i < bs+1; i++) ref[-1+i] = nbuf[i];
+ ref[-1] = nbuf[128];
+ for (int i = 0; i < bs; i++) ref[i] = nbuf[64+i];
}
// Use the top and top-right neighbours.
- else ref = nbuf+1;
+ else ref = nbuf+64;
// Pass every row.
int angle_sum = 0;
@@ -311,32 +320,59 @@ void fenc_predict_intra_angular(f265_pix *dst, f265_pix *nbuf, int lg_bs, int bd
}
}
+// Extract and filter the transform block's neighbours.
+// In the assembly version, the neighbours are always filtered.
+// Packed:
+// - Bit 0-7 : bs
+// - Bit 8-15 : bd
+void fenc_extract_intra_neigh_c(f265_pix *dst, f265_pix *pred, int pred_stride,
+ int avail[2], int filter, int packed)
+{
+ fenc_extract_unfiltered_intra_neigh(dst, pred, pred_stride, avail, packed);
+
+ if (filter)
+ fenc_filter_intra_neighbours(dst+160, dst, packed&255, packed>>8, (packed&255)==32);
+}
+
+// Extract and filter the transform block neighbours at the location indicated by ct_ox and ct_oy.
+void fenc_extract_intra_neighbours(f265_enc_thread *t, f265_pix dst[2][160], int ct_o[2],
+ int filter, int packed, int lg_bs) /* rec_flag << 8 | comp */
+{
+ // Get the unfiltered neighbours.
+ int comp = packed & 255;
+ int rec_flag = packed>>8;
+
+ int avail[2];
+ fenc_get_intra_nb_avail(t, avail, comp, ct_o[0], ct_o[1]);
+ int plane_off = fenc_get_ctb_block_plane_off(t, comp, ct_o[0], ct_o[1]);
+ f265_pix *src = rec_flag ? t->src_frame->rec_planes[comp ? 3+comp : 0] + plane_off :
+ t->src_frame->src_planes[comp] + plane_off;
+
+ fenc_extract_intra_neigh[lg_bs-2](dst, src, t->me.ref_stride, avail, filter,
+ (t->enc->gd.bit_depth[!!comp] << 8) | 1 << lg_bs);
+}
+
// Predict the pixels of the intra mode specified.
-void fenc_predict_intra_mode(f265_pix *dst, f265_pix *neighbours, int lg_bs, int bd, int mode, int filter_edge_flag)
+// TODO: Stub. Should be inlined.
+inline void fenc_predict_intra_mode(f265_pix *dst, f265_pix *neighbours, int lg_bs, int bd, int mode, int filter_edge_flag)
{
- if (mode == 0) fenc_predict_intra_planar(dst, neighbours, lg_bs);
- else if (mode == 1) fenc_predict_intra_dc(dst, neighbours, lg_bs, filter_edge_flag);
- else fenc_predict_intra_angular(dst, neighbours, lg_bs, bd, filter_edge_flag, mode);
+ fenc_predict_intra[(lg_bs-2)*11 + f265_mode_to_intra_pred[mode]](dst, neighbours, mode, (filter_edge_flag|(lg_bs<<1)|(bd<<8)));
}
// Predict the intra block with the mode and the CTB offset specified.
-void fenc_predict_intra(f265_enc_thread *t, f265_pix *dst, int comp, int lg_bs, int mode, int ct_ox, int ct_oy)
+void fenc_predict_intra_block(f265_enc_thread *t, f265_pix *dst, int comp, int lg_bs, int mode, int ct_ox, int ct_oy)
{
f265_intra_block *ib = &t->intra_block;
int chroma_flag = !!comp;
- int bs = 1<<lg_bs;
int bd = t->enc->gd.bit_depth[chroma_flag];
- int smooth_intra_flag = F265_GET_FLAG(t->enc->gd.eflags, F265_PF_SMOOTH_INTRA);
- int filter_edge_flag, filter_neighbour_flag, neighbour_bilinear_flag;
+ int filter_edge_flag, filter_neighbour_flag;
- // Predict the unfiltered neighbours. Assuming 4:2:0.
- fenc_predict_intra_neighbours(t, ib->neighbours[0], 1, comp, bs, ct_ox, ct_oy);
+ // Get and filter the neighbours.
+ fenc_get_intra_filter_flags(&filter_edge_flag, &filter_neighbour_flag,
+ comp, lg_bs, mode);
+ int ct_o[2] = {ct_ox, ct_oy};
+ fenc_extract_intra_neighbours(t, ib->neighbours, ct_o, filter_neighbour_flag, 1<<8 | comp, lg_bs);
- // Filter the neighbours.
- fenc_get_intra_filter_flags(&filter_edge_flag, &filter_neighbour_flag, &neighbour_bilinear_flag,
- comp, lg_bs, mode, smooth_intra_flag);
- if (filter_neighbour_flag) fenc_filter_intra_neighbours(ib->neighbours[1], ib->neighbours[0],
- bs, bd, neighbour_bilinear_flag);
f265_pix *neighbours = ib->neighbours[filter_neighbour_flag];
// Do the prediction.
diff --git a/f265/rec.c b/f265/rec.c
index decc88f..f261e28 100644
--- a/f265/rec.c
+++ b/f265/rec.c
@@ -932,7 +932,7 @@ int fenc_rec_intra_tb(f265_enc_thread *t, int comp, int lg_bs, int mode, int zer
{
f265_pix pred[32*32];
int dst_flag, order;
- fenc_predict_intra(t, pred, comp, lg_bs, mode, ct_ox, ct_oy);
+ fenc_predict_intra_block(t, pred, comp, lg_bs, mode, ct_ox, ct_oy);
fenc_get_intra_encode_flags(&dst_flag, &order, comp, lg_bs, mode);
if (F265_GET_FLAG(t->enc->gd.eflags, F265_PF_RDOQ))
diff --git a/snippets/asm.py b/snippets/asm.py
index 83a6d6d..8c3b00c 100755
--- a/snippets/asm.py
+++ b/snippets/asm.py
@@ -115,13 +115,17 @@ declare_dict = odict()
# currently implemented in assembly. If "arch" is specified, it shadows
# "arch_lbd" and "arch_hbd". If single_c=true, the same C function is mapped to
# every slot.
-def declare_func(name, ret="void", args="", bd=0, indices=None, avx2=None, avx2_lbd=None, avx2_hbd=None, single_c=True):
+def declare_func(name, ret="void", args="", bd=0, indices=[""], c=1, c_lbd=None, c_hbd=None, \
+ avx2=None, avx2_lbd=None, avx2_hbd=None, single_c=True):
f = Function()
f.name = name
f.ret = ret
f.args = args
f.bd = bd
f.indices = indices
+ f.c = c
+ f.c_lbd = c_lbd
+ f.c_hbd = c_hbd
f.avx2 = avx2
f.avx2_lbd = avx2_lbd
f.avx2_hbd = avx2_hbd
@@ -132,16 +136,27 @@ def declare_func(name, ret="void", args="", bd=0, indices=None, avx2=None, avx2_
def declare_all():
df = declare_func
- amp_indices = ["2", "4", "8", "16", "32", "64", "6", "12", "24", "48"]
- luma_amp_indices_x = ["X", "4", "8", "16", "32", "64", "X", "12", "24", "48"]
- luma_amp_indices = ["4", "8", "16", "32", "64", "12", "24", "48"]
+ amp_indices = ["2", "4", "8", "16", "32", "64", "6", "12", "24", "48"]
+ luma_amp_indices = ["X", "4", "8", "16", "32", "64", "X", "12", "24", "48"]
luma_qpel_indices = []
+ luma_qpel_indices_c = []
luma_qpel_indices_avx2 = []
- for index in luma_amp_indices_x:
+ for index in luma_amp_indices:
for frac in [ "h", "v", "d"]:
luma_qpel_indices.append("X" if index == "X" else "%s_%s" % (index, frac))
- if index != "X" and int(index) % 8 == 0:
- luma_qpel_indices_avx2.append("%s_%s" % (index, frac))
+ luma_qpel_indices_c.append("X" if index == "X" else frac)
+ luma_qpel_indices_avx2.append("X" if index == "X" or int(index) % 8 != 0 else "%s_%s" % (index, frac))
+
+ intra_pred_indices_seed = ["4", "8", "16", "32"]
+ intra_pred_indices_avx2_seed = ["X", "8", "X", "X"]
+ intra_pred_indices = []
+ intra_pred_indices_avx2 = []
+ for index in intra_pred_indices_seed:
+ for frac in [ "planar", "dc", "angular", "angular", "angular", "angular", "angular", "angular", "angular", "angular", "angular"]:
+ intra_pred_indices.append("X" if index == "X" else "%s" % (frac))
+ for index in intra_pred_indices_avx2_seed:
+ for frac in [ "planar", "dc", "dia_bot_left", "hor_bot", "hor", "hor_top", "dia_top_left", "vert_left", "vert", "vert_right", "dia_top_right"]:
+ intra_pred_indices_avx2.append("X" if index == "X" else "%s_%s" % (frac, index))
# Declarations go here.
df("dct", bd=1, single_c=False,
@@ -175,12 +190,12 @@ def declare_all():
df("sad3", bd=1,
args="int *costs, f265_pix *src, int src_stride, f265_pix **refs, int ref_stride, int packed_dims",
- indices=luma_amp_indices_x,
+ indices=luma_amp_indices,
avx2_lbd=1)
df("sad4", bd=1,
args="int *costs, f265_pix *src, int src_stride, f265_pix **refs, int ref_stride, int packed_dims",
- indices=luma_amp_indices_x,
+ indices=luma_amp_indices,
avx2_lbd=1)
df("fssd", bd=1,
@@ -189,14 +204,25 @@ def declare_all():
df("avg_pix", bd=1,
args="f265_pix *dst, f265_pix *src0, int src0_stride, f265_pix *src1, int src1_stride, int packed_dims",
- indices=luma_amp_indices_x,
- avx2_lbd=["4", "8", "16", "32", "64", "12", "24", "48"])
+ indices=luma_amp_indices,
+ avx2_lbd=1)
- df("interpol_luma_qpel_pix", bd=1,
+ df("interpol_luma_qpel_pix", bd=1, single_c=False,
args="f265_pix *dst, int dst_stride, f265_pix *src, int src_stride, int frac, int packed_dims, uint8_t *spill",
indices=luma_qpel_indices,
+ c=luma_qpel_indices_c,
avx2_lbd=luma_qpel_indices_avx2)
+ df("predict_intra", bd=1, single_c=False,
+ args="f265_pix *dst, f265_pix *neighbours, int mode, int packed",
+ indices=intra_pred_indices,
+ avx2_lbd=intra_pred_indices_avx2)
+
+
+ df("extract_intra_neigh", bd=1, single_c=True,
+ args="f265_pix nbuf[2][160], f265_pix *pred, int pred_stride, int avail[2], int filter, int packed",
+ indices=["4", "8", "16", "32"],
+ avx2_lbd=["X", "8", "X", "X"])
### AVX2 SAD special code. ###
def avx2_sad_special_code():
@@ -300,19 +326,19 @@ def get_output():
for arch in arch_list:
assign_text[arch] = ""
+ # List all function declaration
+ function_list = set()
+
# Pass every function.
for f in declare_dict.values():
- # Base function name.
- base_func_name = "%s_%s" % (prog, f.name)
-
# Iterate on the bit depths, if any.
bd_list = ["lbd", "hbd"] if f.bd else [None]
for bd in bd_list:
- # Adjust the function name for the bit depth.
- bd_func_name = base_func_name
- if bd != None: bd_func_name = "%s_%s_%s" % (prog, bd, f.name)
+ # Function name. Include the bd only if defined.
+ bd_func_name = "%s_%s" % (prog, f.name)
+ if bd != None : bd_func_name = "%s_%s_%s" % (prog, bd, f.name)
# Do the substitutions for the bit depth in the arguments and the
# return type.
@@ -324,62 +350,55 @@ def get_output():
typedef_text += "typedef %s(*%s_func)(%s);\n" % (func_ret_str, bd_func_name, func_args_str)
# Declare the global variable, along with documentation.
- var_indice_str = "[%d]" % (len(f.indices)) if f.indices else ""
+ has_indices = len(f.indices) > 1 if f.indices != None else 0
+ var_indice_str = "[%d]" % (len(f.indices)) if has_indices else ""
global_str = "%s_func %s%s;\n" % (bd_func_name, bd_func_name, var_indice_str)
if bd != "hbd": global_var_text_lbd += global_str
else: global_var_text_hbd += global_str
- if f.indices != None: extern_var_text += "// Indices: %s.\n" % (", ".join(f.indices))
+ if has_indices: extern_var_text += "// Indices: %s.\n" % (", ".join(f.indices))
extern_var_text += "extern " + global_str + "\n";
# Iterate on the indices, if any.
index_list = f.indices if f.indices != None else [None]
for index_pos in range(len(index_list)):
- index = index_list[index_pos]
-
- # Adjust the function name for the index.
- index_func_name = bd_func_name
- if index != None: index_func_name += "_" + index
-
# Iterate on the architectures.
for arch in arch_list:
+ index = None
- # Adjust the function name for the architecture.
- arch_func_name = index_func_name + "_" + arch
- if f.single_c and arch == "c":
- arch_func_name = bd_func_name + "_c"
-
- # Check whether the architecture supports this function.
+ # Use the f.[arch] if no architecture is defined.
+ field = getattr(f, arch)
- # Skipped slot.
- if index == "X":
- support_flag = 0
+ # Use the f.[arch_bd] if the bit depth is defined and the generic arch is not.
+ if field == None and bd != None:
+ field = getattr(f, "%s_%s" % (arch, bd))
- # C always supports the function.
- elif arch == "c":
- support_flag = 1
+ # If the field is true (1), use the default field name.
+ if type(field) is int and field == 1:
+ field = f.indices
- # Handle assembly.
- else:
- # Get the relevant fields.
- bdi_field = getattr(f, arch)
- bd_field = bdi_field if bd == None else getattr(f, "%s_%s" % (arch, bd))
+ # Get the field value.
+ if type(field) is list:
+ index = field[index_pos]
- # Do the shadowing.
- field = bd_field if bdi_field == None else bdi_field
+ # Add the architecture to the function name.
+ arch_func_name = bd_func_name
+ if index is not None and len(index): arch_func_name += "_" + index
+ arch_func_name += "_" + arch
+ if arch == "c" and f.single_c :
+ arch_func_name = bd_func_name + "_c"
- # Explicitly supported.
- support_flag = field == 1 or type(field) is list and index in field
+ # Test if we should skip this index.
+ support_flag = 0 if index == "X" or (index == None) else 1
# Declare the prototype.
if (arch == "c" and f.single_c and index_pos == 0) or\
(support_flag and (arch != "c" or not f.single_c)):
- # Kludge for the interpolation functions.
- if arch_func_name.find("interpol") != -1 and arch == "c":
- for frac in [ "h", "v", "d"]:
- proto_text += "%s %s_%s_c(%s);\n" % (func_ret_str, bd_func_name, frac, func_args_str);
- # Normal declaration.
- else:
- proto_text += "%s %s(%s);\n" % (func_ret_str, arch_func_name, func_args_str);
+ s = "%s %s(%s);\n" % (func_ret_str, arch_func_name, func_args_str);
+
+ # Insert it only if it's not already declared.
+ if s not in function_list:
+ proto_text += s
+ function_list.add(s)
# Not supported, skip.
if not support_flag: continue
@@ -387,12 +406,8 @@ def get_output():
# Do the assignments.
assign_tabs = " "
if arch != "c": assign_tabs += " "
- assign_index_str = "[%d]" % (index_pos) if f.indices else ""
- assign_val = arch_func_name
- # Kludge for the interpolation functions.
- if arch_func_name.find("interpol") != -1 and arch == "c":
- assign_val = "%s_%s_c" % (arch_func_name[:-2], index[-1])
- assign_text[arch] += "%s%s%s = %s;\n" % (assign_tabs, bd_func_name, assign_index_str, assign_val)
+ assign_index_str = "[%d]" % (index_pos) if has_indices else ""
+ assign_text[arch] += "%s%s%s = %s;\n" % (assign_tabs, bd_func_name, assign_index_str, arch_func_name)
proto_text += "\n"
diff --git a/test/data/bench.ini b/test/data/bench.ini
index 71a3621..d11d908 100644
--- a/test/data/bench.ini
+++ b/test/data/bench.ini
@@ -32,7 +32,7 @@ work=work
# List of videos to encode, as known to f265test. By default, all frames are
# encoded. Use "video_name:nb_frames" to specify the number of frames to encode.
-videos=SpeedYUV:10 RaceHorsesYUV:20
+videos=speedyuv:30 racehorsesyuv:30
# QP range, same as f265test.
qp=10-30
@@ -71,7 +71,7 @@ gnuplot_font=
#display=firefox $HTML_FILE#summary_speed_curve_ssim
display=firefox $HTML_FILE
-# Test definition. The part following "result_" is the test name. It is scanned
+# Test definition. The part following "test_" is the test name. It is scanned
# for the encoder name.
[test_f265_base]
@@ -99,6 +99,24 @@ desc=A test for f265.
#f265_special=
+
+
+[test_f265_intra]
+enc=f265
+clobber=0
+params=quality=25;ref=0;cb-range=3,4;tb-range=2,4
+desc=Intra assembly.
+
+[test_f265_intra_old]
+enc=f265
+bin=/home/daniel/code/f265/f265Clean/build/f265cli_
+clobber=0
+params=quality=25;ref=0;cb-range=3,4;tb-range=2,4;
+desc=No intra assembly.
+
+
+
+
# Report section. The part following "report_" is the report section name.
[report_summary]
@@ -123,4 +141,3 @@ hist_points=25 50 75
# Report description added to the report (HTML-escaped).
desc=Summary of the tests
-
diff --git a/test/data/videos.ini b/test/data/videos.ini
index 10084af..319501f 100644
--- a/test/data/videos.ini
+++ b/test/data/videos.ini
@@ -93,3 +93,14 @@ resolution=1920x1080
frames=500
bitrate=640
+[video_crash]
+path=crash.yuv
+resolution=640x480
+frames=300
+bitrate=640
+
+[video_space]
+path=sp.mp4
+resolution=768x432
+frames=950
+bitrate=640