Here is the patch after your second set of comments.
On 08/01/2014 05:01 PM, Daniel Giguere wrote:
Here is the fixed patch.
Daniel
On 07/28/2014 01:25 PM, Laurent Birtz wrote:
Review for the asm part. Search for %%%.
Regards,
Laurent
diff --git a/SConstruct b/SConstruct
index 744e05d..ab9772d 100644
--- a/SConstruct
+++ b/SConstruct
@@ -158,7 +158,7 @@ bdi_env = ref_env.Clone(CPPPATH = ['build', '.', 'f265/ktools'])
for c_file in bdi_c_files:
obj_files += bdi_env.Object('build/f265/' + c_file[:-2], 'f265/' + c_file)
-bdi_a_files = ['pixel.asm', 'dct.asm', 'encode.asm']
+bdi_a_files = ['pixel.asm', 'dct.asm', 'encode.asm', 'intra.asm']
if f265_cfg['asm']:
asm_dir = 'f265/asm/'
asm_arch = "ARCH_X64" if mingw else "ARCH_AMD64"
diff --git a/f265/analyze.c b/f265/analyze.c
index d200aa4..dadb2f1 100644
--- a/f265/analyze.c
+++ b/f265/analyze.c
@@ -527,7 +527,7 @@ static int64_t fenc_analyze_intra_tb(f265_enc_thread *t, f265_cb *cb, int *nz_fl
int bs = 1<<lg_bs;
int ref_stride = t->me.ref_stride;
int plane_off = fenc_get_ctb_block_plane_off(t, comp, ct_ox, ct_oy);
- int filter_edge_flag, filter_neighbour_flag, neighbour_bilinear_flag;
+ int filter_edge_flag, filter_neighbour_flag;
int64_t cost;
f265_pix *neighbours;
@@ -535,20 +535,20 @@ static int64_t fenc_analyze_intra_tb(f265_enc_thread *t, f265_cb *cb, int *nz_fl
if (likely(ib->intra_neighbour_mode == 2))
{
// FIXME, optimize this.
- fenc_get_intra_filter_flags(&filter_edge_flag, &filter_neighbour_flag, &neighbour_bilinear_flag,
- comp, lg_bs, mode, 0);
+ fenc_get_intra_filter_flags(&filter_edge_flag, &filter_neighbour_flag,
+ comp, lg_bs, mode);
neighbours = ib->neighbours[filter_neighbour_flag];
}
// Predict the reconstructed neighbours.
else if (ib->intra_neighbour_mode == 1)
{
- int smooth_intra_flag = F265_GET_FLAG(t->enc->gd.eflags, F265_PF_SMOOTH_INTRA);
- fenc_get_intra_filter_flags(&filter_edge_flag, &filter_neighbour_flag, &neighbour_bilinear_flag,
- comp, lg_bs, mode, smooth_intra_flag);
- fenc_predict_intra_neighbours(t, ib->neighbours[0], 1, comp, bs, ct_ox, ct_oy);
- if (filter_neighbour_flag) fenc_filter_intra_neighbours(ib->neighbours[1], ib->neighbours[0], bs, 8,
- neighbour_bilinear_flag);
+ fenc_get_intra_filter_flags(&filter_edge_flag, &filter_neighbour_flag,
+ comp, lg_bs, mode);
+
+ int ct_off[2] = {ct_ox, ct_oy};
+ fenc_extract_intra_neighbours(t, ib->neighbours, ct_off, filter_neighbour_flag, 1<<8 | comp, lg_bs);
+
neighbours = ib->neighbours[filter_neighbour_flag];
}
@@ -556,7 +556,9 @@ static int64_t fenc_analyze_intra_tb(f265_enc_thread *t, f265_cb *cb, int *nz_fl
else
{
// Do not filter anything when using the source pixels.
- fenc_predict_intra_neighbours(t, ib->neighbours[0], 0, comp, bs, ct_ox, ct_oy);
+ int ct_off[2] = {ct_ox, ct_oy};
+ fenc_extract_intra_neighbours(t, ib->neighbours, ct_off, 0, 0<<8 | comp, lg_bs);
+
filter_edge_flag = 0;
neighbours = ib->neighbours[0];
}
@@ -1805,7 +1807,6 @@ static int64_t fenc_analyze_intra_part_luma(f265_enc_thread *t, f265_cb *cb, int
{
f265_analysis *an = &t->an;
f265_intra_block *ib = &t->intra_block;
- int bs = 1<<lg_bs;
int rdo_restore_flag = 1;
int64_t best_cost;
@@ -1892,10 +1893,8 @@ static int64_t fenc_analyze_intra_part_luma(f265_enc_thread *t, f265_cb *cb, int
// neighbours, then the previous passes also use cached neighbours.
if (ib->cache_neighbour_flags[0]|ib->cache_neighbour_flags[1]|ib->cache_neighbour_flags[2])
{
- int bilinear_flag = bs == 32 && F265_GET_FLAG(t->enc->gd.eflags, F265_PF_SMOOTH_INTRA);
- fenc_predict_intra_neighbours(t, ib->neighbours[0], 1, 0, bs,
- cb->cb_off[0] + cb_ox, cb->cb_off[1] + cb_oy);
- fenc_filter_intra_neighbours(ib->neighbours[1], ib->neighbours[0], bs, 8, bilinear_flag);
+ int ct_off[2] = {cb->cb_off[0] + cb_ox, cb->cb_off[1] + cb_oy};
+ fenc_extract_intra_neighbours(t, ib->neighbours, ct_off, 1, 1<<8 | 0, lg_bs);
}
// Set the partition data.
diff --git a/f265/asm.c b/f265/asm.c
index 7edecc3..620c3a4 100644
--- a/f265/asm.c
+++ b/f265/asm.c
@@ -136,6 +136,28 @@ void f265_hbd_interpol_luma_qpel_pix_h_c(int16_t *dst, int dst_stride, int16_t *
void f265_hbd_interpol_luma_qpel_pix_v_c(int16_t *dst, int dst_stride, int16_t *src, int src_stride, int frac, int packed_dims, uint8_t *spill);
void f265_hbd_interpol_luma_qpel_pix_d_c(int16_t *dst, int dst_stride, int16_t *src, int src_stride, int frac, int packed_dims, uint8_t *spill);
+void f265_lbd_predict_intra_planar_c(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_dc_c(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_angular_c(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_planar_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_dc_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_dia_bot_left_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_hor_bot_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_hor_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_hor_top_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_dia_top_left_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_ver_left_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_ver_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_ver_right_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_dia_top_right_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_hbd_predict_intra_planar_c(int16_t *dst, int16_t *neighbours, int mode, int packed);
+void f265_hbd_predict_intra_dc_c(int16_t *dst, int16_t *neighbours, int mode, int packed);
+void f265_hbd_predict_intra_angular_c(int16_t *dst, int16_t *neighbours, int mode, int packed);
+
+void f265_lbd_extract_intra_neigh_c(uint8_t nbuf[2][160], uint8_t *pred, int pred_stride, int avail[2], int filter, int packed);
+void f265_lbd_extract_intra_neigh_8_avx2(uint8_t nbuf[2][160], uint8_t *pred, int pred_stride, int avail[2], int filter, int packed);
+void f265_hbd_extract_intra_neigh_c(int16_t nbuf[2][160], int16_t *pred, int pred_stride, int avail[2], int filter, int packed);
+
// Special code.
#ifdef F265_HAVE_ASM
int f265_lbd_fsad_12_avx2(uint8_t *src, int src_stride, uint8_t *ref, int ref_stride, int packed_dims)
@@ -360,6 +382,8 @@ f265_lbd_sad4_func f265_lbd_sad4[10];
f265_lbd_fssd_func f265_lbd_fssd[5];
f265_lbd_avg_pix_func f265_lbd_avg_pix[10];
f265_lbd_interpol_luma_qpel_pix_func f265_lbd_interpol_luma_qpel_pix[30];
+f265_lbd_predict_intra_func f265_lbd_predict_intra[44];
+f265_lbd_extract_intra_neigh_func f265_lbd_extract_intra_neigh[4];
f265_hbd_dct_func f265_hbd_dct[5];
f265_hbd_idct_func f265_hbd_idct[5];
@@ -373,6 +397,8 @@ f265_hbd_sad4_func f265_hbd_sad4[10];
f265_hbd_fssd_func f265_hbd_fssd[5];
f265_hbd_avg_pix_func f265_hbd_avg_pix[10];
f265_hbd_interpol_luma_qpel_pix_func f265_hbd_interpol_luma_qpel_pix[30];
+f265_hbd_predict_intra_func f265_hbd_predict_intra[44];
+f265_hbd_extract_intra_neigh_func f265_hbd_extract_intra_neigh[4];
// Linkage at runtime.
static void f265_link_asm(int avx2_flag)
@@ -549,6 +575,102 @@ static void f265_link_asm(int avx2_flag)
f265_hbd_interpol_luma_qpel_pix[27] = f265_hbd_interpol_luma_qpel_pix_h_c;
f265_hbd_interpol_luma_qpel_pix[28] = f265_hbd_interpol_luma_qpel_pix_v_c;
f265_hbd_interpol_luma_qpel_pix[29] = f265_hbd_interpol_luma_qpel_pix_d_c;
+ f265_lbd_predict_intra[0] = f265_lbd_predict_intra_planar_c;
+ f265_lbd_predict_intra[1] = f265_lbd_predict_intra_dc_c;
+ f265_lbd_predict_intra[2] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[3] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[4] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[5] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[6] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[7] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[8] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[9] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[10] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[11] = f265_lbd_predict_intra_planar_c;
+ f265_lbd_predict_intra[12] = f265_lbd_predict_intra_dc_c;
+ f265_lbd_predict_intra[13] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[14] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[15] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[16] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[17] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[18] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[19] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[20] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[21] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[22] = f265_lbd_predict_intra_planar_c;
+ f265_lbd_predict_intra[23] = f265_lbd_predict_intra_dc_c;
+ f265_lbd_predict_intra[24] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[25] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[26] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[27] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[28] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[29] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[30] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[31] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[32] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[33] = f265_lbd_predict_intra_planar_c;
+ f265_lbd_predict_intra[34] = f265_lbd_predict_intra_dc_c;
+ f265_lbd_predict_intra[35] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[36] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[37] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[38] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[39] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[40] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[41] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[42] = f265_lbd_predict_intra_angular_c;
+ f265_lbd_predict_intra[43] = f265_lbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[0] = f265_hbd_predict_intra_planar_c;
+ f265_hbd_predict_intra[1] = f265_hbd_predict_intra_dc_c;
+ f265_hbd_predict_intra[2] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[3] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[4] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[5] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[6] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[7] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[8] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[9] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[10] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[11] = f265_hbd_predict_intra_planar_c;
+ f265_hbd_predict_intra[12] = f265_hbd_predict_intra_dc_c;
+ f265_hbd_predict_intra[13] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[14] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[15] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[16] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[17] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[18] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[19] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[20] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[21] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[22] = f265_hbd_predict_intra_planar_c;
+ f265_hbd_predict_intra[23] = f265_hbd_predict_intra_dc_c;
+ f265_hbd_predict_intra[24] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[25] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[26] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[27] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[28] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[29] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[30] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[31] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[32] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[33] = f265_hbd_predict_intra_planar_c;
+ f265_hbd_predict_intra[34] = f265_hbd_predict_intra_dc_c;
+ f265_hbd_predict_intra[35] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[36] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[37] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[38] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[39] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[40] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[41] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[42] = f265_hbd_predict_intra_angular_c;
+ f265_hbd_predict_intra[43] = f265_hbd_predict_intra_angular_c;
+ f265_lbd_extract_intra_neigh[0] = f265_lbd_extract_intra_neigh_c;
+ f265_lbd_extract_intra_neigh[1] = f265_lbd_extract_intra_neigh_c;
+ f265_lbd_extract_intra_neigh[2] = f265_lbd_extract_intra_neigh_c;
+ f265_lbd_extract_intra_neigh[3] = f265_lbd_extract_intra_neigh_c;
+ f265_hbd_extract_intra_neigh[0] = f265_hbd_extract_intra_neigh_c;
+ f265_hbd_extract_intra_neigh[1] = f265_hbd_extract_intra_neigh_c;
+ f265_hbd_extract_intra_neigh[2] = f265_hbd_extract_intra_neigh_c;
+ f265_hbd_extract_intra_neigh[3] = f265_hbd_extract_intra_neigh_c;
#ifdef F265_HAVE_ASM
if (avx2_flag)
@@ -631,6 +753,18 @@ static void f265_link_asm(int avx2_flag)
f265_lbd_interpol_luma_qpel_pix[27] = f265_lbd_interpol_luma_qpel_pix_48_h_avx2;
f265_lbd_interpol_luma_qpel_pix[28] = f265_lbd_interpol_luma_qpel_pix_48_v_avx2;
f265_lbd_interpol_luma_qpel_pix[29] = f265_lbd_interpol_luma_qpel_pix_48_d_avx2;
+ f265_lbd_predict_intra[11] = f265_lbd_predict_intra_planar_8_avx2;
+ f265_lbd_predict_intra[12] = f265_lbd_predict_intra_dc_8_avx2;
+ f265_lbd_predict_intra[13] = f265_lbd_predict_intra_dia_bot_left_8_avx2;
+ f265_lbd_predict_intra[14] = f265_lbd_predict_intra_hor_bot_8_avx2;
+ f265_lbd_predict_intra[15] = f265_lbd_predict_intra_hor_8_avx2;
+ f265_lbd_predict_intra[16] = f265_lbd_predict_intra_hor_top_8_avx2;
+ f265_lbd_predict_intra[17] = f265_lbd_predict_intra_dia_top_left_8_avx2;
+ f265_lbd_predict_intra[18] = f265_lbd_predict_intra_ver_left_8_avx2;
+ f265_lbd_predict_intra[19] = f265_lbd_predict_intra_ver_8_avx2;
+ f265_lbd_predict_intra[20] = f265_lbd_predict_intra_ver_right_8_avx2;
+ f265_lbd_predict_intra[21] = f265_lbd_predict_intra_dia_top_right_8_avx2;
+ f265_lbd_extract_intra_neigh[1] = f265_lbd_extract_intra_neigh_8_avx2;
}
#endif
}
diff --git a/f265/asm.h b/f265/asm.h
index 6402d1f..98e2f6d 100644
--- a/f265/asm.h
+++ b/f265/asm.h
@@ -30,6 +30,10 @@ typedef void(*f265_lbd_avg_pix_func)(uint8_t *dst, uint8_t *src0, int src0_strid
typedef void(*f265_hbd_avg_pix_func)(int16_t *dst, int16_t *src0, int src0_stride, int16_t *src1, int src1_stride, int packed_dims);
typedef void(*f265_lbd_interpol_luma_qpel_pix_func)(uint8_t *dst, int dst_stride, uint8_t *src, int src_stride, int frac, int packed_dims, uint8_t *spill);
typedef void(*f265_hbd_interpol_luma_qpel_pix_func)(int16_t *dst, int dst_stride, int16_t *src, int src_stride, int frac, int packed_dims, uint8_t *spill);
+typedef void(*f265_lbd_predict_intra_func)(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+typedef void(*f265_hbd_predict_intra_func)(int16_t *dst, int16_t *neighbours, int mode, int packed);
+typedef void(*f265_lbd_extract_intra_neigh_func)(uint8_t nbuf[2][160], uint8_t *pred, int pred_stride, int avail[2], int filter, int packed);
+typedef void(*f265_hbd_extract_intra_neigh_func)(int16_t nbuf[2][160], int16_t *pred, int pred_stride, int avail[2], int filter, int packed);
// Globals.
@@ -103,4 +107,16 @@ extern f265_lbd_interpol_luma_qpel_pix_func f265_lbd_interpol_luma_qpel_pix[30];
// Indices: X, X, X, 4_h, 4_v, 4_d, 8_h, 8_v, 8_d, 16_h, 16_v, 16_d, 32_h, 32_v, 32_d, 64_h, 64_v, 64_d, X, X, X, 12_h, 12_v, 12_d, 24_h, 24_v, 24_d, 48_h, 48_v, 48_d.
extern f265_hbd_interpol_luma_qpel_pix_func f265_hbd_interpol_luma_qpel_pix[30];
+// Indices: planar, dc, angular, angular, angular, angular, angular, angular, angular, angular, angular, planar, dc, angular, angular, angular, angular, angular, angular, angular, angular, angular, planar, dc, angular, angular, angular, angular, angular, angular, angular, angular, angular, planar, dc, angular, angular, angular, angular, angular, angular, angular, angular, angular.
+extern f265_lbd_predict_intra_func f265_lbd_predict_intra[44];
+
+// Indices: planar, dc, angular, angular, angular, angular, angular, angular, angular, angular, angular, planar, dc, angular, angular, angular, angular, angular, angular, angular, angular, angular, planar, dc, angular, angular, angular, angular, angular, angular, angular, angular, angular, planar, dc, angular, angular, angular, angular, angular, angular, angular, angular, angular.
+extern f265_hbd_predict_intra_func f265_hbd_predict_intra[44];
+
+// Indices: 4, 8, 16, 32.
+extern f265_lbd_extract_intra_neigh_func f265_lbd_extract_intra_neigh[4];
+
+// Indices: 4, 8, 16, 32.
+extern f265_hbd_extract_intra_neigh_func f265_hbd_extract_intra_neigh[4];
+
diff --git a/f265/asm/avx2/intra.asm b/f265/asm/avx2/intra.asm
new file mode 100644
index 0000000..5d06c00
--- /dev/null
+++ b/f265/asm/avx2/intra.asm
@@ -0,0 +1,948 @@
+; Copyright (c) 2014, VANTRIX CORPORATION. All rights reserved. See LICENSE.txt
+; for the full license text.
+
+%include "x86inc.asm"
+
+section .data
+align 32
+
+planar_8_left: db 14,15, 12,13, 6,7, 4,5, 0,0,0,0,0,0,0,0, ; Shuffle pattern to regroup the left and top-right
+ db 10,11, 8, 9, 2,3, 0,1, 0,0,0,0,0,0,0,0, ; pixels together for rows 0/2, 1/3, 4/6, 5/7.
+
+angle_mul_ver: dw 1, 2, 5, 6, 0, 0, 0, 0, ; Row index, shuffled to do 2 rows at the time.
+ dw 3, 4, 7, 8, 0, 0, 0, 0, ; Used to get the weight and offset of each row on vertical angles.
+
+triple_last_lane: db 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, ; Multiply high lane by 3 while keeping the
+ db 3,0, 3,0, 3,0, 3,0, 3,0, 3,0, 3,0, 3,0, ; low lane as-is.
+
+neigh_1_of_2: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15,
+ db 8, 0, 9, 1, 10, 2, 11, 3, 12, 4, 13, 5, 14, 6, 15, 7,
+
+neigh_shift_pair: db 14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,
+ db 2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,
+
+
+align 16
+
+; Repeat values on a whole 8x8 row. Inversed for use in pure horizontal.
+ang_hor_8: db 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2,
+ db 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+
+; Pshufb pattern to generate neighbour pairs.
+pair_low: db 0,1, 1,2, 2,3, 3,4, 4,5, 5,6, 6,7, 7,8,
+pair_high: db 7,8, 8,9, 9,10, 10,11, 11,12, 12,13, 13,14, 14,15
+
+angle_mul_hor: dw 1, 2, 3, 4, 5, 6, 7, 8, ; Row index. Used to get the weight and offset of each row on
+ ; horizontal angles.
+angle_inv_mul_hor: dw 0, 1, 2, 3, 4, 5, 6, 7, ; Multiplier for inv_angle_8 on horizontal angles.
+angle_inv_mul_ver: dw 7, 6, 5, 4, 3, 2, 1, 0, ; Multiplier for inv_angle_8 on vertical angles.
+
+dia_bot_left_8: db 14, 13, 12, 11, 10, 9, 8, 7 ; Invert byte order.
+ db 6, 5, 4, 3, 2, 1, 0, 15
+
+planar_wgt_hor: db 7, 1, 6, 2, 5, 3, 4, 4, ; Weight pair, used for planar row weighting.
+ db 3, 5, 2, 6, 1, 7, 0, 8,
+
+; Manage neighbour filtering edge case.
+neig_bl_unav_8: db 0,0,0,0,0,0,0,0, 0, 1, 2, 3, 4, 5, 6, 7
+
+pat_b_0_to_16: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+align 4
+
+; Seed on which the the neighbours offset of inversed angles are calculated.
+; As words (repeated 4 times) for speed-ups.
+inv_angle_8: db 16, 16, 16, 16
+ db 19, 19, 19, 19
+ db 24, 24, 24, 24
+ db 30, 30, 30, 30
+ db 39, 39, 39, 39
+ db 57, 57, 57, 57
+ db 102, 102, 102, 102
+
+; Seed on which the angles weights and offsets are calculated.
+; As words (repeated 4 times) for speed-ups.
+intra_angle: db 2, 2, 2, 2,
+ db 5, 5, 5, 5,
+ db 9, 9, 9, 9,
+ db 13, 13, 13, 13,
+ db 17, 17, 17, 17,
+ db 21, 21, 21, 21,
+ db 26, 26, 26, 26,
+
+; Pattern used as mask, bias, offset, ...
+; As dwords to use the more efficient vpbroadcastd.
+neigh_last_b_of_d: db 3, 7, 11, 15,
+pat_q_255: dq 0xff
+pat_w_8192: dw 8192, 8192,
+pat_w_2048: dw 2048, 2048,
+pat_w_1024: dw 1024, 1024,
+pat_w_32: dw 32, 32,
+pat_w_31: dw 31, 31,
+pat_w_8: dw 8, 8,
+pat_b_14_15: db 14,15, 14,15,
+pat_b_7_8: db 7,8, 7,8,
+pat_b_0_1: db 0,1, 0,1,
+pat_b_128: db 128, 128, 128, 128
+pat_b_15: db 15, 15, 15, 15,
+pat_b_7: db 7, 7, 7, 7
+pat_b_1: db 1, 1, 1, 1,
+
+
+section .text
+
+; 8x8 intra prediction functions.
+; There are 11 assembly functions to cover all 8x8 intra prediction modes.
+; - Planar and DC.
+; - 3 pure diagonals:
+; - dia_bot_left.
+; - dia_top_left.
+; - dia_top_right.
+; - Pure vertical and horizontal.
+; - 4 diagonals:
+; - hor_bot.
+; - hor_top.
+; - ver_left.
+; - ver_right.
+;
+; They all have the same input parameters, although some input parameters may be ignored.
+; - g0: Destination.
+; - g1: Neighbours. 48 is the bottommost left neighbour. 63 is the topmost left neighbour.
+;   64 is the leftmost top neighbour. 79 is the rightmost top neighbour. 128 is the top-left neighbour.
+; - g2: Mode.
+; - g3: Filter edge flag.
+
+
+; Intra DC 8x8.
+DEFFUN f265_lbd_predict_intra_dc_8_avx2, ia=4, at=8844, ti=0, tv=6, ym=1
+ ; Logic:
+ ; Sum all neighbours, except the corners.
+ ; Divide with bias by the number of samples.
+
+ vpmovzxbw x1, [g1+56] ; Load all data.
+ vpmovzxbw x0, [g1+64]
+
+ vinserti128 y2, y0, x1, 1 ; Keep a copy for filtering.
+
+ vpaddw y1, y0 ; Add them together.
+
+ vpalignr y0, y1, 8 ; At each step, fold the register in 2...
+ vpaddw y1, y0 ; ... then add each value together.
+
+ vpalignr y0, y1, 4
+ vpaddw y1, y0
+
+ vpalignr y0, y1, 2
+ vpaddw y1, y0
+
+ vmovd x0, [pat_w_2048]
+ vpmulhrsw y1, y1, y0 ; Round.
+
+ vpbroadcastb y1, x1 ; Replicate the value.
+ vmovdqa y0, y1
+
+ and g3, 1
+ jz .SKIP_FILTER
+
+ ; 3 cases:
+ ; - Top-left = 2*base + top + left.
+ ; - Top = 3*base + top.
+ ; - Left = 3*base + left.
+
+ movd g2d, x1 ; Extract base.
+ and g2, 0xff
+
+ lea g3, [3*g2+2] ; Base * 3 + rounding bias.
+ movd x3, g3d
+ vpbroadcastw y3, x3 ; Broadcast base * 3 + rounding bias.
+
+ movzx g3, byte [g1+64] ; Load the first top and left value.
+ movzx ga, byte [g1+63]
+
+ vpaddw y2, y3 ; 3 * Base + neighbours + rounding bias.
+ vpsrlw y2, 2 ; Divide by 4.
+
+ vpackuswb y2, y2 ; Word to byte.
+
+ vpblendd y0, y2, y0, 0xfc ; Save in top row.
+
+ vpermq y2, y2, 0b10_10_10_10 ; Broadcast left column.
+
+ vmovdqu y3, [ang_hor_8]
+ vpbroadcastq y5, [pat_q_255]
+
+ vpshufb y4, y2, y3 ; Replicate 8x the 4 lower values.
+ vpsrldq y2, y2, 4 ; Shift by 4 to do the 4 last rows.
+ vpblendvb y1, y1, y4, y5 ; Blend only the first value of each row.
+
+ vpshufb y4, y2, y3 ; Replicate 8x the 4 lower values.
+ vpblendvb y0, y0, y4, y5 ; Blend only the first value of each row.
+
+ ; Do top-left.
+ add g3, ga ; Top + left.
+ lea g2, [2*g2+g3+2] ; Top + left + 2*base + bias.
+ shr g2, 2 ; Get the average.
+
+ vmovdqa y2, y0
+ vpinsrb x2, g2b, 0
+ vinserti128 y0, y0, x2, 0
+
+ .SKIP_FILTER:
+
+ vmovdqu [g0], y0
+ vmovdqu [g0+0x20], y1 ; Save the value.
+
+ RET
+
+
+; Intra planar 8x8.
+DEFFUN f265_lbd_predict_intra_planar_8_avx2, ia=4, at=8844, ti=0, tv=8, ym=1
+    ; value = ((8-x-1)*left + (8-y-1)*top + (x+1)*top_right + (y+1)*bottom_left + 8) >> 4.
+
+ vmovd x6, [g1+56-1] ; Load & broadcast bottom-left.
+ vpbroadcastb x6, x6
+
+ vpmovzxbw x1, [g1+64] ; Load top neighbours.
+ vpmovzxbw x6, x6
+
+ vmovq x7, [g1+72] ; Load & broadcast top right.
+ vpbroadcastb y7, x7
+
+ vpbroadcastd y0, [pat_b_0_1] ; Weight distribution pattern.
+
+ vpsllw y2, y1, 3 ; Top row * 8.
+ vpsubw y1, y6 ; Row delta (top neighbour - bottom-left).
+
+ vpsubw y2, y1 ; Top row * 7 + bottom-left.
+
+ vpsllw y3, y1, 1 ;
+ vpsubw y6, y2, y3 ; Top row * 5 + 3*bottom-left.
+ vinserti128 y2, y2, x6, 1 ; Get row 2 values.
+ vinserti128 y1, y1, x1, 1 ; Double the vertical delta removed at each line.
+
+ ; Register usage:
+ ; - y1: row delta.
+ ; - y2: row sum.
+
+ vpbroadcastq y3, [g1+64-8] ; Load left column.
+ vpunpcklbw y3, y7 ; Merge top right with left col.
+ vpshufb y3, [planar_8_left] ; Shuffle to do 2 columns at a time.
+
+ vbroadcasti128 y4, [planar_wgt_hor] ; Load weights.
+ vpbroadcastd y5, [pat_w_2048] ; Load rounding bias.
+
+ ; Register usage:
+ ; - y0: weight distribution pattern.
+ ; - y1: row vertical delta.
+ ; - y2: row vertical sum.
+ ; - y3: column values.
+ ; - y4: column weights.
+ ; - y5: rounding bias.
+
+ %macro DO_ROW 2 ; %1: alignment offset, %2: destination register.
+ %if %1 != 0
+ vpsubw y2, y1 ; Add delta to row sum.
+ vpalignr y%2, y3, %1*2 ; Offset column.
+ vpshufb y%2, y%2, y0 ; Repeat the column.
+ %else
+ vpshufb y%2, y3, y0 ; Repeat the column.
+ %endif
+
+ vpmaddubsw y%2, y4 ; Get the sum of all factors.
+ vpaddusw y%2, y2 ; Add vertical.
+ vpmulhrsw y%2, y5 ; Round.
+ %endmacro
+
+ DO_ROW 0, 6 ; Do row 0 and 2.
+ DO_ROW 1, 7 ; Do row 1 and 3.
+
+ vpackuswb y6, y7
+ vmovdqu [g0], y6
+
+ vpsubw y2, y1 ; Add offset to row value.
+ vpsubw y2, y1 ;
+
+ DO_ROW 2, 6 ; Do row 4 and 6.
+ DO_ROW 3, 7 ; Do row 5 and 7.
+
+ vpackuswb y6, y7
+ vmovdqu [g0+0x20], y6
+ %unmacro DO_ROW 2
+ RET
+
+
+; Intra pure diagonal bottom-left 8x8.
+DEFFUN f265_lbd_predict_intra_dia_bot_left_8_avx2, ia=4, at=8844, ti=0, tv=3, ym=1
+
+ vmovdqu x0, [g1+48] ; Load all data.
+ vpshufb y0, [dia_bot_left_8] ; Re-order it.
+
+ vpalignr y1, y0, 2 ; Offset the pixels in the high lane to build rows 2 and 3.
+ vinserti128 y0, y0, x1, 1 ;
+
+ vpalignr y1, y0, 1 ; Create row 1 and 3.
+ vpunpcklqdq y2, y0, y1 ; Merge them with rows 0 and 2.
+ vmovdqu [g0], y2 ; Save row 0 to 3.
+
+ vpalignr y1, y0, 5 ; Offset to generate rows 4 to 7.
+
+ vpalignr y0, y0, 4 ; Repeat operation above for rows 4 to 7.
+ vpunpcklqdq y2, y0, y1 ;
+ vmovdqu [g0+0x20], y2 ;
+ RET
+
+
+; Do horizontal prediction on a single row.
+; Input:
+; - y0: weights.
+; - y1: neighbours offset pattern.
+; - y2: rounding bias.
+; - y5: neighbour offset increment for each row.
+; - y6: left neighbours.
+; - %1: row offset.
+; - %2: output register.
+%macro DO_ROW 2
+ %if %1 != 0
+ vpsubb y1, y5 ; Update neighbours offset.
+ %endif
+
+ vpshufb y%2, y6, y1 ; Generate neighbour pair.
+
+ ; Calculate row values.
+ vpmaddubsw y%2, y%2, y0 ; Multiply with weight.
+ vpmulhrsw y%2, y%2, y2 ; Round.
+%endmacro
+
+; Predict intra from left neighbours.
+; Input:
+; - y0: weights. (for DO_ROW)
+; - y1: neighbours offset pattern.
+; - y6: left neighbours.
+; Register usage:
+; - y2: rounding bias.
+; - y3: temp.
+; - y4: temp.
+; - y5: neighbour offset increment for each row.
+%macro PRED_LEFT 0
+ ; Load patterns.
+ vpbroadcastd y2, [pat_w_1024] ; Load rounding bias.
+
+ ; Calculate the offset for the high lane.
+ vpbroadcastd y5, [pat_b_1] ; Load neighbour position offsets.
+ vpsubb y3, y1, y5 ; Pre-offset by 2 the neighbour position.
+ vpsubb y3, y3, y5 ; Will be used to calculate 2 rows at once.
+ vinserti128 y1, y1, x3, 1 ; Put the offsetted load pattern on the high lane.
+
+ DO_ROW 0, 3 ; Do row 0 and 2.
+ DO_ROW 1, 4 ; Do row 1 and 3.
+
+ vpackuswb y3, y4 ; Merge value.
+ vmovdqu [g0+0x00], y3 ; Save result.
+
+ vpsubb y1, y5 ; Skip from rows 1|3 to rows 4|6.
+ vpsubb y1, y5
+
+ DO_ROW 4, 3 ; Do row 4 and 6.
+ DO_ROW 5, 4 ; Do row 5 and 7.
+
+ vpackuswb y3, y4 ; Merge value.
+ vmovdqu [g0+0x20], y3 ; Save result.
+%endmacro
+
+; Generate offset and weight for intra left prediction.
+; Input:
+; - g2: mode.
+; - %1: 1 for hor top. 0 for hor bottom.
+; Register usage:
+; - ga: temp.
+; - y2: temp.
+; Output:
+; - y0: weights.
+; - y1: neighbours offset pattern.
+%macro GEN_LEFT_WEIGHT 1
+ ; Generate weight and offset.
+ lea ga, [intra_angle]
+ %if %1
+ vpbroadcastd y1, [ga+g2*4-11*4] ; Load angle factor.
+ neg g2 ; Get the angle's inversed offset.
+ %else
+ neg g2 ; Get the angle's inversed offset.
+ vpbroadcastd y1, [ga+g2*4+9*4] ; Load angle factor.
+ %endif
+ vbroadcasti128 y2, [angle_mul_hor] ; Load multiplication table.
+ vpmaddubsw y1, y1, y2 ; Result in offset and weight for each column.
+
+ ; Generate weight.
+ vpbroadcastd y2, [pat_w_31] ; Load weight mask.
+ vpand y2, y2, y1 ; Extract weight.
+
+ ; Generate weight pairs.
+ vpbroadcastd y0, [pat_w_32] ; Load weight complement base.
+ vpsubw y0, y2 ; Get weight complements.
+
+ vpackuswb y2, y2, y2 ; Word to byte.
+ vpackuswb y0, y0, y0 ; Word to byte.
+ %if %1
+ vpunpcklbw y0, y0, y2 ; Make the pair. Final weight.
+ %else
+ vpunpcklbw y0, y2, y0 ; Make the pair. Final weight.
+ %endif
+
+ ; Generate offsets.
+ vpsrlw y1, y1, 5 ; Extract neighbour offset.
+ vpsllw y2, y1, 8 ; Double the offset (twice for each pair).
+ vpor y1, y2
+
+ %if %1
+ vpbroadcastd y2, [pat_b_7_8] ; Load base offset pattern.
+ vpaddw y1, y2, y1 ; Add offset with base. Result in actual neighbour position.
+ %else
+ vpbroadcastd y2, [pat_b_14_15] ; Load base offset pattern.
+ vpsubw y1, y2, y1 ; Add the angle offset to the base offset.
+ %endif
+%endmacro
+
+
+; Intra angular horizontal bottom 8x8.
+DEFFUN f265_lbd_predict_intra_hor_bot_8_avx2, ia=4, at=8844, ti=0, tv=7, ym=1
+ vbroadcasti128 y6, [g1+48] ; Load left column.
+
+ GEN_LEFT_WEIGHT 0
+ PRED_LEFT
+ RET
+
+
+; Intra pure horizontal 8x8.
+DEFFUN f265_lbd_predict_intra_hor_8_avx2, ia=4, at=8844, ti=0, tv=4, ym=1
+ vmovdqu y0, [ang_hor_8] ; Load shuffle mask.
+
+ vpbroadcastd y1, [g1+63-3] ; Load the first 4 rows.
+ vpbroadcastd y2, [g1+63-7] ; Load the second 4 rows.
+
+ vpshufb y1, y1, y0 ; Replicate 8 times each value.
+ vpshufb y2, y2, y0 ;
+
+ and g3, 1
+ jz .SKIP_FILTER
+
+ vpmovzxbw x0, [g1+64] ; Load top row.
+ vmovd x3, [g1+128] ; Load & broadcast top-left.
+ vpbroadcastb x3, x3
+
+ vpmovzxbw x3, x3 ; Byte to word.
+
+ vpsubw x0, x3 ; top - top-left.
+ vpsraw x0, 1 ; (top - top-left)/2.
+
+ vmovd x3, [g1+63] ; Load left.
+ vpbroadcastb x3, x3
+ vpmovzxbw x3, x3 ; Byte to word.
+ vpaddw x0, x3 ; Left + (top - top-left)/2.
+
+ vpxor x3, x3 ; Replace negative values by 0.
+ vpmaxsw x0, x3 ;
+
+ vpackuswb x0, x0 ; Word to byte with unsigned saturation.
+
+ vpblendd y1, y0, y1, 0xfc ; Update the first 8 bytes.
+
+ .SKIP_FILTER:
+ vmovdqu [g0], y1 ; Save it.
+ vmovdqu [g0+0x20], y2 ;
+
+ RET
+
+
+; Intra angular horizontal top 8x8.
+DEFFUN f265_lbd_predict_intra_hor_top_8_avx2, ia=4, at=8844, ti=0, tv=7, ym=1
+ GEN_LEFT_WEIGHT 1
+
+ vmovdqu x5, [g1+64] ; Load top neighbour.
+ vpalignr x5, x5, x5, 15
+ vpinsrb x5, [g1+128], 0 ; Insert the top-left neighbour.
+
+ ; Import top neighbour with the left ones.
+ lea g3, [inv_angle_8]
+ vpbroadcastd y4, [g3+g2*4+18*4] ; Load the inversed angle values.
+ vmovdqu x3, [angle_inv_mul_hor] ; Load the weight values.
+    vpmaddubsw     y4, y4, y3              ; Get the weight. Some neighbours will have invalid offsets.
+ ; Since we never read them, it's ok.
+ vpbroadcastd y3, [pat_w_8] ; Load inversed angle bias.
+ vpaddw y4, y3 ; Add inversed angle bias.
+ vpsraw y4, 4 ; Get inversed neighbour offset.
+ vpackuswb y4, y4 ; Word to byte.
+ vpshufb y5, y4 ; Re-order left neighbours.
+
+ ; Load patterns.
+ vmovq x4, [g1+56] ; Load left data.
+ vpblendd y5, y4, y5, 0xfc ; Blend left neighbours with top neighbours.
+ vinserti128 y6, y5, x5, 1 ; Double data.
+
+ PRED_LEFT
+ RET
+
+%unmacro GEN_LEFT_WEIGHT 1
+%unmacro PRED_LEFT 0
+%unmacro DO_ROW 2
+
+
+; Intra pure diagonal top left 8x8.
+DEFFUN f265_lbd_predict_intra_dia_top_left_8_avx2, ia=4, at=8844, ti=0, tv=3, ym=1
+ vmovq x0, [g1+64-7] ; Load top row.
+ vmovhps x0, [g1+64] ; Load left row.
+ vpinsrb x0, [g1+128], 7 ; Load & insert top-left.
+
+ vpalignr y1, y0, 2 ; Offset the pixels in the high lane to build rows 6 and 7.
+ vinserti128 y0, y1, x0, 1 ;
+
+ vpalignr y1, y0, 1 ; Create row 5 and 7.
+ vpunpcklqdq y2, y1, y0 ; Merge them with rows 4 and 6.
+ vmovdqu [g0+0x20], y2 ;
+
+ vpalignr y0, y0, 4 ; Offset to generate rows 0 to 3.
+
+ vpalignr y1, y0, 1 ; Repeat operation above for row 0 to 3.
+ vpunpcklqdq y2, y1, y0 ;
+ vmovdqu [g0], y2 ; Save rows 0 to 3.
+ RET
+
+
+; Generate vertical intra prediction of 2 rows.
+; Input:
+; - %1: row index.
+; - %3: 0 for vertical right. 1 for vertical left.
+; - y0: rounding bias.
+; - y1: broadcast weights patterns.
+; - y2: pre-calculated weights.
+; - y5: temp.
+; - y6: base offset pattern
+; - y7: angle sum.
+; - y8: angle increment.
+; - y9: shift mask.
+; - y10: neighbours.
+; Output:
+; - %2: predicted row.
+%macro DO_ROW 3
+ %if %1 != 0
+ vpaddb y7, y8 ; Add the angle to the current angle sum. Generate the offset.
+ %endif
+
+ ; Generate the neighbours pairs.
+ vpsrlw y%2, y7, 5 ; Generate neighbour offset.
+ vpand y%2, y9 ; Shift can only be on word or greater value. Mask to simulate byte shift.
+ %if %3
+ vpsubb y%2, y6, y%2, ; Generate pair offset.
+ %else
+ vpaddb y%2, y6 ; Add offset to pairing mask.
+ %endif
+ vpshufb y%2, y10, y%2 ; Generate pair.
+
+ ; Broadcast the current weights.
+ %if %1 != 0
+ vpalignr y5, y2, %1*2 ; Get weights.
+ vpshufb y5, y1 ; Broadcast weights.
+ %else
+ vpshufb y5, y2, y1 ; Broadcast weights.
+ %endif
+
+ ; Calculates row predictions.
+ vpmaddubsw y%2, y%2, y5 ; Multiply values with weight.
+ vpmulhrsw y%2, y%2, y0 ; Round.
+%endmacro
+
+; Input:
+; - %1: 0 for vertical right. 1 for vertical left.
+; - g0: Result array.
+; - y10: Top row. Replicated.
+; Register usage :
+; - y0: Rounding bias.
+; - y1: Word replication pattern.
+; - y2: Weights, distributed to do 2 rows at a time.
+; - y3: 2 rows of results [0|2].
+; - y4: 2 rows of results [1|3].
+; - y5: Temp.
+; - y6: Generate pair.
+; - y7: Angle sum. Used to generate the offset.
+; - y8: Angle value. Add it to the sum at each row.
+; - y9: "Word shift as byte shift" mask pattern.
+%macro ANG_VERTICAL_PRED 1
+ ; Calculate the angle offset base.
+ lea g3, [intra_angle]
+ %if %1
+ neg g2
+ vpbroadcastd y8, [g3+g2*4+25*4] ; Load angle factor.
+ %else
+ vpbroadcastd y8, [g3+g2*4-27*4] ; Load angle factor.
+ %endif
+
+ vpmaddubsw y7, y8, [triple_last_lane] ; Multiply high lane by 3. Offset required to do 2 rows at a time.
+ vpackuswb y7, y7 ; This is the angle sum for each row.
+
+ ; Calculate the weight.
+ %if %1
+ vmovdqu y3, [angle_mul_ver] ; Load multiplication table.
+ vpmaddubsw y2, y3, y8 ; Offset and weight for all rows.
+ %else
+ vpmaddubsw y2, y8, [angle_mul_ver] ; Offset and weight for all rows.
+ %endif
+ vpbroadcastd y3, [pat_w_31] ; Load mask.
+ vpand y3, y3, y2 ; Weight.
+
+ vpbroadcastd y4, [pat_w_32] ; Load weights complement base.
+ vpsubw y4, y3 ; Get the weight complement.
+ %if %1
+ vpunpcklbw y2, y3, y4 ; Make the pair. Final weight.
+ %else
+ vpunpcklbw y2, y4, y3 ; Make the pair. Final weight.
+ %endif
+
+ ; Load patterns.
+ %if %1
+ vbroadcasti128 y6, [pair_high] ; Load pair making pattern.
+ %else
+ vbroadcasti128 y6, [pair_low] ; Load pair making pattern.
+ %endif
+ vpbroadcastd y0, [pat_w_1024] ; Load rounding bias.
+ vpbroadcastd y1, [pat_b_0_1] ; Load weight distribution pattern.
+ vpbroadcastd y9, [pat_b_7] ; Load "word shift as byte shift" mask pattern.
+
+ DO_ROW 0, 3, %1 ; Do row 0 and 2.
+ DO_ROW 2, 4, %1 ; Do row 1 and 3.
+
+ vpackuswb y3, y4 ; Merge value.
+ vmovdqu [g0+0x00], y3 ; Save result.
+
+ vpaddb y7, y8 ; Skip from rows 1|3 to rows 4|6.
+ vpaddb y7, y8
+
+ DO_ROW 4, 3, %1 ; Do row 4 and 6.
+ DO_ROW 6, 4, %1 ; Do row 5 and 7.
+
+ vpackuswb y3, y4 ; Merge value.
+ vmovdqu [g0+0x20], y3 ; Save result.
+%endmacro
+
+
+; Intra angular vertical left 8x8.
+DEFFUN f265_lbd_predict_intra_ver_left_8_avx2, ia=4, at=8844, ti=0, tv=11, ym=1
+ vmovq x0, [g1+64-8] ; Load top and left data.
+ vpinsrb x0, [g1+128], 8 ; Load top-left.
+
+ ; Re-order the left neighbours.
+ lea g3, [inv_angle_8]
+ vpbroadcastd y2, [g3+g2*4-18*4] ; Load the inversed angle values.
+ vmovdqu x3, [angle_inv_mul_ver] ; Load the inversed weight values.
+ vpmaddubsw y2, y2, y3 ; Get the weight. Some neighbour will have an invalid offset.
+ ; Since we never use them, it's ok.
+ vpbroadcastd y3, [pat_w_8] ; Load inversed angle bias.
+ vpaddw y2, y3 ; Add inversed angle bias.
+ vpsraw y2, 4 ; Get inversed neighbour offset.
+ vpsubb y2, y3, y2 ; Invert the index.
+ vpackuswb y2, y2 ; Word to byte.
+ vpshufb y0, y2 ; Re-order left neighbours.
+
+ ; Blend re-ordered neighbours with the top neighbours.
+ vmovhps x0, [g1+64]
+ vinserti128 y10, y0, x0, 1 ; Double top row.
+
+ ANG_VERTICAL_PRED 1
+ RET
+
+
+; Intra pure vertical 8x8.
+DEFFUN f265_lbd_predict_intra_ver_8_avx2, ia=4, at=8844, ti=0, tv=6, ym=1
+ vpbroadcastq y0, [g1+64] ; Copy the top neighbours 4 times. Holds row 0 to 3.
+ vmovdqa y4, y0 ; Copy it. Holds row 4 to 7.
+
+ and g3, 1
+ jz .SKIP_FILTER
+
+ vmovd x3, [g1+128] ; Load top-left.
+ vpbroadcastb x3, x3
+ vpmovzxbw x2, [g1+64-8] ; Load left neighbours.
+ vpbroadcastb x1, x0 ; Broadcast the first top neighbour.
+
+ vpmovzxbw x3, x3 ; Byte to word.
+ vpmovzxbw x1, x1
+
+ vpsubw x2, x3 ; Left - top-left.
+ vpsraw x2, 1 ; Signed divide by 2.
+ vpaddw x2, x1 ; Top + (left - top-left)/2.
+
+ vpxor x3, x3
+ vpmaxsw x2, x3 ; Clip negative value to 0.
+ vpackuswb x2, x2 ; Word to byte with unsigned saturation.
+ vinserti128 y2, x2, 1 ; Double the data.
+
+ vmovdqu y3, [ang_hor_8] ; Load replication pattern.
+ vpbroadcastq y1, [pat_q_255] ; Pattern that blends in a word out of 8.
+
+ vpshufb y5, y2, y3 ; Replicate 8x the 4 lower values.
+
+ vpsrldq y2, y2, 4 ; Shift by 4 to do the 4 last rows.
+
+ vpblendvb y4, y5, y1 ; Blend only the first value of each row.
+
+ vpshufb y5, y2, y3 ; Replicate 8x the 4 lower value.
+ vpblendvb y0, y5, y1 ; Blend only the first value of each row.
+
+ .SKIP_FILTER:
+ vmovdqu [g0+0x00], y0 ; Save it.
+ vmovdqu [g0+0x20], y4 ;
+
+ RET
+
+
+; Intra angular vertical right 8x8.
+DEFFUN f265_lbd_predict_intra_ver_right_8_avx2, ia=4, at=8844, ti=0, tv=11, ym=1
+
+ vbroadcasti128 y10, [g1+64] ; Load top row.
+
+ ANG_VERTICAL_PRED 0
+ RET
+
+%unmacro DO_ROW 3
+%unmacro ANG_VERTICAL_PRED 1
+
+
+; Intra angular top right 8x8.
+DEFFUN f265_lbd_predict_intra_dia_top_right_8_avx2, ia=4, at=8844, ti=0, tv=3, ym=1
+ vmovdqu x0, [g1+64] ; Load all data.
+
+ vpalignr y1, y0, 3 ; Offset the pixels in the high lane to build rows 2 and 3.
+ vpalignr y0, y0, 1 ;
+ vinserti128 y0, y0, x1, 1 ; Push offsetted value in high lane.
+
+ vpalignr y1, y0, 1 ; Create rows 1 and 3.
+ vpunpcklqdq y2, y0, y1 ; Merge them with rows 0 and 2.
+ vmovdqu [g0], y2 ; Save rows 0 to 3.
+
+ vpalignr y1, y0, 5 ; Offset to generate rows 4 to 7.
+
+ vpalignr y0, y0, 4 ; Repeat operation above for rows 4 to 7.
+ vpunpcklqdq y2, y0, y1 ;
+ vmovdqu [g0+0x20], y2 ;
+
+ RET
+
+
+; Extract and filter neighbours for intra prediction.
+;
+; Input format:
+; EAABB
+; C
+; C
+; D
+; D
+;
+; Output format:
+; padding [48] [64] padding [128]
+; [ ... DDCC AACC ... E]
+;
+; Input parameters:
+; - g0: nbuf[2][160].
+; - g1: pred.
+; - g2: pred_stride.
+; - g3: avail[2].
+; - g4: filter_flag.
+; - g5: packed (Ignored).
+DEFFUN f265_lbd_extract_intra_neigh_8_avx2, ia=6, at=884844, ti=1, tv=8, ym=1
+ ; Load availability.
+ movzx g5, byte [g3] ; Load availx.
+ movzx g6, byte [g3+4] ; Load availy.
+
+ ; Test for special case: no left neighbours.
+ cmp g6, 0
+ jz .LEFT_NOT_AVAILABLE
+
+ ; Left neighbours are available.
+
+ ; Get C from the prediction buffer.
+ ; Pseudo-code:
+ ; - Load & broadcast as dword the left neighbour of each row.
+ ; - Blend the rows together.
+ ; - Keep in mind the order needs to be reversed.
+
+ ; Get 4 left neighbours.
+ ; Input:
+ ; - %1: the xmm register in which to save the value,
+ ; - %2: temp.
+ ; - %3: temp.
+ ; - ga: first row address. Must be aligned on the dword left of the row.
+ ; - g2: pred_stride.
+ ; - g3: 3*pred_stride.
+ %macro load 3
+ vpbroadcastd %1, [ga] ; Load & broadcast the left neighbour.
+ vpbroadcastd %2, [ga+g2] ; Load & broadcast the next left neighbour.
+ vpblendd %1, %1, %2, 0b0101_0101 ; Mix even and odd row: result 1 0 1 0.
+
+ vpbroadcastd %2, [ga+g2*2] ; Load & broadcast the next left neighbour.
+ vpbroadcastd %3, [ga+g3] ; Load & broadcast the next left neighbour.
+ vpblendd %2, %2, %3, 0b0101_0101 ; Mix even and odd row: result 3 2 3 2.
+
+ vpblendd %1, %1, %2, 0b0011_0011 ; Mix 1 0 and 3 2. Result 3 2 1 0.
+ vpshufb %1, x7 ; Keep the last byte of each dword.
+ %endmacro
+
+ vpbroadcastd x7, [neigh_last_b_of_d] ; Load shuffle mask.
+
+ lea ga, [g1-4]
+ lea g3, [g2*3]
+ load x0, x1, x2 ; Load C0 to C3.
+
+ lea ga, [ga+g2*4]
+ load x3, x1, x2 ; Load C4 to C7.
+
+ vpblendd x0, x0, x3, 0b01010101 ; Get C7..C0.
+
+ ; Special case: no top neighbours.
+ cmp g5, 0
+ jz .TOP_NOT_AVAILABLE
+
+ ; Load top (A and B) neighbour from pred.
+ mov ga, g2
+ neg ga ; Move up 1 row (negative pred_stride).
+ vmovdqu x1, [g1+ga] ; Load A|B from prediction.
+ vmovd x2, [g1+ga-1] ; Load top-left (E).
+
+ .LEFT_AND_TOP_FETCHED:
+
+ ; Test if bottom-left is available.
+ cmp g6, 8
+ ja .BOTTOM_AVAILABLE
+
+ ; Bottom-left not available.
+ vpshufb x0, [neig_bl_unav_8] ; Expand the last value.
+
+ .BOTTOM_FETCHED:
+
+ vmovdqu [g0+48], x0 ; Save partial top and left to allow easy byte extraction.
+ vmovdqu [g0+64], x1
+
+ movd x3, g5d
+ vpbroadcastb x3, x3
+ vpcmpgtb x3, [pat_b_0_to_16]
+ vmovd x4, [g0+63+g5] ; Broadcast the last available block.
+ vpbroadcastb x4, x4
+ vpblendvb x1, x4, x1, x3 ; Replace (blend) invalid value with the broadcasted last valid values.
+
+ vmovdqu [g0+48], x0 ; Save values.
+ vmovdqu [g0+64], x1
+ vmovdqu [g0+128], x2
+
+ ; Filter only if required.
+ cmp g4, 0
+ je .END
+
+ ; Pseudo code:
+ ; Register ordering : D7, D6 ... D0, C7, ... C0, E, A0, ..., A7, B0, ... B6, B7.
+ ; V[i] = (V[i-1] + 2*V[i] + V[i+1] + 2) >> 2
+ ; D7 = D7, B7 = B7
+
+ vpbroadcastd y6, [pat_b_1] ; Load pmadd pattern (actually, just an add and zero extend).
+ vpbroadcastd y5, [pat_w_8192] ; Load rounding bias.
+
+ vpslldq x4, x2, 15 ; Move the top-left (e) to the last byte of the xmm register.
+ vpalignr x3, x2, x0, 1 ; Remove D7 and insert E next to C0.
+ ; All bytes are shifted by one. Named D|C*.
+ vpalignr x4, x1, x4, 15 ; Remove B7 and insert E next to A0.
+ ; All bytes are shifted by one. Named A|B*.
+
+ vinserti128 y0, y0, x1, 1 ; Pack D|C with A|B.
+ vinserti128 y3, y3, x4, 1 ; Pack D|C* with A|B*.
+
+ vpmaddubsw y0, y0, y6 ; Add the neighbours together.
+ vpmaddubsw y3, y3, y6 ; As D|C|AB* is DC|A|B offsetted by one byte, this will generate all
+ ; D|C and A|B peer. The innermost value of D|C|A|B* will be C0+E and E+A0.
+
+ vpaddw y1, y0, y3 ; Add D|C|A|B to D|C|A|B*.
+ vpmulhrsw y1, y1, y5 ; Round.
+
+ vpshufb y3, [neigh_shift_pair] ;
+ vpaddw y0, y3, y0 ; Generate the missing pair sums.
+ vpmulhrsw y0, y0, y5 ; Round.
+
+ vpackuswb y0, y0, y1 ; Word to byte.
+ vpshufb y0, [neigh_1_of_2] ; Interleave the result.
+
+ vextracti128 x1, y0, 1
+
+ vpinsrb x0, [g0+48], 0 ; Manage D7.
+ vmovdqu [g0+160+48], x0 ; Save it.
+
+ vpinsrb x1, [g0+79], 15 ; Manage B7.
+ vmovdqu [g0+160+64], x1 ; Save it.
+
+ ; Filter top-left.
+ movzx g2, byte [g0+128] ; Load top-left.
+ movzx g3, byte [g0+63] ; Load top.
+ movzx g4, byte [g0+64] ; Load left.
+ lea g2, [g2*2+g3+2] ; Top-left * 2 + top + bias.
+ add g2, g4 ; Top-left * 2 + top + left + bias.
+ shr g2, 2 ; Round.
+ mov [g0+160+128], g2b ; Save filtered top-left.
+
+ .END:
+ RET
+
+
+ .LEFT_NOT_AVAILABLE:
+
+ ; Test if top is available.
+ cmp g5, 0
+ jz .NOTHING_AVAILABLE
+
+ mov ga, g2
+ neg ga
+ vmovdqu x1, [g1+ga] ; Load top value
+ vpbroadcastb x0, x1 ; Broadcast the first byte as the left value.
+ vmovdqa x2, x1 ; Set top-left.
+ jmp .LEFT_AND_TOP_FETCHED
+
+
+ .TOP_NOT_AVAILABLE:
+
+ vpbroadcastd x2, [pat_b_15]
+ vpshufb x1, x0, x2 ; Replicate C0 as the top neighbours.
+ vmovdqa x2, x1 ; Set top-left.
+ jmp .LEFT_AND_TOP_FETCHED
+
+
+ .BOTTOM_AVAILABLE:
+
+ ; Get D from the pred buffer.
+ lea ga, [g1-4+8*g2]
+ load x3, x4, x5
+
+ lea ga, [ga+g2*4]
+ load x4, x5, x6
+
+ vpblendd x3, x3, x4, 0b0101
+
+ ; Merge C and D.
+ vpblendd x0, x0, x3, 0b0011 ; [f e d c b a 9 8 7 6 5 4 3 2 1 0].
+
+ cmp g6, 12
+ ja .BOTTOM_FETCHED
+
+ ; Broadcast the 5th value over the invalid neighbours.
+ vpalignr x3, x0, x0, 4
+ vpbroadcastb x3, x3
+ vpblendd x0, x0, x3, 0b0001
+ jmp .BOTTOM_FETCHED
+
+
+ .NOTHING_AVAILABLE:
+
+ vpbroadcastd y0, [pat_b_128] ; Store 128 everywhere.
+
+ vmovdqu [g0+48], x0 ; Save it.
+ vmovdqu [g0+64], x0
+ vmovd [g0+128], x0
+
+ vmovdqu [g0+160+48], x0 ; Save the filtered version.
+ vmovdqu [g0+160+64], x0
+ vmovd [g0+160+128], x0
+ RET
+
+ %unmacro load 3
diff --git a/f265/bdi.h b/f265/bdi.h
index 9d2e4ce..c8fab2c 100644
--- a/f265/bdi.h
+++ b/f265/bdi.h
@@ -502,6 +502,7 @@ extern const int16_t f265_lambdas[52];
extern uint16_t f265_mv_costs[52][F265_NB_MV_COSTS];
extern const int8_t f265_hpel_src0[16];
extern const int8_t f265_hpel_src1[16];
+extern const int8_t f265_mode_to_intra_pred[35];
///////////////////////////////////////////////////////////////////////////////
diff --git a/f265/bdi_ro.c b/f265/bdi_ro.c
index ea26a9e..a771aaa 100644
--- a/f265/bdi_ro.c
+++ b/f265/bdi_ro.c
@@ -608,3 +608,6 @@ const int16_t f265_lambdas[52] =
const int8_t f265_hpel_src0[16] = { 0, 0, 1, 1, 0, 1, 1, 1, 2, 2, 3, 3, 0, 1, 1, 1 };
const int8_t f265_hpel_src1[16] = { 9, 1, 9, 0, 2, 2, 3, 2, 9, 3, 9, 2, 2, 2, 3, 2 };
+// Map intra prediction mode to optimized assembly function.
+const int8_t f265_mode_to_intra_pred[35] = {0, 1, 2, 3,3,3,3,3,3,3, 4, 5,5,5,5,5,5,5, 6, 7,7,7,7,7,7,7,
+ 8, 9,9,9,9,9,9,9, 10};
diff --git a/f265/enc.h b/f265/enc.h
index 2711f8e..367bea7 100644
--- a/f265/enc.h
+++ b/f265/enc.h
@@ -798,7 +798,14 @@ typedef struct f265_intra_block
int8_t intra_dist_mode;
// Unfiltered/filtered neighbours of the current partition.
- F265_ALIGN64 f265_pix neighbours[2][129];
+ // Layout:
+ // - [0-63] : Bottom left and left neighbours, packed near index 63
+ // (index 63 is always the topmost left neighbour).
+ // - [64-127] : Top and top left neighbours, packed near index 64.
+ // (index 64 is always the leftmost top neighbour).
+ // - [128] : Top-left neighbours
+ // - [129-159] : Alignment padding.
+ F265_ALIGN64 f265_pix neighbours[2][160];
} f265_intra_block;
@@ -2837,17 +2844,13 @@ void fenc_mc_chroma_b(f265_enc_thread *t, f265_pix *dst, int dst_stride, f265_re
int packed_dims, int plane_off, int comp);
// intra.c
-void fenc_get_intra_filter_flags(int *filter_edge_flag, int *filter_neighbour_flag, int *neighbour_bilinear_flag,
- int comp, int lg_bs, int mode, int smooth_intra_flag);
+void fenc_get_intra_filter_flags(int *filter_edge_flag, int *filter_neighbour_flag,
+ int comp, int lg_bs, int mode);
void fenc_get_intra_encode_flags(int *dst_flag, int *order, int comp, int lg_bs, int mode);
-void fenc_predict_intra_neighbours(f265_enc_thread *t, f265_pix dst[129], int rec_flag,
- int comp, int bs, int ct_ox, int ct_oy);
-void fenc_filter_intra_neighbours(f265_pix *dst, f265_pix *src, int bs, int bd, int bilinear_flag);
-void fenc_predict_intra_planar(f265_pix *dst, f265_pix *nbuf, int lg_bs);
-void fenc_predict_intra_dc(f265_pix *dst, f265_pix *nbuf, int lg_bs, int filter_edge_flag);
-void fenc_predict_intra_angular(f265_pix *dst, f265_pix *nbuf, int lg_bs, int bd, int filter_edge_flag, int mode);
+void fenc_extract_intra_neighbours(f265_enc_thread *t, f265_pix dst[2][160], int ct_off[2],
+ int filter, int packed, int lg_bs);
void fenc_predict_intra_mode(f265_pix *dst, f265_pix *neighbours, int lg_bs, int bd, int mode, int filter_edge_flag);
-void fenc_predict_intra(f265_enc_thread *t, f265_pix *dst, int comp, int lg_bs, int mode, int ct_ox, int ct_oy);
+void fenc_predict_intra_block(f265_enc_thread *t, f265_pix *dst, int comp, int lg_bs, int mode, int ct_ox, int ct_oy);
void fenc_get_intra_pred_mode(f265_enc_thread *t, f265_cb *cb, int partition_idx, int *mpm_list);
// inter.h
diff --git a/f265/intra.c b/f265/intra.c
index d512e2b..d17e3cf 100644
--- a/f265/intra.c
+++ b/f265/intra.c
@@ -45,13 +45,12 @@ static finline void fenc_get_intra_nb_avail(f265_enc_thread *t, int avail[2], in
// - filter_edge_flag: true if the edges are filtered for DC/vertical/horizontal.
// - filter_neighbour_flag: true if the neighbours are filtered.
// - neighbour_bilinear_flag: true if the neighbours are filtered bilinearly.
-void fenc_get_intra_filter_flags(int *filter_edge_flag, int *filter_neighbour_flag, int *neighbour_bilinear_flag,
- int comp, int lg_bs, int mode, int smooth_intra_flag)
+void fenc_get_intra_filter_flags(int *filter_edge_flag, int *filter_neighbour_flag,
+ int comp, int lg_bs, int mode)
{
int bs = 1<<lg_bs;
*filter_edge_flag = !comp && bs != 32;
*filter_neighbour_flag = !comp && f265_intra_mode_dist[mode] > f265_intra_dist_thresholds[lg_bs-2];
- *neighbour_bilinear_flag = bs == 32 && smooth_intra_flag;
}
// Get the intra encoding flags.
@@ -66,163 +65,205 @@ void fenc_get_intra_encode_flags(int *dst_flag, int *order, int comp, int lg_bs,
}
// Extract the unfiltered neighbour pixels of the specified intra block for one
-// image component. 'src' points to the top-left neighbour pixel of the block,
-// 'nx' and 'ny' are the number of pixels to predict in each direction. 'bd' is
-// the bit depth.
-static finline void fenc_extract_intra_neighbours(f265_pix dst[129], f265_pix *src, int src_stride,
- int availx, int availy, int nx, int ny, int bd)
+// image component. 'src' points to the top-left pixel of the block,
+// 'avail' are the number of pixels available in each direction. 'packed' is
+// the bitdepth and the block size (bd << 8 | bs).
+static finline void fenc_extract_unfiltered_intra_neigh(f265_pix dst[129], f265_pix *src, int src_stride,
+ int avail[2], int packed)
{
// The following logic relies on the slice layout restrictions.
+ int bs = packed&255;
+ int bd = packed>>8;
+
+ int nb_pix = bs*2;
+ int availx = avail[0];
+ int availy = avail[1];
// Copy top-left tentatively.
- dst[0] = src[0];
+ dst[128] = src[-1-src_stride];
// Left is fully available, copy.
- if (likely(availy >= ny))
+ if (likely(availy >= nb_pix))
{
- for (int i = 0; i < ny; i++) dst[65+i] = src[(1+i)*src_stride];
+ for (int i = 0; i < nb_pix; i++) dst[63-i] = src[i*src_stride-1];
}
// Left is partially available, copy and broadcast.
else if (likely(availy > 0))
{
- for (int i = 0; i < availy; i++) dst[65+i] = src[(1+i)*src_stride];
- f265_pix p = dst[64+availy];
- for (int i = availy; i < ny; i++) dst[65+i] = p;
+ for (int i = 0; i < availy; i++) dst[63-i] = src[i*src_stride-1];
+ f265_pix p = dst[63-availy+1];
+ for (int i = availy; i < nb_pix; i++) dst[63-i] = p;
}
// Left and top-left are not available but top is. Broadcast the first
// pixel directly above the block.
else if (likely(availx > 0))
{
- f265_pix p = src[1];
- dst[0] = p;
- for (int i = 0; i < ny; i++) dst[65+i] = p;
+ f265_pix p = src[-src_stride];
+ dst[128] = p;
+ for (int i = 0; i < nb_pix; i++) dst[63-i] = p;
}
// Nothing is available, perform DC prediction.
else
{
f265_pix p = 1<<(bd-1);
- for (int i = 0; i < nx+1; i++) dst[i] = p;
- for (int i = 0; i < ny; i++) dst[65+i] = p;
+ dst[128] = p;
+ for (int i = 0; i < nb_pix; i++) dst[64+i] = p;
+ for (int i = 0; i < nb_pix; i++) dst[63-i] = p;
return;
}
// Top is fully available, copy.
- if (likely(availx >= nx))
+ if (likely(availx >= nb_pix))
{
- for (int i = 0; i < nx; i++) dst[1+i] = src[1+i];
+ for (int i = 0; i < nb_pix; i++) dst[64+i] = src[i-src_stride];
}
// Top is partially available, copy and broadcast.
else if (likely(availx > 0))
{
- for (int i = 0; i < availx; i++) dst[1+i] = src[1+i];
- f265_pix p = dst[availx];
- for (int i = availx; i < nx; i++) dst[1+i] = p;
+ for (int i = 0; i < availx; i++) dst[64+i] = src[i-src_stride];
+ f265_pix p = dst[64+availx-1];
+ for (int i = availx; i < nb_pix; i++) dst[64+i] = p;
}
// Top-left, top, top-right are not available. Broadcast the first pixel
// directly left of the block.
else
{
- f265_pix p = dst[65];
- for (int i = 0; i < nx+1; i++) dst[i] = p;
+ f265_pix p = dst[63];
+ dst[128] = p;
+ for (int i = 0; i < nb_pix; i++) dst[64+i] = p;
}
}
-// Predict the unfiltered neighbour pixels of the specified intra block at
-// pixel offset (ct_ox, ct_oy) in the CTB with block size 'bs'. 'rec_flag' is
-// true if the reconstructed pixels are used for the prediction, false if the
-// source pixels are used as approximation. This function assumes that
-// constrained intra prediction is not used.
-//
-// Layout of the destination array, by offset: 0 (top-left), 1 (top and
-// top-right), 65 (left and bottom left).
-void fenc_predict_intra_neighbours(f265_enc_thread *t, f265_pix dst[129], int rec_flag,
- int comp, int bs, int ct_ox, int ct_oy)
-{
- int avail[2];
- fenc_get_intra_nb_avail(t, avail, comp, ct_ox, ct_oy);
- int plane_off = fenc_get_ctb_block_plane_off(t, comp, ct_ox-1, ct_oy-1);
- f265_pix *src = rec_flag ? t->src_frame->rec_planes[comp ? 3+comp : 0] + plane_off :
- t->src_frame->src_planes[comp] + plane_off;
- fenc_extract_intra_neighbours(dst, src, t->me.ref_stride, avail[0], avail[1], bs<<1, bs<<1,
- t->enc->gd.bit_depth[!!comp]);
-}
-
// Filter the neighbour pixels of the block specified.
-void fenc_filter_intra_neighbours(f265_pix *dst, f265_pix *src, int bs, int bd, int bilinear_flag)
+static void fenc_filter_intra_neighbours(f265_pix *dst, f265_pix *src, int bs, int bd, int bilinear_flag)
{
int bs2 = bs<<1;
- int top_left = src[0], top_last = src[bs2], left_last = src[64+bs2];
+ int top_left = src[128], top_last = src[64+bs2-1], left_last = src[63-bs2+1];
// Check for bilinear filtering.
if (bilinear_flag)
{
- int top_middle = src[32], left_middle = src[64+32];
+ int top_middle = src[63+32], left_middle = src[64-32];
int threshold = 1<<(bd-5);
bilinear_flag = F265_ABS(top_left + top_last - (top_middle<<1)) < threshold &&
F265_ABS(top_left + left_last - (left_middle<<1)) < threshold;
if (bilinear_flag)
{
- dst[0] = top_left;
- dst[64] = top_last;
- dst[64+64] = left_last;
+ dst[128] = top_left;
+ dst[127] = top_last;
+ dst[0] = left_last;
for (int i = 0; i < 63; i++)
{
- dst[1+i] = ((63-i)*top_left + (i+1)*top_last + 32)>>6;
- dst[65+i] = ((63-i)*top_left + (i+1)*left_last + 32)>>6;
+ dst[64+i] = ((63-i)*top_left + (i+1)*top_last + 32)>>6;
+ dst[63-i] = ((63-i)*top_left + (i+1)*left_last + 32)>>6;
}
return;
}
}
// Regular filtering.
- dst[0] = ((top_left<<1) + src[1] + src[65] + 2)>>2;
- dst[bs2] = top_last;
- dst[64+bs2] = left_last;
- for (int i = 1; i < bs2; i++) dst[i] = ((src[i]<<1) + src[i-1] + src[i+1] + 2)>>2;
- dst[65] = ((src[65]<<1) + top_left + src[66] + 2)>>2;
- for (int i = 66; i < 64+bs2; i++) dst[i] = ((src[i]<<1) + src[i-1] + src[i+1] + 2)>>2;
+ dst[128] = ((top_left<<1) + src[64] + src[63] + 2)>>2;
+ dst[63+bs2] = top_last;
+ dst[64-bs2] = left_last;
+ dst[64] = ((src[64]<<1) + top_left + src[65] + 2)>>2;
+ dst[63] = ((src[63]<<1) + top_left + src[62] + 2)>>2;
+
+ for (int i = 1; i < bs2-1; i++)
+ {
+ dst[64+i] = ((src[64+i]<<1) + src[64+i-1] + src[64+i+1] + 2)>>2;
+ dst[63-i] = ((src[63-i]<<1) + src[63-i-1] + src[63-i+1] + 2)>>2;
+ }
+}
+
+// Extract and filter the transform block's neighbours.
+// In the assembly version, the neighbours are always filtered.
+// Packed:
+// - Bit 0-7 : bs
+// - Bit 8-15 : bd
+void fenc_extract_intra_neigh_c(f265_pix *dst, f265_pix *pred, int pred_stride,
+ int avail[2], int filter, int packed)
+{
+ int bs = packed&255;
+ int bd = packed>>8;
+
+ fenc_extract_unfiltered_intra_neigh(dst, pred, pred_stride, avail, packed);
+
+ if (filter)
+ fenc_filter_intra_neighbours(dst+160, dst, bs, bd, (bs)==32);
+}
+
+// Extract and filter the transform block neighbours at the location indicated
+// by ct_off. Packed: rec_flag << 8 | comp
+void fenc_extract_intra_neighbours(f265_enc_thread *t, f265_pix dst[2][160], int ct_off[2],
+ int filter, int packed, int lg_bs)
+{
+ int comp = packed & 255;
+ int rec_flag = packed>>8;
+
+ int avail[2];
+ fenc_get_intra_nb_avail(t, avail, comp, ct_off[0], ct_off[1]);
+ int plane_off = fenc_get_ctb_block_plane_off(t, comp, ct_off[0], ct_off[1]);
+ f265_pix *src = rec_flag ? t->src_frame->rec_planes[comp ? 3+comp : 0] + plane_off :
+ t->src_frame->src_planes[comp] + plane_off;
+
+ fenc_extract_intra_neigh[lg_bs-2](dst, src, t->me.ref_stride, avail, filter,
+ (t->enc->gd.bit_depth[!!comp] << 8) | 1 << lg_bs);
}
// Intra planar prediction.
-void fenc_predict_intra_planar(f265_pix *dst, f265_pix *nbuf, int lg_bs)
+// Packed:
+// - Bit 0 : filter_edge_flag
+// - Bit 1-7 : lg_bs
+// - Bit 8-15 : bd
+void fenc_predict_intra_planar_c(f265_pix *dst, f265_pix *nbuf, int mode, int packed)
{
+ int lg_bs = (packed>>1) & 127;
+
int bs = 1<<lg_bs;
- int top_right = nbuf[1+bs];
- int bottom_left = nbuf[65+bs];
+ int top_right = nbuf[64+bs];
+ int bottom_left = nbuf[63-bs];
for (int y = 0; y < bs; y++)
for (int x = 0; x < bs; x++)
- dst[y*bs+x] = ((bs-1-x)*nbuf[65+y] + (bs-1-y)*nbuf[1+x] + (x+1)*top_right + (y+1)*bottom_left + bs)
+ dst[y*bs+x] = ((bs-1-x)*nbuf[63-y] + (bs-1-y)*nbuf[64+x] + (x+1)*top_right + (y+1)*bottom_left + bs)
>>(lg_bs+1);
}
// Intra DC prediction.
-void fenc_predict_intra_dc(f265_pix *dst, f265_pix *nbuf, int lg_bs, int filter_edge_flag)
+void fenc_predict_intra_dc_c(f265_pix *dst, f265_pix *nbuf, int mode, int packed)
{
+ int lg_bs = (packed>>1) & 127;
+ int filter_edge_flag = packed&1;
+
int bs = 1<<lg_bs;
int dc_val = bs;
- for (int i = 0; i < bs; i++) dc_val += nbuf[1+i] + nbuf[65+i];
+
+ for (int i = 0; i < bs; i++) dc_val += nbuf[63-i] + nbuf[64+i];
dc_val = dc_val>>(lg_bs+1);
for (int i = 0; i < bs*bs; i++) dst[i] = dc_val;
if (filter_edge_flag)
{
- dst[0] = ((dc_val<<1) + nbuf[1] + nbuf[65] + 2)>>2;
+ dst[0] = ((dc_val<<1) + nbuf[64] + nbuf[63] + 2)>>2;
for (int i = 1; i < bs; i++)
{
- dst[i] = (nbuf[i+1] + 3*dc_val + 2)>>2;
- dst[i*bs] = (nbuf[65+i] + 3*dc_val + 2)>>2;
+ dst[i] = (nbuf[64+i] + 3*dc_val + 2)>>2;
+ dst[i*bs] = (nbuf[63-i] + 3*dc_val + 2)>>2;
}
}
}
// Intra angular prediction.
-void fenc_predict_intra_angular(f265_pix *dst, f265_pix *nbuf, int lg_bs, int bd, int filter_edge_flag, int mode)
+void fenc_predict_intra_angular_c(f265_pix *dst, f265_pix *nbuf, int mode, int packed)
{
+ int filter_edge_flag = packed&1;
+ int lg_bs = (packed>>1) & 127;
+ int bd = packed>>8;
+
int bs = 1<<lg_bs;
// Flip the neighbours in the horizontal case.
@@ -230,8 +271,8 @@ void fenc_predict_intra_angular(f265_pix *dst, f265_pix *nbuf, int lg_bs, int bd
f265_pix ntmp[129];
if (hor_flag)
{
- ntmp[0] = nbuf[0];
- for (int i = 0; i < (bs<<1); i++) { ntmp[1+i] = nbuf[65+i]; ntmp[65+i] = nbuf[1+i]; }
+ ntmp[128] = nbuf[128];
+ for (int i = 0; i < (bs<<1); i++) { ntmp[63-i] = nbuf[64+i]; ntmp[64+i] = nbuf[63-i]; }
nbuf = ntmp;
}
@@ -244,13 +285,13 @@ void fenc_predict_intra_angular(f265_pix *dst, f265_pix *nbuf, int lg_bs, int bd
{
for (int y = 0; y < bs; y++)
for (int x = 0; x < bs; x++)
- dst[y*bs+x] = nbuf[1+x];
+ dst[y*bs+x] = nbuf[64+x];
if (filter_edge_flag)
{
- int top_left = nbuf[0], top = nbuf[1];
+ int top_left = nbuf[128], top = nbuf[64];
for (int y = 0; y < bs; y++)
- dst[y*bs] = F265_CLAMP(top + ((nbuf[65+y] - top_left)>>1), 0, (1<<bd)-1);
+ dst[y*bs] = F265_CLAMP(top + ((nbuf[63-y] - top_left)>>1), 0, (1<<bd)-1);
}
}
@@ -276,15 +317,16 @@ void fenc_predict_intra_angular(f265_pix *dst, f265_pix *nbuf, int lg_bs, int bd
for (int i = 0; i < nb_projected; i++)
{
inv_angle_sum += inv_angle;
- ref[-2-i] = nbuf[64+(inv_angle_sum>>8)];
+ ref[-2-i] = nbuf[64-(inv_angle_sum>>8)];
}
// Copy the top-left and top pixels.
- for (int i = 0; i < bs+1; i++) ref[-1+i] = nbuf[i];
+ ref[-1] = nbuf[128];
+ for (int i = 0; i < bs; i++) ref[i] = nbuf[64+i];
}
// Use the top and top-right neighbours.
- else ref = nbuf+1;
+ else ref = nbuf+64;
// Pass every row.
int angle_sum = 0;
@@ -312,31 +354,38 @@ void fenc_predict_intra_angular(f265_pix *dst, f265_pix *nbuf, int lg_bs, int bd
}
// Predict the pixels of the intra mode specified.
-void fenc_predict_intra_mode(f265_pix *dst, f265_pix *neighbours, int lg_bs, int bd, int mode, int filter_edge_flag)
+// TODO: Stub. Should be inlined.
+inline void fenc_predict_intra_mode(f265_pix *dst, f265_pix *neighbours, int lg_bs, int bd, int mode,
+ int filter_edge_flag)
{
- if (mode == 0) fenc_predict_intra_planar(dst, neighbours, lg_bs);
- else if (mode == 1) fenc_predict_intra_dc(dst, neighbours, lg_bs, filter_edge_flag);
- else fenc_predict_intra_angular(dst, neighbours, lg_bs, bd, filter_edge_flag, mode);
+ // Get the function offset based on the block size.
+ int idx = (lg_bs-2)*11;
+
+ // Add the function offset based on the mode.
+ idx += f265_mode_to_intra_pred[mode];
+
+ // Generate the packed data.
+ int packed = filter_edge_flag;
+ packed |= (lg_bs<<1);
+ packed |= (bd<<8);
+
+ fenc_predict_intra[idx](dst, neighbours, mode, packed);
}
// Predict the intra block with the mode and the CTB offset specified.
-void fenc_predict_intra(f265_enc_thread *t, f265_pix *dst, int comp, int lg_bs, int mode, int ct_ox, int ct_oy)
+void fenc_predict_intra_block(f265_enc_thread *t, f265_pix *dst, int comp, int lg_bs, int mode, int ct_ox, int ct_oy)
{
f265_intra_block *ib = &t->intra_block;
int chroma_flag = !!comp;
- int bs = 1<<lg_bs;
int bd = t->enc->gd.bit_depth[chroma_flag];
- int smooth_intra_flag = F265_GET_FLAG(t->enc->gd.eflags, F265_PF_SMOOTH_INTRA);
- int filter_edge_flag, filter_neighbour_flag, neighbour_bilinear_flag;
+ int filter_edge_flag, filter_neighbour_flag;
- // Predict the unfiltered neighbours. Assuming 4:2:0.
- fenc_predict_intra_neighbours(t, ib->neighbours[0], 1, comp, bs, ct_ox, ct_oy);
+ // Get and filter the neighbours.
+ fenc_get_intra_filter_flags(&filter_edge_flag, &filter_neighbour_flag,
+ comp, lg_bs, mode);
+ int ct_off[2] = {ct_ox, ct_oy};
+ fenc_extract_intra_neighbours(t, ib->neighbours, ct_off, filter_neighbour_flag, 1<<8 | comp, lg_bs);
- // Filter the neighbours.
- fenc_get_intra_filter_flags(&filter_edge_flag, &filter_neighbour_flag, &neighbour_bilinear_flag,
- comp, lg_bs, mode, smooth_intra_flag);
- if (filter_neighbour_flag) fenc_filter_intra_neighbours(ib->neighbours[1], ib->neighbours[0],
- bs, bd, neighbour_bilinear_flag);
f265_pix *neighbours = ib->neighbours[filter_neighbour_flag];
// Do the prediction.
diff --git a/f265/rec.c b/f265/rec.c
index decc88f..f261e28 100644
--- a/f265/rec.c
+++ b/f265/rec.c
@@ -932,7 +932,7 @@ int fenc_rec_intra_tb(f265_enc_thread *t, int comp, int lg_bs, int mode, int zer
{
f265_pix pred[32*32];
int dst_flag, order;
- fenc_predict_intra(t, pred, comp, lg_bs, mode, ct_ox, ct_oy);
+ fenc_predict_intra_block(t, pred, comp, lg_bs, mode, ct_ox, ct_oy);
fenc_get_intra_encode_flags(&dst_flag, &order, comp, lg_bs, mode);
if (F265_GET_FLAG(t->enc->gd.eflags, F265_PF_RDOQ))
diff --git a/snippets/asm.py b/snippets/asm.py
index 83a6d6d..75d8790 100755
--- a/snippets/asm.py
+++ b/snippets/asm.py
@@ -115,13 +115,17 @@ declare_dict = odict()
# currently implemented in assembly. If "arch" is specified, it shadows
# "arch_lbd" and "arch_hbd". If single_c=true, the same C function is mapped to
# every slot.
-def declare_func(name, ret="void", args="", bd=0, indices=None, avx2=None, avx2_lbd=None, avx2_hbd=None, single_c=True):
+def declare_func(name, ret="void", args="", bd=0, indices=[""], c=1, c_lbd=None, c_hbd=None, \
+ avx2=None, avx2_lbd=None, avx2_hbd=None, single_c=True):
f = Function()
f.name = name
f.ret = ret
f.args = args
f.bd = bd
f.indices = indices
+ f.c = c
+ f.c_lbd = c_lbd
+ f.c_hbd = c_hbd
f.avx2 = avx2
f.avx2_lbd = avx2_lbd
f.avx2_hbd = avx2_hbd
@@ -132,16 +136,27 @@ def declare_func(name, ret="void", args="", bd=0, indices=None, avx2=None, avx2_
def declare_all():
df = declare_func
- amp_indices = ["2", "4", "8", "16", "32", "64", "6", "12", "24", "48"]
- luma_amp_indices_x = ["X", "4", "8", "16", "32", "64", "X", "12", "24", "48"]
- luma_amp_indices = ["4", "8", "16", "32", "64", "12", "24", "48"]
+ amp_indices = ["2", "4", "8", "16", "32", "64", "6", "12", "24", "48"]
+ luma_amp_indices = ["X", "4", "8", "16", "32", "64", "X", "12", "24", "48"]
luma_qpel_indices = []
+ luma_qpel_indices_c = []
luma_qpel_indices_avx2 = []
- for index in luma_amp_indices_x:
+ for index in luma_amp_indices:
for frac in [ "h", "v", "d"]:
luma_qpel_indices.append("X" if index == "X" else "%s_%s" % (index, frac))
- if index != "X" and int(index) % 8 == 0:
- luma_qpel_indices_avx2.append("%s_%s" % (index, frac))
+ luma_qpel_indices_c.append("X" if index == "X" else frac)
+ luma_qpel_indices_avx2.append("X" if index == "X" or int(index) % 8 != 0 else "%s_%s" % (index, frac))
+
+ intra_pred_indices_seed = ["4", "8", "16", "32"]
+ intra_pred_indices_avx2_seed = ["X", "8", "X", "X"]
+ intra_pred_indices = []
+ intra_pred_indices_avx2 = []
+ for index in intra_pred_indices_seed:
+ for frac in [ "planar", "dc", "angular", "angular", "angular", "angular", "angular", "angular", "angular", "angular", "angular"]:
+ intra_pred_indices.append("X" if index == "X" else "%s" % (frac))
+ for index in intra_pred_indices_avx2_seed:
+ for frac in [ "planar", "dc", "dia_bot_left", "hor_bot", "hor", "hor_top", "dia_top_left", "ver_left", "ver", "ver_right", "dia_top_right"]:
+ intra_pred_indices_avx2.append("X" if index == "X" else "%s_%s" % (frac, index))
# Declarations go here.
df("dct", bd=1, single_c=False,
@@ -175,12 +190,12 @@ def declare_all():
df("sad3", bd=1,
args="int *costs, f265_pix *src, int src_stride, f265_pix **refs, int ref_stride, int packed_dims",
- indices=luma_amp_indices_x,
+ indices=luma_amp_indices,
avx2_lbd=1)
df("sad4", bd=1,
args="int *costs, f265_pix *src, int src_stride, f265_pix **refs, int ref_stride, int packed_dims",
- indices=luma_amp_indices_x,
+ indices=luma_amp_indices,
avx2_lbd=1)
df("fssd", bd=1,
@@ -189,14 +204,24 @@ def declare_all():
df("avg_pix", bd=1,
args="f265_pix *dst, f265_pix *src0, int src0_stride, f265_pix *src1, int src1_stride, int packed_dims",
- indices=luma_amp_indices_x,
- avx2_lbd=["4", "8", "16", "32", "64", "12", "24", "48"])
+ indices=luma_amp_indices,
+ avx2_lbd=1)
- df("interpol_luma_qpel_pix", bd=1,
+ df("interpol_luma_qpel_pix", bd=1, single_c=False,
args="f265_pix *dst, int dst_stride, f265_pix *src, int src_stride, int frac, int packed_dims, uint8_t *spill",
indices=luma_qpel_indices,
+ c=luma_qpel_indices_c,
avx2_lbd=luma_qpel_indices_avx2)
+ df("predict_intra", bd=1, single_c=False,
+ args="f265_pix *dst, f265_pix *neighbours, int mode, int packed",
+ indices=intra_pred_indices,
+ avx2_lbd=intra_pred_indices_avx2)
+
+ df("extract_intra_neigh", bd=1, single_c=True,
+ args="f265_pix nbuf[2][160], f265_pix *pred, int pred_stride, int avail[2], int filter, int packed",
+ indices=["4", "8", "16", "32"],
+ avx2_lbd=["X", "8", "X", "X"])
### AVX2 SAD special code. ###
def avx2_sad_special_code():
@@ -300,19 +325,19 @@ def get_output():
for arch in arch_list:
assign_text[arch] = ""
+ # List all function declarations.
+ function_list = set()
+
# Pass every function.
for f in declare_dict.values():
- # Base function name.
- base_func_name = "%s_%s" % (prog, f.name)
-
# Iterate on the bit depths, if any.
bd_list = ["lbd", "hbd"] if f.bd else [None]
for bd in bd_list:
- # Adjust the function name for the bit depth.
- bd_func_name = base_func_name
- if bd != None: bd_func_name = "%s_%s_%s" % (prog, bd, f.name)
+ # Function name. Include the bd only if defined.
+ bd_func_name = "%s_%s" % (prog, f.name)
+ if bd != None : bd_func_name = "%s_%s_%s" % (prog, bd, f.name)
# Do the substitutions for the bit depth in the arguments and the
# return type.
@@ -324,62 +349,55 @@ def get_output():
typedef_text += "typedef %s(*%s_func)(%s);\n" % (func_ret_str, bd_func_name, func_args_str)
# Declare the global variable, along with documentation.
- var_indice_str = "[%d]" % (len(f.indices)) if f.indices else ""
+ has_indices = len(f.indices) > 1 if f.indices != None else 0
+ var_indice_str = "[%d]" % (len(f.indices)) if has_indices else ""
global_str = "%s_func %s%s;\n" % (bd_func_name, bd_func_name, var_indice_str)
if bd != "hbd": global_var_text_lbd += global_str
else: global_var_text_hbd += global_str
- if f.indices != None: extern_var_text += "// Indices: %s.\n" % (", ".join(f.indices))
+ if has_indices: extern_var_text += "// Indices: %s.\n" % (", ".join(f.indices))
extern_var_text += "extern " + global_str + "\n";
# Iterate on the indices, if any.
index_list = f.indices if f.indices != None else [None]
for index_pos in range(len(index_list)):
- index = index_list[index_pos]
-
- # Adjust the function name for the index.
- index_func_name = bd_func_name
- if index != None: index_func_name += "_" + index
-
# Iterate on the architectures.
for arch in arch_list:
+ index = None
- # Adjust the function name for the architecture.
- arch_func_name = index_func_name + "_" + arch
- if f.single_c and arch == "c":
- arch_func_name = bd_func_name + "_c"
-
- # Check whether the architecture supports this function.
+ # Use f.[arch] if the generic architecture is defined.
+ field = getattr(f, arch)
- # Skipped slot.
- if index == "X":
- support_flag = 0
+ # Use the f.[arch_bd] if the bit depth is defined and the generic arch is not.
+ if field == None and bd != None:
+ field = getattr(f, "%s_%s" % (arch, bd))
- # C always supports the function.
- elif arch == "c":
- support_flag = 1
+ # If the field is true (1), use the default indices.
+ if type(field) is int and field == 1:
+ field = f.indices
- # Handle assembly.
- else:
- # Get the relevant fields.
- bdi_field = getattr(f, arch)
- bd_field = bdi_field if bd == None else getattr(f, "%s_%s" % (arch, bd))
+ # Get the field value.
+ if type(field) is list:
+ index = field[index_pos]
- # Do the shadowing.
- field = bd_field if bdi_field == None else bdi_field
+ # Add the architecture to the function name.
+ arch_func_name = bd_func_name
+ if index is not None and len(index): arch_func_name += "_" + index
+ arch_func_name += "_" + arch
+ if arch == "c" and f.single_c :
+ arch_func_name = bd_func_name + "_c"
- # Explicitly supported.
- support_flag = field == 1 or type(field) is list and index in field
+ # Test if we should skip this index.
+ support_flag = 0 if index == "X" or (index == None) else 1
# Declare the prototype.
if (arch == "c" and f.single_c and index_pos == 0) or\
(support_flag and (arch != "c" or not f.single_c)):
- # Kludge for the interpolation functions.
- if arch_func_name.find("interpol") != -1 and arch == "c":
- for frac in [ "h", "v", "d"]:
- proto_text += "%s %s_%s_c(%s);\n" % (func_ret_str, bd_func_name, frac, func_args_str);
- # Normal declaration.
- else:
- proto_text += "%s %s(%s);\n" % (func_ret_str, arch_func_name, func_args_str);
+ s = "%s %s(%s);\n" % (func_ret_str, arch_func_name, func_args_str);
+
+ # Insert it only if it's not already declared.
+ if s not in function_list:
+ proto_text += s
+ function_list.add(s)
# Not supported, skip.
if not support_flag: continue
@@ -387,12 +405,8 @@ def get_output():
# Do the assignments.
assign_tabs = " "
if arch != "c": assign_tabs += " "
- assign_index_str = "[%d]" % (index_pos) if f.indices else ""
- assign_val = arch_func_name
- # Kludge for the interpolation functions.
- if arch_func_name.find("interpol") != -1 and arch == "c":
- assign_val = "%s_%s_c" % (arch_func_name[:-2], index[-1])
- assign_text[arch] += "%s%s%s = %s;\n" % (assign_tabs, bd_func_name, assign_index_str, assign_val)
+ assign_index_str = "[%d]" % (index_pos) if has_indices else ""
+ assign_text[arch] += "%s%s%s = %s;\n" % (assign_tabs, bd_func_name, assign_index_str, arch_func_name)
proto_text += "\n"
@@ -452,4 +466,3 @@ def main():
write_file("../f265/asm.h", h_content)
main()
-