Hi,
there are still problems. Search for %%% to find my comments.

Regards,
Laurent
diff --git a/SConstruct b/SConstruct
index 744e05d..ab9772d 100644
--- a/SConstruct
+++ b/SConstruct
@@ -158,7 +158,7 @@ bdi_env = ref_env.Clone(CPPPATH = ['build', '.', 
'f265/ktools'])
 for c_file in bdi_c_files:
     obj_files += bdi_env.Object('build/f265/' + c_file[:-2], 'f265/' + c_file)
 
-bdi_a_files = ['pixel.asm', 'dct.asm', 'encode.asm']
+bdi_a_files = ['pixel.asm', 'dct.asm', 'encode.asm', 'intra.asm']
 if f265_cfg['asm']:
     asm_dir = 'f265/asm/'
     asm_arch = "ARCH_X64" if mingw else "ARCH_AMD64"
diff --git a/f265/analyze.c b/f265/analyze.c
index c415cbf..6b9aa41 100644
--- a/f265/analyze.c
+++ b/f265/analyze.c
@@ -523,7 +523,7 @@ static int64_t fenc_analyze_intra_tb(f265_enc_thread *t, 
f265_cb *cb, int *nz_fl
     int bs = 1<<lg_bs;
     int ref_stride = t->me.ref_stride;
     int plane_off = fenc_get_ctb_block_plane_off(t, comp, ct_ox, ct_oy);
-    int filter_edge_flag, filter_neighbour_flag, neighbour_bilinear_flag;
+    int filter_edge_flag, filter_neighbour_flag;
     int64_t cost;
     f265_pix *neighbours;
 
@@ -531,20 +531,20 @@ static int64_t fenc_analyze_intra_tb(f265_enc_thread *t, 
f265_cb *cb, int *nz_fl
     if (likely(ib->intra_neighbour_mode == 2))
     {
         // FIXME, optimize this.
-        fenc_get_intra_filter_flags(&filter_edge_flag, &filter_neighbour_flag, 
&neighbour_bilinear_flag,
-                                    comp, lg_bs, mode, 0);
+        fenc_get_intra_filter_flags(&filter_edge_flag, &filter_neighbour_flag,
+                                    comp, lg_bs, mode);
         neighbours = ib->neighbours[filter_neighbour_flag];
     }
 
     // Predict the reconstructed neighbours.
     else if (ib->intra_neighbour_mode == 1)
     {
-        int smooth_intra_flag = F265_GET_FLAG(t->enc->gd.eflags, 
F265_PF_SMOOTH_INTRA);
-        fenc_get_intra_filter_flags(&filter_edge_flag, &filter_neighbour_flag, 
&neighbour_bilinear_flag,
-                                    comp, lg_bs, mode, smooth_intra_flag);
-        fenc_predict_intra_neighbours(t, ib->neighbours[0], 1, comp, bs, 
ct_ox, ct_oy);
-        if (filter_neighbour_flag) 
fenc_filter_intra_neighbours(ib->neighbours[1], ib->neighbours[0], bs, 8,
-                                                                
neighbour_bilinear_flag);
+        fenc_get_intra_filter_flags(&filter_edge_flag, &filter_neighbour_flag,
+                                    comp, lg_bs, mode);
+
+        int ct_off[2] = {ct_ox, ct_oy};
+        fenc_extract_intra_neighbours(t, ib->neighbours, ct_off, 
filter_neighbour_flag, 1<<8 | comp, lg_bs);
+
         neighbours = ib->neighbours[filter_neighbour_flag];
     }
 
@@ -552,7 +552,9 @@ static int64_t fenc_analyze_intra_tb(f265_enc_thread *t, 
f265_cb *cb, int *nz_fl
     else
     {
         // Do not filter anything when using the source pixels.
-        fenc_predict_intra_neighbours(t, ib->neighbours[0], 0, comp, bs, 
ct_ox, ct_oy);
+        int ct_off[2] = {ct_ox, ct_oy};
+        fenc_extract_intra_neighbours(t, ib->neighbours, ct_off, 0, 0<<8 | 
comp, lg_bs);
+
         filter_edge_flag = 0;
         neighbours = ib->neighbours[0];
     }
@@ -1801,7 +1803,6 @@ static int64_t 
fenc_analyze_intra_part_luma(f265_enc_thread *t, f265_cb *cb, int
 {
     f265_analysis *an = &t->an;
     f265_intra_block *ib = &t->intra_block;
-    int bs = 1<<lg_bs;
     int rdo_restore_flag = 1;
     int64_t best_cost;
 
@@ -1888,10 +1889,8 @@ static int64_t 
fenc_analyze_intra_part_luma(f265_enc_thread *t, f265_cb *cb, int
     // neighbours, then the previous passes also use cached neighbours.
     if 
(ib->cache_neighbour_flags[0]|ib->cache_neighbour_flags[1]|ib->cache_neighbour_flags[2])
     {
-        int bilinear_flag = bs == 32 && F265_GET_FLAG(t->enc->gd.eflags, 
F265_PF_SMOOTH_INTRA);
-        fenc_predict_intra_neighbours(t, ib->neighbours[0], 1, 0, bs,
-                                      cb->cb_off[0] + cb_ox, cb->cb_off[1] + 
cb_oy);
-        fenc_filter_intra_neighbours(ib->neighbours[1], ib->neighbours[0], bs, 
8, bilinear_flag);
+        int ct_off[2] = {cb->cb_off[0] + cb_ox, cb->cb_off[1] + cb_oy};
+        fenc_extract_intra_neighbours(t, ib->neighbours, ct_off, 1, 1<<8 | 0, 
lg_bs);
     }
 
     // Set the partition data.
diff --git a/f265/asm.c b/f265/asm.c
index 7edecc3..620c3a4 100644
--- a/f265/asm.c
+++ b/f265/asm.c
@@ -136,6 +136,28 @@ void f265_hbd_interpol_luma_qpel_pix_h_c(int16_t *dst, int 
dst_stride, int16_t *
 void f265_hbd_interpol_luma_qpel_pix_v_c(int16_t *dst, int dst_stride, int16_t 
*src, int src_stride, int frac, int packed_dims, uint8_t *spill);
 void f265_hbd_interpol_luma_qpel_pix_d_c(int16_t *dst, int dst_stride, int16_t 
*src, int src_stride, int frac, int packed_dims, uint8_t *spill);
 
+void f265_lbd_predict_intra_planar_c(uint8_t *dst, uint8_t *neighbours, int 
mode, int packed);
+void f265_lbd_predict_intra_dc_c(uint8_t *dst, uint8_t *neighbours, int mode, 
int packed);
+void f265_lbd_predict_intra_angular_c(uint8_t *dst, uint8_t *neighbours, int 
mode, int packed);
+void f265_lbd_predict_intra_planar_8_avx2(uint8_t *dst, uint8_t *neighbours, 
int mode, int packed);
+void f265_lbd_predict_intra_dc_8_avx2(uint8_t *dst, uint8_t *neighbours, int 
mode, int packed);
+void f265_lbd_predict_intra_dia_bot_left_8_avx2(uint8_t *dst, uint8_t 
*neighbours, int mode, int packed);
+void f265_lbd_predict_intra_hor_bot_8_avx2(uint8_t *dst, uint8_t *neighbours, 
int mode, int packed);
+void f265_lbd_predict_intra_hor_8_avx2(uint8_t *dst, uint8_t *neighbours, int 
mode, int packed);
+void f265_lbd_predict_intra_hor_top_8_avx2(uint8_t *dst, uint8_t *neighbours, 
int mode, int packed);
+void f265_lbd_predict_intra_dia_top_left_8_avx2(uint8_t *dst, uint8_t 
*neighbours, int mode, int packed);
+void f265_lbd_predict_intra_ver_left_8_avx2(uint8_t *dst, uint8_t *neighbours, 
int mode, int packed);
+void f265_lbd_predict_intra_ver_8_avx2(uint8_t *dst, uint8_t *neighbours, int 
mode, int packed);
+void f265_lbd_predict_intra_ver_right_8_avx2(uint8_t *dst, uint8_t 
*neighbours, int mode, int packed);
+void f265_lbd_predict_intra_dia_top_right_8_avx2(uint8_t *dst, uint8_t 
*neighbours, int mode, int packed);
+void f265_hbd_predict_intra_planar_c(int16_t *dst, int16_t *neighbours, int 
mode, int packed);
+void f265_hbd_predict_intra_dc_c(int16_t *dst, int16_t *neighbours, int mode, 
int packed);
+void f265_hbd_predict_intra_angular_c(int16_t *dst, int16_t *neighbours, int 
mode, int packed);
+
+void f265_lbd_extract_intra_neigh_c(uint8_t nbuf[2][160], uint8_t *pred, int 
pred_stride, int avail[2], int filter, int packed);
+void f265_lbd_extract_intra_neigh_8_avx2(uint8_t nbuf[2][160], uint8_t *pred, 
int pred_stride, int avail[2], int filter, int packed);
+void f265_hbd_extract_intra_neigh_c(int16_t nbuf[2][160], int16_t *pred, int 
pred_stride, int avail[2], int filter, int packed);
+
 // Special code.
 #ifdef F265_HAVE_ASM
 int f265_lbd_fsad_12_avx2(uint8_t *src, int src_stride, uint8_t *ref, int 
ref_stride, int packed_dims)
@@ -360,6 +382,8 @@ f265_lbd_sad4_func f265_lbd_sad4[10];
 f265_lbd_fssd_func f265_lbd_fssd[5];
 f265_lbd_avg_pix_func f265_lbd_avg_pix[10];
 f265_lbd_interpol_luma_qpel_pix_func f265_lbd_interpol_luma_qpel_pix[30];
+f265_lbd_predict_intra_func f265_lbd_predict_intra[44];
+f265_lbd_extract_intra_neigh_func f265_lbd_extract_intra_neigh[4];
 
 f265_hbd_dct_func f265_hbd_dct[5];
 f265_hbd_idct_func f265_hbd_idct[5];
@@ -373,6 +397,8 @@ f265_hbd_sad4_func f265_hbd_sad4[10];
 f265_hbd_fssd_func f265_hbd_fssd[5];
 f265_hbd_avg_pix_func f265_hbd_avg_pix[10];
 f265_hbd_interpol_luma_qpel_pix_func f265_hbd_interpol_luma_qpel_pix[30];
+f265_hbd_predict_intra_func f265_hbd_predict_intra[44];
+f265_hbd_extract_intra_neigh_func f265_hbd_extract_intra_neigh[4];
 
 // Linkage at runtime.
 static void f265_link_asm(int avx2_flag)
@@ -549,6 +575,102 @@ static void f265_link_asm(int avx2_flag)
     f265_hbd_interpol_luma_qpel_pix[27] = f265_hbd_interpol_luma_qpel_pix_h_c;
     f265_hbd_interpol_luma_qpel_pix[28] = f265_hbd_interpol_luma_qpel_pix_v_c;
     f265_hbd_interpol_luma_qpel_pix[29] = f265_hbd_interpol_luma_qpel_pix_d_c;
+    f265_lbd_predict_intra[0] = f265_lbd_predict_intra_planar_c;
+    f265_lbd_predict_intra[1] = f265_lbd_predict_intra_dc_c;
+    f265_lbd_predict_intra[2] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[3] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[4] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[5] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[6] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[7] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[8] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[9] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[10] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[11] = f265_lbd_predict_intra_planar_c;
+    f265_lbd_predict_intra[12] = f265_lbd_predict_intra_dc_c;
+    f265_lbd_predict_intra[13] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[14] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[15] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[16] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[17] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[18] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[19] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[20] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[21] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[22] = f265_lbd_predict_intra_planar_c;
+    f265_lbd_predict_intra[23] = f265_lbd_predict_intra_dc_c;
+    f265_lbd_predict_intra[24] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[25] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[26] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[27] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[28] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[29] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[30] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[31] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[32] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[33] = f265_lbd_predict_intra_planar_c;
+    f265_lbd_predict_intra[34] = f265_lbd_predict_intra_dc_c;
+    f265_lbd_predict_intra[35] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[36] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[37] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[38] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[39] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[40] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[41] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[42] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[43] = f265_lbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[0] = f265_hbd_predict_intra_planar_c;
+    f265_hbd_predict_intra[1] = f265_hbd_predict_intra_dc_c;
+    f265_hbd_predict_intra[2] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[3] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[4] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[5] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[6] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[7] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[8] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[9] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[10] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[11] = f265_hbd_predict_intra_planar_c;
+    f265_hbd_predict_intra[12] = f265_hbd_predict_intra_dc_c;
+    f265_hbd_predict_intra[13] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[14] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[15] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[16] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[17] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[18] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[19] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[20] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[21] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[22] = f265_hbd_predict_intra_planar_c;
+    f265_hbd_predict_intra[23] = f265_hbd_predict_intra_dc_c;
+    f265_hbd_predict_intra[24] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[25] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[26] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[27] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[28] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[29] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[30] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[31] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[32] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[33] = f265_hbd_predict_intra_planar_c;
+    f265_hbd_predict_intra[34] = f265_hbd_predict_intra_dc_c;
+    f265_hbd_predict_intra[35] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[36] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[37] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[38] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[39] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[40] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[41] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[42] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[43] = f265_hbd_predict_intra_angular_c;
+    f265_lbd_extract_intra_neigh[0] = f265_lbd_extract_intra_neigh_c;
+    f265_lbd_extract_intra_neigh[1] = f265_lbd_extract_intra_neigh_c;
+    f265_lbd_extract_intra_neigh[2] = f265_lbd_extract_intra_neigh_c;
+    f265_lbd_extract_intra_neigh[3] = f265_lbd_extract_intra_neigh_c;
+    f265_hbd_extract_intra_neigh[0] = f265_hbd_extract_intra_neigh_c;
+    f265_hbd_extract_intra_neigh[1] = f265_hbd_extract_intra_neigh_c;
+    f265_hbd_extract_intra_neigh[2] = f265_hbd_extract_intra_neigh_c;
+    f265_hbd_extract_intra_neigh[3] = f265_hbd_extract_intra_neigh_c;
 
     #ifdef F265_HAVE_ASM
     if (avx2_flag)
@@ -631,6 +753,18 @@ static void f265_link_asm(int avx2_flag)
         f265_lbd_interpol_luma_qpel_pix[27] = 
f265_lbd_interpol_luma_qpel_pix_48_h_avx2;
         f265_lbd_interpol_luma_qpel_pix[28] = 
f265_lbd_interpol_luma_qpel_pix_48_v_avx2;
         f265_lbd_interpol_luma_qpel_pix[29] = 
f265_lbd_interpol_luma_qpel_pix_48_d_avx2;
+        f265_lbd_predict_intra[11] = f265_lbd_predict_intra_planar_8_avx2;
+        f265_lbd_predict_intra[12] = f265_lbd_predict_intra_dc_8_avx2;
+        f265_lbd_predict_intra[13] = 
f265_lbd_predict_intra_dia_bot_left_8_avx2;
+        f265_lbd_predict_intra[14] = f265_lbd_predict_intra_hor_bot_8_avx2;
+        f265_lbd_predict_intra[15] = f265_lbd_predict_intra_hor_8_avx2;
+        f265_lbd_predict_intra[16] = f265_lbd_predict_intra_hor_top_8_avx2;
+        f265_lbd_predict_intra[17] = 
f265_lbd_predict_intra_dia_top_left_8_avx2;
+        f265_lbd_predict_intra[18] = f265_lbd_predict_intra_ver_left_8_avx2;
+        f265_lbd_predict_intra[19] = f265_lbd_predict_intra_ver_8_avx2;
+        f265_lbd_predict_intra[20] = f265_lbd_predict_intra_ver_right_8_avx2;
+        f265_lbd_predict_intra[21] = 
f265_lbd_predict_intra_dia_top_right_8_avx2;
+        f265_lbd_extract_intra_neigh[1] = f265_lbd_extract_intra_neigh_8_avx2;
     }
     #endif
 }
diff --git a/f265/asm.h b/f265/asm.h
index 6402d1f..98e2f6d 100644
--- a/f265/asm.h
+++ b/f265/asm.h
@@ -30,6 +30,10 @@ typedef void(*f265_lbd_avg_pix_func)(uint8_t *dst, uint8_t 
*src0, int src0_strid
 typedef void(*f265_hbd_avg_pix_func)(int16_t *dst, int16_t *src0, int 
src0_stride, int16_t *src1, int src1_stride, int packed_dims);
 typedef void(*f265_lbd_interpol_luma_qpel_pix_func)(uint8_t *dst, int 
dst_stride, uint8_t *src, int src_stride, int frac, int packed_dims, uint8_t 
*spill);
 typedef void(*f265_hbd_interpol_luma_qpel_pix_func)(int16_t *dst, int 
dst_stride, int16_t *src, int src_stride, int frac, int packed_dims, uint8_t 
*spill);
+typedef void(*f265_lbd_predict_intra_func)(uint8_t *dst, uint8_t *neighbours, 
int mode, int packed);
+typedef void(*f265_hbd_predict_intra_func)(int16_t *dst, int16_t *neighbours, 
int mode, int packed);
+typedef void(*f265_lbd_extract_intra_neigh_func)(uint8_t nbuf[2][160], uint8_t 
*pred, int pred_stride, int avail[2], int filter, int packed);
+typedef void(*f265_hbd_extract_intra_neigh_func)(int16_t nbuf[2][160], int16_t 
*pred, int pred_stride, int avail[2], int filter, int packed);
 
 // Globals.
 
@@ -103,4 +107,16 @@ extern f265_lbd_interpol_luma_qpel_pix_func 
f265_lbd_interpol_luma_qpel_pix[30];
 // Indices: X, X, X, 4_h, 4_v, 4_d, 8_h, 8_v, 8_d, 16_h, 16_v, 16_d, 32_h, 
32_v, 32_d, 64_h, 64_v, 64_d, X, X, X, 12_h, 12_v, 12_d, 24_h, 24_v, 24_d, 
48_h, 48_v, 48_d.
 extern f265_hbd_interpol_luma_qpel_pix_func 
f265_hbd_interpol_luma_qpel_pix[30];
 
+// Indices: planar, dc, angular, angular, angular, angular, angular, angular, 
angular, angular, angular, planar, dc, angular, angular, angular, angular, 
angular, angular, angular, angular, angular, planar, dc, angular, angular, 
angular, angular, angular, angular, angular, angular, angular, planar, dc, 
angular, angular, angular, angular, angular, angular, angular, angular, angular.
+extern f265_lbd_predict_intra_func f265_lbd_predict_intra[44];
+
+// Indices: planar, dc, angular, angular, angular, angular, angular, angular, 
angular, angular, angular, planar, dc, angular, angular, angular, angular, 
angular, angular, angular, angular, angular, planar, dc, angular, angular, 
angular, angular, angular, angular, angular, angular, angular, planar, dc, 
angular, angular, angular, angular, angular, angular, angular, angular, angular.
+extern f265_hbd_predict_intra_func f265_hbd_predict_intra[44];
+
+// Indices: 4, 8, 16, 32.
+extern f265_lbd_extract_intra_neigh_func f265_lbd_extract_intra_neigh[4];
+
+// Indices: 4, 8, 16, 32.
+extern f265_hbd_extract_intra_neigh_func f265_hbd_extract_intra_neigh[4];
+
 
diff --git a/f265/asm/avx2/intra.asm b/f265/asm/avx2/intra.asm
new file mode 100644
index 0000000..d73cd3e
--- /dev/null
+++ b/f265/asm/avx2/intra.asm
@@ -0,0 +1,978 @@
+; Copyright (c) 2014, VANTRIX CORPORATION. All rights reserved. See LICENSE.txt
+; for the full license text.
+
+%include "x86inc.asm"
+
+section .data
+align 32
+
+planar_8_left:      db  14,15, 12,13, 6,7, 4,5, 0,0,0,0,0,0,0,0, ; Shuffle 
pattern to regroup the left and top-right
+                    db  10,11,  8, 9, 2,3, 0,1, 0,0,0,0,0,0,0,0, ; pixels 
together for rows 0/2, 1/3, 4/6, 5/7.
+
+angle_mul_ver:      dw  1, 2, 5, 6,  0, 0, 0, 0, ; Row index, shuffled to do 2 
rows at the time.
+                    dw  3, 4, 7, 8,  0, 0, 0, 0, ; Used to get the weight and 
offset of each row on vertical angles.
+
+triple_last_lane:   db  1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, ; Multiply 
high lane by 3 while keeping the
+                    db  3,0, 3,0, 3,0, 3,0, 3,0, 3,0, 3,0, 3,0, ; low lane 
as-is.
+
+neigh_1_of_2:       db  0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15,
+                    db  8, 0, 9, 1, 10, 2, 11, 3, 12, 4, 13, 5, 14, 6, 15, 7,
+
+neigh_shift_pair:   db  14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,
+                    db  2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,
+
+
+align 16
+
+; Repeat values on a whole 8x8 row. Inversed for use in pure horizontal.
+ang_hor_8:          db  3, 3, 3, 3, 3, 3, 3, 3,  2, 2, 2, 2, 2, 2, 2, 2,
+                    db  1, 1, 1, 1, 1, 1, 1, 1,  0, 0, 0, 0, 0, 0, 0, 0,
+
+; Pshufb pattern to generate neighbour pairs.
+pair_low:           db  0,1, 1,2, 2,3, 3,4, 4,5, 5,6, 6,7, 7,8,
+pair_high:          db  7,8, 8,9, 9,10, 10,11,  11,12, 12,13, 13,14, 14,15
+
+angle_mul_hor:      dw  1, 2, 3, 4, 5, 6, 7, 8,     ; Row index. Used to get 
the weight and offset of each row on
+                                                    ; horizontal angles.
+angle_inv_mul_hor:  dw  0, 1, 2, 3, 4, 5, 6, 7,     ; Multiplier for 
inv_angle_8 on horizontal angles.
+angle_inv_mul_ver:  dw  7, 6, 5, 4, 3, 2, 1, 0,     ; Multiplier for 
inv_angle_8 on vertical angles.
+
+dia_bot_left_8:     db  14, 13, 12, 11, 10, 9, 8, 7 ; Invert byte order.
+                    db  6, 5, 4, 3, 2, 1, 0, 15
+
+planar_wgt_hor:     db  7, 1,  6, 2,  5, 3,  4, 4,  ; Weight pair, used for 
planar row weighting.
+                    db  3, 5,  2, 6,  1, 7,  0, 8,
+
+; Manage neighbour filtering edge case.
+neig_bl_unav_8:     db  0,0,0,0,0,0,0,0, 0, 1, 2, 3, 4, 5, 6, 7
+
+pat_b_0_to_16:      db  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
%%% Unused.
+pat_b_16_to_0:      db  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+
+align 4
+
+; Seed on which the the neighbours offset of inversed angles are calculated.
%%% Typo: "the the" -> "the".
+; As words (repeated 4 times) for speed-ups.
+inv_angle_8:        db  16, 16, 16, 16
+                    db  19, 19, 19, 19
+                    db  24, 24, 24, 24
+                    db  30, 30, 30, 30
+                    db  39, 39, 39, 39
+                    db  57, 57, 57, 57
+                    db  102, 102, 102, 102
+
+; Seed on which the angles weights and offsets are calculated.
+; As words (repeated 4 times) for speed-ups.
+intra_angle:        db  2, 2, 2, 2,
+                    db  5, 5, 5, 5,
+                    db  9, 9, 9, 9,
+                    db  13, 13, 13, 13,
+                    db  17, 17, 17, 17,
+                    db  21, 21, 21, 21,
+                    db  26, 26, 26, 26,
+
+; Pattern used as mask, bias, offset, ...
+; As double to use the more efficient vpbroadcastd.
+neigh_last_b_of_d:  db  3, 7, 11, 15,
+pat_q_255:          dq  0xff
+pat_w_8192:         dw  8192, 8192,
+pat_w_2048:         dw  2048, 2048,
+pat_w_1024:         dw  1024, 1024,
+pat_w_32:           dw  32, 32,
+pat_w_31:           dw  31, 31,
+pat_w_8:            dw  8, 8,
+pat_b_14_15:        db  14,15, 14,15,
+pat_b_7_8:          db  7,8, 7,8,
+pat_b_0_1:          db  0,1, 0,1,
+pat_b_128:          db  128, 128, 128, 128
+pat_b_15:           db  15, 15, 15, 15,
+pat_b_7:            db  7, 7, 7, 7
+pat_b_1:            db  1, 1, 1, 1,
+
+
+; Intra 4x4 pattern.
+;intra_copy_4:       dd  0x00000000, 0x01010101, 0x00000000,0x00000000,
+;                    dd  0x02020202, 0x03030303, 0x00000000,0x00000000,;
+;
+;
+;ang_hor_16:         db  1, 1, 1, 1, 1, 1, 1, 1, ; Repeat values on a whole 
16x16 row.
+;                    db  1, 1, 1, 1, 1, 1, 1, 1, ; Inversed for use in pure 
horizontal.
+;                    db  0, 0, 0, 0, 0, 0, 0, 0,
+;                    db  0, 0, 0, 0, 0, 0, 0, 0,
+;
+;intra_hor_4:        db  3, 1, 2, 2, 1, 3, 0, 4, ; Horizontal weight for 4x4 
planar.
+;intra_ver_4:        db  3, 1, 3, 1, 3, 1, 3, 1, ; Vertical weight for 4x4 
planar.
+;
+;pat_4w:             dw  4, 4,                   ; Rounding bias.
+
+section .text
+
+
+; 8x8 intra prediction functions.
+; There are 11 assembly function to cover all 8x8 intra prediction modes.
%%% Typo: "functions".
+; - Planar and DC.
+; - 3 pure diagonals:
+;   - dia_bot_left.
+;   - dia_top_left.
+;   - dia_top_right.
+; - Pure vertical and horizontal.
+; - 4 diagonals:
+;   - hor_bot.
+;   - hor_top.
+;   - ver_left.
+;   - ver_right.
+;
+; They all have the same input parameters, although some input parameters may 
be ignored.
+; - g0:     Destination.
+; - g1:     Neighbours. 48 is the bottommost left neighbour. 63 is the topmost 
left neighbour.
%%% top-left neighbour.
+;           64 is the leftmost top neighbours. 79 is the rightmost top 
neighbours. 128 is the top-left neighbours.
+; - g2:     Mode.
+; - g3:     Filter edge flag.
+
+
+; Intra DC 8x8.
+DEFFUN f265_lbd_predict_intra_dc_8_avx2, ia=4, at=8844, ti=0, tv=6, ym=1
+    ; Logic:
+    ; Sum all neighbours, except the corners.
+    ; Divide with bias by the number of samples.
+
+    vpmovzxbw       x1, [g1+56]             ; Load all data.
+    vpmovzxbw       x0, [g1+64]
+
+    vinserti128     y2, y0, x1, 1           ; Keep a copy for filtering.
+
+    vpaddw          y1, y0                  ; Add them together.
+
+    vpalignr        y0, y1, 8               ; At each step, fold the register 
in 2...
+    vpaddw          y1, y0                  ; ... then add each value together.
+
+    vpalignr        y0, y1, 4
+    vpaddw          y1, y0
+
+    vpalignr        y0, y1, 2
+    vpaddw          y1, y0
+
+    vmovd           x0, [pat_w_2048]
+    vpmulhrsw       y1, y1, y0              ; Round.
+
+    vpbroadcastb    y1, x1                  ; Replicate the value.
+    vmovdqa         y0, y1
+
+    and             g3, 1
+    jz              .SKIP_FILTER
+
+    ; 3 cases:
+    ; - Top-left = 2*base + top + left.
+    ; - Top =  3*base + top.
+    ; - Left = 3*base + left.
+
+    movd            g2d, x1                 ; Extract base.
+    and             g2, 0xff
+
+    lea             g3, [3*g2+2]            ; Base * 3 + rounding bias.
+    movd            x3, g3d
+    vpbroadcastw    y3, x3                  ; Broadcast base * 3 + rounding 
bias.
+
+    movzx           g3, byte [g1+64]        ; Load the first top and left 
value.
+    movzx           ga, byte [g1+63]
+
+    vpaddw          y2, y3                  ; 3 * Base + neighbours + rounding 
bias.
+    vpsrlw          y2, 2                   ; Divide by 4.
+
+    vpackuswb       y2, y2                  ; Word to byte.
+
+    vpblendd        y0, y2, y0, 0xfc        ; Save in top row.
+
%%% Column.
+    vpermq          y2, y2, 0b10_10_10_10   ; Broadcast left colum.
+
+    vmovdqu         y3, [ang_hor_8]
+    vpbroadcastq    y5, [pat_q_255]
+
+    vpshufb         y4, y2, y3              ; Replicate 8x the 4 lower values.
+    vpsrldq         y2, y2, 4               ; Shift by 4 to do the 4 last rows.
+    vpblendvb       y1, y1, y4, y5          ; Blend only the first value of 
each row.
+
+    vpshufb         y4, y2, y3              ; Replicate 8x the 4 lower values.
+    vpblendvb       y0, y0, y4, y5          ; Blend only the first value of 
each row.
+
+    ; Do top-left.
+    add             g3, ga                  ; Top + left.
+    lea             g2, [2*g2+g3+2]         ; Top + left + 2*base + bias.
+    shr             g2, 2                   ; Get the average.
+
+    vmovdqa         y2, y0
+    vpinsrb         x2, g2b, 0
+    vinserti128     y0, y0, x2, 0
+
+    .SKIP_FILTER:
+
+    vmovdqu         [g0], y0
+    vmovdqu         [g0+0x20], y1           ; Save the value.
+
+    RET
+
+
+; Intra planar 8x8.
+DEFFUN f265_lbd_predict_intra_planar_8_avx2, ia=4, at=8844, ti=0, tv=8, ym=1
+    ; value = ((8-x-1)*left + (8-y-1)*top + (x+1)*top_right + 
(y+1)*bottom_left + 8) >> 4);
+
+    vmovd           x6, [g1+56-1]           ; Load & broadcast bottom-left.
+    vpbroadcastb    x6, x6
+
+    vpmovzxbw       x1, [g1+64]             ; Load top neighbours.
+    vpmovzxbw       x6, x6
+
+    vmovq           x7, [g1+72]             ; Load & broadcast top right.
+    vpbroadcastb    y7, x7
+
+    vpbroadcastd    y0, [pat_b_0_1]         ; Weight distribution pattern.
+
+    vpsllw          y2, y1, 3               ; Top row * 8.
+    vpsubw          y1, y6                  ; Row delta (top neighbour - 
bottom-left).
+
+    vpsubw          y2, y1                  ; Top row * 7 + bottom-left.
+
+    vpsllw          y3, y1, 1               ;
+    vpsubw          y6, y2, y3              ; Top row * 5 + 3*bottom-left.
+    vinserti128     y2, y2, x6, 1           ; Get row 2 values.
+    vinserti128     y1, y1, x1, 1           ; Double the vertical delta 
removed at each line.
+
+    ; Register usage:
+    ; - y1: row delta.
+    ; - y2: row sum.
+
+    vpbroadcastq    y3, [g1+64-8]           ; Load left column.
+    vpunpcklbw      y3, y7                  ; Merge top right with left col.
+    vpshufb         y3, [planar_8_left]     ; Shuffle to do 2 columns at a 
time.
+
+    vbroadcasti128  y4, [planar_wgt_hor]    ; Load weights.
+    vpbroadcastd    y5, [pat_w_2048]        ; Load rounding bias.
+
+    ; Register usage:
+    ; - y0: weight distribution pattern.
+    ; - y1: row vertical delta.
+    ; - y2: row vertical sum.
+    ; - y3: column values.
+    ; - y4: column weights.
+    ; - y5: rounding bias.
+
+    %macro DO_ROW 2                         ; %1: alignment offset, %2: 
destination register.
+    %if %1 != 0
+    vpsubw          y2, y1                  ; Add delta to row sum.
+    vpalignr        y%2, y3, %1*2           ; Offset column.
+    vpshufb         y%2, y%2, y0            ; Repeat the column.
+    %else
+    vpshufb         y%2, y3, y0             ; Repeat the column.
+    %endif
+
+    vpmaddubsw      y%2, y4                 ; Get the sum of all factors.
+    vpaddusw        y%2, y2                 ; Add vertical.
+    vpmulhrsw       y%2, y5                 ; Round.
+    %endmacro
+
+    DO_ROW          0, 6                    ; Do row 0 and 2.
+    DO_ROW          1, 7                    ; Do row 1 and 3.
+
+    vpackuswb       y6, y7
+    vmovdqu         [g0], y6
+
+    vpsubw          y2, y1                  ; Add offset to row value.
+    vpsubw          y2, y1                  ;
+
+    DO_ROW          2, 6                    ; Do row 4 and 6.
+    DO_ROW          3, 7                    ; Do row 5 and 7.
+
+    vpackuswb       y6, y7
+    vmovdqu         [g0+0x20], y6
+    %unmacro DO_ROW 2
+    RET
+
+
+; Intra pure diagonal bottom-left 8x8.
+DEFFUN f265_lbd_predict_intra_dia_bot_left_8_avx2, ia=4, at=8844, ti=0, tv=3, 
ym=1
+
+    vmovdqu         x0, [g1+48]             ; Load all data.
+    vpshufb         y0, [dia_bot_left_8]    ; Re-order it.
+
+    vpalignr        y1, y0, 2               ; Offset the pixels in the high 
lane to build rows 2 and 3.
+    vinserti128     y0, y0, x1, 1           ;
+
+    vpalignr        y1, y0, 1               ; Create row 1 and 3.
+    vpunpcklqdq     y2, y0, y1              ; Merge them with rows 0 and 2.
+    vmovdqu         [g0], y2                ; Save row 0 to 3.
+
+    vpalignr        y1, y0, 5               ; Offset to generate rows 4 to 7.
+
+    vpalignr        y0, y0, 4               ; Repeat operation above for rows 
4 to 7.
+    vpunpcklqdq     y2, y0, y1              ;
+    vmovdqu         [g0+0x20], y2           ;
+    RET
%%% Missing blank line (you used 2 blank lines to separate the functions above).
+
+; Do horizontal prediction on a single row.
+; Input:
+; - y0: weights.
%%% period.
+; - y1: neighbours offset pattern;
%%% period.
+; - y2: rounding bias
%%% That's actually y5. This is your missing y7 below, see comments.
%%% Period.
+; - y7: neighbour offset increment for each row
+; - y6: left neighbours.
+; - %1: row offset.
+; - %2: output register.
+%macro DO_ROW       2
+    %if %1 != 0
+    vpsubb          y1, y5                  ; Update neighbours offset.
+    %endif
+
+    vpshufb         y%2, y6, y1             ; Generate neighbour pair.
+
+    ; Calculate row values.
+    vpmaddubsw      y%2, y%2, y0            ; Multiply with weight.
+    vpmulhrsw       y%2, y%2, y2            ; Round.
+    vpackuswb       y%2, y%2                ; Word to byte.
%%% Align.
+    %endmacro
+
+; Predict intra from left neighbours.
+; Input:
+; - y0: weights. (for DO_ROW)
%%% period.
+; - y1: neighbours offset pattern;
+; - y6: left neighbours.
+; Register usage:
+; - y2: rounding bias
+; - y3: temp.
+; - y4: temp.
+; - y5: neighbour offset increment for each row.
+%macro PRED_LEFT 0
+    ; Load patterns.
%%% Alignment.
+    vpbroadcastd    y2, [pat_w_1024]        ; Load rounding bias.
+
+    ; Calculate the offset for the high lane.
+    vpbroadcastd    y5, [pat_b_1]           ; Load neighbour position offsets.
+    vpsubb          y3, y1, y5              ; Pre-offset by 2 the neighbour 
position.
+    vpsubb          y3, y3, y5              ; Will be used to calculate 2 rows 
at once.
+    vinserti128     y1, y1, x3, 1           ; Put the offsetted load pattern 
on the high lane.
+
+    DO_ROW          0, 3                    ; Do row 0 and 2.
+    DO_ROW          1, 4                    ; Do row 1 and 3.
+
%%% Get rid of this. Fix the vpackuswb.
+    vpunpcklqdq     y3, y4                  ; Merge value.
+    vmovdqu         [g0+0x00], y3           ; Save result.
+
+    vpsubb          y1, y5                  ; Skip from rows 1|3 to rows 4|6.
+    vpsubb          y1, y5
+
+    DO_ROW          4, 3                    ; Do row 4 and 6.
+    DO_ROW          5, 4                    ; Do row 5 and 7.
+
+    vpunpcklqdq     y3, y4                  ; Merge value.
+    vmovdqu         [g0+0x20], y3           ; Save result.
%%% Align.
+%endmacro
+
+; Generate offset and weight for intra left prediction.
+; Input:
+; - g2: mode.
%%% hor.
+; - %1: 1 for hor top, 0 for hor bottom.
+; Register usage:
+; - ga: temp
+; - y2: temp.
+; Output:
+; - y0: weights.
%%% period.
+; - y1: neighbours offset pattern.
+%macro GEN_LEFT_WEIGHT 1
+    ; Generate weight and offset.
+    lea             ga, [intra_angle]
+    %if %1
+    vpbroadcastd    y1, [ga+g2*4-11*4]      ; Load angle factor.
+    neg             g2                      ; Get the angle's inversed offset.
+    %else
+    neg             g2                      ; Get the angle's inversed offset.
+    vpbroadcastd    y1, [ga+g2*4+9*4]       ; Load angle factor.
+    %endif
+    vbroadcasti128  y2, [angle_mul_hor]     ; Load multiplication table.
+    vpmaddubsw      y1, y1, y2              ; Result in offset and weight for 
each column.
+
+    ; Generate weight.
+    vpbroadcastd    y2, [pat_w_31]          ; Load weight mask.
+    vpand           y2, y2, y1              ; Extract weight.
+
+    ; Generate weight pairs.
+    vpbroadcastd    y0, [pat_w_32]          ; Load weight complement base.
+    vpsubw          y0, y2                  ; Get weight complements.
+
+    vpackuswb       y2, y2, y2              ; Word to byte.
+    vpackuswb       y0, y0, y0              ; Word to byte.
+    %if %1
+    vpunpcklbw      y0, y0, y2              ; Make the pair. Final weight.
+    %else
+    vpunpcklbw      y0, y2, y0              ; Make the pair. Final weight.
+    %endif
+
+    ; Generate offsets.
+    vpsrlw          y1, y1, 5               ; Extract neighbour offset.
+    vpsllw          y2, y1, 8               ; Double the offset (twice for 
each pair).
+    vpor            y1, y2
+
+    %if %1
+    vpbroadcastd    y2, [pat_b_7_8]         ; Load base offset pattern.
+    vpaddw          y1, y2, y1              ; Add offset with base. Result in 
actual neighbour position.
+    %else
+    vpbroadcastd    y2, [pat_b_14_15]       ; Load base offset pattern.
+    vpsubw          y1, y2, y1              ; Add the angle offset to the base 
offset.
+    %endif
+
%%% Align.
+%endmacro
+
+
+; Intra angular horizontal bottom 8x8.
%%% You say tv=8. Where is y7?
+DEFFUN f265_lbd_predict_intra_hor_bot_8_avx2, ia=4, at=8844, ti=0, tv=7, ym=1
+    vbroadcasti128  y6, [g1+48]             ; Load left column.
+
+    GEN_LEFT_WEIGHT 0
+    PRED_LEFT
+    RET
+
+
+; Intra pure horizontal 8x8.
+DEFFUN f265_lbd_predict_intra_hor_8_avx2, ia=4, at=8844, ti=0, tv=4, ym=1
+    vmovdqu         y0, [ang_hor_8]         ; Load shuffle mask.
+
+    vpbroadcastd    y1, [g1+63-3]           ; Load the first 4 rows.
+    vpbroadcastd    y2, [g1+63-7]           ; Load the second 4 rows.
+
+    vpshufb         y1, y1, y0              ; Replicate 8 times each value.
+    vpshufb         y2, y2, y0              ;
+
+    and             g3, 1
+    jz              .SKIP_FILTER
+
+    vpmovzxbw       x0, [g1+64]             ; Load top row.
+    vmovd           x3, [g1+128]            ; Load & broadcast top-left.
+    vpbroadcastb    x3, x3
+
+    vpmovzxbw       x3, x3                  ; Byte to word.
+
+    vpsubw          x0, x3                  ; top - top-left.
+    vpsraw          x0, 1                   ; (top - top-left)/2.
+
+    vmovd           x3, [g1+63]             ; Load left.
+    vpbroadcastb    x3, x3
+    vpmovzxbw       x3, x3                  ; Byte to word.
+    vpaddw          x0, x3                  ; Left + (top - top-left)/2.
+
+    vpxor           x3, x3                  ; Replace negative values by 0.
+    vpmaxsw         x0, x3                  ;
+
+    vpackuswb       x0, x0                  ; Word to byte with unsigned 
saturation.
+
+    vpblendd        y1, y0, y1, 0xfc        ; Update the first 8 bytes.
+
+    .SKIP_FILTER:
+    vmovdqu         [g0], y1                ; Save it.
+    vmovdqu         [g0+0x20], y2           ;
+
+    RET
+
+
+; Intra angular horizontal top 8x8.
%%% Where is y7?
+DEFFUN f265_lbd_predict_intra_hor_top_8_avx2, ia=4, at=8844, ti=0, tv=7, ym=1
+    GEN_LEFT_WEIGHT 1
+
+    vmovdqu         x5, [g1+64]             ; Load top neighbour.
+    vpalignr        x5, x5, x5, 15
+    vpinsrb         x5, [g1+128], 0         ; Insert the top-left neighbour.
+
+    ; Import top neighbour with the left ones.
+    lea             g3, [inv_angle_8]
+    vpbroadcastd    y4, [g3+g2*4+18*4]      ; Load the inversed angle values.
+    vmovdqu         x3, [angle_inv_mul_hor] ; Load the weight values.
%%% will have an invalid offset.
%%% Fix everywhere.
+    vpmaddubsw      y4, y4, y3              ; Get the weight. Some neighbours 
will have an invalid offset.
+                                            ; Since we never read them, it's 
ok.
+    vpbroadcastd    y3, [pat_w_8]           ; Load inversed angle bias.
+    vpaddw          y4, y3                  ; Add inversed angle bias.
+    vpsraw          y4, 4                   ; Get inversed neighbour offset.
+    vpackuswb       y4, y4                  ; Word to byte.
+    vpshufb         y5, y4                  ; Re-order left neighbours.
+
+    ; Load patterns.
+    vmovq           x4, [g1+56]             ; Load left data.
+    vpblendd        y5, y4, y5, 0xfc        ; Blend left neighbours with top 
neighbours.
+    vinserti128     y6, y5, x5, 1           ; Double data.
+
+    PRED_LEFT
+    RET
+
+%unmacro GEN_LEFT_WEIGHT 1
+%unmacro PRED_LEFT 0
+%unmacro DO_ROW 2
+
+
+; Intra pure diagonal top left 8x8.
+DEFFUN f265_lbd_predict_intra_dia_top_left_8_avx2, ia=4, at=8844, ti=0, tv=3, 
ym=1
+    vmovq           x0, [g1+64-7]           ; Load top row.
+    vmovhps         x0, [g1+64]             ; Load left row.
+    vpinsrb         x0, [g1+128], 7         ; Load & insert top-left.
+
+    vpalignr        y1, y0, 2               ; Offset the pixels in the high 
lane to build rows 6 and 7.
+    vinserti128     y0, y1, x0, 1           ;
+
+    vpalignr        y1, y0, 1               ; Create row 5 and 7.
+    vpunpcklqdq     y2, y1, y0              ; Merge them with rows 4 and 6.
+    vmovdqu         [g0+0x20], y2           ;
+
+    vpalignr        y0, y0, 4               ; Offset to generate rows 0 to 3.
+
+    vpalignr        y1, y0, 1               ; Repeat operation above for row 0 
to 3.
+    vpunpcklqdq     y2, y1, y0              ;
+    vmovdqu         [g0], y2                ; Save rows 0 to 3.
+    RET
+
+
+; Generate vertical intra prediction of 2 rows.
+; Input:
+; - %1: row index.
+; - y0: rounding bias.
+; - y1: broadcast weights paterns.
+; - y2: pre-calculated weights.
+; - y5: temp.
+; - y6: base offset pattern
+; - y7: angle sum.
+; - y8: angle increment.
+; - y9: shift mask.
+; - y10: neighbours.
+; Output:
+; - %2: predicted row.
+; - %3: direction: 1 for left, 0 for right.
%%% %1 and %2 are already documented above. DRY.
%%% You didn't document %3.
%%% Extra space left.
+%macro DO_ROW       3
+    %if %1 != 0
+    vpaddb          y7, y8                  ; Add the angle to the current 
angle sum. Generate the offset.
+    %endif
+
+    ; Generate the neighbours pairs.
+    vpsrlw          y%2, y7, 5              ; Generate neighbour offset.
+    vpand           y%2, y9                 ; Shift can only be on word or 
greater value. Mask to simulate byte shift.
+    %if %3
+    vpsubb          y%2, y6, y%2,           ; Generate pair offset.
+    %else
+    vpaddb          y%2, y6                 ; Add offset to pairing mask.
+    %endif
+    vpshufb         y%2, y10, y%2           ; Generate pair.
+
+    ; Broadcast the current weights.
+    %if %1 != 0
+    vpalignr        y5, y2, %1*2            ; Get weights.
+    vpshufb         y5, y1                  ; Broadcast weights.
+    %else
+    vpshufb         y5, y2, y1              ; Broadcast weights.
+    %endif
+
+    ; Calculates row predictions.
+    vpmaddubsw      y%2, y%2, y5            ; Multiply values with weight.
+    vpmulhrsw       y%2, y%2, y0            ; Round.
%%% Align.
+%endmacro
+
+; Input:
+; - g0: Result array.
+; - y10: Top row. Replicated.
+; Register usage :
+; - y0: Rounding bias.
%%% Replication.
+; - y1: Word replication pattern.
+; - y2: Weights, distributed to do 2 rows at a time.
+; - y3: 2 rows of results [0|2].
+; - y4: 2 rows of results [1|3].
+; - y5: Temp.
+; - y6: Generate pair.
+; - y7: Angle sum. Used to generate the offset.
+; - y8: Angle value. Add it to the sum at each row.
+; - y9: "Word shift as byte shift" mask pattern.
+%macro ANG_VERTICAL_PRED 1                  ; %1: 0 for right, 1 for left.
+    ; Calculate the angle offset base.
+    lea             g3, [intra_angle]
+    %if %1
+    neg             g2
+    vpbroadcastd    y8, [g3+g2*4+25*4]      ; Load angle factor.
+    %else
+    vpbroadcastd    y8, [g3+g2*4-27*4]      ; Load angle factor.
+    %endif
+
%%% 2 rows.
+    vpmaddubsw      y7, y8, [triple_last_lane]  ; Multiply high lane by 3. 
Offset required to do 2 rows at a time.
+    vpackuswb       y7, y7                  ; This is the angle sum for each 
row.
+
+    ; Calculate the weight.
+    %if %1
+    vmovdqu         y3, [angle_mul_ver]     ; Load multiplication table.
+    vpmaddubsw      y2, y3, y8              ; Offset and weight for all rows.
+    %else
+    vpmaddubsw      y2, y8, [angle_mul_ver] ; Offset and weight for all rows.
+    %endif
+    vpbroadcastd    y3, [pat_w_31]          ; Load mask.
+    vpand           y3, y3, y2              ; Weight.
+
+    vpbroadcastd    y4, [pat_w_32]          ; Load weights complement base.
+    vpsubw          y4, y3                  ; Get the weight complement.
+    %if %1
+    vpunpcklbw      y2, y3, y4              ; Make the pair. Final weight.
+    %else
+    vpunpcklbw      y2, y4, y3              ; Make the pair. Final weight.
+    %endif
+
+
+    ; Load patterns.
+    %if %1
+    vbroadcasti128  y6, [pair_high]         ; Load pair making pattern.
+    %else
+    vbroadcasti128  y6, [pair_low]          ; Load pair making pattern.
+    %endif
+    vpbroadcastd    y0, [pat_w_1024]        ; Load rounding bias.
+    vpbroadcastd    y1, [pat_b_0_1]         ; Load weight distribution pattern.
+    vpbroadcastd    y9, [pat_b_7]           ; Load "word shift as byte shift" 
mask pattern.
+
+    DO_ROW          0, 3, %1                ; Do row 0 and 2.
+    DO_ROW          2, 4, %1                ; Do row 1 and 3.
+
+    vpackuswb       y3, y4                  ; Merge value.
+    vmovdqu         [g0+0x00], y3           ; Save result.
+
+    vpaddb          y7, y8                  ; Skip from rows 1|3 to rows 4|6.
+    vpaddb          y7, y8
+
+    DO_ROW          4, 3, %1                ; Do row 4 and 6.
+    DO_ROW          6, 4, %1                ; Do row 5 and 7.
+
+    vpackuswb       y3, y4                  ; Merge value.
+    vmovdqu         [g0+0x20], y3           ; Save result.
+
%%% Align.
+%endmacro
+
+
+; Intra angular vertical left 8x8.
+DEFFUN f265_lbd_predict_intra_ver_left_8_avx2, ia=4, at=8844, ti=0, tv=11, ym=1
+    vmovq           x0, [g1+64-8]           ; Load top and left data.
+    vpinsrb         x0, [g1+128], 8         ; Load top-left.
+
+    ; Re-order the left neighbours.
+    lea             g3, [inv_angle_8]
+    vpbroadcastd    y2, [g3+g2*4-18*4]      ; Load the inversed angle values.
+    vmovdqu         x3, [angle_inv_mul_ver] ; Load the inversed weight values.
+    vpmaddubsw      y2, y2, y3              ; Get the weight. Some neighbours 
will have an invalid offset.
+                                            ; Since we never use them, it's ok.
+    vpbroadcastd    y3, [pat_w_8]           ; Load inversed angle bias.
+    vpaddw          y2, y3                  ; Add inversed angle bias.
+    vpsraw          y2, 4                   ; Get inversed neighbour offset.
+    vpsubb          y2, y3, y2              ; Invert the index.
+    vpackuswb       y2, y2                  ; Word to byte.
+    vpshufb         y0, y2                  ; Re-order left neighbours.
+
+    ; Blend re-ordered neighbours with the top neighbours.
+    vmovhps         x0, [g1+64]
+    vinserti128     y10, y0, x0, 1          ; Double top row.
+
+    ANG_VERTICAL_PRED 1
+    RET
+
+
+; Intra pure vertical 8x8.
+DEFFUN f265_lbd_predict_intra_ver_8_avx2, ia=4, at=8844, ti=0, tv=6, ym=1
+    vpbroadcastq    y0, [g1+64]             ; Copy the top neighbours 4 times. 
Holds row 0 to 3.
+    vmovdqa         y4, y0                  ; Copy it. Holds row 4 to 7.
+
+    and             g3, 1
+    jz              .SKIP_FILTER
%%% Extra white line.
+
+    vmovd           x3, [g1+128]            ; Load left.
+    vpbroadcastb    x3, x3
+    vpmovzxbw       x2, [g1+64-8]           ; Load left neighbours.
+    vpbroadcastb    x1, x0                  ; Broadcast top neighbours.
+
+    vpmovzxbw       x3, x3                  ; Word to byte.
+    vpmovzxbw       x1, x1
+
+    vpsubw          x2, x3                  ; Left - top-left.
+    vpsraw          x2, 1                   ; Signed divide by 2.
+    vpaddw          x2, x1                  ; Top + (left - top-left)/2.
+
+    vpxor           x3, x3
+    vpmaxsw         x2, x3                  ; Clip negative value to 0.
+    vpackuswb       x2, x2                  ; Word to byte with unsigned 
saturation.
+    vinserti128     y2, x2, 1               ; Double the data.
+
+    vmovdqu         y3, [ang_hor_8]         ; Load replication pattern.
+    vpbroadcastq    y1, [pat_q_255]         ; Pattern that blends in a word 
out of 8.
+
+    vpshufb         y5, y2, y3              ; Replicate 8x the 4 lower values.
+
+    vpsrldq         y2, y2, 4               ; Shift by 4 to do the 4 last rows.
+
+    vpblendvb       y4, y5, y1              ; Blend only the first value of 
each row.
+
+    vpshufb         y5, y2, y3              ; Replicate 8x the 4 lower value.
+    vpblendvb       y0, y5, y1              ; Blend only the first value of 
each row.
+
+    .SKIP_FILTER:
+    vmovdqu         [g0+0x00], y0           ; Save it.
+    vmovdqu         [g0+0x20], y4           ;
+
+    RET
+
+
+; Intra angular vertical right 8x8.
+DEFFUN f265_lbd_predict_intra_ver_right_8_avx2, ia=4, at=8844, ti=0, tv=11, 
ym=1
+
+    vbroadcasti128  y10, [g1+64]            ; Load top row.
+
+    ANG_VERTICAL_PRED 0
+    RET
+
+%unmacro DO_ROW 3
+%unmacro ANG_VERTICAL_PRED 1
%%% Add white line.
+
+
+; Intra angular top right 8x8.
+DEFFUN f265_lbd_predict_intra_dia_top_right_8_avx2, ia=4, at=8844, ti=0, tv=3, 
ym=1
+    vmovdqu         x0, [g1+64]             ; Load all data.
+
+    vpalignr        y1, y0, 3               ; Offset the pixels in the high 
lane to build rows 2 and 3.
+    vpalignr        y0, y0, 1               ;
+    vinserti128     y0, y0, x1, 1           ; Push offsetted value in high 
lane.
+
+    vpalignr        y1, y0, 1               ; Create rows 1 and 3.
+    vpunpcklqdq     y2, y0, y1              ; Merge them with rows 0 and 2.
+    vmovdqu         [g0], y2                ; Save rows 0 to 3.
+
+    vpalignr        y1, y0, 5               ; Offset to generate rows 4 to 7.
+
+    vpalignr        y0, y0, 4               ; Repeat operation above for rows 
4 to 7.
+    vpunpcklqdq     y2, y0, y1              ;
+    vmovdqu         [g0+0x20], y2           ;
+
+    RET
+
+
+; Extract and filter neighbours for intra prediction.
+;
+; Input format:
+; EAABB
+; C
+; C
+; D
+; D
+;
+; Output format:
+;   padding   [48]  [64]  padding [128]
+; [ ...       DDCC  AACC  ...     E]
+;
+; Input parameters:
+; - g0: nbuf[2][160].
+; - g1: pred.
+; - g2: pred_stride.
+; - g3: avail[2].
+; - g4: filter_flag.
+; - g5: packed (Ignored).
%%% You say ti=2. Where is g7?
+DEFFUN f265_lbd_extract_intra_neigh_8_avx2, ia=6, at=884844, ti=1, tv=8, ym=1
+    ; Load availability.
+    movzx           g5, byte [g3]           ; Load availx.
+    movzx           g6, byte [g3+4]         ; Load availy.
+
+    ; Test for special case: no left neighbours.
+    cmp             g6, 0
+    jz              .LEFT_NOT_AVAILABLE
+
+    ; Left neighbours are available.
+
+    ; Get C from the prediction buffer.
+    ; Pseudo-code:
+    ; - Load & broadcast as dword the left neighbour of each row.
+    ; - Blend the rows together.
+    ; - Keep in mind the order needs to be inversed.
+
%%% neighbours.
+    ; Get 4 left neighbours.
+    ; Input:
%%% Missing ':'.
+    ; - %1: the xmm register in which to save the value.
+    ; - %2: temp.
+    ; - %3: temp.
+    ; - ga: first row address. Must be aligned on the dword left of the row.
+    ; - g2: pred_stride.
+    ; - g3: 3*pred_stride.
+    %macro load2    3
+    vpbroadcastd    %1, [ga]                ; Load & broadcast the left 
neighbour.
+    vpbroadcastd    %2, [ga+g2]             ; Load & broadcast the next left 
neighbour.
+    vpblendd        %1, %1, %2, 0b0101_0101 ; Mix even and odd row: result 1 0 
1 0.
+
+    vpbroadcastd    %2, [ga+g2*2]           ; Load & broadcast the next left 
neighbour.
+    vpbroadcastd    %3, [ga+g3]             ; Load & broadcast the next left 
neighbour.
+    vpblendd        %2, %2, %3, 0b0101_0101 ; Mix even and odd row: result 3 2 
3 2.
+
+    vpblendd        %1, %1, %2, 0b0011_0011 ; Mix 1 0 and 3 2. Result 3 2 1 0.
+    vpshufb         %1, x7                  ; Keep the last byte of each dword.
+    %endmacro
+
+    vpbroadcastd    x7, [neigh_last_b_of_d] ; Load suffle mask.
+
+    lea             ga, [g1-4]
+    lea             g3, [g2*3]
+    load2           x0, x1, x2              ; Load C0 to C3.
+
+    lea             ga, [ga+g2*4]
+    load2           x3, x1, x2              ; Load C4 to C7.
+
%%% Alignment.
+    vpblendd        x0, x0, x3, 0b01010101  ; Get C7..C0.
+
+    ; Special case: no top neighbours.
+    cmp             g5, 0
+    jz              .TOP_NOT_AVAILABLE
+
+    ; Load top (A and B) neighbour from pred.
+    mov             ga, g2
+    neg             ga                      ; Move up 1 row (negative 
pred_stride).
+    vmovdqu         x1, [g1+ga]             ; Load A|B from prediction.
+    vmovd           x2, [g1+ga-1]           ; Load top-left (E).
+
+    .LEFT_AND_TOP_FETCHED:
+
+    ; Test if bottom-left is available.
+    cmp             g6, 8
+    ja              .BOTTOM_AVAILABLE
+
+    ; Bottom-left not available.
+    vpshufb         x0, [neig_bl_unav_8]    ; Expand the last value.
+
+    .BOTTOM_FETCHED:
+
+    vmovdqu         [g0+48], x0             ; Save partial top and left to 
allow easy byte extraction.
+    vmovdqu         [g0+64], x1
+
+    movd            x3, g5d
+    vpbroadcastb    x3, x3
+    vpcmpgtb        x3, [pat_b_0_to_16]
+    vmovd           x4, [g0+63+g5]          ; Broadcast the last available 
block.
+    vpbroadcastb    x4, x4
+    vpblendvb       x1, x4, x1, x3          ; Replace (blend) invalid value 
with the broadcasted last valid values.
+
+    vmovdqu         [g0+48], x0             ; Save values.
+    vmovdqu         [g0+64], x1
+    vmovdqu         [g0+128], x2
+
+    ; Filter only if required.
+    cmp             g4, 0
+    je              .END
+
+    ; Pseudo code:
+    ; Register ordering : D7, D6 ... D0, C7, ... C0, E, A0, ..., A7, B0, ... 
B6, B7.
+    ; V[i] = (V[i-1] + 2*V[i] + V[i+1] + 2) >> 2
+    ; D7 = D7, B7 = B7
+
+    vpbroadcastd    y6, [pat_b_1]           ; Load pmadd pattern (actually, 
just an add and zero extend).
+    vpbroadcastd    y5, [pat_w_8192]        ; Load rounding bias.
+
+    vpslldq         x4, x2, 15              ; Move the top-left (e) to the 
last byte of the xmm register.
+    vpalignr        x3, x2, x0, 1           ; Remove D7 and insert E next to 
C0.
+                                            ; All bytes are shifted by one. 
Named D|C*.
+    vpalignr        x4, x1, x4, 15          ; Remove B7 and insert E next to 
A0.
+                                            ; All bytes are shifted by one. 
Named A|B*.
+
+    vinserti128     y0, y0, x1, 1           ; Pack D|C with A|B.
+    vinserti128     y3, y3, x4, 1           ; Pack D|C* with A|B*.
+
+    vpmaddubsw      y0, y0, y6              ; Add the neighbours together.
+    vpmaddubsw      y3, y3, y6              ; As D|C|AB* is DC|A|B offsetted 
by one byte, this will generate all
+                                            ; D|C and A|B peer. The innermost 
value of D|C|A|B* will be C0+E and E+A0.
+
+    vpaddw          y1, y0, y3              ; Add D|C|A|B to D|C|A|B*.
+    vpmulhrsw       y1, y1, y5              ; Round.
+
+    vpshufb         y3, [neigh_shift_pair]  ;
+    vpaddw          y0, y3, y0              ; Generate the missing pair sums.
+    vpmulhrsw       y0, y0, y5              ; Round.
+
+    vpackuswb       y0, y0, y1              ; Word to byte.
+    vpshufb         y0, [neigh_1_of_2]      ; Interleave the result.
+
+    vextracti128    x1, y0, 1
+
+    vpinsrb         x0, [g0+48], 0          ; Manage D7.
+    vmovdqu         [g0+160+48], x0         ; Save it.
+
+    vpinsrb         x1, [g0+79], 15         ; Manage B7.
+    vmovdqu         [g0+160+64], x1         ; Save it.
+
+    ; Filter top-left.
+    movzx           g2, byte [g0+128]       ; Load top-left.
+    movzx           g3, byte [g0+63]        ; Load top.
+    movzx           g4, byte [g0+64]        ; Load left.
+    lea             g2, [g2*2+g3+2]         ; Top-left * 2 + top + bias.
+    add             g2, g4                  ; Top-left * 2 + top + left + bias.
+    shr             g2, 2                   ; Round.
+    mov             [g0+160+128], g2b       ; Save filtered top-left.
+
+    .END:
+    RET
+
+
+    .LEFT_NOT_AVAILABLE:
+
+    ; Test if top is available.
+    cmp             g5, 0
+    jz              .NOTHING_AVAILABLE
+
+    mov             ga, g2
+    neg             ga
+    vmovdqu         x1, [g1+ga]             ; Load top value
+    vpbroadcastb    x0, x1                  ; Broadcast the first byte as the 
left value.
+    vmovdqa         x2, x1                  ; Set top-left.
+    jmp             .LEFT_AND_TOP_FETCHED
+
+
+    .TOP_NOT_AVAILABLE:
+
+    vpbroadcastd    x2, [pat_b_15]
+    vpshufb         x1, x0, x2              ; Replicate C0 as the top 
neighbours.
+    vmovdqa         x2, x1                  ; Set top-left.
+    jmp             .LEFT_AND_TOP_FETCHED
+
+
%%% Remove that, can't happen.
+    .LEFT_4_AVAILABLE:
+
+    ; Broadcast the 13th value over the invalid neighbours.
+    vpalignr        x3, x0, x0, 12
+    vpbroadcastb    x3, x3
+    vpblendd        x0, x0, x3, 0b0111
+    jmp             .BOTTOM_FETCHED
+
+
+    .BOTTOM_AVAILABLE:
+
+    ; Get D from the pred buffer.
+    lea             ga, [g1-4+8*g2]
+    load2           x3, x4, x5
+
+    lea             ga, [ga+g2*4]
+    load2           x4, x5, x6
+
+    vpblendd        x3, x3, x4, 0b0101
+
+    ; Merge C and D.
%%% Align.
+    vpblendd        x0, x0, x3, 0b0011      ; [f e d c b a 9 8 7 6 5 4 3 2 1 0].
+
+    cmp             g6, 12
+    ja             .BOTTOM_FETCHED
+
+    ; Broadcast the 5th value over the invalid neighbours.
+    vpalignr        x3, x0, x0, 4
+    vpbroadcastb    x3, x3
+    vpblendd        x0, x0, x3, 0b0001
+    jmp             .BOTTOM_FETCHED
+
+
+    .NOTHING_AVAILABLE:
+
+    vpbroadcastd    y0, [pat_b_128]         ; Store 128 everywhere.
+
+    vmovdqu         [g0+48], x0             ; Save it.
+    vmovdqu         [g0+64], x0
+    vmovd           [g0+128], x0
+
+    vmovdqu         [g0+160+48], x0         ; Save the filtered version.
+    vmovdqu         [g0+160+64], x0
+    vmovd           [g0+160+128], x0
+
%%% No such macro.
+    RET
+
+%unmacro load2 3
diff --git a/f265/bdi.h b/f265/bdi.h
index 9d2e4ce..c8fab2c 100644
--- a/f265/bdi.h
+++ b/f265/bdi.h
@@ -502,6 +502,7 @@ extern const int16_t f265_lambdas[52];
 extern uint16_t f265_mv_costs[52][F265_NB_MV_COSTS];
 extern const int8_t f265_hpel_src0[16];
 extern const int8_t f265_hpel_src1[16];
+extern const int8_t f265_mode_to_intra_pred[35];
 
 
 ///////////////////////////////////////////////////////////////////////////////
diff --git a/f265/bdi_ro.c b/f265/bdi_ro.c
index 05972f0..317655c 100644
--- a/f265/bdi_ro.c
+++ b/f265/bdi_ro.c
@@ -608,3 +608,6 @@ const int16_t f265_lambdas[52] =
 const int8_t f265_hpel_src0[16] = { 0, 0, 1, 1, 0, 1, 1, 1, 2, 2, 3, 3, 0, 1, 
1, 1 };
 const int8_t f265_hpel_src1[16] = { 9, 1, 9, 0, 2, 2, 3, 2, 9, 3, 9, 2, 2, 2, 
3, 2 };
 
+// Map intra prediction mode to optimized assembly function.
+const int8_t f265_mode_to_intra_pred[35] = {0, 1, 2, 3,3,3,3,3,3,3, 4, 
5,5,5,5,5,5,5, 6, 7,7,7,7,7,7,7,
+                                            8, 9,9,9,9,9,9,9, 10};
diff --git a/f265/enc.h b/f265/enc.h
index 3af6647..a717af9 100644
--- a/f265/enc.h
+++ b/f265/enc.h
@@ -798,7 +798,14 @@ typedef struct f265_intra_block
     int8_t intra_dist_mode;
 
     // Unfiltered/filtered neighbours of the current partition.
-    F265_ALIGN64 f265_pix neighbours[2][129];
+    // Layout:
+    // - [0-63]    : Bottom left and left neighbours, packed near index 63 
+    //               (index 63 is always the topmost left neighbour).
+    // - [64-127]  : Top and top left neighbours, packed near index 64.
+    //               (index 64 is always the leftmost top neighbour).
+    // - [128]     : Top-left neighbours
+    // - [129-159] : Alignment padding.
+    F265_ALIGN64 f265_pix neighbours[2][160];
 
 } f265_intra_block;
 
@@ -2831,17 +2838,13 @@ void fenc_mc_chroma_b(f265_enc_thread *t, f265_pix 
*dst, int dst_stride, f265_re
                       int packed_dims, int plane_off, int comp);
 
 // intra.c
-void fenc_get_intra_filter_flags(int *filter_edge_flag, int 
*filter_neighbour_flag, int *neighbour_bilinear_flag,
-                                 int comp, int lg_bs, int mode, int 
smooth_intra_flag);
+void fenc_get_intra_filter_flags(int *filter_edge_flag, int 
*filter_neighbour_flag,
+                                 int comp, int lg_bs, int mode);
 void fenc_get_intra_encode_flags(int *dst_flag, int *order, int comp, int 
lg_bs, int mode);
-void fenc_predict_intra_neighbours(f265_enc_thread *t, f265_pix dst[129], int 
rec_flag,
-                                   int comp, int bs, int ct_ox, int ct_oy);
-void fenc_filter_intra_neighbours(f265_pix *dst, f265_pix *src, int bs, int 
bd, int bilinear_flag);
-void fenc_predict_intra_planar(f265_pix *dst, f265_pix *nbuf, int lg_bs);
-void fenc_predict_intra_dc(f265_pix *dst, f265_pix *nbuf, int lg_bs, int 
filter_edge_flag);
-void fenc_predict_intra_angular(f265_pix *dst, f265_pix *nbuf, int lg_bs, int 
bd, int filter_edge_flag, int mode);
+void fenc_extract_intra_neighbours(f265_enc_thread *t, f265_pix dst[2][160],  
int ct_off[2],
+                                   int packed, int filter, int lg_bs);
 void fenc_predict_intra_mode(f265_pix *dst, f265_pix *neighbours, int lg_bs, 
int bd, int mode, int filter_edge_flag);
-void fenc_predict_intra(f265_enc_thread *t, f265_pix *dst, int comp, int 
lg_bs, int mode, int ct_ox, int ct_oy);
+void fenc_predict_intra_block(f265_enc_thread *t, f265_pix *dst, int comp, int 
lg_bs, int mode, int ct_ox, int ct_oy);
 void fenc_get_intra_pred_mode(f265_enc_thread *t, f265_cb *cb, int 
partition_idx, int *mpm_list);
 
 // inter.h
diff --git a/f265/intra.c b/f265/intra.c
index d512e2b..d17e3cf 100644
--- a/f265/intra.c
+++ b/f265/intra.c
@@ -45,13 +45,12 @@ static finline void fenc_get_intra_nb_avail(f265_enc_thread 
*t, int avail[2], in
 // - filter_edge_flag: true if the edges are filtered for 
DC/vertical/horizontal.
 // - filter_neighbour_flag: true if the neighbours are filtered.
 // - neighbour_bilinear_flag: true if the neighbours are filtered bilinearly.
-void fenc_get_intra_filter_flags(int *filter_edge_flag, int 
*filter_neighbour_flag, int *neighbour_bilinear_flag,
-                                 int comp, int lg_bs, int mode, int 
smooth_intra_flag)
+void fenc_get_intra_filter_flags(int *filter_edge_flag, int 
*filter_neighbour_flag,
+                                 int comp, int lg_bs, int mode)
 {
     int bs = 1<<lg_bs;
     *filter_edge_flag = !comp && bs != 32;
     *filter_neighbour_flag = !comp && f265_intra_mode_dist[mode] > 
f265_intra_dist_thresholds[lg_bs-2];
-    *neighbour_bilinear_flag = bs == 32 && smooth_intra_flag;
 }
 
 // Get the intra encoding flags.
@@ -66,163 +65,205 @@ void fenc_get_intra_encode_flags(int *dst_flag, int 
*order, int comp, int lg_bs,
 }
 
 // Extract the unfiltered neighbour pixels of the specified intra block for one
-// image component. 'src' points to the top-left neighbour pixel of the block,
-// 'nx' and 'ny' are the number of pixels to predict in each direction. 'bd' is
-// the bit depth.
-static finline void fenc_extract_intra_neighbours(f265_pix dst[129], f265_pix 
*src, int src_stride,
-                                                  int availx, int availy, int 
nx, int ny, int bd)
+// image component. 'src' points to the top-left pixel of the block,
+// 'avail' are the number of pixels available in each direction. 'packed' is
+// the bitdepth and the block size (bd << 8 | bs).
+static finline void fenc_extract_unfiltered_intra_neigh(f265_pix dst[129], 
f265_pix *src, int src_stride,
+                                                  int avail[2], int packed)
 {
     // The following logic relies on the slice layout restrictions.
+    int bs = packed&255;
+    int bd = packed>>8;
+
+    int nb_pix = bs*2;
+    int availx = avail[0];
+    int availy = avail[1];
 
     // Copy top-left tentatively.
-    dst[0] = src[0];
+    dst[128] = src[-1-src_stride];
 
     // Left is fully available, copy.
-    if (likely(availy >= ny))
+    if (likely(availy >= nb_pix))
     {
-        for (int i = 0; i < ny; i++) dst[65+i] = src[(1+i)*src_stride];
+        for (int i = 0; i < nb_pix; i++) dst[63-i] = src[i*src_stride-1];
     }
 
     // Left is partially available, copy and broadcast.
     else if (likely(availy > 0))
     {
-        for (int i = 0; i < availy; i++) dst[65+i] = src[(1+i)*src_stride];
-        f265_pix p = dst[64+availy];
-        for (int i = availy; i < ny; i++) dst[65+i] = p;
+        for (int i = 0; i < availy; i++) dst[63-i] = src[i*src_stride-1];
+        f265_pix p = dst[63-availy+1];
+        for (int i = availy; i < nb_pix; i++) dst[63-i] = p;
     }
 
     // Left and top-left are not available but top is. Broadcast the first
     // pixel directly above the block.
     else if (likely(availx > 0))
     {
-        f265_pix p = src[1];
-        dst[0] = p;
-        for (int i = 0; i < ny; i++) dst[65+i] = p;
+        f265_pix p = src[-src_stride];
+        dst[128] = p;
+        for (int i = 0; i < nb_pix; i++) dst[63-i] = p;
     }
 
     // Nothing is available, perform DC prediction.
     else
     {
         f265_pix p = 1<<(bd-1);
-        for (int i = 0; i < nx+1; i++) dst[i] = p;
-        for (int i = 0; i < ny; i++) dst[65+i] = p;
+        dst[128] = p;
+        for (int i = 0; i < nb_pix; i++) dst[64+i] = p;
+        for (int i = 0; i < nb_pix; i++) dst[63-i] = p;
         return;
     }
 
     // Top is fully available, copy.
-    if (likely(availx >= nx))
+    if (likely(availx >= nb_pix))
     {
-        for (int i = 0; i < nx; i++) dst[1+i] = src[1+i];
+        for (int i = 0; i < nb_pix; i++) dst[64+i] = src[i-src_stride];
     }
 
     // Top is partially available, copy and broadcast.
     else if (likely(availx > 0))
     {
-        for (int i = 0; i < availx; i++) dst[1+i] = src[1+i];
-        f265_pix p = dst[availx];
-        for (int i = availx; i < nx; i++) dst[1+i] = p;
+        for (int i = 0; i < availx; i++) dst[64+i] = src[i-src_stride];
+        f265_pix p = dst[64+availx-1];
+        for (int i = availx; i < nb_pix; i++) dst[64+i] = p;
     }
 
     // Top-left, top, top-right are not available. Broadcast the first pixel
     // directly left of the block.
     else
     {
-        f265_pix p = dst[65];
-        for (int i = 0; i < nx+1; i++) dst[i] = p;
+        f265_pix p = dst[63];
+        dst[128] = p;
+        for (int i = 0; i < nb_pix; i++) dst[64+i] = p;
     }
 }
 
-// Predict the unfiltered neighbour pixels of the specified intra block at
-// pixel offset (ct_ox, ct_oy) in the CTB with block size 'bs'. 'rec_flag' is
-// true if the reconstructed pixels are used for the prediction, false if the
-// source pixels are used as approximation. This function assumes that
-// constrained intra prediction is not used.
-//
-// Layout of the destination array, by offset: 0 (top-left), 1 (top and
-// top-right), 65 (left and bottom left).
-void fenc_predict_intra_neighbours(f265_enc_thread *t, f265_pix dst[129], int 
rec_flag,
-                                   int comp, int bs, int ct_ox, int ct_oy)
-{
-    int avail[2];
-    fenc_get_intra_nb_avail(t, avail, comp, ct_ox, ct_oy);
-    int plane_off = fenc_get_ctb_block_plane_off(t, comp, ct_ox-1, ct_oy-1);
-    f265_pix *src = rec_flag ? t->src_frame->rec_planes[comp ? 3+comp : 0] + 
plane_off :
-                               t->src_frame->src_planes[comp] + plane_off;
-    fenc_extract_intra_neighbours(dst, src, t->me.ref_stride, avail[0], 
avail[1], bs<<1, bs<<1,
-                                  t->enc->gd.bit_depth[!!comp]);
-}
-
 // Filter the neighbour pixels of the block specified.
-void fenc_filter_intra_neighbours(f265_pix *dst, f265_pix *src, int bs, int 
bd, int bilinear_flag)
+static void fenc_filter_intra_neighbours(f265_pix *dst, f265_pix *src, int bs, 
int bd, int bilinear_flag)
 {
     int bs2 = bs<<1;
-    int top_left = src[0], top_last = src[bs2], left_last = src[64+bs2];
+    int top_left = src[128], top_last = src[64+bs2-1], left_last = 
src[63-bs2+1];
 
     // Check for bilinear filtering.
     if (bilinear_flag)
     {
-        int top_middle = src[32], left_middle = src[64+32];
+        int top_middle = src[63+32], left_middle = src[64-32];
         int threshold = 1<<(bd-5);
         bilinear_flag = F265_ABS(top_left + top_last - (top_middle<<1)) < 
threshold &&
                         F265_ABS(top_left + left_last - (left_middle<<1)) < 
threshold;
         if (bilinear_flag)
         {
-            dst[0] = top_left;
-            dst[64] = top_last;
-            dst[64+64] = left_last;
+            dst[128] = top_left;
+            dst[127] = top_last;
+            dst[0] = left_last;
             for (int i = 0; i < 63; i++)
             {
-                dst[1+i] =  ((63-i)*top_left + (i+1)*top_last  + 32)>>6;
-                dst[65+i] = ((63-i)*top_left + (i+1)*left_last + 32)>>6;
+                dst[64+i] = ((63-i)*top_left + (i+1)*top_last  + 32)>>6;
+                dst[63-i] = ((63-i)*top_left + (i+1)*left_last + 32)>>6;
             }
             return;
         }
     }
 
     // Regular filtering.
-    dst[0] = ((top_left<<1) + src[1] + src[65] + 2)>>2;
-    dst[bs2] = top_last;
-    dst[64+bs2] = left_last;
-    for (int i = 1; i < bs2; i++) dst[i] = ((src[i]<<1) + src[i-1] + src[i+1] 
+ 2)>>2;
-    dst[65] = ((src[65]<<1) + top_left + src[66] + 2)>>2;
-    for (int i = 66; i < 64+bs2; i++) dst[i] = ((src[i]<<1) + src[i-1] + 
src[i+1] + 2)>>2;
+    dst[128] = ((top_left<<1) + src[64] + src[63] + 2)>>2;
+    dst[63+bs2] = top_last;
+    dst[64-bs2] = left_last;
+    dst[64] = ((src[64]<<1) + top_left + src[65] + 2)>>2;
+    dst[63] = ((src[63]<<1) + top_left + src[62] + 2)>>2;
+
+    for (int i = 1; i < bs2-1; i++)
+    {
+        dst[64+i] = ((src[64+i]<<1) + src[64+i-1] + src[64+i+1] + 2)>>2;
+        dst[63-i] = ((src[63-i]<<1) + src[63-i-1] + src[63-i+1] + 2)>>2;
+    }
+}
+
+// Extract and filter the transform block's neighbours.
+// In the assembly version, the neighbours are always filtered.
+// Packed:
+// - Bit 0-7  : bs
+// - Bit 8-15 : bd
+void fenc_extract_intra_neigh_c(f265_pix *dst, f265_pix *pred, int pred_stride,
+                                int avail[2], int filter, int packed)
+{
+    int bs = packed&255;
+    int bd = packed>>8;
+
+    fenc_extract_unfiltered_intra_neigh(dst, pred, pred_stride, avail, packed);
+
+    if (filter)
+        fenc_filter_intra_neighbours(dst+160, dst, bs, bd, (bs)==32);
+}
+
+// Extract and filter the transform block neighbours at the location indicated
+// by ct_off. Packed: rec_flag << 8 | comp
+void fenc_extract_intra_neighbours(f265_enc_thread *t, f265_pix dst[2][160], 
int ct_off[2],
+                                   int filter, int packed, int lg_bs)
+{
+    int comp = packed & 255;
+    int rec_flag = packed>>8;
+
+    int avail[2];
+    fenc_get_intra_nb_avail(t, avail, comp, ct_off[0], ct_off[1]);
+    int plane_off = fenc_get_ctb_block_plane_off(t, comp, ct_off[0], 
ct_off[1]);
+    f265_pix *src = rec_flag ? t->src_frame->rec_planes[comp ? 3+comp : 0] + 
plane_off :
+                               t->src_frame->src_planes[comp] + plane_off;
+
+    fenc_extract_intra_neigh[lg_bs-2](dst, src, t->me.ref_stride, avail, 
filter,
+                                      (t->enc->gd.bit_depth[!!comp] << 8) | 1 
<< lg_bs);
 }
 
 // Intra planar prediction.
-void fenc_predict_intra_planar(f265_pix *dst, f265_pix *nbuf, int lg_bs)
+// Packed:
+// - Bit 0    : filter_edge_flag
+// - Bit 1-7  : lg_bs
+// - Bit 8-15 : bd
+void fenc_predict_intra_planar_c(f265_pix *dst, f265_pix *nbuf, int mode, int 
packed)
 {
+    int lg_bs = (packed>>1) & 127;
+
     int bs = 1<<lg_bs;
-    int top_right = nbuf[1+bs];
-    int bottom_left = nbuf[65+bs];
+    int top_right = nbuf[64+bs];
+    int bottom_left = nbuf[63-bs];
     for (int y = 0; y < bs; y++)
         for (int x = 0; x < bs; x++)
-            dst[y*bs+x] = ((bs-1-x)*nbuf[65+y] + (bs-1-y)*nbuf[1+x] + 
(x+1)*top_right + (y+1)*bottom_left + bs)
+            dst[y*bs+x] = ((bs-1-x)*nbuf[63-y] + (bs-1-y)*nbuf[64+x] + 
(x+1)*top_right + (y+1)*bottom_left + bs)
                           >>(lg_bs+1);
 }
 
 // Intra DC prediction.
-void fenc_predict_intra_dc(f265_pix *dst, f265_pix *nbuf, int lg_bs, int 
filter_edge_flag)
+void fenc_predict_intra_dc_c(f265_pix *dst, f265_pix *nbuf, int mode, int 
packed)
 {
+    int lg_bs = (packed>>1) & 127;
+    int filter_edge_flag = packed&1;
+
     int bs = 1<<lg_bs;
     int dc_val = bs;
-    for (int i = 0; i < bs; i++) dc_val += nbuf[1+i] + nbuf[65+i];
+
+    for (int i = 0; i < bs; i++) dc_val += nbuf[63-i] + nbuf[64+i];
     dc_val = dc_val>>(lg_bs+1);
     for (int i = 0; i < bs*bs; i++) dst[i] = dc_val;
 
     if (filter_edge_flag)
     {
-        dst[0] = ((dc_val<<1) + nbuf[1] + nbuf[65] + 2)>>2;
+        dst[0] = ((dc_val<<1) + nbuf[64] + nbuf[63] + 2)>>2;
         for (int i = 1; i < bs; i++)
         {
-            dst[i] = (nbuf[i+1] + 3*dc_val + 2)>>2;
-            dst[i*bs] = (nbuf[65+i] + 3*dc_val + 2)>>2;
+            dst[i] = (nbuf[64+i] + 3*dc_val + 2)>>2;
+            dst[i*bs] = (nbuf[63-i] + 3*dc_val + 2)>>2;
         }
     }
 }
 
 // Intra angular prediction.
-void fenc_predict_intra_angular(f265_pix *dst, f265_pix *nbuf, int lg_bs, int 
bd, int filter_edge_flag, int mode)
+void fenc_predict_intra_angular_c(f265_pix *dst, f265_pix *nbuf, int mode, int 
packed)
 {
+    int filter_edge_flag = packed&1;
+    int lg_bs = (packed>>1) & 127;
+    int bd = packed>>8;
+
     int bs = 1<<lg_bs;
 
     // Flip the neighbours in the horizontal case.
@@ -230,8 +271,8 @@ void fenc_predict_intra_angular(f265_pix *dst, f265_pix 
*nbuf, int lg_bs, int bd
     f265_pix ntmp[129];
     if (hor_flag)
     {
-        ntmp[0] = nbuf[0];
-        for (int i = 0; i < (bs<<1); i++) { ntmp[1+i] = nbuf[65+i]; ntmp[65+i] 
= nbuf[1+i]; }
+        ntmp[128] = nbuf[128];
+        for (int i = 0; i < (bs<<1); i++) { ntmp[63-i] = nbuf[64+i]; 
ntmp[64+i] = nbuf[63-i]; }
         nbuf = ntmp;
     }
 
@@ -244,13 +285,13 @@ void fenc_predict_intra_angular(f265_pix *dst, f265_pix 
*nbuf, int lg_bs, int bd
     {
         for (int y = 0; y < bs; y++)
             for (int x = 0; x < bs; x++)
-                dst[y*bs+x] = nbuf[1+x];
+                dst[y*bs+x] = nbuf[64+x];
 
         if (filter_edge_flag)
         {
-            int top_left = nbuf[0], top = nbuf[1];
+            int top_left = nbuf[128], top = nbuf[64];
             for (int y = 0; y < bs; y++)
-                dst[y*bs] = F265_CLAMP(top + ((nbuf[65+y] - top_left)>>1), 0, 
(1<<bd)-1);
+                dst[y*bs] = F265_CLAMP(top + ((nbuf[63-y] - top_left)>>1), 0, 
(1<<bd)-1);
         }
     }
 
@@ -276,15 +317,16 @@ void fenc_predict_intra_angular(f265_pix *dst, f265_pix 
*nbuf, int lg_bs, int bd
             for (int i = 0; i < nb_projected; i++)
             {
                 inv_angle_sum += inv_angle;
-                ref[-2-i] = nbuf[64+(inv_angle_sum>>8)];
+                ref[-2-i] = nbuf[64-(inv_angle_sum>>8)];
             }
 
             // Copy the top-left and top pixels.
-            for (int i = 0; i < bs+1; i++) ref[-1+i] = nbuf[i];
+            ref[-1] = nbuf[128];
+            for (int i = 0; i < bs; i++) ref[i] = nbuf[64+i];
         }
 
         // Use the top and top-right neighbours.
-        else ref = nbuf+1;
+        else ref = nbuf+64;
 
         // Pass every row.
         int angle_sum = 0;
@@ -312,31 +354,38 @@ void fenc_predict_intra_angular(f265_pix *dst, f265_pix 
*nbuf, int lg_bs, int bd
 }
 
 // Predict the pixels of the intra mode specified.
-void fenc_predict_intra_mode(f265_pix *dst, f265_pix *neighbours, int lg_bs, 
int bd, int mode, int filter_edge_flag)
+// TODO: Stub. Should be inlined.
+inline void fenc_predict_intra_mode(f265_pix *dst, f265_pix *neighbours, int 
lg_bs, int bd, int mode,
+                                    int filter_edge_flag)
 {
-    if (mode == 0) fenc_predict_intra_planar(dst, neighbours, lg_bs);
-    else if (mode == 1) fenc_predict_intra_dc(dst, neighbours, lg_bs, 
filter_edge_flag);
-    else fenc_predict_intra_angular(dst, neighbours, lg_bs, bd, 
filter_edge_flag, mode);
+    // Get the function offset based on the block size.
+    int idx = (lg_bs-2)*11;
+
+    // Add the function offset based on the mode.
+    idx += f265_mode_to_intra_pred[mode];
+
+    // Generate the packed data.
+    int packed = filter_edge_flag;
+    packed |= (lg_bs<<1);
+    packed |= (bd<<8);
+
+    fenc_predict_intra[idx](dst, neighbours, mode, packed);
 }
 
 // Predict the intra block with the mode and the CTB offset specified.
-void fenc_predict_intra(f265_enc_thread *t, f265_pix *dst, int comp, int 
lg_bs, int mode, int ct_ox, int ct_oy)
+void fenc_predict_intra_block(f265_enc_thread *t, f265_pix *dst, int comp, int 
lg_bs, int mode, int ct_ox, int ct_oy)
 {
     f265_intra_block *ib = &t->intra_block;
     int chroma_flag = !!comp;
-    int bs = 1<<lg_bs;
     int bd = t->enc->gd.bit_depth[chroma_flag];
-    int smooth_intra_flag = F265_GET_FLAG(t->enc->gd.eflags, 
F265_PF_SMOOTH_INTRA);
-    int filter_edge_flag, filter_neighbour_flag, neighbour_bilinear_flag;
+    int filter_edge_flag, filter_neighbour_flag;
 
-    // Predict the unfiltered neighbours. Assuming 4:2:0.
-    fenc_predict_intra_neighbours(t, ib->neighbours[0], 1, comp, bs, ct_ox, 
ct_oy);
+    // Get and filter the neighbours.
+    fenc_get_intra_filter_flags(&filter_edge_flag, &filter_neighbour_flag,
+                                comp, lg_bs, mode);
+    int ct_off[2] = {ct_ox, ct_oy};
+    fenc_extract_intra_neighbours(t, ib->neighbours, ct_off, 
filter_neighbour_flag, 1<<8 | comp, lg_bs);
 
-    // Filter the neighbours.
-    fenc_get_intra_filter_flags(&filter_edge_flag, &filter_neighbour_flag, 
&neighbour_bilinear_flag,
-                                comp, lg_bs, mode, smooth_intra_flag);
-    if (filter_neighbour_flag) fenc_filter_intra_neighbours(ib->neighbours[1], 
ib->neighbours[0],
-                                                            bs, bd, 
neighbour_bilinear_flag);
     f265_pix *neighbours = ib->neighbours[filter_neighbour_flag];
 
     // Do the prediction.
diff --git a/f265/rec.c b/f265/rec.c
index decc88f..f261e28 100644
--- a/f265/rec.c
+++ b/f265/rec.c
@@ -932,7 +932,7 @@ int fenc_rec_intra_tb(f265_enc_thread *t, int comp, int 
lg_bs, int mode, int zer
 {
     f265_pix pred[32*32];
     int dst_flag, order;
-    fenc_predict_intra(t, pred, comp, lg_bs, mode, ct_ox, ct_oy);
+    fenc_predict_intra_block(t, pred, comp, lg_bs, mode, ct_ox, ct_oy);
     fenc_get_intra_encode_flags(&dst_flag, &order, comp, lg_bs, mode);
 
     if (F265_GET_FLAG(t->enc->gd.eflags, F265_PF_RDOQ))
diff --git a/snippets/asm.py b/snippets/asm.py
index 83a6d6d..75d8790 100755
--- a/snippets/asm.py
+++ b/snippets/asm.py
@@ -115,13 +115,17 @@ declare_dict = odict()
 # currently implemented in assembly. If "arch" is specified, it shadows
 # "arch_lbd" and "arch_hbd". If single_c=true, the same C function is mapped to
 # every slot.
-def declare_func(name, ret="void", args="", bd=0, indices=None, avx2=None, 
avx2_lbd=None, avx2_hbd=None, single_c=True):
+def declare_func(name, ret="void", args="", bd=0, indices=[""], c=1, 
c_lbd=None, c_hbd=None, \
+                 avx2=None, avx2_lbd=None, avx2_hbd=None, single_c=True):
     f = Function()
     f.name = name
     f.ret = ret
     f.args = args
     f.bd = bd
     f.indices = indices
+    f.c = c
+    f.c_lbd = c_lbd
+    f.c_hbd = c_hbd
     f.avx2 = avx2
     f.avx2_lbd = avx2_lbd
     f.avx2_hbd = avx2_hbd
@@ -132,16 +136,27 @@ def declare_func(name, ret="void", args="", bd=0, 
indices=None, avx2=None, avx2_
 def declare_all():
     df = declare_func
 
-    amp_indices =        ["2", "4", "8", "16", "32", "64", "6", "12", "24", 
"48"]
-    luma_amp_indices_x = ["X", "4", "8", "16", "32", "64", "X", "12", "24", 
"48"]
-    luma_amp_indices =   ["4", "8", "16", "32", "64", "12", "24", "48"]
+    amp_indices =       ["2", "4", "8", "16", "32", "64", "6", "12", "24", 
"48"]
+    luma_amp_indices =  ["X", "4", "8", "16", "32", "64", "X", "12", "24", 
"48"]
     luma_qpel_indices = []
+    luma_qpel_indices_c = []
     luma_qpel_indices_avx2 = []
-    for index in luma_amp_indices_x:
+    for index in luma_amp_indices:
         for frac in [ "h", "v", "d"]:
             luma_qpel_indices.append("X" if index == "X" else "%s_%s" % 
(index, frac))
-            if index != "X" and int(index) % 8 == 0:
-                luma_qpel_indices_avx2.append("%s_%s" % (index, frac))
+            luma_qpel_indices_c.append("X" if index == "X" else frac)
+            luma_qpel_indices_avx2.append("X" if index == "X" or int(index) % 
8 != 0 else "%s_%s" % (index, frac))
+
+    intra_pred_indices_seed = ["4", "8", "16", "32"]
+    intra_pred_indices_avx2_seed = ["X", "8", "X", "X"]
+    intra_pred_indices = []
+    intra_pred_indices_avx2 = []
+    for index in intra_pred_indices_seed:
+        for frac in [ "planar", "dc", "angular", "angular", "angular", 
"angular", "angular", "angular", "angular", "angular", "angular"]:
+            intra_pred_indices.append("X" if index == "X" else "%s" % (frac))
+    for index in intra_pred_indices_avx2_seed:
+        for frac in [ "planar", "dc", "dia_bot_left", "hor_bot", "hor", 
"hor_top", "dia_top_left", "ver_left", "ver", "ver_right", "dia_top_right"]:
+            intra_pred_indices_avx2.append("X" if index == "X" else "%s_%s" % 
(frac, index))
 
     # Declarations go here.
     df("dct", bd=1, single_c=False,
@@ -175,12 +190,12 @@ def declare_all():
 
     df("sad3", bd=1,
        args="int *costs, f265_pix *src, int src_stride, f265_pix **refs, int 
ref_stride, int packed_dims",
-       indices=luma_amp_indices_x,
+       indices=luma_amp_indices,
        avx2_lbd=1)
 
     df("sad4", bd=1,
        args="int *costs, f265_pix *src, int src_stride, f265_pix **refs, int 
ref_stride, int packed_dims",
-       indices=luma_amp_indices_x,
+       indices=luma_amp_indices,
        avx2_lbd=1)
 
     df("fssd", bd=1,
@@ -189,14 +204,24 @@ def declare_all():
 
     df("avg_pix", bd=1,
        args="f265_pix *dst, f265_pix *src0, int src0_stride, f265_pix *src1, 
int src1_stride, int packed_dims",
-       indices=luma_amp_indices_x,
-       avx2_lbd=["4", "8", "16", "32", "64", "12", "24", "48"])
+       indices=luma_amp_indices,
+       avx2_lbd=1)
 
-    df("interpol_luma_qpel_pix", bd=1,
+    df("interpol_luma_qpel_pix", bd=1, single_c=False,
        args="f265_pix *dst, int dst_stride, f265_pix *src, int src_stride, int 
frac, int packed_dims, uint8_t *spill",
        indices=luma_qpel_indices,
+       c=luma_qpel_indices_c,
        avx2_lbd=luma_qpel_indices_avx2)
 
+    df("predict_intra", bd=1, single_c=False,
+       args="f265_pix *dst, f265_pix *neighbours, int mode, int packed",
+       indices=intra_pred_indices,
+       avx2_lbd=intra_pred_indices_avx2)
+
+    df("extract_intra_neigh", bd=1, single_c=True,
+       args="f265_pix nbuf[2][160], f265_pix *pred, int pred_stride, int 
avail[2], int filter, int packed",
+       indices=["4", "8", "16", "32"],
+       avx2_lbd=["X", "8", "X", "X"])
 
 ### AVX2 SAD special code. ###
 def avx2_sad_special_code():
@@ -300,19 +325,19 @@ def get_output():
     for arch in arch_list:
         assign_text[arch] = ""
 
+    # List all function declarations.
+    function_list = set()
+
     # Pass every function.
     for f in declare_dict.values():
 
-        # Base function name.
-        base_func_name = "%s_%s" % (prog, f.name)
-
         # Iterate on the bit depths, if any.
         bd_list = ["lbd", "hbd"] if f.bd else [None]
         for bd in bd_list:
 
-            # Adjust the function name for the bit depth.
-            bd_func_name = base_func_name
-            if bd != None: bd_func_name = "%s_%s_%s" % (prog, bd, f.name)
+            # Function name. Include the bd only if defined.
+            bd_func_name = "%s_%s" % (prog, f.name)
+            if bd != None : bd_func_name = "%s_%s_%s" % (prog, bd, f.name)
 
             # Do the substitutions for the bit depth in the arguments and the
             # return type.
@@ -324,62 +349,55 @@ def get_output():
             typedef_text += "typedef %s(*%s_func)(%s);\n" % (func_ret_str, 
bd_func_name, func_args_str)
 
             # Declare the global variable, along with documentation.
-            var_indice_str = "[%d]" % (len(f.indices)) if f.indices else ""
+            has_indices = len(f.indices) > 1 if f.indices != None else 0
+            var_indice_str = "[%d]" % (len(f.indices)) if has_indices else ""
             global_str = "%s_func %s%s;\n" % (bd_func_name, bd_func_name, 
var_indice_str)
             if bd != "hbd": global_var_text_lbd += global_str
             else: global_var_text_hbd += global_str
-            if f.indices != None: extern_var_text += "// Indices: %s.\n" % (", 
".join(f.indices))
+            if has_indices: extern_var_text += "// Indices: %s.\n" % (", 
".join(f.indices))
             extern_var_text += "extern " + global_str + "\n";
 
             # Iterate on the indices, if any.
             index_list = f.indices if f.indices != None else [None]
             for index_pos in range(len(index_list)):
-                index = index_list[index_pos]
-
-                # Adjust the function name for the index.
-                index_func_name = bd_func_name
-                if index != None: index_func_name += "_" + index
-
                 # Iterate on the architectures.
                 for arch in arch_list:
+                    index = None
 
-                    # Adjust the function name for the architecture.
-                    arch_func_name = index_func_name + "_" + arch
-                    if f.single_c and arch == "c":
-                        arch_func_name = bd_func_name + "_c"
-
-                    # Check whether the architecture supports this function.
+                    # Use f.[arch] if the generic architecture is defined.
+                    field = getattr(f, arch)
 
-                    # Skipped slot.
-                    if index == "X":
-                        support_flag = 0
+                    # Use the f.[arch_bd] if the bit depth is defined and the 
generic arch is not.
+                    if field == None and bd != None:
+                        field = getattr(f, "%s_%s" % (arch, bd))
 
-                    # C always supports the function.
-                    elif arch == "c":
-                        support_flag = 1
+                    # If the field is true (1), use the default indices.
+                    if type(field) is int and field == 1:
+                        field = f.indices
 
-                    # Handle assembly.
-                    else:
-                        # Get the relevant fields.
-                        bdi_field = getattr(f, arch)
-                        bd_field = bdi_field if bd == None else getattr(f, 
"%s_%s" % (arch, bd))
+                    # Get the field value.
+                    if type(field) is list:
+                        index = field[index_pos]
 
-                        # Do the shadowing.
-                        field = bd_field if bdi_field == None else bdi_field
+                    # Add the architecture to the function name.
+                    arch_func_name = bd_func_name
+                    if index is not None and len(index): arch_func_name += "_" 
+ index
+                    arch_func_name += "_" + arch
+                    if arch == "c" and f.single_c :
+                        arch_func_name = bd_func_name + "_c"
 
-                        # Explicitly supported.
-                        support_flag = field == 1 or type(field) is list and 
index in field
+                    # Test if we should skip this index.
+                    support_flag = 0 if index == "X" or (index == None) else 1
 
                     # Declare the prototype.
                     if (arch == "c" and f.single_c and index_pos == 0) or\
                        (support_flag and (arch != "c" or not f.single_c)):
-                        # Kludge for the interpolation functions.
-                        if arch_func_name.find("interpol") != -1 and arch == 
"c":
-                            for frac in [ "h", "v", "d"]:
-                                proto_text += "%s %s_%s_c(%s);\n" % 
(func_ret_str, bd_func_name, frac, func_args_str);
-                        # Normal declaration.
-                        else:
-                            proto_text += "%s %s(%s);\n" % (func_ret_str, 
arch_func_name, func_args_str);
+                        s = "%s %s(%s);\n" % (func_ret_str, arch_func_name, 
func_args_str);
+
+                        # Insert it only if it's not already declared.
+                        if s not in function_list:
+                            proto_text += s
+                            function_list.add(s)
 
                     # Not supported, skip.
                     if not support_flag: continue
@@ -387,12 +405,8 @@ def get_output():
                     # Do the assignments.
                     assign_tabs = "    "
                     if arch != "c": assign_tabs += "    "
-                    assign_index_str = "[%d]" % (index_pos) if f.indices else 
""
-                    assign_val = arch_func_name
-                    # Kludge for the interpolation functions.
-                    if arch_func_name.find("interpol") != -1 and arch == "c":
-                        assign_val = "%s_%s_c" % (arch_func_name[:-2], 
index[-1])
-                    assign_text[arch] += "%s%s%s = %s;\n" % (assign_tabs, 
bd_func_name, assign_index_str, assign_val)
+                    assign_index_str = "[%d]" % (index_pos) if has_indices 
else ""
+                    assign_text[arch] += "%s%s%s = %s;\n" % (assign_tabs, 
bd_func_name, assign_index_str, arch_func_name)
 
         proto_text += "\n"
 
@@ -452,4 +466,3 @@ def main():
     write_file("../f265/asm.h", h_content)
 
 main()
-
diff --git a/test/data/bench.ini b/test/data/bench.ini
index 71a3621..ed943ad 100644
--- a/test/data/bench.ini
+++ b/test/data/bench.ini
@@ -123,4 +123,3 @@ hist_points=25 50 75
 
 # Report description added to the report (HTML-escaped).
 desc=Summary of the tests
-
diff --git a/test/data/videos.ini b/test/data/videos.ini
index 10084af..4ddefb7 100644
--- a/test/data/videos.ini
+++ b/test/data/videos.ini
@@ -92,4 +92,3 @@ path=Basketball_1920x1080_50.yuv
 resolution=1920x1080
 frames=500
 bitrate=640
-

Reply via email to