Here is the patch that adds the 8x8 assembly.

diff --git a/SConstruct b/SConstruct
index 744e05d..3141dbc 100644
--- a/SConstruct
+++ b/SConstruct
@@ -158,7 +158,7 @@ bdi_env = ref_env.Clone(CPPPATH = ['build', '.', 'f265/ktools'])
 for c_file in bdi_c_files:
     obj_files += bdi_env.Object('build/f265/' + c_file[:-2], 'f265/' + c_file)
 
-bdi_a_files = ['pixel.asm', 'dct.asm', 'encode.asm']
+bdi_a_files = ['pixel.asm', 'dct.asm', 'encode.asm', 'intra.asm']
 if f265_cfg['asm']:
     asm_dir = 'f265/asm/'
     asm_arch = "ARCH_X64" if mingw else "ARCH_AMD64"
@@ -214,7 +214,9 @@ if env['cli'] == 1:
     lib_to_link_static= ['build/cli/cli.o', 'build/libf265.a']
     if env['libav'] != 'none':
         if f265_cfg['static']:
-            for static_dep in ['libavformat', 'libavcodec','libavdevice', 'libavfilter', 'libavutil', 'libswscale' ]:
+            static_lib = [ 'libavformat', 'libavcodec','libavdevice', 'libavfilter', 'libavutil', 'libswscale' ]
+            if mingw: static_lib.extend([ 'libpthread', 'libz' ])
+            for static_dep in static_lib:
                 lib_to_link_static.append(os.path.join(f265_cfg['static'], static_dep + '.a'));
             cli_env.Append(LIBS = f265_cfg['static_dep'].split())
         else:
@@ -231,7 +233,9 @@ if env['yuvdiff'] == 1:
     lib_to_link_static= ['build/yuvdiff_d/yuvdiff.o']
     if env['libav'] != 'none':
         if f265_cfg['static']:
-            for static_dep in ['libavformat', 'libavcodec','libavdevice', 'libavfilter', 'libavutil', 'libswscale' ]:
+            static_lib = [ 'libavformat', 'libavcodec','libavdevice', 'libavfilter', 'libavutil', 'libswscale' ]
+            if mingw: static_lib.extend([ 'libpthread', 'libz' ])
+            for static_dep in static_lib:
                 lib_to_link_static.append(os.path.join(f265_cfg['static'], static_dep + '.a'));
             cli_env.Append(LIBS = f265_cfg['static_dep'].split())
         else:
diff --git a/doc/compile.txt b/doc/compile.txt
index e498ea3..41ef3f9 100644
--- a/doc/compile.txt
+++ b/doc/compile.txt
@@ -317,9 +317,10 @@ libraries are missing, use the static_dep to add/remove then.
 
     $ scons static_dep="z pthread ${LIB_TO_ADD}"
     
-For windows, you will need to add "ws2_32".
- 
-    $ scons static_dep="z pthread ws2_32"
+For Windows, you need to copy libpthread.a and libz.a into the static folder.
+The only dynamic library you need is "ws2_32".
+
+    $ scons static_dep="ws2_32"
 
 
 Step 1. Get and compile libav
@@ -367,7 +368,12 @@ the libraries.
 Copy the libraries. You must replace "${PATH_TO_LIBAV}" with the path to the
 libav root directory. 
 
-    & cp ${PATH_TO_LIBAV}/lib*/*.a static_libav/
+    $ cp ${PATH_TO_LIBAV}/lib*/*.a static_libav/
+
+On Windows, you also need to copy libpthread.a and libz.a into static_libav.
+
+    $ cp /c/mingw/mingw64/x86_64-w64-mingw32/lib/libpthread.a static_libav/
+    $ cp /c/mingw/mingw64/x86_64-w64-mingw32/lib/libz.a static_libav/
 
 
 Step 3. Configure and recompile f265
@@ -380,9 +386,9 @@ Start by displaying the configuration
 Find the line starting with "cflags" and copy the actual value. Then configure
 scons. 
 
-On Windows, you must first add "ws2_32" to static_dep.
+On Windows, you only need "ws2_32" in static_dep.
 
-    $ scons -h static_dep="z pthread ws2_32"
+    $ scons -h static_dep="ws2_32"
 
 Depending on your setup, libav might have linked against additional libraries.
 You must add them to the "static_dep" configuration. For example, if libav
diff --git a/f265/analyze.c b/f265/analyze.c
index c415cbf..c828145 100644
--- a/f265/analyze.c
+++ b/f265/analyze.c
@@ -523,7 +523,7 @@ static int64_t fenc_analyze_intra_tb(f265_enc_thread *t, f265_cb *cb, int *nz_fl
     int bs = 1<<lg_bs;
     int ref_stride = t->me.ref_stride;
     int plane_off = fenc_get_ctb_block_plane_off(t, comp, ct_ox, ct_oy);
-    int filter_edge_flag, filter_neighbour_flag, neighbour_bilinear_flag;
+    int filter_edge_flag, filter_neighbour_flag;
     int64_t cost;
     f265_pix *neighbours;
 
@@ -531,20 +531,20 @@ static int64_t fenc_analyze_intra_tb(f265_enc_thread *t, f265_cb *cb, int *nz_fl
     if (likely(ib->intra_neighbour_mode == 2))
     {
         // FIXME, optimize this.
-        fenc_get_intra_filter_flags(&filter_edge_flag, &filter_neighbour_flag, &neighbour_bilinear_flag,
-                                    comp, lg_bs, mode, 0);
+        fenc_get_intra_filter_flags(&filter_edge_flag, &filter_neighbour_flag,
+                                    comp, lg_bs, mode);
         neighbours = ib->neighbours[filter_neighbour_flag];
     }
 
     // Predict the reconstructed neighbours.
     else if (ib->intra_neighbour_mode == 1)
     {
-        int smooth_intra_flag = F265_GET_FLAG(t->enc->gd.eflags, F265_PF_SMOOTH_INTRA);
-        fenc_get_intra_filter_flags(&filter_edge_flag, &filter_neighbour_flag, &neighbour_bilinear_flag,
-                                    comp, lg_bs, mode, smooth_intra_flag);
-        fenc_predict_intra_neighbours(t, ib->neighbours[0], 1, comp, bs, ct_ox, ct_oy);
-        if (filter_neighbour_flag) fenc_filter_intra_neighbours(ib->neighbours[1], ib->neighbours[0], bs, 8,
-                                                                neighbour_bilinear_flag);
+        fenc_get_intra_filter_flags(&filter_edge_flag, &filter_neighbour_flag,
+                                    comp, lg_bs, mode);
+
+        int ct_o[2] = {ct_ox, ct_oy};
+        fenc_extract_intra_neighbours(t, ib->neighbours, ct_o, filter_neighbour_flag, 1<<8 | comp, lg_bs);
+
         neighbours = ib->neighbours[filter_neighbour_flag];
     }
 
@@ -552,7 +552,9 @@ static int64_t fenc_analyze_intra_tb(f265_enc_thread *t, f265_cb *cb, int *nz_fl
     else
     {
         // Do not filter anything when using the source pixels.
-        fenc_predict_intra_neighbours(t, ib->neighbours[0], 0, comp, bs, ct_ox, ct_oy);
+        int ct_o[2] = {ct_ox, ct_oy};
+        fenc_extract_intra_neighbours(t, ib->neighbours, ct_o, 0, 0<<8 | comp, lg_bs);
+
         filter_edge_flag = 0;
         neighbours = ib->neighbours[0];
     }
@@ -1801,7 +1803,6 @@ static int64_t fenc_analyze_intra_part_luma(f265_enc_thread *t, f265_cb *cb, int
 {
     f265_analysis *an = &t->an;
     f265_intra_block *ib = &t->intra_block;
-    int bs = 1<<lg_bs;
     int rdo_restore_flag = 1;
     int64_t best_cost;
 
@@ -1888,10 +1889,8 @@ static int64_t fenc_analyze_intra_part_luma(f265_enc_thread *t, f265_cb *cb, int
     // neighbours, then the previous passes also use cached neighbours.
     if (ib->cache_neighbour_flags[0]|ib->cache_neighbour_flags[1]|ib->cache_neighbour_flags[2])
     {
-        int bilinear_flag = bs == 32 && F265_GET_FLAG(t->enc->gd.eflags, F265_PF_SMOOTH_INTRA);
-        fenc_predict_intra_neighbours(t, ib->neighbours[0], 1, 0, bs,
-                                      cb->cb_off[0] + cb_ox, cb->cb_off[1] + cb_oy);
-        fenc_filter_intra_neighbours(ib->neighbours[1], ib->neighbours[0], bs, 8, bilinear_flag);
+        int ct_o[2] = {cb->cb_off[0] + cb_ox, cb->cb_off[1] + cb_oy};
+        fenc_extract_intra_neighbours(t, ib->neighbours, ct_o, 1, 1<<8 | 0, lg_bs);
     }
 
     // Set the partition data.
diff --git a/f265/asm.c b/f265/asm.c
index 7edecc3..99ec15a 100644
--- a/f265/asm.c
+++ b/f265/asm.c
@@ -136,6 +136,28 @@ void f265_hbd_interpol_luma_qpel_pix_h_c(int16_t *dst, int dst_stride, int16_t *
 void f265_hbd_interpol_luma_qpel_pix_v_c(int16_t *dst, int dst_stride, int16_t *src, int src_stride, int frac, int packed_dims, uint8_t *spill);
 void f265_hbd_interpol_luma_qpel_pix_d_c(int16_t *dst, int dst_stride, int16_t *src, int src_stride, int frac, int packed_dims, uint8_t *spill);
 
+void f265_lbd_predict_intra_planar_c(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_dc_c(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_angular_c(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_planar_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_dc_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_dia_bot_left_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_hor_bot_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_hor_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_hor_top_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_dia_top_left_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_vert_left_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_vert_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_vert_right_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_lbd_predict_intra_dia_top_right_8_avx2(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+void f265_hbd_predict_intra_planar_c(int16_t *dst, int16_t *neighbours, int mode, int packed);
+void f265_hbd_predict_intra_dc_c(int16_t *dst, int16_t *neighbours, int mode, int packed);
+void f265_hbd_predict_intra_angular_c(int16_t *dst, int16_t *neighbours, int mode, int packed);
+
+void f265_lbd_extract_intra_neigh_c(uint8_t nbuf[2][160], uint8_t *pred, int pred_stride, int avail[2], int filter, int packed);
+void f265_lbd_extract_intra_neigh_8_avx2(uint8_t nbuf[2][160], uint8_t *pred, int pred_stride, int avail[2], int filter, int packed);
+void f265_hbd_extract_intra_neigh_c(int16_t nbuf[2][160], int16_t *pred, int pred_stride, int avail[2], int filter, int packed);
+
 // Special code.
 #ifdef F265_HAVE_ASM
 int f265_lbd_fsad_12_avx2(uint8_t *src, int src_stride, uint8_t *ref, int ref_stride, int packed_dims)
@@ -360,6 +382,8 @@ f265_lbd_sad4_func f265_lbd_sad4[10];
 f265_lbd_fssd_func f265_lbd_fssd[5];
 f265_lbd_avg_pix_func f265_lbd_avg_pix[10];
 f265_lbd_interpol_luma_qpel_pix_func f265_lbd_interpol_luma_qpel_pix[30];
+f265_lbd_predict_intra_func f265_lbd_predict_intra[44];
+f265_lbd_extract_intra_neigh_func f265_lbd_extract_intra_neigh[4];
 
 f265_hbd_dct_func f265_hbd_dct[5];
 f265_hbd_idct_func f265_hbd_idct[5];
@@ -373,6 +397,8 @@ f265_hbd_sad4_func f265_hbd_sad4[10];
 f265_hbd_fssd_func f265_hbd_fssd[5];
 f265_hbd_avg_pix_func f265_hbd_avg_pix[10];
 f265_hbd_interpol_luma_qpel_pix_func f265_hbd_interpol_luma_qpel_pix[30];
+f265_hbd_predict_intra_func f265_hbd_predict_intra[44];
+f265_hbd_extract_intra_neigh_func f265_hbd_extract_intra_neigh[4];
 
 // Linkage at runtime.
 static void f265_link_asm(int avx2_flag)
@@ -549,6 +575,102 @@ static void f265_link_asm(int avx2_flag)
     f265_hbd_interpol_luma_qpel_pix[27] = f265_hbd_interpol_luma_qpel_pix_h_c;
     f265_hbd_interpol_luma_qpel_pix[28] = f265_hbd_interpol_luma_qpel_pix_v_c;
     f265_hbd_interpol_luma_qpel_pix[29] = f265_hbd_interpol_luma_qpel_pix_d_c;
+    f265_lbd_predict_intra[0] = f265_lbd_predict_intra_planar_c;
+    f265_lbd_predict_intra[1] = f265_lbd_predict_intra_dc_c;
+    f265_lbd_predict_intra[2] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[3] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[4] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[5] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[6] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[7] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[8] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[9] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[10] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[11] = f265_lbd_predict_intra_planar_c;
+    f265_lbd_predict_intra[12] = f265_lbd_predict_intra_dc_c;
+    f265_lbd_predict_intra[13] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[14] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[15] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[16] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[17] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[18] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[19] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[20] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[21] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[22] = f265_lbd_predict_intra_planar_c;
+    f265_lbd_predict_intra[23] = f265_lbd_predict_intra_dc_c;
+    f265_lbd_predict_intra[24] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[25] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[26] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[27] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[28] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[29] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[30] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[31] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[32] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[33] = f265_lbd_predict_intra_planar_c;
+    f265_lbd_predict_intra[34] = f265_lbd_predict_intra_dc_c;
+    f265_lbd_predict_intra[35] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[36] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[37] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[38] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[39] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[40] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[41] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[42] = f265_lbd_predict_intra_angular_c;
+    f265_lbd_predict_intra[43] = f265_lbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[0] = f265_hbd_predict_intra_planar_c;
+    f265_hbd_predict_intra[1] = f265_hbd_predict_intra_dc_c;
+    f265_hbd_predict_intra[2] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[3] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[4] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[5] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[6] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[7] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[8] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[9] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[10] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[11] = f265_hbd_predict_intra_planar_c;
+    f265_hbd_predict_intra[12] = f265_hbd_predict_intra_dc_c;
+    f265_hbd_predict_intra[13] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[14] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[15] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[16] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[17] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[18] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[19] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[20] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[21] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[22] = f265_hbd_predict_intra_planar_c;
+    f265_hbd_predict_intra[23] = f265_hbd_predict_intra_dc_c;
+    f265_hbd_predict_intra[24] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[25] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[26] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[27] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[28] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[29] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[30] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[31] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[32] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[33] = f265_hbd_predict_intra_planar_c;
+    f265_hbd_predict_intra[34] = f265_hbd_predict_intra_dc_c;
+    f265_hbd_predict_intra[35] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[36] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[37] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[38] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[39] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[40] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[41] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[42] = f265_hbd_predict_intra_angular_c;
+    f265_hbd_predict_intra[43] = f265_hbd_predict_intra_angular_c;
+    f265_lbd_extract_intra_neigh[0] = f265_lbd_extract_intra_neigh_c;
+    f265_lbd_extract_intra_neigh[1] = f265_lbd_extract_intra_neigh_c;
+    f265_lbd_extract_intra_neigh[2] = f265_lbd_extract_intra_neigh_c;
+    f265_lbd_extract_intra_neigh[3] = f265_lbd_extract_intra_neigh_c;
+    f265_hbd_extract_intra_neigh[0] = f265_hbd_extract_intra_neigh_c;
+    f265_hbd_extract_intra_neigh[1] = f265_hbd_extract_intra_neigh_c;
+    f265_hbd_extract_intra_neigh[2] = f265_hbd_extract_intra_neigh_c;
+    f265_hbd_extract_intra_neigh[3] = f265_hbd_extract_intra_neigh_c;
 
     #ifdef F265_HAVE_ASM
     if (avx2_flag)
@@ -631,6 +753,18 @@ static void f265_link_asm(int avx2_flag)
         f265_lbd_interpol_luma_qpel_pix[27] = f265_lbd_interpol_luma_qpel_pix_48_h_avx2;
         f265_lbd_interpol_luma_qpel_pix[28] = f265_lbd_interpol_luma_qpel_pix_48_v_avx2;
         f265_lbd_interpol_luma_qpel_pix[29] = f265_lbd_interpol_luma_qpel_pix_48_d_avx2;
+        f265_lbd_predict_intra[11] = f265_lbd_predict_intra_planar_8_avx2;
+        f265_lbd_predict_intra[12] = f265_lbd_predict_intra_dc_8_avx2;
+        f265_lbd_predict_intra[13] = f265_lbd_predict_intra_dia_bot_left_8_avx2;
+        f265_lbd_predict_intra[14] = f265_lbd_predict_intra_hor_bot_8_avx2;
+        f265_lbd_predict_intra[15] = f265_lbd_predict_intra_hor_8_avx2;
+        f265_lbd_predict_intra[16] = f265_lbd_predict_intra_hor_top_8_avx2;
+        f265_lbd_predict_intra[17] = f265_lbd_predict_intra_dia_top_left_8_avx2;
+        f265_lbd_predict_intra[18] = f265_lbd_predict_intra_vert_left_8_avx2;
+        f265_lbd_predict_intra[19] = f265_lbd_predict_intra_vert_8_avx2;
+        f265_lbd_predict_intra[20] = f265_lbd_predict_intra_vert_right_8_avx2;
+        f265_lbd_predict_intra[21] = f265_lbd_predict_intra_dia_top_right_8_avx2;
+        f265_lbd_extract_intra_neigh[1] = f265_lbd_extract_intra_neigh_8_avx2;
     }
     #endif
 }
diff --git a/f265/asm.h b/f265/asm.h
index 6402d1f..98e2f6d 100644
--- a/f265/asm.h
+++ b/f265/asm.h
@@ -30,6 +30,10 @@ typedef void(*f265_lbd_avg_pix_func)(uint8_t *dst, uint8_t *src0, int src0_strid
 typedef void(*f265_hbd_avg_pix_func)(int16_t *dst, int16_t *src0, int src0_stride, int16_t *src1, int src1_stride, int packed_dims);
 typedef void(*f265_lbd_interpol_luma_qpel_pix_func)(uint8_t *dst, int dst_stride, uint8_t *src, int src_stride, int frac, int packed_dims, uint8_t *spill);
 typedef void(*f265_hbd_interpol_luma_qpel_pix_func)(int16_t *dst, int dst_stride, int16_t *src, int src_stride, int frac, int packed_dims, uint8_t *spill);
+typedef void(*f265_lbd_predict_intra_func)(uint8_t *dst, uint8_t *neighbours, int mode, int packed);
+typedef void(*f265_hbd_predict_intra_func)(int16_t *dst, int16_t *neighbours, int mode, int packed);
+typedef void(*f265_lbd_extract_intra_neigh_func)(uint8_t nbuf[2][160], uint8_t *pred, int pred_stride, int avail[2], int filter, int packed);
+typedef void(*f265_hbd_extract_intra_neigh_func)(int16_t nbuf[2][160], int16_t *pred, int pred_stride, int avail[2], int filter, int packed);
 
 // Globals.
 
@@ -103,4 +107,16 @@ extern f265_lbd_interpol_luma_qpel_pix_func f265_lbd_interpol_luma_qpel_pix[30];
 // Indices: X, X, X, 4_h, 4_v, 4_d, 8_h, 8_v, 8_d, 16_h, 16_v, 16_d, 32_h, 32_v, 32_d, 64_h, 64_v, 64_d, X, X, X, 12_h, 12_v, 12_d, 24_h, 24_v, 24_d, 48_h, 48_v, 48_d.
 extern f265_hbd_interpol_luma_qpel_pix_func f265_hbd_interpol_luma_qpel_pix[30];
 
+// Indices: planar, dc, angular, angular, angular, angular, angular, angular, angular, angular, angular, planar, dc, angular, angular, angular, angular, angular, angular, angular, angular, angular, planar, dc, angular, angular, angular, angular, angular, angular, angular, angular, angular, planar, dc, angular, angular, angular, angular, angular, angular, angular, angular, angular.
+extern f265_lbd_predict_intra_func f265_lbd_predict_intra[44];
+
+// Indices: planar, dc, angular, angular, angular, angular, angular, angular, angular, angular, angular, planar, dc, angular, angular, angular, angular, angular, angular, angular, angular, angular, planar, dc, angular, angular, angular, angular, angular, angular, angular, angular, angular, planar, dc, angular, angular, angular, angular, angular, angular, angular, angular, angular.
+extern f265_hbd_predict_intra_func f265_hbd_predict_intra[44];
+
+// Indices: 4, 8, 16, 32.
+extern f265_lbd_extract_intra_neigh_func f265_lbd_extract_intra_neigh[4];
+
+// Indices: 4, 8, 16, 32.
+extern f265_hbd_extract_intra_neigh_func f265_hbd_extract_intra_neigh[4];
+
 
diff --git a/f265/bdi.h b/f265/bdi.h
index 9d2e4ce..c8fab2c 100644
--- a/f265/bdi.h
+++ b/f265/bdi.h
@@ -502,6 +502,7 @@ extern const int16_t f265_lambdas[52];
 extern uint16_t f265_mv_costs[52][F265_NB_MV_COSTS];
 extern const int8_t f265_hpel_src0[16];
 extern const int8_t f265_hpel_src1[16];
+extern const int8_t f265_mode_to_intra_pred[35];
 
 
 ///////////////////////////////////////////////////////////////////////////////
diff --git a/f265/bdi_ro.c b/f265/bdi_ro.c
index 05972f0..32e2ba7 100644
--- a/f265/bdi_ro.c
+++ b/f265/bdi_ro.c
@@ -608,3 +608,5 @@ const int16_t f265_lambdas[52] =
 const int8_t f265_hpel_src0[16] = { 0, 0, 1, 1, 0, 1, 1, 1, 2, 2, 3, 3, 0, 1, 1, 1 };
 const int8_t f265_hpel_src1[16] = { 9, 1, 9, 0, 2, 2, 3, 2, 9, 3, 9, 2, 2, 2, 3, 2 };
 
+// Map intra prediction mode to optimized assembly function.
+const int8_t f265_mode_to_intra_pred[35] = {0, 1, 2, 3,3,3,3,3,3,3, 4, 5,5,5,5,5,5,5, 6, 7,7,7,7,7,7,7, 8, 9,9,9,9,9,9,9, 10};
diff --git a/f265/enc.h b/f265/enc.h
index 3af6647..79fbf64 100644
--- a/f265/enc.h
+++ b/f265/enc.h
@@ -798,7 +798,14 @@ typedef struct f265_intra_block
     int8_t intra_dist_mode;
 
     // Unfiltered/filtered neighbours of the current partition.
-    F265_ALIGN64 f265_pix neighbours[2][129];
+    // Layout:
+    // - [0-63]    : Bottom left and left, packed near index 63
+    //               (index 63 is always the top-most left neighbour).
+    // - [64-127]  : Top and top right, packed near index 64
+    //               (index 64 is always the left-most top neighbour).
+    // - [128]     : Top-left neighbour.
+    // - [129-159] : Alignment padding.
+    F265_ALIGN64 f265_pix neighbours[2][160];
 
 } f265_intra_block;
 
@@ -2831,17 +2838,13 @@ void fenc_mc_chroma_b(f265_enc_thread *t, f265_pix *dst, int dst_stride, f265_re
                       int packed_dims, int plane_off, int comp);
 
 // intra.c
-void fenc_get_intra_filter_flags(int *filter_edge_flag, int *filter_neighbour_flag, int *neighbour_bilinear_flag,
-                                 int comp, int lg_bs, int mode, int smooth_intra_flag);
+void fenc_get_intra_filter_flags(int *filter_edge_flag, int *filter_neighbour_flag,
+                                 int comp, int lg_bs, int mode);
 void fenc_get_intra_encode_flags(int *dst_flag, int *order, int comp, int lg_bs, int mode);
-void fenc_predict_intra_neighbours(f265_enc_thread *t, f265_pix dst[129], int rec_flag,
-                                   int comp, int bs, int ct_ox, int ct_oy);
-void fenc_filter_intra_neighbours(f265_pix *dst, f265_pix *src, int bs, int bd, int bilinear_flag);
-void fenc_predict_intra_planar(f265_pix *dst, f265_pix *nbuf, int lg_bs);
-void fenc_predict_intra_dc(f265_pix *dst, f265_pix *nbuf, int lg_bs, int filter_edge_flag);
-void fenc_predict_intra_angular(f265_pix *dst, f265_pix *nbuf, int lg_bs, int bd, int filter_edge_flag, int mode);
+void fenc_extract_intra_neighbours(f265_enc_thread *t, f265_pix dst[2][160],  int ct_o[2],
+                                   int packed, int filter, int lg_bs);
 void fenc_predict_intra_mode(f265_pix *dst, f265_pix *neighbours, int lg_bs, int bd, int mode, int filter_edge_flag);
-void fenc_predict_intra(f265_enc_thread *t, f265_pix *dst, int comp, int lg_bs, int mode, int ct_ox, int ct_oy);
+void fenc_predict_intra_block(f265_enc_thread *t, f265_pix *dst, int comp, int lg_bs, int mode, int ct_ox, int ct_oy);
 void fenc_get_intra_pred_mode(f265_enc_thread *t, f265_cb *cb, int partition_idx, int *mpm_list);
 
 // inter.h
diff --git a/f265/intra.c b/f265/intra.c
index d512e2b..f97560c 100644
--- a/f265/intra.c
+++ b/f265/intra.c
@@ -45,13 +45,12 @@ static finline void fenc_get_intra_nb_avail(f265_enc_thread *t, int avail[2], in
 // - filter_edge_flag: true if the edges are filtered for DC/vertical/horizontal.
 // - filter_neighbour_flag: true if the neighbours are filtered.
 // - neighbour_bilinear_flag: true if the neighbours are filtered bilinearly.
-void fenc_get_intra_filter_flags(int *filter_edge_flag, int *filter_neighbour_flag, int *neighbour_bilinear_flag,
-                                 int comp, int lg_bs, int mode, int smooth_intra_flag)
+void fenc_get_intra_filter_flags(int *filter_edge_flag, int *filter_neighbour_flag,
+                                 int comp, int lg_bs, int mode)
 {
     int bs = 1<<lg_bs;
     *filter_edge_flag = !comp && bs != 32;
     *filter_neighbour_flag = !comp && f265_intra_mode_dist[mode] > f265_intra_dist_thresholds[lg_bs-2];
-    *neighbour_bilinear_flag = bs == 32 && smooth_intra_flag;
 }
 
 // Get the intra encoding flags.
@@ -69,160 +68,169 @@ void fenc_get_intra_encode_flags(int *dst_flag, int *order, int comp, int lg_bs,
 // image component. 'src' points to the top-left neighbour pixel of the block,
 // 'nx' and 'ny' are the number of pixels to predict in each direction. 'bd' is
 // the bit depth.
-static finline void fenc_extract_intra_neighbours(f265_pix dst[129], f265_pix *src, int src_stride,
-                                                  int availx, int availy, int nx, int ny, int bd)
+static finline void fenc_extract_unfiltered_intra_neigh(f265_pix dst[129], f265_pix *src, int src_stride,
+                                                  int avail[2], int packed)
 {
     // The following logic relies on the slice layout restrictions.
+    int nb_pix = (packed&255)*2;
+    int availx = avail[0];
+    int availy = avail[1];
 
     // Copy top-left tentatively.
-    dst[0] = src[0];
+    dst[128] = src[-1-src_stride];
 
     // Left is fully available, copy.
-    if (likely(availy >= ny))
+    if (likely(availy >= nb_pix))
     {
-        for (int i = 0; i < ny; i++) dst[65+i] = src[(1+i)*src_stride];
+        for (int i = 0; i < nb_pix; i++) dst[63-i] = src[i*src_stride-1];
     }
 
     // Left is partially available, copy and broadcast.
     else if (likely(availy > 0))
     {
-        for (int i = 0; i < availy; i++) dst[65+i] = src[(1+i)*src_stride];
-        f265_pix p = dst[64+availy];
-        for (int i = availy; i < ny; i++) dst[65+i] = p;
+        for (int i = 0; i < availy; i++) dst[63-i] = src[i*src_stride-1];
+        f265_pix p = dst[63-availy+1];
+        for (int i = availy; i < nb_pix; i++) dst[63-i] = p;
     }
 
     // Left and top-left are not available but top is. Broadcast the first
     // pixel directly above the block.
     else if (likely(availx > 0))
     {
-        f265_pix p = src[1];
-        dst[0] = p;
-        for (int i = 0; i < ny; i++) dst[65+i] = p;
+        f265_pix p = src[-src_stride];
+        dst[128] = p;
+        for (int i = 0; i < nb_pix; i++) dst[63-i] = p;
     }
 
     // Nothing is available, perform DC prediction.
     else
     {
-        f265_pix p = 1<<(bd-1);
-        for (int i = 0; i < nx+1; i++) dst[i] = p;
-        for (int i = 0; i < ny; i++) dst[65+i] = p;
+        f265_pix p = 1<<((packed>>8)-1);
+        dst[128] = p;
+        for (int i = 0; i < nb_pix; i++) dst[64+i] = p;
+        for (int i = 0; i < nb_pix; i++) dst[63-i] = p;
         return;
     }
 
     // Top is fully available, copy.
-    if (likely(availx >= nx))
+    if (likely(availx >= nb_pix))
     {
-        for (int i = 0; i < nx; i++) dst[1+i] = src[1+i];
+        for (int i = 0; i < nb_pix; i++) dst[64+i] = src[i-src_stride];
     }
 
     // Top is partially available, copy and broadcast.
     else if (likely(availx > 0))
     {
-        for (int i = 0; i < availx; i++) dst[1+i] = src[1+i];
-        f265_pix p = dst[availx];
-        for (int i = availx; i < nx; i++) dst[1+i] = p;
+        for (int i = 0; i < availx; i++) dst[64+i] = src[i-src_stride];
+        f265_pix p = dst[64+availx-1];
+        for (int i = availx; i < nb_pix; i++) dst[64+i] = p;
     }
 
     // Top-left, top, top-right are not available. Broadcast the first pixel
     // directly left of the block.
     else
     {
-        f265_pix p = dst[65];
-        for (int i = 0; i < nx+1; i++) dst[i] = p;
+        f265_pix p = dst[63];
+        dst[128] = p;
+        for (int i = 0; i < nb_pix; i++) dst[64+i] = p;
     }
 }
 
-// Predict the unfiltered neighbour pixels of the specified intra block at
-// pixel offset (ct_ox, ct_oy) in the CTB with block size 'bs'. 'rec_flag' is
-// true if the reconstructed pixels are used for the prediction, false if the
-// source pixels are used as approximation. This function assumes that
-// constrained intra prediction is not used.
-//
-// Layout of the destination array, by offset: 0 (top-left), 1 (top and
-// top-right), 65 (left and bottom left).
-void fenc_predict_intra_neighbours(f265_enc_thread *t, f265_pix dst[129], int rec_flag,
-                                   int comp, int bs, int ct_ox, int ct_oy)
-{
-    int avail[2];
-    fenc_get_intra_nb_avail(t, avail, comp, ct_ox, ct_oy);
-    int plane_off = fenc_get_ctb_block_plane_off(t, comp, ct_ox-1, ct_oy-1);
-    f265_pix *src = rec_flag ? t->src_frame->rec_planes[comp ? 3+comp : 0] + plane_off :
-                               t->src_frame->src_planes[comp] + plane_off;
-    fenc_extract_intra_neighbours(dst, src, t->me.ref_stride, avail[0], avail[1], bs<<1, bs<<1,
-                                  t->enc->gd.bit_depth[!!comp]);
-}
-
 // Filter the neighbour pixels of the block specified.
-void fenc_filter_intra_neighbours(f265_pix *dst, f265_pix *src, int bs, int bd, int bilinear_flag)
+static void fenc_filter_intra_neighbours(f265_pix *dst, f265_pix *src, int bs, int bd, int bilinear_flag)
 {
     int bs2 = bs<<1;
-    int top_left = src[0], top_last = src[bs2], left_last = src[64+bs2];
+    int top_left = src[128], top_last = src[64+bs2-1], left_last = src[63-bs2+1];
 
     // Check for bilinear filtering.
     if (bilinear_flag)
     {
-        int top_middle = src[32], left_middle = src[64+32];
+        int top_middle = src[63+32], left_middle = src[64-32];
         int threshold = 1<<(bd-5);
         bilinear_flag = F265_ABS(top_left + top_last - (top_middle<<1)) < threshold &&
                         F265_ABS(top_left + left_last - (left_middle<<1)) < threshold;
         if (bilinear_flag)
         {
-            dst[0] = top_left;
-            dst[64] = top_last;
-            dst[64+64] = left_last;
+            dst[128] = top_left;
+            dst[127] = top_last;
+            dst[0] = left_last;
             for (int i = 0; i < 63; i++)
             {
-                dst[1+i] =  ((63-i)*top_left + (i+1)*top_last  + 32)>>6;
-                dst[65+i] = ((63-i)*top_left + (i+1)*left_last + 32)>>6;
+                dst[64+i] = ((63-i)*top_left + (i+1)*top_last  + 32)>>6;
+                dst[63-i] = ((63-i)*top_left + (i+1)*left_last + 32)>>6;
             }
             return;
         }
     }
 
     // Regular filtering.
-    dst[0] = ((top_left<<1) + src[1] + src[65] + 2)>>2;
-    dst[bs2] = top_last;
-    dst[64+bs2] = left_last;
-    for (int i = 1; i < bs2; i++) dst[i] = ((src[i]<<1) + src[i-1] + src[i+1] + 2)>>2;
-    dst[65] = ((src[65]<<1) + top_left + src[66] + 2)>>2;
-    for (int i = 66; i < 64+bs2; i++) dst[i] = ((src[i]<<1) + src[i-1] + src[i+1] + 2)>>2;
+    dst[128] = ((top_left<<1) + src[64] + src[63] + 2)>>2;
+    dst[63+bs2] = top_last;
+    dst[64-bs2] = left_last;
+    dst[64] = ((src[64]<<1) + top_left + src[65] + 2)>>2;
+    dst[63] = ((src[63]<<1) + top_left + src[62] + 2)>>2;
+
+    for (int i = 1; i < bs2-1; i++)
+    {
+        dst[64+i] = ((src[64+i]<<1) + src[64+i-1] + src[64+i+1] + 2)>>2;
+        dst[63-i] = ((src[63-i]<<1) + src[63-i-1] + src[63-i+1] + 2)>>2;
+    }
 }
 
 // Intra planar prediction.
-void fenc_predict_intra_planar(f265_pix *dst, f265_pix *nbuf, int lg_bs)
+// Packed:
+// - Bit 0    : filter_edge_flag
+// - Bit 1-7  : lg_bs
+// - Bit 8-15 : bd
+void fenc_predict_intra_planar_c(f265_pix *dst, f265_pix *nbuf, int mode, int packed)
 {
+    int lg_bs = (packed>>1) & 127;
     int bs = 1<<lg_bs;
-    int top_right = nbuf[1+bs];
-    int bottom_left = nbuf[65+bs];
+    int top_right = nbuf[64+bs];
+    int bottom_left = nbuf[63-bs];
     for (int y = 0; y < bs; y++)
         for (int x = 0; x < bs; x++)
-            dst[y*bs+x] = ((bs-1-x)*nbuf[65+y] + (bs-1-y)*nbuf[1+x] + (x+1)*top_right + (y+1)*bottom_left + bs)
+            dst[y*bs+x] = ((bs-1-x)*nbuf[63-y] + (bs-1-y)*nbuf[64+x] + (x+1)*top_right + (y+1)*bottom_left + bs)
                           >>(lg_bs+1);
 }
 
 // Intra DC prediction.
-void fenc_predict_intra_dc(f265_pix *dst, f265_pix *nbuf, int lg_bs, int filter_edge_flag)
+// Packed:
+// - Bit 0    : filter_edge_flag
+// - Bit 1-7  : lg_bs
+// - Bit 8-15 : bd
+void fenc_predict_intra_dc_c(f265_pix *dst, f265_pix *nbuf, int mode, int packed)
 {
+    int lg_bs = (packed>>1) & 127;
+    int filter_edge_flag = packed&1;
     int bs = 1<<lg_bs;
     int dc_val = bs;
-    for (int i = 0; i < bs; i++) dc_val += nbuf[1+i] + nbuf[65+i];
+
+    for (int i = 0; i < bs; i++) dc_val += nbuf[63-i] + nbuf[64+i];
     dc_val = dc_val>>(lg_bs+1);
     for (int i = 0; i < bs*bs; i++) dst[i] = dc_val;
 
     if (filter_edge_flag)
     {
-        dst[0] = ((dc_val<<1) + nbuf[1] + nbuf[65] + 2)>>2;
+        dst[0] = ((dc_val<<1) + nbuf[64] + nbuf[63] + 2)>>2;
         for (int i = 1; i < bs; i++)
         {
-            dst[i] = (nbuf[i+1] + 3*dc_val + 2)>>2;
-            dst[i*bs] = (nbuf[65+i] + 3*dc_val + 2)>>2;
+            dst[i] = (nbuf[64+i] + 3*dc_val + 2)>>2;
+            dst[i*bs] = (nbuf[63-i] + 3*dc_val + 2)>>2;
         }
     }
 }
 
 // Intra angular prediction.
-void fenc_predict_intra_angular(f265_pix *dst, f265_pix *nbuf, int lg_bs, int bd, int filter_edge_flag, int mode)
+// Packed:
+// - Bit 0    : filter_edge_flag
+// - Bit 1-7  : lg_bs
+// - Bit 8-15 : bd
+void fenc_predict_intra_angular_c(f265_pix *dst, f265_pix *nbuf, int mode, int packed)
 {
+    int bd = packed>>8;
+    int lg_bs = (packed>>1) & 127;
+    int filter_edge_flag = packed&1;
     int bs = 1<<lg_bs;
 
     // Flip the neighbours in the horizontal case.
@@ -230,8 +238,8 @@ void fenc_predict_intra_angular(f265_pix *dst, f265_pix *nbuf, int lg_bs, int bd
     f265_pix ntmp[129];
     if (hor_flag)
     {
-        ntmp[0] = nbuf[0];
-        for (int i = 0; i < (bs<<1); i++) { ntmp[1+i] = nbuf[65+i]; ntmp[65+i] = nbuf[1+i]; }
+        ntmp[128] = nbuf[128];
+        for (int i = 0; i < (bs<<1); i++) { ntmp[63-i] = nbuf[64+i]; ntmp[64+i] = nbuf[63-i]; }
         nbuf = ntmp;
     }
 
@@ -244,13 +252,13 @@ void fenc_predict_intra_angular(f265_pix *dst, f265_pix *nbuf, int lg_bs, int bd
     {
         for (int y = 0; y < bs; y++)
             for (int x = 0; x < bs; x++)
-                dst[y*bs+x] = nbuf[1+x];
+                dst[y*bs+x] = nbuf[64+x];
 
         if (filter_edge_flag)
         {
-            int top_left = nbuf[0], top = nbuf[1];
+            int top_left = nbuf[128], top = nbuf[64];
             for (int y = 0; y < bs; y++)
-                dst[y*bs] = F265_CLAMP(top + ((nbuf[65+y] - top_left)>>1), 0, (1<<bd)-1);
+                dst[y*bs] = F265_CLAMP(top + ((nbuf[63-y] - top_left)>>1), 0, (1<<bd)-1);
         }
     }
 
@@ -276,15 +284,16 @@ void fenc_predict_intra_angular(f265_pix *dst, f265_pix *nbuf, int lg_bs, int bd
             for (int i = 0; i < nb_projected; i++)
             {
                 inv_angle_sum += inv_angle;
-                ref[-2-i] = nbuf[64+(inv_angle_sum>>8)];
+                ref[-2-i] = nbuf[64-(inv_angle_sum>>8)];
             }
 
             // Copy the top-left and top pixels.
-            for (int i = 0; i < bs+1; i++) ref[-1+i] = nbuf[i];
+            ref[-1] = nbuf[128];
+            for (int i = 0; i < bs; i++) ref[i] = nbuf[64+i];
         }
 
         // Use the top and top-right neighbours.
-        else ref = nbuf+1;
+        else ref = nbuf+64;
 
         // Pass every row.
         int angle_sum = 0;
@@ -311,32 +320,59 @@ void fenc_predict_intra_angular(f265_pix *dst, f265_pix *nbuf, int lg_bs, int bd
             }
 }
 
+// Extract and filter the transform block's neighbours.
+// In the assembly version, the neighbours are always filtered.
+// Packed:
+// - Bit 0-7  : bs
+// - Bit 8-15 : bd
+void fenc_extract_intra_neigh_c(f265_pix *dst, f265_pix *pred, int pred_stride,
+                                int avail[2], int filter, int packed)
+{
+    fenc_extract_unfiltered_intra_neigh(dst, pred, pred_stride, avail, packed);
+
+    if (filter)
+        fenc_filter_intra_neighbours(dst+160, dst, packed&255, packed>>8, (packed&255)==32);
+}
+
+// Extract and filter the transform block neighbours at the location indicated by ct_ox and ct_oy.
+void fenc_extract_intra_neighbours(f265_enc_thread *t, f265_pix dst[2][160], int ct_o[2],
+                                   int filter, int packed, int lg_bs)  /* rec_flag << 8 | comp */
+{
+    // Get the unfiltered neighbours.
+    int comp = packed & 255;
+    int rec_flag = packed>>8;
+
+    int avail[2];
+    fenc_get_intra_nb_avail(t, avail, comp, ct_o[0], ct_o[1]);
+    int plane_off = fenc_get_ctb_block_plane_off(t, comp, ct_o[0], ct_o[1]);
+    f265_pix *src = rec_flag ? t->src_frame->rec_planes[comp ? 3+comp : 0] + plane_off :
+                               t->src_frame->src_planes[comp] + plane_off;
+
+    fenc_extract_intra_neigh[lg_bs-2](dst, src, t->me.ref_stride, avail, filter,
+                                      (t->enc->gd.bit_depth[!!comp] << 8) | 1 << lg_bs);
+}
+
 // Predict the pixels of the intra mode specified.
-void fenc_predict_intra_mode(f265_pix *dst, f265_pix *neighbours, int lg_bs, int bd, int mode, int filter_edge_flag)
+// TODO: Stub. Should be inlined.
+inline void fenc_predict_intra_mode(f265_pix *dst, f265_pix *neighbours, int lg_bs, int bd, int mode, int filter_edge_flag)
 {
-    if (mode == 0) fenc_predict_intra_planar(dst, neighbours, lg_bs);
-    else if (mode == 1) fenc_predict_intra_dc(dst, neighbours, lg_bs, filter_edge_flag);
-    else fenc_predict_intra_angular(dst, neighbours, lg_bs, bd, filter_edge_flag, mode);
+    fenc_predict_intra[(lg_bs-2)*11 + f265_mode_to_intra_pred[mode]](dst, neighbours, mode, (filter_edge_flag|(lg_bs<<1)|(bd<<8)));
 }
 
 // Predict the intra block with the mode and the CTB offset specified.
-void fenc_predict_intra(f265_enc_thread *t, f265_pix *dst, int comp, int lg_bs, int mode, int ct_ox, int ct_oy)
+void fenc_predict_intra_block(f265_enc_thread *t, f265_pix *dst, int comp, int lg_bs, int mode, int ct_ox, int ct_oy)
 {
     f265_intra_block *ib = &t->intra_block;
     int chroma_flag = !!comp;
-    int bs = 1<<lg_bs;
     int bd = t->enc->gd.bit_depth[chroma_flag];
-    int smooth_intra_flag = F265_GET_FLAG(t->enc->gd.eflags, F265_PF_SMOOTH_INTRA);
-    int filter_edge_flag, filter_neighbour_flag, neighbour_bilinear_flag;
+    int filter_edge_flag, filter_neighbour_flag;
 
-    // Predict the unfiltered neighbours. Assuming 4:2:0.
-    fenc_predict_intra_neighbours(t, ib->neighbours[0], 1, comp, bs, ct_ox, ct_oy);
+    // Get and filter the neighbours.
+    fenc_get_intra_filter_flags(&filter_edge_flag, &filter_neighbour_flag,
+                                comp, lg_bs, mode);
+    int ct_o[2] = {ct_ox, ct_oy};
+    fenc_extract_intra_neighbours(t, ib->neighbours, ct_o, filter_neighbour_flag, 1<<8 | comp, lg_bs);
 
-    // Filter the neighbours.
-    fenc_get_intra_filter_flags(&filter_edge_flag, &filter_neighbour_flag, &neighbour_bilinear_flag,
-                                comp, lg_bs, mode, smooth_intra_flag);
-    if (filter_neighbour_flag) fenc_filter_intra_neighbours(ib->neighbours[1], ib->neighbours[0],
-                                                            bs, bd, neighbour_bilinear_flag);
     f265_pix *neighbours = ib->neighbours[filter_neighbour_flag];
 
     // Do the prediction.
diff --git a/f265/rec.c b/f265/rec.c
index decc88f..f261e28 100644
--- a/f265/rec.c
+++ b/f265/rec.c
@@ -932,7 +932,7 @@ int fenc_rec_intra_tb(f265_enc_thread *t, int comp, int lg_bs, int mode, int zer
 {
     f265_pix pred[32*32];
     int dst_flag, order;
-    fenc_predict_intra(t, pred, comp, lg_bs, mode, ct_ox, ct_oy);
+    fenc_predict_intra_block(t, pred, comp, lg_bs, mode, ct_ox, ct_oy);
     fenc_get_intra_encode_flags(&dst_flag, &order, comp, lg_bs, mode);
 
     if (F265_GET_FLAG(t->enc->gd.eflags, F265_PF_RDOQ))
diff --git a/snippets/asm.py b/snippets/asm.py
index 83a6d6d..8c3b00c 100755
--- a/snippets/asm.py
+++ b/snippets/asm.py
@@ -115,13 +115,17 @@ declare_dict = odict()
 # currently implemented in assembly. If "arch" is specified, it shadows
 # "arch_lbd" and "arch_hbd". If single_c=true, the same C function is mapped to
 # every slot.
-def declare_func(name, ret="void", args="", bd=0, indices=None, avx2=None, avx2_lbd=None, avx2_hbd=None, single_c=True):
+def declare_func(name, ret="void", args="", bd=0, indices=[""], c=1, c_lbd=None, c_hbd=None, \
+                 avx2=None, avx2_lbd=None, avx2_hbd=None, single_c=True):
     f = Function()
     f.name = name
     f.ret = ret
     f.args = args
     f.bd = bd
     f.indices = indices
+    f.c = c
+    f.c_lbd = c_lbd
+    f.c_hbd = c_hbd
     f.avx2 = avx2
     f.avx2_lbd = avx2_lbd
     f.avx2_hbd = avx2_hbd
@@ -132,16 +136,27 @@ def declare_func(name, ret="void", args="", bd=0, indices=None, avx2=None, avx2_
 def declare_all():
     df = declare_func
 
-    amp_indices =        ["2", "4", "8", "16", "32", "64", "6", "12", "24", "48"]
-    luma_amp_indices_x = ["X", "4", "8", "16", "32", "64", "X", "12", "24", "48"]
-    luma_amp_indices =   ["4", "8", "16", "32", "64", "12", "24", "48"]
+    amp_indices =       ["2", "4", "8", "16", "32", "64", "6", "12", "24", "48"]
+    luma_amp_indices =  ["X", "4", "8", "16", "32", "64", "X", "12", "24", "48"]
     luma_qpel_indices = []
+    luma_qpel_indices_c = []
     luma_qpel_indices_avx2 = []
-    for index in luma_amp_indices_x:
+    for index in luma_amp_indices:
         for frac in [ "h", "v", "d"]:
             luma_qpel_indices.append("X" if index == "X" else "%s_%s" % (index, frac))
-            if index != "X" and int(index) % 8 == 0:
-                luma_qpel_indices_avx2.append("%s_%s" % (index, frac))
+            luma_qpel_indices_c.append("X" if index == "X" else frac)
+            luma_qpel_indices_avx2.append("X" if index == "X" or int(index) % 8 != 0 else "%s_%s" % (index, frac))
+
+    intra_pred_indices_seed = ["4", "8", "16", "32"]
+    intra_pred_indices_avx2_seed = ["X", "8", "X", "X"]
+    intra_pred_indices = []
+    intra_pred_indices_avx2 = []
+    for index in intra_pred_indices_seed:
+        for frac in [ "planar", "dc", "angular", "angular", "angular", "angular", "angular", "angular", "angular", "angular", "angular"]:
+            intra_pred_indices.append("X" if index == "X" else "%s" % (frac))
+    for index in intra_pred_indices_avx2_seed:
+        for frac in [ "planar", "dc", "dia_bot_left", "hor_bot", "hor", "hor_top", "dia_top_left", "vert_left", "vert", "vert_right", "dia_top_right"]:
+            intra_pred_indices_avx2.append("X" if index == "X" else "%s_%s" % (frac, index))
 
     # Declarations go here.
     df("dct", bd=1, single_c=False,
@@ -175,12 +190,12 @@ def declare_all():
 
     df("sad3", bd=1,
        args="int *costs, f265_pix *src, int src_stride, f265_pix **refs, int ref_stride, int packed_dims",
-       indices=luma_amp_indices_x,
+       indices=luma_amp_indices,
        avx2_lbd=1)
 
     df("sad4", bd=1,
        args="int *costs, f265_pix *src, int src_stride, f265_pix **refs, int ref_stride, int packed_dims",
-       indices=luma_amp_indices_x,
+       indices=luma_amp_indices,
        avx2_lbd=1)
 
     df("fssd", bd=1,
@@ -189,14 +204,25 @@ def declare_all():
 
     df("avg_pix", bd=1,
        args="f265_pix *dst, f265_pix *src0, int src0_stride, f265_pix *src1, int src1_stride, int packed_dims",
-       indices=luma_amp_indices_x,
-       avx2_lbd=["4", "8", "16", "32", "64", "12", "24", "48"])
+       indices=luma_amp_indices,
+       avx2_lbd=1)
 
-    df("interpol_luma_qpel_pix", bd=1,
+    df("interpol_luma_qpel_pix", bd=1, single_c=False,
        args="f265_pix *dst, int dst_stride, f265_pix *src, int src_stride, int frac, int packed_dims, uint8_t *spill",
        indices=luma_qpel_indices,
+       c=luma_qpel_indices_c,
        avx2_lbd=luma_qpel_indices_avx2)
 
+    df("predict_intra", bd=1, single_c=False,
+       args="f265_pix *dst, f265_pix *neighbours, int mode, int packed",
+       indices=intra_pred_indices,
+       avx2_lbd=intra_pred_indices_avx2)
+
+
+    df("extract_intra_neigh", bd=1, single_c=True,
+       args="f265_pix nbuf[2][160], f265_pix *pred, int pred_stride, int avail[2], int filter, int packed",
+       indices=["4", "8", "16", "32"],
+       avx2_lbd=["X", "8", "X", "X"])
 
 ### AVX2 SAD special code. ###
 def avx2_sad_special_code():
@@ -300,19 +326,19 @@ def get_output():
     for arch in arch_list:
         assign_text[arch] = ""
 
+    # List all function declarations.
+    function_list = set()
+
     # Pass every function.
     for f in declare_dict.values():
 
-        # Base function name.
-        base_func_name = "%s_%s" % (prog, f.name)
-
         # Iterate on the bit depths, if any.
         bd_list = ["lbd", "hbd"] if f.bd else [None]
         for bd in bd_list:
 
-            # Adjust the function name for the bit depth.
-            bd_func_name = base_func_name
-            if bd != None: bd_func_name = "%s_%s_%s" % (prog, bd, f.name)
+            # Function name. Include the bd only if defined.
+            bd_func_name = "%s_%s" % (prog, f.name)
+            if bd != None : bd_func_name = "%s_%s_%s" % (prog, bd, f.name)
 
             # Do the substitutions for the bit depth in the arguments and the
             # return type.
@@ -324,62 +350,55 @@ def get_output():
             typedef_text += "typedef %s(*%s_func)(%s);\n" % (func_ret_str, bd_func_name, func_args_str)
 
             # Declare the global variable, along with documentation.
-            var_indice_str = "[%d]" % (len(f.indices)) if f.indices else ""
+            has_indices = len(f.indices) > 1 if f.indices != None else 0
+            var_indice_str = "[%d]" % (len(f.indices)) if has_indices else ""
             global_str = "%s_func %s%s;\n" % (bd_func_name, bd_func_name, var_indice_str)
             if bd != "hbd": global_var_text_lbd += global_str
             else: global_var_text_hbd += global_str
-            if f.indices != None: extern_var_text += "// Indices: %s.\n" % (", ".join(f.indices))
+            if has_indices: extern_var_text += "// Indices: %s.\n" % (", ".join(f.indices))
             extern_var_text += "extern " + global_str + "\n";
 
             # Iterate on the indices, if any.
             index_list = f.indices if f.indices != None else [None]
             for index_pos in range(len(index_list)):
-                index = index_list[index_pos]
-
-                # Adjust the function name for the index.
-                index_func_name = bd_func_name
-                if index != None: index_func_name += "_" + index
-
                 # Iterate on the architectures.
                 for arch in arch_list:
+                    index = None
 
-                    # Adjust the function name for the architecture.
-                    arch_func_name = index_func_name + "_" + arch
-                    if f.single_c and arch == "c":
-                        arch_func_name = bd_func_name + "_c"
-
-                    # Check whether the architecture supports this function.
+                    # Use the f.[arch] if no architecture is defined.
+                    field = getattr(f, arch)
 
-                    # Skipped slot.
-                    if index == "X":
-                        support_flag = 0
+                    # Use the f.[arch_bd] if the bit depth is defined and the generic arch is not.
+                    if field == None and bd != None:
+                        field = getattr(f, "%s_%s" % (arch, bd))
 
-                    # C always supports the function.
-                    elif arch == "c":
-                        support_flag = 1
+                    # If the field is true (1), use the default field name.
+                    if type(field) is int and field == 1:
+                        field = f.indices
 
-                    # Handle assembly.
-                    else:
-                        # Get the relevant fields.
-                        bdi_field = getattr(f, arch)
-                        bd_field = bdi_field if bd == None else getattr(f, "%s_%s" % (arch, bd))
+                    # Get the field value.
+                    if type(field) is list is list:
+                        index = field[index_pos]
 
-                        # Do the shadowing.
-                        field = bd_field if bdi_field == None else bdi_field
+                    # Add the architecture to the function name.
+                    arch_func_name = bd_func_name
+                    if index is not None and len(index): arch_func_name += "_" + index
+                    arch_func_name += "_" + arch
+                    if arch == "c" and f.single_c :
+                        arch_func_name = bd_func_name + "_c"
 
-                        # Explicitly supported.
-                        support_flag = field == 1 or type(field) is list and index in field
+                    # Test if we should skip this index.
+                    support_flag = 0 if index == "X" or (index == None) else 1
 
                     # Declare the prototype.
                     if (arch == "c" and f.single_c and index_pos == 0) or\
                        (support_flag and (arch != "c" or not f.single_c)):
-                        # Kludge for the interpolation functions.
-                        if arch_func_name.find("interpol") != -1 and arch == "c":
-                            for frac in [ "h", "v", "d"]:
-                                proto_text += "%s %s_%s_c(%s);\n" % (func_ret_str, bd_func_name, frac, func_args_str);
-                        # Normal declaration.
-                        else:
-                            proto_text += "%s %s(%s);\n" % (func_ret_str, arch_func_name, func_args_str);
+                        s = "%s %s(%s);\n" % (func_ret_str, arch_func_name, func_args_str);
+
+                        # Insert it only if it's not already declared.
+                        if s not in function_list:
+                            proto_text += s
+                            function_list.add(s)
 
                     # Not supported, skip.
                     if not support_flag: continue
@@ -387,12 +406,8 @@ def get_output():
                     # Do the assignments.
                     assign_tabs = "    "
                     if arch != "c": assign_tabs += "    "
-                    assign_index_str = "[%d]" % (index_pos) if f.indices else ""
-                    assign_val = arch_func_name
-                    # Kludge for the interpolation functions.
-                    if arch_func_name.find("interpol") != -1 and arch == "c":
-                        assign_val = "%s_%s_c" % (arch_func_name[:-2], index[-1])
-                    assign_text[arch] += "%s%s%s = %s;\n" % (assign_tabs, bd_func_name, assign_index_str, assign_val)
+                    assign_index_str = "[%d]" % (index_pos) if has_indices else ""
+                    assign_text[arch] += "%s%s%s = %s;\n" % (assign_tabs, bd_func_name, assign_index_str, arch_func_name)
 
         proto_text += "\n"
 
diff --git a/test/data/bench.ini b/test/data/bench.ini
index 71a3621..d11d908 100644
--- a/test/data/bench.ini
+++ b/test/data/bench.ini
@@ -32,7 +32,7 @@ work=work
 
 # List of videos to encode, as known to f265test. By default, all frames are
 # encoded. Use "video_name:nb_frames" to specify the number of frames to encode.
-videos=SpeedYUV:10 RaceHorsesYUV:20
+videos=speedyuv:30 racehorsesyuv:30
 
 # QP range, same as f265test.
 qp=10-30
@@ -71,7 +71,7 @@ gnuplot_font=
 #display=firefox $HTML_FILE#summary_speed_curve_ssim
 display=firefox $HTML_FILE
 
-# Test definition. The part following "result_" is the test name. It is scanned
+# Test definition. The part following "test_" is the test name. It is scanned
 # for the encoder name.
 [test_f265_base]
 
@@ -99,6 +99,24 @@ desc=A test for f265.
 #f265_special=
 
 
+
+
+[test_f265_intra]
+enc=f265
+clobber=0
+params=quality=25;ref=0;cb-range=3,4;tb-range=2,4
+desc=Intra assembly.
+
+[test_f265_intra_old]
+enc=f265
+bin=/home/daniel/code/f265/f265Clean/build/f265cli_
+clobber=0
+params=quality=25;ref=0;cb-range=3,4;tb-range=2,4;
+desc=No intra assembly.
+
+
+
+
 # Report section. The part following "report_" is the report section name.
 [report_summary]
 
@@ -123,4 +141,3 @@ hist_points=25 50 75
 
 # Report description added to the report (HTML-escaped).
 desc=Summary of the tests
-
diff --git a/test/data/videos.ini b/test/data/videos.ini
index 10084af..319501f 100644
--- a/test/data/videos.ini
+++ b/test/data/videos.ini
@@ -93,3 +93,14 @@ resolution=1920x1080
 frames=500
 bitrate=640
 
+[video_crash]
+path=crash.yuv
+resolution=640x480
+frames=300
+bitrate=640
+
+[video_space]
+path=sp.mp4
+resolution=768x432
+frames=950
+bitrate=640

Reply via email to