The primary objective of Video Complexity Analyzer (VCA) is to provide a fast 
spatial and temporal complexity predictor for every frame/video segment/video 
in order to enhance the prediction of encoding parameters for applications like 
online per-title encoding.

Example: ./ffmpeg -i input.y4m -vf vca=file=vca.csv -f null -

The `vca.csv` file contains the "E" and "h" features, which correspond to the 
spatial and temporal complexity of frames.

For example, for the video
https://download.blender.org/peach/bigbuckbunny_movies/BigBuckBunny_320x180.mp4
the following E and h values are expected for the first frames (POC 0-10):

POC,E,h
0,0,0.000000
1,0,0.000000
2,0,0.000000
3,0,0.000000
4,0,1.390741
5,0,1.788889
3,0,0.000000
4,0,1.390741
5,0,1.788889
6,0,1.670370
7,0,1.768519
8,0,3.114815
9,0,2.290741
10,0,2.459259

Signed-off-by: Hadi Amirpour <[email protected]>
Signed-off-by: mrcybercat <[email protected]>
---
 Changelog                     |   1 +
 libavfilter/Makefile          |   1 +
 libavfilter/allfilters.c      |   1 +
 libavfilter/vca_dct.c         | 598 +++++++++++++++++++++++
 libavfilter/vca_dct.h         | 131 +++++
 libavfilter/vf_vca.c          | 546 +++++++++++++++++++++
 libavfilter/x86/Makefile      |   1 +
 libavfilter/x86/vf_vca.asm    | 877 ++++++++++++++++++++++++++++++++++
 libavfilter/x86/vf_vca_init.c | 210 ++++++++
 9 files changed, 2366 insertions(+)
 create mode 100644 libavfilter/vca_dct.c
 create mode 100644 libavfilter/vca_dct.h
 create mode 100644 libavfilter/vf_vca.c
 create mode 100644 libavfilter/x86/vf_vca.asm
 create mode 100644 libavfilter/x86/vf_vca_init.c

diff --git a/Changelog b/Changelog
index aff0c78153..b26a263d23 100644
--- a/Changelog
+++ b/Changelog
@@ -47,6 +47,7 @@ version 8.1:
 - ProRes Vulkan encoder
 - LCEVC parser
 - LCEVC enhancement layer exporting in MPEG-TS
+- Add VCA filter


 version 8.0:
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 03bf51d3fd..18421b4801 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -564,6 +564,7 @@ OBJS-$(CONFIG_V360_FILTER)                   += vf_v360.o
 OBJS-$(CONFIG_V360_VULKAN_FILTER)            += vf_v360_vulkan.o
 OBJS-$(CONFIG_VAGUEDENOISER_FILTER)          += vf_vaguedenoiser.o
 OBJS-$(CONFIG_VARBLUR_FILTER)                += vf_varblur.o framesync.o
+OBJS-$(CONFIG_VCA_FILTER)                    += vf_vca.o vca_dct.o
 OBJS-$(CONFIG_VECTORSCOPE_FILTER)            += vf_vectorscope.o
 OBJS-$(CONFIG_VFLIP_FILTER)                  += vf_vflip.o
 OBJS-$(CONFIG_VFLIP_VULKAN_FILTER)           += vf_flip_vulkan.o vulkan.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 66c49d453b..ab81ec6414 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -530,6 +530,7 @@ extern const FFFilter ff_vf_v360;
 extern const FFFilter ff_vf_v360_vulkan;
 extern const FFFilter ff_vf_vaguedenoiser;
 extern const FFFilter ff_vf_varblur;
+extern const FFFilter ff_vf_vca;
 extern const FFFilter ff_vf_vectorscope;
 extern const FFFilter ff_vf_vflip;
 extern const FFFilter ff_vf_vflip_vulkan;
diff --git a/libavfilter/vca_dct.c b/libavfilter/vca_dct.c
new file mode 100644
index 0000000000..6d9a4f79f9
--- /dev/null
+++ b/libavfilter/vca_dct.c
@@ -0,0 +1,598 @@
+/*
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mem.h"
+
+#include "vca_dct.h"
+
+/*
+ * Type-generic absolute value: dispatches to abs(), labs() or llabs()
+ * according to the signed integer type of the argument. There is no default
+ * association, so unsigned or non-integer arguments fail to compile.
+ */
+#define safe_abs(n) _Generic((n), \
+    signed char: abs(n), short: abs(n), int: abs(n), \
+    long: labs(n), long long: llabs(n))
+
+
+static const int16_t weights_dct8[64] = { /* 8x8 weights, indexed like the coefficient buffer; ff_calc_weighted_coeff() applies them as (weight * |coeff|) >> 8. Entry 0 is 0. */
+    0,  27, 94,  94,  94,  94,  94,  95,  27, 94, 94,  95,  96,  97,  98,  99,
+    94, 94, 95,  97,  99,  101, 104, 107, 94, 95, 97,  99,  103, 107, 113, 120,
+    94, 96, 99,  103, 109, 116, 126, 138, 94, 97, 101, 107, 116, 128, 144, 164,
+    94, 98, 104, 113, 126, 144, 168, 201, 95, 99, 107, 120, 138, 164, 201, 255,
+};
+
+static const int16_t weights_dct16[256] = {
+    0,   27,  93,  93,  93,  93,  93,  93,  93,  93,  93,  94,  94,  94,  94,  
94,  27,  93,  93,
+    93,  93,  94,  94,  94,  94,  94,  94,  94,  94,  94,  95,  95,  93,  93,  
93,  94,  94,  94,
+    94,  94,  94,  95,  95,  95,  96,  96,  96,  97,  93,  93,  94,  94,  94,  
94,  94,  95,  95,
+    96,  96,  97,  97,  98,  99,  99,  93,  93,  94,  94,  94,  95,  95,  96,  
96,  97,  98,  99,
+    100, 101, 102, 103, 93,  94,  94,  94,  95,  95,  96,  97,  98,  99,  100, 
101, 102, 104, 106,
+    107, 93,  94,  94,  94,  95,  96,  97,  98,  99,  101, 102, 104, 106, 108, 
110, 113, 93,  94,
+    94,  95,  96,  97,  98,  99,  101, 103, 105, 107, 110, 113, 116, 120, 93,  
94,  94,  95,  96,
+    98,  99,  101, 103, 106, 108, 112, 115, 119, 123, 128, 93,  94,  95,  96,  
97,  99,  101, 103,
+    106, 109, 112, 116, 121, 126, 132, 138, 93,  94,  95,  96,  98,  100, 102, 
105, 108, 112, 117,
+    122, 128, 134, 142, 150, 94,  94,  95,  97,  99,  101, 104, 107, 112, 116, 
122, 128, 135, 144,
+    153, 164, 94,  94,  96,  97,  100, 102, 106, 110, 115, 121, 128, 135, 145, 
155, 167, 181, 94,
+    94,  96,  98,  101, 104, 108, 113, 119, 126, 134, 144, 155, 168, 183, 201, 
94,  95,  96,  99,
+    102, 106, 110, 116, 123, 132, 142, 153, 167, 183, 203, 225, 94,  95,  97,  
99,  103, 107, 113,
+    120, 128, 138, 150, 164, 181, 201, 225, 255,
+};
+
+static const int16_t weights_dct32[1024] = {
+    0,   27,  93,  93,  93,  93,  93,  93,  93,  93,  93,  93,  93,  93,  93,  
93,  93,  93,  93,
+    93,  93,  93,  93,  93,  93,  93,  93,  93,  93,  93,  93,  93,  27,  93,  
93,  93,  93,  93,
+    93,  93,  93,  93,  93,  93,  93,  93,  93,  93,  93,  93,  93,  93,  93,  
93,  93,  94,  94,
+    94,  94,  94,  94,  94,  94,  94,  93,  93,  93,  93,  93,  93,  93,  93,  
93,  93,  93,  93,
+    93,  93,  93,  94,  94,  94,  94,  94,  94,  94,  94,  94,  94,  94,  94,  
94,  94,  94,  94,
+    94,  93,  93,  93,  93,  93,  93,  93,  93,  93,  93,  93,  94,  94,  94,  
94,  94,  94,  94,
+    94,  94,  94,  94,  94,  94,  94,  94,  94,  94,  95,  95,  95,  95,  93,  
93,  93,  93,  93,
+    93,  93,  93,  93,  94,  94,  94,  94,  94,  94,  94,  94,  94,  94,  94,  
94,  94,  94,  95,
+    95,  95,  95,  95,  95,  95,  95,  96,  93,  93,  93,  93,  93,  93,  93,  
94,  94,  94,  94,
+    94,  94,  94,  94,  94,  94,  94,  94,  95,  95,  95,  95,  95,  95,  96,  
96,  96,  96,  96,
+    96,  97,  93,  93,  93,  93,  93,  93,  94,  94,  94,  94,  94,  94,  94,  
94,  94,  94,  95,
+    95,  95,  95,  95,  95,  96,  96,  96,  96,  97,  97,  97,  97,  98,  98,  
93,  93,  93,  93,
+    93,  94,  94,  94,  94,  94,  94,  94,  94,  94,  95,  95,  95,  95,  95,  
96,  96,  96,  96,
+    97,  97,  97,  98,  98,  98,  99,  99,  99,  93,  93,  93,  93,  93,  94,  
94,  94,  94,  94,
+    94,  94,  95,  95,  95,  95,  95,  96,  96,  96,  97,  97,  97,  98,  98,  
98,  99,  99,  100,
+    100, 101, 101, 93,  93,  93,  93,  94,  94,  94,  94,  94,  94,  94,  95,  
95,  95,  95,  96,
+    96,  96,  97,  97,  97,  98,  98,  99,  99,  100, 100, 101, 101, 102, 102, 
103, 93,  93,  93,
+    93,  94,  94,  94,  94,  94,  94,  95,  95,  95,  95,  96,  96,  96,  97,  
97,  98,  98,  99,
+    99,  100, 100, 101, 102, 102, 103, 104, 104, 105, 93,  93,  93,  94,  94,  
94,  94,  94,  94,
+    95,  95,  95,  96,  96,  96,  97,  97,  98,  98,  99,  99,  100, 100, 101, 
102, 102, 103, 104,
+    105, 106, 107, 107, 93,  93,  93,  94,  94,  94,  94,  94,  95,  95,  95,  
96,  96,  96,  97,
+    97,  98,  98,  99,  100, 100, 101, 102, 102, 103, 104, 105, 106, 107, 108, 
109, 110, 93,  93,
+    93,  94,  94,  94,  94,  94,  95,  95,  95,  96,  96,  97,  97,  98,  99,  
99,  100, 101, 101,
+    102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 93,  93,  93,  94,  
94,  94,  94,  95,
+    95,  95,  96,  96,  97,  97,  98,  99,  99,  100, 101, 102, 103, 104, 105, 
106, 107, 108, 109,
+    110, 112, 113, 115, 116, 93,  93,  94,  94,  94,  94,  94,  95,  95,  96,  
96,  97,  97,  98,
+    99,  99,  100, 101, 102, 103, 104, 105, 106, 107, 109, 110, 112, 113, 115, 
116, 118, 120, 93,
+    93,  94,  94,  94,  94,  95,  95,  95,  96,  96,  97,  98,  99,  99,  100, 
101, 102, 103, 104,
+    105, 107, 108, 109, 111, 113, 114, 116, 118, 120, 122, 124, 93,  93,  94,  
94,  94,  94,  95,
+    95,  96,  96,  97,  98,  98,  99,  100, 101, 102, 103, 104, 106, 107, 108, 
110, 112, 113, 115,
+    117, 119, 121, 123, 126, 128, 93,  93,  94,  94,  94,  94,  95,  95,  96,  
97,  97,  98,  99,
+    100, 101, 102, 103, 104, 106, 107, 109, 110, 112, 114, 116, 118, 120, 122, 
125, 127, 130, 133,
+    93,  93,  94,  94,  94,  95,  95,  96,  96,  97,  98,  99,  100, 101, 102, 
103, 104, 106, 107,
+    109, 110, 112, 114, 116, 119, 121, 123, 126, 129, 132, 135, 138, 93,  93,  
94,  94,  94,  95,
+    95,  96,  97,  97,  98,  99,  100, 101, 103, 104, 105, 107, 109, 110, 112, 
114, 117, 119, 122,
+    124, 127, 130, 133, 136, 140, 144, 93,  93,  94,  94,  94,  95,  95,  96,  
97,  98,  99,  100,
+    101, 102, 104, 105, 107, 108, 110, 112, 114, 117, 119, 122, 125, 128, 131, 
134, 138, 142, 146,
+    150, 93,  93,  94,  94,  94,  95,  96,  96,  97,  98,  99,  100, 102, 103, 
105, 106, 108, 110,
+    112, 114, 117, 119, 122, 125, 128, 131, 135, 139, 143, 147, 152, 157, 93,  
94,  94,  94,  95,
+    95,  96,  97,  98,  99,  100, 101, 102, 104, 106, 107, 109, 112, 114, 116, 
119, 122, 125, 128,
+    132, 135, 140, 144, 148, 153, 159, 164, 93,  94,  94,  94,  95,  95,  96,  
97,  98,  99,  100,
+    102, 103, 105, 107, 109, 111, 113, 116, 119, 122, 125, 128, 132, 136, 140, 
144, 149, 154, 160,
+    166, 172, 93,  94,  94,  94,  95,  96,  96,  97,  98,  100, 101, 102, 104, 
106, 108, 110, 113,
+    115, 118, 121, 124, 128, 131, 135, 140, 145, 150, 155, 161, 167, 174, 181, 
93,  94,  94,  94,
+    95,  96,  97,  98,  99,  100, 102, 103, 105, 107, 109, 112, 114, 117, 120, 
123, 127, 131, 135,
+    140, 144, 150, 155, 161, 168, 175, 182, 191, 93,  94,  94,  94,  95,  96,  
97,  98,  99,  101,
+    102, 104, 106, 108, 110, 113, 116, 119, 122, 126, 130, 134, 139, 144, 149, 
155, 161, 168, 175,
+    183, 192, 201, 93,  94,  94,  95,  95,  96,  97,  98,  100, 101, 103, 105, 
107, 109, 112, 115,
+    118, 121, 125, 129, 133, 138, 143, 148, 154, 161, 168, 175, 184, 193, 202, 
213, 93,  94,  94,
+    95,  95,  96,  97,  99,  100, 102, 104, 106, 108, 110, 113, 116, 120, 123, 
127, 132, 136, 142,
+    147, 153, 160, 167, 175, 183, 193, 203, 214, 225, 93,  94,  94,  95,  95,  
96,  98,  99,  101,
+    102, 104, 107, 109, 112, 115, 118, 122, 126, 130, 135, 140, 146, 152, 159, 
166, 174, 182, 192,
+    202, 214, 226, 239, 93,  94,  94,  95,  96,  97,  98,  99,  101, 103, 105, 
107, 110, 113, 116,
+    120, 124, 128, 133, 138, 144, 150, 157, 164, 172, 181, 191, 201, 213, 225, 
239, 255,
+};
+
+static const int16_t g_t4[4][4] =
+{ /* 4-point integer transform matrix, one output row per entry; partial_butterfly4() reads only the first two values of each row */
+    { 64, 64, 64, 64 },
+    { 83, 36, -36, -83 },
+    { 64, -64, -64, 64 },
+    { 36, -83, 83, -36 }
+};
+
+static const int16_t g_t8[8][8] =
+{ /* 8-point integer transform matrix, one output row per entry; partial_butterfly8() reads the first two values of even rows and the first four of odd rows */
+    { 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 89, 75, 50, 18, -18, -50, -75, -89 },
+    { 83, 36, -36, -83, -83, -36, 36, 83 },
+    { 75, -18, -89, -50, 50, 89, 18, -75 },
+    { 64, -64, -64, 64, 64, -64, -64, 64 },
+    { 50, -89, 18, 75, -75, -18, 89, -50 },
+    { 36, -83, 83, -36, -36, 83, -83, 36 },
+    { 18, -50, 75, -89, 89, -75, 50, -18 }
+};
+
+static const int16_t g_t16[16][16] =
+{ /* 16-point integer transform matrix, one output row per entry; partial_butterfly16() reads at most the first eight values of each row */
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 90, 87, 80, 70, 57, 43, 25,  9, -9, -25, -43, -57, -70, -80, -87, -90 },
+    { 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 },
+    { 87, 57,  9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87 },
+    { 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 },
+    { 80,  9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80 },
+    { 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 },
+    { 70, -43, -87,  9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70 },
+    { 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 },
+    { 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87,  9, -90, 25, 80, -57 },
+    { 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 },
+    { 43, -90, 57, 25, -87, 70,  9, -80, 80, -9, -70, 87, -25, -57, 90, -43 },
+    { 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 },
+    { 25, -70, 90, -80, 43,  9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25 },
+    { 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 },
+    {  9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9 }
+};
+
+static const int16_t g_t32[32][32] =
+{
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13,  4, -4, -13, 
-22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 },
+    { 90, 87, 80, 70, 57, 43, 25,  9, -9, -25, -43, -57, -70, -80, -87, -90, 
-90, -87, -80, -70, -57, -43, -25, -9,  9, 25, 43, 57, 70, 80, 87, 90 },
+    { 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, 
-13, 13, 38, 61, 78, 88, 90, 85, 73, 54, 31,  4, -22, -46, -67, -82, -90 },
+    { 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89, 
89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 },
+    { 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22, 
-22, -61, -85, -90, -73, -38,  4, 46, 78, 90, 82, 54, 13, -31, -67, -88 },
+    { 87, 57,  9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87, 
-87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43,  9, 57, 87 },
+    { 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, 
31, 78, 90, 61,  4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85 },
+    { 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, 
83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 },
+    { 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67,  4, 73, 88, 38, 
-38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82 },
+    { 80,  9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80, 
-80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70,  9, 80 },
+    { 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, 
46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82,  4, -78 },
+    { 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75, 
75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 },
+    { 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, 
-54, -85,  4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73 },
+    { 70, -43, -87,  9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70, 
-70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90,  9, -87, -43, 70 },
+    { 67, -54, -78, 38, 85, -22, -90,  4, 90, 13, -88, -31, 82, 46, -73, -61, 
61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67 },
+    { 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 
64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 },
+    { 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, 
-67, -54, 78, 38, -85, -22, 90,  4, -90, 13, 88, -31, -82, 46, 73, -61 },
+    { 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87,  9, -90, 25, 80, -57, 
-57, 80, 25, -90,  9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57 },
+    { 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, 
73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88,  4, 85, -54 },
+    { 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50, 
50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 },
+    { 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82,  4, 78, 
-78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46 },
+    { 43, -90, 57, 25, -87, 70,  9, -80, 80, -9, -70, 87, -25, -57, 90, -43, 
-43, 90, -57, -25, 87, -70, -9, 80, -80,  9, 70, -87, 25, 57, -90, 43 },
+    { 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82, 
82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67,  4, -73, 88, -38 },
+    { 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, 
36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 },
+    { 31, -78, 90, -61,  4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85, 
-85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31 },
+    { 25, -70, 90, -80, 43,  9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25, 
-25, 70, -90, 80, -43, -9, 57, -87, 87, -57,  9, 43, -80, 90, -70, 25 },
+    { 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88, 
88, -67, 31, 13, -54, 82, -90, 78, -46,  4, 38, -73, 90, -85, 61, -22 },
+    { 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18, 
18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 },
+    { 13, -38, 61, -78, 88, -90, 85, -73, 54, -31,  4, 22, -46, 67, -82, 90, 
-90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13 },
+    {  9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9, 
-9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25,  9 },
+    {  4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, 
90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 }
+};
+
+/**
+ * One pass of the 4-point forward transform.
+ *
+ * Consumes `line` input rows of 4 samples each. Output k of input row j is
+ * rounded, shifted right by `shift` and stored at dst[k * line + j], so the
+ * result is written transposed.
+ *
+ * @param src   line * 4 input samples
+ * @param dst   4 * line output values
+ * @param shift right shift applied after adding the rounding offset
+ * @param line  number of input rows
+ */
+static void partial_butterfly4(const int16_t *src, int16_t *dst, int shift, int line)
+{
+    const int round = 1 << (shift - 1);
+
+    for (int row = 0; row < line; row++, src += 4, dst++) {
+        /* sums and differences of the mirrored sample pairs */
+        const int even0 = src[0] + src[3];
+        const int odd0  = src[0] - src[3];
+        const int even1 = src[1] + src[2];
+        const int odd1  = src[1] - src[2];
+
+        /* even outputs come from the sums, odd outputs from the differences */
+        dst[0 * line] = (int16_t)((g_t4[0][0] * even0 + g_t4[0][1] * even1 + round) >> shift);
+        dst[2 * line] = (int16_t)((g_t4[2][0] * even0 + g_t4[2][1] * even1 + round) >> shift);
+        dst[1 * line] = (int16_t)((g_t4[1][0] * odd0 + g_t4[1][1] * odd1 + round) >> shift);
+        dst[3 * line] = (int16_t)((g_t4[3][0] * odd0 + g_t4[3][1] * odd1 + round) >> shift);
+    }
+}
+
+/**
+ * One pass of the 8-point forward transform.
+ *
+ * Consumes `line` input rows of 8 samples each. Output k of input row j is
+ * rounded, shifted right by `shift` and stored at dst[k * line + j], so the
+ * result is written transposed.
+ *
+ * @param src   line * 8 input samples
+ * @param dst   8 * line output values
+ * @param shift right shift applied after adding the rounding offset
+ * @param line  number of input rows
+ */
+static void partial_butterfly8(const int16_t *src, int16_t *dst, int shift, int line)
+{
+    const int round = 1 << (shift - 1);
+
+    for (int row = 0; row < line; row++, src += 8, dst++) {
+        int even[4], odd[4];
+        int even_sum[2], even_diff[2];
+
+        /* first stage: mirrored sample pairs */
+        for (int i = 0; i < 4; i++) {
+            even[i] = src[i] + src[7 - i];
+            odd[i]  = src[i] - src[7 - i];
+        }
+
+        /* second stage on the even half */
+        even_sum[0]  = even[0] + even[3];
+        even_diff[0] = even[0] - even[3];
+        even_sum[1]  = even[1] + even[2];
+        even_diff[1] = even[1] - even[2];
+
+        /* outputs 0 and 4 */
+        for (int k = 0; k < 8; k += 4)
+            dst[k * line] = (int16_t)((g_t8[k][0] * even_sum[0] + g_t8[k][1] * even_sum[1] + round) >> shift);
+
+        /* outputs 2 and 6 */
+        for (int k = 2; k < 8; k += 4)
+            dst[k * line] = (int16_t)((g_t8[k][0] * even_diff[0] + g_t8[k][1] * even_diff[1] + round) >> shift);
+
+        /* odd outputs 1, 3, 5, 7 */
+        for (int k = 1; k < 8; k += 2)
+            dst[k * line] = (int16_t)((g_t8[k][0] * odd[0] + g_t8[k][1] * odd[1] +
+                                       g_t8[k][2] * odd[2] + g_t8[k][3] * odd[3] + round) >> shift);
+    }
+}
+
+static void partial_butterfly16(const int16_t* src, int16_t* dst, int shift, 
int line)
+{   /* One pass of the 16-point forward transform: each of the `line` input
+     * rows (16 samples) yields 16 outputs, rounded and shifted right by `shift`.
+     * Output k of input row j is stored at dst[k * line + j], i.e. transposed. */
+    int j, k;
+    int E[8], O[8];
+    int EE[4], EO[4];
+    int EEE[2], EEO[2];
+    int add = 1 << (shift - 1); /* rounding offset added before the >> shift */
+
+    for (j = 0; j < line; j++)
+    {
+        /* E and O: sums and differences of mirrored sample pairs */
+        for (k = 0; k < 8; k++)
+        {
+            E[k] = src[k] + src[15 - k];
+            O[k] = src[k] - src[15 - k];
+        }
+
+        /* EE and EO: the same split applied to E */
+        for (k = 0; k < 4; k++)
+        {
+            EE[k] = E[k] + E[7 - k];
+            EO[k] = E[k] - E[7 - k];
+        }
+
+        /* EEE and EEO: the same split applied to EE */
+        EEE[0] = EE[0] + EE[3];
+        EEO[0] = EE[0] - EE[3];
+        EEE[1] = EE[1] + EE[2];
+        EEO[1] = EE[1] - EE[2];
+
+        dst[0] = (int16_t)((g_t16[0][0] * EEE[0] + g_t16[0][1] * EEE[1] + add) 
>> shift);
+        dst[8 * line] = (int16_t)((g_t16[8][0] * EEE[0] + g_t16[8][1] * EEE[1] 
+ add) >> shift);
+        dst[4 * line] = (int16_t)((g_t16[4][0] * EEO[0] + g_t16[4][1] * EEO[1] 
+ add) >> shift);
+        dst[12 * line] = (int16_t)((g_t16[12][0] * EEO[0] + g_t16[12][1] * 
EEO[1] + add) >> shift);
+
+        for (k = 2; k < 16; k += 4) /* outputs 2, 6, 10, 14 from the EO terms */
+        {
+            dst[k * line] = (int16_t)((g_t16[k][0] * EO[0] + g_t16[k][1] * 
EO[1] + g_t16[k][2] * EO[2] +
+                                       g_t16[k][3] * EO[3] + add) >> shift);
+        }
+
+        for (k = 1; k < 16; k += 2) /* odd outputs from the O terms */
+        {
+            dst[k * line] =  (int16_t)((g_t16[k][0] * O[0] + g_t16[k][1] * 
O[1] + g_t16[k][2] * O[2] + g_t16[k][3] * O[3] +
+                                        g_t16[k][4] * O[4] + g_t16[k][5] * 
O[5] + g_t16[k][6] * O[6] + g_t16[k][7] * O[7] +
+                                        add) >> shift);
+        }
+
+        src += 16; /* next input row */
+        dst++;     /* next output column */
+    }
+}
+
+static void partial_butterfly32(const int16_t* src, int16_t* dst, int shift, 
int line)
+{   /* One pass of the 32-point forward transform: each of the `line` input
+     * rows (32 samples) yields 32 outputs, rounded and shifted right by `shift`.
+     * Output k of input row j is stored at dst[k * line + j], i.e. transposed. */
+    int j, k;
+    int E[16], O[16];
+    int EE[8], EO[8];
+    int EEE[4], EEO[4];
+    int EEEE[2], EEEO[2];
+    int add = 1 << (shift - 1); /* rounding offset added before the >> shift */
+
+    for (j = 0; j < line; j++)
+    {
+        /* E and O: sums and differences of mirrored sample pairs */
+        for (k = 0; k < 16; k++)
+        {
+            E[k] = src[k] + src[31 - k];
+            O[k] = src[k] - src[31 - k];
+        }
+
+        /* EE and EO: the same split applied to E */
+        for (k = 0; k < 8; k++)
+        {
+            EE[k] = E[k] + E[15 - k];
+            EO[k] = E[k] - E[15 - k];
+        }
+
+        /* EEE and EEO: the same split applied to EE */
+        for (k = 0; k < 4; k++)
+        {
+            EEE[k] = EE[k] + EE[7 - k];
+            EEO[k] = EE[k] - EE[7 - k];
+        }
+
+        /* EEEE and EEEO: the same split applied to EEE */
+        EEEE[0] = EEE[0] + EEE[3];
+        EEEO[0] = EEE[0] - EEE[3];
+        EEEE[1] = EEE[1] + EEE[2];
+        EEEO[1] = EEE[1] - EEE[2];
+
+        dst[0] = (int16_t)((g_t32[0][0] * EEEE[0] + g_t32[0][1] * EEEE[1] + 
add) >> shift);
+        dst[16 * line] = (int16_t)((g_t32[16][0] * EEEE[0] + g_t32[16][1] * 
EEEE[1] + add) >> shift);
+        dst[8 * line] = (int16_t)((g_t32[8][0] * EEEO[0] + g_t32[8][1] * 
EEEO[1] + add) >> shift);
+        dst[24 * line] = (int16_t)((g_t32[24][0] * EEEO[0] + g_t32[24][1] * 
EEEO[1] + add) >> shift);
+        for (k = 4; k < 32; k += 8) /* outputs 4, 12, 20, 28 from the EEO terms */
+        {
+            dst[k * line] = (int16_t)((g_t32[k][0] * EEO[0] + g_t32[k][1] * 
EEO[1] + g_t32[k][2] * EEO[2] +
+                                       g_t32[k][3] * EEO[3] + add) >> shift);
+        }
+
+        for (k = 2; k < 32; k += 4) /* outputs 2, 6, ..., 30 from the EO terms */
+        {
+            dst[k * line] = (int16_t)((g_t32[k][0] * EO[0] + g_t32[k][1] * 
EO[1] + g_t32[k][2] * EO[2] +
+                                       g_t32[k][3] * EO[3] + g_t32[k][4] * 
EO[4] + g_t32[k][5] * EO[5] +
+                                       g_t32[k][6] * EO[6] + g_t32[k][7] * 
EO[7] + add) >> shift);
+        }
+
+        for (k = 1; k < 32; k += 2) /* odd outputs from the O terms */
+        {
+            dst[k * line] = (int16_t)((g_t32[k][0] * O[0] + g_t32[k][1] * O[1] 
+ g_t32[k][2] * O[2] + g_t32[k][3] * O[3] +
+                                       g_t32[k][4] * O[4] + g_t32[k][5] * O[5] 
+ g_t32[k][6] * O[6] + g_t32[k][7] * O[7] +
+                                       g_t32[k][8] * O[8] + g_t32[k][9] * O[9] 
+ g_t32[k][10] * O[10] + g_t32[k][11] *
+                                       O[11] + g_t32[k][12] * O[12] + 
g_t32[k][13] * O[13] + g_t32[k][14] * O[14] +
+                                       g_t32[k][15] * O[15] + add) >> shift);
+        }
+
+        src += 32; /* next input row */
+        dst++;     /* next output column */
+    }
+}
+
+/**
+ * 4x4 forward transform built from two passes of partial_butterfly4().
+ *
+ * Each pass writes its output transposed, so the second pass operates along
+ * the other dimension of the block.
+ *
+ * @param block     16 input samples
+ * @param dst       16 output coefficients
+ * @param bit_depth sample bit depth; only affects the first-pass shift
+ */
+void ff_vca_dct4_c(const int16_t *block, int16_t *dst, int bit_depth)
+{
+    DECLARE_ALIGNED_32(int16_t, coef[4 * 4]);
+    const int first_shift  = 1 + bit_depth - 8;
+    const int second_shift = 8;
+
+    partial_butterfly4(block, coef, first_shift, 4);
+    partial_butterfly4(coef, dst, second_shift, 4);
+}
+
+/**
+ * 8x8 forward transform built from two passes of partial_butterfly8().
+ *
+ * Each pass writes its output transposed, so the second pass operates along
+ * the other dimension of the block.
+ *
+ * @param block     64 input samples
+ * @param dst       64 output coefficients
+ * @param bit_depth sample bit depth; only affects the first-pass shift
+ */
+void ff_vca_dct8_c(const int16_t *block, int16_t *dst, int bit_depth)
+{
+    DECLARE_ALIGNED_32(int16_t, coef[8 * 8]);
+    const int first_shift  = 2 + bit_depth - 8;
+    const int second_shift = 9;
+
+    partial_butterfly8(block, coef, first_shift, 8);
+    partial_butterfly8(coef, dst, second_shift, 8);
+}
+
+/**
+ * 16x16 forward transform built from two passes of partial_butterfly16().
+ *
+ * Each pass writes its output transposed, so the second pass operates along
+ * the other dimension of the block.
+ *
+ * @param block     256 input samples
+ * @param dst       256 output coefficients
+ * @param bit_depth sample bit depth; only affects the first-pass shift
+ */
+void ff_vca_dct16_c(const int16_t *block, int16_t *dst, int bit_depth)
+{
+    DECLARE_ALIGNED_32(int16_t, coef[16 * 16]);
+    const int first_shift  = 3 + bit_depth - 8;
+    const int second_shift = 10;
+
+    partial_butterfly16(block, coef, first_shift, 16);
+    partial_butterfly16(coef, dst, second_shift, 16);
+}
+
+/**
+ * 32x32 forward transform built from two passes of partial_butterfly32().
+ *
+ * Each pass writes its output transposed, so the second pass operates along
+ * the other dimension of the block.
+ *
+ * @param block     1024 input samples
+ * @param dst       1024 output coefficients
+ * @param bit_depth sample bit depth; only affects the first-pass shift
+ */
+void ff_vca_dct32_c(const int16_t *block, int16_t *dst, int bit_depth)
+{
+    DECLARE_ALIGNED_32(int16_t, coef[32 * 32]);
+    const int first_shift  = 4 + bit_depth - 8;
+    const int second_shift = 11;
+
+    partial_butterfly32(block, coef, first_shift, 32);
+    partial_butterfly32(coef, dst, second_shift, 32);
+}
+
+/**
+ * Low-pass approximation of the 8x8 forward transform.
+ *
+ * The 8x8 input is reduced to 4x4 by averaging every 2x2 cell, transformed
+ * with ff_vca_dct4_c(), and the 4x4 result is copied into the top-left corner
+ * of the otherwise zeroed 8x8 output. The first coefficient is then replaced
+ * by twice the sum of all 64 input samples.
+ *
+ * @param src       64 input samples, 8 per row
+ * @param dst       64 output coefficients, 8 per row
+ * @param bit_depth sample bit depth, forwarded to ff_vca_dct4_c()
+ */
+void ff_vca_lowpass_dct8_c(const int16_t *src, int16_t *dst, int bit_depth)
+{
+    DECLARE_ALIGNED_32(int16_t, coef[4 * 4]);
+    DECLARE_ALIGNED_32(int16_t, avg_block[4 * 4]);
+
+    /* 32-bit accumulator, like the 16x16 and 32x32 variants: the sum of 64
+     * samples deeper than 8 bits does not fit into int16_t. */
+    int32_t total_sum = 0;
+    int16_t sum = 0;
+
+    for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 4; j++) {
+            /* sum of one 2x2 cell; a quarter of it is the cell average */
+            sum = src[2*i*8 + 2*j] + src[2*i*8 + 2*j + 1]
+                    + src[(2*i+1)*8 + 2*j] + src[(2*i+1)*8 + 2*j + 1];
+            avg_block[i*4 + j] = sum >> 2;
+
+            total_sum += sum;
+        }
+    }
+
+    ff_vca_dct4_c(avg_block, coef, bit_depth);
+
+    memset(dst, 0, 64 * sizeof(int16_t));
+    for (int i = 0; i < 4; i++)
+        memcpy(&dst[i * 8], &coef[i * 4], 4 * sizeof(int16_t));
+
+    /* Replace the first coefficient with the scaled block sum. Multiplying
+     * instead of shifting avoids undefined behaviour for a negative sum.
+     * NOTE(review): the factor 2 matches what ff_vca_dct8_c() yields for
+     * bit_depth 8 (64 * 64 >> (2 + 9)); confirm whether it should follow
+     * bit_depth for deeper input. */
+    dst[0] = (int16_t)(total_sum * 2);
+}
+
+/**
+ * Low-pass approximation of the 16x16 forward transform.
+ *
+ * The 16x16 input is reduced to 8x8 by averaging every 2x2 cell, transformed
+ * with ff_vca_dct8_c(), and the 8x8 result is copied into the top-left corner
+ * of the otherwise zeroed 16x16 output. The first coefficient is then replaced
+ * by half the sum of all input samples.
+ *
+ * @param src       256 input samples, 16 per row
+ * @param dst       256 output coefficients, 16 per row
+ * @param bit_depth sample bit depth, forwarded to ff_vca_dct8_c()
+ */
+void ff_vca_lowpass_dct16_c(const int16_t *src, int16_t *dst, int bit_depth)
+{
+    DECLARE_ALIGNED_32(int16_t, coef[8 * 8]);
+    DECLARE_ALIGNED_32(int16_t, avg_block[8 * 8]);
+    int32_t total = 0;
+
+    for (int row = 0; row < 8; row++) {
+        const int16_t *upper = src + 2 * row * 16;
+        const int16_t *lower = upper + 16;
+
+        for (int col = 0; col < 8; col++) {
+            /* sum of one 2x2 cell; a quarter of it is the cell average */
+            const int16_t cell = upper[2 * col] + upper[2 * col + 1]
+                               + lower[2 * col] + lower[2 * col + 1];
+
+            avg_block[row * 8 + col] = cell >> 2;
+            total += cell;
+        }
+    }
+
+    ff_vca_dct8_c(avg_block, coef, bit_depth);
+
+    memset(dst, 0, 256 * sizeof(int16_t));
+    for (int row = 0; row < 8; row++)
+        memcpy(&dst[row * 16], &coef[row * 8], 8 * sizeof(int16_t));
+
+    dst[0] = (int16_t)(total >> 1);
+}
+
+/**
+ * Low-pass approximation of the 32x32 forward transform.
+ *
+ * The 32x32 input is reduced to 16x16 by averaging every 2x2 cell, transformed
+ * with ff_vca_dct16_c(), and the 16x16 result is copied into the top-left
+ * corner of the otherwise zeroed 32x32 output. The first coefficient is then
+ * replaced by one eighth of the sum of all input samples.
+ *
+ * @param src       1024 input samples, 32 per row
+ * @param dst       1024 output coefficients, 32 per row
+ * @param bit_depth sample bit depth, forwarded to ff_vca_dct16_c()
+ */
+void ff_vca_lowpass_dct32_c(const int16_t *src, int16_t *dst, int bit_depth)
+{
+    DECLARE_ALIGNED_32(int16_t, coef[16 * 16]);
+    DECLARE_ALIGNED_32(int16_t, avg_block[16 * 16]);
+    int32_t total = 0;
+
+    for (int row = 0; row < 16; row++) {
+        const int16_t *upper = src + 2 * row * 32;
+        const int16_t *lower = upper + 32;
+
+        for (int col = 0; col < 16; col++) {
+            /* sum of one 2x2 cell; a quarter of it is the cell average */
+            const int16_t cell = upper[2 * col] + upper[2 * col + 1]
+                               + lower[2 * col] + lower[2 * col + 1];
+
+            avg_block[row * 16 + col] = cell >> 2;
+            total += cell;
+        }
+    }
+
+    ff_vca_dct16_c(avg_block, coef, bit_depth);
+
+    memset(dst, 0, 1024 * sizeof(int16_t));
+    for (int row = 0; row < 16; row++)
+        memcpy(&dst[row * 32], &coef[row * 16], 16 * sizeof(int16_t));
+
+    dst[0] = (int16_t)(total >> 3);
+}
+
+
+/**
+ * Sum the weighted magnitudes of the transform coefficients of one block.
+ *
+ * Every coefficient contributes (weight * |coeff|) >> 8, with the weight taken
+ * from the table that matches the block size.
+ *
+ * @param blocksize      block edge length; tables exist for 8, 16 and 32, any
+ *                       other value falls back to the 32x32 table
+ * @param coeff_buffer   blocksize * blocksize coefficients
+ * @param enable_lowpass when non-zero, the sum is doubled for block sizes of
+ *                       16 and above (presumably to compensate for the
+ *                       low-pass transform variants - confirm)
+ * @return the weighted coefficient sum
+ */
+uint32_t ff_calc_weighted_coeff(unsigned blocksize, int16_t *coeff_buffer, int enable_lowpass)
+{
+    /* The tables are const int16_t; the pointer uses the same type so that no
+     * qualifier is dropped and no incompatible pointer conversion is needed. */
+    const int16_t *weights_matrix;
+    uint32_t weighted_sum = 0;
+
+    switch (blocksize) {
+    case 8:
+        weights_matrix = weights_dct8;
+        break;
+    case 16:
+        weights_matrix = weights_dct16;
+        break;
+    case 32:
+    default:
+        /* unsupported sizes keep using the 32x32 table, as before */
+        weights_matrix = weights_dct32;
+        break;
+    }
+
+    for (unsigned i = 0; i < blocksize * blocksize; i++)
+        weighted_sum += (uint32_t)((weights_matrix[i] * safe_abs(coeff_buffer[i])) >> 8);
+
+    if (blocksize >= 16 && enable_lowpass)
+        weighted_sum *= 2;
+
+    return weighted_sum;
+}
+
+
+/**
+ * Copy a full blocksize x blocksize block of samples into a contiguous
+ * int16_t buffer.
+ *
+ * @param pxl_depth 1 for one byte per sample; any other value is treated as
+ *                  two bytes per sample
+ * @param blocksize block edge length in samples
+ * @param src       first sample of the block
+ * @param stride    distance between source rows in bytes
+ * @param buffer    receives blocksize * blocksize values, row after row
+ */
+static void copy_vals_wo_padding(unsigned pxl_depth, unsigned blocksize, const uint8_t *src, unsigned stride, int16_t *buffer)
+{
+    if (pxl_depth == 1) {
+        for (unsigned y = 0; y < blocksize; y++)
+            for (unsigned x = 0; x < blocksize; x++)
+                *(buffer++) = (int16_t)src[x + stride * y];
+    } else {
+        const uint16_t *srcptr = (const uint16_t *)src;
+
+        for (unsigned y = 0; y < blocksize; y++) {
+            memcpy(buffer, srcptr, blocksize * sizeof(uint16_t));
+            srcptr += stride / 2; /* stride is in bytes, srcptr counts 16-bit samples */
+            buffer += blocksize;
+        }
+    }
+}
+
+/**
+ * Copy a block that extends past the right and/or bottom edge of the source
+ * into a contiguous int16_t buffer, filling the missing samples by edge
+ * replication.
+ *
+ * The last valid sample of each row is repeated into the padding_r missing
+ * columns, and the last filled row is repeated into the padding_b missing
+ * rows. Both paddings must be smaller than blocksize.
+ *
+ * @param pxl_depth 1 for one byte per sample; any other value is treated as
+ *                  two bytes per sample
+ * @param blocksize block edge length in samples
+ * @param src       first sample of the block
+ * @param stride    distance between source rows in bytes
+ * @param buffer    receives blocksize * blocksize values, row after row
+ * @param padding_r number of missing columns on the right
+ * @param padding_b number of missing rows at the bottom
+ */
+static void copy_vals_w_padding(unsigned pxl_depth, unsigned blocksize, const uint8_t *src, unsigned stride, int16_t *buffer, unsigned padding_r, unsigned padding_b)
+{
+    const unsigned valid_w = blocksize - padding_r;
+    const unsigned valid_h = blocksize - padding_b;
+    const int16_t *last_row = buffer;
+
+    for (unsigned y = 0; y < valid_h; y++) {
+        if (pxl_depth == 1) {
+            const uint8_t *row = src + (size_t)y * stride;
+
+            for (unsigned x = 0; x < valid_w; x++)
+                buffer[x] = (int16_t)row[x];
+        } else {
+            /* stride is in bytes, the row pointer counts 16-bit samples */
+            const uint16_t *row = (const uint16_t *)src + (size_t)y * (stride / 2);
+
+            memcpy(buffer, row, valid_w * sizeof(uint16_t));
+        }
+
+        /* Repeat the last VALID sample. The 8-bit path previously read
+         * src[valid_w], one sample beyond the valid area, unlike the
+         * 16-bit path. */
+        for (unsigned x = valid_w; x < blocksize; x++)
+            buffer[x] = buffer[valid_w - 1];
+
+        last_row = buffer;
+        buffer  += blocksize;
+    }
+
+    /* repeat the last filled row into the missing bottom rows */
+    for (unsigned y = valid_h; y < blocksize; y++) {
+        memcpy(buffer, last_row, blocksize * sizeof(*buffer));
+        buffer += blocksize;
+    }
+}
+
+/**
+ * Copy one block of samples into a contiguous int16_t buffer.
+ *
+ * The block starts `offset` bytes into `src`. Blocks without padding are
+ * copied by copy_vals_wo_padding(); blocks with right or bottom padding are
+ * handled by copy_vals_w_padding().
+ *
+ * @param pxl_depth forwarded to the copy helper
+ * @param offset    byte offset of the block inside src
+ * @param blocksize block edge length in samples
+ * @param src       source data
+ * @param stride    distance between source rows in bytes
+ * @param buffer    receives blocksize * blocksize values
+ * @param padding_r number of missing columns on the right
+ * @param padding_b number of missing rows at the bottom
+ */
+void ff_copy_vals_buffer(unsigned pxl_depth, unsigned offset, unsigned blocksize, uint8_t *src, unsigned stride, int16_t *buffer, unsigned padding_r, unsigned padding_b)
+{
+    uint8_t *block_start = src + offset;
+    const int needs_padding = padding_r != 0 || padding_b != 0;
+
+    if (needs_padding) {
+        copy_vals_w_padding(pxl_depth, blocksize, block_start, stride, buffer, padding_r, padding_b);
+        return;
+    }
+
+    copy_vals_wo_padding(pxl_depth, blocksize, block_start, stride, buffer);
+}
\ No newline at end of file
diff --git a/libavfilter/vca_dct.h b/libavfilter/vca_dct.h
new file mode 100644
index 0000000000..faa106c89a
--- /dev/null
+++ b/libavfilter/vca_dct.h
@@ -0,0 +1,131 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * functions and constants for the discrete cosine transform used by VCA
+ */
+
+#include "avfilter.h"
+#include "libavutil/eval.h"
+#include "libavutil/mem_internal.h"
+#include "libavformat/avio.h"
+
+#ifndef AVFILTER_VCADCT_H
+#define AVFILTER_VCADCT_H
+
+/*
+ * Declare variable `var` of type `T` with 32-byte alignment.
+ * Only defined for GCC-compatible compilers and MSVC; with any other
+ * compiler the macro stays undefined.
+ */
+#if defined(__GNUC__)
+#define ALIGN_VAR_32(T, var) T var __attribute__((aligned(32)))
+#elif defined(_MSC_VER)
+#define ALIGN_VAR_32(T, var) __declspec(align(32)) T var
+#endif
+
+/**
+ * Geometry of one analyzed plane, filled in by config_input().
+ */
+typedef struct VCAPlaneInfo {
+    int pxl_depth;  ///< byte step of one sample, from av_image_fill_max_pixsteps()
+    int bit_depth;  ///< bits per sample, from the pixel format descriptor
+
+    int w_pxls_src; ///< width of the plane in samples as delivered by the input
+    int h_pxls_src; ///< height of the plane in samples as delivered by the input
+
+    int n_blocks;   ///< w_blocks * h_blocks
+
+    int w_blocks;   ///< number of block columns, w_pxls_src rounded up to whole blocks
+    int h_blocks;   ///< number of block rows, h_pxls_src rounded up to whole blocks
+
+    int w_pxls;     ///< w_blocks * blocksize, i.e. the padded width
+    int h_pxls;     ///< h_blocks * blocksize, i.e. the padded height
+} VCAPlaneInfo;
+
+/**
+ * Per-block result buffers of one plane; every array has n_blocks entries
+ * and is allocated in config_input().
+ */
+typedef struct VCAResults {
+    // globals
+    uint32_t *energy;      ///< weighted DCT energy of each block of the current frame
+    uint32_t *energy_prev; ///< copy of energy taken after the previous frame was analyzed
+    uint32_t *brightness;  ///< per-block brightness, only allocated when yuview is set
+    double *energy_dif;    ///< per-block |energy - energy_prev|, only allocated when yuview is set
+} VCAResults;
+
+/**
+ * Running totals accumulated by one slice job over the blocks it processed.
+ */
+typedef struct ResultSums{
+    uint32_t E; ///< sum of block energies
+    uint32_t L; ///< sum of block brightness values (only updated by the brightness variants)
+    double h;   ///< sum of absolute energy differences to the previous frame
+} ResultSums;
+
+/**
+ * Private context of the vca filter.
+ */
+typedef struct VCAContext {
+    const AVClass *class;
+    /* Output file; opened in init() only when the "file" option is set. */
+    AVIOContext *avio_context;
+    /* Output sink chosen in init(): print_file when "file" is set,
+     * print_log otherwise. */
+    void (*print)(AVFilterContext *ctx, int lvl, const char *msg, ...); // 
av_printf_format(2, 3);
+    /* Block transform chosen in init() from blocksize and the lowpass option. */
+    void (*perform_dct)(const int16_t* block, int16_t* dst, int bit_depth);
+
+    /* Slice worker used while n_frames_processed == 0 (no previous frame). */
+    void (*calc_vca_slice_isnf0)(int stride, uint8_t *src, VCAPlaneInfo 
*plane, VCAResults *result,
+                        int enable_lowpass, int slice_start, int slice_end, 
ResultSums *partial_sum,
+                        void (*perform_dct)(const int16_t* block, int16_t* 
dst, int bit_depth));
+
+    /* Slice worker used for every later frame; also accumulates the energy
+     * difference to the previous frame. */
+    void (*calc_vca_slice_isnf1)(int stride, uint8_t *src, VCAPlaneInfo 
*plane, VCAResults *result,
+                        int enable_lowpass, int slice_start, int slice_end, 
ResultSums *partial_sum,
+                        void (*perform_dct)(const int16_t* block, int16_t* 
dst, int bit_depth));
+
+
+    // options
+    unsigned blocksize;     // "blocksize": block edge length; init() accepts 8, 16 or 32
+    int enable_lowpass;     // "lowpass": use the low-pass DCT variants
+    int enable_chroma;      // "chroma": analyze three planes instead of one
+    int enable_brightness;  // "brightness": also report the L feature
+    int enable_simd;        // "simd": call ff_vca_dct_init_x86() on x86 builds
+    int yuview;             // "yuview": emit per-block output instead of CSV rows
+    int n_frames;           // "n": number of frames to analyze, -1 for all
+    char *file_str;         // "file": output path, NULL to print through av_log
+
+    // video frame properties
+    VCAPlaneInfo **plane;   // one entry per analyzed plane
+    int n_frames_processed; // frames analyzed so far
+
+    // results
+    VCAResults **result;    // one entry per analyzed plane
+} VCAContext;
+
+
+/*
+ * Forward declarations of the weighting and transform tables.
+ * NOTE(review): being `static` in a header, these are tentative definitions
+ * with internal linkage, so every file including this header gets its own
+ * copy; presumably only vca_dct.c supplies the initialized definitions -
+ * confirm, and consider moving the declarations into that file.
+ */
+static const int16_t weights_dct8[64];
+static const int16_t weights_dct16[256];
+static const int16_t weights_dct32[1024];
+
+static const int16_t g_t4[4][4];
+static const int16_t g_t8[8][8];
+static const int16_t g_t16[16][16];
+static const int16_t g_t32[32][32];
+
+/* Reduce the transform output of one block to a single value; the slice
+ * workers store the result as the energy of that block. */
+uint32_t ff_calc_weighted_coeff(unsigned blocksize, int16_t *coeff_buffer, int 
enable_lowpass);
+
+/* Copy the block starting offset bytes into src to buffer; padding_r columns
+ * and padding_b rows that fall outside the picture are filled by edge
+ * replication. */
+void ff_copy_vals_buffer(unsigned pxl_depth, unsigned offset, unsigned 
blocksize, uint8_t *src, unsigned stride, int16_t *buffer, unsigned padding_r, 
unsigned padding_b);
+
+/* C transforms; init() installs the 8/16/32 variant matching blocksize as
+ * perform_dct when the lowpass option is disabled. */
+void ff_vca_dct4_c(const int16_t* src, int16_t* dst, int bit_depth);
+
+void ff_vca_dct8_c(const int16_t* src, int16_t* dst, int bit_depth);
+
+void ff_vca_dct16_c(const int16_t* src, int16_t* dst, int bit_depth);
+
+void ff_vca_dct32_c(const int16_t* src, int16_t* dst, int bit_depth);
+
+/* C low-pass transforms; installed by init() when the lowpass option is set. */
+void ff_vca_lowpass_dct8_c(const int16_t* src, int16_t* dst, int bit_depth);
+
+void ff_vca_lowpass_dct16_c(const int16_t* src, int16_t* dst, int bit_depth);
+
+void ff_vca_lowpass_dct32_c(const int16_t* src, int16_t* dst, int bit_depth);
+
+/* Called by init() on x86 builds when the simd option is set; a non-zero
+ * return value makes init() fail with that value. */
+int ff_vca_dct_init_x86(VCAContext *v);
+
+#endif
\ No newline at end of file
diff --git a/libavfilter/vf_vca.c b/libavfilter/vf_vca.c
new file mode 100644
index 0000000000..8c345aafa5
--- /dev/null
+++ b/libavfilter/vf_vca.c
@@ -0,0 +1,546 @@
+/*
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Calculate frame scores using Video Complexity Analyzer (VCA)
+ */
+
+
+#include "libavutil/timestamp.h"
+#include "libavutil/mathematics.h"
+
+#include "libavutil/avassert.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/mem.h"
+#include "libavutil/opt.h"
+
+
+#include "avfilter.h"
+#include "filters.h"
+#include "formats.h"
+#include "video.h"
+
+#include "vca_dct.h"
+
+/**
+ * Arguments shared by all slice jobs of one ff_filter_execute() call.
+ */
+typedef struct ThreadData {
+    /* Block transform, passed through to calc_vca_slice. */
+    void (*perform_dct)(const int16_t* block, int16_t* dst, int bit_depth);
+    /* Worker that processes the pixel rows [slice_start, slice_end). */
+    void (*calc_vca_slice)(int stride, uint8_t *src, VCAPlaneInfo *plane, 
VCAResults *result,
+                        int enable_lowpass, int slice_start, int slice_end, 
ResultSums *partial_sum,
+                        void (*perform_dct)(const int16_t* block, int16_t* 
dst, int bit_depth));
+    int stride;     // row stride handed to the worker
+    int blocksize;  // block edge length, converts block rows to pixel rows
+
+    int enable_lowpass;
+
+    uint8_t *src;   // first byte of the plane being analyzed
+
+    VCAPlaneInfo *plane;
+    VCAResults *result;
+
+    ResultSums **partial_sums; // one accumulator per job, indexed by job number
+} ThreadData;
+
+#define OFFSET(x) offsetof(VCAContext, x)
+#define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
+
+/* User options of the vca filter; help texts are shown by "ffmpeg -h filter=vca". */
+static const AVOption vca_options[] = {
+    // Analysis config
+    { "blocksize", "Set size of block", OFFSET(blocksize), AV_OPT_TYPE_INT, {.i64=32}, 8, 32, FLAGS },
+    { "n", "Set the frames batch size, -1 to process all", OFFSET(n_frames), AV_OPT_TYPE_INT, {.i64=-1}, -1, INT_MAX, FLAGS },
+    // Performance
+    { "lowpass", "Enable low-pass DCT", OFFSET(enable_lowpass), AV_OPT_TYPE_BOOL, { .i64=1 }, 0, 1, FLAGS },
+    { "simd", "Enable hardware acceleration with SIMD", OFFSET(enable_simd), AV_OPT_TYPE_BOOL, { .i64=1 }, 0, 1, FLAGS },
+    { "brightness", "Enable brightness information", OFFSET(enable_brightness), AV_OPT_TYPE_BOOL, { .i64=0 }, 0, 1, FLAGS },
+    { "chroma", "Enable analysis of chroma channels", OFFSET(enable_chroma), AV_OPT_TYPE_BOOL, { .i64=0 }, 0, 1, FLAGS },
+    // Output
+    { "file", "Set file where to print analysis information", OFFSET(file_str), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, FLAGS },
+    { "yuview", "Produce a detailed blockwise output for YUView", OFFSET(yuview), AV_OPT_TYPE_BOOL, { .i64=0 }, 0, 1, FLAGS },
+    { NULL }
+};
+
+/* Divisors applied in perform_vca() to the per-block averages of the E and
+ * h sums before they are reported. */
+static const double E_norm_factor = 90;
+static const double h_norm_factor = 18;
+
+AVFILTER_DEFINE_CLASS(vca);
+
+/* Planar YUV formats accepted on the input: 8-bit (limited and full range)
+ * plus the 10- and 12-bit 4:2:0 / 4:2:2 / 4:4:4 layouts, each listed once. */
+static const enum AVPixelFormat pxl_fmts[] = {
+    AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV411P, AV_PIX_FMT_YUV410P,
+    AV_PIX_FMT_YUVJ411P, AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ422P,
+    AV_PIX_FMT_YUVJ440P, AV_PIX_FMT_YUVJ444P,
+    AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV444P10,
+    AV_PIX_FMT_YUV420P12, AV_PIX_FMT_YUV422P12, AV_PIX_FMT_YUV444P12,
+    AV_PIX_FMT_NONE
+};
+
+/*
+ * Optional statements pasted into the slice workers generated by
+ * DEFINE_CALC_ENERGY_SLICE. Each dispatcher macro concatenates its 0/1
+ * arguments into the name of a variant; only the variant with every flag
+ * set expands to code, all others expand to nothing.
+ * They rely on the names result, partial_sum, out_buffer and block_i being
+ * in scope at the point of expansion.
+ */
+
+/* Store the brightness of the current block (brightness and yuview set). */
+#define WRITE_YUVIEW_BRIGHTNESS_1_1 \
+        result->brightness[block_i] = (uint32_t)sqrt(out_buffer[0]);
+
+#define WRITE_YUVIEW_BRIGHTNESS_1_0
+#define WRITE_YUVIEW_BRIGHTNESS_0_1
+#define WRITE_YUVIEW_BRIGHTNESS_0_0
+
+#define WRITE_YUVIEW_BRIGHTNESS(IS_BRIGHTNESS, IS_YUVIEW) 
WRITE_YUVIEW_BRIGHTNESS_##IS_BRIGHTNESS##_##IS_YUVIEW
+
+/* Store the absolute energy change of the current block against the
+ * previous frame (not the first frame and yuview set). */
+#define WRITE_YUVIEW_ENERGY_DIF_1_1 \
+        result->energy_dif[block_i] = abs((int)result->energy[block_i] - 
(int)result->energy_prev[block_i]);
+
+#define WRITE_YUVIEW_ENERGY_DIF_1_0
+#define WRITE_YUVIEW_ENERGY_DIF_0_1
+#define WRITE_YUVIEW_ENERGY_DIF_0_0
+
+#define WRITE_YUVIEW_ENERGY_DIF(IS_NOT_FIRST, IS_YUVIEW) 
WRITE_YUVIEW_ENERGY_DIF_##IS_NOT_FIRST##_##IS_YUVIEW
+
+/* Add the absolute energy change of the current block to the h sum
+ * (not the first frame). */
+#define ENERGY_DIF_1 \
+        partial_sum->h += abs((int)result->energy[block_i] - 
(int)result->energy_prev[block_i]);
+
+#define ENERGY_DIF_0
+
+#define ENERGY_DIF(IS_NOT_FIRST) ENERGY_DIF_##IS_NOT_FIRST
+
+/* Add the square root of the first transform coefficient to the L sum
+ * (brightness set). */
+#define BRIGHTNESS_1 \
+        partial_sum->L += (uint32_t)sqrt(out_buffer[0]);
+
+#define BRIGHTNESS_0
+
+#define BRIGHTNESS(IS_BRIGHTNESS) BRIGHTNESS_##IS_BRIGHTNESS
+
+/*
+ * Generate one slice worker named
+ * calc_vca_<BLOCKSIZE>_isnf<IS_NOT_FIRST>_brig<IS_BRIGHNTESS>_yuview<IS_YUVIEW>_slice.
+ *
+ * The worker walks the block rows covering the pixel rows
+ * [slice_start, slice_end) and, for every block of each row:
+ *   - computes how many columns/rows of the block lie outside the source
+ *     plane (padding_r / padding_b),
+ *   - copies the block into an aligned buffer with ff_copy_vals_buffer(),
+ *   - runs perform_dct on it,
+ *   - stores ff_calc_weighted_coeff() of the output as the block energy and
+ *     adds it to partial_sum->E,
+ *   - expands the optional brightness / energy-difference statements
+ *     selected by the 0/1 macro arguments.
+ * block_i starts at the first block of the slice and advances by one per
+ * block, so the result arrays are filled in raster order.
+ */
+#define DEFINE_CALC_ENERGY_SLICE(BLOCKSIZE, IS_NOT_FIRST, IS_YUVIEW, 
IS_BRIGHNTESS)                 \
+static void 
calc_vca_##BLOCKSIZE##_isnf##IS_NOT_FIRST##_brig##IS_BRIGHNTESS##_yuview##IS_YUVIEW##_slice(\
+    int stride, uint8_t *src, VCAPlaneInfo *plane, VCAResults *result,         
                     \
+    int enable_lowpass, int slice_start, int slice_end, ResultSums 
*partial_sum,                      \
+    void (*perform_dct)(const int16_t* block, int16_t* dst, int bit_depth)) {  
                     \
+    int block_i = (slice_start / BLOCKSIZE) * plane->w_blocks;                 
                     \
+    DECLARE_ALIGNED_32(int16_t, block_buffer[BLOCKSIZE * BLOCKSIZE]);          
                           \
+    DECLARE_ALIGNED_32(int16_t, out_buffer[BLOCKSIZE * BLOCKSIZE]);            
                           \
+    const unsigned bit_depth = plane->bit_depth;                               
                     \
+    for (unsigned blockY = slice_start; blockY < slice_end; blockY += 
BLOCKSIZE) {                  \
+        int padding_b = FFMAX(((int)(blockY + BLOCKSIZE) - 
(int)(plane->h_pxls_src)), 0);           \
+        for (unsigned blockX = 0; blockX < plane->w_pxls; blockX += BLOCKSIZE) 
{                    \
+            int offset = blockX * plane->pxl_depth + (blockY * stride);        
                     \
+            int padding_r = FFMAX((int)(blockX + BLOCKSIZE) - 
(int)(plane->w_pxls_src), 0);         \
+            /* Copy values to block buffer */                                  
                     \
+            ff_copy_vals_buffer(plane->pxl_depth, offset, BLOCKSIZE, src, 
stride,                   \
+                             block_buffer, padding_r, padding_b);              
                     \
+            perform_dct(block_buffer, out_buffer, bit_depth);                  
                     \
+            /* Calculate energy */                                             
                     \
+            result->energy[block_i] = ff_calc_weighted_coeff(BLOCKSIZE, 
out_buffer, enable_lowpass);\
+            partial_sum->E += result->energy[block_i];                         
                     \
+            BRIGHTNESS(IS_BRIGHNTESS)                                          
                     \
+            ENERGY_DIF(IS_NOT_FIRST)                                           
                     \
+            WRITE_YUVIEW_BRIGHTNESS(IS_BRIGHNTESS, IS_YUVIEW)                  
                     \
+            WRITE_YUVIEW_ENERGY_DIF(IS_NOT_FIRST, IS_YUVIEW)                   
                     \
+            block_i++;                                                         
                     \
+        }                                                                      
                     \
+    }                                                                          
                     \
+}
+
+/* Every (blocksize, is-not-first, yuview, brightness) combination. */
+#define FUNCTION_LIST(X) \
+    X(8,0,0,0)  X(16,0,0,0)  X(32,0,0,0) X(8,1,0,0)  X(16,1,0,0)  X(32,1,0,0) \
+    X(8,0,1,0)  X(16,0,1,0)  X(32,0,1,0) X(8,1,1,0)  X(16,1,1,0)  X(32,1,1,0) \
+    X(8,0,0,1)  X(16,0,0,1)  X(32,0,0,1) X(8,1,0,1)  X(16,1,0,1)  X(32,1,0,1) \
+    X(8,0,1,1)  X(16,0,1,1)  X(32,0,1,1) X(8,1,1,1)  X(16,1,1,1)  X(32,1,1,1)
+
+/* Emit the 24 slice workers. */
+FUNCTION_LIST(DEFINE_CALC_ENERGY_SLICE)
+
+/* Name of the worker for one combination. */
+#define FN(blsz, isnf, brig, yuv) \
+    calc_vca_##blsz##_isnf##isnf##_brig##brig##_yuview##yuv##_slice
+
+/* Innermost table dimension: first frame (0) or later frame (1). */
+#define ISNF(blsz, brig, yuv) \
+    { FN(blsz,0,brig,yuv), FN(blsz,1,brig,yuv) }
+
+#define YUVIEW(blsz, brig) \
+    { ISNF(blsz, brig, 0), ISNF(blsz, brig, 1) }
+
+#define BRIGHT(blsz) \
+    { YUVIEW(blsz, 0), YUVIEW(blsz, 1) }
+
+/* Signature shared by all generated slice workers. */
+typedef void (*vca_slice_fn)(int stride, uint8_t *src, VCAPlaneInfo *plane,
+                             VCAResults *result, int enable_lowpass,
+                             int slice_start, int slice_end,
+                             ResultSums *partial_sum,
+                             void (*perform_dct)(const int16_t *block,
+                                                 int16_t *dst, int bit_depth));
+
+/*
+ * Worker lookup: [blocksize index 8/16/32][brightness][yuview][is-not-first].
+ * Typed as function pointers because ISO C gives no defined conversion
+ * between function pointers and void *.
+ */
+static const vca_slice_fn calc_fn_table[3][2][2][2] = {
+    BRIGHT(8),
+    BRIGHT(16),
+    BRIGHT(32)
+};
+
+
+/* Output sink that forwards the formatted message to av_vlog() at level lvl.
+ * A NULL msg is ignored. */
+static void print_log(AVFilterContext *ctx, int lvl, const char *msg, ...)
+{
+    va_list ap;
+
+    if (!msg)
+        return;
+
+    va_start(ap, msg);
+    av_vlog(ctx, lvl, msg, ap);
+    va_end(ap);
+}
+
+/*
+ * Output sink that writes the formatted message to the file opened for the
+ * "file" option. lvl is unused; it only keeps the signature identical to
+ * print_log(). A NULL msg is ignored.
+ *
+ * Formatting is delegated to avio_vprintf() so that messages of any length
+ * are written completely instead of passing an unchecked vsnprintf() return
+ * value to avio_write() as the length of a fixed 128-byte buffer.
+ */
+static void print_file(AVFilterContext *ctx, int lvl, const char *msg, ...)
+{
+    VCAContext *v = ctx->priv;
+    va_list ap;
+
+    if (!msg || !v->avio_context)
+        return;
+
+    va_start(ap, msg);
+    avio_vprintf(v->avio_context, msg, ap);
+    va_end(ap);
+}
+
+/*
+ * ff_filter_execute() callback: analyze the share of block rows that belongs
+ * to job number `job` out of `nb_jobs`, accumulating into that job's own
+ * ResultSums. Always returns 0.
+ */
+static int calc_energy_filter_slice(AVFilterContext *ctx, void *arg, int job, int nb_jobs)
+{
+    ThreadData *td      = arg;
+    const int h_blocks  = td->plane->h_blocks;
+    /* Split the block rows evenly; convert to pixel rows for the worker. */
+    const int first_row = (h_blocks * job)       / nb_jobs;
+    const int end_row   = (h_blocks * (job + 1)) / nb_jobs;
+
+    td->calc_vca_slice(td->stride, td->src, td->plane, td->result,
+                       td->enable_lowpass,
+                       first_row * td->blocksize, end_row * td->blocksize,
+                       td->partial_sums[job], td->perform_dct);
+
+    return 0;
+}
+
+/**
+ * Analyze one plane of a frame and store its normalized scores.
+ *
+ * The block rows of the plane are distributed over slice jobs; each job
+ * accumulates into its own ResultSums, which are summed here, divided by the
+ * number of blocks (and the E/h normalisation factors) and written to
+ * E[plane_i], L[plane_i] and h[plane_i]. Finally the per-block energies are
+ * saved as the reference for the next frame.
+ *
+ * If the per-job accumulators cannot be allocated an error is logged and the
+ * outputs are left untouched.
+ *
+ * @param inlink unused
+ * @param inl    unused
+ */
+static void perform_vca(AVFilterContext *ctx, AVFilterLink *inlink, AVFrame *in, FilterLink *inl,
+                        VCAContext *v, int plane_i, double *h, uint32_t *E, uint32_t *L)
+{
+    VCAPlaneInfo *plane       = v->plane[plane_i];
+    VCAResults *result        = v->result[plane_i];
+    const int nb_threads      = ff_filter_get_nb_threads(ctx);
+    ResultSums **partial_sums = av_calloc(nb_threads, sizeof(*partial_sums));
+    int allocated             = 0;
+
+    if (partial_sums) {
+        for (; allocated < nb_threads; allocated++) {
+            partial_sums[allocated] = av_calloc(1, sizeof(**partial_sums));
+            if (!partial_sums[allocated])
+                break;
+        }
+    }
+
+    if (allocated == nb_threads) {
+        ThreadData th = {
+            /* The workers compute byte offsets from the stride and the
+             * 16-bit copy routines halve it themselves, so it has to be
+             * the line size in bytes for every sample size. */
+            .stride         = in->linesize[plane_i],
+            .blocksize      = v->blocksize,
+            .enable_lowpass = v->enable_lowpass,
+            .src            = in->data[plane_i],
+            .plane          = plane,
+            .result         = result,
+            .partial_sums   = partial_sums,
+            .perform_dct    = v->perform_dct,
+            /* The first frame has no previous energies to compare against. */
+            .calc_vca_slice = v->n_frames_processed == 0 ? v->calc_vca_slice_isnf0
+                                                         : v->calc_vca_slice_isnf1,
+        };
+
+        ff_filter_execute(ctx, calc_energy_filter_slice, &th, NULL,
+                          FFMIN(plane->h_blocks, nb_threads));
+
+        /* Accumulators of jobs that did not run are still zero. */
+        for (int i = 0; i < nb_threads; i++) {
+            E[plane_i] += partial_sums[i]->E;
+            L[plane_i] += partial_sums[i]->L;
+            h[plane_i] += partial_sums[i]->h;
+        }
+
+        E[plane_i] /= (plane->n_blocks * E_norm_factor);
+        L[plane_i] /= (plane->n_blocks);
+        h[plane_i] /= (plane->n_blocks * h_norm_factor);
+
+        // At the end copy current energy to the previous
+        memcpy(result->energy_prev, result->energy, plane->n_blocks * sizeof(uint32_t));
+    } else {
+        av_log(ctx, AV_LOG_ERROR, "Could not allocate per-thread sums for plane %d\n", plane_i);
+    }
+
+    /* Release every accumulator, not only the pointer array. */
+    for (int i = 0; i < allocated; i++)
+        av_free(partial_sums[i]);
+    av_free(partial_sums);
+}
+
+/**
+ * Analyze an incoming frame, print its scores and pass it on unchanged.
+ *
+ * Once n_frames frames have been analyzed (unless n_frames is -1) frames
+ * are forwarded without analysis or output.
+ *
+ * @return the result of ff_filter_frame(), or AVERROR_INVALIDDATA when a
+ *         plane has a bit depth other than 8, 10 or 12; the frame is freed
+ *         in that case.
+ */
+static int filter_frame(AVFilterLink *inlink, AVFrame *in)
+{
+    AVFilterContext *ctx = inlink->dst;
+    VCAContext *v        = ctx->priv;
+    FilterLink *inl      = ff_filter_link(inlink);
+    const int planes     = v->enable_chroma ? 3 : 1;
+    uint32_t E[3]        = { 0 };
+    uint32_t L[3]        = { 0 };
+    double h[3]          = { 0 };
+
+    if (v->n_frames != -1 && v->n_frames_processed >= v->n_frames)
+        return ff_filter_frame(ctx->outputs[0], in);
+
+    for (int i = 0; i < planes; i++) {
+        const int depth = v->plane[i]->bit_depth;
+
+        if (depth != 8 && depth != 10 && depth != 12) {
+            /* This function owns the frame, so release it on failure.
+             * AVERROR_INVALIDDATA is already a negative error code. */
+            av_frame_free(&in);
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    for (int i = 0; i < planes; i++)
+        perform_vca(ctx, inlink, in, inl, v, i, h, E, L);
+
+    v->n_frames_processed++;
+
+    // Dump info;
+    if (v->yuview) {
+        const VCAPlaneInfo *luma = v->plane[0];
+        int block_i;
+
+        /* Type 0: block energy (already copied to energy_prev). */
+        block_i = 0;
+        for (unsigned y = 0; y < luma->h_blocks; y++) {
+            for (unsigned x = 0; x < luma->w_blocks; x++) {
+                v->print(ctx, AV_LOG_INFO, "%"PRId64";%u;%u;%u;%u;%d;%"PRIu32"\n",
+                         inl->frame_count_out,
+                         x * v->blocksize, y * v->blocksize, v->blocksize, v->blocksize,
+                         0, v->result[0]->energy_prev[block_i]);
+                block_i++;
+            }
+        }
+        /* Type 1: energy difference to the previous frame. */
+        block_i = 0;
+        for (unsigned y = 0; y < luma->h_blocks; y++) {
+            for (unsigned x = 0; x < luma->w_blocks; x++) {
+                v->print(ctx, AV_LOG_INFO, "%"PRId64";%u;%u;%u;%u;%d;%.0f\n",
+                         inl->frame_count_out,
+                         x * v->blocksize, y * v->blocksize, v->blocksize, v->blocksize,
+                         1, v->result[0]->energy_dif[block_i]);
+                block_i++;
+            }
+        }
+    } else {
+        v->print(ctx, AV_LOG_INFO, "%4"PRId64, inl->frame_count_out);
+        v->print(ctx, AV_LOG_INFO, ",%"PRIu32",%f", E[0], h[0]);
+        if (v->enable_brightness)
+            v->print(ctx, AV_LOG_INFO, ",%"PRIu32, L[0]);
+        if (v->enable_chroma) {
+            v->print(ctx, AV_LOG_INFO, ",%"PRIu32",%f", E[1], h[1]);
+            v->print(ctx, AV_LOG_INFO, ",%"PRIu32",%f", E[2], h[2]);
+            if (v->enable_brightness)
+                v->print(ctx, AV_LOG_INFO, ",%"PRIu32",%"PRIu32, L[1], L[2]);
+        }
+    }
+
+    v->print(ctx, AV_LOG_INFO, "\n");
+    return ff_filter_frame(ctx->outputs[0], in);
+}
+
+/**
+ * Derive the per-plane block geometry from the negotiated input, (re)allocate
+ * the per-block result buffers and print the output header.
+ *
+ * The buffers are zero-initialized because energy_prev and energy_dif are
+ * read (copied / printed) before the first-frame worker has written them.
+ *
+ * @return 0 on success, AVERROR(ENOMEM) if a buffer cannot be allocated
+ */
+static int config_input(AVFilterLink *inlink)
+{
+    AVFilterContext *ctx = inlink->dst;
+    VCAContext *v = ctx->priv;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
+    const int planes = v->enable_chroma ? 3 : 1;
+    int max_pixsteps[4];
+
+    v->plane[0]->w_pxls_src = inlink->w;
+    v->plane[0]->h_pxls_src = inlink->h;
+
+    /* Both chroma planes share the subsampled size. */
+    for (int i = 1; i < planes; i++) {
+        v->plane[i]->w_pxls_src = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w);
+        v->plane[i]->h_pxls_src = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h);
+    }
+
+    /* The pixel steps depend on the format only, so query them once. */
+    av_image_fill_max_pixsteps(max_pixsteps, NULL, desc);
+
+    for (int i = 0; i < planes; i++) {
+        VCAPlaneInfo *plane = v->plane[i];
+        VCAResults *result  = v->result[i];
+
+        plane->bit_depth = desc->comp[i].depth;
+        plane->pxl_depth = max_pixsteps[i];
+
+        /* Round the plane size up to whole blocks. */
+        plane->w_blocks = (plane->w_pxls_src + v->blocksize - 1) / v->blocksize;
+        plane->h_blocks = (plane->h_pxls_src + v->blocksize - 1) / v->blocksize;
+        plane->n_blocks = plane->w_blocks * plane->h_blocks;
+
+        plane->w_pxls = plane->w_blocks * v->blocksize;
+        plane->h_pxls = plane->h_blocks * v->blocksize;
+
+        // Free previous buffers in case they are allocated already
+        av_freep(&result->energy_prev);
+        av_freep(&result->energy_dif);
+        av_freep(&result->energy);
+        av_freep(&result->brightness);
+
+        result->energy      = av_calloc(plane->n_blocks, sizeof(*result->energy));
+        result->energy_prev = av_calloc(plane->n_blocks, sizeof(*result->energy_prev));
+        if (!result->energy || !result->energy_prev)
+            return AVERROR(ENOMEM);
+
+        if (v->yuview) {
+            result->energy_dif = av_calloc(plane->n_blocks, sizeof(*result->energy_dif));
+            result->brightness = av_calloc(plane->n_blocks, sizeof(*result->brightness));
+            if (!result->energy_dif || !result->brightness)
+                return AVERROR(ENOMEM);
+        }
+    }
+
+    if (v->yuview) {
+        v->print(ctx, AV_LOG_INFO, "%%;%%;Written by VCA for YUView\n");
+        v->print(ctx, AV_LOG_INFO, "%%;syntax-version;v1.22\n");
+        v->print(ctx, AV_LOG_INFO, "%%;%%;POC;X-position of the left top pixel in the block;Y-position of the left top pixel in the block;");
+        v->print(ctx, AV_LOG_INFO, "Width of the block;Height of the block; Type-ID;Type specific value\n");
+        v->print(ctx, AV_LOG_INFO, "%%;seq-specs;%s;%s;%d;%d;%d\n", "file", "layer0", v->plane[0]->w_pxls, v->plane[0]->h_pxls, 24);
+        v->print(ctx, AV_LOG_INFO, "%%;type;0;BlockEnergy;range\n");
+        v->print(ctx, AV_LOG_INFO, "%%;defaultRange;0;10000;heat\n");
+        v->print(ctx, AV_LOG_INFO, "%%;type;1;TempEnergyDiff;range\n");
+        v->print(ctx, AV_LOG_INFO, "%%;defaultRange;0;3000;heat\n");
+    } else {
+        /* Column order must match the values written by filter_frame():
+         * luma E,h[,L], then E,h of plane 1 (U) and plane 2 (V), then their
+         * brightness values. */
+        v->print(ctx, AV_LOG_INFO, "POC,E,h");
+        if (v->enable_brightness)
+            v->print(ctx, AV_LOG_INFO, ",L");
+        if (v->enable_chroma)
+            v->print(ctx, AV_LOG_INFO, ",EU,hU,EV,hV");
+        if (v->enable_brightness && v->enable_chroma)
+            v->print(ctx, AV_LOG_INFO, ",LU,LV");
+
+        v->print(ctx, AV_LOG_INFO, "\n");
+    }
+
+    av_log(ctx, AV_LOG_INFO, "threads: %d\n", ff_filter_get_nb_threads(ctx));
+
+    return 0;
+}
+
+/**
+ * Set up the filter from the user options: allocate the per-plane
+ * structures, choose the output sink, the block transform and the slice
+ * workers, and open the output file if one was requested.
+ *
+ * @return 0 on success, AVERROR(ENOMEM) on allocation failure,
+ *         AVERROR(EINVAL) for a blocksize other than 8, 16 or 32, or the
+ *         error returned by ff_vca_dct_init_x86() / avio_open()
+ */
+static av_cold int init(AVFilterContext *ctx)
+{
+    // User options but no input data
+    VCAContext *v = ctx->priv;
+    const int planes = v->enable_chroma ? 3 : 1;
+    int size_idx;
+    int ret;
+
+    // allocate arrays of pointers
+    v->result = av_calloc(planes, sizeof(*v->result));
+    v->plane  = av_calloc(planes, sizeof(*v->plane));
+    if (!v->result || !v->plane)
+        return AVERROR(ENOMEM);
+
+    // allocate each plane/result struct
+    for (int i = 0; i < planes; i++) {
+        v->result[i] = av_mallocz(sizeof(*v->result[i]));
+        v->plane[i]  = av_mallocz(sizeof(*v->plane[i]));
+
+        if (!v->result[i] || !v->plane[i])
+            return AVERROR(ENOMEM);
+    }
+
+    v->n_frames_processed = 0;
+    v->avio_context       = NULL;
+    v->print              = v->file_str ? print_file : print_log;
+
+    /* One place that validates the block size and picks the C transform as
+     * well as the row of the worker table. */
+    switch (v->blocksize) {
+    case 8:
+        size_idx       = 0;
+        v->perform_dct = v->enable_lowpass ? ff_vca_lowpass_dct8_c : ff_vca_dct8_c;
+        break;
+    case 16:
+        size_idx       = 1;
+        v->perform_dct = v->enable_lowpass ? ff_vca_lowpass_dct16_c : ff_vca_dct16_c;
+        break;
+    case 32:
+        size_idx       = 2;
+        v->perform_dct = v->enable_lowpass ? ff_vca_lowpass_dct32_c : ff_vca_dct32_c;
+        break;
+    default:
+        av_log(ctx, AV_LOG_ERROR, "Unallowed blocksize: %u\n", v->blocksize);
+        return AVERROR(EINVAL);
+    }
+
+    /* Runs after the C transform has been selected. */
+    if (v->enable_simd) {
+        #if ARCH_X86 && HAVE_X86ASM
+        ret = ff_vca_dct_init_x86(v);
+        if (ret != 0)
+            return ret;
+        #endif
+    }
+
+    v->calc_vca_slice_isnf0 = calc_fn_table[size_idx][v->enable_brightness][v->yuview][0];
+    v->calc_vca_slice_isnf1 = calc_fn_table[size_idx][v->enable_brightness][v->yuview][1];
+
+    if (v->file_str) {
+        ret = avio_open(&v->avio_context, v->file_str, AVIO_FLAG_WRITE);
+        if (ret < 0) {
+            av_log(ctx, AV_LOG_ERROR, "Could not open %s: %s\n",
+                   v->file_str, av_err2str(ret));
+            return ret;
+        }
+    }
+
+    return 0;
+}
+
+/**
+ * Release everything owned by the context.
+ *
+ * Safe to call after a partially failed init(): the pointer arrays and
+ * their entries are checked before they are dereferenced. Besides the
+ * per-block buffers this also frees the per-plane structs and the two
+ * pointer arrays allocated by init().
+ */
+static av_cold void uninit(AVFilterContext *ctx)
+{
+    VCAContext *v = ctx->priv;
+    const int planes = v->enable_chroma ? 3 : 1;
+
+    for (int i = 0; i < planes; i++) {
+        if (v->result && v->result[i]) {
+            av_freep(&v->result[i]->energy);
+            av_freep(&v->result[i]->energy_prev);
+            av_freep(&v->result[i]->energy_dif);
+            av_freep(&v->result[i]->brightness);
+            av_freep(&v->result[i]);
+        }
+        if (v->plane)
+            av_freep(&v->plane[i]);
+    }
+    av_freep(&v->result);
+    av_freep(&v->plane);
+
+    if (v->avio_context)
+        avio_closep(&v->avio_context);
+}
+
+/* Single video input: geometry is set up in config_input(), every frame is
+ * handled by filter_frame(). */
+static const AVFilterPad avfilter_vf_vca_inputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = config_input,
+        .filter_frame = filter_frame,
+    },
+};
+
+/* Filter definition: one video input, the default video output, slice
+ * threading enabled. */
+const FFFilter ff_vf_vca = {
+    .p.name        = "vca",
+    .p.description = NULL_IF_CONFIG_SMALL("Perform VCA analysis."),
+    .p.priv_class  = &vca_class,
+    .p.flags       = AVFILTER_FLAG_SLICE_THREADS,
+    .priv_size     = sizeof(VCAContext),
+    .init          = init,
+    .uninit        = uninit,
+    FILTER_INPUTS(avfilter_vf_vca_inputs),
+    FILTER_OUTPUTS(ff_video_default_filterpad),
+    FILTER_PIXFMTS_ARRAY(pxl_fmts),
+};
\ No newline at end of file
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index ade0efc9ae..7711da416b 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -64,6 +64,7 @@ X86ASM-OBJS-$(CONFIG_TINTERLACE_FILTER)      += 
x86/vf_interlace.o            \
                                                 x86/vf_tinterlace_init.o
 X86ASM-OBJS-$(CONFIG_TRANSPOSE_FILTER)       += x86/vf_transpose.o            \
                                                 x86/vf_transpose_init.o
+X86ASM-OBJS-$(CONFIG_VCA_FILTER)             += x86/vf_vca.o x86/vf_vca_init.o
 X86ASM-OBJS-$(CONFIG_VOLUME_FILTER)          += x86/af_volume.o 
x86/af_volume_init.o
 X86ASM-OBJS-$(CONFIG_V360_FILTER)            += x86/vf_v360.o 
x86/vf_v360_init.o
 X86ASM-OBJS-$(CONFIG_W3FDIF_FILTER)          += x86/vf_w3fdif.o 
x86/vf_w3fdif_init.o
diff --git a/libavfilter/x86/vf_vca.asm b/libavfilter/x86/vf_vca.asm
new file mode 100644
index 0000000000..c121f2680c
--- /dev/null
+++ b/libavfilter/x86/vf_vca.asm
@@ -0,0 +1,877 @@
+;*****************************************************************************
+;* x86-optimized functions for DCT of VCA filter
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+
+;TO-DO : Further optimize the routines.
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 64
+
+dct8_shuf:         times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9 ; pshufb mask: reverse the 4 words of each qword
+
+tab_dct8:       dw 64, 64, 64, 64, 64, 64, 64, 64            ; 8 rows x 8 words (16 bytes/row);
+                dw 89, 75, 50, 18, -18, -50, -75, -89        ; addressed as [r6 + k * 16] by
+                dw 83, 36, -36, -83, -83, -36, 36, 83        ; DCT8_PASS_1 / DCT8_PASS_2
+                dw 75, -18, -89, -50, 50, 89, 18, -75
+                dw 64, -64, -64, 64, 64, -64, -64, 64
+                dw 50, -89, 18, 75, -75, -18, 89, -50
+                dw 36, -83, 83, -36, -36, 83, -83, 36
+                dw 18, -50, 75, -89, 89, -75, 50, -18
+
+tab_dct16_1:    dw 64, 64, 64, 64, 64, 64, 64, 64            ; 16 rows x 8 words (16 bytes/row);
+                dw 90, 87, 80, 70, 57, 43, 25,  9            ; VCA_DCT16 points r7 at row 8
+                dw 89, 75, 50, 18, -18, -50, -75, -89        ; (tab_dct16_1 + 8 * 16), so rows are
+                dw 87, 57,  9, -43, -80, -90, -70, -25       ; reached with offsets -8*16 .. 7*16
+                dw 83, 36, -36, -83, -83, -36, 36, 83
+                dw 80,  9, -70, -87, -25, 57, 90, 43
+                dw 75, -18, -89, -50, 50, 89, 18, -75
+                dw 70, -43, -87,  9, 90, 25, -80, -57
+                dw 64, -64, -64, 64, 64, -64, -64, 64
+                dw 57, -80, -25, 90, -9, -87, 43, 70
+                dw 50, -89, 18, 75, -75, -18, 89, -50
+                dw 43, -90, 57, 25, -87, 70,  9, -80
+                dw 36, -83, 83, -36, -36, 83, -83, 36
+                dw 25, -70, 90, -80, 43,  9, -57, 87
+                dw 18, -50, 75, -89, 89, -75, 50, -18
+                dw  9, -25, 43, -57, 70, -80, 87, -90
+
+tab_dct16_2:    dw 64, 64, 64, 64, 64, 64, 64, 64            ; 16 rows x 8 words (16 bytes/row);
+                dw -9, -25, -43, -57, -70, -80, -87, -90     ; VCA_DCT16 points r8 at row 8
+                dw -89, -75, -50, -18, 18, 50, 75, 89        ; (tab_dct16_2 + 8 * 16); read only
+                dw 25, 70, 90, 80, 43, -9, -57, -87          ; by DCT16_PASS_2
+                dw 83, 36, -36, -83, -83, -36, 36, 83
+                dw -43, -90, -57, 25, 87, 70, -9, -80
+                dw -75, 18, 89, 50, -50, -89, -18, 75
+                dw 57, 80, -25, -90, -9, 87, 43, -70
+                dw 64, -64, -64, 64, 64, -64, -64, 64
+                dw -70, -43, 87,  9, -90, 25, 80, -57
+                dw -50, 89, -18, -75, 75, 18, -89, 50
+                dw 80, -9, -70, 87, -25, -57, 90, -43
+                dw 36, -83, 83, -36, -36, 83, -83, 36
+                dw -87, 57, -9, -43, 80, -90, 70, -25
+                dw -18, 50, -75, 89, -89, 75, -50, 18
+                dw 90, -87, 80, -70, 57, -43, 25, -9
+
+dct16_shuf1:     times 2 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 ; pshufb mask: reverse the 8 words of each 128-bit lane
+
+dct16_shuf2:    times 2 db 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9 ; pshufb mask: word order 0,7,1,6,2,5,3,4 per lane
+
+tab_dct32_1:    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 ; 32 rows x 16 words; DCT32_PASS_2 loads rows with mova, so keep 32-byte aligned
+                dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13,  4
+                dw 90, 87, 80, 70, 57, 43, 25,  9, -9, -25, -43, -57, -70, -80, -87, -90
+                dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
+                dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
+                dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22
+                dw 87, 57,  9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87
+                dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31
+                dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
+                dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67,  4, 73, 88, 38
+                dw 80,  9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80
+                dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46
+                dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
+                dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54
+                dw 70, -43, -87,  9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70
+                dw 67, -54, -78, 38, 85, -22, -90,  4, 90, 13, -88, -31, 82, 46, -73, -61
+                dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
+                dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67
+                dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87,  9, -90, 25, 80, -57
+                dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73
+                dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
+                dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82,  4, 78
+                dw 43, -90, 57, 25, -87, 70,  9, -80, 80, -9, -70, 87, -25, -57, 90, -43
+                dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82
+                dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
+                dw 31, -78, 90, -61,  4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85
+                dw 25, -70, 90, -80, 43,  9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25
+                dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88
+                dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
+                dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31,  4, 22, -46, 67, -82, 90
+                dw 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9
+                dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
+
+tab_dct32_2:    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 ; 32 rows x 16 words; DCT32_PASS_2 loads rows with mova, so keep 32-byte aligned
+                dw -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90
+                dw -90, -87, -80, -70, -57, -43, -25, -9,  9, 25, 43, 57, 70, 80, 87, 90
+                dw 13, 38, 61, 78, 88, 90, 85, 73, 54, 31,  4, -22, -46, -67, -82, -90
+                dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
+                dw -22, -61, -85, -90, -73, -38,  4, 46, 78, 90, 82, 54, 13, -31, -67, -88
+                dw -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43,  9, 57, 87
+                dw 31, 78, 90, 61,  4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85
+                dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
+                dw -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82
+                dw -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70,  9, 80
+                dw 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82,  4, -78
+                dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
+                dw -54, -85,  4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73
+                dw -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90,  9, -87, -43, 70
+                dw 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67
+                dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
+                dw -67, -54, 78, 38, -85, -22, 90,  4, -90, 13, 88, -31, -82, 46, 73, -61
+                dw -57, 80, 25, -90,  9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57
+                dw 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88,  4, 85, -54
+                dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
+                dw -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46
+                dw -43, 90, -57, -25, 87, -70, -9, 80, -80,  9, 70, -87, 25, 57, -90, 43
+                dw 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67,  4, -73, 88, -38
+                dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
+                dw -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31
+                dw -25, 70, -90, 80, -43, -9, 57, -87, 87, -57,  9, 43, -80, 90, -70, 25
+                dw 88, -67, 31, 13, -54, 82, -90, 78, -46,  4, 38, -73, 90, -85, 61, -22
+                dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
+                dw -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13
+                dw -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25,  9
+                dw 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4
+
+tab_dct4:       times 4 dw 64, 64               ; NOTE(review): tab_dct4, tab_dct8_1,
+                times 4 dw 83, 36               ; tab_dct8_2 and pb_unpackhlw1 are not
+                times 4 dw 64, -64              ; referenced by any routine in this file
+                times 4 dw 36, -83
+
+tab_dct8_1:     times 2 dw 89, 50, 75, 18
+                times 2 dw 75, -89, -18, -50
+                times 2 dw 50, 18, -89, 75
+                times 2 dw 18, 75, -50, -89
+
+tab_dct8_2:     times 2 dd 83, 36
+                times 2 dd 36, 83
+                times 1 dd 89, 75, 50, 18
+                times 1 dd 75, -18, -89, -50
+                times 1 dd 50, -89, 18, 75
+                times 1 dd 18, -50, 75, -89
+
+pb_unpackhlw1:  db 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15
+
+pd_2:           times  8 dd 2           ; dword rounding offsets; plain file-local
+pd_4:           times  4 dd 4           ; labels kept in the read-only data section,
+pd_8:           times  4 dd 8           ; so no ff_ symbol is exported from here or
+pd_16:          times  4 dd 16          ; imported from another library
+pd_32:          times  4 dd 32
+pd_64:          times  4 dd 64
+pd_128:         times  4 dd 128
+pd_256:         times  4 dd 256
+pd_512:         times  4 dd 512
+pd_1024:        times  4 dd 1024
+pw_ppppmmmm:    times  1 dw   1,   1,   1,   1,  -1,  -1,  -1,  -1
+trans8_shuf:    times  1 dd   0,   4,   1,   5,   2,   6,   3,   7
+
+SECTION .text
+
+%macro DCT_CONSTS 1             ; %1 = bit depth (8, 10 or 12): select per-depth shift/round values
+%if %1 == 8
+    %define     DCT4_SHIFT          1
+    %define     DCT4_ROUND          1
+    %define     DST4_SHIFT          1
+    %define     DST4_ROUND          1
+    %define     DCT8_SHIFT1         2
+    %define     DCT8_ROUND1         2
+    %define    IDCT_SHIFT           12
+    %define    IDCT_ROUND           2048
+%elif %1 == 10
+    %define     DCT4_SHIFT          3
+    %define     DCT4_ROUND          4
+    %define     DST4_SHIFT          3
+    %define     DST4_ROUND          4
+    %define     DCT8_SHIFT1         4
+    %define     DCT8_ROUND1         8
+    %define    IDCT_SHIFT           10
+    %define    IDCT_ROUND           512
+%elif %1 == 12
+    %define     DCT4_SHIFT          5
+    %define     DCT4_ROUND          16
+    %define     DST4_SHIFT          5
+    %define     DST4_ROUND          16
+    %define     DCT8_SHIFT1         6
+    %define     DCT8_ROUND1         32
+    %define    IDCT_SHIFT           8
+    %define    IDCT_ROUND           128
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+%endmacro
+
+%define  DCT8_SHIFT2         9
+%define  DCT8_ROUND2         256
+
+%if ARCH_X86_64 == 1
+%macro DCT8_PASS_1 4            ; %1 = offset into table at r6, %2 = offset into scratch at r5, %3/%4 = input register numbers
+    vpbroadcastq    m0,                 [r6 + %1]       ; 4 coefficient words repeated in every qword
+    pmaddwd         m2,                 m%3, m0
+    pmaddwd         m0,                 m%4
+    phaddd          m2,                 m0
+    paddd           m2,                 m5              ; m5 = rounding offset loaded by the caller
+    psrad           m2,                 DCT8_SHIFT1
+    packssdw        m2,                 m2
+    vpermq          m2,                 m2, 0x08        ; qwords 0 and 2 -> low 128 bits
+    mova            [r5 + %2],          xm2             ; store 8 words of pass-1 output
+%endmacro
+
+%macro DCT8_PASS_2 2            ; %1, %2 = offsets of two 16-byte rows in the table at r6; result left in m10
+    vbroadcasti128  m4,                 [r6 + %1]
+    pmaddwd         m6,                 m0, m4          ; m0..m3 = pass-1 output loaded by the caller
+    pmaddwd         m7,                 m1, m4
+    pmaddwd         m8,                 m2, m4
+    pmaddwd         m9,                 m3, m4
+    phaddd          m6,                 m7
+    phaddd          m8,                 m9
+    phaddd          m6,                 m8
+    paddd           m6,                 m5              ; m5 = rounding offset loaded by the caller
+    psrad           m6,                 DCT8_SHIFT2
+
+    vbroadcasti128  m4,                 [r6 + %2]
+    pmaddwd         m10,                m0, m4
+    pmaddwd         m7,                 m1, m4
+    pmaddwd         m8,                 m2, m4
+    pmaddwd         m9,                 m3, m4
+    phaddd          m10,                m7
+    phaddd          m8,                 m9
+    phaddd          m10,                m8
+    paddd           m10,                m5
+    psrad           m10,                DCT8_SHIFT2
+
+    packssdw        m6,                 m10
+    vpermq          m10,                m6, 0xD8        ; swap the two middle qwords
+
+%endmacro
+
+%macro VCA_DCT8 1               ; %1 = bit depth (8, 10 or 12); emits dct8_<depth>bit for AVX2
+
+INIT_YMM avx2
+%if %1 == 12
+cglobal dct8_12bit, 3, 7, 11, 0-8*16
+%elif %1 == 10
+cglobal dct8_10bit, 3, 7, 11, 0-8*16
+%elif %1 == 8
+cglobal dct8_8bit, 3, 7, 11, 0-8*16
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+vbroadcasti128      m5,                [pd_ %+ DCT8_ROUND1]
+%define             DCT_SHIFT2         9
+
+    add             r2d,               r2d                 ; r2 = 2 * srcStride, used below as a byte step
+    lea             r3,                [r2 * 3]
+    lea             r4,                [r0 + r2 * 4]       ; r4 = r0 advanced by four row steps
+    mov             r5,                rsp                 ; r5 = stack scratch holding pass-1 output
+    lea             r6,                [tab_dct8]
+    mova            m6,                [dct8_shuf]
+
+    ;pass1
+    mova            xm0,               [r0]                ; aligned 16-byte loads from the source rows
+    vinserti128     m0,                m0, [r4], 1
+    mova            xm1,               [r0 + r2]
+    vinserti128     m1,                m1, [r4 + r2], 1
+    mova            xm2,               [r0 + r2 * 2]
+    vinserti128     m2,                m2, [r4 + r2 * 2], 1
+    mova            xm3,               [r0 + r3]
+    vinserti128     m3,                m3,  [r4 + r3], 1
+
+    punpcklqdq      m4,                m0, m1
+    punpckhqdq      m0,                m1
+    punpcklqdq      m1,                m2, m3
+    punpckhqdq      m2,                m3
+
+    pshufb          m0,                m6                  ; reverse word order of the high halves
+    pshufb          m2,                m6
+
+    paddw           m3,                m4, m0              ; sums feed the even table rows below
+    paddw           m7,                m1, m2
+
+    psubw           m4,                m0                  ; differences feed the odd table rows
+    psubw           m1,                m2
+
+    DCT8_PASS_1     0 * 16,             0 * 16, 3, 7
+    DCT8_PASS_1     1 * 16,             2 * 16, 4, 1
+    DCT8_PASS_1     2 * 16,             4 * 16, 3, 7
+    DCT8_PASS_1     3 * 16,             6 * 16, 4, 1
+    DCT8_PASS_1     4 * 16,             1 * 16, 3, 7
+    DCT8_PASS_1     5 * 16,             3 * 16, 4, 1
+    DCT8_PASS_1     6 * 16,             5 * 16, 3, 7
+    DCT8_PASS_1     7 * 16,             7 * 16, 4, 1
+
+    ;pass2
+    vbroadcasti128  m5,                [pd_ %+ DCT8_ROUND2]
+
+    mova            m0,                [r5]                ; reload the 128 bytes written by pass 1
+    mova            m1,                [r5 + 32]
+    mova            m2,                [r5 + 64]
+    mova            m3,                [r5 + 96]
+
+    DCT8_PASS_2     0 * 16, 1 * 16
+    movu            [r1],              m10                 ; two output rows per store
+    DCT8_PASS_2     2 * 16, 3 * 16
+    movu            [r1 + 32],         m10
+    DCT8_PASS_2     4 * 16, 5 * 16
+    movu            [r1 + 64],         m10
+    DCT8_PASS_2     6 * 16, 7 * 16
+    movu            [r1 + 96],         m10
+    RET
+%endmacro
+
+%macro DCT16_PASS_1_E 2         ; %1 = table offset from r7, %2 = scratch offset from r5; inputs m0/m2, clobbers m4/m6/m7
+    vpbroadcastq    m7,                [r7 + %1]           ; 4 coefficient words repeated in every qword
+
+    pmaddwd         m4,                m0, m7
+    pmaddwd         m6,                m2, m7
+    phaddd          m4,                m6
+
+    paddd           m4,                m9                  ; m9 = rounding offset loaded by the caller
+    psrad           m4,                DCT_SHIFT
+
+    packssdw        m4,                m4
+    vpermq          m4,                m4, 0x08            ; qwords 0 and 2 -> low 128 bits
+
+    mova            [r5 + %2],         xm4
+%endmacro
+
+%macro DCT16_PASS_1_O 2          ; %1 = table offset from r7, %2 = scratch offset from r5; inputs m0/m2/m4/m6
+    vbroadcasti128  m7,                [r7 + %1]
+
+    pmaddwd         m10,               m0, m7
+    pmaddwd         m11,               m2, m7
+    phaddd          m10,               m11                 ; [d0 d0 d1 d1 d4 d4 d5 d5]
+
+    pmaddwd         m11,               m4, m7
+    pmaddwd         m12,               m6, m7
+    phaddd          m11,               m12                 ; [d2 d2 d3 d3 d6 d6 d7 d7]
+
+    phaddd          m10,               m11                 ; [d0 d1 d2 d3 d4 d5 d6 d7]
+
+    paddd           m10,               m9                  ; m9 = rounding offset loaded by the caller
+    psrad           m10,               DCT_SHIFT
+
+    packssdw        m10,               m10                 ; [w0 w1 w2 w3 - - - - w4 w5 w6 w7 - - - -]
+    vpermq          m10,               m10, 0x08           ; qwords 0 and 2 -> low 128 bits
+
+    mova            [r5 + %2],         xm10
+%endmacro
+
+%macro DCT16_PASS_2 2           ; %1, %2 = row offsets applied to both r7 and r8; results left in xm15 and xm14
+    vbroadcasti128  m8,                [r7 + %1]
+    vbroadcasti128  m13,               [r8 + %1]
+
+    pmaddwd         m10,               m0, m8              ; even registers pair with r7, odd ones with r8
+    pmaddwd         m11,               m1, m13
+    paddd           m10,               m11
+
+    pmaddwd         m11,               m2, m8
+    pmaddwd         m12,               m3, m13
+    paddd           m11,               m12
+    phaddd          m10,               m11
+
+    pmaddwd         m11,               m4, m8
+    pmaddwd         m12,               m5, m13
+    paddd           m11,               m12
+
+    pmaddwd         m12,               m6, m8
+    pmaddwd         m13,               m7, m13             ; last use of the coefficients, m13 is overwritten
+    paddd           m12,               m13
+    phaddd          m11,               m12
+
+    phaddd          m10,               m11
+    paddd           m10,               m9                  ; m9 = rounding offset loaded by the caller
+    psrad           m10,               DCT_SHIFT2
+
+
+    vbroadcasti128  m8,                [r7 + %2]
+    vbroadcasti128  m13,               [r8 + %2]
+
+    pmaddwd         m14,               m0, m8
+    pmaddwd         m11,               m1, m13
+    paddd           m14,               m11
+
+    pmaddwd         m11,               m2, m8
+    pmaddwd         m12,               m3, m13
+    paddd           m11,               m12
+    phaddd          m14,               m11
+
+    pmaddwd         m11,               m4, m8
+    pmaddwd         m12,               m5, m13
+    paddd           m11,               m12
+
+    pmaddwd         m12,               m6, m8
+    pmaddwd         m13,               m7, m13
+    paddd           m12,               m13
+    phaddd          m11,               m12
+
+    phaddd          m14,               m11
+    paddd           m14,               m9
+    psrad           m14,               DCT_SHIFT2
+
+    packssdw        m10,               m14
+    vextracti128    xm14,              m10,       1
+    movlhps         xm15,              xm10,      xm14     ; xm15 = low qwords of both lanes
+    movhlps         xm14,              xm10                ; xm14 = high qwords of both lanes
+%endmacro
+
+%macro VCA_DCT16 1              ; %1 = bit depth (8, 10 or 12); emits dct16_<depth>bit for AVX2
+
+INIT_YMM avx2
+%if %1 == 12
+cglobal dct16_12bit, 3, 9, 16, 0-16*mmsize
+    %define         DCT_SHIFT          7
+    vbroadcasti128  m9,                [pd_64]
+%elif %1 == 10
+cglobal dct16_10bit, 3, 9, 16, 0-16*mmsize
+    %define         DCT_SHIFT          5
+    vbroadcasti128  m9,                [pd_16]
+%elif %1 == 8
+cglobal dct16_8bit, 3, 9, 16, 0-16*mmsize
+    %define         DCT_SHIFT          3
+    vbroadcasti128  m9,                [pd_4]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+%define             DCT_SHIFT2         10
+
+    add             r2d,               r2d                 ; r2 = 2 * srcStride, used below as a byte step
+
+    mova            m13,               [dct16_shuf1]
+    mova            m14,               [dct16_shuf2]
+    lea             r7,                [tab_dct16_1 + 8 * 16]
+    lea             r8,                [tab_dct16_2 + 8 * 16]
+    lea             r3,                [r2 * 3]
+    mov             r5,                rsp                 ; r5 = stack scratch holding pass-1 output
+    mov             r4d,               2                   ; Each iteration processes 8 rows, so 16/8 iterations
+
+.pass1:
+    lea             r6,                [r0 + r2 * 4]
+
+    movu            m2,                [r0]
+    movu            m1,                [r6]
+    vperm2i128      m0,                m2, m1, 0x20        ; [row0lo  row4lo]
+    vperm2i128      m1,                m2, m1, 0x31        ; [row0hi  row4hi]
+
+    movu            m4,                [r0 + r2]
+    movu            m3,                [r6 + r2]
+    vperm2i128      m2,                m4, m3, 0x20        ; [row1lo  row5lo]
+    vperm2i128      m3,                m4, m3, 0x31        ; [row1hi  row5hi]
+
+    movu            m6,                [r0 + r2 * 2]
+    movu            m5,                [r6 + r2 * 2]
+    vperm2i128      m4,                m6, m5, 0x20        ; [row2lo  row6lo]
+    vperm2i128      m5,                m6, m5, 0x31        ; [row2hi  row6hi]
+
+    movu            m8,                [r0 + r3]
+    movu            m7,                [r6 + r3]
+    vperm2i128      m6,                m8, m7, 0x20        ; [row3lo  row7lo]
+    vperm2i128      m7,                m8, m7, 0x31        ; [row3hi  row7hi]
+
+    pshufb          m1,                m13
+    pshufb          m3,                m13
+    pshufb          m5,                m13
+    pshufb          m7,                m13
+
+    paddw           m8,                m0, m1              ;E
+    psubw           m0,                m1                  ;O
+
+    paddw           m1,                m2, m3              ;E
+    psubw           m2,                m3                  ;O
+
+    paddw           m3,                m4, m5              ;E
+    psubw           m4,                m5                  ;O
+
+    paddw           m5,                m6, m7              ;E
+    psubw           m6,                m7                  ;O
+
+    DCT16_PASS_1_O  -7 * 16,           1 * 32
+    DCT16_PASS_1_O  -5 * 16,           3 * 32
+    DCT16_PASS_1_O  -3 * 16,           1 * 32 + 16
+    DCT16_PASS_1_O  -1 * 16,           3 * 32 + 16
+    DCT16_PASS_1_O  1 * 16,            5 * 32
+    DCT16_PASS_1_O  3 * 16,            7 * 32
+    DCT16_PASS_1_O  5 * 16,            5 * 32 + 16
+    DCT16_PASS_1_O  7 * 16,            7 * 32 + 16
+
+    pshufb          m8,                m14
+    pshufb          m1,                m14
+    phaddw          m0,                m8, m1
+
+    pshufb          m3,                m14
+    pshufb          m5,                m14
+    phaddw          m2,                m3, m5
+
+    DCT16_PASS_1_E  -8 * 16,           0 * 32
+    DCT16_PASS_1_E  -4 * 16,           0 * 32 + 16
+    DCT16_PASS_1_E  0 * 16,            4 * 32
+    DCT16_PASS_1_E  4 * 16,            4 * 32 + 16
+
+    phsubw          m0,                m8, m1
+    phsubw          m2,                m3, m5
+
+    DCT16_PASS_1_E  -6 * 16,           2 * 32
+    DCT16_PASS_1_E  -2 * 16,           2 * 32 + 16
+    DCT16_PASS_1_E  2 * 16,            6 * 32
+    DCT16_PASS_1_E  6 * 16,            6 * 32 + 16
+
+    lea             r0,                [r0 + 8 * r2]
+    add             r5,                256
+
+    dec             r4d
+    jnz             .pass1
+
+    mov             r5,                rsp
+    mov             r4d,               2
+    mov             r2d,               32                  ; r2 now holds the 32-byte output row step
+    lea             r3,                [r2 * 3]
+    vbroadcasti128  m9,                [pd_512]            ; pass-2 rounding offset
+
+.pass2:
+    mova            m0,                [r5 + 0 * 32]        ; [row0lo  row4lo]
+    mova            m1,                [r5 + 8 * 32]        ; [row0hi  row4hi]
+
+    mova            m2,                [r5 + 1 * 32]        ; [row1lo  row5lo]
+    mova            m3,                [r5 + 9 * 32]        ; [row1hi  row5hi]
+
+    mova            m4,                [r5 + 2 * 32]        ; [row2lo  row6lo]
+    mova            m5,                [r5 + 10 * 32]       ; [row2hi  row6hi]
+
+    mova            m6,                [r5 + 3 * 32]        ; [row3lo  row7lo]
+    mova            m7,                [r5 + 11 * 32]       ; [row3hi  row7hi]
+
+    DCT16_PASS_2    -8 * 16, -7 * 16
+    movu            [r1],              xm15
+    movu            [r1 + r2],         xm14
+
+    DCT16_PASS_2    -6 * 16, -5 * 16
+    movu            [r1 + r2 * 2],     xm15
+    movu            [r1 + r3],         xm14
+
+    lea             r6,                [r1 + r2 * 4]
+    DCT16_PASS_2    -4 * 16, -3 * 16
+    movu            [r6],              xm15
+    movu            [r6 + r2],         xm14
+
+    DCT16_PASS_2    -2 * 16, -1 * 16
+    movu            [r6 + r2 * 2],     xm15
+    movu            [r6 + r3],         xm14
+
+    lea             r6,                [r6 + r2 * 4]
+    DCT16_PASS_2    0 * 16, 1 * 16
+    movu            [r6],              xm15
+    movu            [r6 + r2],         xm14
+
+    DCT16_PASS_2    2 * 16, 3 * 16
+    movu            [r6 + r2 * 2],     xm15
+    movu            [r6 + r3],         xm14
+
+    lea             r6,                [r6 + r2 * 4]
+    DCT16_PASS_2    4 * 16, 5 * 16
+    movu            [r6],              xm15
+    movu            [r6 + r2],         xm14
+
+    DCT16_PASS_2    6 * 16, 7 * 16
+    movu            [r6 + r2 * 2],     xm15
+    movu            [r6 + r3],         xm14
+
+    add             r1,                16
+    add             r5,                128
+
+    dec             r4d
+    jnz             .pass2
+    RET
+%endmacro
+
+%macro DCT32_PASS_1 4           ; %1 = table offset from r7, %2 = scratch offset from r5, %3/%4 = even-part input register numbers
+    vbroadcasti128  m8,                [r7 + %1]
+    pmaddwd         m11,               m%3, m8
+    pmaddwd         m12,               m%4, m8
+    phaddd          m11,               m12
+
+    vbroadcasti128  m8,                [r7 + %1 + 32]      ; low and high 16 bytes of the next table row
+    vbroadcasti128  m10,               [r7 + %1 + 48]
+    pmaddwd         m12,               m5, m8
+    pmaddwd         m13,               m6, m10
+    phaddd          m12,               m13
+
+    pmaddwd         m13,               m4, m8
+    pmaddwd         m14,               m7, m10
+    phaddd          m13,               m14
+
+    phaddd          m12,               m13
+
+    phaddd          m11,               m12
+    paddd           m11,               m9                  ; m9 = rounding offset loaded by the caller
+    psrad           m11,               DCT_SHIFT
+
+    vpermq          m11,               m11, 0xD8           ; swap the two middle qwords
+    packssdw        m11,               m11
+    movq            [r5 + %2],         xm11                ; 4 words from the low lane
+    vextracti128    xm10,              m11, 1
+    movq            [r5 + %2 + 64],    xm10                ; 4 words from the high lane, 64 bytes further
+%endmacro
+
+%macro DCT32_PASS_2 1           ; %1 = row offset applied to both r7 and r8; 4 result words left in the low qword of xm11
+    mova            m8,                [r7 + %1]           ; aligned 32-byte loads of one full table row each
+    mova            m10,               [r8 + %1]
+    pmaddwd         m11,               m0, m8
+    pmaddwd         m12,               m1, m10
+    paddd           m11,               m12
+
+    pmaddwd         m12,               m2, m8
+    pmaddwd         m13,               m3, m10
+    paddd           m12,               m13
+
+    phaddd          m11,               m12
+
+    pmaddwd         m12,               m4, m8
+    pmaddwd         m13,               m5, m10
+    paddd           m12,               m13
+
+    pmaddwd         m13,               m6, m8
+    pmaddwd         m14,               m7, m10
+    paddd           m13,               m14
+
+    phaddd          m12,               m13
+
+    phaddd          m11,               m12
+    vextracti128    xm10,              m11, 1
+    paddd           xm11,              xm10                ; fold the high lane into the low lane
+
+    paddd           xm11,               xm9                ; xm9 = rounding offset loaded by the caller
+    psrad           xm11,               DCT_SHIFT2
+    packssdw        xm11,               xm11
+
+%endmacro
+
+%macro VCA_DCT32 1              ; %1 = bit depth (8, 10 or 12); emits dct32_<depth>bit for AVX2
+INIT_YMM avx2
+
+%if %1 == 12
+cglobal dct32_12bit, 3, 9, 16, 0-64*mmsize
+    %define         DCT_SHIFT          8
+    vpbroadcastq    m9,                [pd_128]
+%elif %1 == 10
+cglobal dct32_10bit, 3, 9, 16, 0-64*mmsize
+    %define         DCT_SHIFT          6
+    vpbroadcastq    m9,                [pd_32]
+%elif %1 == 8
+cglobal dct32_8bit, 3, 9, 16, 0-64*mmsize
+    %define         DCT_SHIFT          4
+    vpbroadcastq    m9,                [pd_8]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+%define             DCT_SHIFT2         11
+
+    add             r2d,               r2d                 ; r2 = 2 * srcStride, used below as a byte step
+
+    lea             r7,                [tab_dct32_1]
+    lea             r8,                [tab_dct32_2]
+    lea             r3,                [r2 * 3]
+    mov             r5,                rsp                 ; r5 = stack scratch holding pass-1 output
+    mov             r4d,               8                   ; 8 iterations of 4 rows each
+    mova            m15,               [dct16_shuf1]
+
+.pass1:
+    movu            m2,                [r0]
+    movu            m1,                [r0 + 32]
+    pshufb          m1,                m15
+    vpermq          m1,                m1, 0x4E            ; swap 128-bit halves: full 16-word reversal
+    psubw           m7,                m2, m1
+    paddw           m2,                m1
+
+    movu            m1,                [r0 + r2 * 2]
+    movu            m0,                [r0 + r2 * 2 + 32]
+    pshufb          m0,                m15
+    vpermq          m0,                m0, 0x4E
+    psubw           m8,                m1, m0
+    paddw           m1,                m0
+    vperm2i128      m0,                m2, m1, 0x20        ; [row0lo  row2lo] for E
+    vperm2i128      m3,                m2, m1, 0x31        ; [row0hi  row2hi] for E
+    pshufb          m3,                m15
+    psubw           m1,                m0, m3
+    paddw           m0,                m3
+
+    vperm2i128      m5,                m7, m8, 0x20        ; [row0lo  row2lo] for O
+    vperm2i128      m6,                m7, m8, 0x31        ; [row0hi  row2hi] for O
+
+
+    movu            m4,                [r0 + r2]
+    movu            m2,                [r0 + r2 + 32]
+    pshufb          m2,                m15
+    vpermq          m2,                m2, 0x4E
+    psubw           m10,               m4, m2
+    paddw           m4,                m2
+
+    movu            m3,                [r0 + r3]
+    movu            m2,                [r0 + r3 + 32]
+    pshufb          m2,                m15
+    vpermq          m2,                m2, 0x4E
+    psubw           m11,               m3, m2
+    paddw           m3,                m2
+    vperm2i128      m2,                m4, m3, 0x20        ; [row1lo  row3lo] for E
+    vperm2i128      m8,                m4, m3, 0x31        ; [row1hi  row3hi] for E
+    pshufb          m8,                m15
+    psubw           m3,                m2, m8
+    paddw           m2,                m8
+
+    vperm2i128      m4,                m10, m11, 0x20      ; [row1lo  row3lo] for O
+    vperm2i128      m7,                m10, m11, 0x31      ; [row1hi  row3hi] for O
+
+
+    DCT32_PASS_1    0 * 32,            0 * 64, 0, 2
+    DCT32_PASS_1    2 * 32,            2 * 64, 1, 3
+    DCT32_PASS_1    4 * 32,            4 * 64, 0, 2
+    DCT32_PASS_1    6 * 32,            6 * 64, 1, 3
+    DCT32_PASS_1    8 * 32,            8 * 64, 0, 2
+    DCT32_PASS_1    10 * 32,           10 * 64, 1, 3
+    DCT32_PASS_1    12 * 32,           12 * 64, 0, 2
+    DCT32_PASS_1    14 * 32,           14 * 64, 1, 3
+    DCT32_PASS_1    16 * 32,           16 * 64, 0, 2
+    DCT32_PASS_1    18 * 32,           18 * 64, 1, 3
+    DCT32_PASS_1    20 * 32,           20 * 64, 0, 2
+    DCT32_PASS_1    22 * 32,           22 * 64, 1, 3
+    DCT32_PASS_1    24 * 32,           24 * 64, 0, 2
+    DCT32_PASS_1    26 * 32,           26 * 64, 1, 3
+    DCT32_PASS_1    28 * 32,           28 * 64, 0, 2
+    DCT32_PASS_1    30 * 32,           30 * 64, 1, 3
+
+    add             r5,                8
+    lea             r0,                [r0 + r2 * 4]
+
+    dec             r4d
+    jnz             .pass1
+
+    mov             r2d,               64                  ; r2 now holds the 64-byte output row step
+    lea             r3,                [r2 * 3]
+    mov             r5,                rsp
+    mov             r4d,               8
+    vpbroadcastq    m9,                [pd_1024]           ; pass-2 rounding offset
+
+.pass2:
+    mova            m0,                [r5 + 0 * 64]
+    mova            m1,                [r5 + 0 * 64 + 32]
+
+    mova            m2,                [r5 + 1 * 64]
+    mova            m3,                [r5 + 1 * 64 + 32]
+
+    mova            m4,                [r5 + 2 * 64]
+    mova            m5,                [r5 + 2 * 64 + 32]
+
+    mova            m6,                [r5 + 3 * 64]
+    mova            m7,                [r5 + 3 * 64 + 32]
+
+    DCT32_PASS_2    0 * 32
+    movq            [r1],              xm11
+    DCT32_PASS_2    1 * 32
+    movq            [r1 + r2],         xm11
+    DCT32_PASS_2    2 * 32
+    movq            [r1 + r2 * 2],     xm11
+    DCT32_PASS_2    3 * 32
+    movq            [r1 + r3],         xm11
+
+    lea             r6,                [r1 + r2 * 4]
+    DCT32_PASS_2    4 * 32
+    movq            [r6],              xm11
+    DCT32_PASS_2    5 * 32
+    movq            [r6 + r2],         xm11
+    DCT32_PASS_2    6 * 32
+    movq            [r6 + r2 * 2],     xm11
+    DCT32_PASS_2    7 * 32
+    movq            [r6 + r3],         xm11
+
+    lea             r6,                [r6 + r2 * 4]
+    DCT32_PASS_2    8 * 32
+    movq            [r6],              xm11
+    DCT32_PASS_2    9 * 32
+    movq            [r6 + r2],         xm11
+    DCT32_PASS_2    10 * 32
+    movq            [r6 + r2 * 2],     xm11
+    DCT32_PASS_2    11 * 32
+    movq            [r6 + r3],         xm11
+
+    lea             r6,                [r6 + r2 * 4]
+    DCT32_PASS_2    12 * 32
+    movq            [r6],              xm11
+    DCT32_PASS_2    13 * 32
+    movq            [r6 + r2],         xm11
+    DCT32_PASS_2    14 * 32
+    movq            [r6 + r2 * 2],     xm11
+    DCT32_PASS_2    15 * 32
+    movq            [r6 + r3],         xm11
+
+    lea             r6,                [r6 + r2 * 4]
+    DCT32_PASS_2    16 * 32
+    movq            [r6],              xm11
+    DCT32_PASS_2    17 * 32
+    movq            [r6 + r2],         xm11
+    DCT32_PASS_2    18 * 32
+    movq            [r6 + r2 * 2],     xm11
+    DCT32_PASS_2    19 * 32
+    movq            [r6 + r3],         xm11
+
+    lea             r6,                [r6 + r2 * 4]
+    DCT32_PASS_2    20 * 32
+    movq            [r6],              xm11
+    DCT32_PASS_2    21 * 32
+    movq            [r6 + r2],         xm11
+    DCT32_PASS_2    22 * 32
+    movq            [r6 + r2 * 2],     xm11
+    DCT32_PASS_2    23 * 32
+    movq            [r6 + r3],         xm11
+
+    lea             r6,                [r6 + r2 * 4]
+    DCT32_PASS_2    24 * 32
+    movq            [r6],              xm11
+    DCT32_PASS_2    25 * 32
+    movq            [r6 + r2],         xm11
+    DCT32_PASS_2    26 * 32
+    movq            [r6 + r2 * 2],     xm11
+    DCT32_PASS_2    27 * 32
+    movq            [r6 + r3],         xm11
+
+    lea             r6,                [r6 + r2 * 4]
+    DCT32_PASS_2    28 * 32
+    movq            [r6],              xm11
+    DCT32_PASS_2    29 * 32
+    movq            [r6 + r2],         xm11
+    DCT32_PASS_2    30 * 32
+    movq            [r6 + r2 * 2],     xm11
+    DCT32_PASS_2    31 * 32
+    movq            [r6 + r3],         xm11
+
+    add             r5,                256
+    add             r1,                8
+
+    dec             r4d
+    jnz             .pass2
+    RET
+%endmacro
+; Instantiate the 8x8, 16x16 and 32x32 kernels for 8-bit input.
+DCT_CONSTS 8
+VCA_DCT8 8
+VCA_DCT16 8
+VCA_DCT32 8
+; Same three kernels for 10-bit input.
+DCT_CONSTS 10
+VCA_DCT8 10
+VCA_DCT16 10
+VCA_DCT32 10
+; Same three kernels for 12-bit input.
+DCT_CONSTS 12
+VCA_DCT8 12
+VCA_DCT16 12
+VCA_DCT32 12
+
+%endif
diff --git a/libavfilter/x86/vf_vca_init.c b/libavfilter/x86/vf_vca_init.c
new file mode 100644
index 0000000000..d19bfcc34b
--- /dev/null
+++ b/libavfilter/x86/vf_vca_init.c
@@ -0,0 +1,210 @@
+/*
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavcodec/x86/constants.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/vca_dct.h"
+
+/* AVX2 DCT kernels, expected to be provided by x86/vf_vca.asm: one symbol per
+ * block size (8, 16, 32) and sample bit depth (8, 10, 12). Every caller in
+ * this file passes the block width as the stride argument. */
+void ff_dct8_8bit_avx2(const int16_t *src, int16_t *dst, intptr_t stride);
+void ff_dct8_10bit_avx2(const int16_t *src, int16_t *dst, intptr_t stride);
+void ff_dct8_12bit_avx2(const int16_t *src, int16_t *dst, intptr_t stride);
+void ff_dct16_8bit_avx2(const int16_t *src, int16_t *dst, intptr_t stride);
+void ff_dct16_10bit_avx2(const int16_t *src, int16_t *dst, intptr_t stride);
+void ff_dct16_12bit_avx2(const int16_t *src, int16_t *dst, intptr_t stride);
+void ff_dct32_8bit_avx2(const int16_t *src, int16_t *dst, intptr_t stride);
+void ff_dct32_10bit_avx2(const int16_t *src, int16_t *dst, intptr_t stride);
+void ff_dct32_12bit_avx2(const int16_t *src, int16_t *dst, intptr_t stride);
+
+#if HAVE_X86ASM
+
+/*
+ * 8x8 DCT for the AVX2 path: 8-, 10- and 12-bit input go to the matching
+ * assembly kernel with a row stride of 8; any other bit_depth is handed to
+ * ff_vca_dct8_c().
+ */
+static void ff_dct8_avx2(const int16_t *src, int16_t *dst, int bit_depth)
+{
+    if (bit_depth == 8)
+        ff_dct8_8bit_avx2(src, dst, 8);
+    else if (bit_depth == 10)
+        ff_dct8_10bit_avx2(src, dst, 8);
+    else if (bit_depth == 12)
+        ff_dct8_12bit_avx2(src, dst, 8);
+    else
+        ff_vca_dct8_c(src, dst, bit_depth);
+}
+
+/*
+ * 16x16 DCT for the AVX2 path: 8-, 10- and 12-bit input go to the matching
+ * assembly kernel with a row stride of 16; any other bit_depth is handed to
+ * ff_vca_dct16_c().
+ */
+static void ff_dct16_avx2(const int16_t *src, int16_t *dst, int bit_depth)
+{
+    if (bit_depth == 8)
+        ff_dct16_8bit_avx2(src, dst, 16);
+    else if (bit_depth == 10)
+        ff_dct16_10bit_avx2(src, dst, 16);
+    else if (bit_depth == 12)
+        ff_dct16_12bit_avx2(src, dst, 16);
+    else
+        ff_vca_dct16_c(src, dst, bit_depth);
+}
+
+/*
+ * 32x32 DCT for the AVX2 path: 8-, 10- and 12-bit input go to the matching
+ * assembly kernel with a row stride of 32; any other bit_depth is handed to
+ * ff_vca_dct32_c().
+ */
+static void ff_dct32_avx2(const int16_t *src, int16_t *dst, int bit_depth)
+{
+    if (bit_depth == 8)
+        ff_dct32_8bit_avx2(src, dst, 32);
+    else if (bit_depth == 10)
+        ff_dct32_10bit_avx2(src, dst, 32);
+    else if (bit_depth == 12)
+        ff_dct32_12bit_avx2(src, dst, 32);
+    else
+        ff_vca_dct32_c(src, dst, bit_depth);
+}
+
+/*
+ * Low-pass 16x16 DCT. The block is averaged 2x2 down to 8x8, transformed with
+ * the 8x8 dispatcher, and the 64 coefficients are stored in the top-left 8x8
+ * corner of dst (row pitch 16); the remaining 192 entries are zero.
+ * dst[0] is then overwritten with (sum of the 256 input samples) >> 1.
+ *
+ * src: 16x16 samples, row pitch 16.  dst: 256 coefficients.
+ * NOTE(review): the final shift ignores bit_depth -- confirm this is intended
+ * for 10/12-bit input, where the value may not fit in int16_t.
+ */
+static void ff_lowpass_dct16_avx2(const int16_t *src, int16_t *dst,
+                                  int bit_depth)
+{
+    DECLARE_ALIGNED_32(int16_t, coef[8 * 8]);
+    DECLARE_ALIGNED_32(int16_t, avg_block[8 * 8]);
+    int32_t total_sum = 0;
+
+    for (int i = 0; i < 8; i++) {
+        const int16_t *top = src + 2 * i * 16;
+        const int16_t *bot = top + 16;
+        for (int j = 0; j < 8; j++) {
+            /* int, not int16_t: four samples may exceed the int16_t range */
+            int sum = top[2 * j] + top[2 * j + 1] +
+                      bot[2 * j] + bot[2 * j + 1];
+
+            avg_block[i * 8 + j] = sum >> 2;
+            total_sum += sum;
+        }
+    }
+
+    /* ff_dct8_avx2() does the per-bit-depth kernel selection and C fallback */
+    ff_dct8_avx2(avg_block, coef, bit_depth);
+
+    memset(dst, 0, 256 * sizeof(int16_t));
+    for (int i = 0; i < 8; i++)
+        memcpy(&dst[i * 16], &coef[i * 8], 8 * sizeof(int16_t));
+    dst[0] = (int16_t)(total_sum >> 1);
+}
+
+/*
+ * Low-pass 32x32 DCT. The block is averaged 2x2 down to 16x16, transformed
+ * with the 16x16 dispatcher, and the 256 coefficients are stored in the
+ * top-left 16x16 corner of dst (row pitch 32); the other 768 entries are zero.
+ * dst[0] is then overwritten with (sum of the 1024 input samples) >> 3.
+ *
+ * src: 32x32 samples, row pitch 32.  dst: 1024 coefficients.
+ * NOTE(review): the final shift ignores bit_depth -- confirm this is intended
+ * for 10/12-bit input, where the value may not fit in int16_t.
+ */
+static void ff_lowpass_dct32_avx2(const int16_t *src, int16_t *dst,
+                                  int bit_depth)
+{
+    DECLARE_ALIGNED_32(int16_t, coef[16 * 16]);
+    DECLARE_ALIGNED_32(int16_t, avg_block[16 * 16]);
+    int32_t total_sum = 0;
+
+    for (int i = 0; i < 16; i++) {
+        const int16_t *top = src + 2 * i * 32;
+        const int16_t *bot = top + 32;
+        for (int j = 0; j < 16; j++) {
+            /* int, not int16_t: four samples may exceed the int16_t range */
+            int sum = top[2 * j] + top[2 * j + 1] +
+                      bot[2 * j] + bot[2 * j + 1];
+
+            avg_block[i * 16 + j] = sum >> 2;
+            total_sum += sum;
+        }
+    }
+
+    /* ff_dct16_avx2() does the per-bit-depth kernel selection and C fallback */
+    ff_dct16_avx2(avg_block, coef, bit_depth);
+
+    memset(dst, 0, 1024 * sizeof(int16_t));
+    for (int i = 0; i < 16; i++)
+        memcpy(&dst[i * 32], &coef[i * 16], 16 * sizeof(int16_t));
+    dst[0] = (int16_t)(total_sum >> 3);
+}
+#endif /* HAVE_X86ASM */
+
+/*
+ * Install the x86 DCT implementation into v->perform_dct.
+ *
+ * Leaves v untouched and returns 0 when FFmpeg was built without x86 assembly
+ * or the running CPU has no AVX2. Otherwise v->blocksize (8, 16 or 32) and
+ * v->enable_lowpass pick the function; the low-pass 8x8 case has no AVX2
+ * version here and gets ff_vca_lowpass_dct8_c.
+ *
+ * Returns 0 on success, AVERROR_INVALIDDATA for an unsupported block size.
+ *
+ * NOTE(review): the assembly uses registers such as xm11, which exist only in
+ * 64-bit mode -- confirm that a 32-bit x86 build never reaches these symbols.
+ */
+av_cold int ff_vca_dct_init_x86(VCAContext *v)
+{
+#if HAVE_X86ASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (!EXTERNAL_AVX2(cpu_flags))
+        return 0;
+
+    switch (v->blocksize) {
+    case 32:
+        v->perform_dct = v->enable_lowpass ? ff_lowpass_dct32_avx2 : ff_dct32_avx2;
+        break;
+    case 16:
+        v->perform_dct = v->enable_lowpass ? ff_lowpass_dct16_avx2 : ff_dct16_avx2;
+        break;
+    case 8:
+        v->perform_dct = v->enable_lowpass ? ff_vca_lowpass_dct8_c : ff_dct8_avx2;
+        break;
+    default:
+        /* AVERROR_* codes are already negative; wrapping one in AVERROR()
+         * would flip it to a positive value that callers read as success. */
+        return AVERROR_INVALIDDATA;
+    }
+#endif /* HAVE_X86ASM */
+    return 0;
+}
--
2.50.1 (Apple Git-155)

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to