This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new b4e0623 [SYSTEMDS-3062] New matrixProfile builtin (anomaly detection)
b4e0623 is described below
commit b4e06233b473d843566128888c75760ad12460af
Author: Alexander Ertl <[email protected]>
AuthorDate: Fri Jul 16 14:16:31 2021 +0200
[SYSTEMDS-3062] New matrixProfile builtin (anomaly detection)
AMLS project SS2021.
Closes #1348.
---
docs/site/builtins-reference.md | 29 +++-
scripts/builtin/matrixProfile.dml | 149 +++++++++++++++++++++
.../java/org/apache/sysds/common/Builtins.java | 1 +
.../builtin/BuiltinMatrixProfileTest.java | 141 +++++++++++++++++++
.../scripts/functions/builtin/matrix_profile.dml | 25 ++++
5 files changed, 344 insertions(+), 1 deletion(-)
diff --git a/docs/site/builtins-reference.md b/docs/site/builtins-reference.md
index 1cbf660..c4b5a9b 100644
--- a/docs/site/builtins-reference.md
+++ b/docs/site/builtins-reference.md
@@ -56,6 +56,7 @@ limitations under the License.
* [`lmCG`-Function](#lmcg-function)
* [`lmDS`-Function](#lmds-function)
* [`lmPredict`-Function](#lmPredict-function)
+ * [`matrixProfile`-Function](#matrixProfile-function)
* [`mdedup`-Function](#mdedup-function)
* [`mice`-Function](#mice-function)
* [`msvm`-Function](#msvm-function)
@@ -197,7 +198,7 @@ y = toOneHot(X, numClasses)
## `correctTypos`-Function
-The `correctTypos` - function tries to correct typos in a given frame. This
algorithm operates on the assumption that most strings are correct and simply
swaps strings that do not occur often with similar strings that occur more
often. If correct is set to FALSE only prints suggested corrections without
effecting the frame.
+The `correctTypos` - function tries to correct typos in a given frame. This
algorithm operates on the assumption that most strings are correct and simply
swaps strings that do not occur often with similar strings that occur more
often. If correct is set to FALSE only prints suggested corrections without
affecting the frame.
### Usage
@@ -1258,6 +1259,32 @@ yp = lmPredict(X = X, B = w, ytest=matrix(0,1,1))
```
+## `matrixProfile`-Function
+
+The `matrixProfile`-function implements the SCRIMP algorithm for efficient
time-series analysis.
+
+### Usage
+```r
+matrixProfile(ts, window_size, sample_percent, is_verbose)
+```
+
+### Arguments
+| Name | Type | Default | Description |
+| :------ | :------------- | -------- | :---------- |
+| ts | Matrix | --- | Input Frame X |
+| window_size | Integer | 4 | Sliding window size |
+| sample_percent| Double | 1.0 | Degree of approximation
between zero and one (1 computes the exact solution) |
+| verbose | Boolean | False | Print debug information |
+
+### Returns
+
+| Type | Default | Description |
+| :-------------- | -------- | :---------- |
+| Matrix[Double] | --- | The computed matrix profile distances |
+| Matrix[Integer] | --- | Indices of least distances |
+
+
+
## `mdedup`-Function
The `mdedup`-function implements builtin for deduplication using matching
dependencies
diff --git a/scripts/builtin/matrixProfile.dml
b/scripts/builtin/matrixProfile.dml
new file mode 100644
index 0000000..e6fcbff
--- /dev/null
+++ b/scripts/builtin/matrixProfile.dml
@@ -0,0 +1,149 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# ----------------------------------------------------------------------------
+# References:
+# Yan Zhu et al.. 2018.
+# Matrix Profile XI: SCRIMP++: Time Series Motif Discovery at Interactive
Speeds.
+# 2018 IEEE International Conference on Data Mining (ICDM), 2018, pp.
837-846.
+# DOI: 10.1109/ICDM.2018.00099.
+# https://www.cs.ucr.edu/~eamonn/SCRIMP_ICDM_camera_ready_updated.pdf
+# ----------------------------------------------------------------------------
+
+# Builtin function that computes the MatrixProfile of a time series efficiently
+# using the SCRIMP++ algorithm.
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------
+# ts Matrix --- Time series to profile
+# window_size Integer 4 Sliding window size
+# sample_percent Double 1.0 Degree of approximation
+# between zero and one (1
+# computes the exact
solution)
+# is_verbose Boolean False Print debug information
+#
+#
+# RETURN VALUES
+# ----------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# profile Matrix --- The computed matrix
profile
+# profile_index Matrix --- Indices of least
distances
+
+
+m_matrixProfile = function(Matrix[Double] ts, Integer window_size=4, Double
sample_percent=1.0, Boolean is_verbose=FALSE)
+ return(Matrix[Double] profile, Matrix[Double] profile_index)
+{
+ if (is_verbose)
+ print ("##############################\n# MATRIXPROFILE SCRIPT ENTRY
#\n##############################");
+
+ # TODO: preSCRIMP
+ # requires a similarity search algorithm e.g.: MASS (Mueen's Algorithm for
Similarity Search)
+
+ n = length(ts);
+ [mu,sig] = moving_avg(ts, n, window_size);
+ if (is_verbose) {
+ print_ts(ts);
+ print_ts(mu);
+ print_ts(sig);
+ }
+
+ # initialize
+ profile_len = n-window_size+1;
+ profile = matrix(Inf, cols=1, rows=profile_len);
+ profile_index = matrix(1, cols=1, rows=profile_len);
+
+ # random permutation
+ exclusion_zone = as.integer(ceil(window_size/4)) + 1;
+ sample_size = profile_len-exclusion_zone;
+ if (sample_percent < 1.0 & sample_percent >= 0.0) {
+ sample_size = ceil(sample_size*sample_percent);
+ }
+ s = sample(sample_size, sample_size, FALSE);
+ s = s + exclusion_zone;
+
+ if (is_verbose) {
+ print("n: " + n);
+ print("window_size: " + window_size);
+ print("profile_len: " + profile_len);
+ print("exclusion_zone: " + exclusion_zone);
+ print("sample_size: " + sample_size);
+ }
+ k_idx = 1;
+ while (k_idx <= sample_size) {
+ k = as.scalar(s[k_idx]);
+ k_idx += 1;
+ q = 0;
+ for (i in 1:n-window_size+2-k) {
+ if (i==1)
+ q = as.scalar(t(ts[1:window_size]) %*% ts[k:k+window_size-1]);
+ else
+ q = as.scalar(q - ts[i-1]%*%ts[i+k-2] +
ts[i+window_size-1]%*%ts[i+k+window_size-2]);
+ d = sqrt(2*window_size*(1-(q - window_size*as.scalar(mu[i]*mu[i+k-1])) /
(window_size*as.scalar(sig[i]*sig[i+k-1]))));
+
+ if (d < as.scalar(profile[i])) {
+ profile[i] = d;
+ profile_index[i] = as.matrix(i+k-1);
+ }
+ if (d < as.scalar(profile[i+k-1])) {
+ profile[i+k-1] = d;
+ profile_index[i+k-1] = i;
+ }
+ }
+ }
+
+ print_ts(profile);
+ print_ts(profile_index);
+}
+
+moving_avg = function(Matrix[Double] array, Integer n, Integer window_size)
+ return(Matrix[Double] mu, Matrix[Double] sig)
+{
+ profile_len = n - window_size + 1;
+ cum_sum = matrix(0, cols=1, rows=n);
+ sq_cum_sum = matrix(0, cols=1, rows=n);
+ sums = matrix(0, cols=1, rows=profile_len);
+ sq_sums = matrix(0, cols=1, rows=profile_len);
+ mu = matrix(0, cols=1, rows=profile_len);
+ sig_sq = matrix(0, cols=1, rows=profile_len);
+ sig = matrix(0, cols=1, rows=profile_len);
+
+ cum_sum = cumsum(array);
+ sq_cum_sum = cumsum(array*array);
+
+ sums[1] = cum_sum[window_size];
+ sq_sums[1] = sq_cum_sum[window_size];
+ for (i in 1:n-window_size) {
+ sums[i+1] = cum_sum[window_size + i] - cum_sum[i];
+ sq_sums[i+1] = sq_cum_sum[window_size + i] - sq_cum_sum[i];
+ }
+
+ for (i in 1:profile_len) {
+ mu[i] = sums[i] / window_size;
+ sig_sq[i] = sq_sums[i] / window_size - mu[i] * mu[i];
+ sig[i] = max(sqrt(sig_sq[i]), 0);
+ }
+}
+
+print_ts = function(Matrix[Double] ts) {
+ print(toString(t(ts)));
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java
b/src/main/java/org/apache/sysds/common/Builtins.java
index e8a858f..548a031 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -177,6 +177,7 @@ public enum Builtins {
LSTM_BACKWARD("lstm_backward", false, ReturnType.MULTI_RETURN),
LU("lu", false, ReturnType.MULTI_RETURN),
MAP("map", false),
+ MATRIXPROFILE("matrixProfile", true),
MAX("max", "pmax", false),
MAX_POOL("max_pool", false),
MAX_POOL_BACKWARD("max_pool_backward", false),
diff --git
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinMatrixProfileTest.java
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinMatrixProfileTest.java
new file mode 100644
index 0000000..f0c227f
--- /dev/null
+++
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinMatrixProfileTest.java
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin;
+
+import org.apache.sysds.common.Types.ExecMode;
+import org.apache.sysds.common.Types.ExecType;
+import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.lang.Math;
+import java.util.Random;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map.Entry;
+
+public class BuiltinMatrixProfileTest extends AutomatedTestBase
+{
+ private final static String TEST_NAME = "matrix_profile";
+ private final static String TEST_DIR = "functions/builtin/";
+ private static final String TEST_CLASS_DIR = TEST_DIR +
BuiltinMatrixProfileTest.class.getSimpleName() + "/";
+
+ private static Random generator;
+ private final static int seed = 42;
+
+ @Override
+ public void setUp() {
+ addTestConfiguration(TEST_NAME,new
TestConfiguration(TEST_CLASS_DIR, TEST_NAME,new String[]{"B"}));
+ }
+
+ @Test
+ public void testMatrixProfileCP() throws IOException {
+ runMatrixProfileTest(4, 1.0, "TRUE", ExecType.CP);
+ }
+
+ @Test
+ public void testMatrixProfileApproxCP() throws IOException {
+ runMatrixProfileTest(4, 0.6, "TRUE", ExecType.CP);
+ }
+
+// @Test
+// public void testMatrixProfileSPARK() throws IOException {
+// runMatrixProfileTest(4, 1.0, "FALSE", ExecType.SPARK);
+// }
+
+
+ private void runMatrixProfileTest(Integer window_size, Double
sample_percent, String is_verbose, ExecType instType) throws IOException
+ {
+ ExecMode platformOld = setExecMode(instType);
+
+ try
+ {
+ loadTestConfiguration(getTestConfiguration(TEST_NAME));
+
+ String HOME = SCRIPT_DIR + TEST_DIR;
+ fullDMLScriptName = HOME + TEST_NAME + ".dml";
+ programArgs = new String[]{
+ "-nvargs", "TS=" + input("TS"), "MP=" +
output("MP"),
+ "MPI=" + output("MPI"),
+ "window_size=" + window_size,
+ "sample_percent=" + sample_percent,
+ "is_verbose=" + is_verbose};
+
+ generator = new Random(seed);
+
+ int len = 100;
+ double[][] ts = genSineWave(len, 0.05, 1, 1);
+ int[] noise_idxs = addNoise(1, len, ts);
+ writeInputMatrixWithMTD("TS", ts, false);
+
+ runTest(true, false, null, -1);
+
+ HashMap<CellIndex,Double> MP =
readDMLMatrixFromOutputDir("MP");
+ @SuppressWarnings("unused") //TODO
+ HashMap<CellIndex,Double> MPI =
readDMLMatrixFromOutputDir("MPI");
+
+ List<Entry<CellIndex,Double>> sortedList =
sortByValues(MP);
+ Entry<CellIndex,Double> entry = sortedList.get(0);
+ int highest_dist_idx = entry.getKey().row;
+ int noise_idx = noise_idxs[0];
+
+ System.out.println("Detected anomaly around idx " +
highest_dist_idx);
+ System.out.println("Noise idx: " + noise_idx);
+
Assert.assertTrue(highest_dist_idx>noise_idx-window_size &&
highest_dist_idx<noise_idx+window_size);
+ }
+ finally {
+ rtplatform = platformOld;
+ }
+ }
+
+ private static double[][] genSineWave(Integer n, double sampling_rate,
float p, float amp) {
+ double[][] ts = new double[n][1];
+ for (int i=0; i<n; ++i) {
+ ts[i][0] = p*Math.sin(amp*sampling_rate*i);
+ }
+ return ts;
+ }
+
+ private static int[] addNoise(Integer n, Integer len, double[][] ts) {
+ int[] idxs = new int[n];
+ for (int i=0; i<n; ++i) {
+ int idx = generator.nextInt(len);
+ ts[idx][0] += 1;
+ idxs[i] = idx;
+ }
+ return idxs;
+ }
+
+ private static List<Entry<CellIndex,Double>>
sortByValues(HashMap<CellIndex,Double> map) {
+ List<Entry<CellIndex,Double>> list = new
LinkedList<>(map.entrySet());
+ Collections.sort(list, new
Comparator<Entry<CellIndex,Double>>() {
+ public int compare(Entry<CellIndex,Double> o1,
Entry<CellIndex,Double> o2) {
+ return o2.getValue().compareTo(o1.getValue());
+ }
+ });
+ return list;
+ }
+}
diff --git a/src/test/scripts/functions/builtin/matrix_profile.dml
b/src/test/scripts/functions/builtin/matrix_profile.dml
new file mode 100644
index 0000000..8195d04
--- /dev/null
+++ b/src/test/scripts/functions/builtin/matrix_profile.dml
@@ -0,0 +1,25 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+timeSeries = read($TS)
+[MP,MPI] = matrixProfile(timeSeries, $window_size, $sample_percent,
$is_verbose)
+write(MP, $MP)
+write(MPI, $MPI)