This is an automated email from the ASF dual-hosted git repository.

ssiddiqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit 3a9baf48427c8ad6f51a233feeff03a407175f64
Author: David Kerschbaumer <david.kerschbau...@student.tugraz.at>
AuthorDate: Sat Jan 9 00:08:15 2021 +0100

    [SYSTEMDS-2789] Disguised Missing Values Detection
    Co-authored-by: Patrick Lovric <patrick.lov...@student.tugraz.at>
    Co-authored-by: Valentin Edelsbrunner <v.edelsbrun...@student.tugraz.at>
    
    DIA project WS2020/21.
    Closes #1144.
    
    Date:      Sat Jan 9 00:05:47 2021 +0100
---
 docs/site/builtins-reference.md                    |  38 +++
 scripts/builtin/dmv.dml                            |  29 ++
 .../java/org/apache/sysds/common/Builtins.java     |   1 +
 .../sysds/runtime/matrix/data/FrameBlock.java      |  69 ++++-
 .../org/apache/sysds/runtime/util/DMVUtils.java    | 341 +++++++++++++++++++++
 .../test/functions/builtin/BuiltinDMVTest.java     | 200 ++++++++++++
 .../functions/builtin/disguisedMissingValue.dml    |  24 ++
 7 files changed, 692 insertions(+), 10 deletions(-)

diff --git a/docs/site/builtins-reference.md b/docs/site/builtins-reference.md
index 22c1491..d960b25 100644
--- a/docs/site/builtins-reference.md
+++ b/docs/site/builtins-reference.md
@@ -32,6 +32,7 @@ limitations under the License.
     * [`DBSCAN`-Function](#DBSCAN-function)
     * [`discoverFD`-Function](#discoverFD-function)
     * [`dist`-Function](#dist-function)
+    * [`dmv`-Function](#dmv-function)
     * [`glm`-Function](#glm-function)
     * [`gridSearch`-Function](#gridSearch-function)
     * [`hyperband`-Function](#hyperband-function)
@@ -299,6 +300,43 @@ X = rand (rows = 5, cols = 5)
 Y = dist(X)
 ```
 
+
+
+## `dmv`-Function
+
+The `dmv`-function is used to find disguised missing values utilising syntactical pattern recognition.
+
+### Usage
+
+```r
+dmv(X, threshold, replace)
+```
+
+### Arguments
+
+| Name      | Type          | Default  | Description                                                  |
+| :-------- | :------------ | :------- | :----------------------------------------------------------- |
+| X         | Frame[String] | required | Input Frame                                                  |
+| threshold | Double        | 0.8      | threshold value in interval [0, 1] for dominant pattern per column (e.g., 0.8 means that 80% of the entries per column must adhere to this pattern to be dominant) |
+| replace   | String        | "NA"     | The string disguised missing values are replaced with        |
+
+### Returns
+
+| Type          | Description                                               |
+| :------------ | :-------------------------------------------------------- |
+| Frame[String] | Frame `X` with detected disguised missing values replaced |
+
+### Example
+
+```r
+A = read("fileA", data_type="frame", rows=10, cols=8);
+Z = dmv(X=A)
+Z = dmv(X=A, threshold=0.9)
+Z = dmv(X=A, threshold=0.9, replace="NaN")
+```
+
+
+
 ## `glm`-Function
 
 The `glm`-function  is a flexible generalization of ordinary linear regression 
that allows for response variables that have
diff --git a/scripts/builtin/dmv.dml b/scripts/builtin/dmv.dml
new file mode 100644
index 0000000..af68f1f
--- /dev/null
+++ b/scripts/builtin/dmv.dml
@@ -0,0 +1,29 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#------------------------------------------------------------
+
+s_dmv = function(Frame[String] X, Double threshold=0.8, String replace="NA") 
return (Frame[String] Y) {
+
+  if( threshold < 0 | threshold > 1 )
+    stop("Stopping due to invalid input, threshold required in interval [0, 1] 
found " + threshold)
+
+  Y = map(X, "UtilFunctions.syntacticalPatternDiscovery(" + threshold + "," + 
replace + ")")
+}
+
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java 
b/src/main/java/org/apache/sysds/common/Builtins.java
index bb96100..5bcc85e 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -98,6 +98,7 @@ public enum Builtins {
        DIAG("diag", false),
        DISCOVER_FD("discoverFD", true),
        DIST("dist", true),
+       DMV("dmv", true),
        DROP_INVALID_TYPE("dropInvalidType", false),
        DROP_INVALID_LENGTH("dropInvalidLength", false),
        EIGEN("eigen", false, ReturnType.MULTI_RETURN),
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java 
b/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java
index 0605a86..d157e37 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java
@@ -56,6 +56,7 @@ import org.apache.sysds.runtime.io.IOUtilFunctions;
 import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysds.runtime.transform.encode.EncoderRecode;
 import org.apache.sysds.runtime.util.CommonThreadPool;
+import org.apache.sysds.runtime.util.DMVUtils;
 import org.apache.sysds.runtime.util.IndexRange;
 import org.apache.sysds.runtime.util.UtilFunctions;
 
@@ -64,8 +65,8 @@ public class FrameBlock implements CacheBlock, Externalizable 
 {
        private static final long serialVersionUID = -3993450030207130665L;
        private static final Log LOG = 
LogFactory.getLog(FrameBlock.class.getName());
        private static final IDSequence CLASS_ID = new IDSequence();
-       
-       public static final int BUFFER_SIZE = 1 * 1000 * 1000; //1M elements, 
size of default matrix block 
+
+       public static final int BUFFER_SIZE = 1 * 1000 * 1000; //1M elements, 
size of default matrix block
 
        //internal configuration
        private static final boolean REUSE_RECODE_MAPS = true;
@@ -2101,13 +2102,26 @@ public class FrameBlock implements CacheBlock, 
Externalizable  {
        }
 
        public FrameBlock map(String lambdaExpr) {
+               if(!lambdaExpr.contains("->"))
+               {
+                       //return map(getCompiledFunctionBlock(lambdaExpr));
+                       String args = 
lambdaExpr.substring(lambdaExpr.indexOf('(') + 1, lambdaExpr.indexOf(')'));
+                       if(args.contains(",")) {
+                               String[] arguments = args.split(",");
+                               return 
DMVUtils.syntacticalPatternDiscovery(this, Double.parseDouble(arguments[0]), 
arguments[1]);
+                       }
+               }
                return map(getCompiledFunction(lambdaExpr));
        }
+
+       public FrameBlock map(FrameBlockMapFunction lambdaExpression) {
+               return lambdaExpression.apply();
+       }
        
        public FrameBlock map(FrameMapFunction lambdaExpr) {
                // Prepare temporary output array
                String[][] output = new String[getNumRows()][getNumColumns()];
-               
+
                // Execute map function on all cells
                for(int j=0; j<getNumColumns(); j++) {
                        Array input = getColumn(j);
@@ -2120,16 +2134,20 @@ public class FrameBlock implements CacheBlock, 
Externalizable  {
        }
 
        public static FrameMapFunction getCompiledFunction(String lambdaExpr) {
-               // split lambda expression
+               String varname;
+               String expr;
+
+               String cname = "StringProcessing"+CLASS_ID.getNextID();
+               StringBuilder sb = new StringBuilder();
+
+
                String[] parts = lambdaExpr.split("->");
                if( parts.length != 2 )
                        throw new DMLRuntimeException("Unsupported lambda 
expression: "+lambdaExpr);
-               String varname = parts[0].trim();
-               String expr = parts[1].trim();
-               
+               varname = parts[0].trim();
+               expr = parts[1].trim();
+
                // construct class code
-               String cname = "StringProcessing"+CLASS_ID.getNextID();
-               StringBuilder sb = new StringBuilder();
                sb.append("import 
org.apache.sysds.runtime.util.UtilFunctions;\n");
                sb.append("import 
org.apache.sysds.runtime.matrix.data.FrameBlock.FrameMapFunction;\n");
                sb.append("public class "+cname+" extends FrameMapFunction 
{\n");
@@ -2140,15 +2158,46 @@ public class FrameBlock implements CacheBlock, 
Externalizable  {
                // compile class, and create FrameMapFunction object
                try {
                        return (FrameMapFunction) CodegenUtils
-                               .compileClass(cname, 
sb.toString()).newInstance();
+                                       .compileClass(cname, 
sb.toString()).newInstance();
                }
                catch(InstantiationException | IllegalAccessException e) {
                        throw new DMLRuntimeException("Failed to compile 
FrameMapFunction.", e);
                }
        }
 
+
+       public FrameBlockMapFunction getCompiledFunctionBlock(String 
lambdaExpression) {
+               // split lambda expression
+               String expr;
+
+               String cname = "StringProcessing"+CLASS_ID.getNextID();
+               StringBuilder sb = new StringBuilder();
+
+               expr = lambdaExpression;
+
+               sb.append("import 
org.apache.sysds.runtime.util.UtilFunctions;\n");
+               sb.append("import 
org.apache.sysds.runtime.matrix.data.FrameBlock.FrameBlockMapFunction;\n");
+               sb.append("public class "+cname+" extends FrameBlockMapFunction 
{\n");
+               sb.append("@Override\n");
+               sb.append("public FrameBlock apply() {\n");
+               sb.append("  return "+expr+"; }}\n");
+
+               try {
+                       return (FrameBlockMapFunction) CodegenUtils
+                               .compileClass(cname, 
sb.toString()).newInstance();
+               }
+               catch(InstantiationException | IllegalAccessException e) {
+                       throw new DMLRuntimeException("Failed to compile 
FrameBlockMapFunction.", e);
+               }
+       }
+
        public static abstract class FrameMapFunction implements Serializable {
                private static final long serialVersionUID = 
-8398572153616520873L;
                public abstract String apply(String input);
        }
+
+       public static abstract class FrameBlockMapFunction implements 
Serializable {
+               private static final long serialVersionUID = 
-8398573333616520876L;
+               public abstract FrameBlock apply();
+       }
 }
diff --git a/src/main/java/org/apache/sysds/runtime/util/DMVUtils.java 
b/src/main/java/org/apache/sysds/runtime/util/DMVUtils.java
new file mode 100644
index 0000000..e850d01
--- /dev/null
+++ b/src/main/java/org/apache/sysds/runtime/util/DMVUtils.java
@@ -0,0 +1,341 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.util;
+
+import org.apache.commons.collections.map.HashedMap;
+import org.apache.sysds.runtime.matrix.data.FrameBlock;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
+public class DMVUtils {
+       public static final char DIGIT = 'd';
+       public static final char LOWER = 'l';
+       public static final char UPPER = 'u';
+       public static final char ALPHA = 'a';
+       public static final char SPACE = 's';
+       public static final char DOT = 't';
+       public static final char OTHER = 'y';
+       public static final char ARBITRARY_LEN = '+';
+       public static final char MINUS = '-';
+       public static String DISGUISED_VAL = "";
+
+       public enum LEVEL_ENUM { LEVEL1, LEVEL2, LEVEL3, LEVEL4, LEVEL5, LEVEL6}
+
+       public static FrameBlock syntacticalPatternDiscovery(FrameBlock frame, 
double threshold, String disguised_value) {
+
+               // Preparation
+               DISGUISED_VAL = disguised_value;
+               int numCols = frame.getNumColumns();
+               int numRows = frame.getNumRows();
+               ArrayList<Map<String, Integer>> table_Hist = new 
ArrayList(numCols); // list of every column with values and their frequency
+
+               int idx;
+               for (idx = 0; idx < numCols; idx++) {
+                       Object c = frame.getColumnData(idx);
+                       String[] column = (String[]) c;
+                       String key = "";
+                       for (String attr : column) {
+                               key = (attr.isEmpty()) ? "NULL": attr;
+                               addDistinctValueOrIncrementCounter(table_Hist, 
key, idx);
+                       }
+               }
+
+               // Syntactic Pattern Discovery
+               idx = -1;
+               for (Map<String, Integer> col_Hist : table_Hist) {
+                       idx++;
+                       Map<String, Double> dominant_patterns_ratio = new 
HashedMap();
+                       Map<String, Integer> prev_pattern_hist = col_Hist;
+                       for(LEVEL_ENUM level : LEVEL_ENUM.values()) {
+                               dominant_patterns_ratio.clear();
+                               Map<String, Integer> current_pattern_hist = 
LevelsExecutor(prev_pattern_hist, level);
+                               dominant_patterns_ratio = 
calculatePatternsRatio(current_pattern_hist, numRows);
+                               String dominant_pattern = 
findDominantPattern(dominant_patterns_ratio, threshold);
+                               if(dominant_pattern != null) { //found pattern
+                                       detectDisguisedValues(dominant_pattern, 
frame.getColumnData(idx), idx, frame, level);
+                                       break;
+                               }
+                               prev_pattern_hist = current_pattern_hist;
+                       }
+               }
+               return frame;
+       }
+
+
+       public static Map<String, Double> calculatePatternsRatio(Map<String, 
Integer> patterns_hist, int nr_entries) {
+               Map<String, Double> patterns_ratio_map = new HashedMap();
+               Iterator it = patterns_hist.entrySet().iterator();
+               while(it.hasNext()) {
+                       Map.Entry pair = (Map.Entry) it.next();
+                       String pattern = (String) pair.getKey();
+                       Double nr_occurences = new 
Double((Integer)pair.getValue());
+
+                       double current_ratio = nr_occurences / nr_entries; // 
percentage of current pattern in column
+                       patterns_ratio_map.put(pattern, current_ratio);
+               }
+               return patterns_ratio_map;
+       }
+
+       public static String findDominantPattern(Map<String, Double> 
dominant_patterns, double threshold) {
+
+               Iterator it = dominant_patterns.entrySet().iterator();
+               while(it.hasNext()) {
+                       Map.Entry pair = (Map.Entry) it.next();
+                       String pattern = (String) pair.getKey();
+                       Double pattern_ratio = (Double)pair.getValue();
+
+                       if(pattern_ratio > threshold)
+                               return pattern;
+
+               }
+               return null;
+       }
+
+       private static void 
addDistinctValueOrIncrementCounter(ArrayList<Map<String, Integer>> maps, String 
key, Integer idx) {
+               if (maps.size() == idx) {
+                       HashMap<String, Integer> m = new HashMap<>();
+                       m.put(key, 1);
+                       maps.add(m);
+                       return;
+               }
+
+               if (!(maps.get(idx).containsKey(key))) {
+                       maps.get(idx).put(key, 1);
+               } else {
+                       maps.get(idx).compute(key, (k, v) -> v + 1);
+               }
+       }
+
+       private static void addDistinctValueOrIncrementCounter(Map<String, 
Integer> map, String encoded_value, Integer nr_occurrences) {
+               if (!(map.containsKey(encoded_value))) {
+                       map.put(encoded_value, nr_occurrences);
+               } else {
+                       map.compute(encoded_value, (k, v) -> v + 
nr_occurrences);
+               }
+       }
+
+       public static Map<String, Integer> LevelsExecutor(Map<String, Integer> 
old_pattern_hist, LEVEL_ENUM level) {
+               Map<String, Integer> new_pattern_hist = new HashedMap();
+               Iterator it = old_pattern_hist.entrySet().iterator();
+               while (it.hasNext()) {
+                       Map.Entry pair = (Map.Entry) it.next();
+                       String pattern = (String) pair.getKey();
+                       Integer nr_of_occurrences = (Integer)pair.getValue();
+
+                       String new_pattern;
+                       switch(level) {
+                               case LEVEL1: // default encoding
+                                       new_pattern = encodeRawString(pattern);
+                                       break;
+                               case LEVEL2: // ignores the number of 
occurrences. It replaces all numbers with '+'
+                                       new_pattern = removeNumbers(pattern);
+                                       break;
+                               case LEVEL3: // ignores upper and lowercase 
characters. It replaces all 'u' and 'l' with 'a' = Alphabet
+                                       new_pattern = 
removeUpperLowerCase(pattern);
+                                       break;
+                               case LEVEL4: // changes floats to digits
+                                       new_pattern = 
removeInnerCharacterInPattern(pattern, DIGIT, DOT);
+                                       break;
+                               case LEVEL5: // removes spaces between strings
+                                       new_pattern = 
removeInnerCharacterInPattern(pattern, ALPHA, SPACE);
+                                       break;
+                               case LEVEL6: // changes negative numbers to 
digits
+                                       new_pattern = 
acceptNegativeNumbersAsDigits(pattern);
+                                       break;
+                               default:
+                                       new_pattern = "";
+                                       break;
+                       }
+                       addDistinctValueOrIncrementCounter(new_pattern_hist, 
new_pattern, nr_of_occurrences);
+               }
+
+               return new_pattern_hist;
+       }
+
+       public static String acceptNegativeNumbersAsDigits(String pattern) {
+               char[] chars = pattern.toCharArray();
+               StringBuilder tmp = new StringBuilder();
+               boolean currently_minus_digit = false;
+               for (char ch : chars) {
+                       if(ch == MINUS && !currently_minus_digit) {
+                               currently_minus_digit = true;
+                       }
+                       else if(ch == DIGIT && currently_minus_digit) {
+                               tmp.append(ch);
+                               currently_minus_digit = false;
+                       }
+                       else if(currently_minus_digit) {
+                               tmp.append(MINUS);
+                               tmp.append(ch);
+                               currently_minus_digit = false;
+                       }
+                       else {
+                               tmp.append(ch);
+                       }
+               }
+               return tmp.toString();
+       }
+
+       public static String removeInnerCharacterInPattern(String pattern, char 
outter_char, char inner_char) {
+               char[] chars = pattern.toCharArray();
+               StringBuilder tmp = new StringBuilder();
+               boolean currently_digit = false;
+               for (char ch : chars) {
+                       if(ch == outter_char && !currently_digit) {
+                               currently_digit = true;
+                               tmp.append(ch);
+                       }
+                       else if(currently_digit && (ch == outter_char || ch == 
inner_char))
+                               continue;
+                       else if(ch != inner_char && ch != ARBITRARY_LEN) {
+                               currently_digit = false;
+                               tmp.append(ch);
+                       }
+                       else {
+                               if(tmp.length() > 0 && tmp.charAt(tmp.length() 
- 1) != ARBITRARY_LEN)
+                                       tmp.append(ch);
+                       }
+               }
+               return tmp.toString();
+       }
+
+
+       public static String removeUpperLowerCase(String pattern) {
+               char[] chars = pattern.toCharArray();
+               StringBuilder tmp = new StringBuilder();
+               boolean currently_alphabetic = false;
+               for (char ch : chars) {
+                       if(ch == UPPER || ch == LOWER) {
+                               if(!currently_alphabetic) {
+                                       currently_alphabetic = true;
+                                       tmp.append(ALPHA);
+                               }
+                       }
+                       else if(ch == ARBITRARY_LEN) {
+                               if(tmp.charAt(tmp.length() - 1) != 
ARBITRARY_LEN)
+                                       tmp.append(ch);
+                       }
+                       else {
+                               tmp.append(ch);
+                               currently_alphabetic = false;
+                       }
+               }
+               return tmp.toString();
+       }
+
+       private static String removeNumbers(String pattern) {
+               char[] chars = pattern.toCharArray();
+               StringBuilder tmp = new StringBuilder();
+               for (char ch : chars) {
+                       if(Character.isDigit(ch))
+                               tmp.append(ARBITRARY_LEN);
+                       else
+                               tmp.append(ch);
+               }
+               return tmp.toString();
+       }
+
+       public static String encodeRawString(String input) {
+               char[] chars = input.toCharArray();
+
+               StringBuilder tmp = new StringBuilder();
+               for (char ch : chars) {
+                       tmp.append(getCharClass(ch));
+               }
+               return getFrequencyOfEachConsecutiveChar(tmp.toString());
+       }
+
+       private static char getCharClass(char c) {
+               if (Character.isDigit(c)) return DIGIT;
+               if (Character.isLowerCase(c)) return LOWER;
+               if (Character.isUpperCase(c)) return UPPER;
+               if (Character.isSpaceChar(c)) return SPACE;
+               if (c == '.') return DOT;
+               if(c == '-') return MINUS;
+               return OTHER;
+       }
+
+       public static String getFrequencyOfEachConsecutiveChar(String s) {
+               StringBuilder retval = new StringBuilder();
+               for (int i = 0; i < s.length(); i++) {
+                       int count = 1;
+                       while (i + 1 < s.length() && s.charAt(i) == s.charAt(i 
+ 1)) {
+                               i++;
+                               count++;
+                       }
+                       retval.append(s.charAt(i));
+                       retval.append(count);
+               }
+               return retval.toString();
+       }
+
+       private static void detectDisguisedValues(String dom_pattern, Object 
col, int col_idx,
+               FrameBlock frameBlock, LEVEL_ENUM level)
+       {
+               int row_idx = -1;
+               String pattern = "";
+               String[] column = (String[]) col;
+               for (String attr : column) {
+                       switch (level){
+                               case LEVEL1:
+                                       pattern = encodeRawString(attr);
+                                       break;
+                               case LEVEL2:
+                                       pattern = encodeRawString(attr);
+                                       pattern = removeNumbers(pattern);
+                                       break;
+                               case LEVEL3:
+                                       pattern = encodeRawString(attr);
+                                       pattern = removeNumbers(pattern);
+                                       pattern = removeUpperLowerCase(pattern);
+                                       break;
+                               case LEVEL4:
+                                       pattern = encodeRawString(attr);
+                                       pattern = removeNumbers(pattern);
+                                       pattern = removeUpperLowerCase(pattern);
+                                       pattern = 
removeInnerCharacterInPattern(pattern, DIGIT, DOT);
+                                       break;
+                               case LEVEL5:
+                                       pattern = encodeRawString(attr);
+                                       pattern = removeNumbers(pattern);
+                                       pattern = removeUpperLowerCase(pattern);
+                                       pattern = 
removeInnerCharacterInPattern(pattern, DIGIT, DOT);
+                                       pattern = 
removeInnerCharacterInPattern(pattern, ALPHA, SPACE);
+                                       break;
+                               case LEVEL6:
+                                       pattern = encodeRawString(attr);
+                                       pattern = removeNumbers(pattern);
+                                       pattern = removeUpperLowerCase(pattern);
+                                       pattern = 
removeInnerCharacterInPattern(pattern, DIGIT, DOT);
+                                       pattern = 
removeInnerCharacterInPattern(pattern, ALPHA, SPACE);
+                                       pattern = 
acceptNegativeNumbersAsDigits(pattern);
+                               default:
+                                       //System.out.println("Could not find 
suitable level");
+                       }
+                       row_idx++;
+                       if(pattern.equals(dom_pattern)) continue;
+//                     System.out.println("[" + level +"] Disguised value: " + 
frameBlock.get(row_idx, col_idx) + " (c=" + col_idx + ",r=" + row_idx + ")");
+                       frameBlock.set(row_idx, col_idx, DISGUISED_VAL);
+               }
+       }
+}
diff --git 
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinDMVTest.java 
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinDMVTest.java
new file mode 100644
index 0000000..10f4143
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinDMVTest.java
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.sysds.common.Types;
+import org.apache.sysds.runtime.io.FrameWriterFactory;
+import org.apache.sysds.runtime.matrix.data.FrameBlock;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.apache.sysds.lops.LopProperties.ExecType;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+
+/**
+ * Tests for the {@code dmv} builtin (disguised missing value detection), driven through
+ * the {@code disguisedMissingValue.dml} test script.
+ *
+ * <p>Each test builds a frame of string columns, plants known disguised values at random
+ * row positions, runs the script and checks that every planted cell was replaced.
+ */
+public class BuiltinDMVTest extends AutomatedTestBase {
+
+    private final static String TEST_NAME = "disguisedMissingValue";
+    private final static String TEST_DIR = "functions/builtin/";
+    // Derived from this class. It was previously derived from BuiltinOutlierTest, which made
+    // init()/cleanUp() below clear the data directory of a different test.
+    private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinDMVTest.class.getSimpleName() + "/";
+
+    /** Clears this test class' data directory before any test runs. */
+    @BeforeClass
+    public static void init() {
+        TestUtils.clearDirectory(TEST_DATA_DIR + TEST_CLASS_DIR);
+    }
+
+    /** Clears this test class' data directory after all tests, if test caching is enabled. */
+    @AfterClass
+    public static void cleanUp() {
+        if (TEST_CACHE_ENABLED) {
+            TestUtils.clearDirectory(TEST_DATA_DIR + TEST_CLASS_DIR);
+        }
+    }
+
+    @Override
+    public void setUp() {
+        TestUtils.clearAssertionInformation();
+        // "O" is the output name the script is invoked with and read back from in
+        // runMissingValueTest; the configuration previously registered an unused "B".
+        addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{"O"}));
+        if (TEST_CACHE_ENABLED) {
+            setOutAndExpectedDeletionDisabled(true);
+        }
+    }
+
+    /** Random 1000x4 string frame with four disguised values planted per column. */
+    @Test
+    public void NormalStringFrameTest() {
+        FrameBlock f = generateRandomFrameBlock(1000, 4, null);
+        String[] disguised_values = new String[]{"?", "9999", "?", "9999"};
+        List<List<Integer>> positions = getDisguisedPositions(f, 4, disguised_values);
+        runMissingValueTest(f, ExecType.CP, 0.8, "DMV", positions);
+    }
+
+    /** Hand-written 7x5 frame; one differently-shaped value is planted in every column. */
+    @Test
+    public void PreDefinedStringsFrameTest() {
+        // The trailing comment names the disguised value planted into that column.
+        String[] testarray0 = new String[]{"77","77","55","89","43", "99", "46"}; // detect Weg
+        String[] testarray1 = new String[]{"8010","9999","8456","4565","89655", "86542", "45624"}; // detect ?
+        String[] testarray2 = new String[]{"David K","Valentin E","Patrick L","VEVE","DK", "VE", "PL"}; // detect 45
+        String[] testarray3 = new String[]{"3.42","45","0.456",".45","4589.245", "97", "33"}; // detect ka
+        String[] testarray4 = new String[]{"99","123","158","146","158", "174", "201"}; // detect 9999
+
+        String[][] teststrings = new String[][]{testarray0, testarray1, testarray2, testarray3, testarray4};
+        FrameBlock f = generateRandomFrameBlock(7, 5, teststrings);
+        String[] disguised_values = new String[]{"Patrick-Lovric-Weg-666", "?", "45", "ka", "9999"};
+        List<List<Integer>> positions = getDisguisedPositions(f, 1, disguised_values);
+        runMissingValueTest(f, ExecType.CP, 0.7, "NA", positions);
+    }
+
+    /** Single column of 10000 random doubles (as strings) with ten integer-shaped values planted. */
+    @Test
+    public void PreDefinedDoubleFrame() {
+        String[] test_string = new String[10000];
+        for(int i = 0; i < test_string.length; i++) {
+            test_string[i] = String.valueOf(TestUtils.getPositiveRandomDouble());
+        }
+
+        String[][] teststrings = new String[][]{test_string};
+        FrameBlock f = generateRandomFrameBlock(test_string.length, 1, teststrings);
+        String[] disguised_values = new String[]{"9999999999"};
+        List<List<Integer>> positions = getDisguisedPositions(f, 10, disguised_values);
+        runMissingValueTest(f, ExecType.CP, 0.6, "-1", positions);
+    }
+
+    /**
+     * Writes the frame as CSV, runs the DML script on it and verifies the planted cells.
+     *
+     * @param test_frame  input frame, already containing the planted disguised values
+     * @param et          execution type passed to {@code setExecMode}
+     * @param threshold   value passed to the script as {@code threshold}
+     * @param replacement value passed to the script as {@code replacement}
+     * @param positions   per column, the row indexes whose cells must have been replaced
+     */
+    private void runMissingValueTest(FrameBlock test_frame, ExecType et, Double threshold, String replacement,
+        List<List<Integer>> positions)
+    {
+        Types.ExecMode platformOld = setExecMode(et);
+
+        try {
+            getAndLoadTestConfiguration(TEST_NAME);
+
+            String HOME = SCRIPT_DIR + TEST_DIR;
+            fullDMLScriptName = HOME + TEST_NAME + ".dml";
+            programArgs = new String[] {"-nvargs", "F=" + input("F"), "O=" + output("O"),
+                "threshold=" + threshold, "replacement=" + replacement};
+
+            FrameWriterFactory.createFrameWriter(Types.FileFormat.CSV)
+                .writeFrameToHDFS(test_frame, input("F"), test_frame.getNumRows(), test_frame.getNumColumns());
+
+            runTest(true, false, null, -1);
+
+            FrameBlock outputFrame = readDMLFrameFromHDFS("O", Types.FileFormat.CSV);
+
+            // With replacement "NA" the replaced cell is expected to be null once the
+            // frame has been read back; otherwise it must equal the replacement string.
+            String expected = "NA".equals(replacement) ? null : replacement;
+            for(int col = 0; col < positions.size(); col++) {
+                String[] output = (String[]) outputFrame.getColumnData(col);
+                for(int row : positions.get(col)) {
+                    TestUtils.compareScalars(expected, output[row]);
+                }
+            }
+        }
+        catch (Exception ex) {
+            throw new RuntimeException(ex);
+        }
+        finally {
+            resetExecMode(platformOld);
+        }
+    }
+
+    /**
+     * Builds an all-STRING frame of the given size.
+     *
+     * @param rows            number of rows
+     * @param cols            number of columns
+     * @param defined_strings column-major cell values ({@code [col][row]}), or {@code null}
+     *                        to get a randomly generated frame from {@code TestUtils}
+     * @return the populated frame
+     */
+    private FrameBlock generateRandomFrameBlock(int rows, int cols, String[][] defined_strings)
+    {
+        Types.ValueType[] schema = new Types.ValueType[cols];
+        for(int i = 0; i < cols; i++) {
+            schema[i] = Types.ValueType.STRING;
+        }
+
+        if(defined_strings == null) {
+            return TestUtils.generateRandomFrameBlock(rows, cols, schema, TestUtils.getPositiveRandomInt());
+        }
+
+        // Columns are simply named after their value type.
+        String[] names = new String[cols];
+        for(int i = 0; i < cols; i++) {
+            names[i] = schema[i].toString();
+        }
+        FrameBlock frameBlock = new FrameBlock(schema, names);
+        frameBlock.ensureAllocatedColumns(rows);
+        for(int row = 0; row < rows; row++) {
+            for(int col = 0; col < cols; col++) {
+                frameBlock.set(row, col, defined_strings[col][row]);
+            }
+        }
+        return frameBlock;
+    }
+
+    /**
+     * Plants disguised values into randomly chosen, distinct rows of every column.
+     *
+     * @param frame          frame to modify in place
+     * @param amountValues   number of cells to overwrite per column (capped at the row count)
+     * @param disguisedValue one value per column, or a single value used for all columns;
+     *                       an empty array only selects positions without writing
+     * @return per column, the row indexes that were selected
+     * @throws IllegalArgumentException if more than one but fewer values than columns are given
+     */
+    private List<List<Integer>> getDisguisedPositions(FrameBlock frame, int amountValues, String[] disguisedValue)
+    {
+        final int rows = frame.getNumRows();
+        final int cols = frame.getNumColumns();
+        // Fail before touching the frame instead of part-way through with an index error.
+        if(disguisedValue.length > 1 && disguisedValue.length < cols) {
+            throw new IllegalArgumentException("Expected 1 or at least " + cols
+                + " disguised values but got " + disguisedValue.length);
+        }
+
+        final int perColumn = Math.min(rows, amountValues);
+        List<List<Integer>> positions = new ArrayList<>();
+        for(int col = 0; col < cols; col++)
+        {
+            List<Integer> chosenRows = new ArrayList<>();
+            while(chosenRows.size() < perColumn)
+            {
+                // floorMod keeps the index in [0, rows) even for a negative draw.
+                int position = Math.floorMod(TestUtils.getPositiveRandomInt(), rows);
+                // Linear probing: cannot overflow int (the old "position + random + 5" could,
+                // yielding a negative row) and always terminates because fewer than
+                // 'rows' positions are taken at this point.
+                while(chosenRows.contains(position)) {
+                    position = (position + 1) % rows;
+                }
+                chosenRows.add(position);
+
+                if(disguisedValue.length > 1) {
+                    frame.set(position, col, disguisedValue[col]);
+                }
+                else if(disguisedValue.length == 1) {
+                    frame.set(position, col, disguisedValue[0]);
+                }
+            }
+            positions.add(chosenRows);
+        }
+
+        return positions;
+    }
+
+}
diff --git a/src/test/scripts/functions/builtin/disguisedMissingValue.dml b/src/test/scripts/functions/builtin/disguisedMissingValue.dml
new file mode 100644
index 0000000..6d45fb7
--- /dev/null
+++ b/src/test/scripts/functions/builtin/disguisedMissingValue.dml
@@ -0,0 +1,24 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# Read the input frame, replace detected disguised missing values, write the result.
+data = read($F, data_type="frame", format="csv", header=FALSE)
+result = dmv(X=data, threshold=$threshold, replace=$replacement)
+write(result, $O, format="csv")

Reply via email to