This is an automated email from the ASF dual-hosted git repository.

ssiddiqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new 2f3e381  [SYSTEMDS-2902] Minor built-ins for cleaning pipelines
2f3e381 is described below

commit 2f3e3816635a0248f18dde6dea8594b8e27ca2ed
Author: Shafaq Siddiqi <[email protected]>
AuthorDate: Wed Mar 17 21:44:26 2021 +0100

    [SYSTEMDS-2902] Minor built-ins for cleaning pipelines
---
 scripts/builtin/imputeByMode.dml                   | 60 +++++++++++++++
 scripts/builtin/splitBalanced.dml                  | 89 ++++++++++++++++++++++
 .../java/org/apache/sysds/common/Builtins.java     |  2 +
 .../builtin/BuiltinSplitBalancedTest.java          | 84 ++++++++++++++++++++
 .../scripts/functions/builtin/splitBalanced.dml    | 36 +++++++++
 5 files changed, 271 insertions(+)

diff --git a/scripts/builtin/imputeByMode.dml b/scripts/builtin/imputeByMode.dml
new file mode 100644
index 0000000..0d55de5
--- /dev/null
+++ b/scripts/builtin/imputeByMode.dml
@@ -0,0 +1,60 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Related to [SYSTEMDS-2902] dependency function for cleaning pipelines
+
+# Impute missing values (NaN) in each column of X with that column's mode
+
+# INPUT PARAMETERS:
+# 
---------------------------------------------------------------------------------------------
+# NAME            TYPE    DEFAULT     MEANING
+# 
---------------------------------------------------------------------------------------------
+# X               Double    ---        Data Matrix (Recoded Matrix for 
categorical features)
+# 
---------------------------------------------------------------------------------------------
+ 
+
+#Output(s)
+# 
---------------------------------------------------------------------------------------------
+# NAME                  TYPE    DEFAULT     MEANING
+# 
---------------------------------------------------------------------------------------------
+# X               Double   ---        imputed dataset
+
+
+# Replaces every NaN in X with the mode of its column and returns the result.
+# Only entries >= 1 take part in the mode computation; a column without any
+# such entry is imputed with 1.
+m_imputeByMode = function(Matrix[Double] X)
+return(Matrix[Double] X)
+{
+
+  # remember where the missing cells are, then zero them so the filter below drops them
+  Mask = is.na(X)
+  X = replace(target=X, pattern=NaN, replacement=0)
+  colMode = matrix(0, 1, ncol(X))
+  for(i in 1: ncol(X)) {
+    # keep only the rows of column i whose value is >= 1
+    X_c = removeEmpty(target=X[, i], margin = "rows", select=(X[, i] < 1)==0)
+    if(sum(X_c) == 0)
+      colMode[1, i] = 1  # nothing usable in this column: fall back to 1
+    else {
+      # counts for each category; the output is sized by the data itself, so a
+      # category code larger than the number of remaining rows is still counted
+      # (a fixed nrow(X_c) x 1 output would not cover such a code)
+      cat_counts = table(X_c, 1)
+      colMode[1,i] = as.scalar(rowIndexMax(t(cat_counts))) # mode
+    }
+  }
+  # Mask is 1 only at former NaN cells, so only those cells receive the column mode
+  Mask = Mask * colMode
+  X = X + Mask
+}
+
diff --git a/scripts/builtin/splitBalanced.dml 
b/scripts/builtin/splitBalanced.dml
new file mode 100644
index 0000000..4428443
--- /dev/null
+++ b/scripts/builtin/splitBalanced.dml
@@ -0,0 +1,89 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Related to [SYSTEMDS-2902] dependency function for cleaning pipelines
+
+# Split input data X and Y into train and test sets so that every class in Y is divided by the same ratio
+# 
------------------------------------------------------------------------------
+# NAME   TYPE    DEFAULT  MEANING
+# 
------------------------------------------------------------------------------
+# X       Matrix  ---      Input feature matrix
+# Y       Matrix  ---      Input Labels
+# splitRatio Double ---    Train set fraction [0,1]
+# verbose Boolean ---      If TRUE, print the row count, the class counts and the per-class train/test sizes
+# 
------------------------------------------------------------------------------
+# X_train Matrix  ---      Train split of feature matrix
+# X_test  Matrix  ---      Test split of feature matrix
+# y_train Matrix  ---      Train split of label matrix
+# y_test  Matrix  ---      Test split of label matrix
+# 
------------------------------------------------------------------------------
+
+# Per-class train/test split: rows are sorted by label, then each class
+# contributes floor(count * splitRatio) rows to the train set and its remaining
+# rows to the test set. Labels are used as table() indexes.
+m_splitBalanced = function(Matrix[Double] X, Matrix[Double] Y, Double splitRatio, Boolean verbose)
+return (Matrix[Double] X_train, Matrix[Double] y_train, Matrix[Double] X_test,
+        Matrix[Double] y_test)
+{
+
+  # label goes into column 1; sorting by it makes every class a contiguous row range
+  XY = order(target = cbind(Y, X),  by = 1, decreasing=FALSE, index.return=FALSE)
+  # get the class count
+  classes = table(XY[, 1], 1)
+  split = floor(nrow(X) * splitRatio)
+  start_class = 1
+  train_row_s = 1
+  test_row_s = 1
+  train_row_e = 0
+  test_row_e = 0
+  end_class = 0
+
+  # over-allocate by one row per class; unused all-zero rows are removed at the end
+  outTrain = matrix(0, split+nrow(classes), ncol(XY))
+  outTest =  matrix(0, (nrow(X) - split)+nrow(classes), ncol(XY))
+
+  classes_ratio_train = floor(classes*splitRatio)
+  classes_ratio_test = classes - classes_ratio_train
+  if(verbose) {
+    print("rows "+nrow(X))
+    print("classes \n"+toString(classes))
+    print("train ratio \n"+toString(classes_ratio_train))
+    print("test ratio \n"+toString(classes_ratio_test))
+  }
+  for(i in 1:nrow(classes))
+  {
+    n_class = as.scalar(classes[i])
+    n_train = as.scalar(classes_ratio_train[i])
+    n_test = as.scalar(classes_ratio_test[i])
+
+    # a label id without rows (gap in the label ids) has nothing to slice;
+    # slicing it would yield an index range with lower > upper
+    if(n_class > 0) {
+      end_class = end_class + n_class
+      class_t = XY[start_class:end_class, ]
+
+      # skip the train part when floor(count * splitRatio) is 0 for this class
+      if(n_train > 0) {
+        train_row_e = train_row_e + n_train
+        outTrain[train_row_s:train_row_e, ] = class_t[1:n_train, ]
+        train_row_s = train_row_e + 1
+      }
+
+      # skip the test part when every row of this class went to the train set
+      if(n_test > 0) {
+        test_row_e = test_row_e + n_test
+        outTest[test_row_s:test_row_e, ] = class_t[(n_train+1):nrow(class_t), ]
+        test_row_s = test_row_e + 1
+      }
+
+      start_class = end_class + 1
+    }
+  }
+  outTrain = removeEmpty(target = outTrain, margin = "rows")
+  outTest = removeEmpty(target = outTest, margin = "rows")
+  y_train = outTrain[, 1]
+  X_train = outTrain[, 2:ncol(outTrain)]
+  y_test = outTest[, 1]
+  X_test = outTest[, 2:ncol(outTest)]
+
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java 
b/src/main/java/org/apache/sysds/common/Builtins.java
index 8854f69..353791c 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -132,6 +132,7 @@ public enum Builtins {
        IMG_BRIGHTNESS("img_brightness", true),
        IMPUTE_BY_MEAN("imputeByMean", true),
        IMPUTE_BY_MEDIAN("imputeByMedian", true),
+       IMPUTE_BY_MODE("imputeByMode", true),
        IMG_CROP("img_crop", true),
        IMPUTE_FD("imputeByFD", true),
        INTERQUANTILE("interQuantile", false),
@@ -221,6 +222,7 @@ public enum Builtins {
        SMOTE("smote", true),
        SOLVE("solve", false),
        SPLIT("split", true),
+       SPLIT_BALANCED("splitBalanced", true),
        STATSNA("statsNA", true),
        SQRT("sqrt", false),
        SUM("sum", false),
diff --git 
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSplitBalancedTest.java
 
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSplitBalancedTest.java
new file mode 100644
index 0000000..7f47495
--- /dev/null
+++ 
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSplitBalancedTest.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin;
+
+import org.apache.sysds.common.Types.ExecMode;
+import org.apache.sysds.lops.LopProperties;
+import org.apache.sysds.lops.LopProperties.ExecType;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Runs the splitBalanced.dml test script for several split ratios and execution
+ * types and asserts that the boolean the script writes to its output is true.
+ */
+public class BuiltinSplitBalancedTest extends AutomatedTestBase {
+       private final static String TEST_NAME = "splitBalanced";
+       private final static String TEST_DIR = "functions/builtin/";
+       // Derived from this class's own name so the outputs do not land in the
+       // directory of BuiltinSplitTest.
+       private final static String TEST_CLASS_DIR = TEST_DIR + BuiltinSplitBalancedTest.class.getSimpleName() + "/";
+
+       @Override
+       public void setUp() {
+               TestUtils.clearAssertionInformation();
+               // NOTE(review): the configuration lists "B" while the script writes to output("O") - confirm.
+               addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"B",}));
+       }
+
+       public double eps = 0.00001;
+       public int cols = 10;
+       public int rows = 150;
+
+
+       @Test
+       public void test_CP1() {
+
+               runSplitTest(0.7, LopProperties.ExecType.CP);
+
+       }
+       @Test
+       public void test_CP2() {
+
+               runSplitTest(0.8, LopProperties.ExecType.CP);
+
+       }
+
+       @Test
+       public void test_Spark() {
+               runSplitTest( 0.8, LopProperties.ExecType.SPARK);
+       }
+
+       /**
+        * Executes the DML test script and checks its boolean result.
+        *
+        * @param splitRatio train fraction passed to the script as the "split" argument
+        * @param instType   execution type used for the run
+        */
+       private void runSplitTest(double splitRatio, ExecType instType) {
+               ExecMode platformOld = setExecMode(instType);
+
+               try {
+                       setOutputBuffering(true);
+                       loadTestConfiguration(getTestConfiguration(TEST_NAME));
+
+                       String HOME = SCRIPT_DIR + TEST_DIR;
+
+                       fullDMLScriptName = HOME + TEST_NAME + ".dml";
+                       programArgs = new String[] {"-nvargs", "cols=" + cols, "rows=" + rows, "split="+splitRatio, "out="+output("O")};
+
+                       runTest(true, false, null, -1);
+                       // the script writes a single boolean; it must be true
+                       Assert.assertTrue(TestUtils.readDMLBoolean(output("O")));
+               }
+               finally {
+                       // restore the execution mode that was active before this test
+                       rtplatform = platformOld;
+               }
+       }
+}
diff --git a/src/test/scripts/functions/builtin/splitBalanced.dml 
b/src/test/scripts/functions/builtin/splitBalanced.dml
new file mode 100644
index 0000000..9fa3215
--- /dev/null
+++ b/src/test/scripts/functions/builtin/splitBalanced.dml
@@ -0,0 +1,36 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+# Test driver for splitBalanced: build random features and labels, split them,
+# and write TRUE iff the number of class-2 rows in the test split equals
+# ceil(class-2 total * (1 - split)).
+features = rand(rows=$rows, cols=$cols, seed=1)
+
+# labels: ceil() of a sparse random column, shifted by one
+labels = ceil(rand(rows=$rows, cols=1, seed=13, sparsity=(1-$split)))
+labels = labels + 1
+
+totalPerClass = table(labels, 1)
+
+[trainX, trainY, testX, testY] = splitBalanced(X=features, Y=labels, splitRatio=$split, verbose=FALSE)
+
+trainPerClass = table(trainY, 1)
+testPerClass = table(testY, 1)
+
+expectedClass2 = ceil((as.scalar(totalPerClass[2]) * (1-$split)))
+observedClass2 = as.scalar(testPerClass[2])
+isBalanced = observedClass2 == expectedClass2
+
+write(isBalanced, $out, format="text")
\ No newline at end of file

Reply via email to