This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
     new 8fbcd75  [MINOR] Various improvements of data cleaning built-in 
primitives
8fbcd75 is described below

commit 8fbcd758674a07fa0a0f41be2ecea110b53691cc
Author: Shafaq Siddiqi <[email protected]>
AuthorDate: Sun May 3 14:50:43 2020 +0200

    [MINOR] Various improvements of data cleaning built-in primitives
    
    Closes #901.
---
 scripts/builtin/mice.dml                           | 27 ++++---
 scripts/builtin/multiLogReg.dml                    |  3 +-
 scripts/builtin/outlierByIQR.dml                   | 20 +++--
 scripts/builtin/outlierBySd.dml                    | 20 +++--
 scripts/builtin/winsorize.dml                      |  7 ++
 .../test/functions/builtin/BuiltinMiceTest.java    | 91 +++++++++++++++-------
 .../functions/builtin/BuiltinOutlierByIQRTest.java |  9 +++
 .../functions/builtin/BuiltinOutlierBySDTest.java  | 21 +++--
 .../functions/builtin/BuiltinWinsorizeTest.java    |  4 +-
 src/test/scripts/functions/builtin/mice.R          | 85 +++++++++++++-------
 src/test/scripts/functions/builtin/mice.dml        | 28 +++++--
 .../scripts/functions/builtin/outlier_by_IQR.dml   |  2 +-
 .../scripts/functions/builtin/outlier_by_sd.dml    |  2 +-
 src/test/scripts/functions/builtin/winsorize.R     |  4 +-
 .../scripts/functions/caching/BufferpoolLeak.dml   |  2 +-
 15 files changed, 220 insertions(+), 105 deletions(-)

diff --git a/scripts/builtin/mice.dml b/scripts/builtin/mice.dml
index 99d2be2..b00d542 100644
--- a/scripts/builtin/mice.dml
+++ b/scripts/builtin/mice.dml
@@ -26,6 +26,7 @@
 # NAME            TYPE    DEFAULT     MEANING
 # 
---------------------------------------------------------------------------------------------
 # F               String    ---        Data Frame
+# cMask           Double    ---        A 0/1 row vector for identifying 
numeric (0) and categorical features (1)
 # iter            Integer    3         Number of iteration for multiple 
imputations 
 # complete        Integer    3         A complete dataset generated through a 
specific iteration
 # 
---------------------------------------------------------------------------------------------
@@ -40,17 +41,21 @@
 
 # Assumption missing value are represented with empty string i.e ",," in csv 
file  
 # variables with suffix n are storing continuous/numeric data and variables 
with suffix c are storing categorical data
-s_mice= function(Frame[String] F, Matrix[Double] cMask, Integer iter = 3, 
Integer complete = 3)
+s_mice= function(Frame[String] F, Matrix[Double] cMask, Integer iter = 3, 
Integer complete = 3, Boolean verbose = FALSE)
 return(Frame[String] dataset, Frame[String] singleSet)
 {
 
   if(ncol(F) == 1)
     stop("invalid aregument: can not apply mice on single column")
+    
+  if(complete > iter)
+    complete = iter
 
-  # adding a temporary categorical feature (in-case all attributes are 
continous)
+
+  # adding a temporary  feature (in-case all attributes are of same type)
   F = cbind(F,  as.frame(matrix(1,nrow(F), 1)))
   cMask = cbind(cMask, matrix(1,1,1))
-  
+
   n = nrow(F)
   row = n*complete;
   col = ncol(F) 
@@ -58,6 +63,10 @@ return(Frame[String] dataset, Frame[String] singleSet)
   Mask_Result = matrix(0, rows=1, cols=col)
   scat = seq(1, ncol(cMask))
   cat = removeEmpty(target=scat, margin="rows", select=t(cMask))
+
+  if(nrow(cat) == ncol(F))
+    cMask[1,ncol(cMask)] = 0
+  
   s=""
   for(i in 1: nrow(cat), check =0)
     s = s+as.integer(as.scalar(cat[i, 1]))+",";
@@ -168,7 +177,7 @@ return(Frame[String] dataset, Frame[String] singleSet)
         in_n = in_n + 1;
       }
      
-      if((as.scalar(dXMask[1,i]) == 1) & (sum(Mask_c[, in_c]) != 0))
+      if( (as.scalar(dXMask[1,i]) == 1) & (sum(Mask_c[, in_c]) != 0) )
       {
         j = (i + as.scalar(dist[1,in_c])) - 1
 
@@ -199,8 +208,8 @@ return(Frame[String] dataset, Frame[String] singleSet)
           Mask_Filled_c[,in_c] = table(R, 1, pred, n, 1);
         i = as.integer(j)
       }
-      
-      in_c = in_c + 1
+      if(in_c < col)
+        in_c = in_c + 1
       i = i+1;
     }
 
@@ -214,7 +223,7 @@ return(Frame[String] dataset, Frame[String] singleSet)
   Result = Result[2: n*iter+1, ]
   Mask_Result = Mask_Result[2: n*iter+1, ]
   index = (((complete*n)-n)+1)
-  #voting for aggregation of categorical imputations
+  # voting for aggregation of categorical imputations
   agg = cAggregate(Mask_Result*cMask, iter, n)
   
   # aggregating the results
@@ -229,11 +238,11 @@ return(Frame[String] dataset, Frame[String] singleSet)
   dataset =   XO + Agg_Matrix
   singleSet = Result[index:row, ]
    
-  # # decoding nominal columns 
+  # decoding nominal columns 
   dataset = transformdecode(target=dataset, spec=jspecR, meta=M);
   singleSet = transformdecode(target=singleSet, spec=jspecR, meta=M);
   
-  # # removing extra categorical column
+  # removing extra categorical column
   dataset = dataset[,1:col-1]
   singleSet = singleSet[,1:col-1]
  }
diff --git a/scripts/builtin/multiLogReg.dml b/scripts/builtin/multiLogReg.dml
index 7ccba6f..5afd5e7 100644
--- a/scripts/builtin/multiLogReg.dml
+++ b/scripts/builtin/multiLogReg.dml
@@ -259,7 +259,8 @@ m_multiLogReg = function(Matrix[Double] X, Matrix[Double] 
Y, Integer icpt = 2, D
     iter = iter + 1;
     converge = ((norm_Grad < (tol * norm_Grad_initial)) | (iter > maxi) |
       ((is_trust_boundary_reached == 0) & (abs (actred) < (abs (obj) + abs 
(obj_new)) * 0.00000000000001)));
-    if (converge) { print ("Termination / Convergence condition satisfied."); }
+    if (verbose & converge) 
+      print ("Termination / Convergence condition satisfied.");
   }
 
   if (icpt == 2) {
diff --git a/scripts/builtin/outlierByIQR.dml b/scripts/builtin/outlierByIQR.dml
index 3d61528..a31c6e6 100644
--- a/scripts/builtin/outlierByIQR.dml
+++ b/scripts/builtin/outlierByIQR.dml
@@ -30,7 +30,8 @@
 # k               Double   1.5       a constant used to discern outliers k*IQR 
 # isIterative     Boolean  TRUE      iterative repair or single repair 
 # repairMethod    Integer  1         values: 0 = delete rows having outliers, 
-#                                    1 = replace outliers as missing values
+#                                            1 = replace outliers with zeros 
+#                                            2 = replace outliers as missing 
values 
 # max_iterations  Integer  0         values: 0 = arbitrary number of 
iteration until all outliers are removed, 
 #                                    n = any constant defined by user
 # 
---------------------------------------------------------------------------------------------
@@ -56,11 +57,11 @@ m_outlierByIQR = function(Matrix[Double] X, Double k =1.5, 
Integer repairMethod
     lowerBound = (Q1 - (k * IQR));
     outlierFilter = X < lowerBound | X > upperBound
 
-    if(sum(outlierFilter) > 1 & sum(X) != 0 & sumPrevious != sumNext ) {
+    if(sum(outlierFilter) > 1 & sumNext != 0 & sumPrevious != sumNext ) {
       #TODO: see outlierBySd why are sumPrevious and sumNext necessary         
-      sumPrevious = sum(X)
-      X = fix_outliers(X, outlierFilter, repairMethod)
-      sumNext = sum(X)
+      sumPrevious = sum(X * !is.nan(X))
+      X = fix_outliers_iqr(X, outlierFilter, repairMethod)
+      sumNext = sum(X * !is.nan(X))
     }
     else
       max_iterations = -1
@@ -79,7 +80,7 @@ m_outlierByIQR = function(Matrix[Double] X, Double k =1.5, 
Integer repairMethod
   }
 }
   
-fix_outliers = function(Matrix[Double] X, Matrix[Double] outlierFilter, 
Integer repairMethod = 1)
+fix_outliers_iqr = function(Matrix[Double] X, Matrix[Double] outlierFilter, 
Integer repairMethod = 1)
   return(Matrix[Double] fixed_X)
 {
   rows = nrow(X)
@@ -90,8 +91,13 @@ fix_outliers = function(Matrix[Double] X, Matrix[Double] 
outlierFilter, Integer
   }
   else if(repairMethod == 1)
     X = (outlierFilter == 0) * X
+  else if(repairMethod == 2)
+  {
+    outlierFilter = replace(target = (outlierFilter == 0), pattern = 0, 
replacement = NaN)
+    X = outlierFilter * X
+  }
   else
-    stop("outlierByIQR: invalid argument - repair required 0-1 found: 
"+repairMethod)
+    stop("outlierByIQR: invalid argument - repair required 0-2 found: 
"+repairMethod)
 
   fixed_X = X
 }
diff --git a/scripts/builtin/outlierBySd.dml b/scripts/builtin/outlierBySd.dml
index 3b781e1..6b1f672 100644
--- a/scripts/builtin/outlierBySd.dml
+++ b/scripts/builtin/outlierBySd.dml
@@ -28,9 +28,9 @@
 # 
---------------------------------------------------------------------------------------------
 # X               Double    ---       Matrix X  
 # k               Double    3         threshold values 1, 2, 3 for 68%, 95%, 
99.7% respectively (3-sigma rule)
-# repairMethod    Integer   1         values: 0 = delete rows having outliers, 
1 = replace outliers as missing values 
-#                                     (this script replaces outliers with 
zeros)
-# max_iterations  Integer     0       values: 0 = arbitrary number of 
iteration until all outliers are removed, 
+# repairMethod    Integer   1         values: 0 = delete rows having outliers, 
1 = replace outliers as  zeros 
+#                                             2 = replace outliers as missing 
values 
+# max_iterations  Integer   0         values: 0 = arbitrary number of 
iteration until all outliers are removed, 
 #                                     n = any constant defined by user
 # 
---------------------------------------------------------------------------------------------
  
@@ -61,11 +61,11 @@ m_outlierBySd = function(Matrix[Double] X, Double k = 3, 
Integer repairMethod =
 
     outlierFilter = (X < lowerBound) | (X > upperBound)
 
-    if(sum(outlierFilter) > 1 & sum(X) != 0 & sumPrevious != sumNext) {
+               if(sum(outlierFilter) > 1 & sumNext != 0 & sumPrevious != 
sumNext) {
       #TODO why is the check with sumPrevious and sumNext necessary
-      sumPrevious = sum(X)
-      X = fix_outliers(X, outlierFilter, repairMethod)
-      sumNext = sum(X)
+      sumPrevious = sum(X * !is.nan(X))
+      X = fix_outliers_sd(X, outlierFilter, repairMethod)
+      sumNext = sum(X * !is.nan(X))
     }
     else
       max_iterations = - 1;
@@ -85,7 +85,7 @@ m_outlierBySd = function(Matrix[Double] X, Double k = 3, 
Integer repairMethod =
   }
 }
 
-fix_outliers = function(Matrix[Double] X, Matrix[Double] outlierFilter, 
Integer repairMethod = 2)
+fix_outliers_sd = function(Matrix[Double] X, Matrix[Double] outlierFilter, 
Integer repairMethod = 2)
   return(Matrix[Double] fixed_X)
 {
   rows = nrow(X)
@@ -96,6 +96,10 @@ fix_outliers = function(Matrix[Double] X, Matrix[Double] 
outlierFilter, Integer
   }
   else if(repairMethod == 1)
     X = (outlierFilter == 0) * X
+  else if (repairMethod == 2) {    
+    outlierFilter = replace(target = (outlierFilter == 0), pattern = 0, 
replacement = NaN)
+    X = outlierFilter * X
+  }
   else
     stop("outlierBySd: invalid argument - repair required 0-1 found: 
"+repairMethod)
 
diff --git a/scripts/builtin/winsorize.dml b/scripts/builtin/winsorize.dml
index 53c20ce..62e2d53 100644
--- a/scripts/builtin/winsorize.dml
+++ b/scripts/builtin/winsorize.dml
@@ -20,6 +20,13 @@
 #-------------------------------------------------------------
 
 m_winsorize = function(Matrix[Double] X) return (Matrix[Double] Y) {
+  Y = matrix(0, nrow(X), ncol(X))
+  parfor(i in 1:ncol(X))
+    Y[,i] = fixOutliersWinsorize(X[,i])
+}
+
+fixOutliersWinsorize = function(Matrix[Double] X) return (Matrix[Double] Y)
+{
   # compute quantiles for lower and upper probs
   q = quantile(X, matrix("0.05 0.95", rows=2, cols=1));
   ql = as.scalar(q[1,1]);
diff --git 
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinMiceTest.java 
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinMiceTest.java
index ade84be..5c3ad22 100644
--- a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinMiceTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinMiceTest.java
@@ -45,20 +45,30 @@ public class BuiltinMiceTest extends AutomatedTestBase {
                addTestConfiguration(TEST_NAME, new 
TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{"B"}));
        }
        @Test
-       public void testMiceCP() {
-               runMiceNominalTest( LopProperties.ExecType.CP);
+       public void testMiceMixCP() {
+               double[][] mask = {{ 0.0, 0.0, 1.0, 1.0, 0.0}};
+               runMiceNominalTest(mask, 1, LopProperties.ExecType.CP);
        }
 
-//     @Test
-//     public void testMiceSpark() {
-//             runMiceNominalTest( LopProperties.ExecType.SPARK);
-//     }
+       @Test
+       public void testMiceNumberCP() {
+               double[][] mask = {{ 0.0, 0.0, 0.0, 0.0, 0.0}};
+               runMiceNominalTest(mask, 2, LopProperties.ExecType.CP);
+       }
 
+       @Test
+       public void testMiceCategoricalCP() {
+               double[][] mask = {{ 1.0, 1.0, 1.0, 1.0, 1.0}};
+               runMiceNominalTest(mask, 3, LopProperties.ExecType.CP);
+       }
+       //      @Test
+       //      public void testMiceSpark() {
+       //              runMiceNominalTest( LopProperties.ExecType.SPARK);
+       //      }
 
-       private void runMiceNominalTest( LopProperties.ExecType instType) {
+       private void runMiceNominalTest(double[][] mask, int testType, 
LopProperties.ExecType instType) {
                Types.ExecMode platformOld = setExecMode(instType);
                try {
-                       double[][] mask = {{ 0.0, 0.0, 1.0, 1.0, 0.0}};
                        loadTestConfiguration(getTestConfiguration(TEST_NAME));
                        String HOME = SCRIPT_DIR + TEST_DIR;
                        fullDMLScriptName = HOME + TEST_NAME + ".dml";
@@ -71,30 +81,53 @@ public class BuiltinMiceTest extends AutomatedTestBase {
                        runTest(true, false, null, -1);
                        runRScript(true);
 
-                       //compare matrices
-                       HashMap<MatrixValue.CellIndex, Double> dmlfileN = 
readDMLMatrixFromHDFS("N");
-                       HashMap<MatrixValue.CellIndex, Double> rfileN  = 
readRMatrixFromFS("N");
-
-                       HashMap<MatrixValue.CellIndex, Double> dmlfileC = 
readDMLMatrixFromHDFS("C");
-                       HashMap<MatrixValue.CellIndex, Double> rfileC  = 
readRMatrixFromFS("C");
-                       // compare numerical imputations
-                       TestUtils.compareMatrices(dmlfileN, rfileN, eps, 
"Stat-DML", "Stat-R");
-                       // compare categorical imputations
-                       int countTrue = 0;
-                       for (MatrixValue.CellIndex index : dmlfileC.keySet()) {
-                               Double v1 = dmlfileC.get(index);
-                               Double v2 = rfileC.get(index);
-                               if(v1.equals(v2))
-                                       countTrue++;
-                               }
-
-                       if(countTrue / (double)dmlfileC.size() > 0.98)
-                               Assert.assertTrue(true);
-                       else
-                               Assert.fail();
+
+                       switch (testType)
+                       {
+                               case 1:
+                                       testCategoricalOutput();
+                                       testNumericOutput();
+                                       break;
+                               case 2:
+                                       testNumericOutput();
+                                       break;
+                               case 3:
+                                       testCategoricalOutput();
+                                       break;
+                       }
                }
                finally {
                        rtplatform = platformOld;
                }
        }
+
+       private void testNumericOutput()
+       {
+               //compare matrices
+               HashMap<MatrixValue.CellIndex, Double> dmlfileN = 
readDMLMatrixFromHDFS("N");
+               HashMap<MatrixValue.CellIndex, Double> rfileN  = 
readRMatrixFromFS("N");
+
+               // compare numerical imputations
+               TestUtils.compareMatrices(dmlfileN, rfileN, eps, "Stat-DML", 
"Stat-R");
+
+       }
+       private void testCategoricalOutput()
+       {
+               HashMap<MatrixValue.CellIndex, Double> dmlfileC = 
readDMLMatrixFromHDFS("C");
+               HashMap<MatrixValue.CellIndex, Double> rfileC  = 
readRMatrixFromFS("C");
+
+               // compare categorical imputations
+               int countTrue = 0;
+               for (MatrixValue.CellIndex index : dmlfileC.keySet()) {
+                       Double v1 = dmlfileC.get(index);
+                       Double v2 = rfileC.get(index);
+                       if(v1.equals(v2))
+                               countTrue++;
+               }
+
+               if(countTrue / (double)dmlfileC.size() > 0.98)
+                       Assert.assertTrue(true);
+               else
+                       Assert.fail();
+       }
 }
\ No newline at end of file
diff --git 
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinOutlierByIQRTest.java
 
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinOutlierByIQRTest.java
index de874df..3f22724 100644
--- 
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinOutlierByIQRTest.java
+++ 
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinOutlierByIQRTest.java
@@ -83,6 +83,15 @@ public class BuiltinOutlierByIQRTest extends 
AutomatedTestBase {
                runOutlierTest(false, 1.5, 1, 0,LopProperties.ExecType.SPARK);
        }
 
+       @Test
+       public void testOutlierRepair2IterativeCP() {
+               runOutlierTest(false, 1.5, 2, 0,LopProperties.ExecType.CP);
+       }
+
+       @Test
+       public void testOutlierRepair2IterativeSP() {
+               runOutlierTest(false, 1.5, 2, 0,LopProperties.ExecType.SPARK);
+       }
 
        private void runOutlierTest(boolean sparse, double  k,  int repair, int 
max_iterations, LopProperties.ExecType instType)
        {
diff --git 
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinOutlierBySDTest.java
 
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinOutlierBySDTest.java
index 38b4d12..2c04fdd 100644
--- 
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinOutlierBySDTest.java
+++ 
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinOutlierBySDTest.java
@@ -44,42 +44,47 @@ public class BuiltinOutlierBySDTest extends 
AutomatedTestBase {
 
        @Test
        public void testOutlierRepair0CP() {
-               runOutlierTest(false, 2,0, 0, LopProperties.ExecType.CP);
+               runOutlierTest(false, 2, 0, 0, LopProperties.ExecType.CP);
        }
 
        @Test
        public void testOutlierRepair1CP() {
-               runOutlierTest(false, 2,1, 0, LopProperties.ExecType.CP);
+               runOutlierTest(false, 2, 1, 0, LopProperties.ExecType.CP);
        }
 
        @Test
        public void testOutlierRepair2CP() {
-               runOutlierTest(false, 2,2, 10, LopProperties.ExecType.CP);
+               runOutlierTest(false, 2, 2, 10, LopProperties.ExecType.CP);
+       }
+
+       @Test
+       public void testOutlierRepair2SP() {
+               runOutlierTest(false, 2, 2, 0, LopProperties.ExecType.CP);
        }
 
        @Test
        public void testOutlierRepair0SP() {
-               runOutlierTest(false, 2,0, 10, LopProperties.ExecType.SPARK);
+               runOutlierTest(false, 2, 0, 10, LopProperties.ExecType.SPARK);
        }
 
        @Test
        public void testOutlierRepair1SP() {
-               runOutlierTest(false, 2,1, 0, LopProperties.ExecType.SPARK);
+               runOutlierTest(false, 2, 1, 10, LopProperties.ExecType.SPARK);
        }
 
        @Test
        public void testOutlierK3CP() {
-               runOutlierTest(true, 3,1, 10,LopProperties.ExecType.CP);
+               runOutlierTest(true, 3, 1, 10,LopProperties.ExecType.CP);
        }
 
        @Test
        public void testOutlierIterativeCP() {
-               runOutlierTest(false, 2,1, 0, LopProperties.ExecType.CP);
+               runOutlierTest(false, 2, 1, 0, LopProperties.ExecType.CP);
        }
 
        @Test
        public void testOutlierIterativeSP() {
-               runOutlierTest(false, 2,1, 0, LopProperties.ExecType.SPARK);
+               runOutlierTest(false, 2, 1, 10, LopProperties.ExecType.SPARK);
        }
 
        private void runOutlierTest(boolean sparse, double  k,  int repair, int 
max_iterations, LopProperties.ExecType instType)
diff --git 
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinWinsorizeTest.java
 
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinWinsorizeTest.java
index c79c105..6cee661 100644
--- 
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinWinsorizeTest.java
+++ 
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinWinsorizeTest.java
@@ -35,7 +35,7 @@ public class BuiltinWinsorizeTest extends AutomatedTestBase
        private final static String TEST_DIR = "functions/builtin/";
        private static final String TEST_CLASS_DIR = TEST_DIR + 
BuiltinWinsorizeTest.class.getSimpleName() + "/";
        
-       private final static double eps = 1e-4;
+       private final static double eps = 1e-3;
        private final static int rows = 1765;
        private final static double spDense = 0.99;
        
@@ -69,7 +69,7 @@ public class BuiltinWinsorizeTest extends AutomatedTestBase
                        rCmd = "Rscript" + " " + fullRScriptName + " " + 
inputDir() + " " + expectedDir();
                        
                        //generate actual dataset 
-                       double[][] A = getRandomMatrix(rows, 1, -1, 1, spDense, 
7);
+                       double[][] A = getRandomMatrix(rows, 10, -1, 1, 
spDense, 7);
                        writeInputMatrixWithMTD("A", A, true);
                        
                        runTest(true, false, null, -1);
diff --git a/src/test/scripts/functions/builtin/mice.R 
b/src/test/scripts/functions/builtin/mice.R
index 373832b..2237d7c 100644
--- a/src/test/scripts/functions/builtin/mice.R
+++ b/src/test/scripts/functions/builtin/mice.R
@@ -26,43 +26,68 @@ library(dplyr)
 
 d <- read.csv(args[1], header=FALSE )
 mass <- as.matrix(readMM(paste(args[2], "M.mtx", sep="")));
+
+if(sum(mass) == ncol(d))
+{
+d = d[,3:4]
+mass = mass[1,3:4]
 meth=""
-for(i in 1: ncol(mass)) {
-  if(as.integer(mass[1,i]) == 1)  {
-    d[[names(d)[i]]] =  as.factor(d[[names(d)[i]]]); 
-    meth = c(meth, "polyreg")
-  } 
-  else
-    meth = c(meth, "norm.predict")
-  }
+  for(i in 1: 2) {
+      d[[names(d)[i]]] =  as.factor(d[[names(d)[i]]]); 
+      meth = c(meth, "polyreg")
+    }
+  
+  meth=meth[-1]
 
-meth=meth[-1]
-# set the prediction matrix
-pred <- make.predictorMatrix(d)
-pred = pred * diag(1, ncol(mass))
+  #impute
+  imputeD <- mice(d,where = is.na(d), method = meth, m=3)
+  R = data.frame(complete(imputeD,3))
+  c = select_if(R, is.factor)
 
-pred[names(d)[1], names(d)[2]] = 1
-pred[names(d)[2], names(d)[1]] = 1
+  # convert factor into numeric before casting to matrix
+  c =  sapply(c, function(x) as.numeric(as.character(x)))
+  writeMM(as(as.matrix(c), "CsparseMatrix"), paste(args[3], "C", sep=""));
+} else if (sum(mass) == 0)
+{
+  print("Generating R witout cat")
+  imputeD <- mice(d,where = is.na(d), method = "norm.predict", m=3)
+  R = data.frame(complete(imputeD,3))
+  n = select_if(R, is.numeric)
+  writeMM(as(as.matrix(n), "CsparseMatrix"), paste(args[3], "N", sep=""));  
+} else {
+  meth=""
+  for(i in 1: ncol(mass)) {
+    if(as.integer(mass[1,i]) == 1)  {
+      d[[names(d)[i]]] =  as.factor(d[[names(d)[i]]]); 
+      meth = c(meth, "polyreg")
+    } else meth = c(meth, "norm.predict")
+  }
 
-pred[names(d)[1], names(d)[4]] = 1
-pred[names(d)[4], names(d)[1]] = 1
+  meth=meth[-1]
+  # set the prediction matrix
+  pred <- make.predictorMatrix(d)
+  pred = pred * diag(1, ncol(mass))
 
-pred[names(d)[2], names(d)[4]] = 1
-pred[names(d)[4], names(d)[2]] = 1
+  pred[names(d)[1], names(d)[2]] = 1
+  pred[names(d)[2], names(d)[1]] = 1
 
-pred[names(d)[3], names(d)[4]] = 1
-pred[names(d)[4], names(d)[3]] = 1
+  pred[names(d)[1], names(d)[4]] = 1
+  pred[names(d)[4], names(d)[1]] = 1
 
+  pred[names(d)[2], names(d)[4]] = 1
+  pred[names(d)[4], names(d)[2]] = 1
 
-#impute
-imputeD <- mice(d,where = is.na(d), method = meth, m=3,  pred = pred)
-R = data.frame(complete(imputeD,3))
+  pred[names(d)[3], names(d)[4]] = 1
+  pred[names(d)[4], names(d)[3]] = 1
 
-n =select_if(R, is.numeric)
-c = select_if(R, is.factor)
 
-# convert factor into numeric before casting to matrix
-c =  sapply(c, function(x) as.numeric(as.character(x)))
-            
-writeMM(as(as.matrix(n), "CsparseMatrix"), paste(args[3], "N", sep=""));
-writeMM(as(as.matrix(c), "CsparseMatrix"), paste(args[3], "C", sep=""));
+#impute
+  imputeD <- mice(d,where = is.na(d), method = meth, m=3,  pred = pred)
+  R = data.frame(complete(imputeD,3))
+  c = select_if(R, is.factor)
+  # convert factor into numeric before casting to matrix
+  c =  sapply(c, function(x) as.numeric(as.character(x)))
+  n = select_if(R, is.numeric)
+  writeMM(as(as.matrix(c), "CsparseMatrix"), paste(args[3], "C", sep=""));
+  writeMM(as(as.matrix(n), "CsparseMatrix"), paste(args[3], "N", sep=""));
+}
diff --git a/src/test/scripts/functions/builtin/mice.dml 
b/src/test/scripts/functions/builtin/mice.dml
index 52dd72a..7736f56 100644
--- a/src/test/scripts/functions/builtin/mice.dml
+++ b/src/test/scripts/functions/builtin/mice.dml
@@ -21,11 +21,25 @@
 
 X = read($X, data_type="frame", format="csv");
 M = read($Mask)
-[dataset, singleSet]= mice(F=X, cMask=M, iter=$iteration, complete=$com)
-n = as.matrix(dataset) * (1-M)
-n = removeEmpty(target=n, margin = "cols")
-c = as.matrix(dataset) * (M)
-c = removeEmpty(target=c, margin = "cols")
-write(n, $dataN)
-write(c, $dataC)
+[dataset, singleSet]= mice(F=X, cMask=M, iter=$iteration, complete=$com, 
verbose = FALSE)
 
+if(sum(M) == ncol(X))
+{
+  c = as.matrix(singleSet[,3:4]) # comparing only selected columns with R 
results because dataset is continuos and
+  write(c, $dataC)               # for categorical imputation R polyreg only 
support upto 50 distinct items (50 categories/feature)
+}
+else if (sum(M) == 0)
+{
+  n = as.matrix(dataset) * (1-M)
+  n = removeEmpty(target=n, margin = "cols")
+  write(n, $dataN)
+}
+else
+{
+  c = as.matrix(dataset) * (M)
+  c = removeEmpty(target=c, margin = "cols")
+  n = as.matrix(dataset) * (1-M)
+  n = removeEmpty(target=n, margin = "cols")
+  write(n, $dataN)
+  write(c, $dataC)
+}
\ No newline at end of file
diff --git a/src/test/scripts/functions/builtin/outlier_by_IQR.dml 
b/src/test/scripts/functions/builtin/outlier_by_IQR.dml
index a6da102..6eb9510 100644
--- a/src/test/scripts/functions/builtin/outlier_by_IQR.dml
+++ b/src/test/scripts/functions/builtin/outlier_by_IQR.dml
@@ -20,5 +20,5 @@
 #-------------------------------------------------------------
 
 X = read($1);
-Y = outlierByIQR(X, $2, $3, $4, TRUE);
+Y = outlierByIQR(X, $2, $3, $4, FALSE);
 write(Y, $5)
diff --git a/src/test/scripts/functions/builtin/outlier_by_sd.dml 
b/src/test/scripts/functions/builtin/outlier_by_sd.dml
index 69098f0..d117d73 100644
--- a/src/test/scripts/functions/builtin/outlier_by_sd.dml
+++ b/src/test/scripts/functions/builtin/outlier_by_sd.dml
@@ -20,5 +20,5 @@
 #-------------------------------------------------------------
 
 X = read($1);
-Y = outlierBySd(X, $2, $3, $4, TRUE);
+Y = outlierBySd(X, $2, $3, $4, FALSE);
 write(Y, $5)
diff --git a/src/test/scripts/functions/builtin/winsorize.R 
b/src/test/scripts/functions/builtin/winsorize.R
index 92b3324..c917b17 100644
--- a/src/test/scripts/functions/builtin/winsorize.R
+++ b/src/test/scripts/functions/builtin/winsorize.R
@@ -25,6 +25,8 @@ library("Matrix")
 library("DescTools")
 
 X = as.matrix(readMM(paste(args[1], "A.mtx", sep="")))
-Y = Winsorize(X);
+Y = matrix(0, nrow(X), ncol(X))
+for(i in 1:ncol(X))
+  Y[,i] = Winsorize(X[,i]);
 writeMM(as(Y, "CsparseMatrix"), paste(args[2], "B", sep="")); 
  
\ No newline at end of file
diff --git a/src/test/scripts/functions/caching/BufferpoolLeak.dml 
b/src/test/scripts/functions/caching/BufferpoolLeak.dml
index b7c2ab4..6a50ea5 100644
--- a/src/test/scripts/functions/caching/BufferpoolLeak.dml
+++ b/src/test/scripts/functions/caching/BufferpoolLeak.dml
@@ -22,7 +22,7 @@
 X = rand(rows=$1, cols=$2, min=1, max=10);
 for(i in 1:500) {
   #print("executed iteration "+i)
-  [m1,m2] = mice(as.frame(X), matrix(0,1,ncol(X)),3,3)
+  [m1,m2] = mice(as.frame(X), matrix(0,1,ncol(X)),3,3, FALSE)
 }
 if( ncol(X) > $2 )
   print(toString(m1));

Reply via email to