This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git
The following commit(s) were added to refs/heads/master by this push:
new 8fbcd75 [MINOR] Various improvements of data cleaning built-in
primitives
8fbcd75 is described below
commit 8fbcd758674a07fa0a0f41be2ecea110b53691cc
Author: Shafaq Siddiqi <[email protected]>
AuthorDate: Sun May 3 14:50:43 2020 +0200
[MINOR] Various improvements of data cleaning built-in primitives
Closes #901.
---
scripts/builtin/mice.dml | 27 ++++---
scripts/builtin/multiLogReg.dml | 3 +-
scripts/builtin/outlierByIQR.dml | 20 +++--
scripts/builtin/outlierBySd.dml | 20 +++--
scripts/builtin/winsorize.dml | 7 ++
.../test/functions/builtin/BuiltinMiceTest.java | 91 +++++++++++++++-------
.../functions/builtin/BuiltinOutlierByIQRTest.java | 9 +++
.../functions/builtin/BuiltinOutlierBySDTest.java | 21 +++--
.../functions/builtin/BuiltinWinsorizeTest.java | 4 +-
src/test/scripts/functions/builtin/mice.R | 85 +++++++++++++-------
src/test/scripts/functions/builtin/mice.dml | 28 +++++--
.../scripts/functions/builtin/outlier_by_IQR.dml | 2 +-
.../scripts/functions/builtin/outlier_by_sd.dml | 2 +-
src/test/scripts/functions/builtin/winsorize.R | 4 +-
.../scripts/functions/caching/BufferpoolLeak.dml | 2 +-
15 files changed, 220 insertions(+), 105 deletions(-)
diff --git a/scripts/builtin/mice.dml b/scripts/builtin/mice.dml
index 99d2be2..b00d542 100644
--- a/scripts/builtin/mice.dml
+++ b/scripts/builtin/mice.dml
@@ -26,6 +26,7 @@
# NAME TYPE DEFAULT MEANING
#
---------------------------------------------------------------------------------------------
# F String --- Data Frame
+# cMask Double --- A 0/1 row vector for identifying
numeric (0) and categorical features (1)
# iter Integer 3 Number of iteration for multiple
imputations
# complete Integer 3 A complete dataset generated through a
specific iteration
#
---------------------------------------------------------------------------------------------
@@ -40,17 +41,21 @@
# Assumption missing value are represented with empty string i.e ",," in csv
file
# variables with suffix n are storing continuous/numeric data and variables
with suffix c are storing categorical data
-s_mice= function(Frame[String] F, Matrix[Double] cMask, Integer iter = 3,
Integer complete = 3)
+s_mice= function(Frame[String] F, Matrix[Double] cMask, Integer iter = 3,
Integer complete = 3, Boolean verbose = FALSE)
return(Frame[String] dataset, Frame[String] singleSet)
{
if(ncol(F) == 1)
stop("invalid aregument: can not apply mice on single column")
+
+ if(complete > iter)
+ complete = iter
- # adding a temporary categorical feature (in-case all attributes are
continous)
+
+ # adding a temporary feature (in-case all attributes are of same type)
F = cbind(F, as.frame(matrix(1,nrow(F), 1)))
cMask = cbind(cMask, matrix(1,1,1))
-
+
n = nrow(F)
row = n*complete;
col = ncol(F)
@@ -58,6 +63,10 @@ return(Frame[String] dataset, Frame[String] singleSet)
Mask_Result = matrix(0, rows=1, cols=col)
scat = seq(1, ncol(cMask))
cat = removeEmpty(target=scat, margin="rows", select=t(cMask))
+
+ if(nrow(cat) == ncol(F))
+ cMask[1,ncol(cMask)] = 0
+
s=""
for(i in 1: nrow(cat), check =0)
s = s+as.integer(as.scalar(cat[i, 1]))+",";
@@ -168,7 +177,7 @@ return(Frame[String] dataset, Frame[String] singleSet)
in_n = in_n + 1;
}
- if((as.scalar(dXMask[1,i]) == 1) & (sum(Mask_c[, in_c]) != 0))
+ if( (as.scalar(dXMask[1,i]) == 1) & (sum(Mask_c[, in_c]) != 0) )
{
j = (i + as.scalar(dist[1,in_c])) - 1
@@ -199,8 +208,8 @@ return(Frame[String] dataset, Frame[String] singleSet)
Mask_Filled_c[,in_c] = table(R, 1, pred, n, 1);
i = as.integer(j)
}
-
- in_c = in_c + 1
+ if(in_c < col)
+ in_c = in_c + 1
i = i+1;
}
@@ -214,7 +223,7 @@ return(Frame[String] dataset, Frame[String] singleSet)
Result = Result[2: n*iter+1, ]
Mask_Result = Mask_Result[2: n*iter+1, ]
index = (((complete*n)-n)+1)
- #voting for aggregation of categorical imputations
+ # voting for aggregation of categorical imputations
agg = cAggregate(Mask_Result*cMask, iter, n)
# aggregating the results
@@ -229,11 +238,11 @@ return(Frame[String] dataset, Frame[String] singleSet)
dataset = XO + Agg_Matrix
singleSet = Result[index:row, ]
- # # decoding nominal columns
+ # decoding nominal columns
dataset = transformdecode(target=dataset, spec=jspecR, meta=M);
singleSet = transformdecode(target=singleSet, spec=jspecR, meta=M);
- # # removing extra categorical column
+ # removing extra categorical column
dataset = dataset[,1:col-1]
singleSet = singleSet[,1:col-1]
}
diff --git a/scripts/builtin/multiLogReg.dml b/scripts/builtin/multiLogReg.dml
index 7ccba6f..5afd5e7 100644
--- a/scripts/builtin/multiLogReg.dml
+++ b/scripts/builtin/multiLogReg.dml
@@ -259,7 +259,8 @@ m_multiLogReg = function(Matrix[Double] X, Matrix[Double]
Y, Integer icpt = 2, D
iter = iter + 1;
converge = ((norm_Grad < (tol * norm_Grad_initial)) | (iter > maxi) |
((is_trust_boundary_reached == 0) & (abs (actred) < (abs (obj) + abs
(obj_new)) * 0.00000000000001)));
- if (converge) { print ("Termination / Convergence condition satisfied."); }
+ if (verbose & converge)
+ print ("Termination / Convergence condition satisfied.");
}
if (icpt == 2) {
diff --git a/scripts/builtin/outlierByIQR.dml b/scripts/builtin/outlierByIQR.dml
index 3d61528..a31c6e6 100644
--- a/scripts/builtin/outlierByIQR.dml
+++ b/scripts/builtin/outlierByIQR.dml
@@ -30,7 +30,8 @@
# k Double 1.5 a constant used to discern outliers k*IQR
# isIterative Boolean TRUE iterative repair or single repair
# repairMethod Integer 1 values: 0 = delete rows having outliers,
-# 1 = replace outliers as missing values
+# 1 = replace outliers with zeros
+# 2 = replace outliers as missing
values
# max_iterations Integer 0 values: 0 = arbitrary number of
iteration until all outliers are removed,
# n = any constant defined by user
#
---------------------------------------------------------------------------------------------
@@ -56,11 +57,11 @@ m_outlierByIQR = function(Matrix[Double] X, Double k =1.5,
Integer repairMethod
lowerBound = (Q1 - (k * IQR));
outlierFilter = X < lowerBound | X > upperBound
- if(sum(outlierFilter) > 1 & sum(X) != 0 & sumPrevious != sumNext ) {
+ if(sum(outlierFilter) > 1 & sumNext != 0 & sumPrevious != sumNext ) {
#TODO: see outlierBySd why are sumPrevious and sumNext necessary
- sumPrevious = sum(X)
- X = fix_outliers(X, outlierFilter, repairMethod)
- sumNext = sum(X)
+ sumPrevious = sum(X * !is.nan(X))
+ X = fix_outliers_iqr(X, outlierFilter, repairMethod)
+ sumNext = sum(X * !is.nan(X))
}
else
max_iterations = -1
@@ -79,7 +80,7 @@ m_outlierByIQR = function(Matrix[Double] X, Double k =1.5,
Integer repairMethod
}
}
-fix_outliers = function(Matrix[Double] X, Matrix[Double] outlierFilter,
Integer repairMethod = 1)
+fix_outliers_iqr = function(Matrix[Double] X, Matrix[Double] outlierFilter,
Integer repairMethod = 1)
return(Matrix[Double] fixed_X)
{
rows = nrow(X)
@@ -90,8 +91,13 @@ fix_outliers = function(Matrix[Double] X, Matrix[Double]
outlierFilter, Integer
}
else if(repairMethod == 1)
X = (outlierFilter == 0) * X
+ else if(repairMethod == 2)
+ {
+ outlierFilter = replace(target = (outlierFilter == 0), pattern = 0,
replacement = NaN)
+ X = outlierFilter * X
+ }
else
- stop("outlierByIQR: invalid argument - repair required 0-1 found:
"+repairMethod)
+ stop("outlierByIQR: invalid argument - repair required 0-2 found:
"+repairMethod)
fixed_X = X
}
diff --git a/scripts/builtin/outlierBySd.dml b/scripts/builtin/outlierBySd.dml
index 3b781e1..6b1f672 100644
--- a/scripts/builtin/outlierBySd.dml
+++ b/scripts/builtin/outlierBySd.dml
@@ -28,9 +28,9 @@
#
---------------------------------------------------------------------------------------------
# X Double --- Matrix X
# k Double 3 threshold values 1, 2, 3 for 68%, 95%,
99.7% respectively (3-sigma rule)
-# repairMethod Integer 1 values: 0 = delete rows having outliers,
1 = replace outliers as missing values
-# (this script replaces outliers with
zeros)
-# max_iterations Integer 0 values: 0 = arbitrary number of
iteration until all outliers are removed,
+# repairMethod Integer 1 values: 0 = delete rows having outliers,
1 = replace outliers as zeros
+# 2 = replace outliers as missing
values
+# max_iterations Integer 0 values: 0 = arbitrary number of
iteration until all outliers are removed,
# n = any constant defined by user
#
---------------------------------------------------------------------------------------------
@@ -61,11 +61,11 @@ m_outlierBySd = function(Matrix[Double] X, Double k = 3,
Integer repairMethod =
outlierFilter = (X < lowerBound) | (X > upperBound)
- if(sum(outlierFilter) > 1 & sum(X) != 0 & sumPrevious != sumNext) {
+ if(sum(outlierFilter) > 1 & sumNext != 0 & sumPrevious !=
sumNext) {
#TODO why is the check with sumPrevious and sumNext necessary
- sumPrevious = sum(X)
- X = fix_outliers(X, outlierFilter, repairMethod)
- sumNext = sum(X)
+ sumPrevious = sum(X * !is.nan(X))
+ X = fix_outliers_sd(X, outlierFilter, repairMethod)
+ sumNext = sum(X * !is.nan(X))
}
else
max_iterations = - 1;
@@ -85,7 +85,7 @@ m_outlierBySd = function(Matrix[Double] X, Double k = 3,
Integer repairMethod =
}
}
-fix_outliers = function(Matrix[Double] X, Matrix[Double] outlierFilter,
Integer repairMethod = 2)
+fix_outliers_sd = function(Matrix[Double] X, Matrix[Double] outlierFilter,
Integer repairMethod = 2)
return(Matrix[Double] fixed_X)
{
rows = nrow(X)
@@ -96,6 +96,10 @@ fix_outliers = function(Matrix[Double] X, Matrix[Double]
outlierFilter, Integer
}
else if(repairMethod == 1)
X = (outlierFilter == 0) * X
+ else if (repairMethod == 2) {
+ outlierFilter = replace(target = (outlierFilter == 0), pattern = 0,
replacement = NaN)
+ X = outlierFilter * X
+ }
else
stop("outlierBySd: invalid argument - repair required 0-1 found:
"+repairMethod)
diff --git a/scripts/builtin/winsorize.dml b/scripts/builtin/winsorize.dml
index 53c20ce..62e2d53 100644
--- a/scripts/builtin/winsorize.dml
+++ b/scripts/builtin/winsorize.dml
@@ -20,6 +20,13 @@
#-------------------------------------------------------------
m_winsorize = function(Matrix[Double] X) return (Matrix[Double] Y) {
+ Y = matrix(0, nrow(X), ncol(X))
+ parfor(i in 1:ncol(X))
+ Y[,i] = fixOutliersWinsorize(X[,i])
+}
+
+fixOutliersWinsorize = function(Matrix[Double] X) return (Matrix[Double] Y)
+{
# compute quantiles for lower and upper probs
q = quantile(X, matrix("0.05 0.95", rows=2, cols=1));
ql = as.scalar(q[1,1]);
diff --git
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinMiceTest.java
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinMiceTest.java
index ade84be..5c3ad22 100644
--- a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinMiceTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinMiceTest.java
@@ -45,20 +45,30 @@ public class BuiltinMiceTest extends AutomatedTestBase {
addTestConfiguration(TEST_NAME, new
TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{"B"}));
}
@Test
- public void testMiceCP() {
- runMiceNominalTest( LopProperties.ExecType.CP);
+ public void testMiceMixCP() {
+ double[][] mask = {{ 0.0, 0.0, 1.0, 1.0, 0.0}};
+ runMiceNominalTest(mask, 1, LopProperties.ExecType.CP);
}
-// @Test
-// public void testMiceSpark() {
-// runMiceNominalTest( LopProperties.ExecType.SPARK);
-// }
+ @Test
+ public void testMiceNumberCP() {
+ double[][] mask = {{ 0.0, 0.0, 0.0, 0.0, 0.0}};
+ runMiceNominalTest(mask, 2, LopProperties.ExecType.CP);
+ }
+ @Test
+ public void testMiceCategoricalCP() {
+ double[][] mask = {{ 1.0, 1.0, 1.0, 1.0, 1.0}};
+ runMiceNominalTest(mask, 3, LopProperties.ExecType.CP);
+ }
+ // @Test
+ // public void testMiceSpark() {
+ // runMiceNominalTest( LopProperties.ExecType.SPARK);
+ // }
- private void runMiceNominalTest( LopProperties.ExecType instType) {
+ private void runMiceNominalTest(double[][] mask, int testType,
LopProperties.ExecType instType) {
Types.ExecMode platformOld = setExecMode(instType);
try {
- double[][] mask = {{ 0.0, 0.0, 1.0, 1.0, 0.0}};
loadTestConfiguration(getTestConfiguration(TEST_NAME));
String HOME = SCRIPT_DIR + TEST_DIR;
fullDMLScriptName = HOME + TEST_NAME + ".dml";
@@ -71,30 +81,53 @@ public class BuiltinMiceTest extends AutomatedTestBase {
runTest(true, false, null, -1);
runRScript(true);
- //compare matrices
- HashMap<MatrixValue.CellIndex, Double> dmlfileN =
readDMLMatrixFromHDFS("N");
- HashMap<MatrixValue.CellIndex, Double> rfileN =
readRMatrixFromFS("N");
-
- HashMap<MatrixValue.CellIndex, Double> dmlfileC =
readDMLMatrixFromHDFS("C");
- HashMap<MatrixValue.CellIndex, Double> rfileC =
readRMatrixFromFS("C");
- // compare numerical imputations
- TestUtils.compareMatrices(dmlfileN, rfileN, eps,
"Stat-DML", "Stat-R");
- // compare categorical imputations
- int countTrue = 0;
- for (MatrixValue.CellIndex index : dmlfileC.keySet()) {
- Double v1 = dmlfileC.get(index);
- Double v2 = rfileC.get(index);
- if(v1.equals(v2))
- countTrue++;
- }
-
- if(countTrue / (double)dmlfileC.size() > 0.98)
- Assert.assertTrue(true);
- else
- Assert.fail();
+
+ switch (testType)
+ {
+ case 1:
+ testCategoricalOutput();
+ testNumericOutput();
+ break;
+ case 2:
+ testNumericOutput();
+ break;
+ case 3:
+ testCategoricalOutput();
+ break;
+ }
}
finally {
rtplatform = platformOld;
}
}
+
+ private void testNumericOutput()
+ {
+ //compare matrices
+ HashMap<MatrixValue.CellIndex, Double> dmlfileN =
readDMLMatrixFromHDFS("N");
+ HashMap<MatrixValue.CellIndex, Double> rfileN =
readRMatrixFromFS("N");
+
+ // compare numerical imputations
+ TestUtils.compareMatrices(dmlfileN, rfileN, eps, "Stat-DML",
"Stat-R");
+
+ }
+ private void testCategoricalOutput()
+ {
+ HashMap<MatrixValue.CellIndex, Double> dmlfileC =
readDMLMatrixFromHDFS("C");
+ HashMap<MatrixValue.CellIndex, Double> rfileC =
readRMatrixFromFS("C");
+
+ // compare categorical imputations
+ int countTrue = 0;
+ for (MatrixValue.CellIndex index : dmlfileC.keySet()) {
+ Double v1 = dmlfileC.get(index);
+ Double v2 = rfileC.get(index);
+ if(v1.equals(v2))
+ countTrue++;
+ }
+
+ if(countTrue / (double)dmlfileC.size() > 0.98)
+ Assert.assertTrue(true);
+ else
+ Assert.fail();
+ }
}
\ No newline at end of file
diff --git
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinOutlierByIQRTest.java
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinOutlierByIQRTest.java
index de874df..3f22724 100644
---
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinOutlierByIQRTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinOutlierByIQRTest.java
@@ -83,6 +83,15 @@ public class BuiltinOutlierByIQRTest extends
AutomatedTestBase {
runOutlierTest(false, 1.5, 1, 0,LopProperties.ExecType.SPARK);
}
+ @Test
+ public void testOutlierRepair2IterativeCP() {
+ runOutlierTest(false, 1.5, 2, 0,LopProperties.ExecType.CP);
+ }
+
+ @Test
+ public void testOutlierRepair2IterativeSP() {
+ runOutlierTest(false, 1.5, 2, 0,LopProperties.ExecType.SPARK);
+ }
private void runOutlierTest(boolean sparse, double k, int repair, int
max_iterations, LopProperties.ExecType instType)
{
diff --git
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinOutlierBySDTest.java
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinOutlierBySDTest.java
index 38b4d12..2c04fdd 100644
---
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinOutlierBySDTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinOutlierBySDTest.java
@@ -44,42 +44,47 @@ public class BuiltinOutlierBySDTest extends
AutomatedTestBase {
@Test
public void testOutlierRepair0CP() {
- runOutlierTest(false, 2,0, 0, LopProperties.ExecType.CP);
+ runOutlierTest(false, 2, 0, 0, LopProperties.ExecType.CP);
}
@Test
public void testOutlierRepair1CP() {
- runOutlierTest(false, 2,1, 0, LopProperties.ExecType.CP);
+ runOutlierTest(false, 2, 1, 0, LopProperties.ExecType.CP);
}
@Test
public void testOutlierRepair2CP() {
- runOutlierTest(false, 2,2, 10, LopProperties.ExecType.CP);
+ runOutlierTest(false, 2, 2, 10, LopProperties.ExecType.CP);
+ }
+
+ @Test
+ public void testOutlierRepair2SP() {
+ runOutlierTest(false, 2, 2, 0, LopProperties.ExecType.CP);
}
@Test
public void testOutlierRepair0SP() {
- runOutlierTest(false, 2,0, 10, LopProperties.ExecType.SPARK);
+ runOutlierTest(false, 2, 0, 10, LopProperties.ExecType.SPARK);
}
@Test
public void testOutlierRepair1SP() {
- runOutlierTest(false, 2,1, 0, LopProperties.ExecType.SPARK);
+ runOutlierTest(false, 2, 1, 10, LopProperties.ExecType.SPARK);
}
@Test
public void testOutlierK3CP() {
- runOutlierTest(true, 3,1, 10,LopProperties.ExecType.CP);
+ runOutlierTest(true, 3, 1, 10,LopProperties.ExecType.CP);
}
@Test
public void testOutlierIterativeCP() {
- runOutlierTest(false, 2,1, 0, LopProperties.ExecType.CP);
+ runOutlierTest(false, 2, 1, 0, LopProperties.ExecType.CP);
}
@Test
public void testOutlierIterativeSP() {
- runOutlierTest(false, 2,1, 0, LopProperties.ExecType.SPARK);
+ runOutlierTest(false, 2, 1, 10, LopProperties.ExecType.SPARK);
}
private void runOutlierTest(boolean sparse, double k, int repair, int
max_iterations, LopProperties.ExecType instType)
diff --git
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinWinsorizeTest.java
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinWinsorizeTest.java
index c79c105..6cee661 100644
---
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinWinsorizeTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinWinsorizeTest.java
@@ -35,7 +35,7 @@ public class BuiltinWinsorizeTest extends AutomatedTestBase
private final static String TEST_DIR = "functions/builtin/";
private static final String TEST_CLASS_DIR = TEST_DIR +
BuiltinWinsorizeTest.class.getSimpleName() + "/";
- private final static double eps = 1e-4;
+ private final static double eps = 1e-3;
private final static int rows = 1765;
private final static double spDense = 0.99;
@@ -69,7 +69,7 @@ public class BuiltinWinsorizeTest extends AutomatedTestBase
rCmd = "Rscript" + " " + fullRScriptName + " " +
inputDir() + " " + expectedDir();
//generate actual dataset
- double[][] A = getRandomMatrix(rows, 1, -1, 1, spDense,
7);
+ double[][] A = getRandomMatrix(rows, 10, -1, 1,
spDense, 7);
writeInputMatrixWithMTD("A", A, true);
runTest(true, false, null, -1);
diff --git a/src/test/scripts/functions/builtin/mice.R
b/src/test/scripts/functions/builtin/mice.R
index 373832b..2237d7c 100644
--- a/src/test/scripts/functions/builtin/mice.R
+++ b/src/test/scripts/functions/builtin/mice.R
@@ -26,43 +26,68 @@ library(dplyr)
d <- read.csv(args[1], header=FALSE )
mass <- as.matrix(readMM(paste(args[2], "M.mtx", sep="")));
+
+if(sum(mass) == ncol(d))
+{
+d = d[,3:4]
+mass = mass[1,3:4]
meth=""
-for(i in 1: ncol(mass)) {
- if(as.integer(mass[1,i]) == 1) {
- d[[names(d)[i]]] = as.factor(d[[names(d)[i]]]);
- meth = c(meth, "polyreg")
- }
- else
- meth = c(meth, "norm.predict")
- }
+ for(i in 1: 2) {
+ d[[names(d)[i]]] = as.factor(d[[names(d)[i]]]);
+ meth = c(meth, "polyreg")
+ }
+
+ meth=meth[-1]
-meth=meth[-1]
-# set the prediction matrix
-pred <- make.predictorMatrix(d)
-pred = pred * diag(1, ncol(mass))
+ #impute
+ imputeD <- mice(d,where = is.na(d), method = meth, m=3)
+ R = data.frame(complete(imputeD,3))
+ c = select_if(R, is.factor)
-pred[names(d)[1], names(d)[2]] = 1
-pred[names(d)[2], names(d)[1]] = 1
+ # convert factor into numeric before casting to matrix
+ c = sapply(c, function(x) as.numeric(as.character(x)))
+ writeMM(as(as.matrix(c), "CsparseMatrix"), paste(args[3], "C", sep=""));
+} else if (sum(mass) == 0)
+{
+ print("Generating R witout cat")
+ imputeD <- mice(d,where = is.na(d), method = "norm.predict", m=3)
+ R = data.frame(complete(imputeD,3))
+ n = select_if(R, is.numeric)
+ writeMM(as(as.matrix(n), "CsparseMatrix"), paste(args[3], "N", sep=""));
+} else {
+ meth=""
+ for(i in 1: ncol(mass)) {
+ if(as.integer(mass[1,i]) == 1) {
+ d[[names(d)[i]]] = as.factor(d[[names(d)[i]]]);
+ meth = c(meth, "polyreg")
+ } else meth = c(meth, "norm.predict")
+ }
-pred[names(d)[1], names(d)[4]] = 1
-pred[names(d)[4], names(d)[1]] = 1
+ meth=meth[-1]
+ # set the prediction matrix
+ pred <- make.predictorMatrix(d)
+ pred = pred * diag(1, ncol(mass))
-pred[names(d)[2], names(d)[4]] = 1
-pred[names(d)[4], names(d)[2]] = 1
+ pred[names(d)[1], names(d)[2]] = 1
+ pred[names(d)[2], names(d)[1]] = 1
-pred[names(d)[3], names(d)[4]] = 1
-pred[names(d)[4], names(d)[3]] = 1
+ pred[names(d)[1], names(d)[4]] = 1
+ pred[names(d)[4], names(d)[1]] = 1
+ pred[names(d)[2], names(d)[4]] = 1
+ pred[names(d)[4], names(d)[2]] = 1
-#impute
-imputeD <- mice(d,where = is.na(d), method = meth, m=3, pred = pred)
-R = data.frame(complete(imputeD,3))
+ pred[names(d)[3], names(d)[4]] = 1
+ pred[names(d)[4], names(d)[3]] = 1
-n =select_if(R, is.numeric)
-c = select_if(R, is.factor)
-# convert factor into numeric before casting to matrix
-c = sapply(c, function(x) as.numeric(as.character(x)))
-
-writeMM(as(as.matrix(n), "CsparseMatrix"), paste(args[3], "N", sep=""));
-writeMM(as(as.matrix(c), "CsparseMatrix"), paste(args[3], "C", sep=""));
+#impute
+ imputeD <- mice(d,where = is.na(d), method = meth, m=3, pred = pred)
+ R = data.frame(complete(imputeD,3))
+ c = select_if(R, is.factor)
+ # convert factor into numeric before casting to matrix
+ c = sapply(c, function(x) as.numeric(as.character(x)))
+ n = select_if(R, is.numeric)
+ writeMM(as(as.matrix(c), "CsparseMatrix"), paste(args[3], "C", sep=""));
+ writeMM(as(as.matrix(n), "CsparseMatrix"), paste(args[3], "N", sep=""));
+}
diff --git a/src/test/scripts/functions/builtin/mice.dml
b/src/test/scripts/functions/builtin/mice.dml
index 52dd72a..7736f56 100644
--- a/src/test/scripts/functions/builtin/mice.dml
+++ b/src/test/scripts/functions/builtin/mice.dml
@@ -21,11 +21,25 @@
X = read($X, data_type="frame", format="csv");
M = read($Mask)
-[dataset, singleSet]= mice(F=X, cMask=M, iter=$iteration, complete=$com)
-n = as.matrix(dataset) * (1-M)
-n = removeEmpty(target=n, margin = "cols")
-c = as.matrix(dataset) * (M)
-c = removeEmpty(target=c, margin = "cols")
-write(n, $dataN)
-write(c, $dataC)
+[dataset, singleSet]= mice(F=X, cMask=M, iter=$iteration, complete=$com,
verbose = FALSE)
+if(sum(M) == ncol(X))
+{
+ c = as.matrix(singleSet[,3:4]) # comparing only selected columns with R
results because dataset is continuos and
+ write(c, $dataC) # for categorical imputation R polyreg only
support upto 50 distinct items (50 categories/feature)
+}
+else if (sum(M) == 0)
+{
+ n = as.matrix(dataset) * (1-M)
+ n = removeEmpty(target=n, margin = "cols")
+ write(n, $dataN)
+}
+else
+{
+ c = as.matrix(dataset) * (M)
+ c = removeEmpty(target=c, margin = "cols")
+ n = as.matrix(dataset) * (1-M)
+ n = removeEmpty(target=n, margin = "cols")
+ write(n, $dataN)
+ write(c, $dataC)
+}
\ No newline at end of file
diff --git a/src/test/scripts/functions/builtin/outlier_by_IQR.dml
b/src/test/scripts/functions/builtin/outlier_by_IQR.dml
index a6da102..6eb9510 100644
--- a/src/test/scripts/functions/builtin/outlier_by_IQR.dml
+++ b/src/test/scripts/functions/builtin/outlier_by_IQR.dml
@@ -20,5 +20,5 @@
#-------------------------------------------------------------
X = read($1);
-Y = outlierByIQR(X, $2, $3, $4, TRUE);
+Y = outlierByIQR(X, $2, $3, $4, FALSE);
write(Y, $5)
diff --git a/src/test/scripts/functions/builtin/outlier_by_sd.dml
b/src/test/scripts/functions/builtin/outlier_by_sd.dml
index 69098f0..d117d73 100644
--- a/src/test/scripts/functions/builtin/outlier_by_sd.dml
+++ b/src/test/scripts/functions/builtin/outlier_by_sd.dml
@@ -20,5 +20,5 @@
#-------------------------------------------------------------
X = read($1);
-Y = outlierBySd(X, $2, $3, $4, TRUE);
+Y = outlierBySd(X, $2, $3, $4, FALSE);
write(Y, $5)
diff --git a/src/test/scripts/functions/builtin/winsorize.R
b/src/test/scripts/functions/builtin/winsorize.R
index 92b3324..c917b17 100644
--- a/src/test/scripts/functions/builtin/winsorize.R
+++ b/src/test/scripts/functions/builtin/winsorize.R
@@ -25,6 +25,8 @@ library("Matrix")
library("DescTools")
X = as.matrix(readMM(paste(args[1], "A.mtx", sep="")))
-Y = Winsorize(X);
+Y = matrix(0, nrow(X), ncol(X))
+for(i in 1:ncol(X))
+ Y[,i] = Winsorize(X[,i]);
writeMM(as(Y, "CsparseMatrix"), paste(args[2], "B", sep=""));
\ No newline at end of file
diff --git a/src/test/scripts/functions/caching/BufferpoolLeak.dml
b/src/test/scripts/functions/caching/BufferpoolLeak.dml
index b7c2ab4..6a50ea5 100644
--- a/src/test/scripts/functions/caching/BufferpoolLeak.dml
+++ b/src/test/scripts/functions/caching/BufferpoolLeak.dml
@@ -22,7 +22,7 @@
X = rand(rows=$1, cols=$2, min=1, max=10);
for(i in 1:500) {
#print("executed iteration "+i)
- [m1,m2] = mice(as.frame(X), matrix(0,1,ncol(X)),3,3)
+ [m1,m2] = mice(as.frame(X), matrix(0,1,ncol(X)),3,3, FALSE)
}
if( ncol(X) > $2 )
print(toString(m1));