This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git
The following commit(s) were added to refs/heads/master by this push:
new f450ead [MINOR] Script-level improvements mice builtin function
f450ead is described below
commit f450ead5506d1615b5979bee85b39891e0f0fc00
Author: Matthias Boehm <[email protected]>
AuthorDate: Sat Apr 25 19:40:58 2020 +0200
[MINOR] Script-level improvements mice builtin function
* Loop vectorization of scalar assignment
* Removed unnecessary branch for table padding
* Minor modifications of rmEmpty use to increase common subexpression
elimination
---
scripts/builtin/mice.dml | 44 +++++++++++++++++++-------------------------
1 file changed, 19 insertions(+), 25 deletions(-)
diff --git a/scripts/builtin/mice.dml b/scripts/builtin/mice.dml
index 3f3c325..99d2be2 100644
--- a/scripts/builtin/mice.dml
+++ b/scripts/builtin/mice.dml
@@ -56,12 +56,12 @@ return(Frame[String] dataset, Frame[String] singleSet)
col = ncol(F)
Result = matrix(0, rows=1, cols = col)
Mask_Result = matrix(0, rows=1, cols=col)
- cat = t(cMask) * seq(1, ncol(cMask))
- cat = removeEmpty(target = cat, margin = "rows")
+ scat = seq(1, ncol(cMask))
+ cat = removeEmpty(target=scat, margin="rows", select=t(cMask))
s=""
for(i in 1: nrow(cat), check =0)
- s = s+as.integer(as.scalar(cat[i, 1]))+",";
-
+ s = s+as.integer(as.scalar(cat[i, 1]))+",";
+
# encoding categorical columns using recode transformation
jspecR = "{ids:true, recode:["+s+"]}";
@@ -70,7 +70,7 @@ return(Frame[String] dataset, Frame[String] singleSet)
XO = replace(target=X, pattern=NaN, replacement=0);
# remove categorical features and impute continous features with mean
- eX_n = removeEmpty(target=X, margin="cols", select=(1-cMask))
+ eX_n = removeEmpty(target=X, margin="cols", select=(cMask==0))
col_n = ncol(eX_n);
# storing the mask/address of missing values
Mask_n = is.na(eX_n);
@@ -80,7 +80,7 @@ return(Frame[String] dataset, Frame[String] singleSet)
# filling the missing data with their means
X2_n = eX_n+(Mask_n*colMeans(eX_n))
# matrices for computing actul data
- p_n = table( (seq(1, ncol(eX_n))) , (removeEmpty(target = t(cMask==0)*seq(1,
ncol(cMask)), margin ="rows")) , 1 )
+ p_n = table(seq(1, ncol(eX_n)), removeEmpty(target=scat, margin="rows",
select=t(cMask==0)))
if(ncol(p_n) < ncol(cMask))
p_n = cbind(p_n, matrix(0, nrow(p_n), ncol(cMask)-ncol(p_n)))
q = XO * cMask
@@ -91,8 +91,7 @@ return(Frame[String] dataset, Frame[String] singleSet)
eX_c2 = removeEmpty(target = eX_c, margin = "rows", select = (rowSums(eX_c
!= 0)==col_c))
colMod = matrix(0, 1, ncol(eX_c))
# compute columnwise mode
- parfor(i in 1: col_c)
- {
+ parfor(i in 1: col_c) {
f = eX_c2[, i] # adding one in data for dealing with zero category
cat_counts = table(f, 1, n, 1); # counts for each category
mode = as.scalar(rowIndexMax(t(cat_counts)));
@@ -100,13 +99,10 @@ return(Frame[String] dataset, Frame[String] singleSet)
}
# find the mask of missing values
- tmpMask_c = (eX_c == 0);
- tmpMask_c = (tmpMask_c * colMod) # fill missing values with mode
+ tmpMask_c = (eX_c==0) * colMod # fill missing values with mode
# Generate a matrix of actual length
- p_c = table((seq(1, ncol(tmpMask_c))) , (removeEmpty(target =
t(cMask)*seq(1, ncol(cMask)), margin ="rows")), 1)
- if(ncol(p_c) < ncol(cMask))
- p_c = cbind(p_c, matrix(0, nrow(p_c), ncol(cMask)-ncol(p_c)))
+ p_c = table(seq(1, ncol(tmpMask_c)), removeEmpty(target=scat, margin
="rows", select=t(cMask)), ncol(tmpMask_c), ncol(cMask))
Mask_c = tmpMask_c %*% p_c
inverseMask_c = Mask_c == 0
@@ -131,14 +127,13 @@ return(Frame[String] dataset, Frame[String] singleSet)
dXMask = matrix(0, 1, ncol(dX))
index = 1
for(k in 1:col) {
- if(as.scalar(dcDistincts[1,k]) != 0) {
- for(l in 1:as.scalar(dcDistincts[1,k])){
- dXMask[1,index] = 1
- index = index +1
- }
+ nDistk = as.scalar(dcDistincts[1,k]);
+ if(nDistk != 0) {
+ dXMask[1,index:(index+nDistk-1)] = matrix(1,1,nDistk)
+ index += nDistk;
}
else
- index = index +1
+ index += 1
}
#multiple imputations
@@ -149,7 +144,6 @@ return(Frame[String] dataset, Frame[String] singleSet)
in_n = 1; in_c = 1; i=1; j=1; # varibales for index selection
while(i <= ncol(dX))
{
-
if(as.scalar(dXMask[1,i]) == 0)
{
# construct column selector
@@ -175,7 +169,7 @@ return(Frame[String] dataset, Frame[String] singleSet)
}
if((as.scalar(dXMask[1,i]) == 1) & (sum(Mask_c[, in_c]) != 0))
- {
+ {
j = (i + as.scalar(dist[1,in_c])) - 1
# construct column selector
@@ -194,8 +188,8 @@ return(Frame[String] dataset, Frame[String] singleSet)
test_X = removeEmpty(target = slice2, margin = "cols", select = selX);
test_Y = slice2a[,in_c]
- # train clasification model
- beta = multiLogReg(X=train_X, Y=train_Y, icpt = 1, tol = 0.00000001,
reg = 0.001, maxi = 100, maxii=0, verbose=FALSE)
+ # train clasification model
+ beta = multiLogReg(X=train_X, Y=train_Y, icpt = 1, tol = 0.00000001,
reg = 0.001, maxi = 100, maxii=0, verbose=FALSE)
# predicting missing values
[prob,pred,acc] = multiLogRegPredict(X=test_X, B=beta, Y = test_Y)
# imputing missing column values (assumes Mask_Filled being 0/1-matrix)
@@ -209,7 +203,7 @@ return(Frame[String] dataset, Frame[String] singleSet)
in_c = in_c + 1
i = i+1;
}
-
+
nM = ((Mask_Filled_n) %*% p_n) + Mask_Filled_c
Result = rbind(Result, nM+XO)
Mask_Result = rbind(Mask_Result, nM)
@@ -266,7 +260,7 @@ return (Matrix[Double] agg)
if(sum(u1 != s ) == 0)
uCount = uCount + 1
if(sum(v1 != s) == 0)
- vCount = vCount + 1
+ vCount = vCount + 1
}
# copy the results of u in v
if(uCount > vCount)