This is an automated email from the ASF dual-hosted git repository.
ssiddiqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new cf0acf4 [SYSTEMDS-2961] Refactor Cleaning Pipelines (2) 1. remove redundant functions 2. return execution time in executePipelines and CV 3. store execution time for each pipeline in the feature vector
cf0acf4 is described below
commit cf0acf4f9fd53419cd85da1b8cef94051b201793
Author: Shafaq Siddiqi <[email protected]>
AuthorDate: Thu Apr 22 13:39:16 2021 +0200
[SYSTEMDS-2961] Refactor Cleaning Pipelines (2)
1. remove redundant functions
2. return execution time in executePipelines and CV
3. store execution time for each pipeline in the feature vector
Closes #1246.
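For illustration only (not part of the original commit message): a minimal DML sketch of the new call pattern introduced here, in which executePipeline returns its execution time and the scoring functions return their cross-validation/comparison time, and both timings are stored in the per-pipeline feature row. Names (pipeline, X, Y, mask, fd, hp_matrix, feaVec, feaFrame, r) follow the diff below; concrete values are assumed.

  # executePipeline now returns the pipeline execution time T (in ms) as a third output
  [X, Y, T] = executePipeline(pipeline, X, Y, mask, fd, hp_matrix, no_of_flag_vars, FALSE)
  # fclassify (and compareValue) now additionally return the scoring time accT (in ms)
  [accuracy, accT] = fclassify(X, Y, mask, mlHp, dirAcc, wAccuracy, cv)
  # feature row = dataset statistics + pipeline string + accuracy + both timings
  feaFrame[r, (ncol(feaVec)+1)] = pipToString(pipeline)
  feaFrame[r, (ncol(feaVec)+2)] = accuracy
  feaFrame[r, (ncol(feaVec)+3)] = T     # pipeline execution time in ms
  feaFrame[r, (ncol(feaVec)+4)] = accT  # CV/compare time in ms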
---
scripts/builtin/bandit.dml | 173 ++++++++++++----
scripts/builtin/discoverFD.dml | 4 +-
scripts/builtin/executePipeline.dml | 9 +-
scripts/builtin/imputeByMedian.dml | 2 +-
scripts/pipelines/scripts/utils.dml | 226 ++-------------------
.../test/functions/builtin/BuiltinMiceTest.java | 2 +-
.../functions/pipelines/compareAccuracy.dml | 102 +---------
.../pipelines/intermediates/hyperparams.csv | 10 +-
.../pipelines/intermediates/pipelines.csv | 2 +-
.../scripts/functions/pipelines/mainScript.dml | 2 +-
.../functions/pipelines/testClassification.dml | 92 +++------
.../scripts/functions/pipelines/testCompare.dml | 8 +-
12 files changed, 201 insertions(+), 431 deletions(-)
diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml
index 6523354..687d76b 100644
--- a/scripts/builtin/bandit.dml
+++ b/scripts/builtin/bandit.dml
@@ -18,7 +18,6 @@
# under the License.
#
#-------------------------------------------------------------
-source("scripts/pipelines/scripts/utils.dml") as utils;
m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train,
List[Unknown] metaList, List[Unknown] targetList,
Frame[Unknown] lp, Frame[Unknown] primitives, Frame[Unknown] param, Integer k = 3, Integer R=50, Boolean verbose = TRUE)
@@ -37,15 +36,17 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, List[Unknown
B = (s_max + 1) * R;
# initialize output variables
- hparam = matrix(0, rows=k*(s_max+1), cols=55)
+ hparam = matrix(0, rows=k*(s_max+1), cols=100)
pipeline = frame(0, rows=k*(s_max+1), cols=ncol(lp)+1)
startOut=0; endOut=0;
- feaFrameOuter = frame("", rows = 1, cols = NUM_FEATURES + ncol(lp) + 1 )
+ feaFrameOuter = frame(data=["#MissingValues", "MinVla", "MaxVal", "AverageMin", "AverageMax",
+ "#CategoricalFeatures", "#NumericFeatures", "Mean", "#Outliers", "#OHEfeatures", "#Classes",
+ "Imbalance", "#rows", "#cols", "pipelines", "accuracy", "execution time in ms", "CV time in ms"], rows = 1, cols = NUM_FEATURES + 4 )
- for(s in s_max:0, check = 0) {
+ for(s in s_max:0) {
# result variables
- bracket_hp = matrix(0, rows=k*(s+1)+k, cols=55)
+ bracket_hp = matrix(0, rows=k*(s+1)+k, cols=100)
bracket_pipel = matrix(0, rows=k*(s+1)+k, cols=3)
start=1; end=0;
@@ -62,9 +63,9 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, List[Unknown
lookup = configurations
if(verbose)
- print("n "+ n +"\n R "+ R +"\n s_max "+ s_max +"\n B "+ B +"\n n "+ n
+"\n r "+ r)
+ print("n "+ n +"\nR "+ R +"\ns_max "+ s_max +"\nB "+ B +"\nn "+ n +"\nr
"+ r)
- for( i in 0:s, check=0 ) {
+ for( i in 0:s) {
# successive halving
n_i = min(max(as.integer(floor(n * eta^(-i))), 1), nrow(configurations));
r_i = as.integer(floor(r * eta^i));
@@ -72,7 +73,7 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, List[Unknown
if(verbose) {
print("no of configurations ---------"+n_i)
print("no of resources --------------"+r_i)
- print("iteration ---------------------"+i)
+ print("iteration ---------------------"+i+" out of "+s)
}
configurations = configurations[1:n_i, ]
@@ -137,36 +138,34 @@ get_physical_configurations = function(Frame[String] logical, Scalar[int] numCon
dim = primitives[,5]
dummy = primitives[,6]
scale = primitives[,7]
-
- operator = as.frame(matrix(0,nrow(outliers),1)) #combine all logical primitives
+ operator = frame(0, rows=nrow(primitives), cols=ncol(logical)) #as.frame(matrix(0,nrow(outliers),1)) #combine all logical primitives
for(j in 1:ncol(logical))
{
# extract the physical primitives
if(as.scalar(logical[1,j]) == "OTLR")
- operator = cbind(operator, outliers);
+ operator[, j] = outliers;
else if(as.scalar(logical[1,j]) == "MVI")
- operator = cbind(operator, mvi);
+ operator[, j] = mvi;
else if(as.scalar(logical[1,j]) == "NR")
- operator = cbind(operator, noise);
+ operator[, j] = noise;
else if(as.scalar(logical[1,j]) == "CI")
- operator = cbind(operator, ci);
+ operator[, j] = ci;
else if(as.scalar(logical[1,j]) == "DIM")
- operator = cbind(operator, dim);
+ operator[, j] = dim;
else if(as.scalar(logical[1,j]) == "DUMMY")
- operator = cbind(operator, dummy);
+ operator[, j] = dummy;
else if(as.scalar(logical[1,j]) == "SCALE")
- operator = cbind(operator, scale);
+ operator[, j] = scale;
else stop("invalid operation "+as.scalar(logical[1,j]))
}
- opt = operator[,2:ncol(operator)]
idx = matrix(1, rows=1, cols=ncol(logical))
# get the indexes of columns for recode transformation
index = vectorToCsv(idx)
# recode logical pipelines for easy handling
jspecR = "{ids:true, recode:["+index+"]}";
- [X, M] = transformencode(target=opt, spec=jspecR);
+ [X, M] = transformencode(target=operator, spec=jspecR);
X = replace(target= X, pattern = NaN, replacement = 0)
paramLens = matrix(0, ncol(logical), 1);
@@ -202,7 +201,7 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i, Matrix[Double
List[Unknown] targetList, Frame[Unknown] param, Frame[Unknown] featureFrameOuter, Boolean verbose)
return (Matrix[Double] output_operator, Matrix[Double] output_hyperparam, Frame[Unknown] featureFrameOuter) {
- output_hp = matrix(0, nrow(ph_pip)*r_i, 50)
+ output_hp = matrix(0, nrow(ph_pip)*r_i, 60)
output_accuracy = matrix(0, nrow(ph_pip)*r_i, 1)
output_pipelines = matrix(0, nrow(ph_pip)*r_i, 2)
@@ -213,13 +212,14 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i, Matrix[Double
id = as.matrix(ph_pip[, 1])
ph_pip = ph_pip[, 2:ncol(ph_pip)]
- feaVec = utils::gatherStats(X, Y, as.matrix(metaList['mask']), as.scalar(targetList['target']))
+ feaVec = gatherStats(X, Y, as.matrix(metaList['mask']), as.scalar(targetList['target']))
for(i in 1:nrow(ph_pip))
{
# execute configurations with r resources
[hp, no_of_res, no_of_flag_vars] = getHyperparam(ph_pip[i], param, r_i)
- feaFrame = frame("", rows = no_of_res, cols = ncol(feaVec) + ncol(ph_pip) + 1)
+ feaFrame = frame("", rows = no_of_res, cols = ncol(featureFrameOuter))
+ pip_toString = pipToString(ph_pip[i])
for(r in 1:no_of_res)
{
# as the matrix first block of r rows belongs to first operator and r+1 block of rows to second operator
@@ -229,11 +229,11 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i, Matrix[Double
indexes = cumsum(indexes)
indexes = table(indexes, 1, 1, nrow(hp), 1)
hp_matrix = removeEmpty(target = hp, margin="rows", select = indexes)
- [X, Y] = executePipeline(ph_pip[i], X, Y, as.matrix(metaList['mask']), as.matrix(metaList['fd']), hp_matrix, no_of_flag_vars, FALSE)
+ [X, Y, T] = executePipeline(ph_pip[i], X, Y, as.matrix(metaList['mask']), as.matrix(metaList['fd']), hp_matrix, no_of_flag_vars, FALSE)
if(as.scalar(targetList['target']) == "compare")
- accuracy = utils::compareValue(clone_X, X, as.matrix(targetList['cleanData']), as.matrix(metaList['mask']))
+ [accuracy, accT] = compareValue(clone_X, X, as.matrix(targetList['cleanData']), as.matrix(metaList['mask']))
else
- accuracy = fclassify(X, Y, as.matrix(metaList['mask']), as.matrix(targetList['mlHp']), as.scalar(targetList['dirAcc']),
+ [accuracy, accT] = fclassify(X, Y, as.matrix(metaList['mask']), as.matrix(targetList['mlHp']), as.scalar(targetList['dirAcc']),
as.scalar(targetList['wAccuracy']), as.scalar(targetList['cv']))
matrix_width = as.matrix(nrow(hp_matrix) * ncol(hp_matrix))
hp_vec = cbind(matrix_width, matrix(hp_matrix, rows=1, cols=nrow(hp_matrix)*ncol(hp_matrix), byrow=TRUE))
@@ -244,8 +244,10 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i, Matrix[Double
Y = clone_Y
index = index + 1
feaFrame[r, 1:ncol(feaVec)] = as.frame(feaVec)
- feaFrame[r, ncol(feaVec)+1:ncol(feaVec)+ncol(ph_pip[1])] = ph_pip[i]
- feaFrame[r, ncol(feaFrame)] = accuracy
+ feaFrame[r, (ncol(feaVec)+1)] = pip_toString
+ feaFrame[r, (ncol(feaVec)+2)] = accuracy
+ feaFrame[r, (ncol(feaVec)+3)] = T
+ feaFrame[r, (ncol(feaVec)+4)] = accT
}
X = clone_X
@@ -406,14 +408,11 @@ extractTopK = function(Frame[Unknown] pipeline, Matrix[Double] hyperparam,
}
-
-
# extract the top k pipelines for each bracket, the intermediate results
extractBracketWinners = function(Matrix[Double] pipeline, Matrix[Double] hyperparam,
Integer k, Frame[Unknown] conf)
return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams)
{
-
# bestPipeline = frameSort(bestPipeline)
hyperparam = order(target = hyperparam, by = 1, decreasing=TRUE, index.return=FALSE)
pipeline = order(target = pipeline, by = 1, decreasing=TRUE, index.return=FALSE)
@@ -443,11 +442,12 @@ return (Frame[Unknown] maxperconf)
maxperconf[1:ncol(tab),] = as.frame(t(colMaxs(tab)))
}
-
-# function to classify the data using cross validation
+###########################################################################
+## function to classify the data using cross validation
+############################################################################
fclassify = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask, Matrix[Double] MLhp,
Double testAccuracy, Boolean isWeighted, Integer cv=3)
- return (Double accuracy)
+ return (Double accuracy, Double T)
{
if(max(Y) == min(Y)) {
@@ -457,11 +457,13 @@ fclassify = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask, Ma
else {
print("STARTING "+cv+" CROSS VALIDATIONS")
# do the k = 3 cross validations
+ t1 = time()
accuracyMatrix = crossV(X, Y, cv, mask, MLhp, isWeighted)
+ T = floor((time() - t1) / 1e+6)
accuracyMatrix = removeEmpty(target=accuracyMatrix, margin="rows")
acc = colMeans(accuracyMatrix)
accuracy = as.scalar(acc[1,1])
- print("validation accuracy "+accuracy)
+ print(cv +" validation accuracy "+accuracy+" in "+T+" ms\n\n")
}
}
@@ -477,9 +479,7 @@ crossV = function(Matrix[double] X, Matrix[double] y, Integer k, Matrix[Double]
Matrix[Double] MLhp, Boolean isWeighted)
return (Matrix[Double] accuracyMatrix)
{
-
accuracyMatrix = matrix(0, k, 1)
-
dataList = list()
testL = list()
data = order(target = cbind(y, X), by = 1, decreasing=FALSE, index.return=FALSE)
@@ -499,20 +499,16 @@ return (Matrix[Double] accuracyMatrix)
start = end + 1;
end = end + idx
class_j = data[start:end, ]
-
start_i = as.scalar(fold_idxes[j, 1]);
end_i = as.scalar(fold_idxes[j, 2])
-
fold_i = rbind(fold_i, class_j[start_i:end_i, ])
}
-
dataList = append(dataList, fold_i)
fold_idxes[, 1] = fold_idxes[, 2] + 1
fold_idxes[, 2] += ins_per_fold
- while(FALSE){}
}
- for(i in seq(1,k))
+ parfor(i in seq(1,k))
{
[trainList, hold_out] = remove(dataList, i)
trainset = rbind(trainList)
@@ -530,6 +526,95 @@ return (Matrix[Double] accuracyMatrix)
}
-# data=["#MissingValues", "MinVla", "MaxVal", "AverageMin", "AverageMax",
-# "#CategoricalFeatures", "#NumericFeatures", "Mean", "#Outliers",
"#OHEfeatures", "#Classes",
-# "Imbalance", "#rows", "#cols", ""]
\ No newline at end of file
+###############################################################################################
+# The function will collect the features like statistics and pipelines and accuracy
+# so that they could be used for training a model and predicting pipelines without enumeration
+###############################################################################################
+gatherStats = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask, String target)
+return (Matrix[Double] features)
+{
+
+ features = matrix(0, rows = 1, cols= 14)
+ features[1, 1]= sum(is.na(X)) # number of missing values
+ X = replace(target= X, pattern = NaN, replacement = 0)
+ num = removeEmpty(target=X, margin="cols", select=(mask == 0))
+ # get the stats
+ features[1, 2] = min(num) # minimum value
+ features[1, 3] = max(num)
+ features[1, 4] = mean(colMins(num)) # average minimum value
+ features[1, 5] = mean(colMaxs(num)) # average maximum value
+ features[1, 6] = sum(mask) # number of categorical features
+ features[1, 7] = sum(mask == 0) # number of numerical features
+ features[1, 8] = mean(num) # mean value
+ colSd = colSds(num)
+ count3sdplus = sum(num > (colMeans(num) + 3*colSd ))
+ count3sdminus = sum(num < (colMeans(num) - 3*colSd ))
+ outliers = count3sdplus + count3sdminus
+ features[1, 9] = outliers
+ # OHE features
+ OHE = sum(colMaxs(X) * mask)
+ features[1, 10] = OHE
+ if(target != "compare")
+ {
+ ctab = table(Y, 1)
+ features[1, 11] = nrow(ctab) # number of classes
+ minCat = min(ctab) / nrow(ctab)
+ maxCat = max(ctab) / nrow(ctab)
+ # class imabalance 1=YES, 0=NO
+ features[1, 12]= ifelse((maxCat - minCat) > 0.3, 1, 0)
+ }
+ else
+ {
+ features[1, 11] = 0
+ features[1, 12] = 0
+ }
+ features[1, 13] = nrow(X)
+ features[1, 14] = ncol(X)
+
+}
+
+
+######################################################################
+# # Function for cross validation using hold out method
+# # Inputs: The input dataset X, Y and the value of k validation, mask of the
+# # dataset for OHE of categorical columns, vector of ML hyper-parameters identified
+# # via grid-search and a boolean value of (un)weighted accuracy.
+# # Output: It return a matrix having the accuracy of each fold.
+######################################################################
+
+compareValue = function(Matrix[double] dirtyX, Matrix[double] fixedX, Matrix[Double] cleanX, Matrix[Double] mask)
+return (Double precision, Double T)
+{
+ t1 = time()
+ DEFAULT = 404
+ mv = is.na(dirtyX)
+ correctionsRequired = 0
+ mv = is.na(fixedX)
+ dirtyX = replace(target= dirtyX, pattern=NaN, replacement=DEFAULT)
+ cleanX = replace(target= cleanX, pattern=NaN, replacement=DEFAULT)
+ fixedX = replace(target= fixedX, pattern=NaN, replacement=DEFAULT)
+ diffCleanDirty = sum((abs(cleanX - dirtyX) < 0.001) < 1) #sum(cleanX == dirtyX) #
+ print("dirty != clean: "+diffCleanDirty)
+ correctionsRequired = (abs(cleanX - dirtyX) < 0.001) < 1#dirtyX != cleanX
+ print("corrections required: "+sum(correctionsRequired))
+ correctionsMade = sum(dirtyX != fixedX)
+ print("corrections made: "+correctionsMade)
+ dim = nrow(dirtyX) * ncol(dirtyX)
+ match = (abs(cleanX - fixedX) < 0.001) * correctionsRequired
+ print("total matches "+sum(match))
+ # print("total matches \n"+toString(match))
+ precision = max(0.001, sum(match) / correctionsMade)
+ T = floor((time() - t1) / 1e+6)
+ print("Precision: "+toString(precision) + " in "+T+" ms")
+
+
+}
+
+pipToString = function(Frame[String] F)
+return (String s)
+{
+ s = ""
+ for(i in 1:ncol(F))
+ s = s + as.scalar(F[,i])+";"
+
+}
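Editorial note (illustration only, not part of the patch): the precision computed by compareValue above counts a cell as a true positive only when the repaired value agrees with the clean value (within 1e-3) at a position that actually differed between the dirty and clean data, and divides by the number of cells the pipeline changed. With assumed counts of 10 cells requiring correction, 8 cells changed by the pipeline (correctionsMade = 8), and 6 of those changes matching the clean values at required positions (sum(match) = 6), the reported precision is max(0.001, 6/8) = 0.75.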
diff --git a/scripts/builtin/discoverFD.dml b/scripts/builtin/discoverFD.dml
index 1bb0a21..dbc4fbf 100644
--- a/scripts/builtin/discoverFD.dml
+++ b/scripts/builtin/discoverFD.dml
@@ -74,11 +74,11 @@ m_discoverFD = function(Matrix[Double] X, Matrix[Double] Mask, Double threshold)
parfor(i in 1 : d, check=0) {
index_i = as.scalar(cm2[i,1])
ndX = as.scalar(cm[1,index_i])
- if( ndX!=1 & ndX!=n) {
+ if(ndX!=1 & ndX != n) {
Xi = X[,index_i];
k = ifelse(threshold < 1, 1, (i+1)); # enumerate only upper triangle if threshold = 1
parfor(j in k:d , check=0) {
- if(j != i) {
+ if((j != i) & (j > 0) & (j <= d)) {
index_j = as.scalar(cm2[j,1])
[A_determines_B, ratio] = isFD(Xi, X[,index_j], ndX);
if(A_determines_B | ratio >= threshold)
diff --git a/scripts/builtin/executePipeline.dml b/scripts/builtin/executePipeline.dml
index 3ca0240..b646464 100644
--- a/scripts/builtin/executePipeline.dml
+++ b/scripts/builtin/executePipeline.dml
@@ -21,9 +21,9 @@
s_executePipeline = function(Frame[String] pipeline, Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask,
Matrix[Double] FD, Matrix[Double] hyperParameters, Integer flagsCount, Boolean verbose)
- return (Matrix[Double] X, Matrix[Double] Y)
+ return (Matrix[Double] X, Matrix[Double] Y, Double t2)
{
-
+ t1 = time();
print("PIPELINE EXECUTION START ... "+toString(pipeline))
if(verbose) {
@@ -47,6 +47,8 @@ s_executePipeline = function(Frame[String] pipeline, Matrix[Double] X, Matrix[D
X = confirmMeta(X, mask)
}
+ t2 = floor((time() - t1) / 1e+6)
+ print("PIPELINE EXECUTION ENDED: "+t2+" ms")
}
# This function will convert the matrix row-vector into list
@@ -214,7 +216,6 @@ return (Matrix[Double] dX_train) {
}
-
#######################################################################
# Wrapper of imputeByFD OHE call, to call inside eval as a function
# Inputs: The input dataset X, and mask of the columns and threshold value
@@ -293,7 +294,7 @@ fillDefault = function(Matrix[Double] X)
return(Matrix[Double] X){
defaullt = round(colMaxs(X) - colMins(X))
Mask = is.na(X)
- X = replace(target=X, pattern=NaN, replacement=0)
+ X = replace(target=X, pattern=NaN, replacement=max(X))
Mask = Mask * defaullt
X = X + Mask
}
diff --git a/scripts/builtin/imputeByMedian.dml b/scripts/builtin/imputeByMedian.dml
index 1a8a9f7..73931b6 100644
--- a/scripts/builtin/imputeByMedian.dml
+++ b/scripts/builtin/imputeByMedian.dml
@@ -51,7 +51,7 @@ return(Matrix[Double] X)
cols = ncol(nX)
# median imputation
colMedian = matrix(0, 1, cols)
- parfor(i in 1:cols)
+ for(i in 1:cols, check=0)
colMedian[1, i] = median(X[,i])
X_n = nX + (Mask_n * colMedian)
# mode imputation
diff --git a/scripts/pipelines/scripts/utils.dml b/scripts/pipelines/scripts/utils.dml
index aa9338a..8b23536 100644
--- a/scripts/pipelines/scripts/utils.dml
+++ b/scripts/pipelines/scripts/utils.dml
@@ -18,20 +18,8 @@
# under the License.
#
#-------------------------------------------------------------
-vectorToCsv = function(Matrix[Double] mask)
-return (String indexes){
+source("scripts/builtin/bandit.dml") as bandit;
- vector = mask * t(seq(1, ncol(mask)))
- vector = removeEmpty(target = vector, margin = "cols")
- if(nrow(vector) > ncol(vector))
- vector = t(vector)
- s = ""
- if(ncol(vector) > 1) {
- for(i in 1:ncol(vector)-1)
- s = s+as.integer(as.scalar(vector[1,i]))+","
- }
- indexes = s+as.integer(as.scalar(vector[1,ncol(vector)]))
-}
# remove empty wrapper for frames
frameRemoveEmpty = function(Frame[Unknown] target, String margin, Matrix[Double] select)
@@ -40,19 +28,6 @@ return (Frame[Unknown] frameblock)
idx = seq(1, ncol(target))
# get the indexes of columns for recode transformation
index = vectorToCsv(idx)
- #########################################################
- # vector = mask * t(seq(1, ncol(mask)))
- # vector = removeEmpty(target = vector, margin = "cols")
- # if(nrow(vector) > ncol(vector))
- # vector = t(vector)
- # s = ""
- # if(ncol(vector) > 1) {
- # for(i in 1:ncol(vector)-1)
- # s = s+as.integer(as.scalar(vector[1,i]))+","
- # }
- # index = s+as.integer(as.scalar(vector[1,ncol(vector)]))
- #########################################################
-
# recode logical pipelines for easy handling
jspecR = "{ids:true, recode:["+index+"]}";
[Xd, M] = transformencode(target=target, spec=jspecR);
@@ -60,18 +35,13 @@ return (Frame[Unknown] frameblock)
if(nrow(select) > 1)
X = removeEmpty(target = X, margin = margin, select = select)
else
- {
- X = removeEmpty(target = X, margin = margin)
- }
+ X = removeEmpty(target = X, margin = margin)
frameblock = transformdecode(target = Xd, spec = jspecR, meta = M)
frameblock = frameblock[1:nrow(X), 1:ncol(X)]
}
-
-
-
#######################################################################
# Function for group-wise/stratified sampling from all classes in labelled dataset
# Inputs: The input dataset X, Y and sampling ratio between 0 and 1
@@ -95,10 +65,8 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio)
out_s = 1
out_e = 0
end_class = 0
-
out = matrix(0, sampled, ncol(XY))
classes_ratio = floor(classes*ratio)
- print("class ratio "+toString(classes_ratio))
for(i in 1:nrow(classes))
{
end_class = end_class + as.scalar(classes[i])
@@ -132,134 +100,24 @@ return (Matrix[Double] dX_train) {
}
-# # function to classify the data using cross validation
-# fclassify = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask, Matrix[Double] MLhp,
- # Double testAccuracy, Boolean isWeighted, Integer cv=3)
- # return (Double accuracy)
-# {
-
- # if(max(Y) == min(Y)) {
- # print("Y contains only one class")
- # accuracy = as.double(0)
- # }
- # else {
- # print("STARTING "+cv+" CROSS VALIDATIONS")
- # # do the k = 3 cross validations
- # accuracyMatrix = crossV(X, Y, cv, mask, MLhp, isWeighted)
- # accuracyMatrix = removeEmpty(target=accuracyMatrix, margin="rows")
- # acc = colMeans(accuracyMatrix)
- # accuracy = as.scalar(acc[1,1])
- # print("validation accuracy "+accuracy)
- # }
-# }
-
-
-
-# ####################################################################
-# # Function for classifying the dirty dataset, makes a call to crossV()
-# # Inputs: takes the input dataset X, Y and the value of k validation, mask of the
-# # dataset for OHE of categorical columns, vector of ML hyper-parameters identified
-# # via gridsearch and a boolean value of (un)weighted accuracy.
-# # Output: It return a matrix having the accuracy of each fold.
-# ####################################################################
-# classifyDirty = function(Matrix[Double] Xtrain, Matrix[Double] ytrain, Matrix[Double] opt,
- # Matrix[Double] mask, Boolean isWeighted = TRUE, Integer cv)
- # return (Double accuracy)
-# {
- # # # classify without cleaning fill with edfault values 1
- # Xtrain = replace(target = Xtrain, pattern = NaN, replacement=0)
- # dX_train = dummycoding(Xtrain, mask)
- # accuracy = crossV(Xtrain, ytrain, cv, mask, opt, isWeighted)
- # accuracy = mean(accuracy)
- # print("cross validated dirty accuracy "+accuracy)
-# }
-
-
-# # # ######################################################################
-# # # # # Function for cross validation using hold out method
-# # # # # Inputs: The input dataset X, Y and the value of k validation, mask of the
-# # # # # dataset for OHE of categorical columns, vector of ML hyper-parameters identified
-# # # # # via gridsearch and a boolean value of (un)weighted accuracy.
-# # # # # Output: It return a matrix having the accuracy of each fold.
-# # # ######################################################################
-
-# crossV = function(Matrix[double] X, Matrix[double] y, Integer k, Matrix[Double] mask,
- # Matrix[Double] MLhp, Boolean isWeighted)
-# return (Matrix[Double] accuracyMatrix)
-# {
-
- # accuracyMatrix = matrix(0, k, 1)
-
- # dataList = list()
- # testL = list()
- # data = order(target = cbind(y, X), by = 1, decreasing=FALSE, index.return=FALSE)
- # classes = table(data[, 1], 1)
- # ins_per_fold = classes/k
- # start_fold = matrix(1, rows=nrow(ins_per_fold), cols=1)
- # fold_idxes = cbind(start_fold, ins_per_fold)
- # start_i = 0; end_i = 0; idx_fold = 1;;
- # for(i in 1:k)
- # {
- # fold_i = matrix(0, 0, ncol(data))
- # start=0; end=0;
- # for(j in 1:nrow(classes))
- # {
- # idx = as.scalar(classes[j, 1])
- # start = end + 1;
- # end = end + idx
- # class_j = data[start:end, ]
-
- # start_i = as.scalar(fold_idxes[j, 1]);
- # end_i = as.scalar(fold_idxes[j, 2])
-
- # fold_i = rbind(fold_i, class_j[start_i:end_i, ])
- # }
-
- # dataList = append(dataList, fold_i)
- # fold_idxes[, 1] = fold_idxes[, 2] + 1
- # fold_idxes[, 2] += ins_per_fold
- # while(FALSE){}
- # }
-
- # for(i in seq(1,k))
- # {
- # [trainList, hold_out] = remove(dataList, i)
- # trainset = rbind(trainList)
- # testset = as.matrix(hold_out)
- # trainX = trainset[, 2:ncol(trainset)]
- # trainy = trainset[, 1]
- # testX = testset[, 2:ncol(testset)]
- # testy = testset[, 1]
- # beta = multiLogReg(X=trainX, Y=trainy, icpt=1, reg=as.scalar(MLhp[1,1]), tol= 1e-9,
- # maxi=as.scalar(MLhp[1,2]), maxii= 50, verbose=FALSE);
- # [prob, yhat, a] = multiLogRegPredict(testX, beta, testy, FALSE)
- # accuracy = getAccuracy(testy, yhat, isWeighted)
- # accuracyMatrix[i] = accuracy
- # }
-# }
-
-
-######################################################################
-# # Function for cross validation using hold out method
-# # Inputs: The input dataset X, Y and the value of k validation, mask of the
-# # dataset for OHE of categorical columns, vector of ML hyper-parameters identified
-# # via grid-search and a boolean value of (un)weighted accuracy.
-# # Output: It return a matrix having the accuracy of each fold.
-######################################################################
-
-compareValue = function(Matrix[double] dirtyX, Matrix[double] cleanX, Matrix[Double] fixedX, Matrix[Double] mask)
-return (Double precision)
+####################################################################
+# Function for classifying the dirty dataset, makes a call to crossV()
+# Inputs: takes the input dataset X, Y and the value of k validation, mask of the
+# dataset for OHE of categorical columns, vector of ML hyper-parameters identified
+# via gridsearch and a boolean value of (un)weighted accuracy.
+# Output: It return a matrix having the accuracy of each fold.
+####################################################################
+classifyDirty = function(Matrix[Double] Xtrain, Matrix[Double] ytrain, Matrix[Double] opt,
+ Matrix[Double] mask, Boolean isWeighted = TRUE, Integer cv)
+ return (Double accuracy)
{
- dirtyX = replace(target= dirtyX, pattern=NaN, replacement=0)
- cleanX = replace(target= cleanX, pattern=NaN, replacement=0)
- fixedX = replace(target= fixedX, pattern=NaN, replacement=0)
- correctionsRequired = dirtyX != cleanX
- correctionsMade = (dirtyX != fixedX)
- allCorrections_ = sum(correctionsMade)
- match = (abs(cleanX - fixedX) < 0.1) * correctionsRequired
- precision = max(0.001, sum(match) / allCorrections_)
- print("---------------------------------true positives are
"+toString(precision))
+ # # classify without cleaning fill with edfault values 1
+ Xtrain = replace(target = Xtrain, pattern = NaN, replacement=0)
+ dX_train = dummycoding(Xtrain, mask)
+ accuracy = bandit::crossV(Xtrain, ytrain, cv, mask, opt, isWeighted)
+ accuracy = mean(accuracy)
+ print("cross validated dirty accuracy "+accuracy)
}
# constraints over hyper parameters
@@ -281,7 +139,6 @@ return (Double minVal, Double maxVal) {
minVal = 2.0
}
}
-
}
@@ -304,50 +161,3 @@ return(Boolean validForResources)
validForResources = count > 0
}
-
-###############################################################################################
-# The function will collect the features like statistics and pipelines and accuracy
-# so that they could be used for training a model and predicting pipelines without enumeration
-###############################################################################################
-gatherStats = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask, String target)
-return (Matrix[Double] features)
-{
-
- features = matrix(0, rows = 1, cols= 14)
- features[1, 1]= sum(is.na(X)) # number of missing values
- X = replace(target= X, pattern = NaN, replacement = 0)
- num = removeEmpty(target=X, margin="cols", select=(mask == 0))
- # get the stats
- features[1, 2] = min(num) # minimum value
- features[1, 3] = max(num)
- features[1, 4] = mean(colMins(num)) # average minimum value
- features[1, 5] = mean(colMaxs(num)) # average maximum value
- features[1, 6] = sum(mask) # number of categorical features
- features[1, 7] = sum(mask == 0) # number of numerical features
- features[1, 8] = mean(num) # mean value
- colSd = colSds(num)
- count3sdplus = sum(num > (colMeans(num) + 3*colSd ))
- count3sdminus = sum(num < (colMeans(num) - 3*colSd ))
- outliers = count3sdplus + count3sdminus
- features[1, 9] = outliers
- # OHE features
- OHE = sum(colMaxs(X) * mask)
- features[1, 10] = OHE
- if(target != "compare")
- {
- ctab = table(Y, 1)
- features[1, 11] = nrow(ctab) # number of classes
- minCat = min(ctab) / nrow(ctab)
- maxCat = max(ctab) / nrow(ctab)
- # class imabalance 1=YES, 0=NO
- features[1, 12]= ifelse((maxCat - minCat) > 0.3, 1, 0)
- }
- else
- {
- features[1, 11] = 0
- features[1, 12] = 0
- }
- features[1, 13] = nrow(X)
- features[1, 14] = ncol(X)
-
-}
\ No newline at end of file
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinMiceTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinMiceTest.java
index 1328112..ba2f2ef 100644
--- a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinMiceTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinMiceTest.java
@@ -87,7 +87,7 @@ public class BuiltinMiceTest extends AutomatedTestBase {
loadTestConfiguration(getTestConfiguration(TEST_NAME));
String HOME = SCRIPT_DIR + TEST_DIR;
fullDMLScriptName = HOME + TEST_NAME + ".dml";
- programArgs = new String[]{"-nvargs", "X=" + DATASET, "Mask="+input("M"),
+ programArgs = new String[]{"-nvargs", "X=" + DATASET, "Mask="+input("M"),
"iteration=" + iter, "dataN=" + output("N"),
"dataC=" + output("C")};
if (lineage) {
programArgs = (String[]) ArrayUtils.addAll(programArgs, new String[] {
diff --git a/src/test/scripts/functions/pipelines/compareAccuracy.dml b/src/test/scripts/functions/pipelines/compareAccuracy.dml
index 5cd7cda..22165c6 100644
--- a/src/test/scripts/functions/pipelines/compareAccuracy.dml
+++ b/src/test/scripts/functions/pipelines/compareAccuracy.dml
@@ -40,6 +40,8 @@
######################################################################################################################
source("scripts/pipelines/scripts/utils.dml") as utils;
+source("scripts/builtin/bandit.dml") as bandit;
+
F = read($1, data_type="frame", format="csv", header=FALSE,
naStrings= ["NA", "null"," ","NaN", "nan", "", "?", "99999"]);
@@ -95,37 +97,9 @@ FD = FD > 0
# this condition is unnecessary here in this case because the input dataset is balanced and
# instead of diving the dataset into train/test I am doing cross validations
-print("hp matrix")
no_of_param = as.scalar(hp[1, 1]) + 1
hp_width= hp[1, 2:no_of_param]
hp_matrix = matrix(hp_width, rows=ncol(pip), cols=ncol(hp_width)/ncol(pip))
-index = 1
-# for(i in 1:ncol(pip))
-# {
- # no_of_param = as.scalar(hp[1, index])
- # hp_matrix[i, 1:no_of_param] = hp[1, 2:no_of_param+1]
- # index = index + no_of_param + 2
-# }
-
-
-
-print(toString(hp_matrix))
-
-# while(k <= ncol(pip))
-# {
- # end = as.integer(i+as.integer(as.scalar(hp[1,i])))
- # mat = hp[1, i+1:end]
- # i = end + 1
- # if(as.scalar(pip[1,k]) != "SMOTE") {
- # pip1 = cbind(pip1, pip[1,k] )
- # ls = append(ls, mat)
- # }
- # k = k + 1
-# }
-
-
-print("ncol in X "+ncol(eX))
-print("ncol in mask "+ncol(getMask))
# # clean using best pipeline
[cX , cY] = executePipeline(pip[1], eX, eY, getMask, FD, hp_matrix, 5, FALSE)
@@ -148,12 +122,12 @@ oX = oX[, 1:ncol(oX) - 1]
# do the k cross validations for original clean data
-accuracyMatrix = crossV(oX, oY, 3, as.matrix(0), matrix("0.000001 100", rows=1, cols=2), TRUE)
+accuracyMatrix = bandit::crossV(oX, oY, 3, as.matrix(0), matrix("0.000001 100", rows=1, cols=2), TRUE)
accuracyMatrix = removeEmpty(target=accuracyMatrix, margin="rows")
oAcc = mean(accuracyMatrix)
# do the k cross validations for cleaned data
-accuracyMatrix = crossV(cX, cY, 3, as.matrix(0), matrix("0.000001 100", rows=1, cols=2), TRUE)
+accuracyMatrix = bandit::crossV(cX, cY, 3, as.matrix(0), matrix("0.000001 100", rows=1, cols=2), TRUE)
accuracyMatrix = removeEmpty(target=accuracyMatrix, margin="rows")
cAcc = mean(accuracyMatrix)
tol = 1
@@ -163,71 +137,3 @@ print("clean accuracy "+cAcc)
print("original accuracy "+oAcc)
write(results, $5, format = "text")
-
-# ######################################################################
-# # # Function for cross validation using hold out method
-# # # Inputs: The input dataset X, Y and the value of k validation, mask of the
-# # # dataset for OHE of categorical columns, vector of ML hyper-parameters identified
-# # # via gridsearch and a boolean value of (un)weighted accuracy.
-# # # Output: It return a matrix having the accuracy of each fold.
-# ######################################################################
-
-crossV = function(Matrix[double] X, Matrix[double] y, Integer k, Matrix[Double] mask,
- Matrix[Double] MLhp, Boolean isWeighted)
-return (Matrix[Double] accuracyMatrix)
-{
-
- accuracyMatrix = matrix(0, k, 1)
-
- dataList = list()
- testL = list()
- data = order(target = cbind(y, X), by = 1, decreasing=FALSE, index.return=FALSE)
- classes = table(data[, 1], 1)
- ins_per_fold = classes/k
- start_fold = matrix(1, rows=nrow(ins_per_fold), cols=1)
- fold_idxes = cbind(start_fold, ins_per_fold)
-
- start_i = 0; end_i = 0; idx_fold = 1;;
- for(i in 1:k)
- {
- fold_i = matrix(0, 0, ncol(data))
- start=0; end=0;
- for(j in 1:nrow(classes))
- {
- idx = as.scalar(classes[j, 1])
- start = end + 1;
- end = end + idx
- class_j = data[start:end, ]
-
-
- start_i = as.scalar(fold_idxes[j, 1]);
- end_i = as.scalar(fold_idxes[j, 2])
-
- fold_i = rbind(fold_i, class_j[start_i:end_i, ])
- }
-
- dataList = append(dataList, fold_i)
- fold_idxes[, 1] = fold_idxes[, 2] + 1
- fold_idxes[, 2] += ins_per_fold
- while(FALSE){}
- }
-
- for(i in seq(1,k))
- {
- [trainList, hold_out] = remove(dataList, i)
- trainset = rbind(trainList)
- testset = as.matrix(hold_out)
- trainX = trainset[, 2:ncol(trainset)]
- trainy = trainset[, 1]
- testX = testset[, 2:ncol(testset)]
- testy = testset[, 1]
- beta = multiLogReg(X=trainX, Y=trainy, icpt=1, reg=as.scalar(MLhp[1,1]), tol= 1e-9,
- maxi=as.scalar(MLhp[1,2]), maxii= 50, verbose=FALSE);
- [prob, yhat, a] = multiLogRegPredict(testX, beta, testy, FALSE)
- accuracy = getAccuracy(testy, yhat, isWeighted)
- accuracyMatrix[i] = accuracy
- }
-
-}
-
-
diff --git a/src/test/scripts/functions/pipelines/intermediates/hyperparams.csv b/src/test/scripts/functions/pipelines/intermediates/hyperparams.csv
index a1cfa4f..85972c6 100644
--- a/src/test/scripts/functions/pipelines/intermediates/hyperparams.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/hyperparams.csv
@@ -1,5 +1,5 @@
-36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,69.0,0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,69.0,1.0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,58.0,1.0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,89.0,0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,61.0,1.0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,1.0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,77.0,1.0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,0,1.0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,77.0,0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,68.0,1.0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,56.0,0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,56.0,0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
diff --git a/src/test/scripts/functions/pipelines/intermediates/pipelines.csv b/src/test/scripts/functions/pipelines/intermediates/pipelines.csv
index 601f82a..e9a6697 100644
--- a/src/test/scripts/functions/pipelines/intermediates/pipelines.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/pipelines.csv
@@ -2,4 +2,4 @@ imputeByMedian,scale,dummycoding,pca
imputeByMedian,scale,dummycoding,pca
imputeByMedian,scale,dummycoding,pca
imputeByMean,scale,dummycoding,pca
-imputeByMedian,scale,dummycoding,pca
+imputeByMean,scale,dummycoding,pca
diff --git a/src/test/scripts/functions/pipelines/mainScript.dml b/src/test/scripts/functions/pipelines/mainScript.dml
index 5422ae6..7883e67 100644
--- a/src/test/scripts/functions/pipelines/mainScript.dml
+++ b/src/test/scripts/functions/pipelines/mainScript.dml
@@ -139,7 +139,7 @@ FD = FD > 0
logical1 = frame(["4", "MVI", "SCALE", "DUMMY", "DIM", "0", "0", "0"], rows=1, cols=8)
logical2 = frame(["2", "MVI", "DUMMY", "0", "0", "0", "0", "0"], rows=1, cols=8)
logical3 = frame(["3", "MVI", "SCALE", "DUMMY", "0", "0", "0", "0"], rows=1, cols=8)
-logical4 = frame(["7", "MVI", "OTLR", "CI", "SCALE", "DUMMY", "DIM", "0"], rows=1, cols=8)
+logical4 = frame(["6", "MVI", "OTLR", "CI", "SCALE", "DUMMY", "DIM", "0"], rows=1, cols=8)
logical5 = frame(["7", "MVI", "OTLR", "MVI", "CI", "SCALE", "DUMMY", "DIM"], rows=1, cols=8)
logical6 = frame(["6", "OTLR", "MVI", "CI", "SCALE", "DUMMY", "DIM", "0"], rows=1, cols=8)
diff --git a/src/test/scripts/functions/pipelines/testClassification.dml b/src/test/scripts/functions/pipelines/testClassification.dml
index 1a33bf2..45c6761 100644
--- a/src/test/scripts/functions/pipelines/testClassification.dml
+++ b/src/test/scripts/functions/pipelines/testClassification.dml
@@ -24,6 +24,7 @@ source("scripts/pipelines/scripts/utils.dml") as utils;
source("scripts/pipelines/scripts/logicalFunc.dml") as logical;
source("scripts/pipelines/scripts/gridsearchMLR.dml") as gs;
+
# read the inputs
F = read($dirtyData, data_type="frame", format="csv", header=TRUE,
naStrings= ["NA", "null"," ","NaN", "nan", "", "?", "99999"]);
@@ -43,9 +44,6 @@ targetApplicaton = $target # accuracy flag
if(nrow(metaInfo) < 2)
stop("incomplete meta info")
- # Do the initial cleaning
-
-
getSchema = metaInfo[1, 2:ncol(metaInfo)]
getMask = as.matrix(metaInfo[2, 2:ncol(metaInfo)])
getFdMask = as.matrix(metaInfo[3, 2:ncol(metaInfo)]) # columns of interest for FD computation
@@ -84,39 +82,36 @@ getSchema = getSchema[, 1:ncol(getSchema) - 1] # strip the mask of class label
lgSeed = logical::generateLogicalSeed(eX, eY, getMask, targetApplicaton)
allLgs = logical::transformLogical(lgSeed)
-
d_accuracy = 0
# 4. perform the sampling
-[eX, eY] = doSample(eX, eY, sample)
+[eX, eY] = utils::doSample(eX, eY, sample)
# 5. get train test and validation set with balanced class distribution
-# [X_train, y_train, X_test, y_test] = splitBalanced(X=eX, Y=eY, splitRatio=0.7, verbose=FALSE)
-X_train = eX
-y_train = eY
+[X_train, y_train, X_test, y_test] = splitBalanced(X=eX, Y=eY, splitRatio=0.7, verbose=FALSE)
+
# 6. find the best hyper parameters for classification algorithm
# for now only find the best values for intercept and maximum outer iteration
params = list("reg", "maxi");
paramRanges = list(10^seq(0,-10), seq(10,100, 10));
-# if(sum(getMask) > 0)
-# {
- # dX_train = utils::dummycoding(replace(target = rbind(X_train, X_test), pattern = NaN, replacement=0), getMask)
- # dX_test = dX_train[nrow(y_train)+1:nrow(dX_train),]
- # dX_train = dX_train[1:nrow(y_train),]
- # [opt, loss] = gs::gridSearchMLR(dX_train, y_train, dX_test, y_test,
- # "multiLogReg", "lossFunc", params, paramRanges, FALSE);
-# }
-# else
- # [opt, loss] = gs::gridSearchMLR(X_train, y_train, X_test, y_test,
- # "multiLogReg", "lossFunc", params, paramRanges, FALSE);
-# hardcoded hyper-params for multilogReg
-opt = matrix("0 100", 1, 2)
-
+if(sum(getMask) > 0)
+{
+ dX_train = utils::dummycoding(replace(target = rbind(X_train, X_test), pattern = NaN, replacement=0), getMask)
+ dX_test = dX_train[nrow(y_train)+1:nrow(dX_train),]
+ dX_train = dX_train[1:nrow(y_train),]
+ [opt, loss] = gs::gridSearchMLR(dX_train, y_train, dX_test, y_test,
+ "multiLogReg", "lossFunc", params, paramRanges, FALSE);
+ }
+else
+ [opt, loss] = gs::gridSearchMLR(X_train, y_train, X_test, y_test,
+ "multiLogReg", "lossFunc", params, paramRanges, FALSE);
+
+# as I am testing on CV not on holdout train/test
+X_train = eX
+y_train = eY
# 7. get the cross validated accuracy on dirty dataset (only on training set)
-d_accuracy = classifyDirty(X_train, y_train, opt, getMask, weightedAccuracy, crossValidations)
-# print("dirty accuracy is "+d_accuracy)
- # [eX, eY] = prioritise(eX, eY, getMask)
-
+d_accuracy = utils::classifyDirty(X_train, y_train, opt, getMask, weightedAccuracy, crossValidations)
+
FD = discoverFD(X=replace(target=eX, pattern=NaN, replacement=1), Mask=getFdMask, threshold=0.8)
FD = (diag(matrix(1, rows=nrow(FD), cols=1)) ==0) * FD
FD = FD > 0
@@ -128,11 +123,12 @@ targetClassification = list(target=targetApplicaton, cv=crossValidations, wAccur
# # initialize output variables
pip = as.frame("NULL"); hp = matrix(0,0,0); acc = matrix(0,0,0); features = as.frame("NULL")
-[pip, hp, acc, features] = bandit(X_train=eX, Y_train=eY, metaList=metaList, targetList=targetClassification, lp=allLgs[1],
- primitives=primitives, param=param, k=topK, R=resources, verbose=TRUE);
output = $output
-write(features, output+"/features.csv", format="csv")
+
+[pip, hp, acc, features] = bandit(X_train=eX, Y_train=eY, metaList=metaList, targetList=targetClassification, lp=allLgs[1,],
+ primitives=primitives, param=param, k=topK, R=resources, verbose=TRUE);
+
if(as.scalar((is.na(acc[1,1]))) == 1 | as.scalar(acc[1,1]) < d_accuracy)
@@ -146,54 +142,24 @@ print("best hyperparam")
print(toString(hp))
print("best accuracy")
-print(toString(acc))
+print(toString(acc[1, 1]))
-clean_accuracy = as.scalar(acc[1,1])
+clean_accuracy = max(acc[1,1])
result = d_accuracy < clean_accuracy
print("result satisfied ------------"+result)
-accuracies = cbind(as.matrix(d_accuracy), as.matrix(clean_accuracy))
-
write(pip, output+"/pipelines.csv", format="csv")
write(hp, output+"/hyperparams.csv", format="csv")
write(acc, output+"/accuracies.csv", format="csv")
+accuracies = cbind(as.matrix(d_accuracy), acc[1,1])
write(accuracies , output+"/BestAccuracy.csv", format="csv")
+write(features, output+"/features.csv", format="csv")
write(result , $O)
-
-
-
-####################################################################
-# Function for classifying the dirty dataset, makes a call to crossV()
-# Inputs: takes the input dataset X, Y and the value of k validation, mask of the
-# dataset for OHE of categorical columns, vector of ML hyper-parameters identified
-# via grid-search and a boolean value of (un)weighted accuracy.
-# Output: It return a matrix having the accuracy of each fold.
-####################################################################
-classifyDirty = function(Matrix[Double] Xtrain, Matrix[Double] ytrain, Matrix[Double] opt,
- Matrix[Double] mask, Boolean isWeighted = TRUE, Integer cv)
- return (Double accuracy)
-{
- # # classify without cleaning fill with default values 1
- Xtrain = replace(target = Xtrain, pattern = NaN, replacement=0)
- if(sum(mask) > 0)
- Xtrain = utils::dummycoding(Xtrain, mask)
- # print("rows in data ")
- # print(nrow(dX_train))
- # print("column in data")
- # print(ncol(dX_train))
- accuracy = crossV(Xtrain, ytrain, cv, mask, opt, isWeighted)
- accuracy = mean(accuracy)
- print("cross validated dirty accuracy "+accuracy)
-}
-
-
-
-
lossFunc = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B)
return (Matrix[Double] loss) {
[prob, yhat, acc] = multiLogRegPredict(X=X, B=B, Y=y, verbose=FALSE)
diff --git a/src/test/scripts/functions/pipelines/testCompare.dml b/src/test/scripts/functions/pipelines/testCompare.dml
index dc2bf84..019df79 100644
--- a/src/test/scripts/functions/pipelines/testCompare.dml
+++ b/src/test/scripts/functions/pipelines/testCompare.dml
@@ -20,7 +20,7 @@
#-------------------------------------------------------------
# Generate the logical pipelines for data cleaning
-source("scripts/pipelines/scripts/utils.dml") as utils;
+
source("scripts/pipelines/scripts/logicalFunc.dml") as logical;
# read the inputs
@@ -64,7 +64,7 @@ X = dropInvalidType(F, getSchema)
if(sum(getMask) > 0)
{
# always recode the label
- index = utils::vectorToCsv(getMask)
+ index = vectorToCsv(getMask)
jspecR = "{ids:true, recode:["+index+"]}"
[eX, X_meta] = transformencode(target=rbind(cleanData, X), spec=jspecR);
@@ -78,8 +78,10 @@ if(sum(getMask) > 0)
}
# if no categorical value exist then just cast the frame into matrix
-else
+else {
eX = as.matrix(X)
+ cleanX = as.matrix(cleanData)
+}
# get the logical seed