This is an automated email from the ASF dual-hosted git repository.

ssiddiqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new ecee97f  [MINOR] Various cleanups in cleaning pipelines   1. FDs are 
passed as meta data   2. All meta variables are passed via a list   3. Test 
added for classification and compare target applications   4. warnings are fixed 
by initializing variables   5. Possible deduplication of tasks in separate 
namespaces   TODO: Add tests for clustering and regression. Fix logical 
pipelines
ecee97f is described below

commit ecee97f56a4b93b1cf7dd08934f04a8795c710ac
Author: Shafaq Siddiqi <[email protected]>
AuthorDate: Mon Mar 29 18:00:08 2021 +0200

    [MINOR] Various cleanups in cleaning pipelines
      1. FDs are passed as meta data
      2. All meta variables are passed via a list
      3. Test added for classification and compare target applications
      4. warnings are fixed by initializing variables
      5. Possible deduplication of tasks in separate namespaces
      TODO: Add tests for clustering and regression. Fix logical pipelines
---
 scripts/builtin/bandit.dml                         | 442 ++++++++-------------
 scripts/builtin/discoverFD.dml                     |   1 +
 scripts/builtin/executePipeline.dml                | 132 +++++-
 scripts/builtin/mice.dml                           |   6 +-
 scripts/builtin/multiLogRegPredict.dml             |   2 +-
 scripts/builtin/pca.dml                            |   7 +-
 scripts/builtin/splitBalanced.dml                  |   1 +
 scripts/pipelines/properties/param.csv             |  32 +-
 scripts/pipelines/properties/primitives.csv        |   3 +-
 scripts/pipelines/scripts/logicalFunc.dml          |  79 ++--
 scripts/pipelines/scripts/utils.dml                | 156 +++++++-
 ...ngTest.java => CleaningTestClassification.java} |  23 +-
 ...{CleaningTest.java => CleaningTestCompare.java} |  51 +--
 .../functions/pipelines/compareAccuracy.dml        |  57 ++-
 .../functions/pipelines/intermediates/acc.csv      |   5 -
 .../functions/pipelines/intermediates/hp.csv       |   5 -
 .../functions/pipelines/intermediates/pip.csv      |   5 -
 .../scripts/functions/pipelines/mainScript.dml     | 394 +++++++++---------
 .../functions/pipelines/meta/meta_census.csv       |   2 +-
 .../functions/pipelines/testClassification.dml     | 203 ++++++++++
 .../scripts/functions/pipelines/testCompare.dml    | 138 +++++++
 21 files changed, 1097 insertions(+), 647 deletions(-)

diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml
index feb16c4..a2399af 100644
--- a/scripts/builtin/bandit.dml
+++ b/scripts/builtin/bandit.dml
@@ -18,12 +18,13 @@
 # under the License.
 #
 #-------------------------------------------------------------
+source("scripts/pipelines/scripts/utils.dml") as utils;
 
-m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, 
Matrix[Double] mask, Matrix[Double] MLhp,
-  Frame[Unknown] schema, Frame[Unknown] lp, Frame[Unknown] primitives, 
Frame[Unknown] param,  Integer k = 3,
-  Double testAccuracy = 0.8, Boolean isWeighted, Integer R=50, Integer cv=3, 
Boolean verbose = TRUE)
-  return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams,  
Matrix[Double] bestAccuracy) 
+m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, 
List[Unknown] metaList, List[Unknown] targetList,
+  Frame[Unknown] lp, Frame[Unknown] primitives, Frame[Unknown] param,  Integer 
k = 3, Integer R=50, Boolean verbose = TRUE)
+  return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams,  
Matrix[Double] bestAccuracy, Frame[String] feaFrameOuter) 
 {
+  NUM_FEATURES = 14
   print("null in data "+sum(is.na(X_train)))
   bestPipeline = frame("", rows=1, cols=1)
   bestHyperparams = as.matrix(0)
@@ -38,6 +39,8 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] 
Y_train, Matrix[Doubl
   hparam = matrix(0, rows=k*(s_max+1), cols=55)
   pipeline = frame(0, rows=k*(s_max+1), cols=ncol(lp)+1)
   startOut=0; endOut=0;
+  feaFrameOuter = frame("", rows = 1, cols = NUM_FEATURES + ncol(lp) + 1 )
+
   for(s in s_max:0, check = 0) {
     
    # result variables
@@ -50,7 +53,7 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] 
Y_train, Matrix[Doubl
     r = R * eta^(-s);
     # get the physical pipelines, the pipelines, pipelines are recoded
     [configurations, n] = get_physical_configurations(lp, n, primitives)
-
+    
     # append configuration keys for extracting the pipeline later on
     id = seq(1, nrow(configurations))
     configurations = cbind(as.frame(id), configurations)
@@ -72,13 +75,13 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] 
Y_train, Matrix[Doubl
       }
       
       configurations = configurations[1:n_i, ]      
-      [outPip,outHp] = run_with_hyperparam(configurations, r_i, X_train, 
Y_train, mask, 
-        MLhp, schema, param, isWeighted, testAccuracy, cv,  verbose)
+      [outPip,outHp, feaFrameOuter] = run_with_hyperparam(configurations, r_i, 
X_train, Y_train, metaList,
+        targetList, param, feaFrameOuter, verbose)
       # sort the pipelines by order of accuracy decreasing
       a = order(target = outPip, by = 1, decreasing=TRUE, index.return=FALSE)
       b = order(target = outHp, by = 1, decreasing=TRUE, index.return=FALSE)
       rowIndex = ifelse(nrow(a) >= k, k, nrow(a))
-            
+
       # maintain the brackets results
       end = end + rowIndex
       bracket_pipel[start:end, ] =  a[1:rowIndex,]
@@ -86,9 +89,7 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] 
Y_train, Matrix[Doubl
       start = end + 1
 
       # sort the configurations fro successive halving
-      avergae_perf =  getMaxPerConf(outPip)     #as.frame(aggregate(target=a[, 
1], groups=a[, 2], fn="mean"))
-      print("configurations "+toString(configurations))
-      while(FALSE){}
+      avergae_perf =  getMaxPerConf(outPip) 
       configurations = frameSort(cbind(avergae_perf, configurations))
       configurations = configurations[, 2:ncol(configurations)]
     }
@@ -96,27 +97,15 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] 
Y_train, Matrix[Doubl
     bracket_hp = removeEmpty(target=bracket_hp, margin="rows")
     # keep the best k results for each bracket
     [bracket_bestPipeline, bracket_bestHyperparams] = 
extractBracketWinners(bracket_pipel, bracket_hp, k, lookup)
-    
-    # print("after "+i+" bracket ")
-    # print(toString(bracket_bestPipeline))
-    # print("------------------")
-    # print(toString(bracket_bestHyperparams))  
-    # while(FALSE){}
-    
+    # optimize by the features
+
     startOut = endOut + 1
     endOut = endOut + nrow(bracket_bestPipeline)
     pipeline[startOut: endOut, ] = bracket_bestPipeline
     hparam[startOut:endOut, 1:ncol(bracket_bestHyperparams)] = 
bracket_bestHyperparams
   }
-  
-  # print("after all brackets ")
-  # while(FALSE){}
-  # print(toString(pipeline))
-  # print("------------------")
-  # print(toString(hparam))
-  # while(FALSE){}
-  # extract best top k from all iterations
-  [bestPipeline, bestHyperparams] = extractTopK(pipeline, hparam, 
testAccuracy, k)
+
+  [bestPipeline, bestHyperparams] = extractTopK(pipeline, hparam, 
as.scalar(targetList['dirAcc']), k)
 
   bestAccuracy = as.matrix(bestPipeline[,1])
   bestPipeline = bestPipeline[,2:ncol(bestPipeline)]
@@ -126,7 +115,8 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] 
Y_train, Matrix[Doubl
     print("best pipeline"+ toString(bestPipeline))
     print("best hyper-parameters \n"+ toString(bestHyperparams))
     print("best accuracy \n"+ toString(bestAccuracy))
-    print("dirty accuracy "+testAccuracy)
+    if(as.scalar(targetList['target']) != "compare")
+      print("dirty accuracy "+as.scalar(targetList['dirAcc']))
   }
 }
 
@@ -207,42 +197,59 @@ get_physical_configurations = function(Frame[String] 
logical, Scalar[int] numCon
 }
 
 # this method will call the execute pipelines with their hyper-parameters
-run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i, 
Matrix[Double] X, Matrix[Double] Y,
-  Matrix[Double] mask, Matrix[Double] MLhp, Frame[Unknown] schema, 
Frame[Unknown] param, Boolean isWeighted,
-  Double testAccuracy, Integer cv=3, Boolean verbose)                    
-  return (Matrix[Double] output_operator, Matrix[Double] output_hyperparam) {
+run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i, 
Matrix[Double] X, Matrix[Double] Y, List[Unknown] metaList, 
+   List[Unknown] targetList, Frame[Unknown] param, Frame[Unknown] 
featureFrameOuter, Boolean verbose)                    
+  return (Matrix[Double] output_operator, Matrix[Double] output_hyperparam, 
Frame[Unknown] featureFrameOuter) {
 
   output_hp = matrix(0, nrow(ph_pip)*r_i, 50)
   output_accuracy = matrix(0, nrow(ph_pip)*r_i, 1)
   output_pipelines = matrix(0, nrow(ph_pip)*r_i, 2)
-
+  
   # rows in validation set
   clone_X = X
   clone_Y = Y
   index = 1
   id = as.matrix(ph_pip[, 1])
   ph_pip = ph_pip[, 2:ncol(ph_pip)]
+  
+  feaVec = utils::gatherStats(X, Y, as.matrix(metaList['mask']), 
as.scalar(targetList['target']))
+
   for(i in 1:nrow(ph_pip))
   {
     # execute configurations with r resources
-    hp = getHyperparam(ph_pip[i], param, r_i)  
-    for(r in 1:r_i)
-    {    
-      [X, Y] = executePipeline(ph_pip[i], X, Y, mask, hp, r, FALSE)
-      accuracy = fclassify(X, Y, mask, MLhp, testAccuracy, isWeighted, cv)
-      hp_vec = listToVector(hp, FALSE)
+    [hp, no_of_res, no_of_flag_vars] = getHyperparam(ph_pip[i], param, r_i)
+    feaFrame = frame("", rows = no_of_res, cols = ncol(feaVec) + ncol(ph_pip) 
+ 1)
+    for(r in 1:no_of_res)
+    { 
+      # as the matrix first block of r rows belongs to first operator and r+1 
block of rows to second operator 
+      # we need to extract a row from each block
+      indexes = matrix(no_of_res, rows=ncol(ph_pip), cols=1)
+      indexes[1, 1] = r
+      indexes = cumsum(indexes)
+      indexes = table(indexes, 1, 1, nrow(hp), 1)
+      hp_matrix = removeEmpty(target = hp, margin="rows", select = indexes)
+      [X, Y] = executePipeline(ph_pip[i], X, Y, as.matrix(metaList['mask']), 
as.matrix(metaList['fd']), hp_matrix, no_of_flag_vars, FALSE)
+      if(as.scalar(targetList['target']) == "compare")
+        accuracy = utils::compareValue(clone_X, X, 
as.matrix(targetList['cleanData']), as.matrix(metaList['mask']))
+      else
+        accuracy = fclassify(X, Y, as.matrix(metaList['mask']), 
as.matrix(targetList['mlHp']), as.scalar(targetList['dirAcc']), 
+        as.scalar(targetList['wAccuracy']), as.scalar(targetList['cv']))
+      matrix_width = as.matrix(nrow(hp_matrix) * ncol(hp_matrix))
+      hp_vec = cbind(matrix_width, matrix(hp_matrix, rows=1, 
cols=nrow(hp_matrix)*ncol(hp_matrix), byrow=TRUE))
       output_accuracy[index, 1] = accuracy
       output_hp[index, 1:ncol(hp_vec)] = hp_vec
       output_pipelines[index, ] = cbind(as.matrix(i), id[i,1])
       X = clone_X
       Y = clone_Y
-      while(FALSE){}
-      index = index + 1
-      # hp = getHyperparam(ph_pip[i,], param)  
+      index = index + 1  
+      feaFrame[r, 1:ncol(feaVec)] = as.frame(feaVec)
+      feaFrame[r, ncol(feaVec)+1:ncol(feaVec)+ncol(ph_pip[1])] = ph_pip[i]
+      feaFrame[r, ncol(feaFrame)] = accuracy
     }
     
     X = clone_X
     Y = clone_Y
+    featureFrameOuter = rbind(featureFrameOuter, feaFrame)
   }
   output_hyperparam = removeEmpty(target=cbind(output_accuracy, output_hp), 
margin="rows")
   output_operator = removeEmpty(target=cbind(output_accuracy, 
output_pipelines) ,margin="rows")
@@ -250,13 +257,17 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, 
Integer r_i, Matrix[Double
 
 # extract the hyper-parameters for pipelines
 getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown]  hpList, 
Integer no_of_res)
-  return (List[Unknown] paramList)
+  return (Matrix[Double] paramMatrix, Integer no_of_res, Integer 
NUM_META_FLAGS)
 {
+
+  allParam = 0;
+  START_INDEX = 8 # value from where the hyper-params starts after skipping 
meta flags
+  NUM_META_FLAGS = 5
   # load the hyper-parameters values
   paramList = list()
-  allParam = 0;
   # store the row indexes of the operator matches
   indexes = matrix(0, rows= ncol(pipeline), cols=1)
+  paramCount = matrix(0, rows= ncol(pipeline), cols=1)
   for(k in 1:ncol(pipeline))
   {
     op = as.scalar(pipeline[1,k])
@@ -268,104 +279,71 @@ getHyperparam = function(Frame[Unknown] pipeline, 
Frame[Unknown]  hpList, Intege
     index = m_hasParam * seq(1, nrow(m_hasParam))
     index = as.scalar(removeEmpty(target = index, margin = "rows"))
     indexes[k] = index
-    no_of_param = as.integer(as.scalar(hpList[index, 2]))
-    allParam = no_of_param + allParam
+    paramCount[k] = as.integer(as.scalar(hpList[index, 2]))
   }
   # if there are no hyper-parameters than change the values of resources
   # so that the pipeline is only executed once and no resource are wasted, 
saving looping
-  no_of_res = ifelse(allParam > 0, no_of_res, 1)
-  
+  no_of_res = ifelse(sum(paramCount) > 0, no_of_res, 1)
+  # the below matrix stores the different combinations of hyper-parameter 
value for each pipeline
+  # if the resource value is greater than zero this means for 1 pipeline it 
will store r rows where each row store set
+  # of hyperparameter values for ith pipeline. If resource value rv = 10 and 
ncol(pip) = 3 then the output matrix will have
+  # 10*3= 30 rows and 1:10 hyper-paramters for i-the pipeline 11:20 for 
(i+1)-th pipeline and so on
+  # this matrix stores no. of hps, values of hps, and flags
+  paramMatrix = matrix(0, rows=ncol(pipeline)*no_of_res, 
cols=max(paramCount)+NUM_META_FLAGS+1)
+
   for(i in 1:ncol(pipeline)) {
     index = as.scalar(indexes[i])
-    no_of_param = as.integer(as.scalar(hpList[index, 2]))
-
+    no_of_param = as.integer(as.scalar(paramCount[i]))
     # extract hasY and verbose flags
     attachMask = matrix(as.scalar(hpList[index, 3]), rows=no_of_res, cols=1)
-    attachY = matrix(as.scalar(hpList[index, 4]), rows=no_of_res, cols=1)
-    isVerbose = matrix(as.scalar(hpList[index, 5]), rows=no_of_res, cols=1)
-    dataFlag = matrix(as.scalar(hpList[index, 6]), rows=no_of_res, cols=1)
+    attachFD = matrix(as.scalar(hpList[index, 4]), rows=no_of_res, cols=1)
+    attachY = matrix(as.scalar(hpList[index, 5]), rows=no_of_res, cols=1)
+    isVerbose = matrix(as.scalar(hpList[index, 6]), rows=no_of_res, cols=1)
+    dataFlag = matrix(as.scalar(hpList[index, 7]), rows=no_of_res, cols=1)
     
     if(no_of_param > 0) {
-      start = 7
-      t = 7
-      OpParam = matrix(0, no_of_res, no_of_param)
+      paramIdx = START_INDEX
+      typeIdx = START_INDEX
+      OpParam = matrix(0, rows=no_of_res, cols=max(paramCount))
+      
       for(j in 1:no_of_param) {
-        type = as.scalar(hpList[index, t])
-        paramValIndex = (no_of_param) + start
+        type = as.scalar(hpList[index, typeIdx])
+        paramValIndex = (no_of_param) + paramIdx
         minVal =  as.scalar(hpList[index, paramValIndex])
         maxVal = as.scalar(hpList[index, paramValIndex + 1])
         if(type == "FP") {
-          val = rand(rows=no_of_res, cols=1, min=minVal,
-                          max=maxVal, pdf="uniform");
-          OpParam[, j] = val
-        }
-        else if(type == "INT") {
-          # val = ifelse(minVal == maxVal , minVal, as.scalar(sample(maxVal, 
1)));
-          val = sample(maxVal, no_of_res, TRUE)
-          less_than_min = val < minVal
-          val = (less_than_min * minVal) + val
+          val = rand(rows=no_of_res, cols=1, min=minVal,max=maxVal, 
pdf="uniform");
+          OpParam[, j] = val;
+        } else if(type == "INT") {
+          val = sample(maxVal, no_of_res, TRUE);
+          less_than_min = val < minVal;
+          val = (less_than_min * minVal) + val;
           OpParam[, j] = val
-        }
-        else if(type == "BOOL") {
+        } else if(type == "BOOL") {
           if(maxVal == 1) {
-            s = sample(2, no_of_res, TRUE)
-            b = s - 1
-            OpParam[, j] = b
-          }
-          else  OpParam[, j] = matrix(0, rows=no_of_res, cols=1)
-        }
-        else {
+            s = sample(2, no_of_res, TRUE);
+            b = s - 1;
+            OpParam[, j] = b;
+          } else  
+            OpParam[, j] = matrix(0, rows=no_of_res, cols=1)
+        } else {
           # TODO handle string set something like {,,}
           print("invalid data type")
         }
-        start = start + 2
-        t = t + 1
+        paramIdx = paramIdx + 2
+        typeIdx = typeIdx + 1
       }
-      OpParam = cbind(OpParam, attachMask, attachY, isVerbose, dataFlag)
+      # hyper-parameter vector contains no. of hp, values of hp, and flag 
values
+      OpParam = cbind(matrix(no_of_param, rows=nrow(OpParam), cols=1),OpParam, 
attachMask,
+        attachFD, attachY, isVerbose, dataFlag)
     }
     else {
-      OpParam = cbind(attachMask, attachY)
+      # no hyper-parameters, so create a dummy matrix of zeros so flags are 
always aligned
+      dummy = matrix(0, rows=no_of_res, cols=max(paramCount)+1)
+      OpParam = cbind(dummy, attachMask, attachFD, attachY)
       OpParam = cbind(OpParam, isVerbose, dataFlag)
     }
-    while(FALSE){}
-    paramList = append(paramList, OpParam)
-  }
-}
-
-
-# method to convert the operators from a list to a vector representation 
-# so that the could be append in an output matrix
-listToVector = function(List[Unknown] hp, Boolean verbose)
-return (Matrix[Double] hp_vec)
-{
-  hp_vec = matrix(0,1,1)
-  len = length(hp)
-  for(k in 1:len) {
-    mat = as.matrix(hp[k])
-    hpy = cbind(as.matrix(ncol(mat)), mat)
-    hp_vec = cbind(hp_vec, hpy)
-  }
-  hp_vec = hp_vec[1, 2:ncol(hp_vec)]
-}
-
-# function to classify the data using cross validation
-fclassify = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask, 
Matrix[Double] MLhp,
-  Double testAccuracy, Boolean isWeighted, Integer cv=3)
-  return (Double accuracy)
-{
- 
-  if(max(Y) == min(Y)) {
-    print("Y contains only one class")
-    accuracy = as.double(0)
-  }
-  else { 
-    print("STARTING "+cv+" CROSS VALIDATIONS")
-    # do the k = 3 cross validations
-    accuracyMatrix = crossV(X, Y, cv, mask, MLhp, isWeighted)
-    accuracyMatrix = removeEmpty(target=accuracyMatrix, margin="rows")
-    acc = colMeans(accuracyMatrix)
-    accuracy = as.scalar(acc[1,1])
-    print("validation accuracy "+accuracy)
+    paramMatrix[((i-1)*no_of_res)+1:i*no_of_res, 1:ncol(OpParam)] = OpParam
   }
 }
 
@@ -453,176 +431,104 @@ extractBracketWinners = function(Matrix[Double] 
pipeline, Matrix[Double] hyperpa
   
 }
 
-
-
-# smote wrapper for doing relative over-sampling
-SMOTE  = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask, 
Integer remainingRatio, Boolean verbose)
-return (Matrix[Double] XY)
-{
-  XY = order(target = cbind(Y, X),  by = 1, decreasing=FALSE, 
index.return=FALSE)
-  # get the class count 
-  classes = table(XY[, 1], 1)
-  print("before smote")
-  print(toString(classes))
-  while(FALSE){}
-  start_class = 1
-  end_class = 0
-  k = table(XY[, 1], 1)
-  getMax = max(k)
-  maxKIndex = as.scalar(rowIndexMax(t(k)))
-  outSet = matrix(0, 0, ncol(XY))
-    print("remaining ration before "+remainingRatio)
-  remainingRatio = ifelse((remainingRatio%%100) >= 50, remainingRatio+(100 - 
(remainingRatio%%100)),
-    remainingRatio-(remainingRatio%%100))
-  print("remaining ration after "+remainingRatio)
-  for(i in 1: nrow(k)) {
-    end_class = end_class + as.scalar(classes[i])
-    class_t = XY[start_class:end_class, ]
-    # remainingRatio = (round(getMax/nrow(class_t)) - 1) * 100
-    if((i != maxKIndex)) {
-      synthesized = smote(class_t[, 2:ncol(XY)], mask, remainingRatio, 1, 
FALSE)
-      synthesized = cbind(matrix(as.scalar(class_t[2,1]), nrow(synthesized), 
1), synthesized)
-      outSet = rbind(outSet, synthesized)
-      if(verbose) {
-        print("max value: "+getMax)
-        print("values of i: "+i)
-        print("remaining ratio: "+remainingRatio)
-      }
-    }
-    start_class = end_class + 1
-  }
-  
-  XY = rbind(XY, synthesized)
-  Y = XY[, 1]
-  X = XY[, 2:ncol(XY)]
-  XY = cbind(X,Y)
-  classes = table(Y, 1)
-  print("after smote")
-  print(toString(classes))
-}
-
-# constraints over hyper parameters
-verifyHp = function(Integer index, Frame[Unknown] pip, Double minVal, Double 
maxVal, Integer paraNo)
-return (Double minVal, Double maxVal) {
-  op = as.scalar(pip[1,index])
-  # 1. if next op is pca then current op should not leave NaNs in data
-  # 2. if next op is mice then current op should not replace NaNs with zeros
-  
-  if((op == "outlierBySd" | op == "outlierByIQR") & index < ncol(pip) & paraNo 
== 2)
-  {
-    nextOp = as.scalar(pip[1, index + 1])
-    if(nextOp == "pca" | nextOp == "abstain" | nextOp == "SMOTE")
-    {
-      maxVal = 1.0
-    }
-    if(nextOp == "mice")
-    {
-      minVal = 2.0
-    }
-  }
-  # print("now min and max val ")
-  # print(minVal+" "+maxVal)
-  
-}
-
-
-#####################################
-# The function will replace the null with default values
-######################################
-fillDefault = function(Matrix[Double] X)
-return(Matrix[Double] X){
-  defaullt = round(colMaxs(X) - colMins(X))
-  Mask = is.na(X)
-  X = replace(target=X, pattern=NaN, replacement=0)
-  Mask = Mask * defaullt
-  X = X + Mask
-}
-
-#####################################
+###########################################################################
 # The function will return the max performance by each individual pipeline
-######################################
+############################################################################
 getMaxPerConf = function(Matrix[Double] pipelines)
 return (Frame[Unknown] maxperconf)
 {
-  tab = removeEmpty(target=table(pipelines[, 2], pipelines[, 3], pipelines[, 
1]), margin="cols")  
+  tab = removeEmpty(target=table(pipelines[, 2], pipelines[, 3], pipelines[, 
1]), margin="cols")
   maxperconf = frame(0, rows=max(pipelines[, 2]), cols=1)
-  maxperconf = as.frame(t(colMaxs(tab)))
-
+  maxperconf[1:ncol(tab),] = as.frame(t(colMaxs(tab)))
 }
 
 
-#####################################
-# The function will check if the pipeline have zero hyper-parameters
-# then it should not use more resource iterations and should be executed once
-######################################
-isResourceOptimal = function(List[Unknown] param, Boolean verbose)
-return(Boolean validForResources) 
+# function to classify the data using cross validation
+fclassify = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask, 
Matrix[Double] MLhp,
+  Double testAccuracy, Boolean isWeighted, Integer cv=3)
+  return (Double accuracy)
 {
-  validForResources = FALSE
-
-  count = 0
-  for(i in 1:length(param))
-  {
-    hp = as.matrix(param[i])
-    if(ncol(hp) > 4)
-      count += 1
+ 
+  if(max(Y) == min(Y)) {
+    print("Y contains only one class")
+    accuracy = as.double(0)
+  }
+  else { 
+    print("STARTING "+cv+" CROSS VALIDATIONS")
+    # do the k = 3 cross validations
+    accuracyMatrix = crossV(X, Y, cv, mask, MLhp, isWeighted)
+    accuracyMatrix = removeEmpty(target=accuracyMatrix, margin="rows")
+    acc = colMeans(accuracyMatrix)
+    accuracy = as.scalar(acc[1,1])
+    print("validation accuracy "+accuracy)
   }
-  validForResources = count > 0
 }
 
+# # ######################################################################
+# # # # Function for cross validation using hold out method
+# # # # Inputs: The input dataset X, Y and the value of k validation, mask of 
the 
+# # # # dataset for OHE of categorical columns, vector of ML hyper-parameters 
identified 
+# # # # via gridsearch and a boolean value of (un)weighted accuracy.
+# # # # Output: It return a matrix having the accuracy of each fold.
+# # ######################################################################
+
+crossV = function(Matrix[double] X, Matrix[double] y, Integer k, 
Matrix[Double] mask,
+  Matrix[Double] MLhp, Boolean isWeighted) 
+return (Matrix[Double] accuracyMatrix)
+{
 
+  accuracyMatrix = matrix(0, k, 1)
 
-#######################################################################
-# Wrapper of transformencode OHE call, to call inside eval as a function
-# Inputs: The input dataset X, and  mask of the columns
-# Output: OHEd matrix X
-#######################################################################
+  dataList = list()
+  testL = list()
+  data = order(target = cbind(y, X),  by = 1, decreasing=FALSE, 
index.return=FALSE)
+  classes = table(data[, 1], 1)
+  ins_per_fold = classes/k
+  start_fold = matrix(1, rows=nrow(ins_per_fold), cols=1)
+  fold_idxes = cbind(start_fold, ins_per_fold)
 
-dummycoding = function(Matrix[Double] X, Matrix[Double] mask)
-return (Matrix[Double] dX_train) {
-  X = replace(target=X, pattern=NaN, replacement=0)
-  idx = vectorToCsv(mask)
-  
-  # specifications for one-hot encoding of categorical features
-  jspecDC = "{ids:true, dummycode:["+idx+"]}";
-  # OHE of categorical features
-  [dX_train, dM] = transformencode(target=as.frame(X), spec=jspecDC);
-
-}
+  start_i = 0; end_i = 0; idx_fold = 1;;
+  for(i in 1:k)
+  {
+    fold_i = matrix(0, 0, ncol(data))
+    start=0; end=0; 
+    for(j in 1:nrow(classes))
+    {
+      idx = as.scalar(classes[j, 1])
+      start = end + 1;
+      end = end + idx
+      class_j =  data[start:end, ]
 
+      start_i = as.scalar(fold_idxes[j, 1]);
+      end_i = as.scalar(fold_idxes[j, 2])
 
-#######################################################################
-# Wrapper of imputeByFD OHE call, to call inside eval as a function
-# Inputs: The input dataset X, and  mask of the columns and threshold value
-# Output: filled matrix X
-#######################################################################
+      fold_i = rbind(fold_i, class_j[start_i:end_i, ])
+    }
 
-imputeByFd = function(Matrix[Double] X, Matrix[Double] mask, Double threshold)
-return (Matrix[Double] X_filled)
-{
-  
-  FD = discoverFD(replace(target=X, pattern=NaN, replacement=1), mask, 
threshold)
-  diagonal = diag(FD)
+    dataList = append(dataList, fold_i)
+    fold_idxes[, 1] = fold_idxes[, 2] + 1
+    fold_idxes[, 2] += ins_per_fold
+    while(FALSE){}
+  }
 
-  for(i in 1: nrow(FD))
+  for(i in seq(1,k))
   {
-    for(j in 1:ncol(FD)) {
-    if(as.scalar(FD[i, j]) > threshold)
-      X = imputeByFD(X, i, j, threshold, FALSE)
-    
-    }
+    [trainList, hold_out] = remove(dataList, i)
+    trainset = rbind(trainList)
+    testset = as.matrix(hold_out)
+    trainX = trainset[, 2:ncol(trainset)]
+    trainy = trainset[, 1]
+    testX = testset[, 2:ncol(testset)]
+    testy = testset[, 1]
+    beta = multiLogReg(X=trainX, Y=trainy, icpt=1, reg=as.scalar(MLhp[1,1]), 
tol= 1e-9, 
+    maxi=as.scalar(MLhp[1,2]), maxii= 50, verbose=FALSE);
+    [prob, yhat, a] = multiLogRegPredict(testX, beta, testy, FALSE)
+    accuracy = getAccuracy(testy, yhat, isWeighted)
+    accuracyMatrix[i] = accuracy
   }
-  X_filled = X
 }
 
-#######################################################################
-# Wrapper of na_lof to call inside eval as a function
-# Output: filled matrix X
-#######################################################################
 
-forward_fill = function(Matrix[Double] X, Boolean op, Boolean verbose)
-return (Matrix[Double] X_filled)
-{ 
-  option = ifelse(op, "locf", "nocb")
-  X_filled = na_locf(X=X, option=option, verbose=verbose)
-}
+# data=["#MissingValues", "MinVla", "MaxVal", "AverageMin", "AverageMax", 
+# "#CategoricalFeatures", "#NumericFeatures", "Mean", "#Outliers", 
"#OHEfeatures", "#Classes",
+# "Imbalance", "#rows", "#cols", ""]
\ No newline at end of file
diff --git a/scripts/builtin/discoverFD.dml b/scripts/builtin/discoverFD.dml
index 49d013b..1bb0a21 100644
--- a/scripts/builtin/discoverFD.dml
+++ b/scripts/builtin/discoverFD.dml
@@ -41,6 +41,7 @@
 m_discoverFD = function(Matrix[Double] X, Matrix[Double] Mask, Double 
threshold)
   return(Matrix[Double] FD)
 {
+  
   if( threshold < 0 | threshold > 1 )
     stop("Stopping due to invalid input, threshold required in interval [0, 1] 
found "+threshold)
 
diff --git a/scripts/builtin/executePipeline.dml 
b/scripts/builtin/executePipeline.dml
index 90a9902..3ca0240 100644
--- a/scripts/builtin/executePipeline.dml
+++ b/scripts/builtin/executePipeline.dml
@@ -20,7 +20,7 @@
 #-------------------------------------------------------------
 
 s_executePipeline = function(Frame[String] pipeline, Matrix[Double] X,  
Matrix[Double] Y, Matrix[Double] mask,
-  List[Unknown] hyperParameters, Integer resource_index, Boolean verbose)
+  Matrix[Double] FD, Matrix[Double] hyperParameters, Integer flagsCount, 
Boolean verbose)
   return (Matrix[Double] X, Matrix[Double] Y)
 {
 
@@ -30,16 +30,13 @@ s_executePipeline = function(Frame[String] pipeline, 
Matrix[Double] X,  Matrix[D
     print("checks   rows in X = "+nrow(X)+" rows in Y = "+nrow(Y)+" cols in X 
= "+ncol(X)+" col in Y = "+ncol(Y))
     print("pipeline in execution "+toString(pipeline))
     print("pipeline hps "+toString(hyperParameters))
-    print("index "+toString(resource_index))
     while(FALSE){}
   }
   for(i in 1:ncol(pipeline)) {
-
     op = as.scalar(pipeline[1,i])
-    [hp, withClass, dataFlag] = matrixToList(X, Y, mask, 
as.matrix(hyperParameters[i]), resource_index, op)
+    [hp, withClass, dataFlag] = matrixToList(X, Y, mask, FD, 
hyperParameters[i], flagsCount, op)
     Xclone = X
     X = eval(op, hp)
-    while(FALSE){}
     # dataFlag 0 = only on numeric, 1 = on whole data
     X = confirmData(X, Xclone, mask, dataFlag)
     if(withClass)
@@ -50,21 +47,21 @@ s_executePipeline = function(Frame[String] pipeline, 
Matrix[Double] X,  Matrix[D
 
     X = confirmMeta(X, mask)
   }
-  print("END OF PIPELINE"+toString(pipeline))
-  while(FALSE){}
 }
 
 # This function will convert the matrix row-vector into list
-matrixToList = function(Matrix[Double] X,  Matrix[Double] Y, Matrix[Double] 
mask, Matrix[Double] p, Integer resource_index, String op)
+matrixToList = function(Matrix[Double] X,  Matrix[Double] Y, Matrix[Double] 
mask, Matrix[Double] FD,
+  Matrix[Double] p, Integer flagsCount, String op)
   return (List[Unknown] l, Boolean hasY, Integer dataFlag)
 {
-
+  NUM_META_FLAGS = flagsCount;
   hasY = FALSE
 
   dataFlag = as.integer(as.scalar(p[1, ncol(p)]))
   hasVerbose = as.scalar(p[1, ncol(p) - 1])
   yFlag = as.scalar(p[1, ncol(p) - 2])
-  maskFlag = as.integer(as.scalar(p[1, ncol(p)-3]))
+  fDFlag = as.integer(as.scalar(p[1, ncol(p)-3]))
+  maskFlag = as.integer(as.scalar(p[1, ncol(p)-4]))
   
   ######################################################
   # CHECK FOR DATA FLAG
@@ -90,6 +87,13 @@ matrixToList = function(Matrix[Double] X,  Matrix[Double] Y, 
Matrix[Double] mask
     hasY = TRUE
   }
   ######################################################
+  # CHECK FOR FD APPEND FLAG
+  if(fDFlag == 1)
+  {
+    l = append(l, FD)
+  }
+  
+  ######################################################
   # CHECK FOR MASK APPEND FLAG
   if(maskFlag == 1)
   {
@@ -97,20 +101,22 @@ matrixToList = function(Matrix[Double] X,  Matrix[Double] 
Y, Matrix[Double] mask
   }
   #####################################################
   # POPULATE HYPER PARAM
-  if(ncol(p) > 4) {
-    if(op == "pca") {
-      ratio = as.scalar(p[resource_index,1])
-      p[resource_index, 1] = as.integer(ncol(X) - ratio)
-    }
-    for(i in 1:ncol(p)-4)
-      l = append(l, as.scalar(p[resource_index,i]))
+  # get the number of hyper-parameters and loop till that
+  no_of_hyperparam = as.scalar(p[1,1])
+  if(no_of_hyperparam > 0) {
+    # if(op == "pca") {
+      # # convert the number parameters to a ration related to OHE columns
+      # ratio = as.scalar(p[resource_index,1])
+      # p[resource_index, 1] = as.integer(ncol(X) - ratio)
+    # }
+    for(i in 1:no_of_hyperparam)
+      l = append(l, as.scalar(p[1,(i+1)]))
   }
   ######################################################
   # CHECK FOR VERBOSE FLAG
   if(hasVerbose == 1)
     l = append(l, FALSE)
-   # print("+++++++++++HP++++++++++++++")
-   # print(toString(l, rows=2))
+
 }
 
 confirmMeta = function(Matrix[Double] X, Matrix[Double] mask)
@@ -188,6 +194,7 @@ return (Matrix[Double] X)
     # print("recreated data \n"+toString(X, rows = 20))
 }
 
+
 #######################################################################
 # Wrapper of transformencode OHE call, to call inside eval as a function
 # Inputs: The input dataset X, and  mask of the columns
@@ -205,3 +212,90 @@ return (Matrix[Double] dX_train) {
   [dX_train, dM] = transformencode(target=as.frame(X), spec=jspecDC);
 
 }
+
+
+
+#######################################################################
+# Wrapper of imputeByFD, to call inside eval as a function
+# Inputs: The input dataset X, the FD mask of the columns, and a threshold value
+# Output: filled matrix X
+#######################################################################
+
+imputeByFd = function(Matrix[Double] X, Matrix[Double] FD,  Double threshold)
+return (Matrix[Double] X_filled)
+{
+  
+  for(i in 1: nrow(FD))
+  {
+    for(j in 1:ncol(FD)) {
+      if(as.scalar(FD[i, j]) > 0 & (min(X[, i]) != 0) & (min(X[, j]) != 0) & 
(sum(FD[, j]) != nrow(FD)))
+        X = imputeByFD(X, i, j, threshold, FALSE)
+    }
+  }
+  X_filled = X
+}
+
+#######################################################################
+# Wrapper of na_locf to call inside eval as a function
+# Output: filled matrix X
+#######################################################################
+
+forward_fill = function(Matrix[Double] X, Boolean op, Boolean verbose)
+return (Matrix[Double] X_filled)
+{ 
+  option = ifelse(op, "locf", "nocb")
+  X_filled = na_locf(X=X, option=option, verbose=verbose)
+}
+
+
+
+# smote wrapper for doing relative over-sampling
+SMOTE  = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask, 
Integer remainingRatio, Boolean verbose)
+return (Matrix[Double] XY)
+{
+  XY = order(target = cbind(Y, X),  by = 1, decreasing=FALSE, 
index.return=FALSE)
+  synthesized = matrix(0,0,0) # initialize variable
+  # get the class count 
+  classes = table(XY[, 1], 1)
+  start_class = 1
+  end_class = 0
+  k = table(XY[, 1], 1)
+  getMax = max(k)
+  maxKIndex = as.scalar(rowIndexMax(t(k)))
+  outSet = matrix(0, 0, ncol(XY))
+  remainingRatio = ifelse((remainingRatio%%100) >= 50, remainingRatio+(100 - 
(remainingRatio%%100)),
+  remainingRatio-(remainingRatio%%100))
+  for(i in 1: nrow(k)) {
+    end_class = end_class + as.scalar(classes[i])
+    class_t = XY[start_class:end_class, ]
+    if((i != maxKIndex)) {
+      synthesized = smote(class_t[, 2:ncol(XY)], mask, remainingRatio, 1, 
FALSE)
+      synthesized = cbind(matrix(as.scalar(class_t[2,1]), nrow(synthesized), 
1), synthesized)
+      outSet = rbind(outSet, synthesized)
+    }
+    start_class = end_class + 1
+  }
+  
+  XY = rbind(XY, synthesized)
+  Y = XY[, 1]
+  X = XY[, 2:ncol(XY)]
+  XY = cbind(X,Y)
+  classes = table(Y, 1)
+}
+
+
+
+
+########################################################
+# The function will replace NaNs with per-column default values (max - min)
+########################################################
+fillDefault = function(Matrix[Double] X)
+return(Matrix[Double] X){
+  defaullt = round(colMaxs(X) - colMins(X))
+  Mask = is.na(X)
+  X = replace(target=X, pattern=NaN, replacement=0)
+  Mask = Mask * defaullt
+  X = X + Mask
+}
+
+
diff --git a/scripts/builtin/mice.dml b/scripts/builtin/mice.dml
index eddbd73..3372886 100644
--- a/scripts/builtin/mice.dml
+++ b/scripts/builtin/mice.dml
@@ -54,7 +54,7 @@ m_mice= function(Matrix[Double] X, Matrix[Double] cMask, 
Integer iter = 3,
          expected number of columns > 1 found: "+ncol(X))
   
   if(ncol(cMask) != ncol(X))
-    stop("Dimension mismatch: the columns in X != columns in mask")
+    stop("MICE Dimension mismatch: the columns in X != columns in mask")
   
     
   lastIndex = ncol(X);
@@ -204,8 +204,9 @@ m_mice= function(Matrix[Double] X, Matrix[Double] cMask, 
Integer iter = 3,
 
 colDist= function(Matrix[Double] X, Matrix[Double] mask)
 return (Matrix[Double] dist){
+ 
   dist = matrix(1, 1, ncol(X))
-  X = replace(target=X, pattern=0, replacement=min(X))
+  X = replace(target=X, pattern=0, replacement=max(X)+1)
   parfor(i in 1:ncol(X))
   {
     if(as.scalar(mask[,i]) == 1)
@@ -214,5 +215,6 @@ return (Matrix[Double] dist){
       dist[1, i] = sum(distT != 0)
     }
   }
+
 }
 
diff --git a/scripts/builtin/multiLogRegPredict.dml 
b/scripts/builtin/multiLogRegPredict.dml
index a2c7e8c..a7bbc23 100644
--- a/scripts/builtin/multiLogRegPredict.dml
+++ b/scripts/builtin/multiLogRegPredict.dml
@@ -51,7 +51,7 @@ m_multiLogRegPredict = function(Matrix[Double] X, 
Matrix[Double] B, Matrix[Doubl
   }
   if(ncol(X) < nrow(B)-1)
     stop("multiLogRegPredict: mismatching ncol(X) and nrow(B): "+ncol(X)+" 
"+nrow(B));
-
+  accuracy = 0.0 # initialize variable 
   beta = B[1:ncol(X), ];
   intercept = ifelse(ncol(X)==nrow(B), matrix(0,1,ncol(B)), B[nrow(B),]);
   linear_terms = X %*% beta + matrix(1,nrow(X),1) %*% intercept;
diff --git a/scripts/builtin/pca.dml b/scripts/builtin/pca.dml
index 1cd7cfd..3054d4c 100644
--- a/scripts/builtin/pca.dml
+++ b/scripts/builtin/pca.dml
@@ -26,7 +26,7 @@
 # NAME   TYPE    DEFAULT  MEANING
 # 
---------------------------------------------------------------------------------------------
 # X      Matrix  ---      Input feature matrix
-# K      Int     2      Number of reduced dimensions (i.e., columns)
+# K      Int     2        Number of reduced dimensions (i.e., columns)
 # Center Boolean TRUE     Indicates whether or not to center the feature matrix
 # Scale  Boolean TRUE     Indicates whether or not to scale the feature matrix
 
@@ -41,6 +41,11 @@
 m_pca = function(Matrix[Double] X, Integer K=2, Boolean center=TRUE, Boolean 
scale=TRUE)
   return (Matrix[Double] Xout, Matrix[Double] Mout, Matrix[Double] Centering, 
Matrix[Double] ScaleFactor) 
 {
+  if(K > ncol(X)) {
+    print("PCA: invalid parameter value, the value of k should not be greater 
than the no. of columns in X ")
+    print("setting k = ncol(X)")
+    K = ncol(X)
+  }
   N = nrow(X);
   D = ncol(X);
 
diff --git a/scripts/builtin/splitBalanced.dml 
b/scripts/builtin/splitBalanced.dml
index 4428443..32b87d7 100644
--- a/scripts/builtin/splitBalanced.dml
+++ b/scripts/builtin/splitBalanced.dml
@@ -63,6 +63,7 @@ return (Matrix[Double] X_train, Matrix[Double] y_train, 
Matrix[Double] X_test,
     print("train ratio \n"+toString(classes_ratio_train))
     print("test ratio \n"+toString(classes_ratio_test))
   }
+
   for(i in 1:nrow(classes))
   {
     end_class = end_class + as.scalar(classes[i])
diff --git a/scripts/pipelines/properties/param.csv 
b/scripts/pipelines/properties/param.csv
index 1ab4218..c533e07 100644
--- a/scripts/pipelines/properties/param.csv
+++ b/scripts/pipelines/properties/param.csv
@@ -1,16 +1,16 @@
-name,param_no,maskFlag,yFlag,verboseFlag,dataFlag,dataType,ranges,st1,en1,st2,en1,en2,st3,en3
-outlierByIQR,3,0,0,1,0,FP,INT,INT,1,5,1,2,1,10
-outlierBySd,3,0,0,1,0,FP,INT,INT,1,5,2,2,1,10
-winsorize,0,0,0,1,0,,,,,,,,,
-imputeByMean,0,1,0,0,2,,,,,,,,,
-imputeByMedian,0,1,0,0,2,,,,,,,,,
-mice,2,1,0,1,2,INT,FP,1,3,0.5,0.9,,,
-abstain,1,0,1,1,2,FP,0.6,0.8,,,,,,
-SMOTE,1,1,1,1,2,INT,100,200,,,,,,
-downSample,0,0,1,0,2,,,,,,,,,
-pca,3,0,0,0,2,INT,BOOL,BOOL,1,10,0,1,0,0
-fillDefault,0,0,0,0,2,,,,,,,,,
-dummycoding,0,1,0,0,2,,,,,,,,,
-scale,2,0,0,0,0,BOOL,BOOL,0,1,0,1,,,
-forward_fill,1,0,0,1,0,BOOL,0,1,,,,,,
-imputeByFd,1,1,0,0,2,FP,0.7,1,,,,,,
\ No newline at end of file
+name,param_no,maskFlag,FDFlag,yFlag,verboseFlag,dataFlag,dataType,ranges,st1,en1,st2,en1,en2,st3,en3
+outlierByIQR,3,0,0,0,1,0,FP,INT,INT,1,5,1,2,1,10
+outlierBySd,3,0,0,0,1,0,FP,INT,INT,1,5,2,2,1,10
+winsorize,0,0,0,0,1,0,,,,,,,,,
+imputeByMean,0,1,0,0,0,2,,,,,,,,,
+imputeByMedian,0,1,0,0,0,2,,,,,,,,,
+mice,2,1,0,0,1,2,INT,FP,1,3,0.5,1.0,,,
+abstain,1,0,0,1,1,2,FP,0.6,0.8,,,,,,
+SMOTE,1,1,0,1,1,2,INT,100,200,,,,,,
+downSample,0,0,0,1,0,2,,,,,,,,,
+pca,3,0,0,0,0,2,INT,BOOL,BOOL,100,200,0,1,0,0
+fillDefault,0,0,0,0,0,2,,,,,,,,,
+dummycoding,0,1,0,0,0,2,,,,,,,,,
+scale,2,0,0,0,0,0,BOOL,BOOL,0,1,0,1,,,
+forward_fill,1,0,0,0,1,0,BOOL,0,1,,,,,,
+imputeByFd,1,0,1,0,0,2,FP,0.55,1,,,,,,
\ No newline at end of file
diff --git a/scripts/pipelines/properties/primitives.csv 
b/scripts/pipelines/properties/primitives.csv
index 19eb7d8..98f3874 100644
--- a/scripts/pipelines/properties/primitives.csv
+++ b/scripts/pipelines/properties/primitives.csv
@@ -2,5 +2,4 @@ OTLR,MVI,NR,CI,DIM,DUMMY,SCALE
 winsorize,imputeByMean,abstain,SMOTE,pca,dummycoding,scale
 outlierBySd,imputeByMedian,,,,,
 outlierByIQR,mice,,,,,
-,fillDefault,,,,,
-,forward_fill,,,,,
\ No newline at end of file
+,fillDefault,,,,,
\ No newline at end of file
diff --git a/scripts/pipelines/scripts/logicalFunc.dml 
b/scripts/pipelines/scripts/logicalFunc.dml
index 8f5365e..0ddc1bb 100644
--- a/scripts/pipelines/scripts/logicalFunc.dml
+++ b/scripts/pipelines/scripts/logicalFunc.dml
@@ -23,11 +23,10 @@
 source("scripts/pipelines/scripts/utils.dml") as utils;
 
 # incomplete implementation of automatic logical pipelines
-generateLogicalSeed = function(Matrix[Double] X, Matrix[Double] Y, 
Matrix[Double] mask)
+generateLogicalSeed = function(Matrix[Double] X, Matrix[Double] Y, 
Matrix[Double] mask, String target)
 return(Frame[String] logical){
   
-  # detection = 
-  logical = as.frame("")
+  logical = frame(data=["NULL"], rows=1, cols=1, schema=["STRING"])
   no_of_mv = sum(is.na(X))
   X = replace(target= X, pattern = NaN, replacement = 0)
   
@@ -42,10 +41,13 @@ return(Frame[String] logical){
   count3sdplus = sum(X > (colMean + 3*colSd )) 
   count3sdminus = sum(X < (colMean - 3*colSd )) 
   outliers = count3sdplus + count3sdminus
-  ctab = table(Y, 1)
-  minCatPer = min(ctab) / nrow(ctab)
-  maxCat = max(ctab) / nrow(ctab)
-  
+  minCat = 0.0 # initialize variables
+  maxCat = 0.0
+  if(target != "compare") {
+    ctab = table(Y, 1)
+    minCat = min(ctab)
+    maxCat = max(ctab)
+  }
   mv_to_data_ratio = no_of_mv/(nrow(X) * ncol(X))
   out_to_data_ratio = outliers/ (nrow(X) * ncol(X))
   
@@ -53,17 +55,22 @@ return(Frame[String] logical){
     logical = cbind(logical, as.frame("MVI"))
   if(out_to_data_ratio > 0.1)
     logical = cbind(logical, as.frame("OTLR"))
-  if(maxVal - minVal > 1000)
-    logical = cbind(logical, as.frame("SCALE"))
-  if((maxCat - minCatPer) > 0.3)
-    logical = cbind(logical, as.frame("CI"))
-  if(sum(mask) > 0) {
-    logical = cbind(logical, as.frame("DUMMY"))
-    if(sum(distinctCategories) > 5*ncol(X))
-      logical = cbind(logical, as.frame("DIM"))
-  
-  logical = logical[, 2:ncol(logical)]
+  if(target != "compare") {
+    if(maxVal - minVal > 1000 )
+      logical = cbind(logical, as.frame("SCALE"))
+    if((maxCat - minCat) > (minCat/2))
+      logical = cbind(logical, as.frame("CI"))
+    if(sum(mask) > 0) {
+      logical = cbind(logical, as.frame("DUMMY"))
+      if(sum(distinctCategories) > 5*ncol(X))
+        logical = cbind(logical, as.frame("DIM"))
+    }
   }
+   
+  if(ncol(logical) == 1)
+    logical = frame(["OTLR", "MVI"], rows=1, cols=2, schema=["STRING", 
"STRING"])
+  else
+    logical = logical[, 2:ncol(logical)]
 }
 
 
@@ -73,26 +80,28 @@ return(Frame[Unknown] transformLogical) {
   transformLogical = frame(0, rows=3, cols= ncol(seed)+2)
  
   # case 1: MVI and OTLR
-  if(as.scalar(seed[1,1]) == "MVI" & as.scalar(seed[1,2]) == "OTLR")
+  if(ncol(seed) > 1)
   {
-   # t1: swap MV and OTLR 
-    transformLogical[2,1] = seed[1,2]
-    transformLogical[2,2] = seed[1,1]
-    transformLogical[2, 3:ncol(seed)] = seed[1,3:ncol(seed)]
+    if(as.scalar(seed[1,1]) == "MVI" & as.scalar(seed[1,2]) == "OTLR") {
+      # t1: swap MV and OTLR 
+      transformLogical[2,1] = seed[1,2]
+      transformLogical[2,2] = seed[1,1]
+      transformLogical[2, 3:ncol(seed)] = seed[1,3:ncol(seed)]
   
-    # t2: if the sequence is MVI, OTLR then introduce an MVI after to avoid 
nulls
-
-    transformLogical[3,1:2] = seed[1,1:2]
-    transformLogical[3,3] = seed[1,1]
-    transformLogical[3, 4:ncol(seed)] = seed[1,3:ncol(seed)]
-  }
-  # case 2: OTLR
-  else if(as.scalar(seed[1, 1]) == "OTLR")
-  {
-    # if first operation is OTLR then add a MVI to fill in MVs introduced by 
OTLR
-    transformLogical[2,1] = seed[1, 1]
-    transformLogical[2,2] = "MVI"
-    transformLogical[2, 3:ncol(seed)] = seed[1,2:ncol(seed)]
+    
+      # t2: if the sequence is MVI, OTLR then introduce an MVI after to avoid 
null
+      transformLogical[3,1:2] = seed[1,1:2]
+      transformLogical[3,3] = seed[1,1]
+      transformLogical[3, 4:ncol(seed)] = seed[1,3:ncol(seed)]
+    }
+    # case 2: OTLR
+    else if(as.scalar(seed[1, 1]) == "OTLR" & as.scalar(seed[1, 2]) != "MVI" )
+    {
+      # if first operation is OTLR then add a MVI to fill in MVs introduced by 
OTLR
+      transformLogical[2,1] = seed[1, 1]
+      transformLogical[2,2] = "MVI"
+      transformLogical[2, 3:ncol(seed)] = seed[1,2:ncol(seed)]
+    }
   }
   transformLogical[1, 1:ncol(seed)] = seed
   transformLogical = map(transformLogical, "var -> var.replace(\"0\", \"\")")
diff --git a/scripts/pipelines/scripts/utils.dml 
b/scripts/pipelines/scripts/utils.dml
index 1214d73..aa9338a 100644
--- a/scripts/pipelines/scripts/utils.dml
+++ b/scripts/pipelines/scripts/utils.dml
@@ -114,11 +114,11 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY, 
Double ratio)
   }
 }
 
-#######################################################################
-# Wrapper of transformencode OHE call, to call inside eval as a function
-# Inputs: The input dataset X, and  mask of the columns
-# Output: OHEd matrix X
-#######################################################################
+# #######################################################################
+# # Wrapper of transformencode OHE call, to call inside eval as a function
+# # Inputs: The input dataset X, and  mask of the columns
+# # Output: OHEd matrix X
+# #######################################################################
 
 dummycoding = function(Matrix[Double] X, Matrix[Double] mask)
 return (Matrix[Double] dX_train) {
@@ -132,6 +132,26 @@ return (Matrix[Double] dX_train) {
 
 }
 
+# # function to classify the data using cross validation
+# fclassify = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] 
mask, Matrix[Double] MLhp,
+  # Double testAccuracy, Boolean isWeighted, Integer cv=3)
+  # return (Double accuracy)
+# {
+ 
+  # if(max(Y) == min(Y)) {
+    # print("Y contains only one class")
+    # accuracy = as.double(0)
+  # }
+  # else { 
+    # print("STARTING "+cv+" CROSS VALIDATIONS")
+    # # do the k = 3 cross validations
+    # accuracyMatrix = crossV(X, Y, cv, mask, MLhp, isWeighted)
+    # accuracyMatrix = removeEmpty(target=accuracyMatrix, margin="rows")
+    # acc = colMeans(accuracyMatrix)
+    # accuracy = as.scalar(acc[1,1])
+    # print("validation accuracy "+accuracy)
+  # }
+# }
 
 
 
@@ -155,13 +175,13 @@ return (Matrix[Double] dX_train) {
 # }
 
 
-# # ######################################################################
-# # # # Function for cross validation using hold out method
-# # # # Inputs: The input dataset X, Y and the value of k validation, mask of 
the 
-# # # # dataset for OHE of categorical columns, vector of ML hyper-parameters 
identified 
-# # # # via gridsearch and a boolean value of (un)weighted accuracy.
-# # # # Output: It return a matrix having the accuracy of each fold.
-# # ######################################################################
+# # # ######################################################################
+# # # # # Function for cross validation using hold out method
+# # # # # Inputs: The input dataset X, Y and the value of k validation, mask 
of the 
+# # # # # dataset for OHE of categorical columns, vector of ML 
hyper-parameters identified 
+# # # # # via gridsearch and a boolean value of (un)weighted accuracy.
+# # # # # Output: It return a matrix having the accuracy of each fold.
+# # # ######################################################################
 
 # crossV = function(Matrix[double] X, Matrix[double] y, Integer k, 
Matrix[Double] mask,
   # Matrix[Double] MLhp, Boolean isWeighted) 
@@ -190,7 +210,6 @@ return (Matrix[Double] dX_train) {
       # end = end + idx
       # class_j =  data[start:end, ]
 
-
       # start_i = as.scalar(fold_idxes[j, 1]);
       # end_i = as.scalar(fold_idxes[j, 2])
 
@@ -218,8 +237,117 @@ return (Matrix[Double] dX_train) {
     # accuracy = getAccuracy(testy, yhat, isWeighted)
     # accuracyMatrix[i] = accuracy
   # }
-
 # }
 
 
+######################################################################
+# # Function to measure the quality of data repairs
+# # Inputs: the dirty dataset, the ground-truth clean dataset, the repaired
+# # dataset produced by the pipeline, and the mask of categorical columns
+# # Output: the precision of the repairs, i.e., the fraction of corrections
+# # made that match the ground truth (lower-bounded by 0.001)
+######################################################################
+
+compareValue = function(Matrix[double] dirtyX, Matrix[double] cleanX,  
Matrix[Double] fixedX, Matrix[Double] mask) 
+return (Double precision)
+{
+  dirtyX = replace(target= dirtyX, pattern=NaN, replacement=0)
+  cleanX = replace(target= cleanX, pattern=NaN, replacement=0)
+  fixedX = replace(target= fixedX, pattern=NaN, replacement=0)
+  correctionsRequired = dirtyX != cleanX
+  correctionsMade = (dirtyX != fixedX)
+  allCorrections_ = sum(correctionsMade)
+  match = (abs(cleanX - fixedX) < 0.1)  * correctionsRequired
+  precision = max(0.001, sum(match) / allCorrections_)
+  print("---------------------------------true positives are 
"+toString(precision))
+}
 
+# constraints over hyper parameters
+verifyHp = function(Integer index, Frame[Unknown] pip, Double minVal, Double 
maxVal, Integer paraNo)
+return (Double minVal, Double maxVal) {
+  op = as.scalar(pip[1,index])
+  # 1. if next op is pca then current op should not leave NaNs in data
+  # 2. if next op is mice then current op should not replace NaNs with zeros
+  
+  if((op == "outlierBySd" | op == "outlierByIQR") & index < ncol(pip) & paraNo 
== 2)
+  {
+    nextOp = as.scalar(pip[1, index + 1])
+    if(nextOp == "pca" | nextOp == "abstain" | nextOp == "SMOTE")
+    {
+      maxVal = 1.0
+    }
+    if(nextOp == "mice")
+    {
+      minVal = 2.0
+    }
+  }
+
+}
+
+
+#####################################
+# The function checks whether the pipeline has zero hyper-parameters;
+# if so, it should not use more resource iterations and should be executed once
+######################################
+isResourceOptimal = function(List[Unknown] param, Boolean verbose)
+return(Boolean validForResources) 
+{
+  validForResources = FALSE
+
+  count = 0
+  for(i in 1:length(param))
+  {
+    hp = as.matrix(param[i])
+    if(ncol(hp) > 4)
+      count += 1
+  }
+  validForResources = count > 0
+}
+
+
+###############################################################################################
+# The function will collect the features like statistics and pipelines and 
accuracy 
+# so that they could be used for training a model and predicting pipelines 
without enumeration
+###############################################################################################
+gatherStats = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] 
mask, String target)
+return (Matrix[Double] features)
+{
+
+  features = matrix(0, rows = 1, cols= 14)
+  features[1, 1]=  sum(is.na(X)) # number of missing values
+  X = replace(target= X, pattern = NaN, replacement = 0)
+  num = removeEmpty(target=X, margin="cols", select=(mask == 0))
+  # get the stats
+  features[1, 2] =  min(num) # minimum value
+  features[1, 3] = max(num)
+  features[1, 4] = mean(colMins(num)) # average minimum value
+  features[1, 5] = mean(colMaxs(num)) # average maximum value
+  features[1, 6] = sum(mask) # number of categorical features
+  features[1, 7] = sum(mask == 0) # number of numerical features
+  features[1, 8] = mean(num) # mean value
+  colSd = colSds(num)
+  count3sdplus = sum(num > (colMeans(num) + 3*colSd )) 
+  count3sdminus = sum(num < (colMeans(num) - 3*colSd )) 
+  outliers = count3sdplus + count3sdminus
+  features[1, 9] = outliers
+  # OHE features 
+  OHE = sum(colMaxs(X) * mask)
+  features[1, 10] = OHE
+  if(target != "compare")
+  {
+    ctab = table(Y, 1)
+    features[1, 11] = nrow(ctab) # number of classes
+    minCat = min(ctab) / nrow(ctab)
+    maxCat = max(ctab) / nrow(ctab)
+    # class imbalance 1=YES, 0=NO
+    features[1, 12]= ifelse((maxCat - minCat) > 0.3, 1, 0)
+  }
+  else 
+  {
+    features[1, 11] = 0
+    features[1, 12] = 0
+  }
+  features[1, 13] = nrow(X)
+  features[1, 14] = ncol(X)
+  
+}
\ No newline at end of file
diff --git 
a/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTest.java 
b/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTestClassification.java
similarity index 82%
copy from 
src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTest.java
copy to 
src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTestClassification.java
index e74662e..0ad15e0 100644
--- a/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTest.java
+++ 
b/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTestClassification.java
@@ -27,10 +27,10 @@ import org.junit.Assert;
 import org.junit.Ignore;
 import org.junit.Test;
 
-public class CleaningTest extends AutomatedTestBase {
-       private final static String TEST_NAME1 = "mainScript";
+public class CleaningTestClassification extends AutomatedTestBase {
+       private final static String TEST_NAME1 = "testClassification";
        private final static String TEST_NAME2 = "compareAccuracy";
-       private final static String TEST_CLASS_DIR = SCRIPT_DIR + 
CleaningTest.class.getSimpleName() + "/";
+       private final static String TEST_CLASS_DIR = SCRIPT_DIR + 
CleaningTestClassification.class.getSimpleName() + "/";
 
        protected static final String RESOURCE = 
SCRIPT_DIR+"functions/pipelines/";
        protected static final String DATA_DIR = RESOURCE+"data/";
@@ -51,10 +51,10 @@ public class CleaningTest extends AutomatedTestBase {
        }
 
 
-       @Ignore
+       @Test
        public void testCP1() {
-               runFindPipelineTest(1.0, 5,10, 2,
-                       true, Types.ExecMode.SINGLE_NODE);
+               runFindPipelineTest(0.5, 5,10, 2,
+                       true, "classification", Types.ExecMode.SINGLE_NODE);
        }
 
        @Test
@@ -63,7 +63,7 @@ public class CleaningTest extends AutomatedTestBase {
        }
 
        private void runFindPipelineTest(Double sample, int topk, int 
resources, int crossfold,
-               boolean weightedAccuracy, Types.ExecMode et) {
+               boolean weightedAccuracy, String target, Types.ExecMode et) {
 
                setOutputBuffering(true);
                String HOME = SCRIPT_DIR+"functions/pipelines/" ;
@@ -71,10 +71,11 @@ public class CleaningTest extends AutomatedTestBase {
                try {
                        loadTestConfiguration(getTestConfiguration(TEST_NAME1));
                        fullDMLScriptName = HOME + TEST_NAME1 + ".dml";
-
-                       programArgs = new String[] {"-stats", "-exec", 
"singlenode", "-args", DIRTY, META, PRIMITIVES,
-                               PARAM, String.valueOf(sample), 
String.valueOf(topk), String.valueOf(resources),
-                               String.valueOf(crossfold), 
String.valueOf(weightedAccuracy), output("O"), OUTPUT };
+                       programArgs = new String[] {"-stats", "-exec", 
"singlenode", "-nvargs", "dirtyData="+DIRTY, "metaData="+META,
+                               "primitives="+PRIMITIVES, "parameters="+PARAM, 
"sampleSize="+String.valueOf(sample),
+                               "topk="+String.valueOf(topk), 
"rv="+String.valueOf(resources), "cv="+String.valueOf(crossfold),
+                               "weighted="+ String.valueOf(weightedAccuracy), 
"output="+OUTPUT, "target="+target, "cleanData="+CLEAN,
+                               "O="+output("O")};
 
                        runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
 
diff --git 
a/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTest.java 
b/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTestCompare.java
similarity index 63%
rename from 
src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTest.java
rename to 
src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTestCompare.java
index e74662e..36adfbb 100644
--- a/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTest.java
+++ 
b/src/test/java/org/apache/sysds/test/functions/pipelines/CleaningTestCompare.java
@@ -27,10 +27,9 @@ import org.junit.Assert;
 import org.junit.Ignore;
 import org.junit.Test;
 
-public class CleaningTest extends AutomatedTestBase {
-       private final static String TEST_NAME1 = "mainScript";
-       private final static String TEST_NAME2 = "compareAccuracy";
-       private final static String TEST_CLASS_DIR = SCRIPT_DIR + 
CleaningTest.class.getSimpleName() + "/";
+public class CleaningTestCompare extends AutomatedTestBase {
+       private final static String TEST_NAME1 = "testCompare";
+       private final static String TEST_CLASS_DIR = SCRIPT_DIR + 
CleaningTestCompare.class.getSimpleName() + "/";
 
        protected static final String RESOURCE = 
SCRIPT_DIR+"functions/pipelines/";
        protected static final String DATA_DIR = RESOURCE+"data/";
@@ -47,23 +46,17 @@ public class CleaningTest extends AutomatedTestBase {
        @Override
        public void setUp() {
                addTestConfiguration(TEST_NAME1,new 
TestConfiguration(TEST_CLASS_DIR, TEST_NAME1,new String[]{"R"}));
-               addTestConfiguration(TEST_NAME2,new 
TestConfiguration(TEST_CLASS_DIR, TEST_NAME2,new String[]{"R"}));
        }
 
 
-       @Ignore
-       public void testCP1() {
-               runFindPipelineTest(1.0, 5,10, 2,
-                       true, Types.ExecMode.SINGLE_NODE);
-       }
-
        @Test
-       public void testCP2() {
-               runCleanAndCompareTest( Types.ExecMode.SINGLE_NODE);
+       public void testCP1() {
+               runFindPipelineTest(0.5, 5,10, 2,
+                       true, "compare", Types.ExecMode.SINGLE_NODE);
        }
 
        private void runFindPipelineTest(Double sample, int topk, int 
resources, int crossfold,
-               boolean weightedAccuracy, Types.ExecMode et) {
+               boolean weightedAccuracy, String target, Types.ExecMode et) {
 
                setOutputBuffering(true);
                String HOME = SCRIPT_DIR+"functions/pipelines/" ;
@@ -71,35 +64,11 @@ public class CleaningTest extends AutomatedTestBase {
                try {
                        loadTestConfiguration(getTestConfiguration(TEST_NAME1));
                        fullDMLScriptName = HOME + TEST_NAME1 + ".dml";
-
-                       programArgs = new String[] {"-stats", "-exec", 
"singlenode", "-args", DIRTY, META, PRIMITIVES,
-                               PARAM, String.valueOf(sample), 
String.valueOf(topk), String.valueOf(resources),
-                               String.valueOf(crossfold), 
String.valueOf(weightedAccuracy), output("O"), OUTPUT };
-
-                       runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
-
-                       //expected loss smaller than default invocation
-                       
Assert.assertTrue(TestUtils.readDMLBoolean(output("O")));
-               }
-               finally {
-                       resetExecMode(modeOld);
-               }
-       }
-
-       private void runCleanAndCompareTest( Types.ExecMode et) {
-               setOutputBuffering(true);
-               String HOME = SCRIPT_DIR+"functions/pipelines/";
-               Types.ExecMode modeOld = setExecMode(et);
-               try {
-                       loadTestConfiguration(getTestConfiguration(TEST_NAME2));
-                       fullDMLScriptName = HOME + TEST_NAME2 + ".dml";
-
-                       programArgs = new String[] {"-stats", "-exec",
-                               "singlenode", "-args", DIRTY, CLEAN, META, 
OUTPUT, output("O")};
+                       programArgs = new String[] {"-stats", "-exec", 
"singlenode", "-nvargs", "dirtyData="+DIRTY, "metaData="+META,
+                               "primitives="+PRIMITIVES, "parameters="+PARAM,  
"topk="+String.valueOf(topk), "rv="+String.valueOf(resources),
+                               "output="+OUTPUT, "target="+target, 
"cleanData="+CLEAN, "O="+output("O")};
 
                        runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
-
-                       //expected loss smaller than default invocation
                        
Assert.assertTrue(TestUtils.readDMLBoolean(output("O")));
                }
                finally {
diff --git a/src/test/scripts/functions/pipelines/compareAccuracy.dml 
b/src/test/scripts/functions/pipelines/compareAccuracy.dml
index a8aa0d8..5cd7cda 100644
--- a/src/test/scripts/functions/pipelines/compareAccuracy.dml
+++ b/src/test/scripts/functions/pipelines/compareAccuracy.dml
@@ -49,11 +49,12 @@ O = read($2, data_type="frame", format="csv", header=FALSE,
 
 metaInfo = read($3, data_type="frame", format="csv", header=FALSE);  
 input = $4
-pip = read(input+"pip.csv", data_type="frame", format="csv", header=FALSE);
-hp = read(input+"hp.csv", data_type="matrix", format="csv", header=FALSE);
+pip = read(input+"pipelines.csv", data_type="frame", format="csv", 
header=FALSE);
+hp = read(input+"hyperparams.csv", data_type="matrix", format="csv", 
header=FALSE);
 
 getSchema = metaInfo[1, 2:ncol(metaInfo)]
 getMask = as.matrix(metaInfo[2, 2:ncol(metaInfo)])
+getFdMask = as.matrix(metaInfo[3, 2:ncol(metaInfo)]) # columns of interest for 
FD computation
 
 # # 1. dropInvalid function will remove the values which are not the part 
 # # of the column data type  
@@ -80,32 +81,54 @@ eX = eX[, 1:ncol(eX) - 1]
 # strip the mask of class label
 getMask = getMask[, 1:ncol(getMask) - 1] # strip the mask of class label
 getSchema = getSchema[, 1:ncol(getSchema) - 1] # strip the mask of class label
-
+getFdMask = getFdMask[, 1:ncol(getFdMask) - 1] # strip the mask of class label
 
 # construct hyper-parameters
 ls = list();
 i = 1; k = 1
 
-# take the oversampling out from the test processing
-pip1 = as.frame("")
+FD = discoverFD(X=replace(target=eX, pattern=NaN, replacement=1), 
Mask=getFdMask, threshold=0.8)
+FD = (diag(matrix(1, rows=nrow(FD), cols=1)) ==0) * FD 
+FD = FD > 0
 # construct the parameter list for best hyper-parameters if the oversampling 
technique is part of 
 # pipeline then take it out because oversampling is not applied on test dataset
 # this condition is unnecessary here in this case because the input dataset is 
balanced and 
 # instead of diving the dataset into train/test I am doing cross validations
-while(k <= ncol(pip))
-{
-  end = as.integer(i+as.integer(as.scalar(hp[1,i])))
-  mat = hp[1, i+1:end]
-  i = end + 1
-  if(as.scalar(pip[1,k]) != "SMOTE") {
-    pip1 = cbind(pip1, pip[1,k] )
-    ls = append(ls, mat)
-  }
-  k = k + 1
-}
+
+print("hp matrix")
+no_of_param = as.scalar(hp[1, 1]) + 1
+hp_width= hp[1, 2:no_of_param]
+hp_matrix = matrix(hp_width, rows=ncol(pip), cols=ncol(hp_width)/ncol(pip))
+index = 1
+# for(i in 1:ncol(pip))
+# {
+  # no_of_param = as.scalar(hp[1, index])
+  # hp_matrix[i, 1:no_of_param] = hp[1, 2:no_of_param+1]
+  # index = index + no_of_param + 2
+# }
+
+
+
+print(toString(hp_matrix))
+
+# while(k <= ncol(pip))
+# {
+  # end = as.integer(i+as.integer(as.scalar(hp[1,i]))) 
+  # mat = hp[1, i+1:end]
+  # i = end + 1
+  # if(as.scalar(pip[1,k]) != "SMOTE") {
+    # pip1 = cbind(pip1, pip[1,k] )
+    # ls = append(ls, mat)
+  # }
+  # k = k + 1
+# }
+
+
+print("ncol in X "+ncol(eX))
+print("ncol in mask "+ncol(getMask))
 
 # # clean using best pipeline 
-[cX , cY] = executePipeline(pip1[, 2:ncol(pip1)], eX, eY, getMask, ls, 1, 
FALSE)
+[cX , cY] = executePipeline(pip[1], eX, eY, getMask, FD, hp_matrix, 5, FALSE)
 
 if(sum(getMask) > 0)
 {
diff --git a/src/test/scripts/functions/pipelines/intermediates/acc.csv 
b/src/test/scripts/functions/pipelines/intermediates/acc.csv
deleted file mode 100644
index f6b666b..0000000
--- a/src/test/scripts/functions/pipelines/intermediates/acc.csv
+++ /dev/null
@@ -1,5 +0,0 @@
-76.14285714285714
-76.0
-75.85714285714286
-75.85714285714286
-75.85714285714286
diff --git a/src/test/scripts/functions/pipelines/intermediates/hp.csv 
b/src/test/scripts/functions/pipelines/intermediates/hp.csv
deleted file mode 100644
index 385ecad..0000000
--- a/src/test/scripts/functions/pipelines/intermediates/hp.csv
+++ /dev/null
@@ -1,5 +0,0 @@
-4.0,1.0,0,0,2.0,6.0,0,0,0,0,0,0,4.0,1.0,0,0,2.0,7.0,5.0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-4.0,1.0,0,0,2.0,6.0,1.0,0,0,0,0,0,4.0,1.0,0,0,2.0,7.0,5.0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-4.0,1.0,0,0,2.0,6.0,0,1.0,0,0,0,0,4.0,1.0,0,0,2.0,7.0,2.0,1.0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-4.0,1.0,0,0,2.0,6.0,1.0,1.0,0,0,0,0,4.0,1.0,0,0,2.0,7.0,10.0,1.0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-4.0,1.0,0,0,2.0,6.0,1.0,1.0,0,0,0,0,4.0,1.0,0,0,2.0,7.0,1.0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
diff --git a/src/test/scripts/functions/pipelines/intermediates/pip.csv 
b/src/test/scripts/functions/pipelines/intermediates/pip.csv
deleted file mode 100644
index 834e793..0000000
--- a/src/test/scripts/functions/pipelines/intermediates/pip.csv
+++ /dev/null
@@ -1,5 +0,0 @@
-imputeByMean,scale,dummycoding,pca
-imputeByMedian,scale,dummycoding,pca
-imputeByMedian,scale,dummycoding,pca
-imputeByMean,scale,dummycoding,pca
-imputeByMedian,scale,dummycoding,pca
diff --git a/src/test/scripts/functions/pipelines/mainScript.dml 
b/src/test/scripts/functions/pipelines/mainScript.dml
index 3999e94..5422ae6 100644
--- a/src/test/scripts/functions/pipelines/mainScript.dml
+++ b/src/test/scripts/functions/pipelines/mainScript.dml
@@ -25,16 +25,20 @@ source("scripts/pipelines/scripts/logicalFunc.dml") as 
logical;
 source("scripts/pipelines/scripts/gridsearchMLR.dml") as gs;
 
 # read the inputs
-F = read($1, data_type="frame", format="csv", header=FALSE, 
+F = read($dirtyData, data_type="frame", format="csv", header=TRUE, 
+  naStrings= ["NA", "null","  ","NaN", "nan", "", "?", "99999"]);
+
+metaInfo = read($metaData, data_type="frame", format="csv", header=FALSE);
+primitives = read($primitives, data_type = "frame", format="csv", header= TRUE)
+param = read($parameters, data_type = "frame", format="csv", header= TRUE)
+sample = $sampleSize
+topK = $topk
+resources = $rv
+crossValidations = $cv
+weightedAccuracy = $weighted # accuracy flag
+targetApplicaton = $target # target application type (e.g. classification/compare)
+cleanData = read($cleanData, data_type="frame", format="csv", header=TRUE, 
   naStrings= ["NA", "null","  ","NaN", "nan", "", "?", "99999"]);
-metaInfo = read($2, data_type="frame", format="csv", header=FALSE);
-primitives = read($3, data_type = "frame", format="csv", header= TRUE)
-param = read($4, data_type = "frame", format="csv", header= TRUE)
-sample = $5
-topK = $6
-resources = $7
-crossValidations = $8
-weightedAccuracy = $9 # accuracy flag
 
 
 if(nrow(metaInfo) < 2)
@@ -45,12 +49,13 @@ if(nrow(metaInfo) < 2)
  
 getSchema = metaInfo[1, 2:ncol(metaInfo)]
 getMask = as.matrix(metaInfo[2, 2:ncol(metaInfo)])
+getFdMask = as.matrix(metaInfo[3, 2:ncol(metaInfo)]) # columns of interest for 
FD computation
   
 # 1. dropInvalid function will remove the values which are not the part 
 # of the column data type  
 
-X = dropInvalidType(F, getSchema)
-  # X = F
+# X = dropInvalidType(F, getSchema)
+  X = F
 
 # 2. encode the categorical data
 if(sum(getMask) > 0)
@@ -58,60 +63,108 @@ if(sum(getMask) > 0)
   # always recode the label
   index = utils::vectorToCsv(getMask)
   jspecR = "{ids:true, recode:["+index+"]}"
-  [eX, X_meta] = transformencode(target=X, spec=jspecR);
+  if(targetApplicaton == "compare") {
+    [eX, X_meta] = transformencode(target=rbind(cleanData, X), spec=jspecR);
+    cleanX = eX[1:nrow(cleanData)]
+    eX = eX[nrow(cleanData)+1:nrow(eX)]  
+  }
+  else 
+    [eX, X_meta] = transformencode(target=X, spec=jspecR);
   # change the schema to reflect the encoded values
   getSchema = map(getSchema, "x->x.replace(\"STRING\", \"INT64\")")
   getSchema = map(getSchema, "x->x.replace(\"BOOLEAN\", \"INT64\")")
-    
+
+
 } 
 # if no categorical value exist then just cast the frame into matrix
 else
   eX = as.matrix(X)
+  
+  
 
 # 3. extract the class label  
-eY = eX[, ncol(eX)]
-eX = eX[, 1:ncol(eX) - 1]
-   
-
-getMask = getMask[, 1:ncol(getMask) - 1] # strip the mask of class label
-getSchema = getSchema[, 1:ncol(getSchema) - 1] # strip the mask of class label
-
-
+if(targetApplicaton == "classification")
+{
+  eY = eX[, ncol(eX)]
+  eX = eX[, 1:ncol(eX) - 1]
 
+  getMask = getMask[, 1:ncol(getMask) - 1] # strip the mask of class label
+  getFdMask = getFdMask[, 1:ncol(getFdMask) - 1] # strip the mask of class 
label
+  getSchema = getSchema[, 1:ncol(getSchema) - 1] # strip the mask of class 
label
+}
+   
 # get the logical seed
-lgSeed = logical::generateLogicalSeed(eX, eY, getMask)
+if(targetApplicaton == "compare")
+  lgSeed = logical::generateLogicalSeed(eX, as.matrix(0), getMask, 
targetApplicaton)
+else
+  lgSeed = logical::generateLogicalSeed(eX, eY, getMask, targetApplicaton)
 allLgs = logical::transformLogical(lgSeed)
 
 
-# 4. perform the sampling  
-[eX, eY] = utils::doSample(eX, eY, sample)
-
-# 5. get train test and validation set with balanced class distribution
-[X_train, y_train, X_test, y_test] = splitBalanced(eX, eY, 0.7, FALSE)
-
-# 6. find the best hyper parameters for classification algorithm
-# for now only find the best values for intercept and maximum outer iteration
-params = list("reg", "maxi");
-paramRanges = list(10^seq(0,-10), seq(10,100, 10));
-
- 
-dX_train = utils::dummycoding(replace(target = rbind(X_train, X_test), pattern 
= NaN, replacement=0), getMask)
-dX_test = dX_train[nrow(y_train)+1:nrow(dX_train),] 
-dX_train = dX_train[1:nrow(y_train),] 
-
-# [opt, loss] = gs::gridSearchMLR(dX_train, y_train, dX_test, y_test, 
-  # "multiLogReg", "lossFunc", params, paramRanges, FALSE);
-opt = matrix("0 100", 1, 2)
-
-# 7. get the cross validated accuracy on dirty dataset (only on training set)
-d_accuracy = classifyDirty(X_train, y_train, opt, getMask, weightedAccuracy, 
crossValidations)
-# print("dirty accuracy is "+d_accuracy)
-# # [eX, eY] = prioritise(eX, eY, getMask)
-
+d_accuracy = 0
+# 4. perform the sampling
+if(targetApplicaton != "compare") {
+  [eX, eY] = utils::doSample(eX, eY, sample)
+
+  # 5. get train test and validation set with balanced class distribution
+  [X_train, y_train, X_test, y_test] = splitBalanced(eX, eY, 0.7, FALSE)
+
+  # 6. find the best hyper parameters for classification algorithm
+  # for now only find the best values for intercept and maximum outer iteration
+  params = list("reg", "maxi");
+  paramRanges = list(10^seq(0,-10), seq(10,100, 10));
+
+  # if(sum(getMask) > 0)
+  # {
+    # dX_train = utils::dummycoding(replace(target = rbind(X_train, X_test), 
pattern = NaN, replacement=0), getMask)
+    # dX_test = dX_train[nrow(y_train)+1:nrow(dX_train),] 
+    # dX_train = dX_train[1:nrow(y_train),] 
+    # [opt, loss] = gs::gridSearchMLR(dX_train, y_train, dX_test, y_test, 
+    # "multiLogReg", "lossFunc", params, paramRanges, FALSE);
+#   }
+  # else  
+    # [opt, loss] = gs::gridSearchMLR(X_train, y_train, X_test, y_test, 
+      # "multiLogReg", "lossFunc", params, paramRanges, FALSE);
+    opt = matrix("0 100", 1, 2)
+
+  # 7. get the cross validated accuracy on dirty dataset (only on training set)
+  d_accuracy = classifyDirty(X_train, y_train, opt, getMask, weightedAccuracy, 
crossValidations)
+  # print("dirty accuracy is "+d_accuracy)
+  # # [eX, eY] = prioritise(eX, eY, getMask)
+} 
+FD = discoverFD(X=replace(target=eX, pattern=NaN, replacement=1), 
Mask=getFdMask, threshold=0.8)
+FD = (diag(matrix(1, rows=nrow(FD), cols=1)) ==0) * FD 
+FD = FD > 0
+
+logical1 =  frame(["4", "MVI", "SCALE", "DUMMY", "DIM", "0", "0", "0"], 
rows=1, cols=8)
+logical2 =  frame(["2", "MVI", "DUMMY", "0", "0", "0", "0", "0"], rows=1, 
cols=8)
+logical3 =  frame(["3", "MVI", "SCALE", "DUMMY", "0", "0", "0", "0"], rows=1, 
cols=8)
+logical4 =  frame(["7", "MVI", "OTLR", "CI", "SCALE", "DUMMY", "DIM", "0"], 
rows=1, cols=8)
+logical5 = frame(["7", "MVI", "OTLR", "MVI", "CI", "SCALE", "DUMMY", "DIM"], 
rows=1, cols=8)
+logical6 = frame(["6", "OTLR", "MVI", "CI", "SCALE", "DUMMY", "DIM", "0"], 
rows=1, cols=8)
+
+log = rbind(logical1, logical2)
+log = rbind(log, logical3)
+log = rbind(log, logical4)
+log = rbind(log, logical5)
+log = rbind(log, logical6)
+print("logical permutations "+toString(log))
+
+metaList = list(mask=getMask, schema=getSchema, fd=FD)
+targetClassification = list(target=targetApplicaton, cv=crossValidations, 
wAccuracy=weightedAccuracy, 
+  dirtyAcc = d_accuracy, mlHp = opt, cleanData = as.matrix(0))
+
+
+# val = compareValue(replace(target=eX, pattern=NaN, replacement=0), getMask)
+parfor(i in 1:nrow(log))
+{
+  lv = as.integer(as.scalar(log[i, 1])) + 1
+  [pip, hp, acc, features] = bandit(X_train=eX, Y_train=eY,  
metaList=metaList, targetList=targetClassification, lp=log[i, 2:lv],
+    primitives=primitives, param=param, k=topK, R=resources, verbose=TRUE);
+}
 
-[pip, hp, acc] = bandit(X_train=X_train, Y_train=y_train,  mask=getMask, 
MLhp=opt,
-  schema=getSchema, lp=allLgs, primitives=primitives, param=param, k=topK, 
testAccuracy=d_accuracy,
-  isWeighted=weightedAccuracy, R=resources, cv=crossValidations, verbose=TRUE);
+output = $output
+write(features, output+"/features.csv", format="csv")
 
 
 if(as.scalar((is.na(acc[1,1]))) == 1 | as.scalar(acc[1,1]) < d_accuracy)
@@ -132,35 +185,29 @@ print(toString(hp))
 print("best accuracy")
 print(toString(acc))
 
-
-clean_accuracy = testBestPipeline(pip=pip[1,], hp=hp[1,], X_train=X_train, 
y_train=y_train,
-  X_test=X_test, y_test=y_test, cmask=getMask, MLhp=opt, 
valAcc=as.scalar(acc[1,1]), dirAcc=d_accuracy,
-  isWeighted=weightedAccuracy)
-
-
-# # # dataPath = $10
-# # # # write the preprocessing
-# # # trainset = cbind(X_train, y_train)
-# # # testset = cbind(X_test, y_test)
-# # # write(trainset, dataPath+"/train.csv" , format="csv", sep=",")
-# # # write(testset, dataPath+"/test.csv", format="csv", sep=",")
-# # # write(opt, dataPath+"/mlHp.csv", format="csv", sep=",")
-# # # write(allLgs, $11, format="csv")
+# if(targetApplicaton != "compare")
+  # clean_accuracy = testBestPipeline(pip=pip[1,], hp=hp[1,], X_train=X_train, 
y_train=y_train,
+    # X_test=X_test, y_test=y_test, cmask=getMask, FD=FD, MLhp=opt, 
valAcc=as.scalar(acc[1,1]), dirAcc=d_accuracy,
+    # isWeighted=weightedAccuracy)
+# else 
+clean_accuracy = as.scalar(acc[1,1])
 
 
 result = d_accuracy < clean_accuracy  
-print("reult satisfied ------------"+result)
-write(result, $10, format="text")
+print("result satisfied ------------"+result)
 
+accuracies = cbind(as.matrix(d_accuracy), as.matrix(clean_accuracy))
 
-output = $11
-if(result) {
-  write(pip, output+"pip.csv", format="csv")
-  write(hp, output+"hp.csv", format="csv")
-  write(acc, output+"acc.csv", format="csv")
-}
+
+tmp_hp = cbind(matrix(NaN, nrow(hp), 1), hp)
+writeResult = cbind(pip, as.frame(tmp_hp))
+writeResult = cbind(writeResult , as.frame(acc))
 
 
+write(pip, output+"/pipelines.csv", format="csv")
+write(hp, output+"/hyperparams.csv", format="csv")
+write(acc, output+"/accuracies.csv", format="csv")
+write(accuracies , output+"/BestAccuracy.csv", format="csv")
 
 
 
@@ -178,92 +225,94 @@ classifyDirty = function(Matrix[Double] Xtrain, 
Matrix[Double] ytrain, Matrix[Do
 {
   # # classify without cleaning fill with default values 1
   Xtrain = replace(target = Xtrain, pattern = NaN, replacement=0)
-  dX_train = utils::dummycoding(Xtrain, mask)
+  if(sum(mask) > 0)
+    Xtrain = utils::dummycoding(Xtrain, mask)
   # print("rows in data ")
   # print(nrow(dX_train))
   # print("column in data")
   # print(ncol(dX_train))
-  accuracy = crossV(dX_train, ytrain, cv, mask, opt, isWeighted)
+  accuracy = crossV(Xtrain, ytrain, cv, mask, opt, isWeighted)
   accuracy = mean(accuracy)
   print("cross validated dirty accuracy "+accuracy)
 }
 
 
-######################################################################
-# # Function for cross validation using hold out method
-# # Inputs: The input dataset X, Y and the value of k validation, mask of the 
-# # dataset for OHE of categorical columns, vector of ML hyper-parameters 
identified 
-# # via grid-search and a boolean value of (un)weighted accuracy.
-# # Output: It return a matrix having the accuracy of each fold.
-######################################################################
-
-crossV = function(Matrix[double] X, Matrix[double] y, Integer k, 
Matrix[Double] mask,
-  Matrix[Double] MLhp, Boolean isWeighted) 
-return (Matrix[Double] accuracyMatrix)
-{
-
-  accuracyMatrix = matrix(0, k, 1)
-
-  dataList = list()
-  testL = list()
-  data = order(target = cbind(y, X),  by = 1, decreasing=FALSE, 
index.return=FALSE)
-  classes = table(data[, 1], 1)
-  ins_per_fold = classes/k
-  start_fold = matrix(1, rows=nrow(ins_per_fold), cols=1)
-  fold_idxes = cbind(start_fold, ins_per_fold)
-
-  start_i = 0; end_i = 0; idx_fold = 1;;
-  for(i in 1:k)
-  {
-    fold_i = matrix(0, 0, ncol(data))
-    start=0; end=0; 
-    for(j in 1:nrow(classes))
-    {
-      idx = as.scalar(classes[j, 1])
-      start = end + 1;
-      end = end + idx
-      class_j =  data[start:end, ]
 
 
-      start_i = as.scalar(fold_idxes[j, 1]);
-      end_i = as.scalar(fold_idxes[j, 2])
+lossFunc = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) 
+return (Matrix[Double] loss) {
+  [prob, yhat, acc] = multiLogRegPredict(X=X, B=B, Y=y,  verbose=FALSE)
+  loss = as.matrix(1 - (acc/100))
+  # [confusionCount_c, confusionAVG_c] = confusionMatrix(P=yhat, Y=y)
+}
 
-      fold_i = rbind(fold_i, class_j[start_i:end_i, ])
-    }
 
-    dataList = append(dataList, fold_i)
-    fold_idxes[, 1] = fold_idxes[, 2] + 1
-    fold_idxes[, 2] += ins_per_fold
-    while(FALSE){}
-  }
+# testBestPipeline = function(Frame[Unknown] pip, Matrix[Double] hp, 
Matrix[Double] X_train, Matrix[Double] y_train, 
+  # Matrix[Double] X_test, Matrix[Double] y_test, Matrix[Double] cmask, 
Matrix[Double] FD, Matrix[Double] MLhp, 
+  # Double valAcc, Double dirAcc, Boolean isWeighted)
+  # return (Double result) {
+  # print("hp "+toString(hp))
+  # lsTrain = list();
+  # lsTest = list();
+  # i = 1; k = 1
+  # trRow=nrow(X_train)
+  # # take the oversampling out from the test processing
+  # pip1 = as.frame("")
+  # # construct the parameter list for best hyper-parameters
+  # while(k <= ncol(pip))
+  # {
+    # end = as.integer(i+as.integer(as.scalar(hp[1,i])))
+    # mat = hp[1, i+1:end]
+    # i = end + 1
+    # lsTrain = append(lsTrain, mat)
+    # if(as.scalar(pip[1,k]) != "SMOTE") {
+      # pip1 = cbind(pip1, pip[1,k] )
+      # lsTest = append(lsTest, mat)
+    # }
+    # k = k + 1
+  # }
+
+  # # clean using best pipeline and train model
+  # [X_train, y_train] = executePipeline(pip, X_train, y_train, cmask, FD, 
lsTrain, 1, FALSE)
+  # if(ncol(pip1) > 1)
+    # [X_test, y_test] = executePipeline(pip1[, 2:ncol(pip1)], X_test, y_test, 
cmask, FD, lsTest, 1, FALSE)
+  # # X_train_clean = X_train[1:trRow, ]
+  # # y_train_clean = Y_train[1:trRow, ]
+  # # X_test_clean = X_train[trRow+1:nrow(X_train), ]
+  # # y_test_clean = Y_train[trRow+1:nrow(X_train), ]
+
+  # # classify after cleaning  
+  # betas = multiLogReg(X=X_train, Y=y_train, icpt=1,
+    # reg=as.scalar(MLhp[1,1]), tol= 1e-9, maxi=as.scalar(MLhp[1,2]), 
+    # maxii= 50, verbose=FALSE);
+    
+  # [c_prob, c_yhat, c_accuracy] = multiLogRegPredict(X_test, betas, y_test, 
FALSE)
+  # c_accuracy = getAccuracy(y_test, c_yhat, isWeighted)
+  # [confusionCount_c, confusionAVG_c] = confusionMatrix(P=c_yhat, Y=y_test)
+  
+  
+  # print("Actual Records \n"+toString(cbind(X_test, y_test)))
+  # # print("Clean Records \n"+toString(cbind(X_test, y_test)))
+  # print("predictions Records \n"+toString(cbind(X_test, c_yhat)))
+  # print("accuracy of dirty data  "+dirAcc)
+  # print("accuracy of val data  "+valAcc)
+  # print("accuracy of test accuracy "+c_accuracy)
+  # print("clean confusion matrix  \n"+toString(confusionCount_c))
+  
+  # result = c_accuracy
+# }
 
-  for(i in seq(1,k))
-  {
-    [trainList, hold_out] = remove(dataList, i)
-    trainset = rbind(trainList)
-    testset = as.matrix(hold_out)
-    trainX = trainset[, 2:ncol(trainset)]
-    trainy = trainset[, 1]
-    testX = testset[, 2:ncol(testset)]
-    testy = testset[, 1]
-    beta = multiLogReg(X=trainX, Y=trainy, icpt=1, reg=as.scalar(MLhp[1,1]), 
tol= 1e-9, 
-    maxi=as.scalar(MLhp[1,2]), maxii= 50, verbose=FALSE);
-    [prob, yhat, a] = multiLogRegPredict(testX, beta, testy, FALSE)
-    accuracy = getAccuracy(testy, yhat, isWeighted)
-    accuracyMatrix[i] = accuracy
-  }
 
-}
 
-######################################################################
-# # Function for cross validation using hold out method
-# # Inputs: The input dataset X, Y and the value of k validation, mask of the 
-# # dataset for OHE of categorical columns, vector of ML hyper-parameters 
identified 
-# # via grid-search and a boolean value of (un)weighted accuracy.
-# # Output: It return a matrix having the accuracy of each fold.
-######################################################################
+# # ######################################################################
+# # # # Function for cross validation using hold out method
+# # # # Inputs: The input dataset X, Y and the value of k validation, mask of 
the 
+# # # # dataset for OHE of categorical columns, vector of ML hyper-parameters 
identified 
+# # # # via gridsearch and a boolean value of (un)weighted accuracy.
+# # # # Output: It return a matrix having the accuracy of each fold.
+# # ######################################################################
 
-compare = function(Matrix[double] X, Matrix[double] y, Integer k, 
Matrix[Double] mask,
+crossV = function(Matrix[double] X, Matrix[double] y, Integer k, 
Matrix[Double] mask,
   Matrix[Double] MLhp, Boolean isWeighted) 
 return (Matrix[Double] accuracyMatrix)
 {
@@ -290,7 +339,6 @@ return (Matrix[Double] accuracyMatrix)
       end = end + idx
       class_j =  data[start:end, ]
 
-
       start_i = as.scalar(fold_idxes[j, 1]);
       end_i = as.scalar(fold_idxes[j, 2])
 
@@ -318,68 +366,6 @@ return (Matrix[Double] accuracyMatrix)
     accuracy = getAccuracy(testy, yhat, isWeighted)
     accuracyMatrix[i] = accuracy
   }
-
 }
 
-lossFunc = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) 
-return (Matrix[Double] loss) {
-  [prob, yhat, acc] = multiLogRegPredict(X=X, B=B, Y=y,  verbose=FALSE)
-  loss = as.matrix(1 - (acc/100))
-  # [confusionCount_c, confusionAVG_c] = confusionMatrix(P=yhat, Y=y)
-}
-
-
-
-testBestPipeline = function(Frame[Unknown] pip, Matrix[Double] hp, 
Matrix[Double] X_train, Matrix[Double] y_train, 
-  Matrix[Double] X_test, Matrix[Double] y_test, Matrix[Double] cmask, 
Matrix[Double] MLhp, 
-  Double valAcc, Double dirAcc, Boolean isWeighted)
-  return (Double result) {
-  print("hp "+toString(hp))
-  ls = list();
-  i = 1; k = 1
-  trRow=nrow(X_train)
-  # take the oversampling out from the test processing
-  pip1 = as.frame("")
-  # construct the parameter list for best hyper-parameters
-  while(k <= ncol(pip))
-  {
-    end = as.integer(i+as.integer(as.scalar(hp[1,i])))
-    mat = hp[1, i+1:end]
-    i = end + 1
-    if(as.scalar(pip[1,k]) != "SMOTE") {
-      pip1 = cbind(pip1, pip[1,k] )
-      ls = append(ls, mat)
-    }
-    k = k + 1
-  }
-
-  # clean using best pipeline and train model
-  [X_train, Y_train] = executePipeline(pip1[, 2:ncol(pip1)], 
rbind(X_train,X_test), rbind(y_train,y_test), cmask, ls, 1, FALSE)
-  X_train_clean = X_train[1:trRow, ]
-  y_train_clean = Y_train[1:trRow, ]
-  X_test_clean = X_train[trRow+1:nrow(X_train), ]
-  y_test_clean = Y_train[trRow+1:nrow(X_train), ]
-
-  # classify after cleaning  
-  betas = multiLogReg(X=X_train_clean, Y=y_train_clean, icpt=1,
-    reg=as.scalar(MLhp[1,1]), tol= 1e-9, maxi=as.scalar(MLhp[1,2]), 
-    maxii= 50, verbose=FALSE);
-    
-  [c_prob, c_yhat, c_accuracy] = multiLogRegPredict(X_test_clean, betas, 
y_test_clean, FALSE)
-  c_accuracy = getAccuracy(y_test_clean, c_yhat, isWeighted)
-  [confusionCount_c, confusionAVG_c] = confusionMatrix(P=c_yhat, 
Y=y_test_clean)
-  
-  
-  print("Actual Records \n"+toString(cbind(X_test, y_test)))
-  print("Clean Records \n"+toString(cbind(X_test_clean, y_test_clean)))
-  print("predictions Records \n"+toString(cbind(X_test_clean, c_yhat)))
-  print("accuracy of dirty data  "+dirAcc)
-  print("accuracy of val data  "+valAcc)
-  print("accuracy of test accuracy "+c_accuracy)
-  print("clean confusion matrix  \n"+toString(confusionCount_c))
-  
-  result = c_accuracy
-}
-
-
 
diff --git a/src/test/scripts/functions/pipelines/meta/meta_census.csv 
b/src/test/scripts/functions/pipelines/meta/meta_census.csv
index 427abbc..8ffe862 100644
--- a/src/test/scripts/functions/pipelines/meta/meta_census.csv
+++ b/src/test/scripts/functions/pipelines/meta/meta_census.csv
@@ -1,3 +1,3 @@
 
Scehma,FP32,STRING,INT32,INT32,STRING,FP32,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,FP32,FP32,FP32,STRING,STRING,STRING,STRING,STRING,FP32,STRING,STRING,STRING,STRING,STRING,FP32,STRING,STRING,STRING,STRING,STRING,INT32,STRING,INT32,FP32,FP32,STRING
 
mask,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,0,1
-,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42
+FD,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,0,1
diff --git a/src/test/scripts/functions/pipelines/testClassification.dml 
b/src/test/scripts/functions/pipelines/testClassification.dml
new file mode 100644
index 0000000..93f90ed
--- /dev/null
+++ b/src/test/scripts/functions/pipelines/testClassification.dml
@@ -0,0 +1,203 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# Generate the logical pipelines for data cleaning
+
+source("scripts/pipelines/scripts/utils.dml") as utils;
+source("scripts/pipelines/scripts/logicalFunc.dml") as logical;
+source("scripts/pipelines/scripts/gridsearchMLR.dml") as gs;
+
+# read the inputs
+F = read($dirtyData, data_type="frame", format="csv", header=TRUE, 
+  naStrings= ["NA", "null","  ","NaN", "nan", "", "?", "99999"]);
+
+metaInfo = read($metaData, data_type="frame", format="csv", header=FALSE);
+primitives = read($primitives, data_type = "frame", format="csv", header= TRUE)
+param = read($parameters, data_type = "frame", format="csv", header= TRUE)
+sample = $sampleSize
+topK = $topk
+resources = $rv
+crossValidations = $cv
+weightedAccuracy = $weighted # accuracy flag
+targetApplicaton = $target # target application type (e.g. classification/compare)
+
+
+
+if(nrow(metaInfo) < 2)
+  stop("incomplete meta info")
+
+ # Do the initial cleaning
+ 
+ 
+getSchema = metaInfo[1, 2:ncol(metaInfo)]
+getMask = as.matrix(metaInfo[2, 2:ncol(metaInfo)])
+getFdMask = as.matrix(metaInfo[3, 2:ncol(metaInfo)]) # columns of interest for 
FD computation
+  
+# 1. dropInvalid function will remove the values which are not the part 
+# of the column data type  
+
+X = dropInvalidType(F, getSchema)
+
+# 2. encode the categorical data
+if(sum(getMask) > 0)
+{
+  # always recode the label
+  index = utils::vectorToCsv(getMask)
+  jspecR = "{ids:true, recode:["+index+"]}"
+  [eX, X_meta] = transformencode(target=X, spec=jspecR);
+  # change the schema to reflect the encoded values
+  getSchema = map(getSchema, "x->x.replace(\"STRING\", \"INT64\")")
+  getSchema = map(getSchema, "x->x.replace(\"BOOLEAN\", \"INT64\")")
+
+} 
+# if no categorical value exist then just cast the frame into matrix
+else
+  eX = as.matrix(X)
+  
+# 3. extract the class label  
+eY = eX[, ncol(eX)]
+eX = eX[, 1:ncol(eX) - 1]
+
+getMask = getMask[, 1:ncol(getMask) - 1] # strip the mask of class label
+getFdMask = getFdMask[, 1:ncol(getFdMask) - 1] # strip the mask of class label
+getSchema = getSchema[, 1:ncol(getSchema) - 1] # strip the mask of class label
+
+
+# get the logical seed
+lgSeed = logical::generateLogicalSeed(eX, eY, getMask, targetApplicaton)
+allLgs = logical::transformLogical(lgSeed)
+
+
+d_accuracy = 0
+# 4. perform the sampling
+
+[eX, eY] = utils::doSample(eX, eY, sample)
+
+# 5. get train test and validation set with balanced class distribution
+# [X_train, y_train, X_test, y_test] = splitBalanced(X=eX, Y=eY, 
splitRatio=0.7, verbose=FALSE)
+X_train = eX
+y_train = eY
+# 6. find the best hyper parameters for classification algorithm
+# for now only find the best values for intercept and maximum outer iteration
+params = list("reg", "maxi");
+paramRanges = list(10^seq(0,-10), seq(10,100, 10));
+# if(sum(getMask) > 0)
+# {
+  # dX_train = utils::dummycoding(replace(target = rbind(X_train, X_test), 
pattern = NaN, replacement=0), getMask)
+  # dX_test = dX_train[nrow(y_train)+1:nrow(dX_train),] 
+  # dX_train = dX_train[1:nrow(y_train),] 
+  # [opt, loss] = gs::gridSearchMLR(dX_train, y_train, dX_test, y_test, 
+  # "multiLogReg", "lossFunc", params, paramRanges, FALSE);
+#  }
+# else  
+  # [opt, loss] = gs::gridSearchMLR(X_train, y_train, X_test, y_test, 
+    # "multiLogReg", "lossFunc", params, paramRanges, FALSE);
+# hardcoded hyper-params for multilogReg
+opt = matrix("0 100", 1, 2)
+
+# 7. get the cross validated accuracy on dirty dataset (only on training set)
+d_accuracy = classifyDirty(X_train, y_train, opt, getMask, weightedAccuracy, 
crossValidations)
+# print("dirty accuracy is "+d_accuracy)
+ # [eX, eY] = prioritise(eX, eY, getMask)
+ 
+FD = discoverFD(X=replace(target=eX, pattern=NaN, replacement=1), 
Mask=getFdMask, threshold=0.8)
+FD = (diag(matrix(1, rows=nrow(FD), cols=1)) ==0) * FD 
+FD = FD > 0
+
+metaList = list(mask=getMask, schema=getSchema, fd=FD)
+targetClassification = list(target=targetApplicaton, cv=crossValidations, 
wAccuracy=weightedAccuracy, 
+  dirAcc = d_accuracy, mlHp = opt, cleanData = as.matrix(0))
+
+# # initialize output variables
+pip = as.frame("NULL"); hp = matrix(0,0,0); acc = matrix(0,0,0); features = 
as.frame("NULL")
+
+[pip, hp, acc, features] = bandit(X_train=eX, Y_train=eY,  metaList=metaList, 
targetList=targetClassification, lp=allLgs[1],
+  primitives=primitives, param=param, k=topK, R=resources, verbose=TRUE);
+
+output = $output
+write(features, output+"/features.csv", format="csv")
+
+
+if(as.scalar((is.na(acc[1,1]))) == 1 | as.scalar(acc[1,1]) < d_accuracy)
+  stop("warning: no best pipeline found")
+  
+
+print("best pipelines")
+print(toString(pip))
+
+print("best hyperparam")
+print(toString(hp))
+
+print("best accuracy")
+print(toString(acc))
+
+
+clean_accuracy = as.scalar(acc[1,1])
+
+
+result = d_accuracy < clean_accuracy  
+print("result satisfied ------------"+result)
+
+accuracies = cbind(as.matrix(d_accuracy), as.matrix(clean_accuracy))
+
+
+write(pip, output+"/pipelines.csv", format="csv")
+write(hp, output+"/hyperparams.csv", format="csv")
+write(acc, output+"/accuracies.csv", format="csv")
+write(accuracies , output+"/BestAccuracy.csv", format="csv")
+write(result , $O)
+
+
+
+
+####################################################################
+# Function for classifying the dirty dataset, makes a call to crossV()
+# Inputs: takes the input dataset X, Y and the value of k validation, mask of 
the 
+# dataset for OHE of categorical columns, vector of ML hyper-parameters 
identified 
+# via grid-search and a boolean value of (un)weighted accuracy.
+# Output: It return a matrix having the accuracy of each fold.
+####################################################################
+classifyDirty = function(Matrix[Double] Xtrain, Matrix[Double] ytrain, 
Matrix[Double] opt, 
+  Matrix[Double] mask, Boolean isWeighted = TRUE, Integer cv)
+  return (Double accuracy)
+{
+  # # classify without cleaning fill with default values 1
+  Xtrain = replace(target = Xtrain, pattern = NaN, replacement=0)
+  if(sum(mask) > 0)
+    Xtrain = utils::dummycoding(Xtrain, mask)
+  # print("rows in data ")
+  # print(nrow(dX_train))
+  # print("column in data")
+  # print(ncol(dX_train))
+  accuracy = crossV(Xtrain, ytrain, cv, mask, opt, isWeighted)
+  accuracy = mean(accuracy)
+  print("cross validated dirty accuracy "+accuracy)
+}
+
+
+
+
+lossFunc = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) 
+return (Matrix[Double] loss) {
+  [prob, yhat, acc] = multiLogRegPredict(X=X, B=B, Y=y,  verbose=FALSE)
+  loss = as.matrix(1 - (acc/100))
+  # [confusionCount_c, confusionAVG_c] = confusionMatrix(P=yhat, Y=y)
+}
+
diff --git a/src/test/scripts/functions/pipelines/testCompare.dml 
b/src/test/scripts/functions/pipelines/testCompare.dml
new file mode 100644
index 0000000..df110e2
--- /dev/null
+++ b/src/test/scripts/functions/pipelines/testCompare.dml
@@ -0,0 +1,138 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# Generate the logical pipelines for data cleaning
+
+source("scripts/pipelines/scripts/utils.dml") as utils;
+source("scripts/pipelines/scripts/logicalFunc.dml") as logical;
+
+# read the inputs: the dirty data, its meta data (schema/mask/FD mask),
+# the pipeline primitives and their parameter space, and the ground-truth
+# clean data used as the comparison target
+F = read($dirtyData, data_type="frame", format="csv", header=TRUE, 
+  naStrings= ["NA", "null","  ","NaN", "nan", "", "?", "99999"]);
+
+metaInfo = read($metaData, data_type="frame", format="csv", header=FALSE);
+primitives = read($primitives, data_type = "frame", format="csv", header= TRUE)
+param = read($parameters, data_type = "frame", format="csv", header= TRUE)
+topK = $topk # top-k pipelines to return from the bandit search
+resources = $rv # resource budget R for the bandit search
+targetApplicaton = $target # accuracy flag
+cleanData = read($cleanData, data_type="frame", format="csv", header=TRUE, 
+  naStrings= ["NA", "null","  ","NaN", "nan", "", "?", "99999"]);
+
+
+  
+# take the sample of 500 rows to avoid java heap issue
+
+F = F[1:500,]
+cleanData = cleanData[1:500,]
+  
+# meta data must provide at least the schema and mask rows
+# NOTE(review): row 3 of metaInfo is read below, so this guard should
+# probably be nrow(metaInfo) < 3 — confirm
+if(nrow(metaInfo) < 2)
+  stop("incomplete meta info")
+
+ # Do the initial cleaning
+ 
+ 
+# row 1: per-column value types; row 2: categorical-column mask;
+# row 3: mask of columns considered for functional-dependency discovery
+getSchema = metaInfo[1, 2:ncol(metaInfo)]
+getMask = as.matrix(metaInfo[2, 2:ncol(metaInfo)])
+getFdMask = as.matrix(metaInfo[3, 2:ncol(metaInfo)]) # columns of interest for FD computation
+  
+# 1. dropInvalid function will remove the values which are not the part 
+# of the column data type  
+
+X = dropInvalidType(F, getSchema)
+
+
+# 2. encode the categorical data
+if(sum(getMask) > 0)
+{
+  # always recode the label
+  index = utils::vectorToCsv(getMask)
+  jspecR = "{ids:true, recode:["+index+"]}"
+
+  # encode clean and dirty data together so both share a single recode
+  # map, then split the encoded matrix back into its two parts
+  [eX, X_meta] = transformencode(target=rbind(cleanData, X), spec=jspecR);
+  cleanX = eX[1:nrow(cleanData)]
+  eX = eX[nrow(cleanData)+1:nrow(eX)]  
+
+  # change the schema to reflect the encoded values
+  getSchema = map(getSchema, "x->x.replace(\"STRING\", \"INT64\")")
+  getSchema = map(getSchema, "x->x.replace(\"BOOLEAN\", \"INT64\")")
+
+
+} 
+# if no categorical value exist then just cast the frame into matrix
+# NOTE(review): cleanX is only assigned in the if-branch above but is
+# used later in targetClassification — verify this branch is reachable
+else
+  eX = as.matrix(X)
+  
+
+# get the logical seed
+lgSeed = logical::generateLogicalSeed(eX, as.matrix(0), getMask, 
targetApplicaton)
+allLgs = logical::transformLogical(lgSeed)
+
+
+
+# discover approximate functional dependencies (threshold 0.8) on the
+# NaN-imputed data, zero the diagonal (self-dependencies), and binarize
+FD = discoverFD(X=replace(target=eX, pattern=NaN, replacement=1), 
Mask=getFdMask, threshold=0.8)
+FD = (diag(matrix(1, rows=nrow(FD), cols=1)) ==0) * FD 
+FD = FD > 0
+
+# minimum accuracy the best pipeline must reach for the test to pass
+expectedAccuracy = 0.6
+
+# bundle meta data (mask/schema/FDs) and target-application settings
+metaList = list(mask=getMask, schema=getSchema, fd=FD)
+targetClassification = list(target=targetApplicaton, cv=0, wAccuracy=FALSE, 
+  dirAcc = expectedAccuracy,  mlHp = as.matrix(0), cleanData = cleanX)
+
+
+# # initialize output variables
+pip = as.frame("NULL"); hp = matrix(0,0,0); acc = matrix(0,0,0); features = 
as.frame("NULL")
+
+# run the bandit search for the top-k cleaning pipelines
+[pip, hp, acc, features] = bandit(X_train=eX, Y_train=as.matrix(0),  
metaList=metaList, targetList=targetClassification, 
+  lp=allLgs, primitives=primitives, param=param, k=topK, R=resources, 
verbose=TRUE);
+
+
+output = $output
+write(features, output+"/features.csv", format="csv")
+
+
+# fail if no pipeline was found or the best one is below the threshold
+if(as.scalar((is.na(acc[1,1]))) == 1 | as.scalar(acc[1,1]) < expectedAccuracy)
+  stop("warning: no best pipeline found")
+  
+  
+print("best pipelines")
+print(toString(pip))
+
+print("best hyperparam")
+print(toString(hp))
+
+print("best accuracy")
+print(toString(acc))
+
+clean_accuracy = as.scalar(acc[1,1])
+
+
+result = expectedAccuracy <= clean_accuracy  
+print("result satisfied ------------"+result)
+
+# [expected, achieved] accuracy pair
+accuracies = cbind(as.matrix(expectedAccuracy), as.matrix(clean_accuracy))
+
+
+# persist results for the test harness (see CleaningTestCompare.java)
+write(pip, output+"/pipelines.csv", format="csv")
+write(hp, output+"/hyperparams.csv", format="csv")
+write(acc, output+"/accuracies.csv", format="csv")
+write(accuracies , output+"/BestAccuracy.csv", format="csv")
+write(result , $O)
\ No newline at end of file

Reply via email to