[systemds] branch main updated: [MINOR] Cleanups in various cleaning scripts (prints, comments, validation checks etc.)

ssiddiqi Fri, 02 Dec 2022 03:03:00 -0800

This is an automated email from the ASF dual-hosted git repository.

ssiddiqi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git



The following commit(s) were added to refs/heads/main by this push:
     new a685373083 [MINOR] Cleanups in various cleaning scripts (prints, comments, validation checks etc.)
a685373083 is described below

commit a685373083cc764d271d097c324dc34adda155ab
Author: Shafaq Siddiqi <[email protected]>
AuthorDate: Fri Dec 2 12:01:25 2022 +0100

    [MINOR] Cleanups in various cleaning scripts (prints, comments, validation checks etc.)
---
 scripts/builtin/bandit.dml                         |   8 +-
 scripts/builtin/executePipeline.dml                |  32 +----
 scripts/builtin/multiLogReg.dml                    |   5 +-
 scripts/builtin/topk_cleaning.dml                  |   6 +-
 scripts/pipelines/scripts/utils.dml                | 154 ++-------------------
 .../intermediates/classification/applyFunc.csv     |   6 +-
 .../intermediates/classification/bestAcc.csv       |   6 +-
 .../pipelines/intermediates/classification/hp.csv  |   6 +-
 .../pipelines/intermediates/classification/pip.csv |   6 +-
 9 files changed, 31 insertions(+), 198 deletions(-)

diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml
index 075890740c..3699eb0c6d 100644
--- a/scripts/builtin/bandit.dml
+++ b/scripts/builtin/bandit.dml
@@ -286,7 +286,7 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, 
Integer r_i = 1, Matrix[Do
     hp = hp[, 2:totalVals]
     applyFunctions = allApplyFunctions[i]
     no_of_res = nrow(hp)
-    print("PIPELINE EXECUTION START ... "+toString(op))
+    # print("PIPELINE EXECUTION START ... "+toString(op))
     hpForPruning = matrix(0, rows=1, cols=ncol(op))
     changesByOp = matrix(0, rows=1, cols=ncol(op))
     metaList2 = metaList; #ensure metaList is no result var
@@ -341,9 +341,6 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, 
Integer r_i = 1, Matrix[Do
   changesByPipMatrix = removeEmpty(target=changesByPipMatrix, margin="rows", 
select = sel)
 }
 
-
-
-
 # extract the hyper-parameters for pipelines
 getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown]  hpList, 
Integer no_of_res, Boolean default, Integer seed = -1, Boolean enablePruning)
   return (Matrix[Double] paramMatrix, Frame[Unknown] applyFunc, Integer 
no_of_res, Integer NUM_META_FLAGS)
@@ -560,7 +557,6 @@ return (Double accuracy, Matrix[Double] evalFunHp, 
Matrix[Double] hpForPruning,
   allChanges = min(allChanges)
   changesByOp = colMaxs(cvChanges)
   accuracy =  mean(accuracyMatrix)
-  print("mean: \n"+toString(accuracyMatrix))
   print("cv accuracy: "+toString(accuracy))
 }
 
@@ -590,8 +586,6 @@ return(Boolean execute)
   execute = !(changeCount > 0)
 }
 
-
-
 getParamMeta = function(Frame[Unknown] pipeline, Frame[Unknown] hpList)
 return(Frame[Unknown] applyFunc, Matrix[Double] indexes, Matrix[Double] 
paramCount)
 {
diff --git a/scripts/builtin/executePipeline.dml b/scripts/builtin/executePipeline.dml
index cdde19e2ef..b42a49bd0e 100644
--- a/scripts/builtin/executePipeline.dml
+++ b/scripts/builtin/executePipeline.dml
@@ -184,43 +184,15 @@ return(Matrix[Double] X,Integer executeFlag)
   { 
     if(sum(mask) == 0)
       executeFlag = 0
-    else {
+    else if(sum(mask) != ncol(mask)) {
       # take categorical out and remove numerics
       X = removeEmpty(target=X, margin = "cols", select = mask)
     }
+    else X = X
   }
   else X = X
 }
 
-# confirmMeta = function(Matrix[Double] X, Matrix[Double] mask)
-# return (Matrix[Double] X)
-# {
-  # if((sum(mask) > 0) & (ncol(X) == ncol(mask)))
-  # {
-    # # get  the max + 1 for nan replacement
-    # nanMask = is.na(X)
-    # # replace nan
-    # X = replace(target = X, pattern = NaN, replacement = 9999)
-    # # take categorical out
-    # cat = removeEmpty(target=X, margin="cols", select = mask)
-    # # round categorical (if there is any floating  point)
-    # cat = round(cat)
-    # less_than_1_mask = cat < 1
-    # less_than_1 = less_than_1_mask * 9999
-    # cat = (cat * (less_than_1_mask == 0)) +  less_than_1
-    # # reconstruct original X
-    # X = X * (mask == 0)
-    # q = table(seq(1, ncol(cat)), removeEmpty(target=seq(1, ncol(mask)), 
margin="rows", 
-      # select=t(mask)), ncol(cat), ncol(X))
-    # X = (cat %*% q) + X 
-
-    # # put nan back
-    # nanMask = replace(target = nanMask, pattern = 1, replacement = NaN)
-    # X = X + nanMask
-  # }
-# }
-
-
 confirmData = function(Matrix[Double] nX, Matrix[Double] originalX, 
Matrix[Double] mask, Integer dataFlag)
 return (Matrix[Double] X)
 {
diff --git a/scripts/builtin/multiLogReg.dml b/scripts/builtin/multiLogReg.dml
index 21c87a35e4..9b7d7da79e 100644
--- a/scripts/builtin/multiLogReg.dml
+++ b/scripts/builtin/multiLogReg.dml
@@ -61,12 +61,15 @@ m_multiLogReg = function(Matrix[Double] X, Matrix[Double] 
Y, Int icpt = 2,
   # Robustness for datasets with missing values (causing NaN gradients)
   numNaNs = sum(isNaN(X))
   if( numNaNs > 0 ) {
-    print("multiLogReg: matrix X contains "+numNaNs+" missing values, 
replacing with 0.")
+    if(verbose)
+      print("multiLogReg: matrix X contains "+numNaNs+" missing values, 
replacing with 0.")
     X = replace(target=X, pattern=NaN, replacement=0);
   }
 
   # Introduce the intercept, shift and rescale the columns of X if needed
   if (icpt == 1 | icpt == 2) { # add the intercept column
+    if(N == nrow(X))
+      N = nrow(X)
     X = cbind (X, matrix (1, N, 1));
     D = ncol (X);
   }
diff --git a/scripts/builtin/topk_cleaning.dml b/scripts/builtin/topk_cleaning.dml
index d72f300b92..8fe5b48b1b 100644
--- a/scripts/builtin/topk_cleaning.dml
+++ b/scripts/builtin/topk_cleaning.dml
@@ -80,7 +80,7 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, 
Frame[Unknown] dataTest = a
   # TODO why recoding/sampling twice (within getDirtyScore)
   print("---- class-stratified sampling of feature matrix w/ f="+sample);
   if(sum(mask) > ncol(mask)/2 & nrow(eYtrain) >= 10000 & sample == 1.0) 
-    [eXtrain, eYtrain ] = utils::doErrorSample(eXtrain, eYtrain, lq, uq)
+    [eXtrain, eYtrain ] = utils::doErrorSample(eXtrain, eYtrain, lq, uq, 3500)
   else 
     [eXtrain, eYtrain] = utils::doSample(eXtrain, eYtrain, sample, mask, 
metaR, TRUE)
   t5 = time(); print("---- finalized in: "+(t5-t4)/1e9+"s");
@@ -115,14 +115,14 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, 
Frame[Unknown] dataTest = a
   [bestLogical, bestHp, con, refChanges, acc] = 
lg::enumerateLogical(X=eXtrain, y=eYtrain, Xtest=eXtest, ytest=eYtest,
   initial_population=logical, refSol=refSol, seed = seed,  max_iter=max_iter, 
metaList = metaList,
   evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, primitives=primitives, 
param=parameters,
-  dirtyScore = (dirtyScore + expectedIncrease), cv=cv, cvk=cvk, verbose=FALSE, 
ctx=ctx)
+  dirtyScore = (dirtyScore + expectedIncrease), cv=cv, cvk=cvk, verbose=TRUE, 
ctx=ctx)
   t6 = time(); print("---- finalized in: "+(t6-t5)/1e9+"s");
   topKPipelines = as.frame("NULL"); topKHyperParams = matrix(0,0,0); 
topKScores = matrix(0,0,0); applyFunc = as.frame("NULL")
   # write(acc, output+"/acc.csv", format="csv")
   # stop("end of enumlp")
   [topKPipelines, topKHyperParams, topKScores, applyFunc] = 
bandit(X_train=eXtrain, Y_train=eYtrain, X_test=eXtest, Y_test=eYtest,  
metaList=metaList,
     evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, lp=bestLogical, 
lpHp=bestHp, primitives=primitives, param=parameters, baseLineScore=dirtyScore,
-    k=topK, R=resource_val, cv=cv, cvk=cvk, ref=refChanges, seed=seed, 
enablePruning = enablePruning, verbose=FALSE);  
+    k=topK, R=resource_val, cv=cv, cvk=cvk, ref=refChanges, seed=seed, 
enablePruning = enablePruning, verbose=TRUE);  
   t7 = time(); print("-- Cleaning - Enum Physical Pipelines: 
"+(t7-t6)/1e9+"s");
 }
 
diff --git a/scripts/pipelines/scripts/utils.dml b/scripts/pipelines/scripts/utils.dml
index a4f9f6c9b7..b1b1e2086a 100644
--- a/scripts/pipelines/scripts/utils.dml
+++ b/scripts/pipelines/scripts/utils.dml
@@ -50,107 +50,6 @@ return (Frame[Unknown] frameblock)
 }
 
 
-# # #######################################################################
-# # # Function for group-wise/stratified sampling from all classes in labelled 
dataset
-# # # Inputs: The input dataset X, Y  and  sampling ratio between 0 and 1
-# # # Output: sample X and Y
-# # #######################################################################
-# # doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio, 
Matrix[Double] mask, Frame[String] metaR, Boolean verbose = FALSE)
-  # # return (Matrix[Double] sampledX, Matrix[Double] sampledY, Matrix[Double] 
filterMask)
-# # {
-  # # print("initial number of rows: " +nrow(eX))
-  # # # # # prepare feature vector for NB
-  # # beta = multiLogReg(X=eX, Y=eY, icpt=1, reg=1e-3, tol=1e-6,  maxi=50, 
maxii=50, verbose=FALSE);
-  # # [trainProbs, yhat, accuracy] = multiLogRegPredict(eX, beta, eY, FALSE)
-
-  # # # # if the operation is binary make a fixed confidence of 0.9, for 
multi-class compute kappa
-  # # # threshold = 0
-  # # # if(max(eY) == 2)
-    # # # threshold = quantile(rowMaxs(trainProbs), 0.95)
-  # # kappa = 0.0
-  # # # if(max(eY) <= 2) {
-    # # # kappa = quantile(rowMaxs(trainProbs), 0.95)
-    # # # print("for binary classification")
-  # # # }
-  # # # else {
-    # # # # compute kappa
-    # # classFreA = table(eY, 1, 1, max(eY), 1)
-    # # classFreP = table(yhat, 1, 1, max(eY), 1)
-    # # probA = classFreA/nrow(eY)
-    # # probP = classFreP/nrow(eY)
-    # # condProb = sum(probA * probP)
-    # # kappa = ((accuracy/100) - condProb) / (1 - condProb)
-    # # print("kappa for multi-class"+toString(kappa))
-  # # # }
-  # # print("threshold "+toString(kappa))
-  # # filterMask = rowMaxs(trainProbs) > kappa
-  # # # sampledX = removeEmpty(target = eX, margin = "rows", 
select=(rowMaxs(trainProbs) < threshold))
-  # # # sampledY = removeEmpty(target = eY, margin = "rows", 
select=(rowMaxs(trainProbs) < threshold))
-  # # # print("filtered number of rows: " +nrow(sampledX))
-
-  # # mask[1,1] = 0
-    # # # # # stats of wrong
-  # # maxUniques = max(colMaxs(replace(target=eX, pattern=NaN, replacement=1)) 
* mask)
-  # # print("maxUniques "+maxUniques)
-  # # while(FALSE){}
-  # # stats = matrix(0, rows=maxUniques, cols=ncol(mask))
-  # # metaInfo = frame(0, rows=nrow(metaR), cols = 2*ncol(metaR))
-  # # # m = 1
-  # # for(i in 1:ncol(mask))
-  # # {
-    # # print("meta: "+as.scalar(mask[1, i]))
-    # # if(as.scalar(mask[1, i]) == 1)
-    # # {
-      # # problematic_cats = removeEmpty(target=eX[, i], margin = "rows", 
select = (yhat != eY))
-      # # problematic_cats_sums = table(problematic_cats, 1)
-      # # stats[1:nrow(problematic_cats_sums), i] = problematic_cats_sums
-      # # stats_rowMax = rowMaxs(stats)
-      # # stats2 = (stats == stats_rowMax) * (stats_rowMax >= 100)
-      # # # colum =  metaR[, i]
-      # # # print("printing meta recoded")
-      # # # print(toString(colum))
-      # # # while(FALSE){}
-      # # # tmpValue = map(colum, "x -> x.toLowerCase()")
-      # # # tmpIndex = map(colum, "x -> x.toLowerCase()")
-      # # # metaInfo[1:nrow(tmpIndex), m] = tmpIndex
-      # # # metaInfo[1:nrow(tmpIndex), m+1] = tmpValue
-      # # # m = m + 2
-    # # }
-  # # }
-  # # filterMask = eX[, 4] == 2 | eX[, 5] == 4 | eX[, 5] == 7 | eX[, 5] == 8
-  # # filterMask = filterMask == 0
-  # # # stats = cbind(seq(1, nrow(stats)), stats, stats_rowMax)
-  # # # stats2 = cbind(seq(1, nrow(stats)), stats2)
-  # # # print("print status: \n"+toString(stats))
-  # # # print("print status 2: \n"+toString(stats2))
-  # # # print("meta infor: \n"+toString(metaInfo, rows=10))
-  # # # # create the filter mask  
-  # # print("rows taken after filtering the categories: "+sum(filterMask))
-  # # MIN_SAMPLE = 1000
-  # # sampledX = eX
-  # # sampledY = eY
-  # # ratio = ifelse(nrow(eY) > 200000, 0.6, ratio)
-  # # sampled = floor(nrow(eX) * ratio)
-  
-  # # if(sampled > MIN_SAMPLE & ratio != 1.0)
-  # # {
-    # # sampleVec = sample(nrow(eX), sampled, FALSE, 23)
-    # # P = table(seq(1, nrow(sampleVec)), sampleVec, nrow(sampleVec), 
nrow(eX))
-    # # if((nrow(eY) > 1))  # for classification
-    # # {
-      # # sampledX = P %*% eX
-      # # sampledY = P %*% eY
-    # # }
-    # # else if(nrow(eY) == 1) { # for clustering
-      # # sampledX = P %*% eX
-      # # sampledY = eY 
-    # # }
-    # # print("sampled rows "+nrow(sampledY)+" out of "+nrow(eY))
-  # # }
-
-# # }
-
-
 #######################################################################
 # Function for group-wise/stratified sampling from all classes in labelled 
dataset
 # Inputs: The input dataset X, Y  and  sampling ratio between 0 and 1
@@ -184,7 +83,7 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY, 
Double ratio, Matrix[D
 }
 
 
-doErrorSample = function(Matrix[Double] eX, Matrix[Double] eY, Double lq, 
Double uq)
+doErrorSample = function(Matrix[Double] eX, Matrix[Double] eY, Double lq, 
Double uq, Integer rowCount = 3500)
   return (Matrix[Double] sampledX, Matrix[Double] sampledY)
 {
   print("initial number of rows: " +nrow(eX))
@@ -193,57 +92,22 @@ doErrorSample = function(Matrix[Double] eX, Matrix[Double] 
eY, Double lq, Double
   beta = multiLogReg(X=eX, Y=eY, icpt=1, reg=1e-3, tol=1e-6,  maxi=20, 
maxii=20, verbose=FALSE);
   [trainProbs, yhat, accuracy] = multiLogRegPredict(eX, beta, eY, FALSE)
 
-  # kappa = 0.0
-
-  # # compute kappa
-  # classFreA = table(eY, 1, 1, max(eY), 1)
-  # classFreP = table(yhat, 1, 1, max(eY), 1)
-  # probA = classFreA/nrow(eY)
-  # probP = classFreP/nrow(eY)
-  # condProb = sum(probA * probP)
-  # kappa = ((accuracy/100) - condProb) / (1 - condProb)
-  # print("kappa for multi-class"+toString(kappa))
-  # filterMask = rowMaxs(trainProbs) < kappa
-  # threshold = ifelse(sum(filterMask) <= 2, median(rowMaxs(trainProbs)), 
kappa)
-  # threshold = ifelse(sum(filterMask) <= 2, median(rowMaxs(trainProbs)), 
kappa)
-  # print("threshold "+toString(threshold))
-
+ 
   print("applying error filter")
-  # sampledX = removeEmpty(target = eX, margin = "rows", 
select=(rowMaxs(trainProbs) < threshold))
-  # sampledY = removeEmpty(target = eY, margin = "rows", 
select=(rowMaxs(trainProbs) < threshold))
   filterMask = rowMaxs(trainProbs) <  quantile(rowMaxs(trainProbs), lq) | 
rowMaxs(trainProbs) >  quantile(rowMaxs(trainProbs), uq)
+  delta = 0.001
+  while(sum(filterMask) < rowCount & nrow(eY) > rowCount)
+  {
+    lq = lq + delta
+    uq = uq - delta
+    filterMask = rowMaxs(trainProbs) <  quantile(rowMaxs(trainProbs), lq) | 
rowMaxs(trainProbs) >  quantile(rowMaxs(trainProbs), uq)
+  }
   sampledX = removeEmpty(target = eX, margin = "rows", select=filterMask)
   sampledY = removeEmpty(target = eY, margin = "rows", select=filterMask)
   print("sampled rows "+nrow(sampledY)+" out of "+nrow(eY))
  
 }
 
-# doErrorSample = function(Matrix[Double] eX, Matrix[Double] eY)
-  # return (Matrix[Double] sampledX, Matrix[Double] sampledY, Matrix[Double] 
filterMask)
-# {
-  # print("initial number of rows: " +nrow(eX))
-  # # # # prepare feature vector for NB
-  # beta = multiLogReg(X=eX, Y=eY, icpt=1, reg=1e-3, tol=1e-6,  maxi=50, 
maxii=50, verbose=FALSE);
-  # [trainProbs, yhat, accuracy] = multiLogRegPredict(eX, beta, eY, FALSE)
-  # # # # stats of wrong
-  # maxUniques = max(colMaxs(eX) * mask)
-  # stats = matrix(0, rows=nrow(maxUniques), cols=ncol(mask))
-  # for(i in 1:ncol(mask))
-  # {
-    # if(as.scalar(mask[1, i]) == 1)
-    # {
-      # problematic_cats = removeEmpty(target=eX[, i], margin = rows, select = 
(yhat != eY))
-      # problematic_cats_sums = table(problematic_cats, 1)
-      # stats[1:nrow(problematic_cats_sums), i] = problematic_cats_sums
-    # }
-  
-  # }
-  # print(toString(stats))
-
-
-# }
-
-
 # #######################################################################
 # # Wrapper of transformencode OHE call, to call inside eval as a function
 # # Inputs: The input dataset X, and  mask of the columns
diff --git 
a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv
 
b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv
index 2133a4dd5a..af64dc371a 100644
--- 
a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv
+++ 
b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv
@@ -1,3 +1,3 @@
-forward_fill,imputeByMeanApply,NA,imputeByMedianApply,forward_fill,NA,imputeByMeanApply,dummycodingApply,0,0,0,0,0,0,0,0,0,0
-NA,forward_fill,imputeByMeanApply,imputeByMeanApply,imputeByMedianApply,forward_fill,NA,NA,imputeByMedianApply,forward_fill,NA,imputeByMeanApply,dummycodingApply,0,0,0,0,0
-NA,forward_fill,imputeByMeanApply,imputeByMeanApply,imputeByMedianApply,forward_fill,NA,NA,imputeByMedianApply,forward_fill,NA,imputeByMeanApply,dummycodingApply,0,0,0,0,0
+forward_fill,winsorizeApply,imputeByMedianApply,NA,dummycodingApply,0,0,0,0,0,0,0,0,0,0,0,0,0
+forward_fill,winsorizeApply,imputeByMedianApply,NA,dummycodingApply,0,0,0,0,0,0,0,0,0,0,0,0,0
+forward_fill,winsorizeApply,imputeByMedianApply,NA,winsorizeApply,forward_fill,imputeByMeanApply,dummycodingApply,0,0,0,0,0,0,0,0,0,0
diff --git 
a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv 
b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
index c86545e61c..e8747b356c 100644
--- 
a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
+++ 
b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
@@ -1,3 +1,3 @@
-86.23188405797102
-84.23913043478261
-83.87681159420289
+73.731884057971
+73.731884057971
+73.731884057971
diff --git 
a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv 
b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
index 1c3e619b04..750229f523 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
@@ -1,3 +1,3 @@
-56.0,1.0,1.0,0,0,0,1.0,2.0,0,0,1.0,0,0,0,2.0,1.0,0.49421066338576347,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,1.0,1.0,0,0,0,1.0,2.0,1.0,0.49421066338576347,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
 [...]
-91.0,1.0,0.3140125178611014,0,0,1.0,0,2.0,1.0,1.0,0,0,0,1.0,2.0,0,0,1.0,0,0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,1.0,0,0,0,2.0,1.0,1.0,0,0,0,1.0,2.0,1.0,0.3140125178611014,0,0,1.0,0,2.0,1.0,0.3140125178611014,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,1.0,1.0,0,0,0,1.0,2.0,1.0,0.3140125178611014,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
 [...]
-91.0,1.0,0.49421066338576347,0,0,1.0,0,2.0,1.0,1.0,0,0,0,1.0,2.0,0,0,1.0,0,0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,1.0,0,0,0,2.0,1.0,1.0,0,0,0,1.0,2.0,1.0,0.49421066338576347,0,0,1.0,0,2.0,1.0,0.49421066338576347,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,1.0,1.0,0,0,0,1.0,2.0,1.0,0.49421066338576347,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
 [...]
+40.0,1.0,1.0,0,0,0,0,1.0,2.0,2.0,0.05,0.95,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,1.0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
 [...]
+40.0,1.0,1.0,0,0,0,0,1.0,2.0,2.0,0.05,0.95,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,1.0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
 [...]
+64.0,1.0,1.0,0,0,0,0,1.0,2.0,2.0,0.05,0.95,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,1.0,0,2.0,2.0,0.05,0.95,0,0,0,1.0,0,1.0,1.0,0,0,0,0,1.0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
 [...]
diff --git 
a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv 
b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
index 0fc1fc0921..228496cbef 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
@@ -1,3 +1,3 @@
-forward_fill,imputeByMean,underSampling,imputeByMedian,forward_fill,underSampling,imputeByMean,dummycoding,0,0,0,0,0,0,0,0,0,0
-underSampling,forward_fill,imputeByMean,imputeByMean,imputeByMedian,forward_fill,underSampling,underSampling,imputeByMedian,forward_fill,underSampling,imputeByMean,dummycoding,0,0,0,0,0
-underSampling,forward_fill,imputeByMean,imputeByMean,imputeByMedian,forward_fill,underSampling,underSampling,imputeByMedian,forward_fill,underSampling,imputeByMean,dummycoding,0,0,0,0,0
+forward_fill,winsorize,imputeByMedian,tomeklink,dummycoding,0,0,0,0,0,0,0,0,0,0,0,0,0
+forward_fill,winsorize,imputeByMedian,tomeklink,dummycoding,0,0,0,0,0,0,0,0,0,0,0,0,0
+forward_fill,winsorize,imputeByMedian,tomeklink,winsorize,forward_fill,imputeByMean,dummycoding,0,0,0,0,0,0,0,0,0,0

[systemds] branch main updated: [MINOR] Cleanups in various cleaning scripts (prints, comments, validation checks etc.)

Reply via email to