This is an automated email from the ASF dual-hosted git repository.
ssiddiqi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new a685373083 [MINOR] Cleanups in various cleaning scripts (prints,
comments, validation checks etc.)
a685373083 is described below
commit a685373083cc764d271d097c324dc34adda155ab
Author: Shafaq Siddiqi <[email protected]>
AuthorDate: Fri Dec 2 12:01:25 2022 +0100
[MINOR] Cleanups in various cleaning scripts (prints, comments, validation
checks etc.)
---
scripts/builtin/bandit.dml | 8 +-
scripts/builtin/executePipeline.dml | 32 +----
scripts/builtin/multiLogReg.dml | 5 +-
scripts/builtin/topk_cleaning.dml | 6 +-
scripts/pipelines/scripts/utils.dml | 154 ++-------------------
.../intermediates/classification/applyFunc.csv | 6 +-
.../intermediates/classification/bestAcc.csv | 6 +-
.../pipelines/intermediates/classification/hp.csv | 6 +-
.../pipelines/intermediates/classification/pip.csv | 6 +-
9 files changed, 31 insertions(+), 198 deletions(-)
diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml
index 075890740c..3699eb0c6d 100644
--- a/scripts/builtin/bandit.dml
+++ b/scripts/builtin/bandit.dml
@@ -286,7 +286,7 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip,
Integer r_i = 1, Matrix[Do
hp = hp[, 2:totalVals]
applyFunctions = allApplyFunctions[i]
no_of_res = nrow(hp)
- print("PIPELINE EXECUTION START ... "+toString(op))
+ # print("PIPELINE EXECUTION START ... "+toString(op))
hpForPruning = matrix(0, rows=1, cols=ncol(op))
changesByOp = matrix(0, rows=1, cols=ncol(op))
metaList2 = metaList; #ensure metaList is no result var
@@ -341,9 +341,6 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip,
Integer r_i = 1, Matrix[Do
changesByPipMatrix = removeEmpty(target=changesByPipMatrix, margin="rows",
select = sel)
}
-
-
-
# extract the hyper-parameters for pipelines
getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown] hpList,
Integer no_of_res, Boolean default, Integer seed = -1, Boolean enablePruning)
return (Matrix[Double] paramMatrix, Frame[Unknown] applyFunc, Integer
no_of_res, Integer NUM_META_FLAGS)
@@ -560,7 +557,6 @@ return (Double accuracy, Matrix[Double] evalFunHp,
Matrix[Double] hpForPruning,
allChanges = min(allChanges)
changesByOp = colMaxs(cvChanges)
accuracy = mean(accuracyMatrix)
- print("mean: \n"+toString(accuracyMatrix))
print("cv accuracy: "+toString(accuracy))
}
@@ -590,8 +586,6 @@ return(Boolean execute)
execute = !(changeCount > 0)
}
-
-
getParamMeta = function(Frame[Unknown] pipeline, Frame[Unknown] hpList)
return(Frame[Unknown] applyFunc, Matrix[Double] indexes, Matrix[Double]
paramCount)
{
diff --git a/scripts/builtin/executePipeline.dml b/scripts/builtin/executePipeline.dml
index cdde19e2ef..b42a49bd0e 100644
--- a/scripts/builtin/executePipeline.dml
+++ b/scripts/builtin/executePipeline.dml
@@ -184,43 +184,15 @@ return(Matrix[Double] X,Integer executeFlag)
{
if(sum(mask) == 0)
executeFlag = 0
- else {
+ else if(sum(mask) != ncol(mask)) {
# take categorical out and remove numerics
X = removeEmpty(target=X, margin = "cols", select = mask)
}
+ else X = X
}
else X = X
}
-# confirmMeta = function(Matrix[Double] X, Matrix[Double] mask)
-# return (Matrix[Double] X)
-# {
- # if((sum(mask) > 0) & (ncol(X) == ncol(mask)))
- # {
- # # get the max + 1 for nan replacement
- # nanMask = is.na(X)
- # # replace nan
- # X = replace(target = X, pattern = NaN, replacement = 9999)
- # # take categorical out
- # cat = removeEmpty(target=X, margin="cols", select = mask)
- # # round categorical (if there is any floating point)
- # cat = round(cat)
- # less_than_1_mask = cat < 1
- # less_than_1 = less_than_1_mask * 9999
- # cat = (cat * (less_than_1_mask == 0)) + less_than_1
- # # reconstruct original X
- # X = X * (mask == 0)
- # q = table(seq(1, ncol(cat)), removeEmpty(target=seq(1, ncol(mask)),
margin="rows",
- # select=t(mask)), ncol(cat), ncol(X))
- # X = (cat %*% q) + X
-
- # # put nan back
- # nanMask = replace(target = nanMask, pattern = 1, replacement = NaN)
- # X = X + nanMask
- # }
-# }
-
-
confirmData = function(Matrix[Double] nX, Matrix[Double] originalX,
Matrix[Double] mask, Integer dataFlag)
return (Matrix[Double] X)
{
diff --git a/scripts/builtin/multiLogReg.dml b/scripts/builtin/multiLogReg.dml
index 21c87a35e4..9b7d7da79e 100644
--- a/scripts/builtin/multiLogReg.dml
+++ b/scripts/builtin/multiLogReg.dml
@@ -61,12 +61,15 @@ m_multiLogReg = function(Matrix[Double] X, Matrix[Double]
Y, Int icpt = 2,
# Robustness for datasets with missing values (causing NaN gradients)
numNaNs = sum(isNaN(X))
if( numNaNs > 0 ) {
- print("multiLogReg: matrix X contains "+numNaNs+" missing values, replacing with 0.")
+ if(verbose)
+ print("multiLogReg: matrix X contains "+numNaNs+" missing values, replacing with 0.")
X = replace(target=X, pattern=NaN, replacement=0);
}
# Introduce the intercept, shift and rescale the columns of X if needed
if (icpt == 1 | icpt == 2) { # add the intercept column
+ if(N == nrow(X))
+ N = nrow(X)
X = cbind (X, matrix (1, N, 1));
D = ncol (X);
}
diff --git a/scripts/builtin/topk_cleaning.dml b/scripts/builtin/topk_cleaning.dml
index d72f300b92..8fe5b48b1b 100644
--- a/scripts/builtin/topk_cleaning.dml
+++ b/scripts/builtin/topk_cleaning.dml
@@ -80,7 +80,7 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain,
Frame[Unknown] dataTest = a
# TODO why recoding/sampling twice (within getDirtyScore)
print("---- class-stratified sampling of feature matrix w/ f="+sample);
if(sum(mask) > ncol(mask)/2 & nrow(eYtrain) >= 10000 & sample == 1.0)
- [eXtrain, eYtrain ] = utils::doErrorSample(eXtrain, eYtrain, lq, uq)
+ [eXtrain, eYtrain ] = utils::doErrorSample(eXtrain, eYtrain, lq, uq, 3500)
else
[eXtrain, eYtrain] = utils::doSample(eXtrain, eYtrain, sample, mask,
metaR, TRUE)
t5 = time(); print("---- finalized in: "+(t5-t4)/1e9+"s");
@@ -115,14 +115,14 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain,
Frame[Unknown] dataTest = a
[bestLogical, bestHp, con, refChanges, acc] =
lg::enumerateLogical(X=eXtrain, y=eYtrain, Xtest=eXtest, ytest=eYtest,
initial_population=logical, refSol=refSol, seed = seed, max_iter=max_iter,
metaList = metaList,
evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, primitives=primitives,
param=parameters,
- dirtyScore = (dirtyScore + expectedIncrease), cv=cv, cvk=cvk, verbose=FALSE,
ctx=ctx)
+ dirtyScore = (dirtyScore + expectedIncrease), cv=cv, cvk=cvk, verbose=TRUE,
ctx=ctx)
t6 = time(); print("---- finalized in: "+(t6-t5)/1e9+"s");
topKPipelines = as.frame("NULL"); topKHyperParams = matrix(0,0,0);
topKScores = matrix(0,0,0); applyFunc = as.frame("NULL")
# write(acc, output+"/acc.csv", format="csv")
# stop("end of enumlp")
[topKPipelines, topKHyperParams, topKScores, applyFunc] =
bandit(X_train=eXtrain, Y_train=eYtrain, X_test=eXtest, Y_test=eYtest,
metaList=metaList,
evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, lp=bestLogical,
lpHp=bestHp, primitives=primitives, param=parameters, baseLineScore=dirtyScore,
- k=topK, R=resource_val, cv=cv, cvk=cvk, ref=refChanges, seed=seed,
enablePruning = enablePruning, verbose=FALSE);
+ k=topK, R=resource_val, cv=cv, cvk=cvk, ref=refChanges, seed=seed,
enablePruning = enablePruning, verbose=TRUE);
t7 = time(); print("-- Cleaning - Enum Physical Pipelines: "+(t7-t6)/1e9+"s");
}
diff --git a/scripts/pipelines/scripts/utils.dml b/scripts/pipelines/scripts/utils.dml
index a4f9f6c9b7..b1b1e2086a 100644
--- a/scripts/pipelines/scripts/utils.dml
+++ b/scripts/pipelines/scripts/utils.dml
@@ -50,107 +50,6 @@ return (Frame[Unknown] frameblock)
}
-# # #######################################################################
-# # # Function for group-wise/stratified sampling from all classes in labelled
dataset
-# # # Inputs: The input dataset X, Y and sampling ratio between 0 and 1
-# # # Output: sample X and Y
-# # #######################################################################
-# # doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio,
Matrix[Double] mask, Frame[String] metaR, Boolean verbose = FALSE)
- # # return (Matrix[Double] sampledX, Matrix[Double] sampledY, Matrix[Double]
filterMask)
-# # {
- # # print("initial number of rows: " +nrow(eX))
- # # # # # prepare feature vector for NB
- # # beta = multiLogReg(X=eX, Y=eY, icpt=1, reg=1e-3, tol=1e-6, maxi=50,
maxii=50, verbose=FALSE);
- # # [trainProbs, yhat, accuracy] = multiLogRegPredict(eX, beta, eY, FALSE)
-
- # # # # if the operation is binary make a fixed confidence of 0.9, for
multi-class compute kappa
- # # # threshold = 0
- # # # if(max(eY) == 2)
- # # # threshold = quantile(rowMaxs(trainProbs), 0.95)
- # # kappa = 0.0
- # # # if(max(eY) <= 2) {
- # # # kappa = quantile(rowMaxs(trainProbs), 0.95)
- # # # print("for binary classification")
- # # # }
- # # # else {
- # # # # compute kappa
- # # classFreA = table(eY, 1, 1, max(eY), 1)
- # # classFreP = table(yhat, 1, 1, max(eY), 1)
- # # probA = classFreA/nrow(eY)
- # # probP = classFreP/nrow(eY)
- # # condProb = sum(probA * probP)
- # # kappa = ((accuracy/100) - condProb) / (1 - condProb)
- # # print("kappa for multi-class"+toString(kappa))
- # # # }
- # # print("threshold "+toString(kappa))
- # # filterMask = rowMaxs(trainProbs) > kappa
- # # # sampledX = removeEmpty(target = eX, margin = "rows",
select=(rowMaxs(trainProbs) < threshold))
- # # # sampledY = removeEmpty(target = eY, margin = "rows",
select=(rowMaxs(trainProbs) < threshold))
- # # # print("filtered number of rows: " +nrow(sampledX))
-
- # # mask[1,1] = 0
- # # # # # stats of wrong
- # # maxUniques = max(colMaxs(replace(target=eX, pattern=NaN, replacement=1))
* mask)
- # # print("maxUniques "+maxUniques)
- # # while(FALSE){}
- # # stats = matrix(0, rows=maxUniques, cols=ncol(mask))
- # # metaInfo = frame(0, rows=nrow(metaR), cols = 2*ncol(metaR))
- # # # m = 1
- # # for(i in 1:ncol(mask))
- # # {
- # # print("meta: "+as.scalar(mask[1, i]))
- # # if(as.scalar(mask[1, i]) == 1)
- # # {
- # # problematic_cats = removeEmpty(target=eX[, i], margin = "rows",
select = (yhat != eY))
- # # problematic_cats_sums = table(problematic_cats, 1)
- # # stats[1:nrow(problematic_cats_sums), i] = problematic_cats_sums
- # # stats_rowMax = rowMaxs(stats)
- # # stats2 = (stats == stats_rowMax) * (stats_rowMax >= 100)
- # # # colum = metaR[, i]
- # # # print("printing meta recoded")
- # # # print(toString(colum))
- # # # while(FALSE){}
- # # # tmpValue = map(colum, "x -> x.toLowerCase()")
- # # # tmpIndex = map(colum, "x -> x.toLowerCase()")
- # # # metaInfo[1:nrow(tmpIndex), m] = tmpIndex
- # # # metaInfo[1:nrow(tmpIndex), m+1] = tmpValue
- # # # m = m + 2
- # # }
- # # }
- # # filterMask = eX[, 4] == 2 | eX[, 5] == 4 | eX[, 5] == 7 | eX[, 5] == 8
- # # filterMask = filterMask == 0
- # # # stats = cbind(seq(1, nrow(stats)), stats, stats_rowMax)
- # # # stats2 = cbind(seq(1, nrow(stats)), stats2)
- # # # print("print status: \n"+toString(stats))
- # # # print("print status 2: \n"+toString(stats2))
- # # # print("meta infor: \n"+toString(metaInfo, rows=10))
- # # # # create the filter mask
- # # print("rows taken after filtering the categories: "+sum(filterMask))
- # # MIN_SAMPLE = 1000
- # # sampledX = eX
- # # sampledY = eY
- # # ratio = ifelse(nrow(eY) > 200000, 0.6, ratio)
- # # sampled = floor(nrow(eX) * ratio)
-
- # # if(sampled > MIN_SAMPLE & ratio != 1.0)
- # # {
- # # sampleVec = sample(nrow(eX), sampled, FALSE, 23)
- # # P = table(seq(1, nrow(sampleVec)), sampleVec, nrow(sampleVec),
nrow(eX))
- # # if((nrow(eY) > 1)) # for classification
- # # {
- # # sampledX = P %*% eX
- # # sampledY = P %*% eY
- # # }
- # # else if(nrow(eY) == 1) { # for clustering
- # # sampledX = P %*% eX
- # # sampledY = eY
- # # }
- # # print("sampled rows "+nrow(sampledY)+" out of "+nrow(eY))
- # # }
-
-# # }
-
-
#######################################################################
# Function for group-wise/stratified sampling from all classes in labelled
dataset
# Inputs: The input dataset X, Y and sampling ratio between 0 and 1
@@ -184,7 +83,7 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY,
Double ratio, Matrix[D
}
-doErrorSample = function(Matrix[Double] eX, Matrix[Double] eY, Double lq,
Double uq)
+doErrorSample = function(Matrix[Double] eX, Matrix[Double] eY, Double lq,
Double uq, Integer rowCount = 3500)
return (Matrix[Double] sampledX, Matrix[Double] sampledY)
{
print("initial number of rows: " +nrow(eX))
@@ -193,57 +92,22 @@ doErrorSample = function(Matrix[Double] eX, Matrix[Double]
eY, Double lq, Double
beta = multiLogReg(X=eX, Y=eY, icpt=1, reg=1e-3, tol=1e-6, maxi=20,
maxii=20, verbose=FALSE);
[trainProbs, yhat, accuracy] = multiLogRegPredict(eX, beta, eY, FALSE)
- # kappa = 0.0
-
- # # compute kappa
- # classFreA = table(eY, 1, 1, max(eY), 1)
- # classFreP = table(yhat, 1, 1, max(eY), 1)
- # probA = classFreA/nrow(eY)
- # probP = classFreP/nrow(eY)
- # condProb = sum(probA * probP)
- # kappa = ((accuracy/100) - condProb) / (1 - condProb)
- # print("kappa for multi-class"+toString(kappa))
- # filterMask = rowMaxs(trainProbs) < kappa
- # threshold = ifelse(sum(filterMask) <= 2, median(rowMaxs(trainProbs)),
kappa)
- # threshold = ifelse(sum(filterMask) <= 2, median(rowMaxs(trainProbs)),
kappa)
- # print("threshold "+toString(threshold))
-
+
print("applying error filter")
- # sampledX = removeEmpty(target = eX, margin = "rows",
select=(rowMaxs(trainProbs) < threshold))
- # sampledY = removeEmpty(target = eY, margin = "rows",
select=(rowMaxs(trainProbs) < threshold))
filterMask = rowMaxs(trainProbs) < quantile(rowMaxs(trainProbs), lq) | rowMaxs(trainProbs) > quantile(rowMaxs(trainProbs), uq)
+ delta = 0.001
+ while(sum(filterMask) < rowCount & nrow(eY) > rowCount)
+ {
+ lq = lq + delta
+ uq = uq - delta
+ filterMask = rowMaxs(trainProbs) < quantile(rowMaxs(trainProbs), lq) | rowMaxs(trainProbs) > quantile(rowMaxs(trainProbs), uq)
+ }
sampledX = removeEmpty(target = eX, margin = "rows", select=filterMask)
sampledY = removeEmpty(target = eY, margin = "rows", select=filterMask)
print("sampled rows "+nrow(sampledY)+" out of "+nrow(eY))
}
-# doErrorSample = function(Matrix[Double] eX, Matrix[Double] eY)
- # return (Matrix[Double] sampledX, Matrix[Double] sampledY, Matrix[Double]
filterMask)
-# {
- # print("initial number of rows: " +nrow(eX))
- # # # # prepare feature vector for NB
- # beta = multiLogReg(X=eX, Y=eY, icpt=1, reg=1e-3, tol=1e-6, maxi=50,
maxii=50, verbose=FALSE);
- # [trainProbs, yhat, accuracy] = multiLogRegPredict(eX, beta, eY, FALSE)
- # # # # stats of wrong
- # maxUniques = max(colMaxs(eX) * mask)
- # stats = matrix(0, rows=nrow(maxUniques), cols=ncol(mask))
- # for(i in 1:ncol(mask))
- # {
- # if(as.scalar(mask[1, i]) == 1)
- # {
- # problematic_cats = removeEmpty(target=eX[, i], margin = rows, select =
(yhat != eY))
- # problematic_cats_sums = table(problematic_cats, 1)
- # stats[1:nrow(problematic_cats_sums), i] = problematic_cats_sums
- # }
-
- # }
- # print(toString(stats))
-
-
-# }
-
-
# #######################################################################
# # Wrapper of transformencode OHE call, to call inside eval as a function
# # Inputs: The input dataset X, and mask of the columns
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv
index 2133a4dd5a..af64dc371a 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv
@@ -1,3 +1,3 @@
-forward_fill,imputeByMeanApply,NA,imputeByMedianApply,forward_fill,NA,imputeByMeanApply,dummycodingApply,0,0,0,0,0,0,0,0,0,0
-NA,forward_fill,imputeByMeanApply,imputeByMeanApply,imputeByMedianApply,forward_fill,NA,NA,imputeByMedianApply,forward_fill,NA,imputeByMeanApply,dummycodingApply,0,0,0,0,0
-NA,forward_fill,imputeByMeanApply,imputeByMeanApply,imputeByMedianApply,forward_fill,NA,NA,imputeByMedianApply,forward_fill,NA,imputeByMeanApply,dummycodingApply,0,0,0,0,0
+forward_fill,winsorizeApply,imputeByMedianApply,NA,dummycodingApply,0,0,0,0,0,0,0,0,0,0,0,0,0
+forward_fill,winsorizeApply,imputeByMedianApply,NA,dummycodingApply,0,0,0,0,0,0,0,0,0,0,0,0,0
+forward_fill,winsorizeApply,imputeByMedianApply,NA,winsorizeApply,forward_fill,imputeByMeanApply,dummycodingApply,0,0,0,0,0,0,0,0,0,0
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
index c86545e61c..e8747b356c 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
@@ -1,3 +1,3 @@
-86.23188405797102
-84.23913043478261
-83.87681159420289
+73.731884057971
+73.731884057971
+73.731884057971
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
index 1c3e619b04..750229f523 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
@@ -1,3 +1,3 @@
-56.0,1.0,1.0,0,0,0,1.0,2.0,0,0,1.0,0,0,0,2.0,1.0,0.49421066338576347,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,1.0,1.0,0,0,0,1.0,2.0,1.0,0.49421066338576347,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
[...]
-91.0,1.0,0.3140125178611014,0,0,1.0,0,2.0,1.0,1.0,0,0,0,1.0,2.0,0,0,1.0,0,0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,1.0,0,0,0,2.0,1.0,1.0,0,0,0,1.0,2.0,1.0,0.3140125178611014,0,0,1.0,0,2.0,1.0,0.3140125178611014,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,1.0,1.0,0,0,0,1.0,2.0,1.0,0.3140125178611014,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
[...]
-91.0,1.0,0.49421066338576347,0,0,1.0,0,2.0,1.0,1.0,0,0,0,1.0,2.0,0,0,1.0,0,0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,1.0,0,0,0,2.0,1.0,1.0,0,0,0,1.0,2.0,1.0,0.49421066338576347,0,0,1.0,0,2.0,1.0,0.49421066338576347,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,1.0,1.0,0,0,0,1.0,2.0,1.0,0.49421066338576347,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
[...]
+40.0,1.0,1.0,0,0,0,0,1.0,2.0,2.0,0.05,0.95,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,1.0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
[...]
+40.0,1.0,1.0,0,0,0,0,1.0,2.0,2.0,0.05,0.95,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,1.0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
[...]
+64.0,1.0,1.0,0,0,0,0,1.0,2.0,2.0,0.05,0.95,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,1.0,0,2.0,2.0,0.05,0.95,0,0,0,1.0,0,1.0,1.0,0,0,0,0,1.0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
[...]
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
index 0fc1fc0921..228496cbef 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
@@ -1,3 +1,3 @@
-forward_fill,imputeByMean,underSampling,imputeByMedian,forward_fill,underSampling,imputeByMean,dummycoding,0,0,0,0,0,0,0,0,0,0
-underSampling,forward_fill,imputeByMean,imputeByMean,imputeByMedian,forward_fill,underSampling,underSampling,imputeByMedian,forward_fill,underSampling,imputeByMean,dummycoding,0,0,0,0,0
-underSampling,forward_fill,imputeByMean,imputeByMean,imputeByMedian,forward_fill,underSampling,underSampling,imputeByMedian,forward_fill,underSampling,imputeByMean,dummycoding,0,0,0,0,0
+forward_fill,winsorize,imputeByMedian,tomeklink,dummycoding,0,0,0,0,0,0,0,0,0,0,0,0,0
+forward_fill,winsorize,imputeByMedian,tomeklink,dummycoding,0,0,0,0,0,0,0,0,0,0,0,0,0
+forward_fill,winsorize,imputeByMedian,tomeklink,winsorize,forward_fill,imputeByMean,dummycoding,0,0,0,0,0,0,0,0,0,0