This is an automated email from the ASF dual-hosted git repository.
ssiddiqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new cf0acf4 [SYSTEMDS-2961] Refactor Cleaning Pipelines (2) 1. remove redundant functions 2. return execution time in executePipelines and CV 3. store execution time for each pipeline in the feature vector
cf0acf4 is described below
commit cf0acf4f9fd53419cd85da1b8cef94051b201793
Author: Shafaq Siddiqi <[email protected]>
AuthorDate: Thu Apr 22 13:39:16 2021 +0200
[SYSTEMDS-2961] Refactor Cleaning Pipelines (2)
1. remove redundant functions
2. return execution time in executePipelines and CV
3. store execution time for each pipeline in the feature vector
Closes #1246.
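For illustration only (not part of the original commit message): a minimal DML sketch of the new call pattern introduced here, in which executePipeline returns its execution time and the scoring functions return their cross-validation/comparison time, and both timings are stored in the per-pipeline feature row. Names (pipeline, X, Y, mask, fd, hp_matrix, feaVec, feaFrame, r) follow the diff below; concrete values are assumed.

  # executePipeline now returns the pipeline execution time T (in ms) as a third output
  [X, Y, T] = executePipeline(pipeline, X, Y, mask, fd, hp_matrix, no_of_flag_vars, FALSE)
  # fclassify (and compareValue) now additionally return the scoring time accT (in ms)
  [accuracy, accT] = fclassify(X, Y, mask, mlHp, dirAcc, wAccuracy, cv)
  # feature row = dataset statistics + pipeline string + accuracy + both timings
  feaFrame[r, (ncol(feaVec)+1)] = pipToString(pipeline)
  feaFrame[r, (ncol(feaVec)+2)] = accuracy
  feaFrame[r, (ncol(feaVec)+3)] = T     # pipeline execution time in ms
  feaFrame[r, (ncol(feaVec)+4)] = accT  # CV/compare time in ms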
---
scripts/builtin/bandit.dml | 173 ++++++++++++----
scripts/builtin/discoverFD.dml | 4 +-
scripts/builtin/executePipeline.dml | 9 +-
scripts/builtin/imputeByMedian.dml | 2 +-
scripts/pipelines/scripts/utils.dml | 226 ++-------------------
.../test/functions/builtin/BuiltinMiceTest.java | 2 +-
.../functions/pipelines/compareAccuracy.dml | 102 +---------
.../pipelines/intermediates/hyperparams.csv | 10 +-
.../pipelines/intermediates/pipelines.csv | 2 +-
.../scripts/functions/pipelines/mainScript.dml | 2 +-
.../functions/pipelines/testClassification.dml | 92 +++------
.../scripts/functions/pipelines/testCompare.dml | 8 +-
12 files changed, 201 insertions(+), 431 deletions(-)
diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml
index 6523354..687d76b 100644
--- a/scripts/builtin/bandit.dml
+++ b/scripts/builtin/bandit.dml
@@ -18,7 +18,6 @@
# under the License.
#
#-------------------------------------------------------------
-source("scripts/pipelines/scripts/utils.dml") as utils;
m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train,
List[Unknown] metaList, List[Unknown] targetList,
Frame[Unknown] lp, Frame[Unknown] primitives, Frame[Unknown] param, Integer k = 3, Integer R=50, Boolean verbose = TRUE)
@@ -37,15 +36,17 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, List[Unknown
B = (s_max + 1) * R;
# initialize output variables
- hparam = matrix(0, rows=k*(s_max+1), cols=55)
+ hparam = matrix(0, rows=k*(s_max+1), cols=100)
pipeline = frame(0, rows=k*(s_max+1), cols=ncol(lp)+1)
startOut=0; endOut=0;
- feaFrameOuter = frame("", rows = 1, cols = NUM_FEATURES + ncol(lp) + 1 )
+ feaFrameOuter = frame(data=["#MissingValues", "MinVla", "MaxVal", "AverageMin", "AverageMax",
+ "#CategoricalFeatures", "#NumericFeatures", "Mean", "#Outliers", "#OHEfeatures", "#Classes",
+ "Imbalance", "#rows", "#cols", "pipelines", "accuracy", "execution time in ms", "CV time in ms"], rows = 1, cols = NUM_FEATURES + 4 )
- for(s in s_max:0, check = 0) {
+ for(s in s_max:0) {
# result variables
- bracket_hp = matrix(0, rows=k*(s+1)+k, cols=55)
+ bracket_hp = matrix(0, rows=k*(s+1)+k, cols=100)
bracket_pipel = matrix(0, rows=k*(s+1)+k, cols=3)
start=1; end=0;
@@ -62,9 +63,9 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, List[Unknown
lookup = configurations
if(verbose)
- print("n "+ n +"\n R "+ R +"\n s_max "+ s_max +"\n B "+ B +"\n n "+ n
+"\n r "+ r)
+ print("n "+ n +"\nR "+ R +"\ns_max "+ s_max +"\nB "+ B +"\nn "+ n +"\nr
"+ r)
- for( i in 0:s, check=0 ) {
+ for( i in 0:s) {
# successive halving
n_i = min(max(as.integer(floor(n * eta^(-i))), 1), nrow(configurations));
r_i = as.integer(floor(r * eta^i));
@@ -72,7 +73,7 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, List[Unknown
if(verbose) {
print("no of configurations ---------"+n_i)
print("no of resources --------------"+r_i)
- print("iteration ---------------------"+i)
+ print("iteration ---------------------"+i+" out of "+s)
}
configurations = configurations[1:n_i, ]
@@ -137,36 +138,34 @@ get_physical_configurations = function(Frame[String] logical, Scalar[int] numCon
dim = primitives[,5]
dummy = primitives[,6]
scale = primitives[,7]
-
- operator = as.frame(matrix(0,nrow(outliers),1)) #combine all logical primitives
+ operator = frame(0, rows=nrow(primitives), cols=ncol(logical)) #as.frame(matrix(0,nrow(outliers),1)) #combine all logical primitives
for(j in 1:ncol(logical))
{
# extract the physical primitives
if(as.scalar(logical[1,j]) == "OTLR")
- operator = cbind(operator, outliers);
+ operator[, j] = outliers;
else if(as.scalar(logical[1,j]) == "MVI")
- operator = cbind(operator, mvi);
+ operator[, j] = mvi;
else if(as.scalar(logical[1,j]) == "NR")
- operator = cbind(operator, noise);
+ operator[, j] = noise;
else if(as.scalar(logical[1,j]) == "CI")
- operator = cbind(operator, ci);
+ operator[, j] = ci;
else if(as.scalar(logical[1,j]) == "DIM")
- operator = cbind(operator, dim);
+ operator[, j] = dim;
else if(as.scalar(logical[1,j]) == "DUMMY")
- operator = cbind(operator, dummy);
+ operator[, j] = dummy;
else if(as.scalar(logical[1,j]) == "SCALE")
- operator = cbind(operator, scale);
+ operator[, j] = scale;
else stop("invalid operation "+as.scalar(logical[1,j]))
}
- opt = operator[,2:ncol(operator)]
idx = matrix(1, rows=1, cols=ncol(logical))
# get the indexes of columns for recode transformation
index = vectorToCsv(idx)
# recode logical pipelines for easy handling
jspecR = "{ids:true, recode:["+index+"]}";
- [X, M] = transformencode(target=opt, spec=jspecR);
+ [X, M] = transformencode(target=operator, spec=jspecR);
X = replace(target= X, pattern = NaN, replacement = 0)
paramLens = matrix(0, ncol(logical), 1);
@@ -202,7 +201,7 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i, Matrix[Double
List[Unknown] targetList, Frame[Unknown] param, Frame[Unknown] featureFrameOuter, Boolean verbose)
return (Matrix[Double] output_operator, Matrix[Double] output_hyperparam, Frame[Unknown] featureFrameOuter) {
- output_hp = matrix(0, nrow(ph_pip)*r_i, 50)
+ output_hp = matrix(0, nrow(ph_pip)*r_i, 60)
output_accuracy = matrix(0, nrow(ph_pip)*r_i, 1)
output_pipelines = matrix(0, nrow(ph_pip)*r_i, 2)
@@ -213,13 +212,14 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i, Matrix[Double
id = as.matrix(ph_pip[, 1])
ph_pip = ph_pip[, 2:ncol(ph_pip)]
- feaVec = utils::gatherStats(X, Y, as.matrix(metaList['mask']), as.scalar(targetList['target']))
+ feaVec = gatherStats(X, Y, as.matrix(metaList['mask']), as.scalar(targetList['target']))
for(i in 1:nrow(ph_pip))
{
# execute configurations with r resources
[hp, no_of_res, no_of_flag_vars] = getHyperparam(ph_pip[i], param, r_i)
- feaFrame = frame("", rows = no_of_res, cols = ncol(feaVec) + ncol(ph_pip) + 1)
+ feaFrame = frame("", rows = no_of_res, cols = ncol(featureFrameOuter))
+ pip_toString = pipToString(ph_pip[i])
for(r in 1:no_of_res)
{
# as the matrix first block of r rows belongs to first operator and r+1 block of rows to second operator
@@ -229,11 +229,11 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i, Matrix[Double
indexes = cumsum(indexes)
indexes = table(indexes, 1, 1, nrow(hp), 1)
hp_matrix = removeEmpty(target = hp, margin="rows", select = indexes)
- [X, Y] = executePipeline(ph_pip[i], X, Y, as.matrix(metaList['mask']), as.matrix(metaList['fd']), hp_matrix, no_of_flag_vars, FALSE)
+ [X, Y, T] = executePipeline(ph_pip[i], X, Y, as.matrix(metaList['mask']), as.matrix(metaList['fd']), hp_matrix, no_of_flag_vars, FALSE)
if(as.scalar(targetList['target']) == "compare")
- accuracy = utils::compareValue(clone_X, X, as.matrix(targetList['cleanData']), as.matrix(metaList['mask']))
+ [accuracy, accT] = compareValue(clone_X, X, as.matrix(targetList['cleanData']), as.matrix(metaList['mask']))
else
- accuracy = fclassify(X, Y, as.matrix(metaList['mask']), as.matrix(targetList['mlHp']), as.scalar(targetList['dirAcc']),
+ [accuracy, accT] = fclassify(X, Y, as.matrix(metaList['mask']), as.matrix(targetList['mlHp']), as.scalar(targetList['dirAcc']),
as.scalar(targetList['wAccuracy']), as.scalar(targetList['cv']))
matrix_width = as.matrix(nrow(hp_matrix) * ncol(hp_matrix))
hp_vec = cbind(matrix_width, matrix(hp_matrix, rows=1, cols=nrow(hp_matrix)*ncol(hp_matrix), byrow=TRUE))
@@ -244,8 +244,10 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i, Matrix[Double
Y = clone_Y
index = index + 1
feaFrame[r, 1:ncol(feaVec)] = as.frame(feaVec)
- feaFrame[r, ncol(feaVec)+1:ncol(feaVec)+ncol(ph_pip[1])] = ph_pip[i]
- feaFrame[r, ncol(feaFrame)] = accuracy
+ feaFrame[r, (ncol(feaVec)+1)] = pip_toString
+ feaFrame[r, (ncol(feaVec)+2)] = accuracy
+ feaFrame[r, (ncol(feaVec)+3)] = T
+ feaFrame[r, (ncol(feaVec)+4)] = accT
}
X = clone_X
@@ -406,14 +408,11 @@ extractTopK = function(Frame[Unknown] pipeline, Matrix[Double] hyperparam,
}
-
-
# extract the top k pipelines for each bracket, the intermediate results
extractBracketWinners = function(Matrix[Double] pipeline, Matrix[Double] hyperparam,
Integer k, Frame[Unknown] conf)
return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams)
{
-
# bestPipeline = frameSort(bestPipeline)
hyperparam = order(target = hyperparam, by = 1, decreasing=TRUE, index.return=FALSE)
pipeline = order(target = pipeline, by = 1, decreasing=TRUE, index.return=FALSE)
@@ -443,11 +442,12 @@ return (Frame[Unknown] maxperconf)
maxperconf[1:ncol(tab),] = as.frame(t(colMaxs(tab)))
}
-
-# function to classify the data using cross validation
+###########################################################################
+## function to classify the data using cross validation
+############################################################################
fclassify = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask, Matrix[Double] MLhp,
Double testAccuracy, Boolean isWeighted, Integer cv=3)
- return (Double accuracy)
+ return (Double accuracy, Double T)
{
if(max(Y) == min(Y)) {
@@ -457,11 +457,13 @@ fclassify = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask, Ma
else {
print("STARTING "+cv+" CROSS VALIDATIONS")
# do the k = 3 cross validations
+ t1 = time()
accuracyMatrix = crossV(X, Y, cv, mask, MLhp, isWeighted)
+ T = floor((time() - t1) / 1e+6)
accuracyMatrix = removeEmpty(target=accuracyMatrix, margin="rows")
acc = colMeans(accuracyMatrix)
accuracy = as.scalar(acc[1,1])
- print("validation accuracy "+accuracy)
+ print(cv +" validation accuracy "+accuracy+" in "+T+" ms\n\n")
}
}
@@ -477,9 +479,7 @@ crossV = function(Matrix[double] X, Matrix[double] y, Integer k, Matrix[Double]
Matrix[Double] MLhp, Boolean isWeighted)
return (Matrix[Double] accuracyMatrix)
{
-
accuracyMatrix = matrix(0, k, 1)
-
dataList = list()
testL = list()
data = order(target = cbind(y, X), by = 1, decreasing=FALSE, index.return=FALSE)
@@ -499,20 +499,16 @@ return (Matrix[Double] accuracyMatrix)
start = end + 1;
end = end + idx
class_j = data[start:end, ]
-
start_i = as.scalar(fold_idxes[j, 1]);
end_i = as.scalar(fold_idxes[j, 2])
-
fold_i = rbind(fold_i, class_j[start_i:end_i, ])
}
-
dataList = append(dataList, fold_i)
fold_idxes[, 1] = fold_idxes[, 2] + 1
fold_idxes[, 2] += ins_per_fold
- while(FALSE){}
}
- for(i in seq(1,k))
+ parfor(i in seq(1,k))
{
[trainList, hold_out] = remove(dataList, i)
trainset = rbind(trainList)
@@ -530,6 +526,95 @@ return (Matrix[Double] accuracyMatrix)
}
-# data=["#MissingValues", "MinVla", "MaxVal", "AverageMin", "AverageMax",
-# "#CategoricalFeatures", "#NumericFeatures", "Mean", "#Outliers",
"#OHEfeatures", "#Classes",
-# "Imbalance", "#rows", "#cols", ""]
\ No newline at end of file
+###############################################################################################
+# The function will collect the features like statistics and pipelines and accuracy
+# so that they could be used for training a model and predicting pipelines without enumeration
+###############################################################################################
+gatherStats = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask, String target)
+return (Matrix[Double] features)
+{
+
+ features = matrix(0, rows = 1, cols= 14)
+ features[1, 1]= sum(is.na(X)) # number of missing values
+ X = replace(target= X, pattern = NaN, replacement = 0)
+ num = removeEmpty(target=X, margin="cols", select=(mask == 0))
+ # get the stats
+ features[1, 2] = min(num) # minimum value
+ features[1, 3] = max(num)
+ features[1, 4] = mean(colMins(num)) # average minimum value
+ features[1, 5] = mean(colMaxs(num)) # average maximum value
+ features[1, 6] = sum(mask) # number of categorical features
+ features[1, 7] = sum(mask == 0) # number of numerical features
+ features[1, 8] = mean(num) # mean value
+ colSd = colSds(num)
+ count3sdplus = sum(num > (colMeans(num) + 3*colSd ))
+ count3sdminus = sum(num < (colMeans(num) - 3*colSd ))
+ outliers = count3sdplus + count3sdminus
+ features[1, 9] = outliers
+ # OHE features
+ OHE = sum(colMaxs(X) * mask)
+ features[1, 10] = OHE
+ if(target != "compare")
+ {
+ ctab = table(Y, 1)
+ features[1, 11] = nrow(ctab) # number of classes
+ minCat = min(ctab) / nrow(ctab)
+ maxCat = max(ctab) / nrow(ctab)
+ # class imabalance 1=YES, 0=NO
+ features[1, 12]= ifelse((maxCat - minCat) > 0.3, 1, 0)
+ }
+ else
+ {
+ features[1, 11] = 0
+ features[1, 12] = 0
+ }
+ features[1, 13] = nrow(X)
+ features[1, 14] = ncol(X)
+
+}
+
+
+######################################################################
+# # Function for cross validation using hold out method
+# # Inputs: The input dataset X, Y and the value of k validation, mask of the
+# # dataset for OHE of categorical columns, vector of ML hyper-parameters identified
+# # via grid-search and a boolean value of (un)weighted accuracy.
+# # Output: It return a matrix having the accuracy of each fold.
+######################################################################
+
+compareValue = function(Matrix[double] dirtyX, Matrix[double] fixedX, Matrix[Double] cleanX, Matrix[Double] mask)
+return (Double precision, Double T)
+{
+ t1 = time()
+ DEFAULT = 404
+ mv = is.na(dirtyX)
+ correctionsRequired = 0
+ mv = is.na(fixedX)
+ dirtyX = replace(target= dirtyX, pattern=NaN, replacement=DEFAULT)
+ cleanX = replace(target= cleanX, pattern=NaN, replacement=DEFAULT)
+ fixedX = replace(target= fixedX, pattern=NaN, replacement=DEFAULT)
+ diffCleanDirty = sum((abs(cleanX - dirtyX) < 0.001) < 1) #sum(cleanX == dirtyX) #
+ print("dirty != clean: "+diffCleanDirty)
+ correctionsRequired = (abs(cleanX - dirtyX) < 0.001) < 1#dirtyX != cleanX
+ print("corrections required: "+sum(correctionsRequired))
+ correctionsMade = sum(dirtyX != fixedX)
+ print("corrections made: "+correctionsMade)
+ dim = nrow(dirtyX) * ncol(dirtyX)
+ match = (abs(cleanX - fixedX) < 0.001) * correctionsRequired
+ print("total matches "+sum(match))
+ # print("total matches \n"+toString(match))
+ precision = max(0.001, sum(match) / correctionsMade)
+ T = floor((time() - t1) / 1e+6)
+ print("Precision: "+toString(precision) + " in "+T+" ms")
+
+
+}
+
+pipToString = function(Frame[String] F)
+return (String s)
+{
+ s = ""
+ for(i in 1:ncol(F))
+ s = s + as.scalar(F[,i])+";"
+
+}
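Editorial note (illustration only, not part of the patch): the precision computed by compareValue above counts a cell as a true positive only when the repaired value agrees with the clean value (within 1e-3) at a position that actually differed between the dirty and clean data, and divides by the number of cells the pipeline changed. With assumed counts of 10 cells requiring correction, 8 cells changed by the pipeline (correctionsMade = 8), and 6 of those changes matching the clean values at required positions (sum(match) = 6), the reported precision is max(0.001, 6/8) = 0.75.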
diff --git a/scripts/builtin/discoverFD.dml b/scripts/builtin/discoverFD.dml
index 1bb0a21..dbc4fbf 100644
--- a/scripts/builtin/discoverFD.dml
+++ b/scripts/builtin/discoverFD.dml
@@ -74,11 +74,11 @@ m_discoverFD = function(Matrix[Double] X, Matrix[Double] Mask, Double threshold)
parfor(i in 1 : d, check=0) {
index_i = as.scalar(cm2[i,1])
ndX = as.scalar(cm[1,index_i])
- if( ndX!=1 & ndX!=n) {
+ if(ndX!=1 & ndX != n) {
Xi = X[,index_i];
k = ifelse(threshold < 1, 1, (i+1)); # enumerate only upper triangle if threshold = 1
parfor(j in k:d , check=0) {
- if(j != i) {
+ if((j != i) & (j > 0) & (j <= d)) {
index_j = as.scalar(cm2[j,1])
[A_determines_B, ratio] = isFD(Xi, X[,index_j], ndX);
if(A_determines_B | ratio >= threshold)
diff --git a/scripts/builtin/executePipeline.dml b/scripts/builtin/executePipeline.dml
index 3ca0240..b646464 100644
--- a/scripts/builtin/executePipeline.dml
+++ b/scripts/builtin/executePipeline.dml
@@ -21,9 +21,9 @@
s_executePipeline = function(Frame[String] pipeline, Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask,
Matrix[Double] FD, Matrix[Double] hyperParameters, Integer flagsCount, Boolean verbose)
- return (Matrix[Double] X, Matrix[Double] Y)
+ return (Matrix[Double] X, Matrix[Double] Y, Double t2)
{
-
+ t1 = time();
print("PIPELINE EXECUTION START ... "+toString(pipeline))
if(verbose) {
@@ -47,6 +47,8 @@ s_executePipeline = function(Frame[String] pipeline, Matrix[Double] X, Matrix[D
X = confirmMeta(X, mask)
}
+ t2 = floor((time() - t1) / 1e+6)
+ print("PIPELINE EXECUTION ENDED: "+t2+" ms")
}
# This function will convert the matrix row-vector into list
@@ -214,7 +216,6 @@ return (Matrix[Double] dX_train) {
}
-
#######################################################################
# Wrapper of imputeByFD OHE call, to call inside eval as a function
# Inputs: The input dataset X, and mask of the columns and threshold value
@@ -293,7 +294,7 @@ fillDefault = function(Matrix[Double] X)
return(Matrix[Double] X){
defaullt = round(colMaxs(X) - colMins(X))
Mask = is.na(X)
- X = replace(target=X, pattern=NaN, replacement=0)
+ X = replace(target=X, pattern=NaN, replacement=max(X))
Mask = Mask * defaullt
X = X + Mask
}
diff --git a/scripts/builtin/imputeByMedian.dml b/scripts/builtin/imputeByMedian.dml
index 1a8a9f7..73931b6 100644
--- a/scripts/builtin/imputeByMedian.dml
+++ b/scripts/builtin/imputeByMedian.dml
@@ -51,7 +51,7 @@ return(Matrix[Double] X)
cols = ncol(nX)
# median imputation
colMedian = matrix(0, 1, cols)
- parfor(i in 1:cols)
+ for(i in 1:cols, check=0)
colMedian[1, i] = median(X[,i])
X_n = nX + (Mask_n * colMedian)
# mode imputation
diff --git a/scripts/pipelines/scripts/utils.dml b/scripts/pipelines/scripts/utils.dml
index aa9338a..8b23536 100644
--- a/scripts/pipelines/scripts/utils.dml
+++ b/scripts/pipelines/scripts/utils.dml
@@ -18,20 +18,8 @@
# under the License.
#
#-------------------------------------------------------------
-vectorToCsv = function(Matrix[Double] mask)
-return (String indexes){
+source("scripts/builtin/bandit.dml") as bandit;
- vector = mask * t(seq(1, ncol(mask)))
- vector = removeEmpty(target = vector, margin = "cols")
- if(nrow(vector) > ncol(vector))
- vector = t(vector)
- s = ""
- if(ncol(vector) > 1) {
- for(i in 1:ncol(vector)-1)
- s = s+as.integer(as.scalar(vector[1,i]))+","
- }
- indexes = s+as.integer(as.scalar(vector[1,ncol(vector)]))
-}
# remove empty wrapper for frames
frameRemoveEmpty = function(Frame[Unknown] target, String margin, Matrix[Double] select)
@@ -40,19 +28,6 @@ return (Frame[Unknown] frameblock)
idx = seq(1, ncol(target))
# get the indexes of columns for recode transformation
index = vectorToCsv(idx)
- #########################################################
- # vector = mask * t(seq(1, ncol(mask)))
- # vector = removeEmpty(target = vector, margin = "cols")
- # if(nrow(vector) > ncol(vector))
- # vector = t(vector)
- # s = ""
- # if(ncol(vector) > 1) {
- # for(i in 1:ncol(vector)-1)
- # s = s+as.integer(as.scalar(vector[1,i]))+","
- # }
- # index = s+as.integer(as.scalar(vector[1,ncol(vector)]))
- #########################################################
-
# recode logical pipelines for easy handling
jspecR = "{ids:true, recode:["+index+"]}";
[Xd, M] = transformencode(target=target, spec=jspecR);
@@ -60,18 +35,13 @@ return (Frame[Unknown] frameblock)
if(nrow(select) > 1)
X = removeEmpty(target = X, margin = margin, select = select)
else
- {
- X = removeEmpty(target = X, margin = margin)
- }
+ X = removeEmpty(target = X, margin = margin)
frameblock = transformdecode(target = Xd, spec = jspecR, meta = M)
frameblock = frameblock[1:nrow(X), 1:ncol(X)]
}
-
-
-
#######################################################################
# Function for group-wise/stratified sampling from all classes in labelled dataset
# Inputs: The input dataset X, Y and sampling ratio between 0 and 1
@@ -95,10 +65,8 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio)
out_s = 1
out_e = 0
end_class = 0
-
out = matrix(0, sampled, ncol(XY))
classes_ratio = floor(classes*ratio)
- print("class ratio "+toString(classes_ratio))
for(i in 1:nrow(classes))
{
end_class = end_class + as.scalar(classes[i])
@@ -132,134 +100,24 @@ return (Matrix[Double] dX_train) {
}
-# # function to classify the data using cross validation
-# fclassify = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask, Matrix[Double] MLhp,
- # Double testAccuracy, Boolean isWeighted, Integer cv=3)
- # return (Double accuracy)
-# {
-
- # if(max(Y) == min(Y)) {
- # print("Y contains only one class")
- # accuracy = as.double(0)
- # }
- # else {
- # print("STARTING "+cv+" CROSS VALIDATIONS")
- # # do the k = 3 cross validations
- # accuracyMatrix = crossV(X, Y, cv, mask, MLhp, isWeighted)
- # accuracyMatrix = removeEmpty(target=accuracyMatrix, margin="rows")
- # acc = colMeans(accuracyMatrix)
- # accuracy = as.scalar(acc[1,1])
- # print("validation accuracy "+accuracy)
- # }
-# }
-
-
-
-# ####################################################################
-# # Function for classifying the dirty dataset, makes a call to crossV()
-# # Inputs: takes the input dataset X, Y and the value of k validation, mask of the
-# # dataset for OHE of categorical columns, vector of ML hyper-parameters identified
-# # via gridsearch and a boolean value of (un)weighted accuracy.
-# # Output: It return a matrix having the accuracy of each fold.
-# ####################################################################
-# classifyDirty = function(Matrix[Double] Xtrain, Matrix[Double] ytrain, Matrix[Double] opt,
- # Matrix[Double] mask, Boolean isWeighted = TRUE, Integer cv)
- # return (Double accuracy)
-# {
- # # # classify without cleaning fill with edfault values 1
- # Xtrain = replace(target = Xtrain, pattern = NaN, replacement=0)
- # dX_train = dummycoding(Xtrain, mask)
- # accuracy = crossV(Xtrain, ytrain, cv, mask, opt, isWeighted)
- # accuracy = mean(accuracy)
- # print("cross validated dirty accuracy "+accuracy)
-# }
-
-
-# # # ######################################################################
-# # # # # Function for cross validation using hold out method
-# # # # # Inputs: The input dataset X, Y and the value of k validation, mask of the
-# # # # # dataset for OHE of categorical columns, vector of ML hyper-parameters identified
-# # # # # via gridsearch and a boolean value of (un)weighted accuracy.
-# # # # # Output: It return a matrix having the accuracy of each fold.
-# # # ######################################################################
-
-# crossV = function(Matrix[double] X, Matrix[double] y, Integer k, Matrix[Double] mask,
- # Matrix[Double] MLhp, Boolean isWeighted)
-# return (Matrix[Double] accuracyMatrix)
-# {
-
- # accuracyMatrix = matrix(0, k, 1)
-
- # dataList = list()
- # testL = list()
- # data = order(target = cbind(y, X), by = 1, decreasing=FALSE, index.return=FALSE)
- # classes = table(data[, 1], 1)
- # ins_per_fold = classes/k
- # start_fold = matrix(1, rows=nrow(ins_per_fold), cols=1)
- # fold_idxes = cbind(start_fold, ins_per_fold)
- # start_i = 0; end_i = 0; idx_fold = 1;;
- # for(i in 1:k)
- # {
- # fold_i = matrix(0, 0, ncol(data))
- # start=0; end=0;
- # for(j in 1:nrow(classes))
- # {
- # idx = as.scalar(classes[j, 1])
- # start = end + 1;
- # end = end + idx
- # class_j = data[start:end, ]
-
- # start_i = as.scalar(fold_idxes[j, 1]);
- # end_i = as.scalar(fold_idxes[j, 2])
-
- # fold_i = rbind(fold_i, class_j[start_i:end_i, ])
- # }
-
- # dataList = append(dataList, fold_i)
- # fold_idxes[, 1] = fold_idxes[, 2] + 1
- # fold_idxes[, 2] += ins_per_fold
- # while(FALSE){}
- # }
-
- # for(i in seq(1,k))
- # {
- # [trainList, hold_out] = remove(dataList, i)
- # trainset = rbind(trainList)
- # testset = as.matrix(hold_out)
- # trainX = trainset[, 2:ncol(trainset)]
- # trainy = trainset[, 1]
- # testX = testset[, 2:ncol(testset)]
- # testy = testset[, 1]
- # beta = multiLogReg(X=trainX, Y=trainy, icpt=1, reg=as.scalar(MLhp[1,1]), tol= 1e-9,
- # maxi=as.scalar(MLhp[1,2]), maxii= 50, verbose=FALSE);
- # [prob, yhat, a] = multiLogRegPredict(testX, beta, testy, FALSE)
- # accuracy = getAccuracy(testy, yhat, isWeighted)
- # accuracyMatrix[i] = accuracy
- # }
-# }
-
-
-######################################################################
-# # Function for cross validation using hold out method
-# # Inputs: The input dataset X, Y and the value of k validation, mask of the
-# # dataset for OHE of categorical columns, vector of ML hyper-parameters identified
-# # via grid-search and a boolean value of (un)weighted accuracy.
-# # Output: It return a matrix having the accuracy of each fold.
-######################################################################
-
-compareValue = function(Matrix[double] dirtyX, Matrix[double] cleanX, Matrix[Double] fixedX, Matrix[Double] mask)
-return (Double precision)
+####################################################################
+# Function for classifying the dirty dataset, makes a call to crossV()
+# Inputs: takes the input dataset X, Y and the value of k validation, mask of the
+# dataset for OHE of categorical columns, vector of ML hyper-parameters identified
+# via gridsearch and a boolean value of (un)weighted accuracy.
+# Output: It return a matrix having the accuracy of each fold.
+####################################################################
+classifyDirty = function(Matrix[Double] Xtrain, Matrix[Double] ytrain, Matrix[Double] opt,
+ Matrix[Double] mask, Boolean isWeighted = TRUE, Integer cv)
+ return (Double accuracy)
{
- dirtyX = replace(target= dirtyX, pattern=NaN, replacement=0)
- cleanX = replace(target= cleanX, pattern=NaN, replacement=0)
- fixedX = replace(target= fixedX, pattern=NaN, replacement=0)
- correctionsRequired = dirtyX != cleanX
- correctionsMade = (dirtyX != fixedX)
- allCorrections_ = sum(correctionsMade)
- match = (abs(cleanX - fixedX) < 0.1) * correctionsRequired
- precision = max(0.001, sum(match) / allCorrections_)
- print("---------------------------------true positives are
"+toString(precision))
+ # # classify without cleaning fill with edfault values 1
+ Xtrain = replace(target = Xtrain, pattern = NaN, replacement=0)
+ dX_train = dummycoding(Xtrain, mask)
+ accuracy = bandit::crossV(Xtrain, ytrain, cv, mask, opt, isWeighted)
+ accuracy = mean(accuracy)
+ print("cross validated dirty accuracy "+accuracy)
}
# constraints over hyper parameters
@@ -281,7 +139,6 @@ return (Double minVal, Double maxVal) {
minVal = 2.0
}
}
-
}
@@ -304,50 +161,3 @@ return(Boolean validForResources)
validForResources = count > 0
}
-
-###############################################################################################
-# The function will collect the features like statistics and pipelines and accuracy
-# so that they could be used for training a model and predicting pipelines without enumeration
-###############################################################################################
-gatherStats = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask, String target)
-return (Matrix[Double] features)
-{
-
- features = matrix(0, rows = 1, cols= 14)
- features[1, 1]= sum(is.na(X)) # number of missing values
- X = replace(target= X, pattern = NaN, replacement = 0)
- num = removeEmpty(target=X, margin="cols", select=(mask == 0))
- # get the stats
- features[1, 2] = min(num) # minimum value
- features[1, 3] = max(num)
- features[1, 4] = mean(colMins(num)) # average minimum value
- features[1, 5] = mean(colMaxs(num)) # average maximum value
- features[1, 6] = sum(mask) # number of categorical features
- features[1, 7] = sum(mask == 0) # number of numerical features
- features[1, 8] = mean(num) # mean value
- colSd = colSds(num)
- count3sdplus = sum(num > (colMeans(num) + 3*colSd ))
- count3sdminus = sum(num < (colMeans(num) - 3*colSd ))
- outliers = count3sdplus + count3sdminus
- features[1, 9] = outliers
- # OHE features
- OHE = sum(colMaxs(X) * mask)
- features[1, 10] = OHE
- if(target != "compare")
- {
- ctab = table(Y, 1)
- features[1, 11] = nrow(ctab) # number of classes
- minCat = min(ctab) / nrow(ctab)
- maxCat = max(ctab) / nrow(ctab)
- # class imabalance 1=YES, 0=NO
- features[1, 12]= ifelse((maxCat - minCat) > 0.3, 1, 0)
- }
- else
- {
- features[1, 11] = 0
- features[1, 12] = 0
- }
- features[1, 13] = nrow(X)
- features[1, 14] = ncol(X)
-
-}
\ No newline at end of file
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinMiceTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinMiceTest.java
index 1328112..ba2f2ef 100644
--- a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinMiceTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinMiceTest.java
@@ -87,7 +87,7 @@ public class BuiltinMiceTest extends AutomatedTestBase {
loadTestConfiguration(getTestConfiguration(TEST_NAME));
String HOME = SCRIPT_DIR + TEST_DIR;
fullDMLScriptName = HOME + TEST_NAME + ".dml";
- programArgs = new String[]{"-nvargs", "X=" + DATASET, "Mask="+input("M"),
+ programArgs = new String[]{"-nvargs", "X=" + DATASET, "Mask="+input("M"),
"iteration=" + iter, "dataN=" + output("N"),
"dataC=" + output("C")};
if (lineage) {
programArgs = (String[]) ArrayUtils.addAll(programArgs, new String[] {
diff --git a/src/test/scripts/functions/pipelines/compareAccuracy.dml b/src/test/scripts/functions/pipelines/compareAccuracy.dml
index 5cd7cda..22165c6 100644
--- a/src/test/scripts/functions/pipelines/compareAccuracy.dml
+++ b/src/test/scripts/functions/pipelines/compareAccuracy.dml
@@ -40,6 +40,8 @@
######################################################################################################################
source("scripts/pipelines/scripts/utils.dml") as utils;
+source("scripts/builtin/bandit.dml") as bandit;
+
F = read($1, data_type="frame", format="csv", header=FALSE,
naStrings= ["NA", "null"," ","NaN", "nan", "", "?", "99999"]);
@@ -95,37 +97,9 @@ FD = FD > 0
# this condition is unnecessary here in this case because the input dataset is balanced and
# instead of diving the dataset into train/test I am doing cross validations
-print("hp matrix")
no_of_param = as.scalar(hp[1, 1]) + 1
hp_width= hp[1, 2:no_of_param]
hp_matrix = matrix(hp_width, rows=ncol(pip), cols=ncol(hp_width)/ncol(pip))
-index = 1
-# for(i in 1:ncol(pip))
-# {
- # no_of_param = as.scalar(hp[1, index])
- # hp_matrix[i, 1:no_of_param] = hp[1, 2:no_of_param+1]
- # index = index + no_of_param + 2
-# }
-
-
-
-print(toString(hp_matrix))
-
-# while(k <= ncol(pip))
-# {
- # end = as.integer(i+as.integer(as.scalar(hp[1,i])))
- # mat = hp[1, i+1:end]
- # i = end + 1
- # if(as.scalar(pip[1,k]) != "SMOTE") {
- # pip1 = cbind(pip1, pip[1,k] )
- # ls = append(ls, mat)
- # }
- # k = k + 1
-# }
-
-
-print("ncol in X "+ncol(eX))
-print("ncol in mask "+ncol(getMask))
# # clean using best pipeline
[cX , cY] = executePipeline(pip[1], eX, eY, getMask, FD, hp_matrix, 5, FALSE)
@@ -148,12 +122,12 @@ oX = oX[, 1:ncol(oX) - 1]
# do the k cross validations for original clean data
-accuracyMatrix = crossV(oX, oY, 3, as.matrix(0), matrix("0.000001 100", rows=1, cols=2), TRUE)
+accuracyMatrix = bandit::crossV(oX, oY, 3, as.matrix(0), matrix("0.000001 100", rows=1, cols=2), TRUE)
accuracyMatrix = removeEmpty(target=accuracyMatrix, margin="rows")
oAcc = mean(accuracyMatrix)
# do the k cross validations for cleaned data
-accuracyMatrix = crossV(cX, cY, 3, as.matrix(0), matrix("0.000001 100", rows=1, cols=2), TRUE)
+accuracyMatrix = bandit::crossV(cX, cY, 3, as.matrix(0), matrix("0.000001 100", rows=1, cols=2), TRUE)
accuracyMatrix = removeEmpty(target=accuracyMatrix, margin="rows")
cAcc = mean(accuracyMatrix)
tol = 1
@@ -163,71 +137,3 @@ print("clean accuracy "+cAcc)
print("original accuracy "+oAcc)
write(results, $5, format = "text")
-
-# ######################################################################
-# # # Function for cross validation using hold out method
-# # # Inputs: The input dataset X, Y and the value of k validation, mask of the
-# # # dataset for OHE of categorical columns, vector of ML hyper-parameters identified
-# # # via gridsearch and a boolean value of (un)weighted accuracy.
-# # # Output: It return a matrix having the accuracy of each fold.
-# ######################################################################
-
-crossV = function(Matrix[double] X, Matrix[double] y, Integer k, Matrix[Double] mask,
- Matrix[Double] MLhp, Boolean isWeighted)
-return (Matrix[Double] accuracyMatrix)
-{
-
- accuracyMatrix = matrix(0, k, 1)
-
- dataList = list()
- testL = list()
- data = order(target = cbind(y, X), by = 1, decreasing=FALSE, index.return=FALSE)
- classes = table(data[, 1], 1)
- ins_per_fold = classes/k
- start_fold = matrix(1, rows=nrow(ins_per_fold), cols=1)
- fold_idxes = cbind(start_fold, ins_per_fold)
-
- start_i = 0; end_i = 0; idx_fold = 1;;
- for(i in 1:k)
- {
- fold_i = matrix(0, 0, ncol(data))
- start=0; end=0;
- for(j in 1:nrow(classes))
- {
- idx = as.scalar(classes[j, 1])
- start = end + 1;
- end = end + idx
- class_j = data[start:end, ]
-
-
- start_i = as.scalar(fold_idxes[j, 1]);
- end_i = as.scalar(fold_idxes[j, 2])
-
- fold_i = rbind(fold_i, class_j[start_i:end_i, ])
- }
-
- dataList = append(dataList, fold_i)
- fold_idxes[, 1] = fold_idxes[, 2] + 1
- fold_idxes[, 2] += ins_per_fold
- while(FALSE){}
- }
-
- for(i in seq(1,k))
- {
- [trainList, hold_out] = remove(dataList, i)
- trainset = rbind(trainList)
- testset = as.matrix(hold_out)
- trainX = trainset[, 2:ncol(trainset)]
- trainy = trainset[, 1]
- testX = testset[, 2:ncol(testset)]
- testy = testset[, 1]
- beta = multiLogReg(X=trainX, Y=trainy, icpt=1, reg=as.scalar(MLhp[1,1]), tol= 1e-9,
- maxi=as.scalar(MLhp[1,2]), maxii= 50, verbose=FALSE);
- [prob, yhat, a] = multiLogRegPredict(testX, beta, testy, FALSE)
- accuracy = getAccuracy(testy, yhat, isWeighted)
- accuracyMatrix[i] = accuracy
- }
-
-}
-
-
diff --git a/src/test/scripts/functions/pipelines/intermediates/hyperparams.csv b/src/test/scripts/functions/pipelines/intermediates/hyperparams.csv
index a1cfa4f..85972c6 100644
--- a/src/test/scripts/functions/pipelines/intermediates/hyperparams.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/hyperparams.csv
@@ -1,5 +1,5 @@
-36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,69.0,0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,69.0,1.0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,58.0,1.0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,89.0,0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,61.0,1.0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,1.0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,77.0,1.0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,0,1.0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,77.0,0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,68.0,1.0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,56.0,0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+36.0,0,0,0,0,1.0,0,0,0,2.0,2.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,3.0,56.0,0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
diff --git a/src/test/scripts/functions/pipelines/intermediates/pipelines.csv b/src/test/scripts/functions/pipelines/intermediates/pipelines.csv
index 601f82a..e9a6697 100644
--- a/src/test/scripts/functions/pipelines/intermediates/pipelines.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/pipelines.csv
@@ -2,4 +2,4 @@ imputeByMedian,scale,dummycoding,pca
imputeByMedian,scale,dummycoding,pca
imputeByMedian,scale,dummycoding,pca
imputeByMean,scale,dummycoding,pca
-imputeByMedian,scale,dummycoding,pca
+imputeByMean,scale,dummycoding,pca
diff --git a/src/test/scripts/functions/pipelines/mainScript.dml b/src/test/scripts/functions/pipelines/mainScript.dml
index 5422ae6..7883e67 100644
--- a/src/test/scripts/functions/pipelines/mainScript.dml
+++ b/src/test/scripts/functions/pipelines/mainScript.dml
@@ -139,7 +139,7 @@ FD = FD > 0
logical1 = frame(["4", "MVI", "SCALE", "DUMMY", "DIM", "0", "0", "0"], rows=1, cols=8)
logical2 = frame(["2", "MVI", "DUMMY", "0", "0", "0", "0", "0"], rows=1, cols=8)
logical3 = frame(["3", "MVI", "SCALE", "DUMMY", "0", "0", "0", "0"], rows=1, cols=8)
-logical4 = frame(["7", "MVI", "OTLR", "CI", "SCALE", "DUMMY", "DIM", "0"], rows=1, cols=8)
+logical4 = frame(["6", "MVI", "OTLR", "CI", "SCALE", "DUMMY", "DIM", "0"], rows=1, cols=8)
logical5 = frame(["7", "MVI", "OTLR", "MVI", "CI", "SCALE", "DUMMY", "DIM"], rows=1, cols=8)
logical6 = frame(["6", "OTLR", "MVI", "CI", "SCALE", "DUMMY", "DIM", "0"], rows=1, cols=8)
diff --git a/src/test/scripts/functions/pipelines/testClassification.dml b/src/test/scripts/functions/pipelines/testClassification.dml
index 1a33bf2..45c6761 100644
--- a/src/test/scripts/functions/pipelines/testClassification.dml
+++ b/src/test/scripts/functions/pipelines/testClassification.dml
@@ -24,6 +24,7 @@ source("scripts/pipelines/scripts/utils.dml") as utils;
source("scripts/pipelines/scripts/logicalFunc.dml") as logical;
source("scripts/pipelines/scripts/gridsearchMLR.dml") as gs;
+
# read the inputs
F = read($dirtyData, data_type="frame", format="csv", header=TRUE,
naStrings= ["NA", "null"," ","NaN", "nan", "", "?", "99999"]);
@@ -43,9 +44,6 @@ targetApplicaton = $target # accuracy flag
if(nrow(metaInfo) < 2)
stop("incomplete meta info")
- # Do the initial cleaning
-
-
getSchema = metaInfo[1, 2:ncol(metaInfo)]
getMask = as.matrix(metaInfo[2, 2:ncol(metaInfo)])
getFdMask = as.matrix(metaInfo[3, 2:ncol(metaInfo)]) # columns of interest for FD computation
@@ -84,39 +82,36 @@ getSchema = getSchema[, 1:ncol(getSchema) - 1] # strip the mask of class label
lgSeed = logical::generateLogicalSeed(eX, eY, getMask, targetApplicaton)
allLgs = logical::transformLogical(lgSeed)
-
d_accuracy = 0
# 4. perform the sampling
-[eX, eY] = doSample(eX, eY, sample)
+[eX, eY] = utils::doSample(eX, eY, sample)
# 5. get train test and validation set with balanced class distribution
-# [X_train, y_train, X_test, y_test] = splitBalanced(X=eX, Y=eY, splitRatio=0.7, verbose=FALSE)
-X_train = eX
-y_train = eY
+[X_train, y_train, X_test, y_test] = splitBalanced(X=eX, Y=eY, splitRatio=0.7, verbose=FALSE)
+
# 6. find the best hyper parameters for classification algorithm
# for now only find the best values for intercept and maximum outer iteration
params = list("reg", "maxi");
paramRanges = list(10^seq(0,-10), seq(10,100, 10));
-# if(sum(getMask) > 0)
-# {
- # dX_train = utils::dummycoding(replace(target = rbind(X_train, X_test), pattern = NaN, replacement=0), getMask)
- # dX_test = dX_train[nrow(y_train)+1:nrow(dX_train),]
- # dX_train = dX_train[1:nrow(y_train),]
- # [opt, loss] = gs::gridSearchMLR(dX_train, y_train, dX_test, y_test,
- # "multiLogReg", "lossFunc", params, paramRanges, FALSE);
-# }
-# else
- # [opt, loss] = gs::gridSearchMLR(X_train, y_train, X_test, y_test,
- # "multiLogReg", "lossFunc", params, paramRanges, FALSE);
-# hardcoded hyper-params for multilogReg
-opt = matrix("0 100", 1, 2)
-
+if(sum(getMask) > 0)
+{
+ dX_train = utils::dummycoding(replace(target = rbind(X_train, X_test), pattern = NaN, replacement=0), getMask)
+ dX_test = dX_train[nrow(y_train)+1:nrow(dX_train),]
+ dX_train = dX_train[1:nrow(y_train),]
+ [opt, loss] = gs::gridSearchMLR(dX_train, y_train, dX_test, y_test,
+ "multiLogReg", "lossFunc", params, paramRanges, FALSE);
+ }
+else
+ [opt, loss] = gs::gridSearchMLR(X_train, y_train, X_test, y_test,
+ "multiLogReg", "lossFunc", params, paramRanges, FALSE);
+
+# as I am testing on CV not on holdout train/test
+X_train = eX
+y_train = eY
# 7. get the cross validated accuracy on dirty dataset (only on training set)
-d_accuracy = classifyDirty(X_train, y_train, opt, getMask, weightedAccuracy, crossValidations)
-# print("dirty accuracy is "+d_accuracy)
- # [eX, eY] = prioritise(eX, eY, getMask)
-
+d_accuracy = utils::classifyDirty(X_train, y_train, opt, getMask, weightedAccuracy, crossValidations)
+
FD = discoverFD(X=replace(target=eX, pattern=NaN, replacement=1), Mask=getFdMask, threshold=0.8)
FD = (diag(matrix(1, rows=nrow(FD), cols=1)) ==0) * FD
FD = FD > 0
@@ -128,11 +123,12 @@ targetClassification = list(target=targetApplicaton, cv=crossValidations, wAccur
# # initialize output variables
pip = as.frame("NULL"); hp = matrix(0,0,0); acc = matrix(0,0,0); features = as.frame("NULL")
-[pip, hp, acc, features] = bandit(X_train=eX, Y_train=eY, metaList=metaList, targetList=targetClassification, lp=allLgs[1],
- primitives=primitives, param=param, k=topK, R=resources, verbose=TRUE);
output = $output
-write(features, output+"/features.csv", format="csv")
+
+[pip, hp, acc, features] = bandit(X_train=eX, Y_train=eY, metaList=metaList, targetList=targetClassification, lp=allLgs[1,],
+ primitives=primitives, param=param, k=topK, R=resources, verbose=TRUE);
+
if(as.scalar((is.na(acc[1,1]))) == 1 | as.scalar(acc[1,1]) < d_accuracy)
@@ -146,54 +142,24 @@ print("best hyperparam")
print(toString(hp))
print("best accuracy")
-print(toString(acc))
+print(toString(acc[1, 1]))
-clean_accuracy = as.scalar(acc[1,1])
+clean_accuracy = max(acc[1,1])
result = d_accuracy < clean_accuracy
print("result satisfied ------------"+result)
-accuracies = cbind(as.matrix(d_accuracy), as.matrix(clean_accuracy))
-
write(pip, output+"/pipelines.csv", format="csv")
write(hp, output+"/hyperparams.csv", format="csv")
write(acc, output+"/accuracies.csv", format="csv")
+accuracies = cbind(as.matrix(d_accuracy), acc[1,1])
write(accuracies , output+"/BestAccuracy.csv", format="csv")
+write(features, output+"/features.csv", format="csv")
write(result , $O)
-
-
-
-####################################################################
-# Function for classifying the dirty dataset, makes a call to crossV()
-# Inputs: takes the input dataset X, Y and the value of k validation, mask of the
-# dataset for OHE of categorical columns, vector of ML hyper-parameters identified
-# via grid-search and a boolean value of (un)weighted accuracy.
-# Output: It return a matrix having the accuracy of each fold.
-####################################################################
-classifyDirty = function(Matrix[Double] Xtrain, Matrix[Double] ytrain, Matrix[Double] opt,
- Matrix[Double] mask, Boolean isWeighted = TRUE, Integer cv)
- return (Double accuracy)
-{
- # # classify without cleaning fill with default values 1
- Xtrain = replace(target = Xtrain, pattern = NaN, replacement=0)
- if(sum(mask) > 0)
- Xtrain = utils::dummycoding(Xtrain, mask)
- # print("rows in data ")
- # print(nrow(dX_train))
- # print("column in data")
- # print(ncol(dX_train))
- accuracy = crossV(Xtrain, ytrain, cv, mask, opt, isWeighted)
- accuracy = mean(accuracy)
- print("cross validated dirty accuracy "+accuracy)
-}
-
-
-
-
lossFunc = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B)
return (Matrix[Double] loss) {
[prob, yhat, acc] = multiLogRegPredict(X=X, B=B, Y=y, verbose=FALSE)
diff --git a/src/test/scripts/functions/pipelines/testCompare.dml b/src/test/scripts/functions/pipelines/testCompare.dml
index dc2bf84..019df79 100644
--- a/src/test/scripts/functions/pipelines/testCompare.dml
+++ b/src/test/scripts/functions/pipelines/testCompare.dml
@@ -20,7 +20,7 @@
#-------------------------------------------------------------
# Generate the logical pipelines for data cleaning
-source("scripts/pipelines/scripts/utils.dml") as utils;
+
source("scripts/pipelines/scripts/logicalFunc.dml") as logical;
# read the inputs
@@ -64,7 +64,7 @@ X = dropInvalidType(F, getSchema)
if(sum(getMask) > 0)
{
# always recode the label
- index = utils::vectorToCsv(getMask)
+ index = vectorToCsv(getMask)
jspecR = "{ids:true, recode:["+index+"]}"
[eX, X_meta] = transformencode(target=rbind(cleanData, X), spec=jspecR);
@@ -78,8 +78,10 @@ if(sum(getMask) > 0)
}
# if no categorical value exist then just cast the frame into matrix
-else
+else {
eX = as.matrix(X)
+ cleanX = as.matrix(cleanData)
+}
# get the logical seed