This is an automated email from the ASF dual-hosted git repository. ssiddiqi pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemds.git
commit e3ee3720c03307b447e69e0c949b0c9ae5eb3c2d Author: Shafaq Siddiqi <[email protected]> AuthorDate: Mon Sep 6 14:30:18 2021 +0200 fixing bug for winsorizing whatever I am sleepy crossV fix --- scripts/builtin/bandit.dml | 19 ++++---- scripts/builtin/tomeklink.dml | 51 ++++++++++++++++++++++ scripts/builtin/topk_cleaning.dml | 2 +- .../functions/builtin/BuiltinTomeklinkTest.java | 4 +- .../functions/pipelines/applyEvaluateTest.dml | 6 +-- .../pipelines/intermediates/classification/hp.csv | 6 +-- .../pipelines/intermediates/classification/lp.csv | 2 +- .../pipelines/intermediates/classification/pip.csv | 6 +-- 8 files changed, 73 insertions(+), 23 deletions(-) diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml index 28aa909..91abc08 100644 --- a/scripts/builtin/bandit.dml +++ b/scripts/builtin/bandit.dml @@ -53,7 +53,7 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl rows = 1, cols = NUM_FEATURES + 4 ) frameList = list() - for(s in s_max:0) { # TODO convert to parfor + parfor(s in s_max:0, check=0) { # TODO convert to parfor # result variables bracket_hp = matrix(0, rows=k*(s+1)+k, cols=HYPERPARAM_LENGTH) @@ -272,12 +272,11 @@ run_with_hyperparam = function(Frame[Unknown] lp, Frame[Unknown] ph_pip, Integer { pipList = list(lp = lp, ph = ph_pip[i], hp = hp_matrix, flags = no_of_flag_vars) [evalFunOutput, hpForPruning, changesByOp] = crossV(X=X, y=Y, cvk=cvk, evalFunHp=evalFunHp, pipList=pipList, metaList=metaList, hpForPruning=hpForPruning, - changesByOp=changesByOp, evalFunc=evaluationFunc, trainML = FALSE) - + changesByOp=changesByOp, evalFunc=evaluationFunc, trainML = 0) } else { - [eXtrain, eYtrain, eXtest, eYtest, Tr] = executePipeline(logical=lp, pipeline=ph_pip[i], X=X, Y=Y, Xtest=Xtest, Ytest=Ytest, metaList=metaList, + [eXtrain, eYtrain, eXtest, eYtest, Tr, hpForPruning, changesByOp] = executePipeline(logical=lp, pipeline=ph_pip[i], X=X, Y=Y, Xtest=Xtest, Ytest=Ytest, metaList=metaList, hyperParameters=hp_matrix, hpForPruning=hpForPruning, changesByOp=changesByOp, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE) if(max(eYtrain) == min(eYtrain)) print("Y contains only one class") @@ -603,8 +602,8 @@ return (String s) } crossV = function(Matrix[double] X, Matrix[double] y, Integer cvk, Matrix[Double] evalFunHp, List[Unknown] pipList, List[Unknown] metaList, - Matrix[Double] hpForPruning = as.matrix(0), Matrix[Double] changesByOp = as.matrix(0), String evalFunc, Boolean trainML = FALSE) -return (Matrix[Double] accuracy, Matrix[Double] hpForPruning, Matrix[Double] changesByOp) + Matrix[Double] hpForPruning = as.matrix(0), Matrix[Double] changesByOp = as.matrix(0), String evalFunc, Integer trainML = 0) +return (Matrix[Double] output, Matrix[Double] hpForPruning, Matrix[Double] changesByOp) { accuracyMatrix = matrix(0, cvk, 1) dataList = list() @@ -652,10 +651,12 @@ return (Matrix[Double] accuracy, Matrix[Double] hpForPruning, Matrix[Double] cha changesByOp=changesByOp, flagsCount=as.scalar(pipList['flags']), test=TRUE, verbose=FALSE) } # print("test out: "+nrow(testy)) - res = eval(evalFunc, list(X=trainX, Y=trainy, Xtest=testX, Ytest=testy, Xorig=as.matrix(0), evalFunHp=evalFunHp, trainML = 0)) - accuracyMatrix[i] = res + res = eval(evalFunc, list(X=trainX, Y=trainy, Xtest=testX, Ytest=testy, Xorig=as.matrix(0), evalFunHp=evalFunHp, trainML = trainML)) + accuracyMatrix[i] = res[1, 1] + evalFunHp = res[, 2:ncol(res)] } accuracy = as.matrix(mean(accuracyMatrix)) + output = cbind(accuracy, evalFunHp) } pruningSignal = function(Frame[Unknown] ph_pip, Matrix[Double] hp_matrix, Matrix[Double] hpForPruning, Matrix[Double] changesByOp) @@ -680,5 +681,3 @@ return(Boolean execute) } execute = !(changeCount > 0) } - - diff --git a/scripts/builtin/tomeklink.dml b/scripts/builtin/tomeklink.dml index 6169dbf..a48f0b7 100644 --- a/scripts/builtin/tomeklink.dml +++ b/scripts/builtin/tomeklink.dml @@ -74,6 +74,8 @@ return (Matrix[Double] nn) { dists = rowSums((X - X[i,])^2) dists[i,] = NaN; # mask out self-ref nn[i, 1] = rowIndexMin(t(dists)) + # res = naiveKNNsearch(X, X[i], 2) + # nn[i, 1] = res[1,2] } } @@ -86,3 +88,52 @@ return (Matrix[Double] tomek_links) { links = (y != majority_label) & (nn_labels == majority_label) tomek_links = (table(nn, 1, links, nrow(y), 1) > 0) } + + +#naive knn search implement +naiveKNNsearch = function( + Matrix[Double] P, + Matrix[Double] Q, + Integer K +)return( + Matrix[Double] O +){ + num_records = nrow (P); + num_features = ncol (P); + num_queries = nrow (Q); + Qt = t(Q); + PQt = P %*% Qt; + P2 = rowSums (P ^ 2); + D = -2 * PQt + P2; + if (K == 1) { + Dt = t(D); + O = rowIndexMin (Dt); + } else { + O = matrix (0, rows = num_queries, cols = K); + parfor (i in 1:num_queries) { + D_sorted=order(target=D[,i], by=1, decreasing=FALSE, index.return=TRUE); + O[i,] = t(D_sorted[1:K,1]); + } + } +} + + + +# #naive knn search implement +# KNNApprox = function( + # Matrix[Double] P, + # Matrix[Double] Q, + # Integer K +# )return( + # Matrix[Double] O +# ){ + +# [C, Y] = kmeans(X, nrow(X)/ncol(X), 25, 50, 0.0001, TRUE, 50, 1324) +# clusX = cbind(Y, X) +# clusX = order(target=X, by=1, decreasing=FALSE, index.return=FALSE); +# clus = table(Y, 1) + + +# Y_1 = kmeansPredict(X, C) +# } + diff --git a/scripts/builtin/topk_cleaning.dml b/scripts/builtin/topk_cleaning.dml index 75ee184..1b32e3c 100644 --- a/scripts/builtin/topk_cleaning.dml +++ b/scripts/builtin/topk_cleaning.dml @@ -106,7 +106,7 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a tab = table(eYtrain, 1) dist = nrow(tab) - if((nrow(eYtrain) > 0 & dist < 10)) + if(FALSE) #(nrow(eYtrain) > 0 & dist < 10) logical = logicalSeedCI else logical = logicalSeedNoCI diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinTomeklinkTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinTomeklinkTest.java index 00f0b36..411be29 100644 --- a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinTomeklinkTest.java +++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinTomeklinkTest.java @@ -36,8 +36,8 @@ public class BuiltinTomeklinkTest extends AutomatedTestBase private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinTomeklinkTest.class.getSimpleName() + "/"; private final static double eps = 1e-3; - private final static int rows = 53; - private final static int cols = 6; + private final static int rows = 50000; + private final static int cols = 60; @Override public void setUp() { diff --git a/src/test/scripts/functions/pipelines/applyEvaluateTest.dml b/src/test/scripts/functions/pipelines/applyEvaluateTest.dml index 813ef94..6edd239 100644 --- a/src/test/scripts/functions/pipelines/applyEvaluateTest.dml +++ b/src/test/scripts/functions/pipelines/applyEvaluateTest.dml @@ -82,8 +82,8 @@ return(Matrix[Double] accuracy) beta = multiLogReg(X=X, Y=Y, icpt=as.scalar(evalFunHp[1,1]), reg=as.scalar(evalFunHp[1,2]), tol=as.scalar(evalFunHp[1,3]), maxi=as.scalar(evalFunHp[1,4]), maxii=50, verbose=FALSE); - [prob, yhat, a] = multiLogRegPredict(Xtest, beta, Ytest, FALSE) - accuracy = getAccuracy(Ytest, yhat, TRUE) - print("accuracy weighted: "+accuracy) + [prob, yhat, accuracy] = multiLogRegPredict(Xtest, beta, Ytest, FALSE) + a = getAccuracy(Ytest, yhat, TRUE) + print("accuracy: "+ accuracy+", accuracy weighted: "+a) accuracy = as.matrix(accuracy) } diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv index 3d5c3ff..f92bc2f 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv @@ -1,3 +1,3 @@ -40.0,2.0,0.01816863223655686,0.9565161479438591,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,1.0,0,0,0,2.0,1.0,0.6515164788504212,0,0,0,1.0,1.0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -40.0,2.0,0.03510876761722913,0.9673791862807241,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,1.0,0,0,0,2.0,1.0,0.6149768032146687,0,0,0,1.0,1.0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -40.0,2.0,0.014861839294898092,0.9595626659056867,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,1.0,0,0,0,2.0,1.0,0.6274449265973082,0,0,0,1.0,1.0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +27.0,3.0,5.0,2.0,1.0,0,0,0,1.0,0,1.0,1.0,0,0,0,0,0,1.0,2.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +27.0,3.0,7.0,2.0,1.0,0,0,0,1.0,0,1.0,1.0,0,0,0,0,0,1.0,2.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +27.0,3.0,2.0,2.0,1.0,0,0,0,1.0,0,1.0,0,0,0,0,0,0,1.0,2.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv index 52b30dc..5824d76 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv @@ -1 +1 @@ -OTLR,EC,EC,CI,DUMMY +ED,MVI,DUMMY diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv index 2feb234..f2d0efb 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv @@ -1,3 +1,3 @@ -winsorize,imputeByMean,imputeByMedian,abstain,dummycoding -winsorize,imputeByMean,imputeByMedian,abstain,dummycoding -winsorize,imputeByMean,imputeByMedian,abstain,dummycoding +outlierBySd,forward_fill,dummycoding +outlierBySd,forward_fill,dummycoding +outlierBySd,forward_fill,dummycoding
