This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemds.git
commit fe3fad7ca9c986f97ca5b29856d3d37263280bc1 Author: Matthias Boehm <[email protected]> AuthorDate: Tue Aug 10 21:55:11 2021 +0200 [MINOR] Performance tomeklink builtin function (task-parallel) This patch makes a minor performance improvement at script level for the tomeklink builtin function for under-sampling. In detail, we now avoid unnecessary sorting and use task-parallel for loops. Later, we might switch the manhatten distance with the euclidean dist() in order to fully vectorize the distance computation and exploit matrix multiplications. On a machine with 112 vcores, and a scenario of calling tomeklink on a 10K x 1K matrix, the runtime improved from 342s to 45s. --- scripts/builtin/tomeklink.dml | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/scripts/builtin/tomeklink.dml b/scripts/builtin/tomeklink.dml index 3b4fc6f..18daafe 100644 --- a/scripts/builtin/tomeklink.dml +++ b/scripts/builtin/tomeklink.dml @@ -23,13 +23,13 @@ # COMPUTES TOMEK LINKS AND DROPS THEM FROM DATA MATRIX AND LABEL VECTOR # DROPS ONLY THE MAJORITY LABEL AND CORRESPONDING POINT OF TOMEK LINKS # -# INPUT PARAMETERS: -# --------------------------------------------------------------------------------------------- -# NAME TYPE DEFAULT MEANING -# --------------------------------------------------------------------------------------------- -# X MATRIX --- Data Matrix (nxm) -# y MATRIX --- Label Matrix (nx1), greater than zero -# --------------------------------------------------------------------------------------------- +# INPUT PARAMETERS: +# ------------------------------------------------------------ +# NAME TYPE DEFAULT MEANING +# ------------------------------------------------------------ +# X MATRIX --- Data Matrix (nxm) +# y MATRIX --- Label Matrix (nx1), greater than zero +# ------------------------------------------------------------ # OUTPUT: # X_under - Data Matrix without Tomek links # y_under - Labels corresponding to undersampled data @@ -37,12 +37,12 @@ m_tomeklink = function(Matrix[Double] X, Matrix[Double] y) -return (Matrix[Double] X_under, Matrix[Double] y_under, Matrix[Double] drop_idx) { - +return (Matrix[Double] X_under, Matrix[Double] y_under, Matrix[Double] drop_idx) +{ ymin = min(y) if(ymin == 0) y = y + 1 - + # # find the majority labels label = table(y, 1) majority_label = as.scalar(rowIndexMax(t(label))) @@ -54,18 +54,17 @@ return (Matrix[Double] X_under, Matrix[Double] y_under, Matrix[Double] drop_idx) drop_idx = removeEmpty(target=drop_idx, margin="rows", select = tomek_links) if(ymin) y = y - 1 - } - # get the nearest neighbour index get_nn = function(Matrix[Double] X) return (Matrix[Double] nn) { + # TODO exchange manhatten by euclidean dist()? nn = matrix(0, rows = nrow(X), cols = 1) - for (i in 1:nrow(X)) { - dists = rowSums((X - X[i,])^2) - sort_dists = order(target = dists, by = 1, decreasing = FALSE, index.return = TRUE) - nn[i, 1] = as.scalar(sort_dists[2, 1]) # nearest, not self + parfor (i in 1:nrow(X)) { + dists = rowSums((X - X[i,])^2) + dists[i,] = NaN; # mask out self-ref + nn[i, 1] = rowIndexMin(t(dists)) } } @@ -78,4 +77,3 @@ return (Matrix[Double] tomek_links) { links = (y != majority_label) & (nn_labels == majority_label) tomek_links = (table(nn, 1, links, nrow(y), 1) > 0) } -
