This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new d39f745a85 [SYSTEMDS-3277] Performance decisionTree/randomForest 
(vectorization)
d39f745a85 is described below

commit d39f745a85cfe1dfb976fd85ddb6130fd845a942
Author: Matthias Boehm <[email protected]>
AuthorDate: Thu Apr 6 20:07:21 2023 +0200

    [SYSTEMDS-3277] Performance decisionTree/randomForest (vectorization)
    
    This patch further improves the performance of the decisionTree and
    transitively randomForest builtin functions. We now vectorize the
    inner loop matrix-vector multiplications and other computations over
    split candidates into matrix-matrix multiplications.
    
    On a larger sample of the nashville dataset, the patch improves
    performance by 4x as follows:
    
    OLD:
    Total execution time:           57.521 sec.
    HOP DAGs recompiled (PRED, SB): 172/407.
    HOP DAGs recompile time:        0.236 sec.
    Functions recompiled:           68.
    Functions recompile time:       0.769 sec.
    Heavy hitter instructions:
      1  ba+*             64.440    376497
      2  m_decisionTree   55.493         1
      3  findBestSplit    55.190        65
      4  ctable           10.827    376427
      5  *                 9.750   2634861
      6  rmvar             4.779  11686188
      7  uak+              3.888   1881914
      8  createvar         2.880   7536038
      9  uark+             2.875   1129060
     10  ==                1.771    376441
    
    NEW:
    Total execution time:           15.635 sec.
    HOP DAGs recompiled (PRED, SB): 172/1815.
    HOP DAGs recompile time:        3.768 sec.
    Functions recompiled:           68.
    Functions recompile time:       0.289 sec.
    Heavy hitter instructions:
      1  m_decisionTree   13.608      1
      2  findBestSplit    13.325     65
      3  ba+*              8.707   7298
      4  leftIndex         6.142   2990
      5  r'                3.752   6000
      6  ==                2.574   1521
      7  *                 2.516   7560
      8  rand              2.450   2931
      9  uppertri          1.608   1430
     10  sp_csvrblk        1.032      1
---
 scripts/builtin/decisionTree.dml | 64 +++++++++++++++++++++-------------------
 1 file changed, 34 insertions(+), 30 deletions(-)

diff --git a/scripts/builtin/decisionTree.dml b/scripts/builtin/decisionTree.dml
index 2683521d15..5e72127dd1 100644
--- a/scripts/builtin/decisionTree.dml
+++ b/scripts/builtin/decisionTree.dml
@@ -78,13 +78,13 @@ m_decisionTree = function(Matrix[Double] X, Matrix[Double] 
y, Matrix[Double] cty
   m = nrow(X); n = ncol(X);
   classify = (as.scalar(ctypes[1,n+1]) == 2);
 
-  fdom = colMaxs(X);                  # num distinct per feature
+  fdom = max(colMaxs(X),2);           # num distinct per feature
   foffb = t(cumsum(t(fdom))) - fdom;  # feature begin
   foffe = t(cumsum(t(fdom)))          # feature end
   rix = matrix(seq(1,m)%*%matrix(1,1,n), m*n, 1)
   cix = matrix(X + foffb, m*n, 1);
   X2 = table(rix, cix, 1, m, as.scalar(foffe[,n]), FALSE); #one-hot encoded
-  y2 = t(table(seq(1,m), y));
+  y2 = table(seq(1,m), y);
   cnt = colSums(X2);
   I = matrix(1, rows=1, cols=nrow(X));
 
@@ -92,7 +92,7 @@ m_decisionTree = function(Matrix[Double] X, Matrix[Double] y, 
Matrix[Double] cty
     print("decisionTree: initialize with max_depth=" + max_depth + ", 
max_features="
       + max_features + ", impurity=" + impurity + ", seed=" + seed + ".");
     print("decisionTree: basic statistics:");
-    print("-- impurity: " + computeImpurity(y2, I, impurity) );
+    print("-- impurity: " + as.scalar(computeImpurity(y2, I, impurity)));
     print("-- minFeatureCount: " + min(cnt));
     print("-- maxFeatureCount: " + max(cnt));
   }
@@ -164,26 +164,30 @@ findBestSplit = function(Matrix[Double] X2, 
Matrix[Double] y2, Matrix[Double] fo
   # finding a cutoff point in the recoded/binned representation)
   R = matrix(0, rows=3, cols=nrow(feat));
   parfor( i in 1:nrow(feat) ) {
-    f = as.scalar(feat[i]);
-    beg = as.scalar(foffb[1,f])+1;
-    end = as.scalar(foffe[1,f]);
-    bestig = 0.0; bestv = -1;
-    for(j in beg:end-1 ) { # lte semantics
-       # construct predicate 0/1 vector
-       p = table(seq(beg, j), 1, ncol(X2), 1);
-       # find rows that match at least one value and appear in I
-       Ileft = (t(X2 %*% p) * I) != 0;
-       Iright = I * (Ileft==0);
-       # compute information gain
-       ig = computeImpurity(y2, I, impurity)
-            - sum(Ileft)/numI * computeImpurity(y2, Ileft, impurity)
-            - sum(Iright)/numI * computeImpurity(y2, Iright, impurity);
-       # track best split value and index, incl validity
-       if( ig > bestig & sum(Ileft) >= min_leaf & sum(Iright) >= min_leaf ) {
-          bestig = ig;
-          bestv = j;
-       }
-    }
+    f = as.scalar(feat[i]);        # feature
+    beg = as.scalar(foffb[1,f])+1; # feature start in X2
+    end = as.scalar(foffe[1,f]);   # feature end in X2
+    belen = end-beg;
+    while(FALSE){} # make beg/end known
+
+    # construct 0/1 predicate vectors with <= semantics
+    # find rows that match at least one value and appear in I
+    # (vectorized evaluation, each column in P is a split candidate)
+    P = matrix(0, ncol(X2), belen+1);
+    P[beg:end-1,1:belen+1] = upper.tri(target=matrix(1,belen,belen+1), 
diag=TRUE);
+    Ileft = (t(X2 %*% P) * I) != 0;
+    Iright = (Ileft==0) * I;
+
+    # compute information gain for all split candidates
+    ig = as.scalar(computeImpurity(y2, I, impurity))
+         - rowSums(Ileft)/numI * computeImpurity(y2, Ileft, impurity)
+         - rowSums(Iright)/numI * computeImpurity(y2, Iright, impurity);
+    ig = replace(target=ig, pattern=NaN, replacement=0);
+
+    # track best split value and index, incl validity
+    valid = (rowSums(Ileft)>=min_leaf) & (rowSums(Iright)>=min_leaf);
+    bestig = max(valid*ig);
+    bestv = ifelse(bestig>0, 
nrow(valid)-as.scalar(rowIndexMax(t(rev(valid*ig))))+beg, -1);
     R[,i] = as.matrix(list(f, bestig, bestv));
   }
   ix = as.scalar(rowIndexMax(R[2,]));
@@ -206,14 +210,14 @@ findBestSplit = function(Matrix[Double] X2, 
Matrix[Double] y2, Matrix[Double] fo
 }
 
 computeImpurity = function(Matrix[Double] y2, Matrix[Double] I, String 
impurity)
-  return(Double score)
+  return(Matrix[Double] score)
 {
-  f = rowSums(y2 * I) / sum(I); # rel. freq. per category/bin
-  score = 0.0;
+  f = (I %*% y2) / rowSums(I); # rel. freq. per category/bin
+  score = matrix(0, nrow(I), 1);
   if( impurity == "gini" )
-    score = 1 - sum(f^2); # sum(f*(1-f));
+    score = 1 - rowSums(f^2); # sum(f*(1-f));
   else if( impurity == "entropy" )
-    score = sum(-f * log(f));
+    score = rowSums(-f * log(f));
   else
     stop("decisionTree: unsupported impurity measure: "+impurity);
 }
@@ -221,9 +225,9 @@ computeImpurity = function(Matrix[Double] y2, 
Matrix[Double] I, String impurity)
 computeLeafLabel = function(Matrix[Double] y2, Matrix[Double] I, Boolean 
classify, Boolean verbose)
   return(Double label)
 {
-  f = rowSums(y2 * I) / sum(I);
+  f = (I %*% y2) / sum(I);
   label = ifelse(classify,
-    as.scalar(rowIndexMax(t(f))), sum(f*seq(1,nrow(f))));
+    as.scalar(rowIndexMax(f)), sum(t(f)*seq(1,ncol(f))));
   if(verbose)
     print("-- leaf node label: " + label +" ("+sum(I)*max(f)+"/"+sum(I)+")");
 }

Reply via email to