This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new 55305031bc [SYSTEMDS-2672] Fix XGBoost builtin implementation and 
internals
55305031bc is described below

commit 55305031bce20e17eae90a8a9a89334332ca33ba
Author: Matthias Boehm <[email protected]>
AuthorDate: Mon Jul 3 17:50:15 2023 +0200

    [SYSTEMDS-2672] Fix XGBoost builtin implementation and internals
    
    The recently improved rewrites triggered an issue in XGBoost, which
    used indexing and reshapes as a workaround to create new objects and
    thus avoid variable cleanup issues. With the new rewrites this
    workaround is no longer working. This patch replaces the workaround
    (but does not fix the underlying issue), fixes a few internals such as
    replace operations with empty inputs, robustness of indexing rewrites,
    and vectorizes large parts of XGBoost in order to aid debugging.
---
 scripts/builtin/xgboost.dml                        | 120 +++++++--------------
 .../java/org/apache/sysds/hops/IndexingOp.java     |   4 +-
 .../runtime/controlprogram/BasicProgramBlock.java  |   5 +-
 .../sysds/runtime/matrix/data/MatrixBlock.java     |   4 +
 .../part2/BuiltinXgBoostTest_classification.java   |   5 +-
 5 files changed, 51 insertions(+), 87 deletions(-)

diff --git a/scripts/builtin/xgboost.dml b/scripts/builtin/xgboost.dml
index d22b037fea..163b7d51a7 100644
--- a/scripts/builtin/xgboost.dml
+++ b/scripts/builtin/xgboost.dml
@@ -60,8 +60,8 @@
 # M     Matrix M where each column corresponds to a node in the learned tree
 # 
-----------------------------------------------------------------------------------
 
-m_xgboost = function(Matrix[Double] X, Matrix[Double] y, 
-  Matrix[Double] R = matrix(1,rows=1,cols=nrow(X)), Integer sml_type = 1, 
Integer num_trees = 7, 
+m_xgboost = function(Matrix[Double] X, Matrix[Double] y,
+  Matrix[Double] R = matrix(1,rows=1,cols=nrow(X)), Integer sml_type = 1, 
Integer num_trees = 7,
   Double learning_rate = 0.3, Integer max_depth = 6, Double lambda = 0.0)
   return (Matrix[Double] M)
 {
@@ -325,14 +325,10 @@ buildOneTreeClassification = function(Matrix[Double] X, 
Matrix[Double] y, Matrix
     }
 
     best_feature_index = 0.00
-    done = FALSE
     count = sum(curr_y)
-    if(count == 0)
-      done = TRUE
-    if(count == nrow(curr_y))
-      done = TRUE
+    done = (count == 0) | (count == nrow(curr_y));
 
-    if(available_rows > 1 & max_depth > level & done == FALSE) # leaf check or 
max depth check
+    if(available_rows > 1 & max_depth > level & !done) # leaf check or max 
depth check
     {
       best_feature_index = findBestFeature(X=curr_X, y=curr_y, 
sml_type=sml_type)
       type = getTypeOfFeature(R, best_feature_index)
@@ -404,7 +400,7 @@ calculateChildNodeIDs = function(Matrix[Double] node)
 # INPUT:    threshold: the value at the node we separated it, if leaf => the 
output value
 # OUTPUT:   new_M: nxn output Matrix
 addOutputRow = function(Matrix[Double] M, Matrix[Double] node, Double tree_id, 
Matrix[Double] R,
-  Double offset, Double used_col, Double threshold, Double output_value) 
+  Double offset, Double used_col, Double threshold, Double output_value)
   return (Matrix[Double] new_M) 
 {
   current_node = matrix(0, rows=6, cols=1)
@@ -496,18 +492,11 @@ updateMatrices = function(Matrix[Double] X, 
Matrix[Double] y, Matrix[Double] pre
 
#-----------------------------------------------------------------------------------------------------------------------
 # INPUT:    vector: a 1xn matrix (current datapoints which are interesting)
 # OUTPUT:   available_elements: nr. of available datapoints
-calcAvailableRows = function(Matrix[Double] vector) 
+calcAvailableRows = function(Matrix[Double] vector)
   return(Double available_elements)
 {
   assert(ncol(vector) == 1)
-  len = nrow(vector)
-  available_elements = 0.0
-  # TODO perf vectorize
-  for (index in 1:len) {
-    element = as.scalar(vector[index,])
-    if(element > 0.0)
-      available_elements = available_elements + 1.0
-  }
+  available_elements = sum(vector > 0);
 }
 
 
#-----------------------------------------------------------------------------------------------------------------------
@@ -515,12 +504,12 @@ calcAvailableRows = function(Matrix[Double] vector)
 # OUTPUT:   new_queue: the new queue after the first vector gets popped
 # OUTPUT:   node_vec: the popped vector (1xn) from the queue
 dataQueuePop = function(Matrix[Double] queue)
-  return (Matrix[Double] new_queue, Matrix[Double] node_vec) 
+  return (Matrix[Double] new_queue, Matrix[Double] node_vec)
 {
-  node_vec = matrix(queue[,1], rows=1, cols=nrow(queue)) # reshape to force 
the creation of a new object
-  node_vec = matrix(node_vec, rows=nrow(queue), cols=1)  # reshape to force 
the creation of a new object
-  len = ncol(queue)
-  if (len < 2)
+  node_vec = queue[,1] + 7 #TODO workaround variable cleanup
+  while(FALSE){}
+  node_vec = node_vec - 7;
+  if (ncol(queue) < 2)
     new_queue = matrix(0,0,0)
   else
     new_queue = matrix(queue[,2:ncol(queue)], rows=nrow(queue), 
cols=ncol(queue)-1)
@@ -531,11 +520,10 @@ dataQueuePop = function(Matrix[Double] queue)
 # INPUT:    right: a 1xn matrix
 # INPUT:    queue: a nxn matrix (queue containing values)
 # OUTPUT:   new_queue: the new queue nxn matrix
-dataQueuePush = function(Matrix[Double] left, Matrix[Double] right, 
Matrix[Double] queue)  
+dataQueuePush = function(Matrix[Double] left, Matrix[Double] right, 
Matrix[Double] queue)
   return (Matrix[Double] new_queue)
 {
-  len = ncol(queue)
-  if(len <= 0)
+  if(ncol(queue) <= 0)
     new_queue = cbind(left, right)
   else
     new_queue = cbind(queue, left, right)
@@ -546,7 +534,7 @@ dataQueuePush = function(Matrix[Double] left, 
Matrix[Double] right, Matrix[Doubl
 # INPUT:    X: a nxn matrix (the current samples with all features we observe)
 # INPUT:    y: a 1xn matrix (the current y to all observed samples)
 # OUTPUT:   lowest_residuals_index: the feature index with the lowest residuals
-findBestFeature = function(Matrix[Double] X, Matrix[Double] y, Integer 
sml_type) 
+findBestFeature = function(Matrix[Double] X, Matrix[Double] y, Integer 
sml_type)
   return (Integer lowest_residuals_index)
 {
   lowest_residuals = 0
@@ -560,7 +548,7 @@ findBestFeature = function(Matrix[Double] X, Matrix[Double] 
y, Integer sml_type)
       weights = glm(X=current_feature, Y=y, dfam=1, verbose=FALSE)
     else # Classification
       weights = glm(X=current_feature, Y=y, dfam=2, verbose=FALSE)
-      
+
     y_residual = y - current_feature %*% weights
     res = sum(y_residual ^ 2) # sum of least square calculation
 
@@ -618,16 +606,9 @@ findBestSplit = function(Integer sml_type, Matrix[Double] 
one_featureX, Double s
 #           right: a 1xn matrix with all values larger than "value"
 splitMatrixByValue = function(Matrix[Double] X, Double value) return 
(Matrix[Double] left, Matrix[Double] right) {
   assert(ncol(X) == 1)
-  left = matrix(0, rows=0, cols=1)
-  right = matrix(0, rows=0, cols=1)
-
-  # TODO perf vectorize
-  for(i in 1:nrow(X)) {
-    if(as.scalar(X[i,]) < value)
-      left = rbind(left,X[i,])
-    else
-      right = rbind(right, X[i,])
-  }
+  I = (X < value);
+  left = removeEmpty(target=X, margin="rows", select=I);
+  right = removeEmpty(target=X, margin="rows", select=!I);
 }
 
 
#-----------------------------------------------------------------------------------------------------------------------
@@ -636,25 +617,11 @@ splitMatrixByValue = function(Matrix[Double] X, Double 
value) return (Matrix[Dou
 #           value: the border to separate the values
 # OUTPUT:   left: a 1xn matrix with 1.0 if the value is smaller or 0.0 if it 
is bigger
 #           right: a 1xn matrix with 1.0 if the value is bigger or 0.0 if it 
is smaller
-splitMatrixByValueLoop = function(Matrix[Double] curr_X, Matrix[Double] X, 
Double value) 
+splitMatrixByValueLoop = function(Matrix[Double] curr_X, Matrix[Double] X, 
Double value)
   return (Matrix[Double] left, Matrix[Double] right)
 {
-  left = matrix(0, rows=nrow(X), cols=1)
-  right = matrix(0, rows=nrow(X), cols=1)
-
-  # TODO perf vectorize
-  for(i in 1:nrow(X)) {
-    if (as.scalar(X[i,]) == as.scalar(curr_X[i,])) { # if same row in curr_X 
and X
-      if (as.scalar(X[i,]) < value)
-        left[i,1] = 1
-      else
-        right[i,1] = 1
-    }
-    else { # 0 in left and right if not used in this node
-      left[i,1] = 0
-      right[i,1] = 0
-    }
-  }
+  left = (X == curr_X) & (X < value);
+  right = (X == curr_X) & (X >= value);
 }
 
 
#-----------------------------------------------------------------------------------------------------------------------
@@ -662,31 +629,20 @@ splitMatrixByValueLoop = function(Matrix[Double] curr_X, 
Matrix[Double] X, Doubl
 #           X:     a 1xn matrix the feature of the X-Matrix with all values
 # OUTPUT:   left: a 1xn matrix with 1.0 for if the value is true or 0.0 if it 
is false
 #           right: a 1xn matrix with 1.0 for if the value is false or 0.0 if 
it is true
-splitMatrixByCategory = function(Matrix[Double] curr_X, Matrix[Double] X) 
-  return (Matrix[Double] left, Matrix[Double] right) 
+splitMatrixByCategory = function(Matrix[Double] curr_X, Matrix[Double] X)
+  return (Matrix[Double] left, Matrix[Double] right)
 {
   assert(ncol(curr_X) == 1)
   assert(ncol(X) == 1)
-  left = matrix(0, rows=nrow(X), cols=1)
-  right = matrix(0, rows=nrow(X), cols=1)
-
-  for(i in 1:nrow(curr_X)) {
-    if (as.scalar(curr_X[i,]) == 1) # categorical true
-      left[i,1] = 1
-    else if(as.scalar(curr_X[i,]) == 0)  # categorical false
-      right[i,1] = 1
-    else {  # replace with NaN if not used for the current samples
-      left[i,1] = 'NaN'
-      right[i,1] = 'NaN'
-    }
-  }
+  left = replace(target=(curr_X == 1), pattern=0, replacement=NaN);
+  right = replace(target=(curr_X == 0), pattern=0, replacement=NaN);
 }
 
 
#-----------------------------------------------------------------------------------------------------------------------
 # INPUT:    row_vector: a 1xn matrix (one feature with all residuals)
 # OUTPUT:   similarity_score: the similarity score of the residuals
-calculateSimilarityScore = function (matrix[Double] row_vector, Double lambda) 
-  return (Double similarity_score) 
+calculateSimilarityScore = function (matrix[Double] row_vector, Double lambda)
+  return (Double similarity_score)
 {
   similarity_score = (sum(row_vector)^2) / (nrow(row_vector) + lambda);
 }
@@ -694,8 +650,8 @@ calculateSimilarityScore = function (matrix[Double] 
row_vector, Double lambda)
 
#-----------------------------------------------------------------------------------------------------------------------
 # INPUT:    row_vector: a 1xn matrix (one feature with all residuals)
 # OUTPUT:   similarity_score: the similarity score of the residuals
-calculateSimilarityScoreClassification = function (matrix[Double] row_vector, 
matrix[Double] predictions, Double lambda) 
-  return (Double similarity_score) 
+calculateSimilarityScoreClassification = function (matrix[Double] row_vector, 
matrix[Double] predictions, Double lambda)
+  return (Double similarity_score)
 {
   nominator = (sum(row_vector)^2)
   d =  predictions * (1 - predictions)
@@ -706,8 +662,8 @@ calculateSimilarityScoreClassification = function 
(matrix[Double] row_vector, ma
 
#-----------------------------------------------------------------------------------------------------------------------
 # INPUT:    residuals_vector: a 1xn matrix (one feature with all residuals)
 # OUTPUT:   similarity_score: the similarity score of the residuals
-calculateOutputValue = function (matrix[Double] residuals_vector, Double 
lambda) 
-  return (Double output_value) 
+calculateOutputValue = function (matrix[Double] residuals_vector, Double 
lambda)
+  return (Double output_value)
 {
   output_value = (sum(residuals_vector)) / (nrow(residuals_vector) + lambda);
   if(output_value == 'NaN') # just in case we have a node with no sample inside
@@ -717,8 +673,8 @@ calculateOutputValue = function (matrix[Double] 
residuals_vector, Double lambda)
 
#-----------------------------------------------------------------------------------------------------------------------
 # INPUT:    residuals_vector: a 1xn matrix (one feature with all residuals)
 # OUTPUT:   similarity_score: the similarity score of the residuals
-calculateOutputValueClassification = function (matrix[Double] 
residuals_vector, matrix[Double] predictions, Double lambda) 
-  return (Double output_value) 
+calculateOutputValueClassification = function (matrix[Double] 
residuals_vector, matrix[Double] predictions, Double lambda)
+  return (Double output_value)
 {
   nominator = (sum(residuals_vector))
   d =  predictions * (1 - predictions)
@@ -733,8 +689,8 @@ calculateOutputValueClassification = function 
(matrix[Double] residuals_vector,
 # INPUT:    row_vector: a 1xn matrix (one feature with all rows)
 #           prediction: a 1xn matrix, the current prediction value of each row
 # OUTPUT:   the residuals from the "prediction" as 1xn matrix
-calculateResiduals = function (matrix[Double] row_vector, matrix[Double] 
prediction) 
-  return (matrix[Double] residuals_row) 
+calculateResiduals = function (matrix[Double] row_vector, matrix[Double] 
prediction)
+  return (matrix[Double] residuals_row)
 {
   residuals_row = row_vector - prediction;
 }
@@ -742,8 +698,8 @@ calculateResiduals = function (matrix[Double] row_vector, 
matrix[Double] predict
 
#-----------------------------------------------------------------------------------------------------------------------
 # INPUT:    row_vector: a 1xn matrix which has unordered values
 # OUTPUT:   row_vector_ordered: 1xn matrix were the values are in accessing 
order
-orderFeature = function (matrix[double] row_vector) 
-  return (matrix[double] row_vector_ordered) 
+orderFeature = function (matrix[double] row_vector)
+  return (matrix[double] row_vector_ordered)
 {
     row_vector_ordered = order(target=row_vector, by=1, decreasing=FALSE, 
index.return=FALSE);
 }
diff --git a/src/main/java/org/apache/sysds/hops/IndexingOp.java 
b/src/main/java/org/apache/sysds/hops/IndexingOp.java
index 4155dfd9f5..aa5a16c00f 100644
--- a/src/main/java/org/apache/sysds/hops/IndexingOp.java
+++ b/src/main/java/org/apache/sysds/hops/IndexingOp.java
@@ -433,7 +433,7 @@ public class IndexingOp extends Hop
                Hop input3 = getInput().get(2);
                return HopRewriteUtils.isLiteralOfValue(input2, 1)
                        && ((HopRewriteUtils.isUnary(input3, OpOp1.NROW) && 
input3.getInput().get(0) == input1 )
-                       || HopRewriteUtils.isLiteralOfValue(input3, 
input1.getDim1()));
+                       || (HopRewriteUtils.isLiteralOfValue(input3, 
input1.getDim1()) && input1.getDim1()>0));
        }
        
        public boolean isAllCols() {
@@ -442,7 +442,7 @@ public class IndexingOp extends Hop
                Hop input5 = getInput().get(4);
                return HopRewriteUtils.isLiteralOfValue(input4, 1)
                        && ((HopRewriteUtils.isUnary(input5, OpOp1.NCOL) && 
input5.getInput().get(0) == input1 )
-                       || HopRewriteUtils.isLiteralOfValue(input5, 
input1.getDim2()));
+                       || (HopRewriteUtils.isLiteralOfValue(input5, 
input1.getDim2())&& input1.getDim1()>0));
        }
        
        @Override
diff --git 
a/src/main/java/org/apache/sysds/runtime/controlprogram/BasicProgramBlock.java 
b/src/main/java/org/apache/sysds/runtime/controlprogram/BasicProgramBlock.java
index 1686055b7a..c28a82a8c4 100644
--- 
a/src/main/java/org/apache/sysds/runtime/controlprogram/BasicProgramBlock.java
+++ 
b/src/main/java/org/apache/sysds/runtime/controlprogram/BasicProgramBlock.java
@@ -103,7 +103,10 @@ public class BasicProgramBlock extends ProgramBlock
                }
                catch(Exception ex)
                {
-                       throw new DMLRuntimeException("Unable to recompile 
program block.", ex);
+                       if( _sb != null )
+                               throw new DMLRuntimeException("Unable to 
recompile program block (lines "+_sb.getBeginLine()+"-"+_sb.getEndLine()+").", 
ex);
+                       else
+                               throw new DMLRuntimeException("Unable to 
recompile program block.", ex);
                }
                
                //statement-block-level, lineage-based reuse
diff --git 
a/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java 
b/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java
index c075a9fd29..4c0ee0fc6c 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java
@@ -5134,6 +5134,10 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock<MatrixBlock>,
                        return ret;
                if( !containsValue(pattern) )
                        return this; //avoid allocation + copy
+               if( isEmpty() && pattern==0 ) {
+                       ret.reset(rlen, clen, replacement);
+                       return ret;
+               }
                
                boolean NaNpattern = Double.isNaN(pattern);
                if( sparse ) //SPARSE
diff --git 
a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinXgBoostTest_classification.java
 
b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinXgBoostTest_classification.java
index ddda791e67..60cb86e5af 100644
--- 
a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinXgBoostTest_classification.java
+++ 
b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinXgBoostTest_classification.java
@@ -52,8 +52,9 @@ public class BuiltinXgBoostTest_classification extends 
AutomatedTestBase {
 
                        String HOME = SCRIPT_DIR + TEST_DIR;
                        fullDMLScriptName = HOME + TEST_NAME + ".dml";
-                       programArgs = new String[]{"-args", input("X"), 
input("y"), input("R"), input("sml_type"),
-                                       input("num_trees"), output("M")};
+                       programArgs = new String[]{"-explain", "-args",
+                               input("X"), input("y"), input("R"), 
input("sml_type"),
+                               input("num_trees"), output("M")};
 
                        double[][] y = {
                                        {1.0},

Reply via email to