dkerschbaumer commented on a change in pull request #1334:
URL: https://github.com/apache/systemds/pull/1334#discussion_r671125283
##########
File path: scripts/builtin/xgboost.dml
##########
@@ -0,0 +1,780 @@

# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME           TYPE            DEFAULT  MEANING
# ---------------------------------------------------------------------------------------------
# X              Matrix[Double]  ---      Feature matrix X; note that X needs to be both recoded and dummy coded
# Y              Matrix[Double]  ---      Label matrix Y; note that Y needs to be both recoded and dummy coded
# R              Matrix[Double]  1, 1xn   Matrix R; 1xn vector which for each feature in X contains its type:
#                                         - R[,j]: 1 (scalar feature)
#                                         - R[,j]: 2 (categorical feature)
#                                         e.g. R = [1, 2] means feature 1 is scalar and feature 2 is categorical.
#                                         If R is not provided, by default all features are assumed to be scalar (1)
# sml_type       Integer         1        Supervised machine learning type: 1 = Regression (default), 2 = Classification
# num_trees      Integer         7        Number of trees to be created in the xgboost model
# learning_rate  Double          0.3      Alias: eta. After each boosting step the learning rate controls the weight of the new predictions
# max_depth      Integer         6        Maximum depth of a tree. Increasing this value makes the model more complex and more likely to overfit
# lambda         Double          0.0      L2 regularization term on weights. Increasing this value makes the model more conservative and reduces the number of leaves of a tree
# ---------------------------------------------------------------------------------------------

# ---------------------------------------------------------------------------------------------
# OUTPUT:
# Matrix M where each column corresponds to a node in the learned tree (the first column is the init prediction)
# and each row contains the following information:
# M[1,j]: id of node j (in a complete binary tree)
# M[2,j]: tree id to which node j belongs
# M[3,j]: Offset (no. of columns) to the left child of j if j is an internal node, otherwise 0
# M[4,j]: Feature index of the feature (scalar feature id if the feature is scalar, or categorical feature id
#         if the feature is categorical) that node j looks at if j is an internal node, otherwise 0
# M[5,j]: Type of the feature that node j looks at if j is an internal node: leaf = 0, scalar = 1, categorical = 2
# M[6:,j]: If j is an internal node: the threshold the example's feature value is compared to is stored at M[6,j]
#          if the feature chosen for j is scalar; otherwise, if the feature chosen for j is categorical,
#          rows 6,7,... depict the value subset chosen for j.
#          If j is a leaf node: 1 if j is impure and the number of samples at j > threshold, otherwise 0
# ---------------------------------------------------------------------------------------------

m_xgboost = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] R = matrix(1,rows=1,cols=ncol(X)),
    Integer sml_type = 1, Integer num_trees = 7, Double learning_rate = 0.3, Integer max_depth = 6, Double lambda = 0.0)
  return (Matrix[Double] M)
{
  # validate the input
  assert(nrow(X) == nrow(y))
  assert(ncol(y) == 1)
  assert(nrow(R) == 1)

  M = matrix(0,rows=6,cols=0)
  # set the init prediction as the first column of M
  init_prediction_matrix = matrix("0 0 0 0 0 0", rows=nrow(M), cols=1)
  init_prediction_matrix[6,1] = median(y)
  M = cbind(M, init_prediction_matrix)

  current_prediction = matrix(median(y), rows=nrow(y), cols=1)

  tree_id = 1
  while(tree_id <= num_trees)
  {
    curr_M = buildOneTree(X, y, R, sml_type, max_depth, current_prediction, tree_id, lambda)

    # the current prediction already accounts for all previous trees,
    # so only the current tree is added to calculate the new predictions
    current_prediction = calculateNewPredictions(X, sml_type, current_prediction, learning_rate, curr_M)

    tree_id = tree_id + 1
    M = cbind(M, curr_M) # concat the new tree to the existing ones (forest-ing)
  }
}
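#-----------------------------------------------------------------------------------------------------------------------
# A minimal usage sketch (kept in comments, since builtin scripts should only define functions);
# the toy data below is illustrative only and assumes one scalar and one categorical feature.
# As a builtin, m_xgboost is typically invoked via its name without the m_ prefix:
#
#   X = matrix("4.5 1 3.0 0 2.5 1 5.0 0", rows=4, cols=2)
#   y = matrix("1.0 2.0 3.0 4.0", rows=4, cols=1)
#   R = matrix("1 2", rows=1, cols=2)  # feature 1: scalar, feature 2: categorical
#   M = xgboost(X=X, y=y, R=R, sml_type=1, num_trees=2)
#   print(toString(M))  # one 6-row column per node; the first column holds the init prediction
#-----------------------------------------------------------------------------------------------------------------------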
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: X: nxm matrix, original input matrix
# INPUT: current_prediction: nx1 vector of the current predictions for the target values y (first run: init prediction)
# INPUT: learning_rate: set by the user
# INPUT: curr_M: the current M matrix with the current tree
# OUTPUT: new_prediction: nx1 vector of new predictions for the target values y
calculateNewPredictions = function(Matrix[Double] X, Integer sml_type, Matrix[Double] current_prediction,
    Double learning_rate, Matrix[Double] curr_M)
  return (Matrix[Double] new_prediction)
{
  assert(ncol(current_prediction) == 1)
  assert(nrow(current_prediction) == nrow(X)) # each entry should have its own initial prediction
  new_prediction = matrix(0, rows=nrow(current_prediction), cols=1)
  start_node_current_tree = curr_M[,1]

  if(sml_type == 1) # Regression
  {
    for(entry in 1:nrow(X)) # go through each entry in X and calculate the new prediction
    {
      output_values = matrix(0, rows=1, cols=0)
      output_value = getOutputValueForEntry(X[entry,], curr_M, start_node_current_tree)
      output_values = cbind(output_values, as.matrix(output_value))
      new_prediction[entry,] = current_prediction[entry,] + learning_rate * sum(output_values)
    }
  }
  else # Classification
  {
    assert(sml_type == 2)
    for(entry in 1:nrow(X)) # go through each entry in X and calculate the new prediction
    {
      output_values = matrix(0, rows=1, cols=0)
      output_value = getOutputValueForEntry(X[entry,], curr_M, start_node_current_tree)
      output_values = cbind(output_values, as.matrix(output_value))
      odds = as.scalar(current_prediction[entry,])
      if((1 - odds) == 0)
        log_odds = 0
      else
        log_odds = log(odds / (1 - odds))
      x = log_odds + learning_rate * sum(output_values)
      new_prediction[entry,] = exp(x) / (1 + exp(x)) # sigmoid: back from log-odds to a probability
    }
  }
}
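# For intuition, a worked example of the classification update above (illustrative numbers):
# with a current probability odds = 0.5, a leaf output value of 0.7 and learning_rate = 0.3:
#   log_odds = log(0.5 / (1 - 0.5)) = 0
#   x        = 0 + 0.3 * 0.7 = 0.21
#   new_prediction = exp(0.21) / (1 + exp(0.21)) ~ 0.552
# i.e. the tree nudges the predicted probability from 0.5 towards the positive class.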
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: row_vector: 1xm vector, one sample with m features
# INPUT: curr_M: the current M matrix with the current tree
# INPUT: current_node: the start node of the current tree
# OUTPUT: output_value: the prediction for the current sample at the current tree
getOutputValueForEntry = function(Matrix[Double] row_vector, Matrix[Double] curr_M, Matrix[Double] current_node)
  return (Double output_value)
{
  assert(nrow(row_vector) == 1)
  assert(as.scalar(current_node[1,]) == 1)

  cur_node_index = 1 # we can't take the node id because it differs from the column index

  # iterate over one tree and find the output value
  while(as.scalar(current_node[5,]) != 0.0) # until leaf
  {
    used_feature = as.scalar(current_node[4,])
    feature_is_scalar = as.scalar(current_node[5,]) == 1
    if(feature_is_scalar) # SCALAR values
    {
      if(as.scalar(row_vector[,used_feature]) < as.scalar(current_node[6,])) # go left
      {
        cur_node_index = cur_node_index + as.scalar(current_node[3,])
        current_node = curr_M[,cur_node_index]
      }
      else # go right
      {
        cur_node_index = cur_node_index + as.scalar(current_node[3,]) + 1
        current_node = curr_M[,cur_node_index]
      }
    }
    else # CATEGORICAL values
    {
      assert(as.scalar(current_node[5,]) == 2)
      if(as.scalar(row_vector[,used_feature]) == 1) # go left
      {
        cur_node_index = cur_node_index + as.scalar(current_node[3,])
        current_node = curr_M[,cur_node_index]
      }
      else # go right
      {
        assert(as.scalar(row_vector[,used_feature]) == 0)
        cur_node_index = cur_node_index + as.scalar(current_node[3,]) + 1
        current_node = curr_M[,cur_node_index]
      }
    }
  }
  output_value = as.scalar(current_node[6,])
}
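# Traversal example for a hypothetical 3-node tree stored column-wise in curr_M:
#   column 1: root with offset M[3,1] = 1, so its left child sits at column 1 + 1 = 2
#             and its right child at column 1 + 1 + 1 = 3
#   columns 2 and 3: leaves with M[5,j] = 0, so the loop stops there and returns M[6,j]
# Node ids (row 1) follow the complete-binary-tree numbering 1,2,3,..., while the
# traversal itself works purely on column offsets (row 3).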
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: X: nxm matrix, original input matrix
# INPUT: y: nx1 vector of the target values
# INPUT: R: the matrix R which for each feature j in X contains its type:
#        - R[,j]: 1 (scalar feature)
#        - R[,j]: 2 (categorical feature)
# INPUT: M: the output matrix
# INPUT: prediction: nx1 vector, the current predictions for the target values y
# INPUT: tree_id: the current tree id, starting at 1
# OUTPUT: M: the current M matrix of this tree
buildOneTree = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] R, Integer sml_type, Integer max_depth,
    Matrix[Double] prediction, Double tree_id, Double lambda)
  return (Matrix[Double] M)
{
  M = matrix(0,rows=6,cols=0)
  node_queue = matrix(1, rows=1, cols=1) # add the first node
  curr_rows_queue = matrix(1, rows=nrow(X), cols=1)
  node_queue_len = 1
  depth = 0.0
  level = 0

  # process the nodes breadth-first until the queue is empty
  while (node_queue_len > 0) {
    has_child = FALSE
    [node_queue, node] = dataQueuePop(node_queue)
    [curr_rows_queue, curr_rows_vector] = dataQueuePop(curr_rows_queue)

    level = log(as.scalar(node), 2) + 1
    available_rows = calcAvailableRows(curr_rows_vector) # check if leaf

    if(available_rows == 0)
    {
      residual_matrix = matrix(0, rows=0, cols=0)
    }
    else
    {
      [curr_X, curr_y, curr_X_full, curr_prediction] = updateMatrices(X, y, prediction, curr_rows_vector)
      residual_matrix = calculateResiduals(curr_y, curr_prediction) # residuals are also needed at leaves for the output value
    }

    best_feature_index = 0.0
    done = FALSE
    if(sml_type == 2)
    {
      count = sum(curr_y)
      if(count == 0)
        done = TRUE
      if(count == nrow(curr_y))
        done = TRUE
    }

    if(available_rows > 1 & max_depth > level & done == FALSE) # leaf check and max depth check
    {
      best_feature_index = findBestFeature(X=curr_X, y=curr_y, sml_type=sml_type)
      type = getTypeOfFeature(R, best_feature_index)

      if(type == 1.0) # SCALAR
      {
        if(sml_type == 1) # Regression
        {
          similarity_score = calculateSimilarityScore(residual_matrix, lambda)
        }
        else # Classification
        {
          assert(sml_type == 2)
          similarity_score = calculateSimilarityScoreClassification(residual_matrix, curr_prediction, lambda)
        }

        [best_split_threshold, best_gain] = findBestSplit(sml_type, curr_X[,best_feature_index], similarity_score, curr_prediction, lambda)
        has_child = best_gain > 0 # if the gain is <= 0, the split is worse than the current node
      }
      else # CATEGORICAL
      {
        assert(type == 2.0)
        has_child = TRUE
      }
    }

    if (has_child)
    {
      [left, right] = calculateChildNodeIDs(node)
      node_queue = dataQueuePush(left, right, node_queue)

      if (type == 1.0) # SCALAR
      {
        [left_row_vec, right_row_vec] = splitMatrixByValueLoop(curr_X_full[,best_feature_index], X[,best_feature_index], best_split_threshold)
        curr_rows_queue = dataQueuePush(left_row_vec, right_row_vec, curr_rows_queue)
        offset = ncol(node_queue) - 1
        M = addOutputRow(M, node, tree_id, R, offset, best_feature_index, best_split_threshold, 0.0)
      }
      else # CATEGORICAL
      {
        assert(type == 2.0)
        [left_row_vec, right_row_vec] = splitMatrixByCategory(curr_X_full[,best_feature_index], X[,best_feature_index])
        curr_rows_queue = dataQueuePush(left_row_vec, right_row_vec, curr_rows_queue)
        offset = ncol(node_queue) - 1
        M = addOutputRow(M, node, tree_id, R, offset, best_feature_index, 0.0, 0.0)
      }
    }
    else # has no child => must be a leaf
    {
      if(sml_type == 1) # Regression
      {
        output_value = calculateOutputValue(residual_matrix, lambda)
      }
      else # Classification
      {
        assert(sml_type == 2)
        output_value = calculateOutputValueClassification(residual_matrix, curr_prediction, lambda)
      }
      # offset, best_feature_idx and threshold are all 0 for a leaf
      M = addOutputRow(M, node, tree_id, R, 0.0, 0.0, 0.0, output_value)
    }
    node_queue_len = ncol(node_queue)
  }
}

#-----------------------------------------------------------------------------------------------------------------------
# INPUT: node: a 1x1 matrix holding the current node id
# OUTPUT: left: the new left child node id
# OUTPUT: right: the new right child node id
calculateChildNodeIDs = function(Matrix[Double] node) return(Matrix[Double] left, Matrix[Double] right) {
  left = node * 2.0
  right = node * 2.0 + 1.0
}

#-----------------------------------------------------------------------------------------------------------------------
# INPUT: M: the output matrix
# INPUT: node: the current node
# INPUT: R: the matrix R which for each feature j in X contains its type:
#        - R[,j]: 1 (scalar feature)
#        - R[,j]: 2 (categorical feature)
# INPUT: offset: offset to the left child
# INPUT: used_col: which feature got used
# INPUT: threshold: the split value at the node (0.0 for leaf and categorical nodes)
# INPUT: output_value: the output value if the node is a leaf, otherwise 0.0
# OUTPUT: new_M: the output matrix with the new node column appended
addOutputRow = function(
    Matrix[Double] M,
    Matrix[Double] node,
    Double tree_id,
    Matrix[Double] R,
    Double offset,
    Double used_col,
    Double threshold,
    Double output_value
) return (Matrix[Double] new_M) {

  current_node = matrix(0, rows=6, cols=1)
  current_node[1,1] = node[1,1] # node id
  current_node[2,1] = tree_id
  current_node[3,1] = offset # offset to the left child
  current_node[4,1] = used_col # feature used

  if(used_col == 0.0) { # a single value or max depth reached => leaf
    current_node[5,1] = 0 # 0 if leaf node
    assert(threshold == 0)
    current_node[6,1] = output_value
  }
  else if (as.scalar(R[1, used_col]) == 2.0) {
    current_node[5,1] = 2.0 # categorical
    assert(threshold == 0.0) # should be 0 for categorical nodes
    current_node[6,1] = threshold
  }
  else {
    current_node[5,1] = 1.0 # 1 if it is scalar
    current_node[6,1] = threshold # split value (0 if leaf or categorical)
  }

  if (ncol(M) == 0 & nrow(M) == 0) { # first node
    new_M = current_node
  } else {
    new_M = cbind(M, current_node)
  }
}
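# Example of one column appended by addOutputRow (hypothetical values):
#   row 1: 2    -> node id 2 (left child of the root)
#   row 2: 1    -> belongs to tree 1
#   row 3: 3    -> left child sits 3 columns to the right
#   row 4: 4    -> splits on feature 4
#   row 5: 1    -> feature 4 is scalar
#   row 6: 2.75 -> threshold: samples with X[,4] < 2.75 go left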
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: R: the 1xn matrix R which for each feature j in X contains its type:
#        - R[,j]: 1 (scalar feature)
#        - R[,j]: 2 (categorical feature)
# INPUT: col: the column whose type is requested
# OUTPUT: type: 1.0 (scalar) or 2.0 (categorical)
#-----------------------------------------------------------------------------------------------------------------------
getTypeOfFeature = function(Matrix[Double] R, Double col) return(Double type) {
  type = as.scalar(R[1, col])
}

#-----------------------------------------------------------------------------------------------------------------------
# INPUT: X: a nxm matrix (X matrix from the input)
# INPUT: y: a nx1 matrix (y matrix from the input)
# INPUT: prediction: a nx1 matrix with the current predictions for y (median at the first tree)
# INPUT: vector: a nx1 matrix (contains 1 for relevant and 0 for non-relevant rows)
# OUTPUT: new_X: the new X containing only the relevant rows
# OUTPUT: new_y: the new y containing only the relevant rows
# OUTPUT: new_X_full: the new X containing the relevant rows and NaN rows for the non-relevant ones
# OUTPUT: new_prediction: the current prediction containing only the relevant rows
updateMatrices = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] prediction, Matrix[Double] vector)
  return(Matrix[Double] new_X, Matrix[Double] new_y, Matrix[Double] new_X_full, Matrix[Double] new_prediction)
{
  assert(ncol(vector) == 1)
  assert(nrow(vector) == nrow(X))

  new_X = matrix(0, rows=0, cols=0)
  new_y = matrix(0, rows=0, cols=0)
  zero_vec = matrix(0, rows=nrow(vector), cols=ncol(X))
  nan_vec = fillMatrixWithNaN(zero_vec) # could not directly set the cells to NaN
  new_X_full = matrix(0, rows=0, cols=0)
  new_prediction = matrix(0, rows=0, cols=0)
  for(i in 1:nrow(vector)) {
    if(as.scalar(vector[i,]) == 1.0) # this row is relevant for this node
    {
      if (ncol(new_X) == 0 & ncol(new_X_full) == 0) { # first iteration
        new_X = X[i,]
        new_y = y[i,]
        new_X_full = X[i,]
        new_prediction = prediction[i,]
      } else if (ncol(new_X) == 0 & ncol(new_X_full) != 0) { # a non-relevant row was already found
        new_X = X[i,]
        new_y = y[i,]
        new_X_full = rbind(new_X_full, X[i,])
        new_prediction = prediction[i,]
      } else { # adding all others
        new_X = rbind(new_X, X[i,])
        new_y = rbind(new_y, y[i,])
        new_X_full = rbind(new_X_full, X[i,])
        new_prediction = rbind(new_prediction, prediction[i,])
      }
    } else { # this row is NOT relevant for this node
      assert(as.scalar(vector[i,]) == 0.0 | as.scalar(vector[i,]) == 'NaN')
      if (ncol(new_X_full) == 0) { # first row
        new_X_full = nan_vec[i,]
      } else {
        new_X_full = rbind(new_X_full, nan_vec[i,]) # add a new row filled with NaNs
      }
    }
  }
}
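# Example for updateMatrices (illustrative): with vector = (1, 0, 1)^T,
# new_X, new_y and new_prediction keep rows 1 and 3 only, while new_X_full keeps
# all 3 rows but replaces row 2 with NaNs, so that later splits can still address
# samples by their original row positions in X.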
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: vector: a nx1 matrix (marks the datapoints which are of interest)
# OUTPUT: available_elements: number of available datapoints
calcAvailableRows = function(Matrix[Double] vector) return(Double available_elements) {
  assert(ncol(vector) == 1)
  len = nrow(vector)
  available_elements = 0.0
  for (index in 1:len) {
    element = as.scalar(vector[index,])
    if(element > 0.0) {
      available_elements = available_elements + 1.0
    }
  }
}

#-----------------------------------------------------------------------------------------------------------------------
# INPUT: queue: a nxm matrix (queue containing one column vector per entry)
# OUTPUT: new_queue: the new queue after the first vector got popped
# OUTPUT: node_vec: the popped vector (nx1) from the queue
dataQueuePop = function(Matrix[Double] queue) return (Matrix[Double] new_queue, Matrix[Double] node_vec) {
  node_vec = matrix(queue[,1], rows=1, cols=nrow(queue)) # reshape to force the creation of a new object
  node_vec = matrix(node_vec, rows=nrow(queue), cols=1) # reshape back into a column vector
  len = ncol(queue)
  if (len < 2) {
    new_queue = matrix(0,0,0)
  } else {
    new_queue = matrix(queue[,2:ncol(queue)], rows=nrow(queue), cols=ncol(queue)-1)
  }
}

#-----------------------------------------------------------------------------------------------------------------------
# INPUT: left: a nx1 matrix
# INPUT: right: a nx1 matrix
# INPUT: queue: a nxm matrix (queue containing one column vector per entry)
# OUTPUT: new_queue: the new nxm queue
dataQueuePush = function(Matrix[Double] left, Matrix[Double] right, Matrix[Double] queue) return (Matrix[Double] new_queue) {
  len = ncol(queue)
  if(len <= 0) {
    new_queue = cbind(left, right)
  } else {
    new_queue = cbind(queue, left, right)
  }
}

#-----------------------------------------------------------------------------------------------------------------------
# INPUT: X: a nxm matrix (the current samples with all features we observe)
# INPUT: y: a nx1 matrix (the current y of all observed samples)
# OUTPUT: lowest_residuals_index: the index of the feature with the lowest residuals
findBestFeature = function(Matrix[Double] X, Matrix[Double] y, Integer sml_type) return (Integer lowest_residuals_index) {

  lowest_residuals = 0
  lowest_residuals_index = 1

  for(i in 1:ncol(X)) {
    current_feature = X[,i]

    if(sml_type == 1) # Regression
    {
      weights = glm(X=current_feature, Y=y, dfam=1, verbose=FALSE)
    }
    else # Classification
    {
      assert(sml_type == 2)
      weights = glm(X=current_feature, Y=y, dfam=2, verbose=FALSE)
    }
    y_residual = y - current_feature %*% weights
    res = sum(y_residual ^ 2) # least-squares sum

    if(i == 1 | res < lowest_residuals)
    {
      lowest_residuals = res
      lowest_residuals_index = i
    }
  }
}
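# Sketch of the selection criterion above (illustrative numbers, 2 features):
#   feature 1: res = sum((y - X[,1] %*% w1)^2) = 10.4
#   feature 2: res = sum((y - X[,2] %*% w2)^2) =  3.7
# -> lowest_residuals_index = 2, i.e. this node will split on feature 2.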
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: one_featureX: a nx1 matrix (one feature with all values)
# OUTPUT: best_split: the best split threshold (the highest gain indicates the best split of the data)
# OUTPUT: best_gain: the gain achieved by the best split
findBestSplit = function(Integer sml_type, Matrix[Double] one_featureX, Double sim_score_parent, Matrix[Double] predictions, Double lambda)
  return (Double best_split, Double best_gain)
{
  assert(ncol(one_featureX) == 1)
  best_split = 0
  best_gain = 0
  best_sim_score_left = 0
  best_sim_score_right = 0

  ordered_X = orderFeature(one_featureX) # order the entries by feature value

  # iterate over all averages between 2 adjacent points
  for(i in 1:(nrow(ordered_X)-1)) {
    current_split = average(ordered_X[i,], ordered_X[i+1,])
    [left, right] = splitMatrixByValue(one_featureX, current_split)

    if(sml_type == 1) # Regression
    {
      sim_score_left = calculateSimilarityScore(left, lambda)
      sim_score_right = calculateSimilarityScore(right, lambda)
    }
    else # Classification
    {
      assert(sml_type == 2)
      sim_score_left = calculateSimilarityScoreClassification(left, predictions, lambda)
      sim_score_right = calculateSimilarityScoreClassification(right, predictions, lambda)
    }
    current_gain = sim_score_left + sim_score_right - sim_score_parent

    if(current_gain > best_gain)
    {
      best_gain = current_gain
      best_split = current_split
      best_sim_score_left = sim_score_left
      best_sim_score_right = sim_score_right
    }
  }
}
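# A worked example of the gain computation above (regression, lambda = 0; the
# vector (-2, -1, 3) is illustrative only):
#   parent score = (-2 - 1 + 3)^2 / 3 = 0
#   candidate split into (-2, -1) and (3):
#     left  score = (-3)^2 / 2 = 4.5
#     right score = ( 3)^2 / 1 = 9.0
#   current_gain = 4.5 + 9.0 - 0 = 13.5 > 0, so this candidate beats the parent.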
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: X: a nx1 matrix (one feature with all values)
#        value: the border to separate the values
# OUTPUT: left: a matrix with all values smaller than "value"
#         right: a matrix with all values greater than or equal to "value"
splitMatrixByValue = function(Matrix[Double] X, Double value) return (Matrix[Double] left, Matrix[Double] right) {
  assert(ncol(X) == 1)
  left = matrix(0, rows=0, cols=1)
  right = matrix(0, rows=0, cols=1)

  for(i in 1:nrow(X)) {
    if(as.scalar(X[i,]) < value)
    {
      left = rbind(left, X[i,])
    }
    else
    {
      right = rbind(right, X[i,])
    }
  }
}

#-----------------------------------------------------------------------------------------------------------------------
# INPUT: curr_X: a nx1 matrix (one feature with the current values)
#        X: a nx1 matrix, the feature column of the X matrix with all values
#        value: the border to separate the values
# OUTPUT: left: a nx1 matrix with 1.0 if the value is smaller than "value", otherwise 0.0
#         right: a nx1 matrix with 1.0 if the value is greater than or equal to "value", otherwise 0.0
splitMatrixByValueLoop = function(Matrix[Double] curr_X, Matrix[Double] X, Double value) return (Matrix[Double] left, Matrix[Double] right) {
  left = matrix(0, rows=0, cols=1)
  right = matrix(0, rows=0, cols=1)
  one_vec = matrix(1, rows=1, cols=1)
  zero_vec = matrix(0, rows=1, cols=1)

  for(i in 1:nrow(X)) {
    if (as.scalar(X[i,]) == as.scalar(curr_X[i,])) # if same row in curr_X and X
    {
      if (as.scalar(X[i,]) < value)
      {
        left = rbind(left, one_vec[1,]) # add 1 to the left and 0 to the right vector
        right = rbind(right, zero_vec[1,])
      }
      else
      {
        right = rbind(right, one_vec[1,]) # add 1 to the right and 0 to the left vector
        left = rbind(left, zero_vec[1,])
      }
    }
    else
    {
      left = rbind(left, zero_vec[1,]) # add 0 to both vectors
      right = rbind(right, zero_vec[1,])
    }
  }
}

#-----------------------------------------------------------------------------------------------------------------------
# INPUT: curr_X: a nx1 matrix (one feature with the current values)
#        X: a nx1 matrix, the feature column of the X matrix with all values
# OUTPUT: left: a nx1 matrix with 1.0 if the category is true, otherwise 0.0
#         right: a nx1 matrix with 1.0 if the category is false, otherwise 0.0
splitMatrixByCategory = function(Matrix[Double] curr_X, Matrix[Double] X) return (Matrix[Double] left, Matrix[Double] right) {
  assert(ncol(curr_X) == 1)
  assert(ncol(X) == 1)

  left = matrix(0, rows=0, cols=1)
  right = matrix(0, rows=0, cols=1)

  for(i in 1:nrow(curr_X))
  {
    if (as.scalar(curr_X[i,]) == 1) # categorical true
    {
      left = rbind(left, curr_X[i,])
      right = rbind(right, matrix(0, rows=1, cols=1))
    }
    else if(as.scalar(curr_X[i,]) == 0) # categorical false
    {
      left = rbind(left, curr_X[i,])
      right = rbind(right, matrix(1, rows=1, cols=1))
    }
    else # keep the NaN if the row is not used for the current samples
    {
      assert(as.scalar(curr_X[i,]) == 'NaN')
      left = rbind(left, curr_X[i,])
      right = rbind(right, curr_X[i,])
    }
  }
}

#-----------------------------------------------------------------------------------------------------------------------
# INPUT: row_vector: a nx1 matrix (one feature with all residuals)
# OUTPUT: similarity_score: the similarity score of the residuals
calculateSimilarityScore = function (matrix[Double] row_vector, Double lambda) return (Double similarity_score) {
  similarity_score = (sum(row_vector)^2) / (nrow(row_vector) + lambda);
}

#-----------------------------------------------------------------------------------------------------------------------
# INPUT: row_vector: a nx1 matrix (one feature with all residuals)
# OUTPUT: similarity_score: the similarity score of the residuals
calculateSimilarityScoreClassification = function (matrix[Double] row_vector, matrix[Double] predictions, Double lambda) return (Double similarity_score) {
  numerator = (sum(row_vector)^2)
  d = predictions * (1 - predictions)
  denominator = sum(d) + lambda
  similarity_score = numerator / denominator;
}

#-----------------------------------------------------------------------------------------------------------------------
# INPUT: residuals_vector: a nx1 matrix (one feature with all residuals)
# OUTPUT: output_value: the output value computed from the residuals
calculateOutputValue = function (matrix[Double] residuals_vector, Double lambda) return (Double output_value) {
  output_value = (sum(residuals_vector)) / (nrow(residuals_vector) + lambda);
  if(output_value == 'NaN') # just in case we have a node with no sample inside
    output_value = 0.0
}

#-----------------------------------------------------------------------------------------------------------------------
# INPUT: residuals_vector: a nx1 matrix (one feature with all residuals)
# OUTPUT: output_value: the output value computed from the residuals
calculateOutputValueClassification = function (matrix[Double] residuals_vector, matrix[Double] predictions, Double lambda) return (Double output_value) {
  numerator = (sum(residuals_vector))
  d = predictions * (1 - predictions)
  denominator = sum(d) + lambda
  if(denominator == 0)
    output_value = 0
  else
    output_value = numerator / denominator
}

#-----------------------------------------------------------------------------------------------------------------------
# INPUT: row_vector: a nx1 matrix (one feature with all rows)
#        prediction: a nx1 matrix, the current prediction value of each row
# OUTPUT: residuals_row: the residuals (row_vector - prediction) as a nx1 matrix
calculateResiduals = function (matrix[Double] row_vector, matrix[Double] prediction) return (matrix[Double] residuals_row) {
  assert(ncol(row_vector) == 1)
  residuals_row = row_vector - prediction;
}

#-----------------------------------------------------------------------------------------------------------------------
# INPUT: row_vector: a nx1 matrix with unordered values
# OUTPUT: row_vector_ordered: a nx1 matrix with the values in ascending order
orderFeature = function (matrix[double] row_vector) return (matrix[double] row_vector_ordered) {
  row_vector_ordered = order(target=row_vector, by=1, decreasing=FALSE, index.return=FALSE);
}
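# A worked example for calculateSimilarityScoreClassification above (lambda = 0,
# illustrative residuals (0.5, -0.3) and predictions (0.5, 0.7)):
#   numerator   = (0.5 - 0.3)^2          = 0.04
#   denominator = 0.5*0.5 + 0.7*0.3 + 0  = 0.46
#   similarity_score = 0.04 / 0.46       ~ 0.087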
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: m: a nxm matrix whose cells should all be set to NaN
# OUTPUT: new_m: a matrix with the same dimensions with all cells set to NaN
fillMatrixWithNaN = function(Matrix[Double] m) return (Matrix[Double] new_m) {
  new_m = matrix(0, rows=nrow(m), cols=ncol(m))
  for(i in 1:nrow(m))
  {
    for(j in 1:ncol(m))
      new_m[i,j] = 'NaN'
  }
}

#-----------------------------------------------------------------------------------------------------------------------
# INPUT: x: a 1x1 matrix
# INPUT: y: a 1x1 matrix
# OUTPUT: avg: the average of the two values
average = function(Matrix[Double] x, Matrix[Double] y) return (Double avg) {
  assert(ncol(x) == 1 & nrow(x) == 1)
  assert(ncol(y) == 1 & nrow(y) == 1)
  avg = (as.scalar(x) + as.scalar(y)) / 2
}

Review comment:
   done

-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: dev-unsubscr...@systemds.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org