[systemds] branch main updated: [MINOR] Python autogenerate new Builtins

baunsgaard Wed, 10 May 2023 01:42:49 -0700

This is an automated email from the ASF dual-hosted git repository.

baunsgaard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git



The following commit(s) were added to refs/heads/main by this push:
     new 2d4114e96e [MINOR] Python autogenerate new Builtins
2d4114e96e is described below

commit 2d4114e96ef6858b0dff0059e63644bfca8189ad
Author: baunsgaard <[email protected]>
AuthorDate: Wed May 10 10:42:17 2023 +0200

    [MINOR] Python autogenerate new Builtins
---
 .../python/systemds/operator/algorithm/__init__.py |  6 ++
 .../algorithm/builtin/{scale.py => auc.py}         | 38 +++------
 .../operator/algorithm/builtin/decisionTree.py     | 63 +++++++-------
 .../algorithm/builtin/decisionTreePredict.py       | 46 +++++-----
 .../builtin/{scale.py => lmPredictStats.py}        | 39 ++++-----
 .../systemds/operator/algorithm/builtin/pca.py     | 39 +++++++--
 .../operator/algorithm/builtin/randomForest.py     | 97 ++++++++++------------
 .../builtin/{scale.py => randomForestPredict.py}   | 43 +++++-----
 .../systemds/operator/algorithm/builtin/scale.py   | 17 ++--
 .../operator/algorithm/builtin/slicefinder.py      |  4 +-
 10 files changed, 194 insertions(+), 198 deletions(-)

diff --git a/src/main/python/systemds/operator/algorithm/__init__.py 
b/src/main/python/systemds/operator/algorithm/__init__.py
index 2dd6578833..769ca66229 100644
--- a/src/main/python/systemds/operator/algorithm/__init__.py
+++ b/src/main/python/systemds/operator/algorithm/__init__.py
@@ -31,6 +31,7 @@ from .builtin.alsPredict import alsPredict
 from .builtin.alsTopkPredict import alsTopkPredict 
 from .builtin.apply_pipeline import apply_pipeline 
 from .builtin.arima import arima 
+from .builtin.auc import auc 
 from .builtin.autoencoder_2layer import autoencoder_2layer 
 from .builtin.bandit import bandit 
 from .builtin.bivar import bivar 
@@ -110,6 +111,7 @@ from .builtin.lm import lm
 from .builtin.lmCG import lmCG 
 from .builtin.lmDS import lmDS 
 from .builtin.lmPredict import lmPredict 
+from .builtin.lmPredictStats import lmPredictStats 
 from .builtin.logSumExp import logSumExp 
 from .builtin.matrixProfile import matrixProfile 
 from .builtin.mcc import mcc 
@@ -137,6 +139,7 @@ from .builtin.pcaTransform import pcaTransform
 from .builtin.pnmf import pnmf 
 from .builtin.ppca import ppca 
 from .builtin.randomForest import randomForest 
+from .builtin.randomForestPredict import randomForestPredict 
 from .builtin.scale import scale 
 from .builtin.scaleApply import scaleApply 
 from .builtin.scaleMinMax import scaleMinMax 
@@ -182,6 +185,7 @@ __all__ = ['WoE',
  'alsTopkPredict',
  'apply_pipeline',
  'arima',
+ 'auc',
  'autoencoder_2layer',
  'bandit',
  'bivar',
@@ -261,6 +265,7 @@ __all__ = ['WoE',
  'lmCG',
  'lmDS',
  'lmPredict',
+ 'lmPredictStats',
  'logSumExp',
  'matrixProfile',
  'mcc',
@@ -288,6 +293,7 @@ __all__ = ['WoE',
  'pnmf',
  'ppca',
  'randomForest',
+ 'randomForestPredict',
  'scale',
  'scaleApply',
  'scaleMinMax',
diff --git a/src/main/python/systemds/operator/algorithm/builtin/scale.py 
b/src/main/python/systemds/operator/algorithm/builtin/auc.py
similarity index 53%
copy from src/main/python/systemds/operator/algorithm/builtin/scale.py
copy to src/main/python/systemds/operator/algorithm/builtin/auc.py
index 015709d8c6..8df6835311 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/scale.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/auc.py
@@ -20,7 +20,7 @@
 # -------------------------------------------------------------
 
 # Autogenerated By   : src/main/python/generator/generator.py
-# Autogenerated From : scripts/builtin/scale.dml
+# Autogenerated From : scripts/builtin/auc.dml
 
 from typing import Dict, Iterable
 
@@ -29,33 +29,21 @@ from systemds.script_building.dag import OutputType
 from systemds.utils.consts import VALID_INPUT_TYPES
 
 
-def scale(X: Matrix,
-          **kwargs: Dict[str, VALID_INPUT_TYPES]):
+def auc(Y: Matrix,
+        P: Matrix):
     """
-     This function scales and center individual features in the input matrix 
(column wise.) using z-score to scale the values.
+     This builtin function computes the area under the ROC curve (AUC)
+     for binary classifiers.
     
     
     
-    :param X: Input feature matrix
-    :param center: Indicates whether or not to center the feature matrix
-    :param scale: Indicates whether or not to scale the feature matrix
-    :return: Output feature matrix with K columns
-    :return: The column means of the input, subtracted if Center was TRUE
-    :return: The Scaling of the values, to make each dimension have similar 
value ranges
+    :param Y: Binary response vector (shape: n x 1), in -1/+1 or 0/1 encoding
+    :param P: Prediction scores (predictor such as estimated probabilities)
+        for true class (shape: n x 1), assumed in [0,1]
+    :return: Area under the ROC curve (AUC)
     """
 
-    params_dict = {'X': X}
-    params_dict.update(kwargs)
-    
-    vX_0 = Matrix(X.sds_context, '')
-    vX_1 = Matrix(X.sds_context, '')
-    vX_2 = Matrix(X.sds_context, '')
-    output_nodes = [vX_0, vX_1, vX_2, ]
-
-    op = MultiReturn(X.sds_context, 'scale', output_nodes, 
named_input_nodes=params_dict)
-
-    vX_0._unnamed_input_nodes = [op]
-    vX_1._unnamed_input_nodes = [op]
-    vX_2._unnamed_input_nodes = [op]
-
-    return op
+    params_dict = {'Y': Y, 'P': P}
+    return Matrix(Y.sds_context,
+        'auc',
+        named_input_nodes=params_dict)
diff --git 
a/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py 
b/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py
index 4fb2ee5688..399a21fd50 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py
@@ -30,43 +30,46 @@ from systemds.utils.consts import VALID_INPUT_TYPES
 
 
 def decisionTree(X: Matrix,
-                 Y: Matrix,
-                 R: Matrix,
+                 y: Matrix,
+                 ctypes: Matrix,
                  **kwargs: Dict[str, VALID_INPUT_TYPES]):
     """
-     Builtin script implementing classification trees with scale and 
categorical features
+     This script implements decision trees for recoded and binned categorical 
and
+     numerical input features. We train a single CART (classification and
+     regression tree) decision tree depending on the provided labels y, either
+     classification (majority vote per leaf) or regression (average per leaf).
     
     
     
-    :param X: Feature matrix X; note that X needs to be both recoded and dummy 
coded
-    :param Y: Label matrix Y; note that Y needs to be both recoded and dummy 
coded
-    :param R: Matrix R which for each feature in X contains the following 
information
-        - R[1,]: Row Vector which indicates if feature vector is scalar or 
categorical. 1 indicates
-        a scalar feature vector, other positive Integers indicate the number 
of categories
-        If R is not provided by default all variables are assumed to be scale
-    :param bins: Number of equiheight bins per scale feature to choose 
thresholds
-    :param depth: Maximum depth of the learned tree
-    :param verbose: boolean specifying if the algorithm should print 
information while executing
-    :return: Matrix M where each column corresponds to a node in the learned 
tree and each row
-        contains the following information:
-        M[1,j]: id of node j (in a complete binary tree)
-        M[2,j]: Offset (no. of columns) to left child of j if j is an internal 
node, otherwise 0
-        M[3,j]: Feature index of the feature (scale feature id if the feature 
is scale or
-        categorical feature id if the feature is categorical)
-        that node j looks at if j is an internal node, otherwise 0
-        M[4,j]: Type of the feature that node j looks at if j is an internal 
node: holds
-        the same information as R input vector
-        M[5,j]: If j is an internal node: 1 if the feature chosen for j is 
scale,
-        otherwise the size of the subset of values
-        stored in rows 6,7,... if j is categorical
-        If j is a leaf node: number of misclassified samples reaching at node j
-        M[6:,j]: If j is an internal node: Threshold the example's feature 
value is compared
-        to is stored at M[6,j] if the feature chosen for j is scale,
-        otherwise if the feature chosen for j is categorical rows 6,7,... 
depict the value subset chosen for j
-        If j is a leaf node 1 if j is impure and the number of samples at j > 
threshold, otherwise 0
+    :param X: Feature matrix in recoded/binned representation
+    :param y: Label matrix in recoded/binned representation
+    :param ctypes: Row-Vector of column types [1 scale/ordinal, 2 categorical]
+        of shape 1-by-(ncol(X)+1), where the last entry is the y type
+    :param max_depth: Maximum depth of the learned tree (stopping criterion)
+    :param min_leaf: Minimum number of samples in leaf nodes (stopping 
criterion),
+        odd number recommended to avoid 50/50 leaf label decisions
+    :param min_split: Minimum number of samples in leaf for attempting a split
+    :param max_features: Parameter controlling the number of features used as 
split
+        candidates at tree nodes: m = ceil(num_features^max_features)
+    :param max_values: Parameter controlling the number of values per feature 
used
+        as split candidates: nb = ceil(num_values^max_values)
+    :param impurity: Impurity measure: entropy, gini (default), rss 
(regression)
+    :param seed: Fixed seed for randomization of samples and split candidates
+    :param verbose: Flag indicating verbose debug output
+    :return: Matrix M containing the learned trees, in linearized form
+        For example, given a feature matrix with features [a,b,c,d]
+        and the following trees, M would look as follows:
+        (L1)               |d<5|
+        /     \
+        (L2)           P1:2    |a<7|
+        /   \
+        (L3)                 P2:2 P3:1
+        --> M :=
+        [[4, 5, 0, 2, 1, 7, 0, 0, 0, 0, 0, 2, 0, 1]]
+        |(L1)| |  (L2)   | |        (L3)         |
     """
 
-    params_dict = {'X': X, 'Y': Y, 'R': R}
+    params_dict = {'X': X, 'y': y, 'ctypes': ctypes}
     params_dict.update(kwargs)
     return Matrix(X.sds_context,
         'decisionTree',
diff --git 
a/src/main/python/systemds/operator/algorithm/builtin/decisionTreePredict.py 
b/src/main/python/systemds/operator/algorithm/builtin/decisionTreePredict.py
index 51a396eef7..32bb06609b 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/decisionTreePredict.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/decisionTreePredict.py
@@ -29,40 +29,32 @@ from systemds.script_building.dag import OutputType
 from systemds.utils.consts import VALID_INPUT_TYPES
 
 
-def decisionTreePredict(M: Matrix,
-                        X: Matrix,
-                        strategy: str):
+def decisionTreePredict(X: Matrix,
+                        ctypes: Matrix,
+                        M: Matrix,
+                        **kwargs: Dict[str, VALID_INPUT_TYPES]):
     """
-    
-     Builtin script implementing prediction based on classification trees with 
scale features using prediction methods of the
+     This script implements decision tree prediction for recoded and binned
+     categorical and numerical input features.
      Hummingbird paper 
(https://www.usenix.org/system/files/osdi20-nakandala.pdf).
     
     
     
-    :param M: Decision tree matrix M, as generated by 
scripts/builtin/decisionTree.dml, where each column corresponds
-        to a node in the learned tree and each row contains the following 
information:
-        M[1,j]: id of node j (in a complete binary tree)
-        M[2,j]: Offset (no. of columns) to left child of j if j is an internal 
node, otherwise 0
-        M[3,j]: Feature index of the feature (scale feature id if the feature 
is scale or
-        categorical feature id if the feature is categorical)
-        that node j looks at if j is an internal node, otherwise 0
-        M[4,j]: Type of the feature that node j looks at if j is an internal 
node: holds
-        the same information as R input vector
-        M[5,j]: If j is an internal node: 1 if the feature chosen for j is 
scale,
-        otherwise the size of the subset of values
-        stored in rows 6,7,... if j is categorical
-        If j is a leaf node: number of misclassified samples reaching at node j
-        M[6:,j]: If j is an internal node: Threshold the example's feature 
value is compared
-        to is stored at M[6,j] if the feature chosen for j is scale,
-        otherwise if the feature chosen for j is categorical rows 6,7,... 
depict the value subset chosen for j
-        If j is a leaf node 1 if j is impure and the number of samples at j > 
threshold, otherwise 0
-    :param X: Feature matrix X
-    :param strategy: Prediction strategy, can be one of ["GEMM", "TT", "PTT"], 
referring to "Generic matrix multiplication",
+    :param X: Feature matrix in recoded/binned representation
+    :param y: Label matrix in recoded/binned representation,
+        optional for accuracy evaluation
+    :param ctypes: Row-Vector of column types [1 scale/ordinal, 2 categorical]
+    :param M: Matrix M holding the learned tree in linearized form
+        see decisionTree() for the detailed tree representation.
+    :param strategy: Prediction strategy, can be one of ["GEMM", "TT", "PTT"],
+        referring to "Generic matrix multiplication",
         "Tree traversal", and "Perfect tree traversal", respectively
-    :return: Matrix containing the predicted labels for X
+    :param verbose: Flag indicating verbose debug output
+    :return: Label vector of predictions
     """
 
-    params_dict = {'M': M, 'X': X, 'strategy': strategy}
-    return Matrix(M.sds_context,
+    params_dict = {'X': X, 'ctypes': ctypes, 'M': M}
+    params_dict.update(kwargs)
+    return Matrix(X.sds_context,
         'decisionTreePredict',
         named_input_nodes=params_dict)
diff --git a/src/main/python/systemds/operator/algorithm/builtin/scale.py 
b/src/main/python/systemds/operator/algorithm/builtin/lmPredictStats.py
similarity index 53%
copy from src/main/python/systemds/operator/algorithm/builtin/scale.py
copy to src/main/python/systemds/operator/algorithm/builtin/lmPredictStats.py
index 015709d8c6..731d6d232c 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/scale.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/lmPredictStats.py
@@ -20,7 +20,7 @@
 # -------------------------------------------------------------
 
 # Autogenerated By   : src/main/python/generator/generator.py
-# Autogenerated From : scripts/builtin/scale.dml
+# Autogenerated From : scripts/builtin/lmPredictStats.dml
 
 from typing import Dict, Iterable
 
@@ -29,33 +29,22 @@ from systemds.script_building.dag import OutputType
 from systemds.utils.consts import VALID_INPUT_TYPES
 
 
-def scale(X: Matrix,
-          **kwargs: Dict[str, VALID_INPUT_TYPES]):
+def lmPredictStats(yhat: Matrix,
+                   ytest: Matrix,
+                   lm: bool):
     """
-     This function scales and center individual features in the input matrix 
(column wise.) using z-score to scale the values.
+     This builtin function computes and prints a summary of accuracy
+     measures for regression problems.
     
     
     
-    :param X: Input feature matrix
-    :param center: Indicates whether or not to center the feature matrix
-    :param scale: Indicates whether or not to scale the feature matrix
-    :return: Output feature matrix with K columns
-    :return: The column means of the input, subtracted if Center was TRUE
-    :return: The Scaling of the values, to make each dimension have similar 
value ranges
+    :param yhat: column vector of predicted response values y
+    :param ytest: column vector of actual response values y
+    :param lm: indicator if used for linear regression model
+    :return: column vector holding avg_res, ss_avg_res, and R2
     """
 
-    params_dict = {'X': X}
-    params_dict.update(kwargs)
-    
-    vX_0 = Matrix(X.sds_context, '')
-    vX_1 = Matrix(X.sds_context, '')
-    vX_2 = Matrix(X.sds_context, '')
-    output_nodes = [vX_0, vX_1, vX_2, ]
-
-    op = MultiReturn(X.sds_context, 'scale', output_nodes, 
named_input_nodes=params_dict)
-
-    vX_0._unnamed_input_nodes = [op]
-    vX_1._unnamed_input_nodes = [op]
-    vX_2._unnamed_input_nodes = [op]
-
-    return op
+    params_dict = {'yhat': yhat, 'ytest': ytest, 'lm': lm}
+    return Matrix(yhat.sds_context,
+        'lmPredictStats',
+        named_input_nodes=params_dict)
diff --git a/src/main/python/systemds/operator/algorithm/builtin/pca.py 
b/src/main/python/systemds/operator/algorithm/builtin/pca.py
index 403f9cfca1..016c7caf7f 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/pca.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/pca.py
@@ -32,18 +32,45 @@ from systemds.utils.consts import VALID_INPUT_TYPES
 def pca(X: Matrix,
         **kwargs: Dict[str, VALID_INPUT_TYPES]):
     """
-     The function Principal Component Analysis (PCA) is used for 
dimensionality reduction
+     This builtin defines PCA that is a technique typically used to
+     reduce the number of dimensions of a matrix.
+     This implementation is based on calculating eigenvectors on
+     the covariance matrix of the input.
+    
+     An example of calling in DML:
+    
+     .. code-block::
+    
+       data = read($1)
+       [data_reduced, Components] = pca(data=data, K=4, onlyComponents=TRUE)
+       print(Components)
+    
+    
+     An example in a ML pipeline containing PCA:
+    
+     .. code-block::
+    
+       X = read($1)
+       [X_reduced, Components] = pca(data=X, K=4)
+       Y = read($2)
+       bias = l2svm(X=X, Y=Y)
+       X_test = read($3)
+       [y_predict_normal, Y_predict_rounded] = l2svmPredict(X=X_test, W=bias)
+       write($5, Y_predict_rounded)
+    
     
     
     
     :param X: Input feature matrix
-    :param K: Number of reduced dimensions (i.e., columns)
-    :param Center: Indicates whether or not to center the feature matrix
-    :param Scale: Indicates whether or not to scale the feature matrix
+    :param K: Number of components returned
+    :param center: Indicates whether or not to center the feature matrix
+    :param scale: Indicates whether or not to scale the feature matrix
+    :param onlyComponents: Indicate if only the components should be 
calculated and returned
+        not the application of the components on X
     :return: Output feature matrix with K columns
-    :return: Output dominant eigen vectors (can be used for projections)
+    :return: Output dominant eigen vectors sorted by influence
     :return: The column means of the input, subtracted to construct the PCA
-    :return: The Scaling of the values, to make each dimension same size.
+    :return: The scaling of the values, to make each dimension same size.
     """
 
     params_dict = {'X': X}
diff --git 
a/src/main/python/systemds/operator/algorithm/builtin/randomForest.py 
b/src/main/python/systemds/operator/algorithm/builtin/randomForest.py
index b2b4424ff6..5c4bb0438a 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/randomForest.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/randomForest.py
@@ -30,66 +30,55 @@ from systemds.utils.consts import VALID_INPUT_TYPES
 
 
 def randomForest(X: Matrix,
-                 Y: Matrix,
-                 R: Matrix,
+                 y: Matrix,
+                 ctypes: Matrix,
                  **kwargs: Dict[str, VALID_INPUT_TYPES]):
     """
-     This script implement classification random forest with both scale and 
categorical features.
+     This script implements random forest for recoded and binned categorical 
and
+     numerical input features. In detail, we train multiple CART 
(classification
+     and regression trees) decision trees in parallel and use them as an 
ensemble
+     classifier/regressor. Each tree is trained on a sample of observations 
(rows)
+     and optionally subset of features (columns). During tree construction, 
split
+     candidates are additionally chosen on a sample of remaining features.
     
     
     
-    :param X: Feature matrix X; note that X needs to be both recoded and dummy 
coded
-    :param Y: Label matrix Y; note that Y needs to be both recoded and dummy 
coded
-    :param R: Matrix which for each feature in X contains the following 
information
-        - R[,1]: column ids       TODO pass recorded and binned
-        - R[,2]: start indices
-        - R[,3]: end indices
-        If R is not provided by default all variables are assumed to be scale
-    :param bins: Number of equiheight bins per scale feature to choose 
thresholds
-    :param depth: Maximum depth of the learned tree
-    :param num_leaf: Number of samples when splitting stops and a leaf node is 
added
-    :param num_samples: Number of samples at which point we switch to 
in-memory subtree building
+    :param X: Feature matrix in recoded/binned representation
+    :param y: Label matrix in recoded/binned representation
+    :param ctypes: Row-Vector of column types [1 scale/ordinal, 2 categorical]
+        of shape 1-by-(ncol(X)+1), where the last entry is the y type
     :param num_trees: Number of trees to be learned in the random forest model
-    :param subsamp_rate: Parameter controlling the size of each tree in the 
forest; samples are selected from a
-        Poisson distribution with parameter subsamp_rate (the default value is 
1.0)
-    :param feature_subset: Parameter that controls the number of feature used 
as candidates for splitting at each tree node
-        as a power of number of features in the dataset;
-        by default square root of features (i.e., feature_subset = 0.5) are 
used at each tree node
-    :param impurity: Impurity measure: entropy or Gini (the default)
-    :return: Matrix M containing the learned tree, where each column 
corresponds to a node
-        in the learned tree and each row contains the following information:
-        M[1,j]: id of node j (in a complete binary tree)
-        M[2,j]: tree id to which node j belongs
-        M[3,j]: Offset (no. of columns) to left child of j
-        M[4,j]: Feature index of the feature that node j looks at if j is an 
internal node, otherwise 0
-        M[5,j]: Type of the feature that node j looks at if j is an internal 
node: 1 for scale and 2
-        for categorical features,
-        otherwise the label that leaf node j is supposed to predict
-        M[6,j]: 1 if j is an internal node and the feature chosen for j is 
scale, otherwise the
-        size of the subset of values
-        stored in rows 7,8,... if j is categorical
-        M[7:,j]: Only applicable for internal nodes. Threshold the example's 
feature value is
-        compared to is stored at M[7,j] if the feature chosen for j is scale;
-        If the feature chosen for j is categorical rows 7,8,... depict the 
value subset chosen for j
-    :return: Matrix C containing the number of times samples are chosen in 
each tree of the random forest
-    :return: Mappings from scale feature ids to global feature ids
-    :return: Mappings from categorical feature ids to global feature ids
+    :param sample_frac: Sample fraction of examples for each tree in the forest
+    :param feature_frac: Sample fraction of features for each tree in the 
forest
+    :param max_depth: Maximum depth of the learned tree (stopping criterion)
+    :param min_leaf: Minimum number of samples in leaf nodes (stopping 
criterion)
+    :param min_split: Minimum number of samples in leaf for attempting a split
+    :param max_features: Parameter controlling the number of features used as 
split
+        candidates at tree nodes: m = ceil(num_features^max_features)
+    :param max_values: Parameter controlling the number of values per feature 
used
+        as split candidates: nb = ceil(num_values^max_values)
+    :param impurity: Impurity measure: entropy, gini (default), rss 
(regression)
+    :param seed: Fixed seed for randomization of samples and split candidates
+    :param verbose: Flag indicating verbose debug output
+    :return: Matrix M containing the learned trees, in linearized form
+        For example, given a feature matrix with features [a,b,c,d]
+        and the following two trees, M would look as follows:
+        (L1)          |a<7|                   |d<5|
+        /     \                 /     \
+        (L2)     |c<3|     |b<4|         |a<7|     P3:2
+        /   \     /   \         /   \
+        (L3)   P1:2 P2:1 P3:1 P4:2     P1:2 P2:1
+        --> M :=
+        [[1, 7, 3, 3, 2, 4, 0, 2, 0, 1, 0, 1, 0, 2],  (1st tree)
+        [4, 5, 1, 7, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0]]  (2nd tree)
+        |(L1)| |  (L2)   | |        (L3)         |
+        With feature sampling (feature_frac < 1), each tree is
+        prefixed by a one-hot vector of sampled features
+        (e.g., [1,1,1,0] if we sampled a,b,c of the four features)
     """
 
-    params_dict = {'X': X, 'Y': Y, 'R': R}
+    params_dict = {'X': X, 'y': y, 'ctypes': ctypes}
     params_dict.update(kwargs)
-    
-    vX_0 = Matrix(X.sds_context, '')
-    vX_1 = Matrix(X.sds_context, '')
-    vX_2 = Matrix(X.sds_context, '')
-    vX_3 = Matrix(X.sds_context, '')
-    output_nodes = [vX_0, vX_1, vX_2, vX_3, ]
-
-    op = MultiReturn(X.sds_context, 'randomForest', output_nodes, 
named_input_nodes=params_dict)
-
-    vX_0._unnamed_input_nodes = [op]
-    vX_1._unnamed_input_nodes = [op]
-    vX_2._unnamed_input_nodes = [op]
-    vX_3._unnamed_input_nodes = [op]
-
-    return op
+    return Matrix(X.sds_context,
+        'randomForest',
+        named_input_nodes=params_dict)
diff --git a/src/main/python/systemds/operator/algorithm/builtin/scale.py 
b/src/main/python/systemds/operator/algorithm/builtin/randomForestPredict.py
similarity index 55%
copy from src/main/python/systemds/operator/algorithm/builtin/scale.py
copy to 
src/main/python/systemds/operator/algorithm/builtin/randomForestPredict.py
index 015709d8c6..c7a598faa5 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/scale.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/randomForestPredict.py
@@ -20,7 +20,7 @@
 # -------------------------------------------------------------
 
 # Autogenerated By   : src/main/python/generator/generator.py
-# Autogenerated From : scripts/builtin/scale.dml
+# Autogenerated From : scripts/builtin/randomForestPredict.dml
 
 from typing import Dict, Iterable
 
@@ -29,33 +29,28 @@ from systemds.script_building.dag import OutputType
 from systemds.utils.consts import VALID_INPUT_TYPES
 
 
-def scale(X: Matrix,
-          **kwargs: Dict[str, VALID_INPUT_TYPES]):
+def randomForestPredict(X: Matrix,
+                        ctypes: Matrix,
+                        M: Matrix,
+                        **kwargs: Dict[str, VALID_INPUT_TYPES]):
     """
-     This function scales and center individual features in the input matrix 
(column wise.) using z-score to scale the values.
+     This script implements random forest prediction for recoded and binned
+     categorical and numerical input features.
     
     
     
-    :param X: Input feature matrix
-    :param center: Indicates whether or not to center the feature matrix
-    :param scale: Indicates whether or not to scale the feature matrix
-    :return: Output feature matrix with K columns
-    :return: The column means of the input, subtracted if Center was TRUE
-    :return: The Scaling of the values, to make each dimension have similar 
value ranges
+    :param X: Feature matrix in recoded/binned representation
+    :param y: Label matrix in recoded/binned representation,
+        optional for accuracy evaluation
+    :param ctypes: Row-Vector of column types [1 scale/ordinal, 2 categorical]
+    :param M: Matrix M holding the learned trees (one tree per row),
+        see randomForest() for the detailed tree representation.
+    :param verbose: Flag indicating verbose debug output
+    :return: Label vector of predictions
     """
 
-    params_dict = {'X': X}
+    params_dict = {'X': X, 'ctypes': ctypes, 'M': M}
     params_dict.update(kwargs)
-    
-    vX_0 = Matrix(X.sds_context, '')
-    vX_1 = Matrix(X.sds_context, '')
-    vX_2 = Matrix(X.sds_context, '')
-    output_nodes = [vX_0, vX_1, vX_2, ]
-
-    op = MultiReturn(X.sds_context, 'scale', output_nodes, 
named_input_nodes=params_dict)
-
-    vX_0._unnamed_input_nodes = [op]
-    vX_1._unnamed_input_nodes = [op]
-    vX_2._unnamed_input_nodes = [op]
-
-    return op
+    return Matrix(X.sds_context,
+        'randomForestPredict',
+        named_input_nodes=params_dict)
diff --git a/src/main/python/systemds/operator/algorithm/builtin/scale.py 
b/src/main/python/systemds/operator/algorithm/builtin/scale.py
index 015709d8c6..33203fafb6 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/scale.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/scale.py
@@ -32,16 +32,23 @@ from systemds.utils.consts import VALID_INPUT_TYPES
 def scale(X: Matrix,
           **kwargs: Dict[str, VALID_INPUT_TYPES]):
     """
-     This function scales and center individual features in the input matrix 
(column wise.) using z-score to scale the values.
+     This function scales and centers individual features in the input
+     matrix (column wise.) using z-score to scale the values.
+     The transformation is sometimes also called scale and shift,
+     but it is shifted first and then subsequently scaled.
+    
+     The method is not resistant to inputs containing NaN nor overflows
+     of doubles, but handles them by guaranteeing that no extra NaN values
+     are introduced and columns that contain NaN will not be scaled or shifted.
     
     
     
     :param X: Input feature matrix
-    :param center: Indicates whether or not to center the feature matrix
-    :param scale: Indicates whether or not to scale the feature matrix
-    :return: Output feature matrix with K columns
+    :param center: Indicates to center the feature matrix
+    :param scale: Indicates to scale the feature matrix according to z-score
+    :return: Output feature matrix scaled and shifted
     :return: The column means of the input, subtracted if Center was TRUE
-    :return: The Scaling of the values, to make each dimension have similar 
value ranges
+    :return: The scaling of the values, to make each dimension have similar 
value ranges
     """
 
     params_dict = {'X': X}
diff --git a/src/main/python/systemds/operator/algorithm/builtin/slicefinder.py 
b/src/main/python/systemds/operator/algorithm/builtin/slicefinder.py
index a8c34cc0b9..2ca2991391 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/slicefinder.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/slicefinder.py
@@ -41,8 +41,8 @@ def slicefinder(X: Matrix,
     
     
     
-    :param X: Recoded dataset into Matrix
-    :param e: Trained model
+    :param X: Feature matrix in recoded/binned representation
+    :param e: Error vector of trained model
     :param k: Number of subsets required
     :param maxL: maximum level L (conjunctions of L predicates), 0 unlimited
     :param minSup: minimum support (min number of rows per slice)

[systemds] branch main updated: [MINOR] Python autogenerate new Builtins

Reply via email to