This is an automated email from the ASF dual-hosted git repository.
baunsgaard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 2d4114e96e [MINOR] Python autogenerate new Builtins
2d4114e96e is described below
commit 2d4114e96ef6858b0dff0059e63644bfca8189ad
Author: baunsgaard <[email protected]>
AuthorDate: Wed May 10 10:42:17 2023 +0200
[MINOR] Python autogenerate new Builtins
---
.../python/systemds/operator/algorithm/__init__.py | 6 ++
.../algorithm/builtin/{scale.py => auc.py} | 38 +++------
.../operator/algorithm/builtin/decisionTree.py | 63 +++++++-------
.../algorithm/builtin/decisionTreePredict.py | 46 +++++-----
.../builtin/{scale.py => lmPredictStats.py} | 39 ++++-----
.../systemds/operator/algorithm/builtin/pca.py | 39 +++++++--
.../operator/algorithm/builtin/randomForest.py | 97 ++++++++++------------
.../builtin/{scale.py => randomForestPredict.py} | 43 +++++-----
.../systemds/operator/algorithm/builtin/scale.py | 17 ++--
.../operator/algorithm/builtin/slicefinder.py | 4 +-
10 files changed, 194 insertions(+), 198 deletions(-)
diff --git a/src/main/python/systemds/operator/algorithm/__init__.py
b/src/main/python/systemds/operator/algorithm/__init__.py
index 2dd6578833..769ca66229 100644
--- a/src/main/python/systemds/operator/algorithm/__init__.py
+++ b/src/main/python/systemds/operator/algorithm/__init__.py
@@ -31,6 +31,7 @@ from .builtin.alsPredict import alsPredict
from .builtin.alsTopkPredict import alsTopkPredict
from .builtin.apply_pipeline import apply_pipeline
from .builtin.arima import arima
+from .builtin.auc import auc
from .builtin.autoencoder_2layer import autoencoder_2layer
from .builtin.bandit import bandit
from .builtin.bivar import bivar
@@ -110,6 +111,7 @@ from .builtin.lm import lm
from .builtin.lmCG import lmCG
from .builtin.lmDS import lmDS
from .builtin.lmPredict import lmPredict
+from .builtin.lmPredictStats import lmPredictStats
from .builtin.logSumExp import logSumExp
from .builtin.matrixProfile import matrixProfile
from .builtin.mcc import mcc
@@ -137,6 +139,7 @@ from .builtin.pcaTransform import pcaTransform
from .builtin.pnmf import pnmf
from .builtin.ppca import ppca
from .builtin.randomForest import randomForest
+from .builtin.randomForestPredict import randomForestPredict
from .builtin.scale import scale
from .builtin.scaleApply import scaleApply
from .builtin.scaleMinMax import scaleMinMax
@@ -182,6 +185,7 @@ __all__ = ['WoE',
'alsTopkPredict',
'apply_pipeline',
'arima',
+ 'auc',
'autoencoder_2layer',
'bandit',
'bivar',
@@ -261,6 +265,7 @@ __all__ = ['WoE',
'lmCG',
'lmDS',
'lmPredict',
+ 'lmPredictStats',
'logSumExp',
'matrixProfile',
'mcc',
@@ -288,6 +293,7 @@ __all__ = ['WoE',
'pnmf',
'ppca',
'randomForest',
+ 'randomForestPredict',
'scale',
'scaleApply',
'scaleMinMax',
diff --git a/src/main/python/systemds/operator/algorithm/builtin/scale.py
b/src/main/python/systemds/operator/algorithm/builtin/auc.py
similarity index 53%
copy from src/main/python/systemds/operator/algorithm/builtin/scale.py
copy to src/main/python/systemds/operator/algorithm/builtin/auc.py
index 015709d8c6..8df6835311 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/scale.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/auc.py
@@ -20,7 +20,7 @@
# -------------------------------------------------------------
# Autogenerated By : src/main/python/generator/generator.py
-# Autogenerated From : scripts/builtin/scale.dml
+# Autogenerated From : scripts/builtin/auc.dml
from typing import Dict, Iterable
@@ -29,33 +29,21 @@ from systemds.script_building.dag import OutputType
from systemds.utils.consts import VALID_INPUT_TYPES
-def scale(X: Matrix,
- **kwargs: Dict[str, VALID_INPUT_TYPES]):
+def auc(Y: Matrix,
+ P: Matrix):
"""
- This function scales and center individual features in the input matrix
(column wise.) using z-score to scale the values.
+ This builtin function computes the area under the ROC curve (AUC)
+ for binary classifiers.
- :param X: Input feature matrix
- :param center: Indicates whether or not to center the feature matrix
- :param scale: Indicates whether or not to scale the feature matrix
- :return: Output feature matrix with K columns
- :return: The column means of the input, subtracted if Center was TRUE
- :return: The Scaling of the values, to make each dimension have similar
value ranges
+ :param Y: Binary response vector (shape: n x 1), in -1/+1 or 0/1 encoding
+ :param P: Prediction scores (predictor such as estimated probabilities)
+ for true class (shape: n x 1), assumed in [0,1]
+ :return: Area under the ROC curve (AUC)
"""
- params_dict = {'X': X}
- params_dict.update(kwargs)
-
- vX_0 = Matrix(X.sds_context, '')
- vX_1 = Matrix(X.sds_context, '')
- vX_2 = Matrix(X.sds_context, '')
- output_nodes = [vX_0, vX_1, vX_2, ]
-
- op = MultiReturn(X.sds_context, 'scale', output_nodes,
named_input_nodes=params_dict)
-
- vX_0._unnamed_input_nodes = [op]
- vX_1._unnamed_input_nodes = [op]
- vX_2._unnamed_input_nodes = [op]
-
- return op
+ params_dict = {'Y': Y, 'P': P}
+ return Matrix(Y.sds_context,
+ 'auc',
+ named_input_nodes=params_dict)
diff --git
a/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py
b/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py
index 4fb2ee5688..399a21fd50 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py
@@ -30,43 +30,46 @@ from systemds.utils.consts import VALID_INPUT_TYPES
def decisionTree(X: Matrix,
- Y: Matrix,
- R: Matrix,
+ y: Matrix,
+ ctypes: Matrix,
**kwargs: Dict[str, VALID_INPUT_TYPES]):
"""
- Builtin script implementing classification trees with scale and
categorical features
+ This script implements decision trees for recoded and binned categorical
and
+ numerical input features. We train a single CART (classification and
+ regression tree) decision tree depending on the provided labels y, either
+ classification (majority vote per leaf) or regression (average per leaf).
- :param X: Feature matrix X; note that X needs to be both recoded and dummy
coded
- :param Y: Label matrix Y; note that Y needs to be both recoded and dummy
coded
- :param R: Matrix R which for each feature in X contains the following
information
- - R[1,]: Row Vector which indicates if feature vector is scalar or
categorical. 1 indicates
- a scalar feature vector, other positive Integers indicate the number
of categories
- If R is not provided by default all variables are assumed to be scale
- :param bins: Number of equiheight bins per scale feature to choose
thresholds
- :param depth: Maximum depth of the learned tree
- :param verbose: boolean specifying if the algorithm should print
information while executing
- :return: Matrix M where each column corresponds to a node in the learned
tree and each row
- contains the following information:
- M[1,j]: id of node j (in a complete binary tree)
- M[2,j]: Offset (no. of columns) to left child of j if j is an internal
node, otherwise 0
- M[3,j]: Feature index of the feature (scale feature id if the feature
is scale or
- categorical feature id if the feature is categorical)
- that node j looks at if j is an internal node, otherwise 0
- M[4,j]: Type of the feature that node j looks at if j is an internal
node: holds
- the same information as R input vector
- M[5,j]: If j is an internal node: 1 if the feature chosen for j is
scale,
- otherwise the size of the subset of values
- stored in rows 6,7,... if j is categorical
- If j is a leaf node: number of misclassified samples reaching at node j
- M[6:,j]: If j is an internal node: Threshold the example's feature
value is compared
- to is stored at M[6,j] if the feature chosen for j is scale,
- otherwise if the feature chosen for j is categorical rows 6,7,...
depict the value subset chosen for j
- If j is a leaf node 1 if j is impure and the number of samples at j >
threshold, otherwise 0
+ :param X: Feature matrix in recoded/binned representation
+ :param y: Label matrix in recoded/binned representation
+ :param ctypes: Row-Vector of column types [1 scale/ordinal, 2 categorical]
+ of shape 1-by-(ncol(X)+1), where the last entry is the y type
+ :param max_depth: Maximum depth of the learned tree (stopping criterion)
+ :param min_leaf: Minimum number of samples in leaf nodes (stopping
criterion),
+ odd number recommended to avoid 50/50 leaf label decisions
+ :param min_split: Minimum number of samples in leaf for attempting a split
+ :param max_features: Parameter controlling the number of features used as
split
+ candidates at tree nodes: m = ceil(num_features^max_features)
+ :param max_values: Parameter controlling the number of values per feature
used
+ as split candidates: nb = ceil(num_values^max_values)
+ :param impurity: Impurity measure: entropy, gini (default), rss
(regression)
+ :param seed: Fixed seed for randomization of samples and split candidates
+ :param verbose: Flag indicating verbose debug output
+ :return: Matrix M containing the learned trees, in linearized form
+ For example, given a feature matrix with features [a,b,c,d]
+ and the following trees, M would look as follows:
+ (L1) |d<5|
+ / \
+ (L2) P1:2 |a<7|
+ / \
+ (L3) P2:2 P3:1
+ --> M :=
+ [[4, 5, 0, 2, 1, 7, 0, 0, 0, 0, 0, 2, 0, 1]]
+ |(L1)| | (L2) | | (L3) |
"""
- params_dict = {'X': X, 'Y': Y, 'R': R}
+ params_dict = {'X': X, 'y': y, 'ctypes': ctypes}
params_dict.update(kwargs)
return Matrix(X.sds_context,
'decisionTree',
diff --git
a/src/main/python/systemds/operator/algorithm/builtin/decisionTreePredict.py
b/src/main/python/systemds/operator/algorithm/builtin/decisionTreePredict.py
index 51a396eef7..32bb06609b 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/decisionTreePredict.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/decisionTreePredict.py
@@ -29,40 +29,32 @@ from systemds.script_building.dag import OutputType
from systemds.utils.consts import VALID_INPUT_TYPES
-def decisionTreePredict(M: Matrix,
- X: Matrix,
- strategy: str):
+def decisionTreePredict(X: Matrix,
+ ctypes: Matrix,
+ M: Matrix,
+ **kwargs: Dict[str, VALID_INPUT_TYPES]):
"""
-
- Builtin script implementing prediction based on classification trees with
scale features using prediction methods of the
+ This script implements decision tree prediction for recoded and binned
+ categorical and numerical input features.
Hummingbird paper
(https://www.usenix.org/system/files/osdi20-nakandala.pdf).
- :param M: Decision tree matrix M, as generated by
scripts/builtin/decisionTree.dml, where each column corresponds
- to a node in the learned tree and each row contains the following
information:
- M[1,j]: id of node j (in a complete binary tree)
- M[2,j]: Offset (no. of columns) to left child of j if j is an internal
node, otherwise 0
- M[3,j]: Feature index of the feature (scale feature id if the feature
is scale or
- categorical feature id if the feature is categorical)
- that node j looks at if j is an internal node, otherwise 0
- M[4,j]: Type of the feature that node j looks at if j is an internal
node: holds
- the same information as R input vector
- M[5,j]: If j is an internal node: 1 if the feature chosen for j is
scale,
- otherwise the size of the subset of values
- stored in rows 6,7,... if j is categorical
- If j is a leaf node: number of misclassified samples reaching at node j
- M[6:,j]: If j is an internal node: Threshold the example's feature
value is compared
- to is stored at M[6,j] if the feature chosen for j is scale,
- otherwise if the feature chosen for j is categorical rows 6,7,...
depict the value subset chosen for j
- If j is a leaf node 1 if j is impure and the number of samples at j >
threshold, otherwise 0
- :param X: Feature matrix X
- :param strategy: Prediction strategy, can be one of ["GEMM", "TT", "PTT"],
referring to "Generic matrix multiplication",
+ :param X: Feature matrix in recoded/binned representation
+ :param y: Label matrix in recoded/binned representation,
+ optional for accuracy evaluation
+ :param ctypes: Row-Vector of column types [1 scale/ordinal, 2 categorical]
+ :param M: Matrix M holding the learned tree in linearized form
+ see decisionTree() for the detailed tree representation.
+ :param strategy: Prediction strategy, can be one of ["GEMM", "TT", "PTT"],
+ referring to "Generic matrix multiplication",
"Tree traversal", and "Perfect tree traversal", respectively
- :return: Matrix containing the predicted labels for X
+ :param verbose: Flag indicating verbose debug output
+ :return: Label vector of predictions
"""
- params_dict = {'M': M, 'X': X, 'strategy': strategy}
- return Matrix(M.sds_context,
+ params_dict = {'X': X, 'ctypes': ctypes, 'M': M}
+ params_dict.update(kwargs)
+ return Matrix(X.sds_context,
'decisionTreePredict',
named_input_nodes=params_dict)
diff --git a/src/main/python/systemds/operator/algorithm/builtin/scale.py
b/src/main/python/systemds/operator/algorithm/builtin/lmPredictStats.py
similarity index 53%
copy from src/main/python/systemds/operator/algorithm/builtin/scale.py
copy to src/main/python/systemds/operator/algorithm/builtin/lmPredictStats.py
index 015709d8c6..731d6d232c 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/scale.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/lmPredictStats.py
@@ -20,7 +20,7 @@
# -------------------------------------------------------------
# Autogenerated By : src/main/python/generator/generator.py
-# Autogenerated From : scripts/builtin/scale.dml
+# Autogenerated From : scripts/builtin/lmPredictStats.dml
from typing import Dict, Iterable
@@ -29,33 +29,22 @@ from systemds.script_building.dag import OutputType
from systemds.utils.consts import VALID_INPUT_TYPES
-def scale(X: Matrix,
- **kwargs: Dict[str, VALID_INPUT_TYPES]):
+def lmPredictStats(yhat: Matrix,
+ ytest: Matrix,
+ lm: bool):
"""
- This function scales and center individual features in the input matrix
(column wise.) using z-score to scale the values.
+ This builtin function computes and prints a summary of accuracy
+ measures for regression problems.
- :param X: Input feature matrix
- :param center: Indicates whether or not to center the feature matrix
- :param scale: Indicates whether or not to scale the feature matrix
- :return: Output feature matrix with K columns
- :return: The column means of the input, subtracted if Center was TRUE
- :return: The Scaling of the values, to make each dimension have similar
value ranges
+ :param yhat: column vector of predicted response values y
+ :param ytest: column vector of actual response values y
+ :param lm: indicator if used for linear regression model
+ :return: column vector holding avg_res, ss_avg_res, and R2
"""
- params_dict = {'X': X}
- params_dict.update(kwargs)
-
- vX_0 = Matrix(X.sds_context, '')
- vX_1 = Matrix(X.sds_context, '')
- vX_2 = Matrix(X.sds_context, '')
- output_nodes = [vX_0, vX_1, vX_2, ]
-
- op = MultiReturn(X.sds_context, 'scale', output_nodes,
named_input_nodes=params_dict)
-
- vX_0._unnamed_input_nodes = [op]
- vX_1._unnamed_input_nodes = [op]
- vX_2._unnamed_input_nodes = [op]
-
- return op
+ params_dict = {'yhat': yhat, 'ytest': ytest, 'lm': lm}
+ return Matrix(yhat.sds_context,
+ 'lmPredictStats',
+ named_input_nodes=params_dict)
diff --git a/src/main/python/systemds/operator/algorithm/builtin/pca.py
b/src/main/python/systemds/operator/algorithm/builtin/pca.py
index 403f9cfca1..016c7caf7f 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/pca.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/pca.py
@@ -32,18 +32,45 @@ from systemds.utils.consts import VALID_INPUT_TYPES
def pca(X: Matrix,
**kwargs: Dict[str, VALID_INPUT_TYPES]):
"""
- The function Principal Component Analysis (PCA) is used for
dimensionality reduction
+ This builtin defines PCA that is a technique typically used to
+ reduce the number of dimensions of a matrix.
+ This implementation is based on calculating eigenvectors on
+ the covariance matrix of the input.
+
+ An example of calling in DML:
+
+ .. code-block::
+
+ data = read($1)
+ [data_reduced, Components] = pca(data=data, K=4, onlyComponents=TRUE)
+ print(Components)
+
+
+ An example in a ML pipeline containing PCA:
+
+ .. code-block::
+
+ X = read($1)
+ [X_reduced, Components] = pca(data=X, K=4)
+ Y = read($2)
+ bias = l2svm(X=X, Y=Y)
+ X_test = read($3)
+ [y_predict_normal, Y_predict_rounded] = l2svmPredict(X=X_test, W=bias)
+ write($5, Y_predict_rounded)
+
:param X: Input feature matrix
- :param K: Number of reduced dimensions (i.e., columns)
- :param Center: Indicates whether or not to center the feature matrix
- :param Scale: Indicates whether or not to scale the feature matrix
+ :param K: Number of components returned
+ :param center: Indicates whether or not to center the feature matrix
+ :param scale: Indicates whether or not to scale the feature matrix
+ :param onlyComponents: Indicate if only the components should be
calculated and returned
+ not the application of the components on X
:return: Output feature matrix with K columns
- :return: Output dominant eigen vectors (can be used for projections)
+ :return: Output dominant eigen vectors sorted by influence
:return: The column means of the input, subtracted to construct the PCA
- :return: The Scaling of the values, to make each dimension same size.
+ :return: The scaling of the values, to make each dimension same size.
"""
params_dict = {'X': X}
diff --git
a/src/main/python/systemds/operator/algorithm/builtin/randomForest.py
b/src/main/python/systemds/operator/algorithm/builtin/randomForest.py
index b2b4424ff6..5c4bb0438a 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/randomForest.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/randomForest.py
@@ -30,66 +30,55 @@ from systemds.utils.consts import VALID_INPUT_TYPES
def randomForest(X: Matrix,
- Y: Matrix,
- R: Matrix,
+ y: Matrix,
+ ctypes: Matrix,
**kwargs: Dict[str, VALID_INPUT_TYPES]):
"""
- This script implement classification random forest with both scale and
categorical features.
+ This script implements random forest for recoded and binned categorical
and
+ numerical input features. In detail, we train multiple CART
(classification
+ and regression trees) decision trees in parallel and use them as an
ensemble
+ classifier/regressor. Each tree is trained on a sample of observations
(rows)
+ and optionally subset of features (columns). During tree construction,
split
+ candidates are additionally chosen on a sample of remaining features.
- :param X: Feature matrix X; note that X needs to be both recoded and dummy
coded
- :param Y: Label matrix Y; note that Y needs to be both recoded and dummy
coded
- :param R: Matrix which for each feature in X contains the following
information
- - R[,1]: column ids TODO pass recorded and binned
- - R[,2]: start indices
- - R[,3]: end indices
- If R is not provided by default all variables are assumed to be scale
- :param bins: Number of equiheight bins per scale feature to choose
thresholds
- :param depth: Maximum depth of the learned tree
- :param num_leaf: Number of samples when splitting stops and a leaf node is
added
- :param num_samples: Number of samples at which point we switch to
in-memory subtree building
+ :param X: Feature matrix in recoded/binned representation
+ :param y: Label matrix in recoded/binned representation
+ :param ctypes: Row-Vector of column types [1 scale/ordinal, 2 categorical]
+ of shape 1-by-(ncol(X)+1), where the last entry is the y type
:param num_trees: Number of trees to be learned in the random forest model
- :param subsamp_rate: Parameter controlling the size of each tree in the
forest; samples are selected from a
- Poisson distribution with parameter subsamp_rate (the default value is
1.0)
- :param feature_subset: Parameter that controls the number of feature used
as candidates for splitting at each tree node
- as a power of number of features in the dataset;
- by default square root of features (i.e., feature_subset = 0.5) are
used at each tree node
- :param impurity: Impurity measure: entropy or Gini (the default)
- :return: Matrix M containing the learned tree, where each column
corresponds to a node
- in the learned tree and each row contains the following information:
- M[1,j]: id of node j (in a complete binary tree)
- M[2,j]: tree id to which node j belongs
- M[3,j]: Offset (no. of columns) to left child of j
- M[4,j]: Feature index of the feature that node j looks at if j is an
internal node, otherwise 0
- M[5,j]: Type of the feature that node j looks at if j is an internal
node: 1 for scale and 2
- for categorical features,
- otherwise the label that leaf node j is supposed to predict
- M[6,j]: 1 if j is an internal node and the feature chosen for j is
scale, otherwise the
- size of the subset of values
- stored in rows 7,8,... if j is categorical
- M[7:,j]: Only applicable for internal nodes. Threshold the example's
feature value is
- compared to is stored at M[7,j] if the feature chosen for j is scale;
- If the feature chosen for j is categorical rows 7,8,... depict the
value subset chosen for j
- :return: Matrix C containing the number of times samples are chosen in
each tree of the random forest
- :return: Mappings from scale feature ids to global feature ids
- :return: Mappings from categorical feature ids to global feature ids
+ :param sample_frac: Sample fraction of examples for each tree in the forest
+ :param feature_frac: Sample fraction of features for each tree in the
forest
+ :param max_depth: Maximum depth of the learned tree (stopping criterion)
+ :param min_leaf: Minimum number of samples in leaf nodes (stopping
criterion)
+ :param min_split: Minimum number of samples in leaf for attempting a split
+ :param max_features: Parameter controlling the number of features used as
split
+ candidates at tree nodes: m = ceil(num_features^max_features)
+ :param max_values: Parameter controlling the number of values per feature
used
+ as split candidates: nb = ceil(num_values^max_values)
+ :param impurity: Impurity measure: entropy, gini (default), rss
(regression)
+ :param seed: Fixed seed for randomization of samples and split candidates
+ :param verbose: Flag indicating verbose debug output
+ :return: Matrix M containing the learned trees, in linearized form
+ For example, given a feature matrix with features [a,b,c,d]
+ and the following two trees, M would look as follows:
+ (L1) |a<7| |d<5|
+ / \ / \
+ (L2) |c<3| |b<4| |a<7| P3:2
+ / \ / \ / \
+ (L3) P1:2 P2:1 P3:1 P4:2 P1:2 P2:1
+ --> M :=
+ [[1, 7, 3, 3, 2, 4, 0, 2, 0, 1, 0, 1, 0, 2], (1st tree)
+ [4, 5, 1, 7, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0]] (2nd tree)
+ |(L1)| | (L2) | | (L3) |
+ With feature sampling (feature_frac < 1), each tree is
+ prefixed by a one-hot vector of sampled features
+ (e.g., [1,1,1,0] if we sampled a,b,c of the four features)
"""
- params_dict = {'X': X, 'Y': Y, 'R': R}
+ params_dict = {'X': X, 'y': y, 'ctypes': ctypes}
params_dict.update(kwargs)
-
- vX_0 = Matrix(X.sds_context, '')
- vX_1 = Matrix(X.sds_context, '')
- vX_2 = Matrix(X.sds_context, '')
- vX_3 = Matrix(X.sds_context, '')
- output_nodes = [vX_0, vX_1, vX_2, vX_3, ]
-
- op = MultiReturn(X.sds_context, 'randomForest', output_nodes,
named_input_nodes=params_dict)
-
- vX_0._unnamed_input_nodes = [op]
- vX_1._unnamed_input_nodes = [op]
- vX_2._unnamed_input_nodes = [op]
- vX_3._unnamed_input_nodes = [op]
-
- return op
+ return Matrix(X.sds_context,
+ 'randomForest',
+ named_input_nodes=params_dict)
diff --git a/src/main/python/systemds/operator/algorithm/builtin/scale.py
b/src/main/python/systemds/operator/algorithm/builtin/randomForestPredict.py
similarity index 55%
copy from src/main/python/systemds/operator/algorithm/builtin/scale.py
copy to
src/main/python/systemds/operator/algorithm/builtin/randomForestPredict.py
index 015709d8c6..c7a598faa5 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/scale.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/randomForestPredict.py
@@ -20,7 +20,7 @@
# -------------------------------------------------------------
# Autogenerated By : src/main/python/generator/generator.py
-# Autogenerated From : scripts/builtin/scale.dml
+# Autogenerated From : scripts/builtin/randomForestPredict.dml
from typing import Dict, Iterable
@@ -29,33 +29,28 @@ from systemds.script_building.dag import OutputType
from systemds.utils.consts import VALID_INPUT_TYPES
-def scale(X: Matrix,
- **kwargs: Dict[str, VALID_INPUT_TYPES]):
+def randomForestPredict(X: Matrix,
+ ctypes: Matrix,
+ M: Matrix,
+ **kwargs: Dict[str, VALID_INPUT_TYPES]):
"""
- This function scales and center individual features in the input matrix
(column wise.) using z-score to scale the values.
+ This script implements random forest prediction for recoded and binned
+ categorical and numerical input features.
- :param X: Input feature matrix
- :param center: Indicates whether or not to center the feature matrix
- :param scale: Indicates whether or not to scale the feature matrix
- :return: Output feature matrix with K columns
- :return: The column means of the input, subtracted if Center was TRUE
- :return: The Scaling of the values, to make each dimension have similar
value ranges
+ :param X: Feature matrix in recoded/binned representation
+ :param y: Label matrix in recoded/binned representation,
+ optional for accuracy evaluation
+ :param ctypes: Row-Vector of column types [1 scale/ordinal, 2 categorical]
+ :param M: Matrix M holding the learned trees (one tree per row),
+ see randomForest() for the detailed tree representation.
+ :param verbose: Flag indicating verbose debug output
+ :return: Label vector of predictions
"""
- params_dict = {'X': X}
+ params_dict = {'X': X, 'ctypes': ctypes, 'M': M}
params_dict.update(kwargs)
-
- vX_0 = Matrix(X.sds_context, '')
- vX_1 = Matrix(X.sds_context, '')
- vX_2 = Matrix(X.sds_context, '')
- output_nodes = [vX_0, vX_1, vX_2, ]
-
- op = MultiReturn(X.sds_context, 'scale', output_nodes,
named_input_nodes=params_dict)
-
- vX_0._unnamed_input_nodes = [op]
- vX_1._unnamed_input_nodes = [op]
- vX_2._unnamed_input_nodes = [op]
-
- return op
+ return Matrix(X.sds_context,
+ 'randomForestPredict',
+ named_input_nodes=params_dict)
diff --git a/src/main/python/systemds/operator/algorithm/builtin/scale.py
b/src/main/python/systemds/operator/algorithm/builtin/scale.py
index 015709d8c6..33203fafb6 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/scale.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/scale.py
@@ -32,16 +32,23 @@ from systemds.utils.consts import VALID_INPUT_TYPES
def scale(X: Matrix,
**kwargs: Dict[str, VALID_INPUT_TYPES]):
"""
- This function scales and center individual features in the input matrix
(column wise.) using z-score to scale the values.
+ This function scales and centers individual features in the input
+ matrix (column wise.) using z-score to scale the values.
+ The transformation is sometimes also called scale and shift,
+ but it is shifted first and then subsequently scaled.
+
+ The method is not resistant to inputs containing NaN nor overflows
+ of doubles, but handles them by guaranteeing that no extra NaN values
+ are introduced and columns that contain NaN will not be scaled or shifted.
:param X: Input feature matrix
- :param center: Indicates whether or not to center the feature matrix
- :param scale: Indicates whether or not to scale the feature matrix
- :return: Output feature matrix with K columns
+ :param center: Indicates to center the feature matrix
+ :param scale: Indicates to scale the feature matrix according to z-score
+ :return: Output feature matrix scaled and shifted
:return: The column means of the input, subtracted if Center was TRUE
- :return: The Scaling of the values, to make each dimension have similar
value ranges
+ :return: The scaling of the values, to make each dimension have similar
value ranges
"""
params_dict = {'X': X}
diff --git a/src/main/python/systemds/operator/algorithm/builtin/slicefinder.py
b/src/main/python/systemds/operator/algorithm/builtin/slicefinder.py
index a8c34cc0b9..2ca2991391 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/slicefinder.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/slicefinder.py
@@ -41,8 +41,8 @@ def slicefinder(X: Matrix,
- :param X: Recoded dataset into Matrix
- :param e: Trained model
+ :param X: Feature matrix in recoded/binned representation
+ :param e: Error vector of trained model
:param k: Number of subsets required
:param maxL: maximum level L (conjunctions of L predicates), 0 unlimited
:param minSup: minimum support (min number of rows per slice)