This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemds.git
commit 41685600ba8370646301ef0997f170c300101cea Author: Matthias Boehm <[email protected]> AuthorDate: Sat Aug 15 13:20:08 2020 +0200 [SYSTEMDS-2619] New pca builtin function (principal component analysis) --- scripts/builtin/pca.dml | 64 ++++++++++++++++++++++ .../java/org/apache/sysds/common/Builtins.java | 1 + 2 files changed, 65 insertions(+) diff --git a/scripts/builtin/pca.dml b/scripts/builtin/pca.dml new file mode 100644 index 0000000..b968162 --- /dev/null +++ b/scripts/builtin/pca.dml @@ -0,0 +1,64 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# Principal Component Analysis (PCA) for dimensionality reduction +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X Matrix --- Input feature matrix +# K Int 2 Number of reduced dimensions (i.e., columns) +# center Boolean TRUE Indicates whether or not to center the feature matrix +# scale Boolean TRUE Indicates whether or not to scale the feature matrix +# --------------------------------------------------------------------------------------------- +# Xout Matrix --- Output feature matrix with K columns +# Mout Matrix --- Output dominant eigen vectors (can be used for projections) +# --------------------------------------------------------------------------------------------- + +m_pca = function(Matrix[Double] X, Integer K=2, Boolean center=TRUE, Boolean scale=TRUE) + return (Matrix[Double] Xout, Matrix[Double] Mout) +{ + N = nrow(X); + D = ncol(X); + + # perform z-scoring (centering and scaling) + X = scale(X, center, scale); + + # co-variance matrix + mu = colSums(X)/N; + C = (t(X) %*% X)/(N-1) - (N/(N-1))*t(mu) %*% mu; + + # compute eigen vectors and values + [evalues, evectors] = eigen(C); + + decreasing_Idx = order(target=evalues,by=1,decreasing=TRUE,index.return=TRUE); + diagmat = table(seq(1,D),decreasing_Idx); + # sorts eigenvalues in decreasing order + evalues = diagmat %*% evalues; + # sorts eigenvectors column-wise in the order of decreasing eigenvalues + evectors = evectors %*% diagmat; + + eval_dominant = evalues[1:K, 1]; + evec_dominant = evectors[,1:K]; + + # Construct new data set by treating computed dominant eigenvectors as the basis vectors + Xout = X %*% evec_dominant; + Mout = evec_dominant; +} diff --git a/src/main/java/org/apache/sysds/common/Builtins.java 
b/src/main/java/org/apache/sysds/common/Builtins.java index cc5b12b..1cd430c 100644 --- a/src/main/java/org/apache/sysds/common/Builtins.java +++ b/src/main/java/org/apache/sysds/common/Builtins.java @@ -145,6 +145,7 @@ public enum Builtins { OUTLIER("outlier", true, false), //TODO parameterize opposite OUTLIER_SD("outlierBySd", true), OUTLIER_IQR("outlierByIQR", true), + PCA("pca", true), PNMF("pnmf", true), PPRED("ppred", false), PROD("prod", false),
