Repository: systemml Updated Branches: refs/heads/master 900d8c926 -> 317f2189c
[SYSTEMML-493] Functionalize PCA Changes include wrapping PCA computation to a function call. Improve docs and add default values in genRandData4PCA.dml. Closes #653. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/317f2189 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/317f2189 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/317f2189 Branch: refs/heads/master Commit: 317f2189c585e2dd37d847f5e2e32d97f8b31a60 Parents: 900d8c9 Author: krishnakalyan3 <[email protected]> Authored: Fri Sep 22 10:51:44 2017 -0700 Committer: Deron Eriksson <[email protected]> Committed: Fri Sep 22 10:51:44 2017 -0700 ---------------------------------------------------------------------- scripts/datagen/genRandData4PCA.dml | 43 ++++++------ scripts/staging/PCA.dml | 117 +++++++++++++++++++++++++++++++ 2 files changed, 138 insertions(+), 22 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/317f2189/scripts/datagen/genRandData4PCA.dml ---------------------------------------------------------------------- diff --git a/scripts/datagen/genRandData4PCA.dml b/scripts/datagen/genRandData4PCA.dml index 382a362..058facb 100644 --- a/scripts/datagen/genRandData4PCA.dml +++ b/scripts/datagen/genRandData4PCA.dml @@ -19,28 +19,27 @@ # #------------------------------------------------------------- -/* -Synthetic data generator for PCA. 
--> 3 hidden dimensions (V1, V2, V3) --> generates only "dense" data - ---------------------------------- - Parameters ---------------------------------- -$R = #rows -$C = #columns -$OUT = output file path on HDFS -$FMT = output format ---------------------------------- -hadoop jar SystemML.jar -f genRandData4PCA.dml -nvargs R=1000000 C=1000 DATA=/user/biuser/pcaData.mtx FMT=csv ---------------------------------- -*/ - -FMT = ifdef($FMT,"binary"); # default output format - -# number of categorical attributes.. numC <= C -R = $R; -C = $C; +# +# Synthetic data generator for PCA +# 3 hidden dimensions (V1, V2, V3) +# generates only "dense" data +# +# INPUT PARAMETERS: +# -------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# -------------------------------------------------------------------------------------------- +# R Int 10000 Number of rows +# C Int 1000 Number of columns +# OUT String --- Location (on HDFS) to store the generated dataset +# FMT String "csv" Matrix output format, usually "text", "csv" or "binary" +# -------------------------------------------------------------------------------------------- +# +# Example: +# hadoop jar SystemML.jar -f genRandData4PCA.dml -nvargs R=1000000 C=1000 OUT=/user/biuser/pcaData.mtx FMT=csv + +R = ifdef ($R, 10000) +C = ifdef ($C, 1000) +FMT = ifdef ($FMT, "csv"); # Modified version of the procedure from Zou et al., "Sparse Principal Component Analysis", 2006.
http://git-wip-us.apache.org/repos/asf/systemml/blob/317f2189/scripts/staging/PCA.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/PCA.dml b/scripts/staging/PCA.dml new file mode 100644 index 0000000..d73c2e4 --- /dev/null +++ b/scripts/staging/PCA.dml @@ -0,0 +1,117 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# This script performs Principal Component Analysis (PCA) on the given input data. 
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME      TYPE    DEFAULT  MEANING
+# ---------------------------------------------------------------------------------------------
+# INPUT     String  ---      Location to read the matrix A of feature vectors
+# K         Int     ncol(A)  Dimension of the new vector space constructed from eigenvectors
+# CENTER    Int     0        Indicates whether or not to center data
+# SCALE     Int     0        Indicates whether or not to scale data
+# OFMT      String  "CSV"    Output data format
+# PROJDATA  Int     0        Indicates whether the data should be projected onto the new basis
+# MODEL     String  ""       Location of an already existing model (dominant eigenvectors);
+#                            if given, the model is reused and INPUT is projected with it
+# OUTPUT    String  "/"      Location to write output matrices (dominant eigenvalues, their
+#                            standard deviations, new basis vectors, and projected data)
+# ---------------------------------------------------------------------------------------------
+#
+# Example:
+# hadoop jar SystemML.jar -f PCA.dml -nvargs INPUT=INPUT_DIR/pca-1000x1000
+#   OUTPUT=OUTPUT_DIR/pca-1000x1000-model PROJDATA=1 CENTER=1 SCALE=1
+#
+
+# Computes the K dominant eigenvalues/eigenvectors of the covariance matrix of A.
+#
+# Inputs:
+#   A            input matrix; rows are treated as observations, columns as features
+#   K            number of dominant components to keep, 1 <= K <= ncol(A)
+#   center       1 to subtract the column means from A before the decomposition
+#   scale        1 to divide the columns of A by sqrt(cvars) (see below)
+#   ofmt, projectData, model, output
+#                not used by the computation; kept so existing callers keep working
+#
+# Outputs:
+#   eval_dominant  K x 1 matrix of eigenvalues in decreasing order
+#   evec_dominant  ncol(A) x K matrix whose columns are the matching eigenvectors
+PCA = function(matrix[double] A, integer K, string ofmt, integer projectData, string model, integer center, integer scale, string output)
+  return(matrix[double] eval_dominant, matrix[double] evec_dominant) {
+
+  N = nrow(A);
+  D = ncol(A);
+
+  # fail early with a clear message instead of an index-out-of-bounds error below
+  if (K < 1 | K > D) {
+    stop("PCA: K must be between 1 and " + D + " but was " + K);
+  }
+
+  # perform z-scoring (centering and scaling)
+  if (center == 1) {
+    cm = colMeans(A);
+    A = A - cm;
+  }
+  if (scale == 1) {
+    # without centering, cvars is the plain column sum of squares
+    cvars = colSums(A^2);
+    if (center == 1) {
+      # with centering, cvars becomes the sample variance of each column
+      cm = colMeans(A);
+      cvars = (cvars - N*(cm^2))/(N-1);
+    }
+    A = A/sqrt(cvars);
+  }
+
+  # co-variance matrix
+  mu = colSums(A)/N;
+  C = (t(A) %*% A)/(N-1) - (N/(N-1))*t(mu) %*% mu;
+
+  # compute eigen vectors and values
+  [evalues, evectors] = eigen(C);
+
+  # diagmat[i, idx[i]] = 1, so (diagmat %*% v)[i] = v[idx[i]]
+  decreasing_Idx = order(target=evalues, by=1, decreasing=TRUE, index.return=TRUE);
+  diagmat = table(seq(1,D), decreasing_Idx);
+  # sorts eigenvalues by decreasing order
+  evalues = diagmat %*% evalues;
+  # sorts eigenvectors column-wise in the order of decreasing eigenvalues;
+  # columns need the transposed permutation so that column i becomes evectors[,idx[i]]
+  # (identical to the untransposed form only when the permutation is its own inverse)
+  evectors = evectors %*% t(diagmat);
+
+  # select K dominant eigen vectors
+  eval_dominant = evalues[1:K, 1];
+  evec_dominant = evectors[, 1:K];
+}
+
+A = read($INPUT);
+K = ifdef($K, ncol(A));
+ofmt = ifdef($OFMT, "CSV");
+projectData = ifdef($PROJDATA, 0);
+model = ifdef($MODEL, "");
+center = ifdef($CENTER, 0);
+scale = ifdef($SCALE, 0);
+output = ifdef($OUTPUT, "/");
+
+# remember whether a model was supplied before 'model' is redirected to 'output'
+reuseModel = (model != "");
+
+if (reuseModel) {
+  # reuse existing model to project data; nothing is recomputed or overwritten
+  evec_dominant = read(model+"/dominant.eigen.vectors");
+} else {
+  model = output;
+  [eval_dominant, evec_dominant] = PCA(A, K, ofmt, projectData, model, center, scale, output);
+
+  # the square root of eigenvalues
+  eval_stdev_dominant = sqrt(eval_dominant);
+  write(eval_stdev_dominant, model+"/dominant.eigen.standard.deviations", format=ofmt);
+  write(eval_dominant, model+"/dominant.eigen.values", format=ofmt);
+  write(evec_dominant, model+"/dominant.eigen.vectors", format=ofmt);
+}
+
+# Construct new data set by treating computed dominant eigenvectors as the basis vectors
+# NOTE(review): A here is the raw input; the z-scoring done inside PCA() is local to the
+# function and is not applied to the projected data - confirm this is intended.
+if (projectData == 1 | reuseModel) {
+  newA = A %*% evec_dominant;
+  write(newA, output+"/projected.data", format=ofmt);
+}
