This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push: new e929457 [SYSTEMDS-2682/3] New Lasso and PPCA built-in functions e929457 is described below commit e929457e2f1850e6f1c2b5c490523f9526e51be5 Author: Sven Celin <sven.ce...@gmail.com> AuthorDate: Sat Oct 10 19:38:10 2020 +0200 [SYSTEMDS-2682/3] New Lasso and PPCA built-in functions AMLS project SS2020. Closes #1071. --- scripts/builtin/lasso.dml | 126 +++++++++++++++++ scripts/builtin/ppca.dml | 149 +++++++++++++++++++++ .../java/org/apache/sysds/common/Builtins.java | 2 + .../test/functions/builtin/BuiltinLassoTest.java | 70 ++++++++++ .../test/functions/builtin/BuiltinPPCATest.java | 74 ++++++++++ src/test/scripts/functions/builtin/lasso.dml | 25 ++++ src/test/scripts/functions/builtin/ppca.dml | 26 ++++ 7 files changed, 472 insertions(+) diff --git a/scripts/builtin/lasso.dml b/scripts/builtin/lasso.dml new file mode 100644 index 0000000..7b6b464 --- /dev/null +++ b/scripts/builtin/lasso.dml @@ -0,0 +1,126 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Builtin function for the SpaRSA algorithm to perform lasso regression +# (SpaRSA .. Sparse Reconstruction by Separable Approximation) +# +# INPUTS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X Double --- input feature matrix +# y Double --- matrix Y columns of the design matrix +# tol Double 1e-15 target convergence tolerance +# M Integer 5 history length +# tau Double 1 regularization component +# maxi Integer 100 maximum number of iterations until convergence +# --------------------------------------------------------------------------------------------- +# OUTPUTS +# w Double --- model matrix + + +m_lasso = function(Matrix[Double] X, Matrix[Double] y, Double tol = 10^(-15), + Integer M = 5, Double tau = 1, Integer maxi = 100, Boolean verbose = TRUE) + return(Matrix[Double] w) +{ + n = nrow(X) + m = ncol(X) + + #constants + eta = 2 + sigma = 0.01 + alpha_min = 10^(-30) + alpha_max = 10^(30) + + #init + alpha = 1 + w = Rand(rows=m, cols=1, min=0, max=1, pdf="uniform") + history = -1e30 * matrix(1, rows=M, cols=1) + + r = X %*% w - y + g = t(X) %*% r + obj = 0.5 * sum(r*r) + tau*sum(abs(w)) + + if( verbose ) + print("Initial OBJ=" + obj) + + history[M,1] = obj + + inactive_set = matrix(1, rows=m, cols=1) + iter = 0 + continue = TRUE + while(iter < maxiter & continue) { + dw = matrix(0, rows=m, cols=1) + dg = matrix(0, rows=m, cols=1) + relChangeObj = -1.0 + + inner_iter = 0 + inner_continue = TRUE + inner_maxiter = 100 + while(inner_iter < inner_maxiter & inner_continue) { + u = w - g/alpha + lambda = tau/alpha + + wnew = sign(u) * (abs(u) - lambda) * ((abs(u) - lambda) > 0) + dw = wnew - w + dw2 = sum(dw*dw) + + r = X %*% wnew - y + gnew = t(X) %*% r + objnew = 0.5 * sum(r*r) + tau*sum(abs(wnew)) + obj_threshold = max(history) - 0.5*sigma*alpha*dw2 + + if(objnew <= obj_threshold) { + w = wnew + dg = gnew - g + g = gnew + inner_continue = FALSE + + history[1:(M-1),] = history[2:M,] + history[M,1] = objnew + relChangeObj = abs(objnew - obj)/obj + obj = objnew + } + else + alpha = eta*alpha + + inner_iter = inner_iter + 1 + } + + if(inner_continue) + print("Inner loop did not converge") + + alphanew = sum(dw*dg)/sum(dw*dw) + alpha = max(alpha_min, min(alpha_max, alphanew)) + + old_inactive_set = inactive_set + inactive_set = w != 0 + diff = sum(abs(old_inactive_set - inactive_set)) + + if(diff == 0 & relChangeObj < tol) + continue = FALSE + + num_inactive = sum(w != 0) + if( verbose ) + print("ITER=" + iter + " OBJ=" + obj + " relative change=" + relChangeObj + " num_inactive=" + num_inactive) + iter = iter + 1 + } +} diff --git a/scripts/builtin/ppca.dml b/scripts/builtin/ppca.dml new file mode 100644 index 0000000..2683f90 --- /dev/null +++ b/scripts/builtin/ppca.dml @@ -0,0 +1,149 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# This script performs Probabilistic Principal Component Analysis (PCA) on the given input data. +# It is based on paper: sPCA: Scalable Principal Component Analysis for Big Data on Distributed +# Platforms. Tarek Elgamal et.al. + +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X Matrix --- n x m input feature matrix +# k Integer --- indicates dimension of the new vector space constructed from eigen vectors +# maxi Integer --- maximum number of iterations until convergence +# tolobj Double 0.00001 objective function tolerance value to stop ppca algorithm +# tolrecerr Double 0.02 reconstruction error tolerance value to stop the algorithm +# verbose Boolen TRUE verbose debug output +# --------------------------------------------------------------------------------------------- +# OUTPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# Xout Matrix --- Output feature matrix with K columns +# Mout Matrix --- Output dominant eigen vectors (can be used for projections) + + +m_ppca = function(Matrix[Double] X, Integer K=2, Integer maxi = 10, + Double tolobj = 0.00001, Double tolrecerr = 0.02, Boolean verbose = TRUE) + return(Matrix[Double] Xout, Matrix[Double] Mout) +{ + n = nrow(X); + m = ncol(X); + + #initializing principal components matrix + C = rand(rows=m, cols=K, pdf="normal"); + ss = rand(rows=1, cols=1, pdf="normal"); + ss = as.scalar(ss); + ssPrev = ss; + + # best selected principle components - with the lowest reconstruction error + PC = C; + + # initilizing reconstruction error + RE = tolrecerr+1; + REBest = RE; + + Z = matrix(0,rows=1,cols=1); + + #Objective function value + ObjRelChng = tolobj+1; + + # mean centered input matrix - dim -> [n,m] + Xm = X - colMeans(X); + + #I -> k x k + ITMP = matrix(1,rows=K,cols=1); + I = diag(ITMP); + + i = 0; + while (i < maxi & ObjRelChng > tolobj & RE > tolrecerr){ + #Estimation step - Covariance matrix + #M -> k x k + M = t(C) %*% C + I*ss; + + #Auxilary matrix with n latent variables + # Z -> n x k + Z = Xm %*% (C %*% inv(M)); + + #ZtZ -> k x k + ZtZ = t(Z) %*% Z + inv(M)*ss; + + #XtZ -> m x k + XtZ = t(Xm) %*% Z; + + #Maximization step + #C -> m x k + ZtZ_sum = sum(ZtZ); #+n*inv(M)); + C = XtZ/ZtZ_sum; + + #ss2 -> 1 x 1 + ss2 = trace(ZtZ * (t(C) %*% C)); + + #ss3 -> 1 x 1 + ss3 = sum((Z %*% t(C)) %*% t(Xm)); + + #Frobenius norm of reconstruction error -> Euclidean norm + #Fn -> 1 x 1 + Fn = sum(Xm*Xm); + + #ss -> 1 x 1 + ss = (Fn + ss2 - 2*ss3)/(n*m); + + #calculating objective function relative change + ObjRelChng = abs(1 - ss/ssPrev); + #print("Objective Relative Change: " + ObjRelChng + ", Objective: " + ss); + + #Reconstruction error + R = ((Z %*% t(C)) - Xm); + + #calculate the error + #TODO rethink calculation of reconstruction error .... + #1-Norm of reconstruction error - a big dense matrix + #RE -> n x m + RE = abs(sum(R)/sum(Xm)); + if (RE < REBest){ + PC = C; + REBest = RE; + } + #print("ss: " + ss +" = Fn( "+ Fn +" ) + ss2( " + ss2 +" ) - 2*ss3( " + ss3 + " ), Reconstruction Error: " + RE); + + ssPrev = ss; + i = i+1; + } + if( verbose ) + print("Objective Relative Change: " + ObjRelChng); + if( verbose ) + print ("Number of iterations: " + i + ", Reconstruction Err: " + REBest); + + # reconstructs data + # RD -> n x k + Xout = X %*% PC; + + # calculate eigenvalues - principle component variance + RDMean = colMeans(Xout); + V = t(colMeans(Xout^2) - (RDMean^2)); + + # sorting eigenvalues and eigenvectors in decreasing order + V_decr_idx = order(target=V,by=1,decreasing=TRUE,index.return=TRUE); + VF_decr = table(seq(1,nrow(V)),V_decr_idx); + Mout = PC %*% VF_decr; # vectors (values via VF_decr %*% V) +} diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java index 92f6e48..157a411 100644 --- a/src/main/java/org/apache/sysds/common/Builtins.java +++ b/src/main/java/org/apache/sysds/common/Builtins.java @@ -120,6 +120,7 @@ public enum Builtins { ISINF("is.infinite", false), KMEANS("kmeans", true), L2SVM("l2svm", true), + LASSO("lasso", true), LENGTH("length", false), LINEAGE("lineage", false), LIST("list", false), //note: builtin and parbuiltin @@ -155,6 +156,7 @@ public enum Builtins { OUTLIER_IQR("outlierByIQR", true), PCA("pca", true), PNMF("pnmf", true), + PPCA("ppca", true), PPRED("ppred", false), PROD("prod", false), QR("qr", false, ReturnType.MULTI_RETURN), diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinLassoTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinLassoTest.java new file mode 100644 index 0000000..db94b04 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinLassoTest.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.functions.builtin; + +import org.apache.sysds.runtime.meta.MatrixCharacteristics; +import org.apache.sysds.test.AutomatedTestBase; +import org.apache.sysds.test.TestConfiguration; +import org.junit.Assert; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.List; + +public class BuiltinLassoTest extends AutomatedTestBase { + private final static String TEST_NAME = "lasso"; + private final static String TEST_DIR = "functions/builtin/"; + private final static String TEST_CLASS_DIR = TEST_DIR + BuiltinLassoTest.class.getSimpleName() + "/"; + + private final static int rows = 100; + private final static int cols = 10; + + @Override + public void setUp() { + addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{"B"})); + } + + @Test + public void testLasso() { + runLassoTest(); + } + + private void runLassoTest() { + loadTestConfiguration(getTestConfiguration(TEST_NAME)); + String HOME = SCRIPT_DIR + TEST_DIR; + fullDMLScriptName = HOME + TEST_NAME + ".dml"; + List<String> proArgs = new ArrayList<>(); + + proArgs.add("-args"); + proArgs.add(input("X")); + proArgs.add(input("y")); + proArgs.add(output("w")); + programArgs = proArgs.toArray(new String[proArgs.size()]); + double[][] X = getRandomMatrix(rows, cols, 0, 1, 0.8, -1); + double[][] y = getRandomMatrix(rows, 1, 0, 1, 0.8, -1); + writeInputMatrixWithMTD("X", X, true); + writeInputMatrixWithMTD("y", y, true); + + runTest(true, EXCEPTION_NOT_EXPECTED, null, -1); + MatrixCharacteristics mc = readDMLMetaDataFile("w"); + Assert.assertEquals(cols, mc.getRows()); + Assert.assertEquals(1, mc.getCols()); + } +} diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinPPCATest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinPPCATest.java new file mode 100644 index 0000000..c1b96cd --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinPPCATest.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.functions.builtin; + +import org.apache.sysds.runtime.meta.MatrixCharacteristics; +import org.apache.sysds.test.AutomatedTestBase; +import org.apache.sysds.test.TestConfiguration; +import org.junit.Assert; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.List; + +public class BuiltinPPCATest extends AutomatedTestBase { + private final static String TEST_NAME = "ppca"; + private final static String TEST_DIR = "functions/builtin/"; + private final static String TEST_CLASS_DIR = TEST_DIR + BuiltinPPCATest.class.getSimpleName() + "/"; + + private final static int rows = 200; + private final static int cols = 20; + + @Override + public void setUp() { + addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{"PC","V"})); + } + + @Test + public void testPpca4() { + runPPCATest(4); + } + + @Test + public void testPpca16() { + runPPCATest(16); + } + + private void runPPCATest(int k) { + loadTestConfiguration(getTestConfiguration(TEST_NAME)); + String HOME = SCRIPT_DIR + TEST_DIR; + fullDMLScriptName = HOME + TEST_NAME + ".dml"; + List<String> proArgs = new ArrayList<>(); + + proArgs.add("-args"); + proArgs.add(input("X")); + proArgs.add(String.valueOf(k)); + proArgs.add(output("PC")); + proArgs.add(output("V")); + programArgs = proArgs.toArray(new String[proArgs.size()]); + double[][] X = getRandomMatrix(rows, cols, 0, 1, 0.8, -1); + writeInputMatrixWithMTD("X", X, true); + + runTest(true, EXCEPTION_NOT_EXPECTED, null, -1); + MatrixCharacteristics mc = readDMLMetaDataFile("PC"); + Assert.assertEquals(rows, mc.getRows()); + Assert.assertEquals(k, mc.getCols()); + } +} diff --git a/src/test/scripts/functions/builtin/lasso.dml b/src/test/scripts/functions/builtin/lasso.dml new file mode 100644 index 0000000..0a2e3e9 --- /dev/null +++ b/src/test/scripts/functions/builtin/lasso.dml @@ -0,0 +1,25 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +X = read($1) +y = read($2) +w = lasso(X = X, y = y) +write(w, $3) \ No newline at end of file diff --git a/src/test/scripts/functions/builtin/ppca.dml b/src/test/scripts/functions/builtin/ppca.dml new file mode 100644 index 0000000..2699ccc --- /dev/null +++ b/src/test/scripts/functions/builtin/ppca.dml @@ -0,0 +1,26 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +X = read($1) +k = $2; +[PC, V] = ppca(X=X, K=k) +write(PC, $3) +write(V, $4)