This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new c14430a  [SYSTEMDS-2592] New built-in function cor (correlation matrix)
c14430a is described below

commit c14430a1895855422c4ced0b81089eb16eed2d00
Author: Olga Ovcharenko <[email protected]>
AuthorDate: Tue Jul 28 14:06:05 2020 +0200

    [SYSTEMDS-2592] New built-in function cor (correlation matrix)
    
    Closes #1002.
---
 docs/site/run_issues.md                            |  6 ++
 pom.xml                                            | 13 ++--
 scripts/builtin/cor.dml                            | 25 +++++++
 .../java/org/apache/sysds/common/Builtins.java     | 43 +++++------
 .../builtin/BuiltinCorrelationMatrixTest.java      | 87 ++++++++++++++++++++++
 .../scripts/functions/builtin/correlationMatrix.R  | 27 +++++++
 .../functions/builtin/correlationMatrix.dml        | 24 ++++++
 7 files changed, 198 insertions(+), 27 deletions(-)

diff --git a/docs/site/run_issues.md b/docs/site/run_issues.md
new file mode 100644
index 0000000..6cc931b
--- /dev/null
+++ b/docs/site/run_issues.md
@@ -0,0 +1,6 @@
+Error: `Could not find or load main class org.apache.sysds.api.DMLScript`
+
+Solution for macOS: install `realpath` (provided by the `coreutils` package) with Homebrew:
+```bash
+brew install coreutils
+```
diff --git a/pom.xml b/pom.xml
index 063e532..a5e5f92 100644
--- a/pom.xml
+++ b/pom.xml
@@ -176,8 +176,8 @@
                                        </execution>
                                </executions>
                                <configuration>
-                                       <!-- Include signature files so that 
recent versions of Java will run 
-                                               the resulting jar without 
complaining about "Invalid signature file digest 
+                                       <!-- Include signature files so that 
recent versions of Java will run
+                                               the resulting jar without 
complaining about "Invalid signature file digest
                                                for Manifest main attributes".
                                                Furthermore, the excluded 
notice and license files will be explicitly
                                                added by the resource 
transformers above -->
@@ -250,7 +250,7 @@
                                </executions>
                        </plugin>
 
-                       
+
                        <plugin> <!-- unit tests -->
                                <groupId>org.apache.maven.plugins</groupId>
                                <artifactId>maven-surefire-plugin</artifactId>
@@ -521,6 +521,7 @@
                                                                
<exclude>src/main/python/docs/build/**/*</exclude>
                                                                
<exclude>docs/api/**/*</exclude>
                                                                
<exclude>docs/_site/**/*</exclude>
+                                                               
<exclude>docs/site/run_issues.md</exclude>
                                                                
<exclude>docs/.jekyll-cache/**/*</exclude>
                                                                
<exclude>docs/css/bootstrap.min.css</exclude>
                                                                
<exclude>docs/css/pygments-default.css</exclude>
@@ -573,7 +574,7 @@
                </profile>
 
                <profile>
-                       <!-- Profile to create binary distributions. Execute 
with `mvn clean package 
+                       <!-- Profile to create binary distributions. Execute 
with `mvn clean package
                                -P distribution` -->
                        <id>distribution</id>
                        <build>
@@ -710,7 +711,7 @@
                        </build>
                </profile>
        </profiles>
-       
+
        <dependencies>
                <dependency>
                        <groupId>org.jcuda</groupId>
@@ -970,7 +971,7 @@
                        <version>0.10</version>
                        <scope>test</scope>
                </dependency>
-               
+
                <dependency>
                        <!--Used for annotations in tests to execute tests in 
thread safe manner-->
                        <groupId>com.github.stephenc.jcip</groupId>
diff --git a/scripts/builtin/cor.dml b/scripts/builtin/cor.dml
new file mode 100644
index 0000000..ea7cf53
--- /dev/null
+++ b/scripts/builtin/cor.dml
@@ -0,0 +1,25 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+m_cor = function(Matrix[Double] X) return (Matrix[Double] Y) {
+  S = colSds(X);          # column standard deviations, reused for the scaling below
+  Xc = X - colMeans(X);   # column-centered copy of X
+  Y = ((t(Xc) %*% Xc) / (nrow(X)-1)) / (t(S) %*% S);  # covariance matrix over outer product of std-devs
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java 
b/src/main/java/org/apache/sysds/common/Builtins.java
index 53fc39e..b6733d2 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -31,7 +31,7 @@ import org.apache.sysds.common.Types.ReturnType;
  * case of DML script, these functions are loaded during parsing. As
  * always, user-defined DML-bodied functions take precedence over all
  * builtin functions.
- * 
+ *
  * To add a new builtin script function, simply add the definition here
  * as well as a dml file in scripts/builtin with a matching name. On 
  * building SystemDS, these scripts are packaged into the jar as well.
@@ -82,6 +82,7 @@ public enum Builtins {
        CUMSUM("cumsum", false),
        CUMSUMPROD("cumsumprod", false),
        CONFUSIONMATRIX("confusionMatrix", true),
+       COR("cor", true),
        DETECTSCHEMA("detectSchema", false),
        DIAG("diag", false),
        DISCOVER_FD("discoverFD", true),
@@ -186,7 +187,7 @@ public enum Builtins {
        VAR("var", false),
        XOR("xor", false),
        WINSORIZE("winsorize", true, false), //TODO parameterize w/ prob, 
min/max val
-       
+
        //parameterized builtin functions
        CDF("cdf", false, true),
        GROUPEDAGG("aggregate", "groupedAggregate", false, true),
@@ -217,27 +218,27 @@ public enum Builtins {
        TRANSFORMENCODE("transformencode", false, true),
        TRANSFORMMETA("transformmeta", false, true),
        UPPER_TRI("upper.tri", false, true);
-       
+
        Builtins(String name, boolean script) {
                this(name, null, script, false, ReturnType.SINGLE_RETURN);
        }
-       
+
        Builtins(String name, boolean script, ReturnType retType) {
                this(name, null, script, false, retType);
        }
-       
+
        Builtins(String name, boolean script, boolean parameterized) {
                this(name, null, script, parameterized, 
ReturnType.SINGLE_RETURN);
        }
-       
+
        Builtins(String name, String alias, boolean script) {
                this(name, alias, script, false, ReturnType.SINGLE_RETURN);
        }
-       
+
        Builtins(String name, String alias, boolean script, boolean 
parameterized) {
                this(name, alias, script, parameterized, 
ReturnType.SINGLE_RETURN);
        }
-       
+
        Builtins(String name, String alias, boolean script, boolean 
parameterized, ReturnType retType) {
                _name = name;
                _alias = alias;
@@ -245,10 +246,10 @@ public enum Builtins {
                _parameterized = parameterized;
                _retType = retType;
        }
-       
+
        private final static String BUILTIN_DIR = "scripts/builtin/";
        private final static HashMap<String, Builtins> _map = new HashMap<>();
-       
+
        static {
                //materialize lookup map for all builtin names
                for( Builtins b : EnumSet.allOf(Builtins.class) ) {
@@ -257,52 +258,52 @@ public enum Builtins {
                                _map.put(b.getAlias(), b);
                }
        }
-       
+
        private final String _name;
        private final String _alias;
        private final boolean _script;
        private final boolean _parameterized;
        private final ReturnType _retType;
-       
+
        public String getName() {
                return _name;
        }
-       
+
        public String getAlias() {
                return _alias;
        }
-       
+
        public boolean isScript() {
                return _script;
        }
-       
+
        public boolean isParameterized() {
                return _parameterized;
        }
-       
+
        public boolean isMultiReturn() {
                return _retType == ReturnType.MULTI_RETURN;
        }
-       
+
        public static boolean contains(String name, boolean script, boolean 
parameterized) {
                Builtins tmp = get(name);
                return tmp != null && script == tmp.isScript()
                        && parameterized == tmp.isParameterized();
        }
-       
+
        public static Builtins get(String name) {
                if( name.equals("list") )
                        return LIST; //unparameterized
                return _map.get(name);
        }
-       
+
        public static Builtins get(String name, boolean params) {
                if( name.equals("list") )
                        return params ? LISTNV : LIST;
                Builtins tmp = get(name);
                return tmp != null && (params == tmp.isParameterized()) ? tmp : 
null;
        }
-       
+
        public static String getFilePath(String name) {
                StringBuilder sb = new StringBuilder();
                sb.append(BUILTIN_DIR);
@@ -310,7 +311,7 @@ public enum Builtins {
                sb.append(".dml");
                return sb.toString();
        }
-       
+
        public static String getInternalFName(String name, DataType dt) {
                return (dt.isMatrix() ? "m_" : "s_") + name;
        }
diff --git 
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinCorrelationMatrixTest.java
 
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinCorrelationMatrixTest.java
new file mode 100644
index 0000000..26e3b02
--- /dev/null
+++ 
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinCorrelationMatrixTest.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin;
+
+import org.apache.sysds.common.Types.ExecMode;
+import org.apache.sysds.lops.LopProperties.ExecType;
+import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+/** Compares the output of correlationMatrix.dml with correlationMatrix.R on a randomly generated input matrix. */
+public class BuiltinCorrelationMatrixTest extends AutomatedTestBase {
+       private static final String TEST_NAME = "correlationMatrix";
+       private static final String TEST_DIR = "functions/builtin/";
+       private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinCorrelationMatrixTest.class.getSimpleName() + "/";
+
+       private static final double eps = 1e-3; // tolerance passed to TestUtils.compareMatrices
+       private static final int rows = 1765;
+       private static final int cols = 10;
+       private static final double spDense = 0.99;
+
+       @Override
+       public void setUp() {
+               addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{"B"}));
+       }
+
+       @Test
+       public void testCorrelationMatrixDefaultCP() {
+               runCorrelationMatrix(ExecType.CP);
+       }
+
+       @Test
+       public void testCorrelationMatrixDefaultSP() {
+               runCorrelationMatrix(ExecType.SPARK);
+       }
+
+       /** Runs the DML and R scripts on the same input "A" under the given execution type and compares their outputs "B". */
+       private void runCorrelationMatrix(ExecType instType) {
+               ExecMode platformOld = setExecMode(instType);
+               try {
+                       loadTestConfiguration(getTestConfiguration(TEST_NAME));
+
+                       String HOME = SCRIPT_DIR + TEST_DIR;
+                       fullDMLScriptName = HOME + TEST_NAME + ".dml";
+                       programArgs = new String[]{"-args", input("A"), output("B")};
+                       fullRScriptName = HOME + TEST_NAME + ".R";
+                       rCmd = "Rscript" + " " + fullRScriptName + " " + inputDir() + " " + expectedDir();
+
+                       //generate random input dataset
+                       double[][] A = getRandomMatrix(rows, cols, -1, 1, spDense, 7);
+                       writeInputMatrixWithMTD("A", A, true);
+
+                       //execute the DML script and the R reference script
+                       runTest(true, false, null, -1);
+                       runRScript(true);
+
+                       //compare matrices
+                       HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("B");
+                       HashMap<CellIndex, Double> rfile = readRMatrixFromFS("B");
+                       TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+               }
+               finally {
+                       rtplatform = platformOld;
+               }
+       }
+}
diff --git a/src/test/scripts/functions/builtin/correlationMatrix.R 
b/src/test/scripts/functions/builtin/correlationMatrix.R
new file mode 100644
index 0000000..57bc080
--- /dev/null
+++ b/src/test/scripts/functions/builtin/correlationMatrix.R
@@ -0,0 +1,27 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# Reference result: read A.mtx from the directory in args[1], write its correlation matrix as B under args[2]
+args <- commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+X <- as.matrix(readMM(paste0(args[1], "A.mtx")))
+R <- cor(X)
+writeMM(as(R, "CsparseMatrix"), paste0(args[2], "B"))
\ No newline at end of file
diff --git a/src/test/scripts/functions/builtin/correlationMatrix.dml 
b/src/test/scripts/functions/builtin/correlationMatrix.dml
new file mode 100644
index 0000000..d3a9b48
--- /dev/null
+++ b/src/test/scripts/functions/builtin/correlationMatrix.dml
@@ -0,0 +1,24 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# read the matrix given as $1, apply the cor builtin, and write the result to $2
+X = read($1);
+write(cor(X), $2);

Reply via email to