This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git
The following commit(s) were added to refs/heads/master by this push:
new 50fac03 [SYSTEMDS-393] Builtin function for connected components,
tests
50fac03 is described below
commit 50fac03eed584c2019c95a4702367ddc442e7bf2
Author: Matthias Boehm <[email protected]>
AuthorDate: Sat May 23 22:09:04 2020 +0200
[SYSTEMDS-393] Builtin function for connected components, tests
This patch adds a new built-in function for finding the connected
components in a undirected graph, represented as a symmetric 0/1 matrix.
On a scenario of finding the connected components of the DBLP co-author
graph (for selected DB venues and >=2011 -> 35632x35632, 310582
non-zeros), the algorithm terminated in 12 iterations, found 2443
connected components (w/ 837 single-author components), and took only
4.1s including I/O, transform recoding, and graph construction.
---
docs/Tasks.txt | 1 +
scripts/builtin/components.dml | 53 ++++++++++++
.../java/org/apache/sysds/common/Builtins.java | 1 +
.../functions/builtin/BuiltinComponentsTest.java | 94 ++++++++++++++++++++++
.../functions/builtin/ConnectedComponents.dml | 28 +++++++
5 files changed, 177 insertions(+)
diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index 66d0901..6539ff8 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -301,6 +301,7 @@ SYSTEMDS-380 Memory Footprint
SYSTEMDS-390 New Builtin Functions IV
* 391 New GLM builtin function (from algorithms) OK
* 392 Builtin function for missing value imputation via FDs OK
+ * 393 Builtin to find Connected Components of a graph OK
Others:
* Break append instruction to cbind and rbind
diff --git a/scripts/builtin/components.dml b/scripts/builtin/components.dml
new file mode 100644
index 0000000..f760a49
--- /dev/null
+++ b/scripts/builtin/components.dml
@@ -0,0 +1,53 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Computes the connected components of a graph and returns a
+# vector indicating the assignment of vertices to components,
+# where each component is identified by the maximum vertex ID
+# (i.e., row/column position of the input graph)
+
+m_components = function(Matrix[Double] G, Integer maxi = 0, Boolean verbose =
TRUE)
+ return (Matrix[Double] C)
+{
+ # ensure there are no self-edges in the graph
+ if( trace(G) != 0 ) {
+ G = G - diag(diag(G));
+ if(verbose)
+ print("Connected Components: warning - removed self-edges from input
graph");
+ }
+
+ # initialize state with vertex ids
+ c = seq(1,nrow(G));
+ diff = Inf;
+ iter = 1;
+
+ # iterative computation of connected components
+ while( diff > 0 & (maxi==0 | maxi<=iter) ) {
+ u = max(rowMaxs(G * t(c)), c);
+ diff = sum(u != c)
+ c = u; # update assignment
+ if( verbose )
+ print("Connected components: iter = "+iter+", #diff = "+diff);
+ iter = iter + 1;
+ }
+
+ C = c;
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java
b/src/main/java/org/apache/sysds/common/Builtins.java
index b738d40..6c53692 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -68,6 +68,7 @@ public enum Builtins {
COLSD("colSds", false),
COLSUM("colSums", false),
COLVAR("colVars", false),
+ COMPONENTS("components", true),
CONV2D("conv2d", false),
CONV2D_BACKWARD_FILTER("conv2d_backward_filter", false),
CONV2D_BACKWARD_DATA("conv2d_backward_data", false),
diff --git
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinComponentsTest.java
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinComponentsTest.java
new file mode 100644
index 0000000..ca54528
--- /dev/null
+++
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinComponentsTest.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin;
+
+import java.util.HashMap;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.sysds.common.Types;
+import org.apache.sysds.lops.LopProperties;
+import org.apache.sysds.lops.LopProperties.ExecType;
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+
+public class BuiltinComponentsTest extends AutomatedTestBase {
+ private final static String TEST_NAME = "ConnectedComponents";
+ private final static String TEST_DIR = "functions/builtin/";
+ private static final String TEST_CLASS_DIR = TEST_DIR +
BuiltinComponentsTest.class.getSimpleName() + "/";
+
+ @Override
+ public void setUp() {
+ TestUtils.clearAssertionInformation();
+ addTestConfiguration(TEST_NAME,new
TestConfiguration(TEST_CLASS_DIR, TEST_NAME,new String[]{"r"}));
+ }
+
+ @Test
+ public void testConnectedComponents11CP() {
+ runConnectedComponentsTest(11, LopProperties.ExecType.CP);
+ }
+
+ @Test
+ public void testConnectedComponents201CP() {
+ runConnectedComponentsTest(201, LopProperties.ExecType.CP);
+ }
+
+ @Test
+ public void testConnectedComponents2001CP() {
+ runConnectedComponentsTest(2001, LopProperties.ExecType.CP);
+ }
+
+ private void runConnectedComponentsTest(int numVertices, ExecType
instType)
+ {
+ Types.ExecMode platformOld = setExecMode(instType);
+
+ try
+ {
+ loadTestConfiguration(getTestConfiguration(TEST_NAME));
+
+ String HOME = SCRIPT_DIR + TEST_DIR;
+ fullDMLScriptName = HOME + TEST_NAME + ".dml";
+ programArgs = new String[]{ "-explain", "-stats",
"-args", input("X"), output("R")};
+
+ //generate actual dataset (3 components)
+ double[][] X = new double[numVertices-3][2];
+ for(int i=0; i<numVertices-3; i++) {
+ X[i][0] = i<(numVertices/2-1) ? i+1 : i+3;
+ X[i][1] = i<(numVertices/2-1) ? i+2 : i+4;
+ }
+ writeInputMatrixWithMTD("X", X, true);
+
+ runTest(true, false, null, -1);
+
+ HashMap<MatrixValue.CellIndex, Double> dmlfile =
readDMLMatrixFromHDFS("R");
+ for( int i=0; i<numVertices; i++ ) {
+ int expected = i<(numVertices/2) ?
(numVertices/2) :
+ i==(numVertices/2) ? i+1 : numVertices;
+ Assert.assertEquals(new Double(expected),
dmlfile.get(new MatrixValue.CellIndex(i+1,1)));
+ }
+ }
+ finally {
+ rtplatform = platformOld;
+ }
+ }
+}
diff --git a/src/test/scripts/functions/builtin/ConnectedComponents.dml
b/src/test/scripts/functions/builtin/ConnectedComponents.dml
new file mode 100644
index 0000000..0c6fbe7
--- /dev/null
+++ b/src/test/scripts/functions/builtin/ConnectedComponents.dml
@@ -0,0 +1,28 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X = read($1)
+n = max(X);
+G = table(X[,1], X[, 2], n, n)
+G = G + t(G); #symmetry
+C = components(G=G, verbose=FALSE)
+
+write(C, $2)