IGNITE-6373: Create example for local and distributed k-means algorithm this closes #3173
Project: http://git-wip-us.apache.org/repos/asf/ignite/repo Commit: http://git-wip-us.apache.org/repos/asf/ignite/commit/a7893b63 Tree: http://git-wip-us.apache.org/repos/asf/ignite/tree/a7893b63 Diff: http://git-wip-us.apache.org/repos/asf/ignite/diff/a7893b63 Branch: refs/heads/ignite-zk Commit: a7893b6307bc752d7b7f1432967a964f30716858 Parents: 1f63b81 Author: Oleg Ignatenko <[email protected]> Authored: Thu Dec 7 20:34:14 2017 +0300 Committer: Yury Babak <[email protected]> Committed: Thu Dec 7 20:34:14 2017 +0300 ---------------------------------------------------------------------- .../clustering/DatasetWithObviousStructure.java | 105 ++++++++++++++++++ .../KMeansDistributedClustererExample.java | 95 +++++++++++++++++ .../clustering/KMeansLocalClustererExample.java | 106 +++++++++++++++++++ .../examples/ml/clustering/package-info.java | 2 +- ...KMeansDistributedClustererTestMultiNode.java | 15 +-- ...MeansDistributedClustererTestSingleNode.java | 15 +-- .../apache/ignite/ml/clustering/KMeansUtil.java | 6 +- 7 files changed, 326 insertions(+), 18 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ignite/blob/a7893b63/examples/src/main/ml/org/apache/ignite/examples/ml/clustering/DatasetWithObviousStructure.java ---------------------------------------------------------------------- diff --git a/examples/src/main/ml/org/apache/ignite/examples/ml/clustering/DatasetWithObviousStructure.java b/examples/src/main/ml/org/apache/ignite/examples/ml/clustering/DatasetWithObviousStructure.java new file mode 100644 index 0000000..5cd0e09 --- /dev/null +++ b/examples/src/main/ml/org/apache/ignite/examples/ml/clustering/DatasetWithObviousStructure.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.examples.ml.clustering; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import org.apache.ignite.ml.math.Matrix; +import org.apache.ignite.ml.math.Vector; +import org.apache.ignite.ml.math.VectorUtils; +import org.apache.ignite.ml.math.impls.vector.DenseLocalOnHeapVector; + +/** + * See KMeansDistributedClustererTestSingleNode#testClusterizationOnDatasetWithObviousStructure. + */ +class DatasetWithObviousStructure { + /** */ + private final Random rnd = new Random(123456L); + + /** Let centers be in the vertices of square. */ + private final Map<Integer, Vector> centers = new HashMap<>(); + + /** Square side length. */ + private final int squareSideLen; + + /** */ + DatasetWithObviousStructure(int squareSideLen) { + this.squareSideLen = squareSideLen; + centers.put(100, new DenseLocalOnHeapVector(new double[] {0.0, 0.0})); + centers.put(900, new DenseLocalOnHeapVector(new double[] {squareSideLen, 0.0})); + centers.put(3000, new DenseLocalOnHeapVector(new double[] {0.0, squareSideLen})); + centers.put(6000, new DenseLocalOnHeapVector(new double[] {squareSideLen, squareSideLen})); + } + + /** */ + List<Vector> generate(Matrix points) { + int ptsCnt = points.rowSize(); + + // Mass centers of dataset. + List<Vector> massCenters = new ArrayList<>(); + + int centersCnt = centers.size(); + + List<Integer> permutation = IntStream.range(0, ptsCnt).boxed().collect(Collectors.toList()); + Collections.shuffle(permutation, rnd); + + Vector[] mc = new Vector[centersCnt]; + Arrays.fill(mc, VectorUtils.zeroes(2)); + + int totalCnt = 0; + + int centIdx = 0; + massCenters.clear(); + + for (Integer count : centers.keySet()) { + for (int i = 0; i < count; i++) { + Vector pnt = getPoint(count); + + mc[centIdx] = mc[centIdx].plus(pnt); + + points.assignRow(permutation.get(totalCnt), pnt); + + totalCnt++; + } + massCenters.add(mc[centIdx].times(1 / (double)count)); + centIdx++; + } + + return massCenters; + } + + /** */ + Map<Integer, Vector> centers() { + return centers; + } + + /** */ + private Vector getPoint(Integer cnt) { + Vector pnt = new DenseLocalOnHeapVector(2).assign(centers.get(cnt)); + // Perturbate point on random value. + pnt.map(val -> val + rnd.nextDouble() * squareSideLen / 100); + return pnt; + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/a7893b63/examples/src/main/ml/org/apache/ignite/examples/ml/clustering/KMeansDistributedClustererExample.java ---------------------------------------------------------------------- diff --git a/examples/src/main/ml/org/apache/ignite/examples/ml/clustering/KMeansDistributedClustererExample.java b/examples/src/main/ml/org/apache/ignite/examples/ml/clustering/KMeansDistributedClustererExample.java new file mode 100644 index 0000000..456e915 --- /dev/null +++ b/examples/src/main/ml/org/apache/ignite/examples/ml/clustering/KMeansDistributedClustererExample.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.examples.ml.clustering; + +import java.util.Arrays; +import java.util.List; +import org.apache.ignite.Ignite; +import org.apache.ignite.Ignition; +import org.apache.ignite.examples.ExampleNodeStartup; +import org.apache.ignite.examples.ml.math.matrix.SparseDistributedMatrixExample; +import org.apache.ignite.ml.clustering.KMeansDistributedClusterer; +import org.apache.ignite.ml.math.EuclideanDistance; +import org.apache.ignite.ml.math.StorageConstants; +import org.apache.ignite.ml.math.Tracer; +import org.apache.ignite.ml.math.Vector; +import org.apache.ignite.ml.math.impls.matrix.SparseDistributedMatrix; +import org.apache.ignite.thread.IgniteThread; + +/** + * <p> + * Example of using {@link KMeansDistributedClusterer}.</p> + * <p> + * Note that in this example we cannot guarantee order in which nodes return results of intermediate + * computations and therefore algorithm can return different results.</p> + * <p> + * Remote nodes should always be started with special configuration file which + * enables P2P class loading: {@code 'ignite.{sh|bat} examples/config/example-ignite.xml'}.</p> + * <p> + * Alternatively you can run {@link ExampleNodeStartup} in another JVM which will start node + * with {@code examples/config/example-ignite.xml} configuration.</p> + */ +public class KMeansDistributedClustererExample { + /** + * Executes example. + * + * @param args Command line arguments, none required. + */ + public static void main(String[] args) throws InterruptedException { + // IMPL NOTE based on KMeansDistributedClustererTestSingleNode#testClusterizationOnDatasetWithObviousStructure + System.out.println(">>> K-means distributed clusterer example started."); + // Start ignite grid. + try (Ignite ignite = Ignition.start("examples/config/example-ignite.xml")) { + System.out.println(">>> Ignite grid started."); + // Create IgniteThread, we must work with SparseDistributedMatrix inside IgniteThread + // because we create ignite cache internally. + IgniteThread igniteThread = new IgniteThread(ignite.configuration().getIgniteInstanceName(), + SparseDistributedMatrixExample.class.getSimpleName(), () -> { + + int ptsCnt = 10000; + + SparseDistributedMatrix points = new SparseDistributedMatrix(ptsCnt, 2, + StorageConstants.ROW_STORAGE_MODE, StorageConstants.RANDOM_ACCESS_MODE); + + DatasetWithObviousStructure dataset = new DatasetWithObviousStructure(10000); + + List<Vector> massCenters = dataset.generate(points); + + EuclideanDistance dist = new EuclideanDistance(); + + KMeansDistributedClusterer clusterer = new KMeansDistributedClusterer(dist, 3, 100, 1L); + + Vector[] resCenters = clusterer.cluster(points, 4).centers(); + + System.out.println("Mass centers:"); + massCenters.forEach(Tracer::showAscii); + + System.out.println("Cluster centers:"); + Arrays.asList(resCenters).forEach(Tracer::showAscii); + + points.destroy(); + + System.out.println("\n>>> K-means distributed clusterer example completed."); + }); + + igniteThread.start(); + + igniteThread.join(); + } + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/a7893b63/examples/src/main/ml/org/apache/ignite/examples/ml/clustering/KMeansLocalClustererExample.java ---------------------------------------------------------------------- diff --git a/examples/src/main/ml/org/apache/ignite/examples/ml/clustering/KMeansLocalClustererExample.java b/examples/src/main/ml/org/apache/ignite/examples/ml/clustering/KMeansLocalClustererExample.java new file mode 100644 index 0000000..970931e --- /dev/null +++ b/examples/src/main/ml/org/apache/ignite/examples/ml/clustering/KMeansLocalClustererExample.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.examples.ml.clustering; + +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import org.apache.ignite.ml.clustering.KMeansLocalClusterer; +import org.apache.ignite.ml.clustering.KMeansModel; +import org.apache.ignite.ml.math.DistanceMeasure; +import org.apache.ignite.ml.math.EuclideanDistance; +import org.apache.ignite.ml.math.Tracer; +import org.apache.ignite.ml.math.Vector; +import org.apache.ignite.ml.math.functions.Functions; +import org.apache.ignite.ml.math.impls.matrix.DenseLocalOnHeapMatrix; + +/** + * Example of using {@link KMeansLocalClusterer}. + */ +public class KMeansLocalClustererExample { + /** + * Executes example. + * + * @param args Command line arguments, none required. + */ + public static void main(String[] args) { + // IMPL NOTE based on KMeansDistributedClustererTestSingleNode#testClusterizationOnDatasetWithObviousStructure + System.out.println(">>> K-means local clusterer example started."); + + int ptsCnt = 10000; + DenseLocalOnHeapMatrix points = new DenseLocalOnHeapMatrix(ptsCnt, 2); + + DatasetWithObviousStructure dataset = new DatasetWithObviousStructure(10000); + + List<Vector> massCenters = dataset.generate(points); + + EuclideanDistance dist = new EuclideanDistance(); + OrderedNodesComparator comp = new OrderedNodesComparator( + dataset.centers().values().toArray(new Vector[] {}), dist); + + massCenters.sort(comp); + + KMeansLocalClusterer clusterer = new KMeansLocalClusterer(dist, 100, 1L); + + KMeansModel mdl = clusterer.cluster(points, 4); + Vector[] resCenters = mdl.centers(); + Arrays.sort(resCenters, comp); + + System.out.println("Mass centers:"); + massCenters.forEach(Tracer::showAscii); + + System.out.println("Cluster centers:"); + Arrays.asList(resCenters).forEach(Tracer::showAscii); + + System.out.println("\n>>> K-means local clusterer example completed."); + } + + /** */ + private static class OrderedNodesComparator implements Comparator<Vector> { + /** */ + private final DistanceMeasure measure; + + /** */ + List<Vector> orderedNodes; + + /** */ + OrderedNodesComparator(Vector[] orderedNodes, DistanceMeasure measure) { + this.orderedNodes = Arrays.asList(orderedNodes); + this.measure = measure; + } + + /** */ + private int findClosestNodeIndex(Vector v) { + return Functions.argmin(orderedNodes, v1 -> measure.compute(v1, v)).get1(); + } + + /** */ + @Override public int compare(Vector v1, Vector v2) { + int ind1 = findClosestNodeIndex(v1); + int ind2 = findClosestNodeIndex(v2); + + int signum = (int)Math.signum(ind1 - ind2); + + if (signum != 0) + return signum; + + return (int)Math.signum(orderedNodes.get(ind1).minus(v1).kNorm(2) - + orderedNodes.get(ind2).minus(v2).kNorm(2)); + } + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/a7893b63/examples/src/main/ml/org/apache/ignite/examples/ml/clustering/package-info.java ---------------------------------------------------------------------- diff --git a/examples/src/main/ml/org/apache/ignite/examples/ml/clustering/package-info.java b/examples/src/main/ml/org/apache/ignite/examples/ml/clustering/package-info.java index 7051912..872aa16 100644 --- a/examples/src/main/ml/org/apache/ignite/examples/ml/clustering/package-info.java +++ b/examples/src/main/ml/org/apache/ignite/examples/ml/clustering/package-info.java @@ -17,6 +17,6 @@ /** * <!-- Package description. --> - * Clustering examples. + * ML clustering examples. */ package org.apache.ignite.examples.ml.clustering; http://git-wip-us.apache.org/repos/asf/ignite/blob/a7893b63/modules/ml/src/test/java/org/apache/ignite/ml/clustering/KMeansDistributedClustererTestMultiNode.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/test/java/org/apache/ignite/ml/clustering/KMeansDistributedClustererTestMultiNode.java b/modules/ml/src/test/java/org/apache/ignite/ml/clustering/KMeansDistributedClustererTestMultiNode.java index 2722d74..1f71dee 100644 --- a/modules/ml/src/test/java/org/apache/ignite/ml/clustering/KMeansDistributedClustererTestMultiNode.java +++ b/modules/ml/src/test/java/org/apache/ignite/ml/clustering/KMeansDistributedClustererTestMultiNode.java @@ -33,12 +33,11 @@ import org.apache.ignite.ml.math.Vector; import org.apache.ignite.ml.math.impls.matrix.SparseDistributedMatrix; import org.apache.ignite.ml.math.impls.vector.DenseLocalOnHeapVector; import org.apache.ignite.testframework.junits.common.GridCommonAbstractTest; -import org.junit.Test; /** - * This test is made to make sure that K-Means distributed clustering does not crush on distributed environment. - * In {@link KMeansDistributedClustererTestMultiNode} we check logic of clustering (checks for clusters structures). - * In this class we just check that clusterer does not crush. There are two separate tests because we cannot + * This test is made to make sure that K-Means distributed clustering does not crash on distributed environment. + * In {@link KMeansDistributedClustererTestSingleNode} we check logic of clustering (checks for clusters structures). + * In this class we just check that clusterer does not crash. There are two separate tests because we cannot * guarantee order in which nodes return results of intermediate computations and therefore algorithm can return * different results. */ @@ -75,7 +74,6 @@ public class KMeansDistributedClustererTestMultiNode extends GridCommonAbstractT } /** */ - @Test public void testPerformClusterAnalysisDegenerate() { IgniteUtils.setCurrentIgniteName(ignite.configuration().getIgniteInstanceName()); @@ -91,10 +89,11 @@ public class KMeansDistributedClustererTestMultiNode extends GridCommonAbstractT points.setRow(1, v2); clusterer.cluster(points, 1); + + points.destroy(); } /** */ - @Test public void testClusterizationOnDatasetWithObviousStructure() throws IOException { IgniteUtils.setCurrentIgniteName(ignite.configuration().getIgniteInstanceName()); @@ -120,7 +119,7 @@ public class KMeansDistributedClustererTestMultiNode extends GridCommonAbstractT for (Integer count : centers.keySet()) { for (int i = 0; i < count; i++) { - DenseLocalOnHeapVector pnt = (DenseLocalOnHeapVector)new DenseLocalOnHeapVector(2).assign(centers.get(count)); + Vector pnt = new DenseLocalOnHeapVector(2).assign(centers.get(count)); // Perturbate point on random value. pnt.map(val -> val + rnd.nextDouble() * squareSideLen / 100); points.assignRow(permutation.get(totalCnt), pnt); @@ -133,5 +132,7 @@ public class KMeansDistributedClustererTestMultiNode extends GridCommonAbstractT KMeansDistributedClusterer clusterer = new KMeansDistributedClusterer(dist, 3, 100, 1L); clusterer.cluster(points, 4); + + points.destroy(); } } http://git-wip-us.apache.org/repos/asf/ignite/blob/a7893b63/modules/ml/src/test/java/org/apache/ignite/ml/clustering/KMeansDistributedClustererTestSingleNode.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/test/java/org/apache/ignite/ml/clustering/KMeansDistributedClustererTestSingleNode.java b/modules/ml/src/test/java/org/apache/ignite/ml/clustering/KMeansDistributedClustererTestSingleNode.java index 27aaa0c..19c328a 100644 --- a/modules/ml/src/test/java/org/apache/ignite/ml/clustering/KMeansDistributedClustererTestSingleNode.java +++ b/modules/ml/src/test/java/org/apache/ignite/ml/clustering/KMeansDistributedClustererTestSingleNode.java @@ -40,11 +40,12 @@ import org.apache.ignite.ml.math.impls.matrix.SparseDistributedMatrix; import org.apache.ignite.ml.math.impls.vector.DenseLocalOnHeapVector; import org.apache.ignite.testframework.junits.common.GridCommonAbstractTest; import org.junit.Assert; -import org.junit.Test; import static org.apache.ignite.ml.clustering.KMeansUtil.checkIsInEpsilonNeighbourhood; -/** */ +/** + * This test checks logic of clustering (checks for clusters structures). + */ public class KMeansDistributedClustererTestSingleNode extends GridCommonAbstractTest { /** * Number of nodes in grid. We should use 1 in this test because otherwise algorithm will be unstable @@ -81,7 +82,6 @@ public class KMeansDistributedClustererTestSingleNode extends GridCommonAbstract } /** */ - @Test public void testPerformClusterAnalysisDegenerate() { IgniteUtils.setCurrentIgniteName(ignite.configuration().getIgniteInstanceName()); @@ -103,7 +103,6 @@ public class KMeansDistributedClustererTestSingleNode extends GridCommonAbstract } /** */ - @Test public void testClusterizationOnDatasetWithObviousStructure() throws IOException { IgniteUtils.setCurrentIgniteName(ignite.configuration().getIgniteInstanceName()); @@ -137,7 +136,7 @@ public class KMeansDistributedClustererTestSingleNode extends GridCommonAbstract for (Integer count : centers.keySet()) { for (int i = 0; i < count; i++) { - DenseLocalOnHeapVector pnt = (DenseLocalOnHeapVector)new DenseLocalOnHeapVector(2).assign(centers.get(count)); + Vector pnt = new DenseLocalOnHeapVector(2).assign(centers.get(count)); // Perturbate point on random value. pnt.map(val -> val + rnd.nextDouble() * squareSideLen / 100); mc[centIdx] = mc[centIdx].plus(pnt); @@ -159,6 +158,8 @@ public class KMeansDistributedClustererTestSingleNode extends GridCommonAbstract Arrays.sort(resCenters, comp); checkIsInEpsilonNeighbourhood(resCenters, massCenters.toArray(new Vector[] {}), 30.0); + + points.destroy(); } /** */ @@ -170,7 +171,7 @@ public class KMeansDistributedClustererTestSingleNode extends GridCommonAbstract List<Vector> orderedNodes; /** */ - public OrderedNodesComparator(Vector[] orderedNodes, DistanceMeasure measure) { + OrderedNodesComparator(Vector[] orderedNodes, DistanceMeasure measure) { this.orderedNodes = Arrays.asList(orderedNodes); this.measure = measure; } @@ -194,4 +195,4 @@ public class KMeansDistributedClustererTestSingleNode extends GridCommonAbstract orderedNodes.get(ind2).minus(v2).kNorm(2)); } } -} \ No newline at end of file +} http://git-wip-us.apache.org/repos/asf/ignite/blob/a7893b63/modules/ml/src/test/java/org/apache/ignite/ml/clustering/KMeansUtil.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/test/java/org/apache/ignite/ml/clustering/KMeansUtil.java b/modules/ml/src/test/java/org/apache/ignite/ml/clustering/KMeansUtil.java index 0a39748..420678f 100644 --- a/modules/ml/src/test/java/org/apache/ignite/ml/clustering/KMeansUtil.java +++ b/modules/ml/src/test/java/org/apache/ignite/ml/clustering/KMeansUtil.java @@ -21,10 +21,10 @@ import org.apache.ignite.ml.math.Vector; import static org.junit.Assert.assertTrue; -/** Base test for k-means algorithms. */ -public class KMeansUtil { +/** Utilities for k-means tests. */ +class KMeansUtil { /** */ - public static void checkIsInEpsilonNeighbourhood(Vector[] v1s, Vector[] v2s, double epsilon) { + static void checkIsInEpsilonNeighbourhood(Vector[] v1s, Vector[] v2s, double epsilon) { for (int i = 0; i < v1s.length; i++) { assertTrue("Not in epsilon neighbourhood (index " + i + ") ", v1s[i].minus(v2s[i]).kNorm(2) < epsilon);
