IGNITE-8169: [ML] Adapt KMeans to the new Partitioned Dataset and clean up old code
this closes #3817 Project: http://git-wip-us.apache.org/repos/asf/ignite/repo Commit: http://git-wip-us.apache.org/repos/asf/ignite/commit/9e21cec0 Tree: http://git-wip-us.apache.org/repos/asf/ignite/tree/9e21cec0 Diff: http://git-wip-us.apache.org/repos/asf/ignite/diff/9e21cec0 Branch: refs/heads/ignite-7708 Commit: 9e21cec024168105fd30bfb3acecf9fd24a52e8c Parents: 228254a Author: zaleslaw <[email protected]> Authored: Mon Apr 16 20:20:49 2018 +0300 Committer: Yury Babak <[email protected]> Committed: Mon Apr 16 20:20:49 2018 +0300 ---------------------------------------------------------------------- .../clustering/DatasetWithObviousStructure.java | 105 ---- .../ml/clustering/FuzzyCMeansExample.java | 134 ----- .../ml/clustering/FuzzyCMeansLocalExample.java | 95 ---- .../clustering/KMeansClusterizationExample.java | 226 ++++++++ .../KMeansDistributedClustererExample.java | 97 ---- .../clustering/KMeansLocalClustererExample.java | 106 ---- .../ignite/ml/FuzzyCMeansModelFormat.java | 76 --- .../org/apache/ignite/ml/KMeansModelFormat.java | 77 --- .../ml/clustering/BaseFuzzyCMeansClusterer.java | 90 ---- .../ml/clustering/BaseKMeansClusterer.java | 96 ---- .../apache/ignite/ml/clustering/Clusterer.java | 33 -- .../ml/clustering/ClusterizationModel.java | 29 -- .../FuzzyCMeansDistributedClusterer.java | 512 ------------------- .../clustering/FuzzyCMeansLocalClusterer.java | 254 --------- .../ignite/ml/clustering/FuzzyCMeansModel.java | 88 ---- .../clustering/KMeansDistributedClusterer.java | 306 ----------- .../ml/clustering/KMeansLocalClusterer.java | 177 ------- .../ignite/ml/clustering/KMeansModel.java | 113 ---- .../ignite/ml/clustering/WeightedClusterer.java | 38 -- .../ignite/ml/clustering/kmeans/Clusterer.java | 33 ++ .../clustering/kmeans/ClusterizationModel.java | 29 ++ .../ml/clustering/kmeans/KMeansModel.java | 112 ++++ .../ml/clustering/kmeans/KMeansModelFormat.java | 79 +++ .../ml/clustering/kmeans/KMeansTrainer.java | 320 ++++++++++++ 
.../ml/clustering/kmeans/package-info.java | 22 + .../preprocessing/LabellingMachine.java | 41 -- .../ml/structures/preprocessing/Normalizer.java | 80 --- .../org/apache/ignite/ml/LocalModelsTest.java | 26 +- .../ml/clustering/ClusteringTestSuite.java | 7 +- .../FuzzyCMeansDistributedClustererTest.java | 180 ------- .../FuzzyCMeansLocalClustererTest.java | 202 -------- ...KMeansDistributedClustererTestMultiNode.java | 138 ----- ...MeansDistributedClustererTestSingleNode.java | 198 ------- .../ml/clustering/KMeansLocalClustererTest.java | 46 -- .../ignite/ml/clustering/KMeansModelTest.java | 63 +++ .../ignite/ml/clustering/KMeansTrainerTest.java | 73 +++ .../apache/ignite/ml/clustering/KMeansUtil.java | 33 -- 37 files changed, 977 insertions(+), 3357 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ignite/blob/9e21cec0/examples/src/main/java/org/apache/ignite/examples/ml/clustering/DatasetWithObviousStructure.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/ignite/examples/ml/clustering/DatasetWithObviousStructure.java b/examples/src/main/java/org/apache/ignite/examples/ml/clustering/DatasetWithObviousStructure.java deleted file mode 100644 index 5cd0e09..0000000 --- a/examples/src/main/java/org/apache/ignite/examples/ml/clustering/DatasetWithObviousStructure.java +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.ignite.examples.ml.clustering; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Random; -import java.util.stream.Collectors; -import java.util.stream.IntStream; -import org.apache.ignite.ml.math.Matrix; -import org.apache.ignite.ml.math.Vector; -import org.apache.ignite.ml.math.VectorUtils; -import org.apache.ignite.ml.math.impls.vector.DenseLocalOnHeapVector; - -/** - * See KMeansDistributedClustererTestSingleNode#testClusterizationOnDatasetWithObviousStructure. - */ -class DatasetWithObviousStructure { - /** */ - private final Random rnd = new Random(123456L); - - /** Let centers be in the vertices of square. */ - private final Map<Integer, Vector> centers = new HashMap<>(); - - /** Square side length. */ - private final int squareSideLen; - - /** */ - DatasetWithObviousStructure(int squareSideLen) { - this.squareSideLen = squareSideLen; - centers.put(100, new DenseLocalOnHeapVector(new double[] {0.0, 0.0})); - centers.put(900, new DenseLocalOnHeapVector(new double[] {squareSideLen, 0.0})); - centers.put(3000, new DenseLocalOnHeapVector(new double[] {0.0, squareSideLen})); - centers.put(6000, new DenseLocalOnHeapVector(new double[] {squareSideLen, squareSideLen})); - } - - /** */ - List<Vector> generate(Matrix points) { - int ptsCnt = points.rowSize(); - - // Mass centers of dataset. 
- List<Vector> massCenters = new ArrayList<>(); - - int centersCnt = centers.size(); - - List<Integer> permutation = IntStream.range(0, ptsCnt).boxed().collect(Collectors.toList()); - Collections.shuffle(permutation, rnd); - - Vector[] mc = new Vector[centersCnt]; - Arrays.fill(mc, VectorUtils.zeroes(2)); - - int totalCnt = 0; - - int centIdx = 0; - massCenters.clear(); - - for (Integer count : centers.keySet()) { - for (int i = 0; i < count; i++) { - Vector pnt = getPoint(count); - - mc[centIdx] = mc[centIdx].plus(pnt); - - points.assignRow(permutation.get(totalCnt), pnt); - - totalCnt++; - } - massCenters.add(mc[centIdx].times(1 / (double)count)); - centIdx++; - } - - return massCenters; - } - - /** */ - Map<Integer, Vector> centers() { - return centers; - } - - /** */ - private Vector getPoint(Integer cnt) { - Vector pnt = new DenseLocalOnHeapVector(2).assign(centers.get(cnt)); - // Perturbate point on random value. - pnt.map(val -> val + rnd.nextDouble() * squareSideLen / 100); - return pnt; - } -} http://git-wip-us.apache.org/repos/asf/ignite/blob/9e21cec0/examples/src/main/java/org/apache/ignite/examples/ml/clustering/FuzzyCMeansExample.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/ignite/examples/ml/clustering/FuzzyCMeansExample.java b/examples/src/main/java/org/apache/ignite/examples/ml/clustering/FuzzyCMeansExample.java deleted file mode 100644 index 23aeed7..0000000 --- a/examples/src/main/java/org/apache/ignite/examples/ml/clustering/FuzzyCMeansExample.java +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.ignite.examples.ml.clustering; - -import org.apache.ignite.Ignite; -import org.apache.ignite.Ignition; -import org.apache.ignite.examples.ExampleNodeStartup; -import org.apache.ignite.ml.clustering.BaseFuzzyCMeansClusterer; -import org.apache.ignite.ml.clustering.FuzzyCMeansDistributedClusterer; -import org.apache.ignite.ml.clustering.FuzzyCMeansModel; -import org.apache.ignite.ml.math.StorageConstants; -import org.apache.ignite.ml.math.Vector; -import org.apache.ignite.ml.math.distances.DistanceMeasure; -import org.apache.ignite.ml.math.distances.EuclideanDistance; -import org.apache.ignite.ml.math.impls.matrix.SparseDistributedMatrix; -import org.apache.ignite.thread.IgniteThread; - -/** - * <p> - * This example shows how to use {@link FuzzyCMeansDistributedClusterer}.</p> - * <p> - * Remote nodes should always be started with special configuration file which - * enables P2P class loading: {@code 'ignite.{sh|bat} examples/config/example-ignite.xml'}.</p> - * <p> - * Alternatively you can run {@link ExampleNodeStartup} in another JVM which will start node - * with {@code examples/config/example-ignite.xml} configuration.</p> - */ -public final class FuzzyCMeansExample { - /** - * Executes example. - * - * @param args Command line arguments, none required. - */ - public static void main(String[] args) throws InterruptedException { - System.out.println(">>> Fuzzy C-Means usage example started."); - - // Start ignite grid. 
- try (Ignite ignite = Ignition.start("examples/config/example-ignite.xml")) { - System.out.println(">>> Ignite grid started."); - - // Start new Ignite thread. - IgniteThread igniteThread = new IgniteThread(ignite.configuration().getIgniteInstanceName(), - FuzzyCMeansExample.class.getSimpleName(), - () -> { - // Distance measure that computes distance between two points. - DistanceMeasure distanceMeasure = new EuclideanDistance(); - - // "Fuzziness" - specific constant that is used in membership calculation (1.0+-eps ~ K-Means). - double exponentialWeight = 2.0; - - // Condition that indicated when algorithm must stop. - // In this example algorithm stops if memberships have changed insignificantly. - BaseFuzzyCMeansClusterer.StopCondition stopCond = - BaseFuzzyCMeansClusterer.StopCondition.STABLE_MEMBERSHIPS; - - // Maximum difference between new and old membership values with which algorithm will continue to work. - double maxDelta = 0.01; - - // The maximum number of FCM iterations. - int maxIterations = 50; - - // Value that is used to initialize random numbers generator. You can choose it randomly. - Long seed = null; - - // Number of steps of primary centers selection (more steps more candidates). - int initializationSteps = 2; - - // Number of K-Means iteration that is used to choose required number of primary centers from candidates. - int kMeansMaxIterations = 50; - - // Create new distributed clusterer with parameters described above. - System.out.println(">>> Create new Distributed Fuzzy C-Means clusterer."); - FuzzyCMeansDistributedClusterer clusterer = new FuzzyCMeansDistributedClusterer( - distanceMeasure, exponentialWeight, stopCond, maxDelta, maxIterations, - seed, initializationSteps, kMeansMaxIterations); - - // Create sample data. 
- double[][] points = new double[][] { - {-10, -10}, {-9, -11}, {-10, -9}, {-11, -9}, - {10, 10}, {9, 11}, {10, 9}, {11, 9}, - {-10, 10}, {-9, 11}, {-10, 9}, {-11, 9}, - {10, -10}, {9, -11}, {10, -9}, {11, -9}}; - - // Initialize matrix of data points. Each row contains one point. - int rows = points.length; - int cols = points[0].length; - - System.out.println(">>> Create the matrix that contains sample points."); - SparseDistributedMatrix pntMatrix = new SparseDistributedMatrix(rows, cols, - StorageConstants.ROW_STORAGE_MODE, StorageConstants.RANDOM_ACCESS_MODE); - - // Store points into matrix. - pntMatrix.assign(points); - - // Call clusterization method with some number of centers. - // It returns model that can predict results for new points. - System.out.println(">>> Perform clusterization."); - int numCenters = 4; - FuzzyCMeansModel mdl = clusterer.cluster(pntMatrix, numCenters); - - // You can also get centers of clusters that is computed by Fuzzy C-Means algorithm. - Vector[] centers = mdl.centers(); - - String res = ">>> Results:\n" - + ">>> 1st center: " + centers[0].get(0) + " " + centers[0].get(1) + "\n" - + ">>> 2nd center: " + centers[1].get(0) + " " + centers[1].get(1) + "\n" - + ">>> 3rd center: " + centers[2].get(0) + " " + centers[2].get(1) + "\n" - + ">>> 4th center: " + centers[3].get(0) + " " + centers[3].get(1) + "\n"; - - System.out.println(res); - - pntMatrix.destroy(); - }); - - igniteThread.start(); - igniteThread.join(); - } - } -} http://git-wip-us.apache.org/repos/asf/ignite/blob/9e21cec0/examples/src/main/java/org/apache/ignite/examples/ml/clustering/FuzzyCMeansLocalExample.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/ignite/examples/ml/clustering/FuzzyCMeansLocalExample.java b/examples/src/main/java/org/apache/ignite/examples/ml/clustering/FuzzyCMeansLocalExample.java deleted file mode 100644 index 5c1753a..0000000 --- 
a/examples/src/main/java/org/apache/ignite/examples/ml/clustering/FuzzyCMeansLocalExample.java +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.ignite.examples.ml.clustering; - -import org.apache.ignite.ml.clustering.BaseFuzzyCMeansClusterer; -import org.apache.ignite.ml.clustering.FuzzyCMeansLocalClusterer; -import org.apache.ignite.ml.clustering.FuzzyCMeansModel; -import org.apache.ignite.ml.math.Vector; -import org.apache.ignite.ml.math.distances.DistanceMeasure; -import org.apache.ignite.ml.math.distances.EuclideanDistance; -import org.apache.ignite.ml.math.impls.matrix.DenseLocalOnHeapMatrix; - -/** - * This example shows how to use {@link FuzzyCMeansLocalClusterer}. - */ -public final class FuzzyCMeansLocalExample { - /** - * Executes example. - * - * @param args Command line arguments, none required. - */ - public static void main(String[] args) { - System.out.println(">>> Local Fuzzy C-Means usage example started."); - - // Distance measure that computes distance between two points. - DistanceMeasure distanceMeasure = new EuclideanDistance(); - - // "Fuzziness" - specific constant that is used in membership calculation (1.0+-eps ~ K-Means). 
- double exponentialWeight = 2.0; - - // Condition that indicated when algorithm must stop. - // In this example algorithm stops if memberships have changed insignificantly. - BaseFuzzyCMeansClusterer.StopCondition stopCond = - BaseFuzzyCMeansClusterer.StopCondition.STABLE_MEMBERSHIPS; - - // Maximum difference between new and old membership values with which algorithm will continue to work. - double maxDelta = 0.01; - - // The maximum number of FCM iterations. - int maxIterations = 50; - - // Value that is used to initialize random numbers generator. You can choose it randomly. - Long seed = null; - - // Create new distributed clusterer with parameters described above. - System.out.println(">>> Create new Local Fuzzy C-Means clusterer."); - FuzzyCMeansLocalClusterer clusterer = new FuzzyCMeansLocalClusterer(distanceMeasure, - exponentialWeight, stopCond, - maxDelta, maxIterations, seed); - - // Create sample data. - double[][] points = new double[][] { - {-10, -10}, {-9, -11}, {-10, -9}, {-11, -9}, - {10, 10}, {9, 11}, {10, 9}, {11, 9}, - {-10, 10}, {-9, 11}, {-10, 9}, {-11, 9}, - {10, -10}, {9, -11}, {10, -9}, {11, -9}}; - - // Initialize matrix of data points. Each row contains one point. - System.out.println(">>> Create the matrix that contains sample points."); - // Store points into matrix. - DenseLocalOnHeapMatrix pntMatrix = new DenseLocalOnHeapMatrix(points); - - // Call clusterization method with some number of centers. - // It returns model that can predict results for new points. - System.out.println(">>> Perform clusterization."); - int numCenters = 4; - FuzzyCMeansModel mdl = clusterer.cluster(pntMatrix, numCenters); - - // You can also get centers of clusters that is computed by Fuzzy C-Means algorithm. 
- Vector[] centers = mdl.centers(); - - String res = ">>> Results:\n" - + ">>> 1st center: " + centers[0].get(0) + " " + centers[0].get(1) + "\n" - + ">>> 2nd center: " + centers[1].get(0) + " " + centers[1].get(1) + "\n" - + ">>> 3rd center: " + centers[2].get(0) + " " + centers[2].get(1) + "\n" - + ">>> 4th center: " + centers[3].get(0) + " " + centers[3].get(1) + "\n"; - - System.out.println(res); - } -} http://git-wip-us.apache.org/repos/asf/ignite/blob/9e21cec0/examples/src/main/java/org/apache/ignite/examples/ml/clustering/KMeansClusterizationExample.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/ignite/examples/ml/clustering/KMeansClusterizationExample.java b/examples/src/main/java/org/apache/ignite/examples/ml/clustering/KMeansClusterizationExample.java new file mode 100644 index 0000000..8825ebb --- /dev/null +++ b/examples/src/main/java/org/apache/ignite/examples/ml/clustering/KMeansClusterizationExample.java @@ -0,0 +1,226 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.ignite.examples.ml.clustering; + +import java.util.Arrays; +import java.util.UUID; +import javax.cache.Cache; +import org.apache.ignite.Ignite; +import org.apache.ignite.IgniteCache; +import org.apache.ignite.Ignition; +import org.apache.ignite.cache.affinity.rendezvous.RendezvousAffinityFunction; +import org.apache.ignite.cache.query.QueryCursor; +import org.apache.ignite.cache.query.ScanQuery; +import org.apache.ignite.configuration.CacheConfiguration; +import org.apache.ignite.ml.dataset.impl.cache.CacheBasedDatasetBuilder; +import org.apache.ignite.ml.knn.classification.KNNClassificationTrainer; +import org.apache.ignite.ml.math.Tracer; +import org.apache.ignite.ml.math.impls.vector.DenseLocalOnHeapVector; +import org.apache.ignite.ml.clustering.kmeans.KMeansModel; +import org.apache.ignite.ml.clustering.kmeans.KMeansTrainer; +import org.apache.ignite.thread.IgniteThread; + +/** + * Run KMeans clustering trainer over distributed dataset. + * + * @see KMeansTrainer + */ +public class KMeansClusterizationExample { + /** Run example. */ + public static void main(String[] args) throws InterruptedException { + System.out.println(); + System.out.println(">>> KMeans clustering algorithm over cached dataset usage example started."); + // Start ignite grid. 
+ try (Ignite ignite = Ignition.start("examples/config/example-ignite.xml")) { + System.out.println(">>> Ignite grid started."); + + IgniteThread igniteThread = new IgniteThread(ignite.configuration().getIgniteInstanceName(), + KMeansClusterizationExample.class.getSimpleName(), () -> { + IgniteCache<Integer, double[]> dataCache = getTestCache(ignite); + + KMeansTrainer trainer = new KMeansTrainer() + .withSeed(7867L); + + KMeansModel mdl = trainer.fit( + new CacheBasedDatasetBuilder<>(ignite, dataCache), + (k, v) -> Arrays.copyOfRange(v, 1, v.length), + (k, v) -> v[0] + ); + + System.out.println(">>> KMeans centroids"); + Tracer.showAscii(mdl.centers()[0]); + Tracer.showAscii(mdl.centers()[1]); + System.out.println(">>>"); + + System.out.println(">>> -----------------------------------"); + System.out.println(">>> | Predicted cluster\t| Real Label\t|"); + System.out.println(">>> -----------------------------------"); + + int amountOfErrors = 0; + int totalAmount = 0; + + try (QueryCursor<Cache.Entry<Integer, double[]>> observations = dataCache.query(new ScanQuery<>())) { + for (Cache.Entry<Integer, double[]> observation : observations) { + double[] val = observation.getValue(); + double[] inputs = Arrays.copyOfRange(val, 1, val.length); + double groundTruth = val[0]; + + double prediction = mdl.apply(new DenseLocalOnHeapVector(inputs)); + + totalAmount++; + if (groundTruth != prediction) + amountOfErrors++; + + System.out.printf(">>> | %.4f\t\t\t| %.4f\t\t|\n", prediction, groundTruth); + } + + System.out.println(">>> ---------------------------------"); + + System.out.println("\n>>> Absolute amount of errors " + amountOfErrors); + System.out.println("\n>>> Accuracy " + (1 - amountOfErrors / (double)totalAmount)); + } + }); + + igniteThread.start(); + igniteThread.join(); + } + } + + /** + * Fills cache with data and returns it. + * + * @param ignite Ignite instance. + * @return Filled Ignite Cache. 
+ */ + private static IgniteCache<Integer, double[]> getTestCache(Ignite ignite) { + CacheConfiguration<Integer, double[]> cacheConfiguration = new CacheConfiguration<>(); + cacheConfiguration.setName("TEST_" + UUID.randomUUID()); + cacheConfiguration.setAffinity(new RendezvousAffinityFunction(false, 10)); + + IgniteCache<Integer, double[]> cache = ignite.createCache(cacheConfiguration); + + for (int i = 0; i < data.length; i++) + cache.put(i, data[i]); + + return cache; + } + + /** The Iris dataset. */ + private static final double[][] data = { + {0, 5.1, 3.5, 1.4, 0.2}, + {0, 4.9, 3, 1.4, 0.2}, + {0, 4.7, 3.2, 1.3, 0.2}, + {0, 4.6, 3.1, 1.5, 0.2}, + {0, 5, 3.6, 1.4, 0.2}, + {0, 5.4, 3.9, 1.7, 0.4}, + {0, 4.6, 3.4, 1.4, 0.3}, + {0, 5, 3.4, 1.5, 0.2}, + {0, 4.4, 2.9, 1.4, 0.2}, + {0, 4.9, 3.1, 1.5, 0.1}, + {0, 5.4, 3.7, 1.5, 0.2}, + {0, 4.8, 3.4, 1.6, 0.2}, + {0, 4.8, 3, 1.4, 0.1}, + {0, 4.3, 3, 1.1, 0.1}, + {0, 5.8, 4, 1.2, 0.2}, + {0, 5.7, 4.4, 1.5, 0.4}, + {0, 5.4, 3.9, 1.3, 0.4}, + {0, 5.1, 3.5, 1.4, 0.3}, + {0, 5.7, 3.8, 1.7, 0.3}, + {0, 5.1, 3.8, 1.5, 0.3}, + {0, 5.4, 3.4, 1.7, 0.2}, + {0, 5.1, 3.7, 1.5, 0.4}, + {0, 4.6, 3.6, 1, 0.2}, + {0, 5.1, 3.3, 1.7, 0.5}, + {0, 4.8, 3.4, 1.9, 0.2}, + {0, 5, 3, 1.6, 0.2}, + {0, 5, 3.4, 1.6, 0.4}, + {0, 5.2, 3.5, 1.5, 0.2}, + {0, 5.2, 3.4, 1.4, 0.2}, + {0, 4.7, 3.2, 1.6, 0.2}, + {0, 4.8, 3.1, 1.6, 0.2}, + {0, 5.4, 3.4, 1.5, 0.4}, + {0, 5.2, 4.1, 1.5, 0.1}, + {0, 5.5, 4.2, 1.4, 0.2}, + {0, 4.9, 3.1, 1.5, 0.1}, + {0, 5, 3.2, 1.2, 0.2}, + {0, 5.5, 3.5, 1.3, 0.2}, + {0, 4.9, 3.1, 1.5, 0.1}, + {0, 4.4, 3, 1.3, 0.2}, + {0, 5.1, 3.4, 1.5, 0.2}, + {0, 5, 3.5, 1.3, 0.3}, + {0, 4.5, 2.3, 1.3, 0.3}, + {0, 4.4, 3.2, 1.3, 0.2}, + {0, 5, 3.5, 1.6, 0.6}, + {0, 5.1, 3.8, 1.9, 0.4}, + {0, 4.8, 3, 1.4, 0.3}, + {0, 5.1, 3.8, 1.6, 0.2}, + {0, 4.6, 3.2, 1.4, 0.2}, + {0, 5.3, 3.7, 1.5, 0.2}, + {0, 5, 3.3, 1.4, 0.2}, + {1, 7, 3.2, 4.7, 1.4}, + {1, 6.4, 3.2, 4.5, 1.5}, + {1, 6.9, 3.1, 4.9, 1.5}, + {1, 5.5, 2.3, 4, 1.3}, + {1, 6.5, 2.8, 4.6, 
1.5}, + {1, 5.7, 2.8, 4.5, 1.3}, + {1, 6.3, 3.3, 4.7, 1.6}, + {1, 4.9, 2.4, 3.3, 1}, + {1, 6.6, 2.9, 4.6, 1.3}, + {1, 5.2, 2.7, 3.9, 1.4}, + {1, 5, 2, 3.5, 1}, + {1, 5.9, 3, 4.2, 1.5}, + {1, 6, 2.2, 4, 1}, + {1, 6.1, 2.9, 4.7, 1.4}, + {1, 5.6, 2.9, 3.6, 1.3}, + {1, 6.7, 3.1, 4.4, 1.4}, + {1, 5.6, 3, 4.5, 1.5}, + {1, 5.8, 2.7, 4.1, 1}, + {1, 6.2, 2.2, 4.5, 1.5}, + {1, 5.6, 2.5, 3.9, 1.1}, + {1, 5.9, 3.2, 4.8, 1.8}, + {1, 6.1, 2.8, 4, 1.3}, + {1, 6.3, 2.5, 4.9, 1.5}, + {1, 6.1, 2.8, 4.7, 1.2}, + {1, 6.4, 2.9, 4.3, 1.3}, + {1, 6.6, 3, 4.4, 1.4}, + {1, 6.8, 2.8, 4.8, 1.4}, + {1, 6.7, 3, 5, 1.7}, + {1, 6, 2.9, 4.5, 1.5}, + {1, 5.7, 2.6, 3.5, 1}, + {1, 5.5, 2.4, 3.8, 1.1}, + {1, 5.5, 2.4, 3.7, 1}, + {1, 5.8, 2.7, 3.9, 1.2}, + {1, 6, 2.7, 5.1, 1.6}, + {1, 5.4, 3, 4.5, 1.5}, + {1, 6, 3.4, 4.5, 1.6}, + {1, 6.7, 3.1, 4.7, 1.5}, + {1, 6.3, 2.3, 4.4, 1.3}, + {1, 5.6, 3, 4.1, 1.3}, + {1, 5.5, 2.5, 4, 1.3}, + {1, 5.5, 2.6, 4.4, 1.2}, + {1, 6.1, 3, 4.6, 1.4}, + {1, 5.8, 2.6, 4, 1.2}, + {1, 5, 2.3, 3.3, 1}, + {1, 5.6, 2.7, 4.2, 1.3}, + {1, 5.7, 3, 4.2, 1.2}, + {1, 5.7, 2.9, 4.2, 1.3}, + {1, 6.2, 2.9, 4.3, 1.3}, + {1, 5.1, 2.5, 3, 1.1}, + {1, 5.7, 2.8, 4.1, 1.3}, + }; +} http://git-wip-us.apache.org/repos/asf/ignite/blob/9e21cec0/examples/src/main/java/org/apache/ignite/examples/ml/clustering/KMeansDistributedClustererExample.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/ignite/examples/ml/clustering/KMeansDistributedClustererExample.java b/examples/src/main/java/org/apache/ignite/examples/ml/clustering/KMeansDistributedClustererExample.java deleted file mode 100644 index f8709e6..0000000 --- a/examples/src/main/java/org/apache/ignite/examples/ml/clustering/KMeansDistributedClustererExample.java +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.ignite.examples.ml.clustering; - -import java.util.Arrays; -import java.util.List; -import org.apache.ignite.Ignite; -import org.apache.ignite.Ignition; -import org.apache.ignite.examples.ExampleNodeStartup; -import org.apache.ignite.examples.ml.math.matrix.SparseDistributedMatrixExample; -import org.apache.ignite.ml.clustering.KMeansDistributedClusterer; -import org.apache.ignite.ml.math.StorageConstants; -import org.apache.ignite.ml.math.Tracer; -import org.apache.ignite.ml.math.Vector; -import org.apache.ignite.ml.math.distances.EuclideanDistance; -import org.apache.ignite.ml.math.impls.matrix.SparseDistributedMatrix; -import org.apache.ignite.thread.IgniteThread; - -/** - * <p> - * Example of using {@link KMeansDistributedClusterer}.</p> - * <p> - * Note that in this example we cannot guarantee order in which nodes return results of intermediate - * computations and therefore algorithm can return different results.</p> - * <p> - * Remote nodes should always be started with special configuration file which - * enables P2P class loading: {@code 'ignite.{sh|bat} examples/config/example-ignite.xml'}.</p> - * <p> - * Alternatively you can run {@link ExampleNodeStartup} in another JVM which will start node - * with {@code examples/config/example-ignite.xml} 
configuration.</p> - */ -public class KMeansDistributedClustererExample { - /** - * Executes example. - * - * @param args Command line arguments, none required. - */ - public static void main(String[] args) throws InterruptedException { - // IMPL NOTE based on KMeansDistributedClustererTestSingleNode#testClusterizationOnDatasetWithObviousStructure - System.out.println(">>> K-means distributed clusterer example started."); - - // Start ignite grid. - try (Ignite ignite = Ignition.start("examples/config/example-ignite.xml")) { - System.out.println(">>> Ignite grid started."); - - // Create IgniteThread, we must work with SparseDistributedMatrix inside IgniteThread - // because we create ignite cache internally. - IgniteThread igniteThread = new IgniteThread(ignite.configuration().getIgniteInstanceName(), - SparseDistributedMatrixExample.class.getSimpleName(), () -> { - - int ptsCnt = 10000; - - SparseDistributedMatrix points = new SparseDistributedMatrix(ptsCnt, 2, - StorageConstants.ROW_STORAGE_MODE, StorageConstants.RANDOM_ACCESS_MODE); - - DatasetWithObviousStructure dataset = new DatasetWithObviousStructure(10000); - - List<Vector> massCenters = dataset.generate(points); - - EuclideanDistance dist = new EuclideanDistance(); - - KMeansDistributedClusterer clusterer = new KMeansDistributedClusterer(dist, 3, 100, 1L); - - Vector[] resCenters = clusterer.cluster(points, 4).centers(); - - System.out.println("Mass centers:"); - massCenters.forEach(Tracer::showAscii); - - System.out.println("Cluster centers:"); - Arrays.asList(resCenters).forEach(Tracer::showAscii); - - points.destroy(); - - System.out.println("\n>>> K-means distributed clusterer example completed."); - }); - - igniteThread.start(); - - igniteThread.join(); - } - } -} http://git-wip-us.apache.org/repos/asf/ignite/blob/9e21cec0/examples/src/main/java/org/apache/ignite/examples/ml/clustering/KMeansLocalClustererExample.java ---------------------------------------------------------------------- diff --git 
a/examples/src/main/java/org/apache/ignite/examples/ml/clustering/KMeansLocalClustererExample.java b/examples/src/main/java/org/apache/ignite/examples/ml/clustering/KMeansLocalClustererExample.java deleted file mode 100644 index 28ca9d9..0000000 --- a/examples/src/main/java/org/apache/ignite/examples/ml/clustering/KMeansLocalClustererExample.java +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.ignite.examples.ml.clustering; - -import java.util.Arrays; -import java.util.Comparator; -import java.util.List; -import org.apache.ignite.ml.clustering.KMeansLocalClusterer; -import org.apache.ignite.ml.clustering.KMeansModel; -import org.apache.ignite.ml.math.Tracer; -import org.apache.ignite.ml.math.Vector; -import org.apache.ignite.ml.math.distances.DistanceMeasure; -import org.apache.ignite.ml.math.distances.EuclideanDistance; -import org.apache.ignite.ml.math.functions.Functions; -import org.apache.ignite.ml.math.impls.matrix.DenseLocalOnHeapMatrix; - -/** - * Example of using {@link KMeansLocalClusterer}. - */ -public class KMeansLocalClustererExample { - /** - * Executes example. - * - * @param args Command line arguments, none required. 
- */ - public static void main(String[] args) { - // IMPL NOTE based on KMeansDistributedClustererTestSingleNode#testClusterizationOnDatasetWithObviousStructure - System.out.println(">>> K-means local clusterer example started."); - - int ptsCnt = 10000; - DenseLocalOnHeapMatrix points = new DenseLocalOnHeapMatrix(ptsCnt, 2); - - DatasetWithObviousStructure dataset = new DatasetWithObviousStructure(10000); - - List<Vector> massCenters = dataset.generate(points); - - EuclideanDistance dist = new EuclideanDistance(); - OrderedNodesComparator comp = new OrderedNodesComparator( - dataset.centers().values().toArray(new Vector[] {}), dist); - - massCenters.sort(comp); - - KMeansLocalClusterer clusterer = new KMeansLocalClusterer(dist, 100, 1L); - - KMeansModel mdl = clusterer.cluster(points, 4); - Vector[] resCenters = mdl.centers(); - Arrays.sort(resCenters, comp); - - System.out.println("Mass centers:"); - massCenters.forEach(Tracer::showAscii); - - System.out.println("Cluster centers:"); - Arrays.asList(resCenters).forEach(Tracer::showAscii); - - System.out.println("\n>>> K-means local clusterer example completed."); - } - - /** */ - private static class OrderedNodesComparator implements Comparator<Vector> { - /** */ - private final DistanceMeasure measure; - - /** */ - List<Vector> orderedNodes; - - /** */ - OrderedNodesComparator(Vector[] orderedNodes, DistanceMeasure measure) { - this.orderedNodes = Arrays.asList(orderedNodes); - this.measure = measure; - } - - /** */ - private int findClosestNodeIndex(Vector v) { - return Functions.argmin(orderedNodes, v1 -> measure.compute(v1, v)).get1(); - } - - /** */ - @Override public int compare(Vector v1, Vector v2) { - int ind1 = findClosestNodeIndex(v1); - int ind2 = findClosestNodeIndex(v2); - - int signum = (int)Math.signum(ind1 - ind2); - - if (signum != 0) - return signum; - - return (int)Math.signum(orderedNodes.get(ind1).minus(v1).kNorm(2) - - orderedNodes.get(ind2).minus(v2).kNorm(2)); - } - } -} 
http://git-wip-us.apache.org/repos/asf/ignite/blob/9e21cec0/modules/ml/src/main/java/org/apache/ignite/ml/FuzzyCMeansModelFormat.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/FuzzyCMeansModelFormat.java b/modules/ml/src/main/java/org/apache/ignite/ml/FuzzyCMeansModelFormat.java deleted file mode 100644 index cc3d9b3..0000000 --- a/modules/ml/src/main/java/org/apache/ignite/ml/FuzzyCMeansModelFormat.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.ignite.ml; - -import java.io.Serializable; -import java.util.Arrays; -import org.apache.ignite.ml.math.Vector; -import org.apache.ignite.ml.math.distances.DistanceMeasure; - -/** Fuzzy C-Means model representation. */ -public class FuzzyCMeansModelFormat implements Serializable { - /** Centers of clusters. */ - private final Vector[] centers; - - /** Distance measure. */ - private final DistanceMeasure measure; - - /** - * Constructor that retains result of clusterization and distance measure. - * - * @param centers Centers found while clusterization. - * @param measure Distance measure. 
- */ - public FuzzyCMeansModelFormat(Vector[] centers, DistanceMeasure measure) { - this.centers = centers; - this.measure = measure; - } - - /** Distance measure used while clusterization. */ - public DistanceMeasure getMeasure() { - return measure; - } - - /** Get cluster centers. */ - public Vector[] getCenters() { - return centers; - } - - /** {@inheritDoc} */ - @Override public int hashCode() { - int res = 1; - - res = res * 37 + measure.hashCode(); - res = res * 37 + Arrays.hashCode(centers); - - return res; - } - - /** {@inheritDoc} */ - @Override public boolean equals(Object obj) { - if (this == obj) - return true; - - if (obj == null || getClass() != obj.getClass()) - return false; - - FuzzyCMeansModelFormat that = (FuzzyCMeansModelFormat) obj; - - return measure.equals(that.measure) && Arrays.deepEquals(centers, that.centers); - } -} http://git-wip-us.apache.org/repos/asf/ignite/blob/9e21cec0/modules/ml/src/main/java/org/apache/ignite/ml/KMeansModelFormat.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/KMeansModelFormat.java b/modules/ml/src/main/java/org/apache/ignite/ml/KMeansModelFormat.java deleted file mode 100644 index c013198..0000000 --- a/modules/ml/src/main/java/org/apache/ignite/ml/KMeansModelFormat.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.ignite.ml; - -import java.io.Serializable; -import java.util.Arrays; -import org.apache.ignite.ml.math.Vector; -import org.apache.ignite.ml.math.distances.DistanceMeasure; - -/** - * K-means model representation. - * - * @see Exportable - * @see Exporter - */ -public class KMeansModelFormat implements Serializable { - /** Centers of clusters. */ - private final Vector[] centers; - - /** Distance measure. */ - private final DistanceMeasure distance; - - /** */ - public KMeansModelFormat(Vector[] centers, DistanceMeasure distance) { - this.centers = centers; - this.distance = distance; - } - - /** */ - public DistanceMeasure getDistance() { - return distance; - } - - /** */ - public Vector[] getCenters() { - return centers; - } - - /** {@inheritDoc} */ - @Override public int hashCode() { - int res = 1; - - res = res * 37 + distance.hashCode(); - res = res * 37 + Arrays.hashCode(centers); - - return res; - } - - /** {@inheritDoc} */ - @Override public boolean equals(Object obj) { - if (this == obj) - return true; - - if (obj == null || getClass() != obj.getClass()) - return false; - - KMeansModelFormat that = (KMeansModelFormat)obj; - - return distance.equals(that.distance) && Arrays.deepEquals(centers, that.centers); - } - -} http://git-wip-us.apache.org/repos/asf/ignite/blob/9e21cec0/modules/ml/src/main/java/org/apache/ignite/ml/clustering/BaseFuzzyCMeansClusterer.java ---------------------------------------------------------------------- diff --git 
a/modules/ml/src/main/java/org/apache/ignite/ml/clustering/BaseFuzzyCMeansClusterer.java b/modules/ml/src/main/java/org/apache/ignite/ml/clustering/BaseFuzzyCMeansClusterer.java deleted file mode 100644 index 2b2febf..0000000 --- a/modules/ml/src/main/java/org/apache/ignite/ml/clustering/BaseFuzzyCMeansClusterer.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.ignite.ml.clustering; - -import org.apache.ignite.ml.math.Matrix; -import org.apache.ignite.ml.math.Vector; -import org.apache.ignite.ml.math.distances.DistanceMeasure; -import org.apache.ignite.ml.math.exceptions.ConvergenceException; -import org.apache.ignite.ml.math.exceptions.MathIllegalArgumentException; - -/** The abstract class that defines the basic interface of Fuzzy C-Means clusterers */ -public abstract class BaseFuzzyCMeansClusterer<T extends Matrix> implements Clusterer<T, FuzzyCMeansModel> { - /** Distance measure. */ - protected DistanceMeasure measure; - - /** Specific constant which is used in calculating of membership matrix. 
*/ - protected double exponentialWeight; - - /** The maximum distance between old and new centers or the maximum difference between new and old membership matrix - * elements for which algorithm must stop. */ - protected double maxDelta; - - /** The flag that tells when algorithm should stop. */ - protected StopCondition stopCond; - - /** - * Constructor that stores some required parameters. - * - * @param measure Distance measure. - * @param exponentialWeight Specific constant which is used in calculating of membership matrix. - * @param stopCond Flag that tells when algorithm should stop. - * @param maxDelta The maximum distance between old and new centers or maximum difference between new and old - * membership matrix elements for which algorithm must stop. - */ - protected BaseFuzzyCMeansClusterer(DistanceMeasure measure, double exponentialWeight, StopCondition stopCond, - double maxDelta) { - this.measure = measure; - this.exponentialWeight = exponentialWeight; - this.stopCond = stopCond; - this.maxDelta = maxDelta; - } - - /** - * Perform a cluster analysis on the given set of points. - * - * @param points The set of points. - * @return A list of clusters. - * @throws MathIllegalArgumentException If points are null or the number of data points is not compatible with this - * clusterer. - * @throws ConvergenceException If the algorithm has not yet converged after the maximum number of iterations has - * been exceeded. - */ - public abstract FuzzyCMeansModel cluster(T points, int k); - - /** - * Calculates the distance between two vectors. * with the configured {@link DistanceMeasure}. - * - * @return The distance between two points. - */ - protected double distance(final Vector v1, final Vector v2) { - return measure.compute(v1, v2); - } - - /** Enumeration that contains different conditions under which algorithm must stop. */ - public enum StopCondition { - /** Algorithm stops if the maximum distance between new and old centers is less than {@link #maxDelta}. 
*/ - STABLE_CENTERS, - - /** - * Algorithm stops if the maximum difference between elements of new and old membership matrix is less than - * {@link #maxDelta}. - */ - STABLE_MEMBERSHIPS - } -} http://git-wip-us.apache.org/repos/asf/ignite/blob/9e21cec0/modules/ml/src/main/java/org/apache/ignite/ml/clustering/BaseKMeansClusterer.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/clustering/BaseKMeansClusterer.java b/modules/ml/src/main/java/org/apache/ignite/ml/clustering/BaseKMeansClusterer.java deleted file mode 100644 index 521437c..0000000 --- a/modules/ml/src/main/java/org/apache/ignite/ml/clustering/BaseKMeansClusterer.java +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.ignite.ml.clustering; - -import java.util.List; -import org.apache.ignite.lang.IgniteBiTuple; -import org.apache.ignite.ml.math.Matrix; -import org.apache.ignite.ml.math.Vector; -import org.apache.ignite.ml.math.distances.DistanceMeasure; -import org.apache.ignite.ml.math.exceptions.ConvergenceException; -import org.apache.ignite.ml.math.exceptions.MathIllegalArgumentException; - -/** - * This class is partly based on the corresponding class from Apache Common Math lib. - */ -public abstract class BaseKMeansClusterer<T extends Matrix> implements Clusterer<T, KMeansModel> { - /** The distance measure to use. */ - private DistanceMeasure measure; - - /** - * Build a new clusterer with the given {@link DistanceMeasure}. - * - * @param measure the distance measure to use - */ - protected BaseKMeansClusterer(final DistanceMeasure measure) { - this.measure = measure; - } - - /** - * Perform a cluster analysis on the given set of points. - * - * @param points the set of points - * @return a {@link List} of clusters - * @throws MathIllegalArgumentException if points are null or the number of data points is not compatible with this - * clusterer - * @throws ConvergenceException if the algorithm has not yet converged after the maximum number of iterations has - * been exceeded - */ - public abstract KMeansModel cluster(T points, int k) - throws MathIllegalArgumentException, ConvergenceException; - - /** - * Returns the {@link DistanceMeasure} instance used by this clusterer. - * - * @return the distance measure - */ - public DistanceMeasure getDistanceMeasure() { - return measure; - } - - /** - * Calculates the distance between two vectors. - * with the configured {@link DistanceMeasure}. - * - * @return the distance between the two clusterables - */ - protected double distance(final Vector v1, final Vector v2) { - return measure.compute(v1, v2); - } - - /** - * Find the closest cluster center index and distance to it from a given point. 
- * - * @param centers Centers to look in. - * @param pnt Point. - */ - protected IgniteBiTuple<Integer, Double> findClosest(Vector[] centers, Vector pnt) { - double bestDistance = Double.POSITIVE_INFINITY; - int bestInd = 0; - - for (int i = 0; i < centers.length; i++) { - double dist = distance(centers[i], pnt); - if (dist < bestDistance) { - bestDistance = dist; - bestInd = i; - } - } - - return new IgniteBiTuple<>(bestInd, bestDistance); - } -} http://git-wip-us.apache.org/repos/asf/ignite/blob/9e21cec0/modules/ml/src/main/java/org/apache/ignite/ml/clustering/Clusterer.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/clustering/Clusterer.java b/modules/ml/src/main/java/org/apache/ignite/ml/clustering/Clusterer.java deleted file mode 100644 index 204e28a..0000000 --- a/modules/ml/src/main/java/org/apache/ignite/ml/clustering/Clusterer.java +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.ignite.ml.clustering; - -import org.apache.ignite.ml.Model; - -/** - * Base interface for clusterers. 
- */ -public interface Clusterer<P, M extends Model> { - /** - * Cluster given points set into k clusters. - * - * @param points Points set. - * @param k Clusters count. - */ - public M cluster(P points, int k); -} http://git-wip-us.apache.org/repos/asf/ignite/blob/9e21cec0/modules/ml/src/main/java/org/apache/ignite/ml/clustering/ClusterizationModel.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/clustering/ClusterizationModel.java b/modules/ml/src/main/java/org/apache/ignite/ml/clustering/ClusterizationModel.java deleted file mode 100644 index 99afec5..0000000 --- a/modules/ml/src/main/java/org/apache/ignite/ml/clustering/ClusterizationModel.java +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.ignite.ml.clustering; - -import org.apache.ignite.ml.Model; - -/** Base interface for all clusterization models. */ -public interface ClusterizationModel<P, V> extends Model<P, V> { - /** Gets the clusters count. */ - public int clustersCount(); - - /** Get cluster centers. 
*/ - public P[] centers(); -} http://git-wip-us.apache.org/repos/asf/ignite/blob/9e21cec0/modules/ml/src/main/java/org/apache/ignite/ml/clustering/FuzzyCMeansDistributedClusterer.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/clustering/FuzzyCMeansDistributedClusterer.java b/modules/ml/src/main/java/org/apache/ignite/ml/clustering/FuzzyCMeansDistributedClusterer.java deleted file mode 100644 index 8823c10..0000000 --- a/modules/ml/src/main/java/org/apache/ignite/ml/clustering/FuzzyCMeansDistributedClusterer.java +++ /dev/null @@ -1,512 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.ignite.ml.clustering; - -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.Random; -import java.util.UUID; -import java.util.concurrent.ConcurrentHashMap; -import java.util.stream.Collectors; -import javax.cache.Cache; -import org.apache.ignite.internal.util.GridArgumentCheck; -import org.apache.ignite.ml.math.Vector; -import org.apache.ignite.ml.math.VectorUtils; -import org.apache.ignite.ml.math.distances.DistanceMeasure; -import org.apache.ignite.ml.math.distributed.CacheUtils; -import org.apache.ignite.ml.math.distributed.keys.impl.SparseMatrixKey; -import org.apache.ignite.ml.math.exceptions.ConvergenceException; -import org.apache.ignite.ml.math.exceptions.MathIllegalArgumentException; -import org.apache.ignite.ml.math.functions.Functions; -import org.apache.ignite.ml.math.functions.IgniteBiFunction; -import org.apache.ignite.ml.math.functions.IgniteSupplier; -import org.apache.ignite.ml.math.impls.matrix.DenseLocalOnHeapMatrix; -import org.apache.ignite.ml.math.impls.matrix.SparseDistributedMatrix; -import org.apache.ignite.ml.math.impls.storage.matrix.SparseDistributedMatrixStorage; -import org.apache.ignite.ml.math.impls.vector.DenseLocalOnHeapVector; -import org.apache.ignite.ml.math.util.MatrixUtil; - -/** This class implements distributed version of Fuzzy C-Means clusterization of equal-weighted points. */ -public class FuzzyCMeansDistributedClusterer extends BaseFuzzyCMeansClusterer<SparseDistributedMatrix> { - /** Random numbers generator which is used in centers selection. */ - private Random rnd; - - /** The value that is used to initialize random numbers generator. */ - private long seed; - - /** The number of initialization steps each of which adds some number of candidates for being a center. */ - private int initSteps; - - /** The maximum number of iterations of K-Means algorithm which selects the required number of centers. 
*/ - private int kMeansMaxIterations; - - /** The maximum number of FCM iterations. */ - private int cMeansMaxIterations; - - /** - * Constructor that retains all required parameters. - * - * @param measure Distance measure. - * @param exponentialWeight Specific constant which is used in calculating of membership matrix. - * @param stopCond Flag that tells when algorithm should stop. - * @param maxDelta The maximum distance between old and new centers or maximum difference between new and old - * membership matrix elements for which algorithm must stop. - * @param cMeansMaxIterations The maximum number of FCM iterations. - * @param seed Seed for random numbers generator. - * @param initSteps Number of steps of primary centers selection (the more steps, the more candidates). - * @param kMeansMaxIterations The maximum number of K-Means iteration in primary centers selection. - */ - public FuzzyCMeansDistributedClusterer(DistanceMeasure measure, double exponentialWeight, - StopCondition stopCond, double maxDelta, int cMeansMaxIterations, - Long seed, int initSteps, int kMeansMaxIterations) { - super(measure, exponentialWeight, stopCond, maxDelta); - - this.seed = seed != null ? 
seed : new Random().nextLong(); - this.initSteps = initSteps; - this.cMeansMaxIterations = cMeansMaxIterations; - this.kMeansMaxIterations = kMeansMaxIterations; - rnd = new Random(this.seed); - } - - /** {@inheritDoc} */ - @Override public FuzzyCMeansModel cluster(SparseDistributedMatrix points, int k) - throws MathIllegalArgumentException, ConvergenceException { - GridArgumentCheck.notNull(points, "points"); - - if (k < 2) - throw new MathIllegalArgumentException("The number of clusters is less than 2"); - - Vector[] centers = initializeCenters(points, k); - - MembershipsAndSums membershipsAndSums = null; - - int iteration = 0; - boolean finished = false; - while (!finished && iteration < cMeansMaxIterations) { - MembershipsAndSums newMembershipsAndSums = calculateMembership(points, centers); - Vector[] newCenters = calculateNewCenters(points, newMembershipsAndSums, k); - - if (stopCond == StopCondition.STABLE_CENTERS) - finished = isFinished(centers, newCenters); - else - finished = isFinished(membershipsAndSums, newMembershipsAndSums); - - centers = newCenters; - membershipsAndSums = newMembershipsAndSums; - - iteration++; - } - - if (iteration == cMeansMaxIterations) - throw new ConvergenceException("Fuzzy C-Means algorithm has not converged after " + - Integer.toString(iteration) + " iterations"); - - return new FuzzyCMeansModel(centers, measure); - } - - /** - * Choose k primary centers from source points. - * - * @param points Matrix with source points. - * @param k Number of centers. - * @return Array of primary centers. 
- */ - private Vector[] initializeCenters(SparseDistributedMatrix points, int k) { - int pointsNum = points.rowSize(); - - Vector firstCenter = points.viewRow(rnd.nextInt(pointsNum)); - - List<Vector> centers = new ArrayList<>(); - List<Vector> newCenters = new ArrayList<>(); - - centers.add(firstCenter); - newCenters.add(firstCenter); - - ConcurrentHashMap<Integer, Double> costs = new ConcurrentHashMap<>(); - - int step = 0; - UUID uuid = points.getUUID(); - String cacheName = ((SparseDistributedMatrixStorage) points.getStorage()).cacheName(); - - while(step < initSteps) { - ConcurrentHashMap<Integer, Double> newCosts = getNewCosts(cacheName, uuid, newCenters); - - for (Integer key : newCosts.keySet()) - costs.merge(key, newCosts.get(key), Math::min); - - double costsSum = costs.values().stream().mapToDouble(Double::valueOf).sum(); - - newCenters = getNewCenters(cacheName, uuid, costs, costsSum, k); - centers.addAll(newCenters); - - step++; - } - - return chooseKCenters(cacheName, uuid, centers, k); - } - - /** - * Calculate new distances from each point to the nearest center. - * - * @param cacheName Cache name of point matrix. - * @param uuid Uuid of point matrix. - * @param newCenters The list of centers that was added on previous step. - * @return Hash map with distances. 
- */ - private ConcurrentHashMap<Integer, Double> getNewCosts(String cacheName, UUID uuid, - List<Vector> newCenters) { - return CacheUtils.distributedFold(cacheName, - (IgniteBiFunction<Cache.Entry<SparseMatrixKey, ConcurrentHashMap<Integer, Double>>, - ConcurrentHashMap<Integer, Double>, - ConcurrentHashMap<Integer, Double>>)(vectorWithIndex, map) -> { - Vector vector = VectorUtils.fromMap(vectorWithIndex.getValue(), false); - - for (Vector center : newCenters) - map.merge(vectorWithIndex.getKey().index(), distance(vector, center), Functions.MIN); - - return map; - }, - key -> key.dataStructureId().equals(uuid), - (map1, map2) -> { - map1.putAll(map2); - return map1; - }, - ConcurrentHashMap::new); - } - - /** - * Choose some number of center candidates from source points according to their costs. - * - * @param cacheName Cache name of point matrix. - * @param uuid Uuid of point matrix. - * @param costs Hash map with costs (distances to nearest center). - * @param costsSum The sum of costs. - * @param k The estimated number of centers. - * @return The list of new candidates. - */ - private List<Vector> getNewCenters(String cacheName, UUID uuid, - ConcurrentHashMap<Integer, Double> costs, double costsSum, int k) { - return CacheUtils.distributedFold(cacheName, - (IgniteBiFunction<Cache.Entry<SparseMatrixKey, Map<Integer, Double>>, - List<Vector>, - List<Vector>>)(vectorWithIndex, centers) -> { - Integer idx = vectorWithIndex.getKey().index(); - Vector vector = VectorUtils.fromMap(vectorWithIndex.getValue(), false); - - double probability = (costs.get(idx) * 2.0 * k) / costsSum; - - if (rnd.nextDouble() < probability) - centers.add(vector); - - return centers; - }, - key -> key.dataStructureId().equals(uuid), - (list1, list2) -> { - list1.addAll(list2); - return list1; - }, - ArrayList::new); - } - - /** - * Weight candidates and use K-Means to choose required number of them. - * - * @param cacheName Cache name of the point matrix. 
- * @param uuid Uuid of the point matrix. - * @param centers The list of candidates. - * @param k The estimated number of centers. - * @return {@code k} centers. - */ - private Vector[] chooseKCenters(String cacheName, UUID uuid, List<Vector> centers, int k) { - centers = centers.stream().distinct().collect(Collectors.toList()); - - ConcurrentHashMap<Integer, Integer> weightsMap = weightCenters(cacheName, uuid, centers); - - List<Double> weights = new ArrayList<>(centers.size()); - - for (int i = 0; i < centers.size(); i++) - weights.add(i, Double.valueOf(weightsMap.getOrDefault(i, 0))); - - DenseLocalOnHeapMatrix centersMatrix = MatrixUtil.fromList(centers, true); - - KMeansLocalClusterer clusterer = new KMeansLocalClusterer(measure, kMeansMaxIterations, seed); - return clusterer.cluster(centersMatrix, k, weights).centers(); - } - - /** - * Weight each center with number of points for which this center is the nearest. - * - * @param cacheName Cache name of the point matrix. - * @param uuid Uuid of the point matrix. - * @param centers The list of centers. - * @return Hash map with weights. - */ - public ConcurrentHashMap<Integer, Integer> weightCenters(String cacheName, UUID uuid, List<Vector> centers) { - if (centers.size() == 0) - return new ConcurrentHashMap<>(); - - return CacheUtils.distributedFold(cacheName, - (IgniteBiFunction<Cache.Entry<SparseMatrixKey, ConcurrentHashMap<Integer, Double>>, - ConcurrentHashMap<Integer, Integer>, - ConcurrentHashMap<Integer, Integer>>)(vectorWithIndex, counts) -> { - Vector vector = VectorUtils.fromMap(vectorWithIndex.getValue(), false); - - int nearest = 0; - double minDistance = distance(centers.get(nearest), vector); - - for (int i = 0; i < centers.size(); i++) { - double currDistance = distance(centers.get(i), vector); - if (currDistance < minDistance) { - minDistance = currDistance; - nearest = i; - } - } - - counts.compute(nearest, (index, value) -> value == null ? 
1 : value + 1); - - return counts; - }, - key -> key.dataStructureId().equals(uuid), - (map1, map2) -> { - map1.putAll(map2); - return map1; - }, - ConcurrentHashMap::new); - } - - /** - * Calculate matrix of membership coefficients for each point and each center. - * - * @param points Matrix with source points. - * @param centers Array of current centers. - * @return Membership matrix and sums of membership coefficients for each center. - */ - private MembershipsAndSums calculateMembership(SparseDistributedMatrix points, Vector[] centers) { - String cacheName = ((SparseDistributedMatrixStorage) points.getStorage()).cacheName(); - UUID uuid = points.getUUID(); - double fuzzyMembershipCoefficient = 2 / (exponentialWeight - 1); - - MembershipsAndSumsSupplier supplier = new MembershipsAndSumsSupplier(centers.length); - - return CacheUtils.distributedFold(cacheName, - (IgniteBiFunction<Cache.Entry<SparseMatrixKey, ConcurrentHashMap<Integer, Double>>, - MembershipsAndSums, - MembershipsAndSums>)(vectorWithIndex, membershipsAndSums) -> { - Integer idx = vectorWithIndex.getKey().index(); - Vector pnt = VectorUtils.fromMap(vectorWithIndex.getValue(), false); - Vector distances = new DenseLocalOnHeapVector(centers.length); - Vector pntMemberships = new DenseLocalOnHeapVector(centers.length); - - for (int i = 0; i < centers.length; i++) - distances.setX(i, distance(centers[i], pnt)); - - for (int i = 0; i < centers.length; i++) { - double invertedFuzzyWeight = 0.0; - - for (int j = 0; j < centers.length; j++) { - double val = Math.pow(distances.getX(i) / distances.getX(j), fuzzyMembershipCoefficient); - if (Double.isNaN(val)) - val = 1.0; - - invertedFuzzyWeight += val; - } - - double membership = Math.pow(1.0 / invertedFuzzyWeight, exponentialWeight); - pntMemberships.setX(i, membership); - } - - membershipsAndSums.memberships.put(idx, pntMemberships); - membershipsAndSums.membershipSums = membershipsAndSums.membershipSums.plus(pntMemberships); - - return 
membershipsAndSums; - }, - key -> key.dataStructureId().equals(uuid), - (mem1, mem2) -> { - mem1.merge(mem2); - return mem1; - }, - supplier); - } - - /** - * Calculate new centers according to membership matrix. - * - * @param points Matrix with source points. - * @param membershipsAndSums Membership matrix and sums of membership coefficient for each center. - * @param k The number of centers. - * @return Array of new centers. - */ - private Vector[] calculateNewCenters(SparseDistributedMatrix points, MembershipsAndSums membershipsAndSums, int k) { - String cacheName = ((SparseDistributedMatrixStorage) points.getStorage()).cacheName(); - UUID uuid = points.getUUID(); - - CentersArraySupplier supplier = new CentersArraySupplier(k, points.columnSize()); - - Vector[] centers = CacheUtils.distributedFold(cacheName, - (IgniteBiFunction<Cache.Entry<SparseMatrixKey, ConcurrentHashMap<Integer, Double>>, - Vector[], - Vector[]>)(vectorWithIndex, centerSums) -> { - Integer idx = vectorWithIndex.getKey().index(); - Vector pnt = MatrixUtil.localCopyOf(VectorUtils.fromMap(vectorWithIndex.getValue(), false)); - Vector pntMemberships = membershipsAndSums.memberships.get(idx); - - for (int i = 0; i < k; i++) { - Vector weightedPnt = pnt.times(pntMemberships.getX(i)); - centerSums[i] = centerSums[i].plus(weightedPnt); - } - - return centerSums; - }, - key -> key.dataStructureId().equals(uuid), - (sums1, sums2) -> { - for (int i = 0; i < k; i++) - sums1[i] = sums1[i].plus(sums2[i]); - - return sums1; - }, - supplier); - - for (int i = 0; i < k; i++) - centers[i] = centers[i].divide(membershipsAndSums.membershipSums.getX(i)); - - return centers; - } - - /** - * Check if centers have moved insignificantly. - * - * @param centers Old centers. - * @param newCenters New centers. - * @return The result of comparison. 
- */ - private boolean isFinished(Vector[] centers, Vector[] newCenters) { - int numCenters = centers.length; - - for (int i = 0; i < numCenters; i++) - if (distance(centers[i], newCenters[i]) > maxDelta) - return false; - - return true; - } - - /** - * Check memberships difference. - * - * @param membershipsAndSums Old memberships. - * @param newMembershipsAndSums New memberships. - * @return The result of comparison. - */ - private boolean isFinished(MembershipsAndSums membershipsAndSums, MembershipsAndSums newMembershipsAndSums) { - if (membershipsAndSums == null) - return false; - - double currMaxDelta = 0.0; - for (Integer key : membershipsAndSums.memberships.keySet()) { - double distance = measure.compute(membershipsAndSums.memberships.get(key), - newMembershipsAndSums.memberships.get(key)); - if (distance > currMaxDelta) - currMaxDelta = distance; - } - - return currMaxDelta <= maxDelta; - } - - /** Service class used to optimize counting of membership sums. */ - private class MembershipsAndSums { - /** Membership matrix. */ - public ConcurrentHashMap<Integer, Vector> memberships = new ConcurrentHashMap<>(); - - /** Membership sums. */ - public Vector membershipSums; - - /** - * Default constructor. - * - * @param k The number of centers. - */ - public MembershipsAndSums(int k) { - membershipSums = new DenseLocalOnHeapVector(k); - } - - /** - * Merge results of calculation for different parts of points. - * @param another Another part of memberships and sums. - */ - public void merge(MembershipsAndSums another) { - memberships.putAll(another.memberships); - membershipSums = membershipSums.plus(another.membershipSums); - } - } - - /** Service class that is used to create new {@link MembershipsAndSums} instances. */ - private class MembershipsAndSumsSupplier implements IgniteSupplier<MembershipsAndSums> { - /** The number of centers */ - int k; - - /** - * Constructor that retains the number of centers. - * - * @param k The number of centers. 
- */ - public MembershipsAndSumsSupplier(int k) { - this.k = k; - } - - /** - * Create new instance of {@link MembershipsAndSums}. - * - * @return {@link MembershipsAndSums} object. - */ - @Override public MembershipsAndSums get() { - return new MembershipsAndSums(k); - } - } - - /** Service class that is used to create new arrays of vectors. */ - private class CentersArraySupplier implements IgniteSupplier<Vector[]> { - /** The number of centers. */ - int k; - - /** The number of coordinates. */ - int dim; - - /** - * Constructor that retains all required parameters. - * - * @param k The number of centers. - * @param dim The number of coordinates. - */ - public CentersArraySupplier(int k, int dim) { - this.k = k; - this.dim = dim; - } - - /** - * Create new array of vectors. - * - * @return Array of vectors. - */ - @Override public Vector[] get() { - DenseLocalOnHeapVector[] centerSumsArr = new DenseLocalOnHeapVector[k]; - for (int i = 0; i < k; i++) - centerSumsArr[i] = new DenseLocalOnHeapVector(dim); - return centerSumsArr; - } - } -} http://git-wip-us.apache.org/repos/asf/ignite/blob/9e21cec0/modules/ml/src/main/java/org/apache/ignite/ml/clustering/FuzzyCMeansLocalClusterer.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/clustering/FuzzyCMeansLocalClusterer.java b/modules/ml/src/main/java/org/apache/ignite/ml/clustering/FuzzyCMeansLocalClusterer.java deleted file mode 100644 index a1b6d3f..0000000 --- a/modules/ml/src/main/java/org/apache/ignite/ml/clustering/FuzzyCMeansLocalClusterer.java +++ /dev/null @@ -1,254 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */

package org.apache.ignite.ml.clustering;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import org.apache.ignite.internal.util.GridArgumentCheck;
import org.apache.ignite.ml.math.Matrix;
import org.apache.ignite.ml.math.Vector;
import org.apache.ignite.ml.math.distances.DistanceMeasure;
import org.apache.ignite.ml.math.exceptions.ConvergenceException;
import org.apache.ignite.ml.math.exceptions.MathIllegalArgumentException;
import org.apache.ignite.ml.math.impls.matrix.DenseLocalOnHeapMatrix;
import org.apache.ignite.ml.math.impls.vector.DenseLocalOnHeapVector;

/** Implements the local version of Fuzzy C-Means algorithm for weighted points. */
public class FuzzyCMeansLocalClusterer extends BaseFuzzyCMeansClusterer<DenseLocalOnHeapMatrix> implements
    WeightedClusterer<DenseLocalOnHeapMatrix, FuzzyCMeansModel> {
    /** The maximum number of iterations. */
    private final int maxIterations;

    /** The random numbers generator that is used to choose primary centers. */
    private final Random rnd;

    /**
     * Constructor that retains all required parameters.
     *
     * @param measure Distance measure.
     * @param exponentialWeight Specific constant which is used in calculating of membership matrix.
     * @param stopCond Flag that tells when algorithm should stop.
     * @param maxDelta The maximum distance between old and new centers or maximum difference between new and old
     * membership matrix elements for which algorithm must stop.
     * @param maxIterations The maximum number of FCM iterations.
     * @param seed Seed for the random numbers generator; if {@code null}, an unseeded generator is used.
     */
    public FuzzyCMeansLocalClusterer(DistanceMeasure measure, double exponentialWeight, StopCondition stopCond,
        double maxDelta, int maxIterations, Long seed) {
        super(measure, exponentialWeight, stopCond, maxDelta);
        this.maxIterations = maxIterations;
        rnd = seed != null ? new Random(seed) : new Random();
    }

    /**
     * {@inheritDoc}
     *
     * <p>Every point is given weight {@code 1.0}.</p>
     */
    @Override public FuzzyCMeansModel cluster(DenseLocalOnHeapMatrix points, int k) {
        List<Double> ones = new ArrayList<>(Collections.nCopies(points.rowSize(), 1.0));
        return cluster(points, k, ones);
    }

    /**
     * {@inheritDoc}
     *
     * @throws MathIllegalArgumentException If the number of weights differs from the number of points or
     * {@code k} is less than 2.
     * @throws ConvergenceException If the stop condition has not been met after {@code maxIterations} iterations.
     */
    @Override public FuzzyCMeansModel cluster(DenseLocalOnHeapMatrix points, int k, List<Double> weights)
        throws MathIllegalArgumentException, ConvergenceException {
        GridArgumentCheck.notNull(points, "points");
        GridArgumentCheck.notNull(weights, "weights");

        if (points.rowSize() != weights.size())
            throw new MathIllegalArgumentException("The number of points and the number of weights are not equal");

        if (k < 2)
            throw new MathIllegalArgumentException("The number of clusters is less than 2");

        Matrix centers = new DenseLocalOnHeapMatrix(k, points.columnSize());
        Matrix distances = new DenseLocalOnHeapMatrix(k, points.rowSize());
        Matrix membership = new DenseLocalOnHeapMatrix(k, points.rowSize());
        Vector weightsVector = new DenseLocalOnHeapVector(weights.size());

        for (int i = 0; i < weights.size(); i++)
            weightsVector.setX(i, weights.get(i));

        initializeCenters(centers, points, k, weightsVector);

        int iteration = 0;
        boolean finished = false;

        while (iteration < maxIterations && !finished) {
            calculateDistances(distances, points, centers);
            Matrix newMembership = calculateMembership(distances, weightsVector);
            Matrix newCenters = calculateNewCenters(points, newMembership);

            if (this.stopCond == StopCondition.STABLE_CENTERS)
                finished = areCentersStable(centers, newCenters);
            else
                finished = areMembershipStable(membership, newMembership);

            centers = newCenters;
            membership = newMembership;
            iteration++;
        }

        // Decide on the stop flag, not on the iteration counter: a run that converges exactly on the
        // last permitted iteration is a success and must not be reported as a failure.
        if (!finished)
            throw new ConvergenceException("Fuzzy C-Means algorithm has not converged after " +
                iteration + " iterations");

        Vector[] centersArr = new Vector[k];

        for (int i = 0; i < k; i++)
            centersArr[i] = centers.getRow(i);

        return new FuzzyCMeansModel(centersArr, measure);
    }

    /**
     * Choose {@code k} centers according to their weights.
     *
     * <p>The first center is a uniformly random point. Each further center is drawn with probability
     * proportional to the point's weighted distance to the first center; a drawn point gets cost zero
     * afterwards.</p>
     *
     * @param centers Output matrix containing primary centers.
     * @param points Matrix of source points.
     * @param k The number of centers.
     * @param weights Vector of weights.
     */
    private void initializeCenters(Matrix centers, Matrix points, int k, Vector weights) {
        int numPoints = points.rowSize();

        Vector firstCenter = points.viewRow(rnd.nextInt(numPoints));
        centers.setRow(0, firstCenter.getStorage().data());

        Vector costs = points.foldRows(vector -> distance(vector, firstCenter));
        costs = costs.times(weights);

        double sum = costs.sum();

        for (int i = 1; i < k; i++) {
            double probe = rnd.nextDouble() * sum;
            double cntr = 0;
            int id = 0;

            // Walk the cumulative cost until it reaches the random probe; falls back to point 0 otherwise.
            for (int j = 0; j < numPoints; j++) {
                cntr += costs.getX(j);

                if (cntr >= probe) {
                    id = j;
                    break;
                }
            }

            centers.setRow(i, points.viewRow(id).getStorage().data());

            // Remove the chosen point's cost from the remaining total.
            sum -= costs.get(id);
            costs.set(id, 0.0);
        }
    }

    /**
     * Calculate matrix of distances from each point to each center.
     *
     * @param distances Output matrix; element {@code (i, j)} receives the distance from center {@code i} to
     * point {@code j}.
     * @param points Matrix that contains source points.
     * @param centers Matrix that contains centers.
     */
    private void calculateDistances(Matrix distances, Matrix points, Matrix centers) {
        int numPoints = points.rowSize();
        int numCenters = centers.rowSize();

        for (int i = 0; i < numCenters; i++)
            for (int j = 0; j < numPoints; j++)
                distances.set(i, j, distance(centers.viewRow(i), points.viewRow(j)));
    }

    /**
     * Calculate membership matrix.
     *
     * @param distances Matrix of distances (centers by points).
     * @param weights Vector of weights.
     * @return New membership matrix of the same size as {@code distances}; each element is already raised
     * to the power of {@code exponentialWeight}.
     */
    private Matrix calculateMembership(Matrix distances, Vector weights) {
        Matrix newMembership = new DenseLocalOnHeapMatrix(distances.rowSize(), distances.columnSize());
        int numPoints = distances.columnSize();
        int numCenters = distances.rowSize();
        double fuzzyMembershipCoefficient = 2 / (exponentialWeight - 1);

        for (int i = 0; i < numCenters; i++) {
            for (int j = 0; j < numPoints; j++) {
                double invertedFuzzyWeight = 0.0;

                for (int k = 0; k < numCenters; k++) {
                    double val = Math.pow(distances.get(i, j) / distances.get(k, j),
                        fuzzyMembershipCoefficient);

                    // A NaN ratio (e.g. 0 / 0 when the point coincides with both centers) counts as 1.
                    if (Double.isNaN(val))
                        val = 1.0;

                    invertedFuzzyWeight += val;
                }

                double weight = 1.0 / invertedFuzzyWeight * weights.getX(j);
                newMembership.setX(i, j, Math.pow(weight, exponentialWeight));
            }
        }

        return newMembership;
    }

    /**
     * Calculate new centers using membership matrix.
     *
     * @param points Matrix of source points.
     * @param membership Matrix that contains membership coefficients.
     * @return Matrix that contains new centers.
     */
    private Matrix calculateNewCenters(Matrix points, Matrix membership) {
        Vector membershipSums = membership.foldRows(Vector::sum);
        Matrix newCenters = membership.times(points);

        int numCenters = newCenters.rowSize();

        // NOTE(review): the value returned by divide() is discarded, so this normalization only takes
        // effect if divide() on a row view writes through to newCenters - confirm against the Vector API.
        for (int i = 0; i < numCenters; i++)
            newCenters.viewRow(i).divide(membershipSums.getX(i));

        return newCenters;
    }

    /**
     * Check if centers have moved insignificantly.
     *
     * @param centers Old centers.
     * @param newCenters New centers.
     * @return {@code true} if no center moved farther than {@code maxDelta}.
     */
    private boolean areCentersStable(Matrix centers, Matrix newCenters) {
        int numCenters = centers.rowSize();

        for (int i = 0; i < numCenters; i++)
            if (distance(centers.viewRow(i), newCenters.viewRow(i)) > maxDelta)
                return false;

        return true;
    }

    /**
     * Check if membership matrix has changed insignificantly.
     *
     * @param membership Old membership matrix.
     * @param newMembership New membership matrix.
     * @return {@code true} if no element changed by more than {@code maxDelta}.
     */
    private boolean areMembershipStable(Matrix membership, Matrix newMembership) {
        int numCenters = membership.rowSize();
        int numPoints = membership.columnSize();

        for (int i = 0; i < numCenters; i++)
            for (int j = 0; j < numPoints; j++)
                if (Math.abs(newMembership.getX(i, j) - membership.getX(i, j)) > maxDelta)
                    return false;

        return true;
    }
}
