Repository: mahout Updated Branches: refs/heads/master 9b4eabb9c -> c29496cb1
MAHOUT-1976 Canopy Clustering closes apache/mahout#314 Project: http://git-wip-us.apache.org/repos/asf/mahout/repo Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/c29496cb Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/c29496cb Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/c29496cb Branch: refs/heads/master Commit: c29496cb11372baddbb76acdee51530347525645 Parents: 9b4eabb Author: rawkintrevo <[email protected]> Authored: Sat May 20 22:36:02 2017 -0500 Committer: rawkintrevo <[email protected]> Committed: Sat May 20 22:36:02 2017 -0500 ---------------------------------------------------------------------- .../standard/ClusteringSuite.scala | 26 +++ .../math/algorithms/ClusteringSuite.scala | 25 +++ .../math/algorithms/clustering/Canopy.scala | 158 +++++++++++++++++++ .../algorithms/clustering/ClusteringModel.scala | 45 ++++++ .../common/distance/DistanceMetrics.scala | 48 ++++++ .../math/algorithms/ClusteringSuiteBase.scala | 48 ++++++ .../math/algorithms/ClusteringSuite.scala | 25 +++ website/docs/_includes/algo_navbar.html | 12 +- .../algorithms/clustering/canopy/Canopy.png | Bin 0 -> 48061 bytes .../algorithms/clustering/canopy/Canopy10.png | Bin 0 -> 42414 bytes .../algorithms/clustering/canopy/SampleData.png | Bin 0 -> 32587 bytes .../docs/algorithms/clustering/canopy/index.md | 128 +++++++++++++++ .../algorithms/clustering/distance-metrics.md | 82 ++++++++++ website/docs/algorithms/clustering/index.md | 6 + .../map-reduce/clustering/canopy-clustering.md | 3 + 15 files changed, 604 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/ClusteringSuite.scala ---------------------------------------------------------------------- diff --git a/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/ClusteringSuite.scala b/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/ClusteringSuite.scala new file mode 100644 index 0000000..ea86c91 --- /dev/null +++ b/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/ClusteringSuite.scala @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.flinkbindings.standard + +import org.apache.mahout.flinkbindings.DistributedFlinkSuite +import org.apache.mahout.math.algorithms.ClusteringSuiteBase +import org.scalatest.FunSuite + +class ClusteringSuite extends FunSuite + with DistributedFlinkSuite with ClusteringSuiteBase + http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/h2o/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuite.scala ---------------------------------------------------------------------- diff --git a/h2o/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuite.scala b/h2o/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuite.scala new file mode 100644 index 0000000..61d54ac --- /dev/null +++ b/h2o/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuite.scala @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.math.algorithms + +import org.apache.mahout.h2obindings.test.DistributedH2OSuite +import org.scalatest.FunSuite + +class ClusteringSuite extends FunSuite + with DistributedH2OSuite with ClusteringSuiteBase + http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/math-scala/src/main/scala/org/apache/mahout/math/algorithms/clustering/Canopy.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/clustering/Canopy.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/clustering/Canopy.scala new file mode 100644 index 0000000..96d1968 --- /dev/null +++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/clustering/Canopy.scala @@ -0,0 +1,158 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.mahout.math.algorithms.clustering + + + +import org.apache.mahout.math.algorithms.common.distance.{DistanceMetric, DistanceMetricSelector} +import org.apache.mahout.math._ +import org.apache.mahout.math.drm._ +import org.apache.mahout.math.drm.RLikeDrmOps._ +import org.apache.mahout.math.function.VectorFunction +import org.apache.mahout.math.scalabindings._ +import org.apache.mahout.math.scalabindings.RLikeOps._ +import org.apache.mahout.math.{Matrix, Vector} + + +class CanopyClusteringModel(canopies: Matrix, dm: Symbol) extends ClusteringModel { + + val canopyCenters = canopies + val distanceMetric = dm + + def cluster[K](input: DrmLike[K]): DrmLike[K] = { + + implicit val ctx = input.context + implicit val ktag = input.keyClassTag + + val bcCanopies = drmBroadcast(canopyCenters) + val bcDM = drmBroadcast(dvec(DistanceMetricSelector.namedMetricLookup(distanceMetric))) + + input.mapBlock(1) { + case (keys, block: Matrix) => { + val outputMatrix = new DenseMatrix(block.nrow, 1) + + val localCanopies: Matrix = bcCanopies.value + for (i <- 0 until block.nrow) { + val distanceMetric = DistanceMetricSelector.select(bcDM.value.get(0)) + + val cluster = (0 until localCanopies.nrow).foldLeft(-1, 9999999999999999.9)((l, r) => { + val dist = distanceMetric.distance(localCanopies(r, ::), block(i, ::)) + if ((dist) < l._2) { + (r, dist) + } + else { + l + } + })._1 + outputMatrix(i, ::) = dvec(cluster) + } + keys -> outputMatrix + } + } + } +} + + +class CanopyClustering extends ClusteringFitter { + + var t1: Double = _ // loose distance + var t2: Double = _ // tight distance + var t3: Double = _ + var t4: Double = _ + var distanceMeasure: Symbol = _ + + def setStandardHyperparameters(hyperparameters: Map[Symbol, Any] = Map('foo -> None)): Unit = { + t1 = hyperparameters.asInstanceOf[Map[Symbol, Double]].getOrElse('t1, 0.5) + t2 = hyperparameters.asInstanceOf[Map[Symbol, Double]].getOrElse('t2, 0.1) + t3 = hyperparameters.asInstanceOf[Map[Symbol, Double]].getOrElse('t3, t1) + t4 = hyperparameters.asInstanceOf[Map[Symbol, Double]].getOrElse('t4, t2) + + distanceMeasure = hyperparameters.asInstanceOf[Map[Symbol, Symbol]].getOrElse('distanceMeasure, 'Cosine) + + } + + def fit[K](input: DrmLike[K], + hyperparameters: (Symbol, Any)*): CanopyClusteringModel = { + + setStandardHyperparameters(hyperparameters.toMap) + implicit val ctx = input.context + implicit val ktag = input.keyClassTag + + val dmNumber = DistanceMetricSelector.namedMetricLookup(distanceMeasure) + + val distanceBC = drmBroadcast(dvec(t1,t2,t3,t4, dmNumber)) + val canopies = input.allreduceBlock( + { + + // Assign All Points to Clusters + case (keys, block: Matrix) => { + val t1_local = distanceBC.value.get(0) + val t2_local = distanceBC.value.get(1) + val dm = distanceBC.value.get(4) + CanopyFn.findCenters(block, DistanceMetricSelector.select(dm), t1_local, t2_local) + } + }, { + // Optionally Merge Clusters that are close enough + case (oldM: Matrix, newM: Matrix) => { + val t3_local = distanceBC.value.get(2) + val t4_local = distanceBC.value.get(3) + val dm = distanceBC.value.get(4) + CanopyFn.findCenters(oldM, DistanceMetricSelector.select(dm), t3_local, t4_local) + } + }) + + val model = new CanopyClusteringModel(canopies, distanceMeasure) + model.summary = s"""CanopyClusteringModel\n${canopies.nrow} Clusters\n${distanceMeasure} distance metric used for calculating distances\nCanopy centers stored in model.canopies where row n coresponds to canopy n""" + model + } + + +} + +object CanopyFn extends Serializable { + def findCenters(block: Matrix, distanceMeasure: DistanceMetric, t1: Double, t2: Double): Matrix = { + val block = dense((1.0, 1.2, 1.3, 1.4), (1.1, 1.5, 2.5, 1.0), (6.0, 5.2, -5.2, 5.3), (7.0,6.0, 5.0, 5.0), (10.0, 1.0, 20.0, -10.0)) + var rowAssignedToCanopy = Array.fill(block.nrow) { false } + val clusterBuf = scala.collection.mutable.ListBuffer.empty[org.apache.mahout.math.Vector] + while (rowAssignedToCanopy.contains(false)) { + val rowIndexOfNextUncanopiedVector = rowAssignedToCanopy.indexOf(false) + clusterBuf += block(rowIndexOfNextUncanopiedVector, ::).cloned + block(rowIndexOfNextUncanopiedVector, ::) = svec(Nil, cardinality = block.ncol) + rowAssignedToCanopy(rowIndexOfNextUncanopiedVector) = true + for (i <- 0 until block.nrow) { + if (block(i, ::).getNumNonZeroElements > 0) { // + distanceMeasure.distance(block(i, ::), clusterBuf.last) match { + case d if d < t2 => { + + rowAssignedToCanopy(i) = true + block(i, ::) = svec(Nil, cardinality = block.ncol) + } + case d if d < t1 => { + + rowAssignedToCanopy(i) = true + } + case d => {} + } + } + } + } + dense(clusterBuf) + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/math-scala/src/main/scala/org/apache/mahout/math/algorithms/clustering/ClusteringModel.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/clustering/ClusteringModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/clustering/ClusteringModel.scala new file mode 100644 index 0000000..8ab1170 --- /dev/null +++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/clustering/ClusteringModel.scala @@ -0,0 +1,45 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.mahout.math.algorithms.clustering + +import org.apache.mahout.math.algorithms.{UnsupervisedFitter, UnsupervisedModel} +import org.apache.mahout.math.drm.DrmLike + +trait ClusteringModel extends UnsupervisedModel { + + def cluster[K](input: DrmLike[K]): DrmLike[K] + +} + +trait ClusteringFitter extends UnsupervisedFitter { + + def fit[K](input: DrmLike[K], + hyperparameters: (Symbol, Any)*): ClusteringModel + + def fitCluster[K](input: DrmLike[K], + hyperparameters: (Symbol, Any)*): DrmLike[K] = { + model = this.fit(input, hyperparameters:_*) + model.cluster(input) + + } + + // used to store the model if `fitTransform` method called + var model: ClusteringModel = _ +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/math-scala/src/main/scala/org/apache/mahout/math/algorithms/common/distance/DistanceMetrics.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/common/distance/DistanceMetrics.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/common/distance/DistanceMetrics.scala new file mode 100644 index 0000000..00495fd --- /dev/null +++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/common/distance/DistanceMetrics.scala @@ -0,0 +1,48 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.mahout.math.algorithms.common.distance + +import org.apache.mahout.math.function.Functions +import org.apache.mahout.math.{CardinalityException, Vector} + +trait DistanceMetric extends Serializable { + def distance(v1: Vector, v2: Vector): Double +} + + +object DistanceMetricSelector extends Serializable{ + + val namedMetricLookup = Map('Chebyshev -> 1.0, 'Cosine -> 2.0) + + def select(dm: Double): DistanceMetric = { + dm match { + case 1.0 => Chebyshev + case 2.0 => Cosine + } + } +} + +object Chebyshev extends DistanceMetric { + def distance(v1: Vector, v2: Vector): Double = { + if (v1.size != v2.size) throw new CardinalityException(v1.size, v2.size) + v1.aggregate(v2, Functions.MAX_ABS, Functions.MINUS) + } +} + +object Cosine extends DistanceMetric { + def distance(v1: Vector, v2: Vector): Double = 1.0 - v1.dot(v2) / (Math.sqrt(v1.getLengthSquared) * Math.sqrt(v2.getLengthSquared)) +} http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/math-scala/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuiteBase.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuiteBase.scala new file mode 100644 index 0000000..70fb10f --- /dev/null +++ b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuiteBase.scala @@ -0,0 +1,48 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.mahout.math.algorithms + +import org.apache.mahout.math.algorithms.preprocessing._ +import org.apache.mahout.math.drm.drmParallelize +import org.apache.mahout.math.scalabindings.{dense, sparse, svec} +import org.apache.mahout.math.scalabindings.RLikeOps._ +import org.apache.mahout.test.DistributedMahoutSuite +import org.scalatest.{FunSuite, Matchers} + +import org.apache.mahout.test.DistributedMahoutSuite + +trait ClusteringSuiteBase extends DistributedMahoutSuite with Matchers { + + this: FunSuite => + + test("canopy test") { + val drmA = drmParallelize(dense((1.0, 1.2, 1.3, 1.4), (1.1, 1.5, 2.5, 1.0), (6.0, 5.2, -5.2, 5.3), (7.0,6.0, 5.0, 5.0), (10.0, 1.0, 20.0, -10.0))) + + import org.apache.mahout.math.algorithms.clustering.CanopyClustering + + val model = new CanopyClustering().fit(drmA, 't1 -> 6.5, 't2 -> 5.5, 'distanceMeasure -> 'Chebyshev) + val myAnswer = model.cluster(drmA).collect + + val correctAnswer = dense((0.0), (0.0), (1.0), (0.0), (2.0)) + + val epsilon = 1E-6 + (myAnswer.norm - correctAnswer.norm) should be <= epsilon + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/spark/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuite.scala ---------------------------------------------------------------------- diff --git a/spark/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuite.scala b/spark/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuite.scala new file mode 100644 index 0000000..3e5a06e --- /dev/null +++ b/spark/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuite.scala @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.math.algorithms + +import org.apache.mahout.sparkbindings.test.DistributedSparkSuite +import org.scalatest.FunSuite + +class ClusteringSuite extends FunSuite + with DistributedSparkSuite with ClusteringSuiteBase + http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/website/docs/_includes/algo_navbar.html ---------------------------------------------------------------------- diff --git a/website/docs/_includes/algo_navbar.html b/website/docs/_includes/algo_navbar.html index 40a2861..38f833a 100644 --- a/website/docs/_includes/algo_navbar.html +++ b/website/docs/_includes/algo_navbar.html @@ -9,6 +9,14 @@ <li> <a href="{{ BASE_PATH }}/algorithms/linear-algebra/d-ssvd.html">Distributed Stochastic Singular Value Decomposition</a></li> </ul> </div> + <a href="#clustering" class="list-group-item list-group-item-success" data-toggle="collapse" data-parent="#AlgoMenu"><b>Clustering</b><i class="fa fa-caret-down"></i></a> + <div class="collapse" id="clustering"> + <ul class="nav sidebar-nav"> + <li> <a href="{{ BASE_PATH }}/algorithms/clustering">Clustering Algorithms</a></li> + <li> <a href="{{ BASE_PATH }}/algorithms/clustering/distance-metrics.html">Distance Metrics</a></li> + <li> <a href="{{ BASE_PATH }}/algorithms/clustering/canopy">Canopy Clustering</a></li> + </ul> + </div> <a href="#preprocessors" class="list-group-item list-group-item-success" data-toggle="collapse" data-parent="#AlgoMenu"><b>Preprocessors</b><i class="fa fa-caret-down"></i></a> <div class="collapse" id="preprocessors"> <ul class="nav sidebar-nav"> @@ -64,8 +72,8 @@ <li> <a href="{{ BASE_PATH }}/algorithms/map-reduce/classification/support-vector-machines.html">Support Vector Machines</a></li> </ul> </div> - <a href="#clustering" class="list-group-item list-group-item-success" data-toggle="collapse" data-parent="#AlgoMenu"><b>Clustering</b><i class="fa fa-caret-down"></i></a> - <div class="collapse" id="clustering"> + <a href="#mr-clustering" class="list-group-item list-group-item-success" data-toggle="collapse" data-parent="#AlgoMenu"><b>Clustering</b><i class="fa fa-caret-down"></i></a> + <div class="collapse" id="mr-clustering"> <ul class="nav sidebar-nav"> <li> <a href="{{ BASE_PATH }}/algorithms/map-reduce/clustering/canopy-clustering.html">Canopy Clustering</a></li> <li> <a href="{{ BASE_PATH }}/algorithms/map-reduce/clustering/cluster-dumper.html">Cluster Dumper</a></li> http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/website/docs/algorithms/clustering/canopy/Canopy.png ---------------------------------------------------------------------- diff --git a/website/docs/algorithms/clustering/canopy/Canopy.png b/website/docs/algorithms/clustering/canopy/Canopy.png new file mode 100644 index 0000000..934efd7 Binary files /dev/null and b/website/docs/algorithms/clustering/canopy/Canopy.png differ http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/website/docs/algorithms/clustering/canopy/Canopy10.png ---------------------------------------------------------------------- diff --git a/website/docs/algorithms/clustering/canopy/Canopy10.png b/website/docs/algorithms/clustering/canopy/Canopy10.png new file mode 100644 index 0000000..4bb291c Binary files /dev/null and b/website/docs/algorithms/clustering/canopy/Canopy10.png differ http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/website/docs/algorithms/clustering/canopy/SampleData.png ---------------------------------------------------------------------- diff --git a/website/docs/algorithms/clustering/canopy/SampleData.png b/website/docs/algorithms/clustering/canopy/SampleData.png new file mode 100644 index 0000000..48c4690 Binary files /dev/null and b/website/docs/algorithms/clustering/canopy/SampleData.png differ http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/website/docs/algorithms/clustering/canopy/index.md ---------------------------------------------------------------------- diff --git a/website/docs/algorithms/clustering/canopy/index.md b/website/docs/algorithms/clustering/canopy/index.md new file mode 100644 index 0000000..923c63a --- /dev/null +++ b/website/docs/algorithms/clustering/canopy/index.md @@ -0,0 +1,128 @@ +--- +layout: algorithm +title: Canopy Clustering +theme: + name: retro-mahout +--- + +### About + +[Canopy Clustering](http://www.kamalnigam.com/papers/canopy-kdd00.pdf) + is a very simple, fast and surprisingly accurate method for grouping +objects into clusters. All objects are represented as a point in a +multidimensional feature space. The algorithm uses a fast approximate +distance metric and two distance thresholds T1 > T2 for processing. The +basic algorithm is to begin with a set of points and remove one at random. +Create a Canopy containing this point and iterate through the remainder of +the point set. At each point, if its distance from the first point is < T1, +then add the point to the cluster. If, in addition, the distance is < T2, +then remove the point from the set. This way points that are very close to +the original will avoid all further processing. The algorithm loops until +the initial set is empty, accumulating a set of Canopies, each containing +one or more points. A given point may occur in more than one Canopy. + +Canopy Clustering is often used as an initial step in more rigorous +clustering techniques, such as [K-Means Clustering](k-means-clustering.html) +. By starting with an initial clustering the number of more expensive +distance measurements can be significantly reduced by ignoring points +outside of the initial canopies. + +#### Strategy for parallelization + +Looking at the sample Hadoop implementation in [http://code.google.com/p/canopy-clustering/](http://code.google.com/p/canopy-clustering/) + the processing is done in 3 steps: +1. The data is massaged into suitable input format +1. Each mapper performs canopy clustering on the points in its input set and +outputs its canopies' centers +1. The reducer clusters the canopy centers to produce the final canopy +centers + +The points are then clustered into these final canopies when the `model.cluster(inputDRM)` is called. + +Some ideas can be found in [Cluster computing and MapReduce](https://www.youtube.com/watch?v=yjPBkvYh-ss&list=PLEFAB97242917704A) + lecture video series \[by Google(r)\]; Canopy Clustering is discussed in [lecture #4](https://www.youtube.com/watch?v=1ZDybXl212Q) +. Finally here is the [Wikipedia page](http://en.wikipedia.org/wiki/Canopy_clustering_algorithm) +. + +#### Illustrations + +The following images illustrate Canopy clustering applied to a set of +randomly-generated 2-d data points. The points are generated using a normal +distribution centered at a mean location and with a constant standard +deviation. See the README file in the [/examples/src/main/java/org/apache/mahout/clustering/display/README.txt](https://github.com/apache/mahout/blob/master/examples/src/main/java/org/apache/mahout/clustering/display/README.txt) + for details on running similar examples. + +The points are generated as follows: + +* 500 samples m=\[1.0, 1.0\](1.0,-1.0\.html) + sd=3.0 +* 300 samples m=\[1.0, 0.0\](1.0,-0.0\.html) + sd=0.5 +* 300 samples m=\[0.0, 2.0\](0.0,-2.0\.html) + sd=0.1 + +In the first image, the points are plotted and the 3-sigma boundaries of +their generator are superimposed. + + + +In the second image, the resulting canopies are shown superimposed upon the +sample data. Each canopy is represented by two circles, with radius T1 and +radius T2. + + + +The third image uses the same values of T1 and T2 but only superimposes +canopies covering more than 10% of the population. This is a bit better +representation of the data but it still has lots of room for improvement. +The advantage of Canopy clustering is that it is single-pass and fast +enough to iterate runs using different T1, T2 parameters and display +thresholds. + + + +### Parameters + +<div class="table-striped"> + <table class="table"> + <tr> + <th>Parameter</th> + <th>Description</th> + <th>Default Value</th> + </tr> + <tr> + <td><code>'distanceMeasure</code></td> + <td>The metric used for calculating distance, see <a href="../distance-metrics.html">Distance Metrics</a></td> + <td><code>'Cosine</code></td> + </tr> + <tr> + <td><code>'t1</code></td> + <td>The "loose" distance in the mapping phase</code></td> + <td>0.5</td> + </tr> + <tr> + <td><code>'t2</code></td> + <td>The "tight" distance in the mapping phase</code></td> + <td>0.1</td> + </tr> + <tr> + <td><code>'t3</code></td> + <td>The "loose" distance in the reducing phase</code></td> + <td><code>'t1</code></td> + </tr> + <tr> + <td><code>'t4</code></td> + <td>The "tight" distance in the reducing phase</code></td> + <td><code>'t2</code></td> + </tr> + </table> +</div> + +### Example + + val drmA = drmParallelize(dense((1.0, 1.2, 1.3, 1.4), (1.1, 1.5, 2.5, 1.0), (6.0, 5.2, -5.2, 5.3), (7.0,6.0, 5.0, 5.0), (10.0, 1.0, 20.0, -10.0))) + + import org.apache.mahout.math.algorithms.clustering.CanopyClustering + + val model = new CanopyClustering().fit(drmA, 't1 -> 6.5, 't2 -> 5.5, 'distanceMeasure -> 'Chebyshev) + model.cluster(drmA).collect http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/website/docs/algorithms/clustering/distance-metrics.md ---------------------------------------------------------------------- diff --git a/website/docs/algorithms/clustering/distance-metrics.md b/website/docs/algorithms/clustering/distance-metrics.md new file mode 100644 index 0000000..5b25a93 --- /dev/null +++ b/website/docs/algorithms/clustering/distance-metrics.md @@ -0,0 +1,82 @@ +--- +layout: algorithm +title: Distance Metrics +theme: + name: retro-mahout +--- + +### Distance Metrics Supported By Mahout + +<div class="table-striped"> + <table class="table"> + <tr> + <th>Name</th> + <th>Object</th> + <th>Symbol</th> + </tr> + <tr> + <td><a href="https://en.wikipedia.org/wiki/Chebyshev_distance">Chebyshev Distance</a></td> + <td><code>org.apache.mahout.math.algorithms.common.distance.Chebyshev</code></td> + <td><code>'Chebyshev</code></td> + </tr> + <tr> + <td><a href="https://en.wikipedia.org/wiki/Cosine_similarity">Cosine Similarity</a></td> + <td><code>org.apache.mahout.math.algorithms.common.distance.Cosine</code></td> + <td><code>'Cosine</code></td> + </tr> + </table> +</div> + + +<!-- +A beginner JIRA to port the rest of these +[Euclidean](https://en.wikipedia.org/wiki/Euclidean_distance) + +[Mahalanobis](https://en.wikipedia.org/wiki/Mahalanobis_distance) + +[Manhattan](https://en.wiktionary.org/wiki/Manhattan_distance) + +[Minkowski](https://en.wikipedia.org/wiki/Minkowski_distance) + +[Squared Euclidian](https://en.wikipedia.org/wiki/Euclidean_distance#Squared_Euclidean_distance) + +[Tanimoto](https://en.wikipedia.org/wiki/Jaccard_index#Tanimoto_similarity_and_distance) + +Weighted Euclidean + +Weighted Manhattan--> + +### Using Distance Metrics + +In Mahout one can access the distant metrics directly to measure the distance between two arbitrary vectors, or +can specify which distance metric to use as part of an algorithm. In the latter case the distance metric is called +by `Symbol`, we never pass Distance metrics directly to an algorithm. This design choice, in part has to do with +serialization of object and keeping the engine bindings as simple as possible. Behind the scenes, the only thing +that is serialized and sent to the workers is a number which specifies what distant metric to use- this is much more +abstract and easier to maintain on the back end than making sure each function can be serialized by any arbitrary engine. +We feel from the user perspective, it may seem quirky but causes no decrease in usability. If a user wishes to use a +custom distance metric- simply add it to [math-scala/src/main/org/apache/mahout/math/common/DistanceMetrics.scala](https://github.com/apache/mahout/blob/master/math-scala/src/main/scala/org/apache/mahout/math/algorithms/common/DistanceMetrics.scala) +and recompile. + +### Examples + +**Meausring the distance between two vectors** + + import org.apache.mahout.math.algorithms.common.distance._ + + val v1 = dvec(1.0, 1.5, -1.2, 3.5) + val v2 = dvec(0.1, -1.4, 10.5, 3.2) + + Cosine.distance(v1, v2) + +**Using distance in clustering** + + import org.apache.mahout.math.algorithms.clustering.CanopyClustering + + val drmA = drmParallelize(dense((1.0, 1.2, 1.3, 1.4), + (1.1, 1.5, 2.5, 1.0), + (6.0, 5.2, -5.2, 5.3), + (7.0,6.0, 5.0, 5.0), + (10.0, 1.0, 20.0, -10.0))) + + val model = new CanopyClustering().fit(drmA, 'distanceMeasure -> 'Cosine) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/website/docs/algorithms/clustering/index.md ---------------------------------------------------------------------- diff --git a/website/docs/algorithms/clustering/index.md b/website/docs/algorithms/clustering/index.md new file mode 100644 index 0000000..50030c2 --- /dev/null +++ b/website/docs/algorithms/clustering/index.md @@ -0,0 +1,6 @@ +--- +layout: algorithm +title: Clustering Algorithms +theme: + name: retro-mahout +--- http://git-wip-us.apache.org/repos/asf/mahout/blob/c29496cb/website/docs/algorithms/map-reduce/clustering/canopy-clustering.md ---------------------------------------------------------------------- diff --git a/website/docs/algorithms/map-reduce/clustering/canopy-clustering.md b/website/docs/algorithms/map-reduce/clustering/canopy-clustering.md index d538d88..ac8a47c 100644 --- a/website/docs/algorithms/map-reduce/clustering/canopy-clustering.md +++ b/website/docs/algorithms/map-reduce/clustering/canopy-clustering.md @@ -5,6 +5,9 @@ theme: name: retro-mahout --- + +**_This Document refers to the deprecated Map-Reduce Version of this algorithm, please see [the new implementation]({{ BASE_PATH }}/algorithms/clustering/canopy.html)_** + <a name="CanopyClustering-CanopyClustering"></a> # Canopy Clustering
