Github user WeichenXu123 commented on a diff in the pull request:
https://github.com/apache/spark/pull/21513#discussion_r194215008
--- Diff: python/pyspark/ml/clustering.py ---
@@ -1156,6 +1157,204 @@ def getKeepLastCheckpoint(self):
return self.getOrDefault(self.keepLastCheckpoint)
+@inherit_doc
+class PowerIterationClustering(HasMaxIter, HasWeightCol, JavaParams,
JavaMLReadable,
+ JavaMLWritable):
+ """
+ .. note:: Experimental
+
+ Power Iteration Clustering (PIC), a scalable graph clustering
algorithm developed by
+ <a href=http://www.icml2010.org/papers/387.pdf>Lin and Cohen</a>. From
the abstract:
+ PIC finds a very low-dimensional embedding of a dataset using
truncated power
+ iteration on a normalized pair-wise similarity matrix of the data.
+
+ This class is not yet an Estimator/Transformer, use `assignClusters`
method to run the
+ PowerIterationClustering algorithm.
+
+ .. seealso:: `Wikipedia on Spectral clustering \
+ <http://en.wikipedia.org/wiki/Spectral_clustering>`_
+
+ >>> from pyspark.sql.types import DoubleType, LongType, StructField,
StructType
+ >>> import math
+ >>> def genCircle(r, n):
+ ... points = []
+ ... for i in range(0, n):
+ ... theta = 2.0 * math.pi * i / n
+ ... points.append((r * math.cos(theta), r * math.sin(theta)))
+ ... return points
+ >>> def sim(x, y):
+ ... dist = (x[0] - y[0]) * (x[0] - y[0]) + (x[1] - y[1]) * (x[1] -
y[1])
+ ... return math.exp(-dist / 2.0)
+ >>> r1 = 1.0
+ >>> n1 = 10
+ >>> r2 = 4.0
+ >>> n2 = 40
+ >>> n = n1 + n2
+ >>> points = genCircle(r1, n1) + genCircle(r2, n2)
+ >>> data = [(i, j, sim(points[i], points[j])) for i in range(1, n) for
j in range(0, i)]
+ >>> rdd = sc.parallelize(data, 2)
+ >>> schema = StructType([StructField("src", LongType(), False), \
+ StructField("dst", LongType(), True), \
+ StructField("weight", DoubleType(), True)])
+ >>> df = spark.createDataFrame(rdd, schema)
+ >>> pic = PowerIterationClustering()
+ >>> assignments =
pic.setK(2).setMaxIter(40).setWeightCol("weight").assignClusters(df)
+ >>> result = sorted(assignments.collect(), key=lambda x: x.id)
+ >>> result[0].cluster == result[1].cluster == result[2].cluster ==
result[3].cluster
+ True
+ >>> result[4].cluster == result[5].cluster == result[6].cluster ==
result[7].cluster
+ True
+ >>> pic_path = temp_path + "/pic"
+ >>> pic.save(pic_path)
+ >>> pic2 = PowerIterationClustering.load(pic_path)
+ >>> pic2.getK()
+ 2
+ >>> pic2.getMaxIter()
+ 40
+ >>> assignments2 = pic2.assignClusters(df)
+ >>> result2 = sorted(assignments2.collect(), key=lambda x: x.id)
+ >>> result2[0].cluster == result2[1].cluster == result2[2].cluster ==
result2[3].cluster
+ True
+ >>> result2[4].cluster == result2[5].cluster == result2[6].cluster ==
result2[7].cluster
+ True
--- End diff --
Let's use a simpler way to check the result, for example:
```
>>> assignments.sort(assignments.id).show(truncate=False)
...
```
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]