Github user huaxingao commented on a diff in the pull request: https://github.com/apache/spark/pull/21119#discussion_r184809072 --- Diff: python/pyspark/ml/clustering.py --- @@ -1156,6 +1156,201 @@ def getKeepLastCheckpoint(self): return self.getOrDefault(self.keepLastCheckpoint) +class _PowerIterationClusteringParams(JavaParams, HasMaxIter, HasPredictionCol): + """ + Params for :py:attr:`PowerIterationClustering`. + .. versionadded:: 2.4.0 + """ + + k = Param(Params._dummy(), "k", + "The number of clusters to create. Must be > 1.", + typeConverter=TypeConverters.toInt) + initMode = Param(Params._dummy(), "initMode", + "The initialization algorithm. This can be either " + + "'random' to use a random vector as vertex properties, or 'degree' to use " + + "a normalized sum of similarities with other vertices. Supported options: " + + "'random' and 'degree'.", + typeConverter=TypeConverters.toString) + idCol = Param(Params._dummy(), "idCol", + "Name of the input column for vertex IDs.", + typeConverter=TypeConverters.toString) + neighborsCol = Param(Params._dummy(), "neighborsCol", + "Name of the input column for neighbors in the adjacency list " + + "representation.", + typeConverter=TypeConverters.toString) + similaritiesCol = Param(Params._dummy(), "similaritiesCol", + "Name of the input column for non-negative weights (similarities) " + + "of edges between the vertex in `idCol` and each neighbor in " + + "`neighborsCol`", + typeConverter=TypeConverters.toString) + + @since("2.4.0") + def getK(self): + """ + Gets the value of `k` + """ + return self.getOrDefault(self.k) + + @since("2.4.0") + def getInitMode(self): + """ + Gets the value of `initMode` + """ + return self.getOrDefault(self.initMode) + + @since("2.4.0") + def getIdCol(self): + """ + Gets the value of `idCol` + """ + return self.getOrDefault(self.idCol) + + @since("2.4.0") + def getNeighborsCol(self): + """ + Gets the value of `neighborsCol` + """ + return self.getOrDefault(self.neighborsCol) + + 
@since("2.4.0") + def getSimilaritiesCol(self): + """ + Gets the value of `similaritiesCol` + """ + return self.getOrDefault(self.similaritiesCol) + + +@inherit_doc +class PowerIterationClustering(JavaTransformer, _PowerIterationClusteringParams, JavaMLReadable, + JavaMLWritable): + """ + Model produced by [[PowerIterationClustering]]. + >>> from pyspark.sql.types import ArrayType, DoubleType, LongType, StructField, StructType + >>> import math + >>> def genCircle(r, n): + ... points = [] + ... for i in range(0, n): + ... theta = 2.0 * math.pi * i / n + ... points.append((r * math.cos(theta), r * math.sin(theta))) + ... return points + >>> def sim(x, y): + ... dist = (x[0] - y[0]) * (x[0] - y[0]) + (x[1] - y[1]) * (x[1] - y[1]) + ... return math.exp(-dist / 2.0) + >>> r1 = 1.0 + >>> n1 = 10 + >>> r2 = 4.0 + >>> n2 = 40 + >>> n = n1 + n2 + >>> points = genCircle(r1, n1) + genCircle(r2, n2) + >>> similarities = [] + >>> for i in range (1, n): + ... neighbor = [] + ... weight = [] + ... for j in range (i): + ... neighbor.append((long)(j)) + ... weight.append(sim(points[i], points[j])) + ... similarities.append([(long)(i), neighbor, weight]) --- End diff -- @WeichenXu123 I will move this to tests, and add a simple example in the doctest.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org