Github user WeichenXu123 commented on a diff in the pull request:

    https://github.com/apache/spark/pull/21119#discussion_r184346287
  
    --- Diff: python/pyspark/ml/clustering.py ---
    @@ -1156,6 +1156,201 @@ def getKeepLastCheckpoint(self):
             return self.getOrDefault(self.keepLastCheckpoint)
     
     
    +class _PowerIterationClusteringParams(JavaParams, HasMaxIter, HasPredictionCol):
    +    """
    +    Params for :py:attr:`PowerIterationClustering`.
    +    .. versionadded:: 2.4.0
    +    """
    +
    +    k = Param(Params._dummy(), "k",
    +              "The number of clusters to create. Must be > 1.",
    +              typeConverter=TypeConverters.toInt)
    +    initMode = Param(Params._dummy(), "initMode",
    +                     "The initialization algorithm. This can be either " +
    +                     "'random' to use a random vector as vertex 
properties, or 'degree' to use " +
    +                     "a normalized sum of similarities with other 
vertices.  Supported options: " +
    +                     "'random' and 'degree'.",
    +                     typeConverter=TypeConverters.toString)
    +    idCol = Param(Params._dummy(), "idCol",
    +                  "Name of the input column for vertex IDs.",
    +                  typeConverter=TypeConverters.toString)
    +    neighborsCol = Param(Params._dummy(), "neighborsCol",
    +                         "Name of the input column for neighbors in the 
adjacency list " +
    +                         "representation.",
    +                         typeConverter=TypeConverters.toString)
    +    similaritiesCol = Param(Params._dummy(), "similaritiesCol",
    +                            "Name of the input column for non-negative 
weights (similarities) " +
    +                            "of edges between the vertex in `idCol` and 
each neighbor in " +
    +                            "`neighborsCol`",
    +                            typeConverter=TypeConverters.toString)
    +
    +    @since("2.4.0")
    +    def getK(self):
    +        """
    +        Gets the value of `k`
    +        """
    +        return self.getOrDefault(self.k)
    +
    +    @since("2.4.0")
    +    def getInitMode(self):
    +        """
    +        Gets the value of `initMode`
    +        """
    +        return self.getOrDefault(self.initMode)
    +
    +    @since("2.4.0")
    +    def getIdCol(self):
    +        """
    +        Gets the value of `idCol`
    +        """
    +        return self.getOrDefault(self.idCol)
    +
    +    @since("2.4.0")
    +    def getNeighborsCol(self):
    +        """
    +        Gets the value of `neighborsCol`
    +        """
    +        return self.getOrDefault(self.neighborsCol)
    +
    +    @since("2.4.0")
    +    def getSimilaritiesCol(self):
    +        """
    +        Gets the value of `similaritiesCol`
    +        """
    +        return self.getOrDefault(self.similaritiesCol)
    +
    +
    +@inherit_doc
    +class PowerIterationClustering(JavaTransformer, _PowerIterationClusteringParams,
    +                               JavaMLReadable, JavaMLWritable):
    +    """
    +    Power Iteration Clustering (PIC), a scalable graph clustering algorithm that finds a
    +    low-dimensional embedding of a dataset using truncated power iteration on a normalized
    +    pairwise similarity matrix of the data, and then clusters that embedding.
    +
    +    >>> from pyspark.sql.types import ArrayType, DoubleType, LongType, StructField, StructType
    +    >>> import math
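    +    >>> # n points evenly spaced on a circle of radius r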
    +    >>> def genCircle(r, n):
    +    ...     points = []
    +    ...     for i in range(0, n):
    +    ...         theta = 2.0 * math.pi * i / n
    +    ...         points.append((r * math.cos(theta), r * math.sin(theta)))
    +    ...     return points
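    +    >>> # Gaussian (RBF) similarity between two 2-D points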
    +    >>> def sim(x, y):
    +    ...     dist = (x[0] - y[0]) * (x[0] - y[0]) + (x[1] - y[1]) * (x[1] - y[1])
    +    ...     return math.exp(-dist / 2.0)
    +    >>> r1 = 1.0
    +    >>> n1 = 10
    +    >>> r2 = 4.0
    +    >>> n2 = 40
    +    >>> n = n1 + n2
    +    >>> points = genCircle(r1, n1) + genCircle(r2, n2)
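    +    >>> # Build an adjacency list: vertex i lists every vertex j < i as a
    +    >>> # neighbor, weighted by the similarity of their coordinates.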
    +    >>> similarities = []
    +    >>> for i in range(1, n):
    +    ...     neighbor = []
    +    ...     weight = []
    +    ...     for j in range(i):
    +    ...         neighbor.append(long(j))
    +    ...         weight.append(sim(points[i], points[j]))
    +    ...     similarities.append([long(i), neighbor, weight])
    --- End diff ---
    
    The doctest code looks too long; it would be more appropriate to put it in the
    examples. Could you replace the data generation code here with a simple hardcoded
    dataset?
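    
    For illustration, a minimal hardcoded doctest might look like the sketch below. It is
    only a sketch under assumptions: the constructor keyword arguments follow the usual
    PySpark pattern for the params in this diff (`k`, `maxIter`, `idCol`, `neighborsCol`,
    `similaritiesCol`), a `spark` session is in scope as in other doctests, and the ids,
    neighbors, and similarity values are made up:
    
        >>> data = [(1, [0], [0.5]),
        ...         (2, [0, 1], [0.7, 0.5]),
        ...         (3, [0, 1, 2], [0.9, 0.7, 0.5])]
        >>> df = spark.createDataFrame(data, ["id", "neighbors", "similarities"])
        >>> pic = PowerIterationClustering(k=2, maxIter=10, idCol="id",
        ...                                neighborsCol="neighbors",
        ...                                similaritiesCol="similarities")
        >>> assignments = pic.transform(df)
        >>> assignments.select("id", "prediction").count()
        3
    
    Something in this spirit keeps the doctest short and deterministic; the circle
    generation code could then move to a full example under `examples/src/main/python/ml/`.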

