[spark] branch branch-3.3 updated: [SPARK-37402][PYTHON][MLLIB] Inline typehints for pyspark.mllib.clustering

zero323 Mon, 11 Apr 2022 02:30:45 -0700

This is an automated email from the ASF dual-hosted git repository.

zero323 pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/branch-3.3 by this push:
     new f4457e6310f [SPARK-37402][PYTHON][MLLIB] Inline typehints for 
pyspark.mllib.clustering
f4457e6310f is described below

commit f4457e6310f6bd900d7634b279606436a6faf8fb
Author: zero323 <[email protected]>
AuthorDate: Mon Apr 11 11:29:47 2022 +0200

    [SPARK-37402][PYTHON][MLLIB] Inline typehints for pyspark.mllib.clustering
    
    ### What changes were proposed in this pull request?
    
    This PR migrates type `pyspark.mllib.clustering` annotations from stub file 
to inline type hints.
    
    ### Why are the changes needed?
    
    Part of ongoing migration of type hints.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Existing tests.
    
    Closes #35578 from zero323/SPARK-37402.
    
    Authored-by: zero323 <[email protected]>
    Signed-off-by: zero323 <[email protected]>
    (cherry picked from commit e71cf3907b9ff2036dfe45bc8fe939f20cca741b)
    Signed-off-by: zero323 <[email protected]>
---
 python/pyspark/mllib/clustering.py  | 247 ++++++++++++++++++++++++------------
 python/pyspark/mllib/clustering.pyi | 188 ---------------------------
 2 files changed, 163 insertions(+), 272 deletions(-)

diff --git a/python/pyspark/mllib/clustering.py 
b/python/pyspark/mllib/clustering.py
index a9e4fd82089..fd33887fd9e 100644
--- a/python/pyspark/mllib/clustering.py
+++ b/python/pyspark/mllib/clustering.py
@@ -19,7 +19,9 @@ import sys
 import array as pyarray
 from math import exp, log
 from collections import namedtuple
+from typing import Any, List, Optional, Tuple, TypeVar, Union, overload, 
TYPE_CHECKING
 
+import numpy as np
 from numpy import array, random, tile
 
 from pyspark import SparkContext, since
@@ -30,6 +32,12 @@ from pyspark.mllib.stat.distribution import 
MultivariateGaussian
 from pyspark.mllib.util import Saveable, Loader, inherit_doc, JavaLoader, 
JavaSaveable
 from pyspark.streaming import DStream
 
+if TYPE_CHECKING:
+    from py4j.java_gateway import JavaObject
+    from pyspark.mllib._typing import VectorLike
+
+T = TypeVar("T")
+
 __all__ = [
     "BisectingKMeansModel",
     "BisectingKMeans",
@@ -67,24 +75,32 @@ class BisectingKMeansModel(JavaModelWrapper):
     0.0
     """
 
-    def __init__(self, java_model):
+    def __init__(self, java_model: "JavaObject"):
         super(BisectingKMeansModel, self).__init__(java_model)
         self.centers = [c.toArray() for c in self.call("clusterCenters")]
 
-    @property
+    @property  # type: ignore[misc]
     @since("2.0.0")
-    def clusterCenters(self):
+    def clusterCenters(self) -> List[np.ndarray]:
         """Get the cluster centers, represented as a list of NumPy
         arrays."""
         return self.centers
 
-    @property
+    @property  # type: ignore[misc]
     @since("2.0.0")
-    def k(self):
+    def k(self) -> int:
         """Get the number of clusters"""
         return self.call("k")
 
-    def predict(self, x):
+    @overload
+    def predict(self, x: "VectorLike") -> int:
+        ...
+
+    @overload
+    def predict(self, x: RDD["VectorLike"]) -> RDD[int]:
+        ...
+
+    def predict(self, x: Union["VectorLike", RDD["VectorLike"]]) -> Union[int, 
RDD[int]]:
         """
         Find the cluster that each of the points belongs to in this
         model.
@@ -111,7 +127,7 @@ class BisectingKMeansModel(JavaModelWrapper):
         x = _convert_to_vector(x)
         return self.call("predict", x)
 
-    def computeCost(self, x):
+    def computeCost(self, x: Union["VectorLike", RDD["VectorLike"]]) -> float:
         """
         Return the Bisecting K-means cost (sum of squared distances of
         points to their nearest center) for this model on the given
@@ -159,7 +175,14 @@ class BisectingKMeans:
     """
 
     @classmethod
-    def train(self, rdd, k=4, maxIterations=20, minDivisibleClusterSize=1.0, 
seed=-1888008604):
+    def train(
+        self,
+        rdd: RDD["VectorLike"],
+        k: int = 4,
+        maxIterations: int = 20,
+        minDivisibleClusterSize: float = 1.0,
+        seed: int = -1888008604,
+    ) -> BisectingKMeansModel:
         """
         Runs the bisecting k-means algorithm return the model.
 
@@ -197,7 +220,7 @@ class BisectingKMeans:
 
 
 @inherit_doc
-class KMeansModel(Saveable, Loader):
+class KMeansModel(Saveable, Loader["KMeansModel"]):
 
     """A clustering model derived from the k-means method.
 
@@ -255,22 +278,30 @@ class KMeansModel(Saveable, Loader):
     [array([-1000., -1000.]), array([ 5.,  5.]), array([ 1000.,  1000.])]
     """
 
-    def __init__(self, centers):
+    def __init__(self, centers: List["VectorLike"]):
         self.centers = centers
 
-    @property
+    @property  # type: ignore[misc]
     @since("1.0.0")
-    def clusterCenters(self):
+    def clusterCenters(self) -> List["VectorLike"]:
         """Get the cluster centers, represented as a list of NumPy arrays."""
         return self.centers
 
-    @property
+    @property  # type: ignore[misc]
     @since("1.4.0")
-    def k(self):
+    def k(self) -> int:
         """Total number of clusters."""
         return len(self.centers)
 
-    def predict(self, x):
+    @overload
+    def predict(self, x: "VectorLike") -> int:
+        ...
+
+    @overload
+    def predict(self, x: RDD["VectorLike"]) -> RDD[int]:
+        ...
+
+    def predict(self, x: Union["VectorLike", RDD["VectorLike"]]) -> Union[int, 
RDD[int]]:
         """
         Find the cluster that each of the points belongs to in this
         model.
@@ -297,13 +328,13 @@ class KMeansModel(Saveable, Loader):
 
         x = _convert_to_vector(x)
         for i in range(len(self.centers)):
-            distance = x.squared_distance(self.centers[i])
+            distance = x.squared_distance(self.centers[i])  # type: 
ignore[attr-defined]
             if distance < best_distance:
                 best = i
                 best_distance = distance
         return best
 
-    def computeCost(self, rdd):
+    def computeCost(self, rdd: RDD["VectorLike"]) -> float:
         """
         Return the K-means cost (sum of squared distances of points to
         their nearest center) for this model on the given
@@ -324,20 +355,24 @@ class KMeansModel(Saveable, Loader):
         return cost
 
     @since("1.4.0")
-    def save(self, sc, path):
+    def save(self, sc: SparkContext, path: str) -> None:
         """
         Save this model to the given path.
         """
+        assert sc._jvm is not None
+
         java_centers = _py2java(sc, [_convert_to_vector(c) for c in 
self.centers])
         java_model = 
sc._jvm.org.apache.spark.mllib.clustering.KMeansModel(java_centers)
         java_model.save(sc._jsc.sc(), path)
 
     @classmethod
     @since("1.4.0")
-    def load(cls, sc, path):
+    def load(cls, sc: SparkContext, path: str) -> "KMeansModel":
         """
         Load a model from the given path.
         """
+        assert sc._jvm is not None
+
         java_model = 
sc._jvm.org.apache.spark.mllib.clustering.KMeansModel.load(sc._jsc.sc(), path)
         return KMeansModel(_java2py(sc, java_model.clusterCenters()))
 
@@ -352,16 +387,16 @@ class KMeans:
     @classmethod
     def train(
         cls,
-        rdd,
-        k,
-        maxIterations=100,
-        initializationMode="k-means||",
-        seed=None,
-        initializationSteps=2,
-        epsilon=1e-4,
-        initialModel=None,
-        distanceMeasure="euclidean",
-    ):
+        rdd: RDD["VectorLike"],
+        k: int,
+        maxIterations: int = 100,
+        initializationMode: str = "k-means||",
+        seed: Optional[int] = None,
+        initializationSteps: int = 2,
+        epsilon: float = 1e-4,
+        initialModel: Optional[KMeansModel] = None,
+        distanceMeasure: str = "euclidean",
+    ) -> "KMeansModel":
         """
         Train a k-means clustering model.
 
@@ -428,7 +463,7 @@ class KMeans:
 
 
 @inherit_doc
-class GaussianMixtureModel(JavaModelWrapper, JavaSaveable, JavaLoader):
+class GaussianMixtureModel(JavaModelWrapper, JavaSaveable, 
JavaLoader["GaussianMixtureModel"]):
 
     """
     A clustering model derived from the Gaussian Mixture Model method.
@@ -497,18 +532,18 @@ class GaussianMixtureModel(JavaModelWrapper, 
JavaSaveable, JavaLoader):
     True
     """
 
-    @property
+    @property  # type: ignore[misc]
     @since("1.4.0")
-    def weights(self):
+    def weights(self) -> np.ndarray:
         """
         Weights for each Gaussian distribution in the mixture, where 
weights[i] is
         the weight for Gaussian i, and weights.sum == 1.
         """
         return array(self.call("weights"))
 
-    @property
+    @property  # type: ignore[misc]
     @since("1.4.0")
-    def gaussians(self):
+    def gaussians(self) -> List[MultivariateGaussian]:
         """
         Array of MultivariateGaussian where gaussians[i] represents
         the Multivariate Gaussian (Normal) Distribution for Gaussian i.
@@ -517,13 +552,21 @@ class GaussianMixtureModel(JavaModelWrapper, 
JavaSaveable, JavaLoader):
             MultivariateGaussian(gaussian[0], gaussian[1]) for gaussian in 
self.call("gaussians")
         ]
 
-    @property
+    @property  # type: ignore[misc]
     @since("1.4.0")
-    def k(self):
+    def k(self) -> int:
         """Number of gaussians in mixture."""
         return len(self.weights)
 
-    def predict(self, x):
+    @overload
+    def predict(self, x: "VectorLike") -> np.int64:
+        ...
+
+    @overload
+    def predict(self, x: RDD["VectorLike"]) -> RDD[int]:
+        ...
+
+    def predict(self, x: Union["VectorLike", RDD["VectorLike"]]) -> 
Union[np.int64, RDD[int]]:
         """
         Find the cluster to which the point 'x' or each point in RDD 'x'
         has maximum membership in this model.
@@ -548,7 +591,17 @@ class GaussianMixtureModel(JavaModelWrapper, JavaSaveable, 
JavaLoader):
             z = self.predictSoft(x)
             return z.argmax()
 
-    def predictSoft(self, x):
+    @overload
+    def predictSoft(self, x: "VectorLike") -> np.ndarray:
+        ...
+
+    @overload
+    def predictSoft(self, x: RDD["VectorLike"]) -> RDD[pyarray.array]:
+        ...
+
+    def predictSoft(
+        self, x: Union["VectorLike", RDD["VectorLike"]]
+    ) -> Union[np.ndarray, RDD[pyarray.array]]:
         """
         Find the membership of point 'x' or each point in RDD 'x' to all 
mixture components.
 
@@ -579,7 +632,7 @@ class GaussianMixtureModel(JavaModelWrapper, JavaSaveable, 
JavaLoader):
             return self.call("predictSoft", _convert_to_vector(x)).toArray()
 
     @classmethod
-    def load(cls, sc, path):
+    def load(cls, sc: SparkContext, path: str) -> "GaussianMixtureModel":
         """Load the GaussianMixtureModel from disk.
 
         .. versionadded:: 1.5.0
@@ -590,6 +643,8 @@ class GaussianMixtureModel(JavaModelWrapper, JavaSaveable, 
JavaLoader):
         path : str
             Path to where the model is stored.
         """
+        assert sc._jvm is not None
+
         model = cls._load_java(sc, path)
         wrapper = 
sc._jvm.org.apache.spark.mllib.api.python.GaussianMixtureModelWrapper(model)
         return cls(wrapper)
@@ -603,7 +658,15 @@ class GaussianMixture:
     """
 
     @classmethod
-    def train(cls, rdd, k, convergenceTol=1e-3, maxIterations=100, seed=None, 
initialModel=None):
+    def train(
+        cls,
+        rdd: RDD["VectorLike"],
+        k: int,
+        convergenceTol: float = 1e-3,
+        maxIterations: int = 100,
+        seed: Optional[int] = None,
+        initialModel: Optional[GaussianMixtureModel] = None,
+    ) -> GaussianMixtureModel:
         """
         Train a Gaussian Mixture clustering model.
 
@@ -658,7 +721,9 @@ class GaussianMixture:
         return GaussianMixtureModel(java_model)
 
 
-class PowerIterationClusteringModel(JavaModelWrapper, JavaSaveable, 
JavaLoader):
+class PowerIterationClusteringModel(
+    JavaModelWrapper, JavaSaveable, JavaLoader["PowerIterationClusteringModel"]
+):
 
     """
     Model produced by :py:class:`PowerIterationClustering`.
@@ -711,16 +776,16 @@ class PowerIterationClusteringModel(JavaModelWrapper, 
JavaSaveable, JavaLoader):
     ...     pass
     """
 
-    @property
+    @property  # type: ignore[misc]
     @since("1.5.0")
-    def k(self):
+    def k(self) -> int:
         """
         Returns the number of clusters.
         """
         return self.call("k")
 
     @since("1.5.0")
-    def assignments(self):
+    def assignments(self) -> RDD["PowerIterationClustering.Assignment"]:
         """
         Returns the cluster assignments of this model.
         """
@@ -728,10 +793,12 @@ class PowerIterationClusteringModel(JavaModelWrapper, 
JavaSaveable, JavaLoader):
 
     @classmethod
     @since("1.5.0")
-    def load(cls, sc, path):
+    def load(cls, sc: SparkContext, path: str) -> 
"PowerIterationClusteringModel":
         """
         Load a model from the given path.
         """
+        assert sc._jvm is not None
+
         model = cls._load_java(sc, path)
         wrapper = 
sc._jvm.org.apache.spark.mllib.api.python.PowerIterationClusteringModelWrapper(
             model
@@ -757,7 +824,13 @@ class PowerIterationClustering:
     """
 
     @classmethod
-    def train(cls, rdd, k, maxIterations=100, initMode="random"):
+    def train(
+        cls,
+        rdd: RDD[Tuple[int, int, float]],
+        k: int,
+        maxIterations: int = 100,
+        initMode: str = "random",
+    ) -> PowerIterationClusteringModel:
         r"""
         Train PowerIterationClusteringModel
 
@@ -867,18 +940,20 @@ class StreamingKMeansModel(KMeansModel):
     1
     """
 
-    def __init__(self, clusterCenters, clusterWeights):
+    def __init__(self, clusterCenters: List["VectorLike"], clusterWeights: 
"VectorLike"):
         super(StreamingKMeansModel, self).__init__(centers=clusterCenters)
-        self._clusterWeights = list(clusterWeights)
+        self._clusterWeights = list(clusterWeights)  # type: ignore[arg-type]
 
-    @property
+    @property  # type: ignore[misc]
     @since("1.5.0")
-    def clusterWeights(self):
+    def clusterWeights(self) -> List[np.float64]:
         """Return the cluster weights."""
         return self._clusterWeights
 
     @since("1.5.0")
-    def update(self, data, decayFactor, timeUnit):
+    def update(
+        self, data: RDD["VectorLike"], decayFactor: float, timeUnit: str
+    ) -> "StreamingKMeansModel":
         """Update the centroids, according to data
 
         .. versionadded:: 1.5.0
@@ -909,7 +984,7 @@ class StreamingKMeansModel(KMeansModel):
             decayFactor,
             timeUnit,
         )
-        self.centers = array(updatedModel[0])
+        self.centers = array(updatedModel[0])  # type: ignore[assignment]
         self._clusterWeights = list(updatedModel[1])
         return self
 
@@ -938,20 +1013,20 @@ class StreamingKMeans:
         (default: "batches")
     """
 
-    def __init__(self, k=2, decayFactor=1.0, timeUnit="batches"):
+    def __init__(self, k: int = 2, decayFactor: float = 1.0, timeUnit: str = 
"batches"):
         self._k = k
         self._decayFactor = decayFactor
         if timeUnit not in ["batches", "points"]:
             raise ValueError("timeUnit should be 'batches' or 'points', got 
%s." % timeUnit)
         self._timeUnit = timeUnit
-        self._model = None
+        self._model: Optional[StreamingKMeansModel] = None
 
     @since("1.5.0")
-    def latestModel(self):
+    def latestModel(self) -> Optional[StreamingKMeansModel]:
         """Return the latest model"""
         return self._model
 
-    def _validate(self, dstream):
+    def _validate(self, dstream: Any) -> None:
         if self._model is None:
             raise ValueError(
                 "Initial centers should be set either by setInitialCenters " 
"or setRandomCenters."
@@ -962,19 +1037,19 @@ class StreamingKMeans:
             )
 
     @since("1.5.0")
-    def setK(self, k):
+    def setK(self, k: int) -> "StreamingKMeans":
         """Set number of clusters."""
         self._k = k
         return self
 
     @since("1.5.0")
-    def setDecayFactor(self, decayFactor):
+    def setDecayFactor(self, decayFactor: float) -> "StreamingKMeans":
         """Set decay factor."""
         self._decayFactor = decayFactor
         return self
 
     @since("1.5.0")
-    def setHalfLife(self, halfLife, timeUnit):
+    def setHalfLife(self, halfLife: float, timeUnit: str) -> "StreamingKMeans":
         """
         Set number of batches after which the centroids of that
         particular batch has half the weightage.
@@ -984,7 +1059,9 @@ class StreamingKMeans:
         return self
 
     @since("1.5.0")
-    def setInitialCenters(self, centers, weights):
+    def setInitialCenters(
+        self, centers: List["VectorLike"], weights: List[float]
+    ) -> "StreamingKMeans":
         """
         Set initial centers. Should be set before calling trainOn.
         """
@@ -992,7 +1069,7 @@ class StreamingKMeans:
         return self
 
     @since("1.5.0")
-    def setRandomCenters(self, dim, weight, seed):
+    def setRandomCenters(self, dim: int, weight: float, seed: int) -> 
"StreamingKMeans":
         """
         Set the initial centers to be random samples from
         a gaussian population with constant weights.
@@ -1000,39 +1077,39 @@ class StreamingKMeans:
         rng = random.RandomState(seed)
         clusterCenters = rng.randn(self._k, dim)
         clusterWeights = tile(weight, self._k)
-        self._model = StreamingKMeansModel(clusterCenters, clusterWeights)
+        self._model = StreamingKMeansModel(clusterCenters, clusterWeights)  # 
type: ignore[arg-type]
         return self
 
     @since("1.5.0")
-    def trainOn(self, dstream):
+    def trainOn(self, dstream: "DStream[VectorLike]") -> None:
         """Train the model on the incoming dstream."""
         self._validate(dstream)
 
-        def update(rdd):
-            self._model.update(rdd, self._decayFactor, self._timeUnit)
+        def update(rdd: RDD["VectorLike"]) -> None:
+            self._model.update(rdd, self._decayFactor, self._timeUnit)  # 
type: ignore[union-attr]
 
         dstream.foreachRDD(update)
 
     @since("1.5.0")
-    def predictOn(self, dstream):
+    def predictOn(self, dstream: "DStream[VectorLike]") -> "DStream[int]":
         """
         Make predictions on a dstream.
         Returns a transformed dstream object
         """
         self._validate(dstream)
-        return dstream.map(lambda x: self._model.predict(x))
+        return dstream.map(lambda x: self._model.predict(x))  # type: 
ignore[union-attr]
 
     @since("1.5.0")
-    def predictOnValues(self, dstream):
+    def predictOnValues(self, dstream: "DStream[Tuple[T, VectorLike]]") -> 
"DStream[Tuple[T, int]]":
         """
         Make predictions on a keyed dstream.
         Returns a transformed dstream object.
         """
         self._validate(dstream)
-        return dstream.mapValues(lambda x: self._model.predict(x))
+        return dstream.mapValues(lambda x: self._model.predict(x))  # type: 
ignore[union-attr]
 
 
-class LDAModel(JavaModelWrapper, JavaSaveable, Loader):
+class LDAModel(JavaModelWrapper, JavaSaveable, Loader["LDAModel"]):
 
     """A clustering model derived from the LDA method.
 
@@ -1089,16 +1166,18 @@ class LDAModel(JavaModelWrapper, JavaSaveable, Loader):
     """
 
     @since("1.5.0")
-    def topicsMatrix(self):
+    def topicsMatrix(self) -> np.ndarray:
         """Inferred topics, where each topic is represented by a distribution 
over terms."""
         return self.call("topicsMatrix").toArray()
 
     @since("1.5.0")
-    def vocabSize(self):
+    def vocabSize(self) -> int:
         """Vocabulary size (number of terms or terms in the vocabulary)"""
         return self.call("vocabSize")
 
-    def describeTopics(self, maxTermsPerTopic=None):
+    def describeTopics(
+        self, maxTermsPerTopic: Optional[int] = None
+    ) -> List[Tuple[List[int], List[float]]]:
         """Return the topics described by weighted terms.
 
         .. versionadded:: 1.6.0
@@ -1124,7 +1203,7 @@ class LDAModel(JavaModelWrapper, JavaSaveable, Loader):
         return topics
 
     @classmethod
-    def load(cls, sc, path):
+    def load(cls, sc: SparkContext, path: str) -> "LDAModel":
         """Load the LDAModel from disk.
 
         .. versionadded:: 1.5.0
@@ -1153,15 +1232,15 @@ class LDA:
     @classmethod
     def train(
         cls,
-        rdd,
-        k=10,
-        maxIterations=20,
-        docConcentration=-1.0,
-        topicConcentration=-1.0,
-        seed=None,
-        checkpointInterval=10,
-        optimizer="em",
-    ):
+        rdd: RDD[Tuple[int, "VectorLike"]],
+        k: int = 10,
+        maxIterations: int = 20,
+        docConcentration: float = -1.0,
+        topicConcentration: float = -1.0,
+        seed: Optional[int] = None,
+        checkpointInterval: int = 10,
+        optimizer: str = "em",
+    ) -> LDAModel:
         """Train a LDA model.
 
         .. versionadded:: 1.5.0
@@ -1215,7 +1294,7 @@ class LDA:
         return LDAModel(model)
 
 
-def _test():
+def _test() -> None:
     import doctest
     import numpy
     import pyspark.mllib.clustering
diff --git a/python/pyspark/mllib/clustering.pyi 
b/python/pyspark/mllib/clustering.pyi
deleted file mode 100644
index 8a8401d3565..00000000000
--- a/python/pyspark/mllib/clustering.pyi
+++ /dev/null
@@ -1,188 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from typing import overload
-from typing import List, NamedTuple, Optional, Tuple, TypeVar
-
-import array
-
-from numpy import float64, int64, ndarray
-from py4j.java_gateway import JavaObject
-
-from pyspark.mllib._typing import VectorLike
-from pyspark.context import SparkContext
-from pyspark.rdd import RDD
-from pyspark.mllib.common import JavaModelWrapper
-from pyspark.mllib.stat.distribution import MultivariateGaussian
-from pyspark.mllib.util import Saveable, Loader, JavaLoader, JavaSaveable
-from pyspark.streaming.dstream import DStream
-
-T = TypeVar("T")
-
-class BisectingKMeansModel(JavaModelWrapper):
-    centers: List[VectorLike]
-    def __init__(self, java_model: JavaObject) -> None: ...
-    @property
-    def clusterCenters(self) -> List[ndarray]: ...
-    @property
-    def k(self) -> int: ...
-    @overload
-    def predict(self, x: VectorLike) -> int: ...
-    @overload
-    def predict(self, x: RDD[VectorLike]) -> RDD[int]: ...
-    @overload
-    def computeCost(self, x: VectorLike) -> float: ...
-    @overload
-    def computeCost(self, x: RDD[VectorLike]) -> float: ...
-
-class BisectingKMeans:
-    @classmethod
-    def train(
-        self,
-        rdd: RDD[VectorLike],
-        k: int = ...,
-        maxIterations: int = ...,
-        minDivisibleClusterSize: float = ...,
-        seed: int = ...,
-    ) -> BisectingKMeansModel: ...
-
-class KMeansModel(Saveable, Loader[KMeansModel]):
-    centers: List[VectorLike]
-    def __init__(self, centers: List[VectorLike]) -> None: ...
-    @property
-    def clusterCenters(self) -> List[ndarray]: ...
-    @property
-    def k(self) -> int: ...
-    @overload
-    def predict(self, x: VectorLike) -> int: ...
-    @overload
-    def predict(self, x: RDD[VectorLike]) -> RDD[int]: ...
-    def computeCost(self, rdd: RDD[VectorLike]) -> float: ...
-    def save(self, sc: SparkContext, path: str) -> None: ...
-    @classmethod
-    def load(cls, sc: SparkContext, path: str) -> KMeansModel: ...
-
-class KMeans:
-    @classmethod
-    def train(
-        cls,
-        rdd: RDD[VectorLike],
-        k: int,
-        maxIterations: int = ...,
-        initializationMode: str = ...,
-        seed: Optional[int] = ...,
-        initializationSteps: int = ...,
-        epsilon: float = ...,
-        initialModel: Optional[KMeansModel] = ...,
-    ) -> KMeansModel: ...
-
-class GaussianMixtureModel(JavaModelWrapper, JavaSaveable, 
JavaLoader[GaussianMixtureModel]):
-    @property
-    def weights(self) -> ndarray: ...
-    @property
-    def gaussians(self) -> List[MultivariateGaussian]: ...
-    @property
-    def k(self) -> int: ...
-    @overload
-    def predict(self, x: VectorLike) -> int64: ...
-    @overload
-    def predict(self, x: RDD[VectorLike]) -> RDD[int]: ...
-    @overload
-    def predictSoft(self, x: VectorLike) -> ndarray: ...
-    @overload
-    def predictSoft(self, x: RDD[VectorLike]) -> RDD[array.array]: ...
-    @classmethod
-    def load(cls, sc: SparkContext, path: str) -> GaussianMixtureModel: ...
-
-class GaussianMixture:
-    @classmethod
-    def train(
-        cls,
-        rdd: RDD[VectorLike],
-        k: int,
-        convergenceTol: float = ...,
-        maxIterations: int = ...,
-        seed: Optional[int] = ...,
-        initialModel: Optional[GaussianMixtureModel] = ...,
-    ) -> GaussianMixtureModel: ...
-
-class PowerIterationClusteringModel(
-    JavaModelWrapper, JavaSaveable, JavaLoader[PowerIterationClusteringModel]
-):
-    @property
-    def k(self) -> int: ...
-    def assignments(self) -> RDD[PowerIterationClustering.Assignment]: ...
-    @classmethod
-    def load(cls, sc: SparkContext, path: str) -> 
PowerIterationClusteringModel: ...
-
-class PowerIterationClustering:
-    @classmethod
-    def train(
-        cls,
-        rdd: RDD[Tuple[int, int, float]],
-        k: int,
-        maxIterations: int = ...,
-        initMode: str = ...,
-    ) -> PowerIterationClusteringModel: ...
-    class Assignment(NamedTuple("Assignment", [("id", int), ("cluster", 
int)])): ...
-
-class StreamingKMeansModel(KMeansModel):
-    def __init__(self, clusterCenters: List[VectorLike], clusterWeights: 
VectorLike) -> None: ...
-    @property
-    def clusterWeights(self) -> List[float64]: ...
-    centers: List[VectorLike]
-    def update(
-        self, data: RDD[VectorLike], decayFactor: float, timeUnit: str
-    ) -> StreamingKMeansModel: ...
-
-class StreamingKMeans:
-    def __init__(self, k: int = ..., decayFactor: float = ..., timeUnit: str = 
...) -> None: ...
-    def latestModel(self) -> StreamingKMeansModel: ...
-    def setK(self, k: int) -> StreamingKMeans: ...
-    def setDecayFactor(self, decayFactor: float) -> StreamingKMeans: ...
-    def setHalfLife(self, halfLife: float, timeUnit: str) -> StreamingKMeans: 
...
-    def setInitialCenters(
-        self, centers: List[VectorLike], weights: List[float]
-    ) -> StreamingKMeans: ...
-    def setRandomCenters(self, dim: int, weight: float, seed: int) -> 
StreamingKMeans: ...
-    def trainOn(self, dstream: DStream[VectorLike]) -> None: ...
-    def predictOn(self, dstream: DStream[VectorLike]) -> DStream[int]: ...
-    def predictOnValues(self, dstream: DStream[Tuple[T, VectorLike]]) -> 
DStream[Tuple[T, int]]: ...
-
-class LDAModel(JavaModelWrapper, JavaSaveable, Loader[LDAModel]):
-    def topicsMatrix(self) -> ndarray: ...
-    def vocabSize(self) -> int: ...
-    def describeTopics(
-        self, maxTermsPerTopic: Optional[int] = ...
-    ) -> List[Tuple[List[int], List[float]]]: ...
-    @classmethod
-    def load(cls, sc: SparkContext, path: str) -> LDAModel: ...
-
-class LDA:
-    @classmethod
-    def train(
-        cls,
-        rdd: RDD[Tuple[int, VectorLike]],
-        k: int = ...,
-        maxIterations: int = ...,
-        docConcentration: float = ...,
-        topicConcentration: float = ...,
-        seed: Optional[int] = ...,
-        checkpointInterval: int = ...,
-        optimizer: str = ...,
-    ) -> LDAModel: ...


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[spark] branch branch-3.3 updated: [SPARK-37402][PYTHON][MLLIB] Inline typehints for pyspark.mllib.clustering

Reply via email to