This is an automated email from the ASF dual-hosted git repository.
zero323 pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.3 by this push:
new f4457e6310f [SPARK-37402][PYTHON][MLLIB] Inline typehints for
pyspark.mllib.clustering
f4457e6310f is described below
commit f4457e6310f6bd900d7634b279606436a6faf8fb
Author: zero323 <[email protected]>
AuthorDate: Mon Apr 11 11:29:47 2022 +0200
[SPARK-37402][PYTHON][MLLIB] Inline typehints for pyspark.mllib.clustering
### What changes were proposed in this pull request?
This PR migrates the `pyspark.mllib.clustering` type annotations from the stub
file to inline type hints.
### Why are the changes needed?
Part of the ongoing migration of type hints.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Existing tests.
Closes #35578 from zero323/SPARK-37402.
Authored-by: zero323 <[email protected]>
Signed-off-by: zero323 <[email protected]>
(cherry picked from commit e71cf3907b9ff2036dfe45bc8fe939f20cca741b)
Signed-off-by: zero323 <[email protected]>
---
python/pyspark/mllib/clustering.py | 247 ++++++++++++++++++++++++------------
python/pyspark/mllib/clustering.pyi | 188 ---------------------------
2 files changed, 163 insertions(+), 272 deletions(-)
diff --git a/python/pyspark/mllib/clustering.py
b/python/pyspark/mllib/clustering.py
index a9e4fd82089..fd33887fd9e 100644
--- a/python/pyspark/mllib/clustering.py
+++ b/python/pyspark/mllib/clustering.py
@@ -19,7 +19,9 @@ import sys
import array as pyarray
from math import exp, log
from collections import namedtuple
+from typing import Any, List, Optional, Tuple, TypeVar, Union, overload,
TYPE_CHECKING
+import numpy as np
from numpy import array, random, tile
from pyspark import SparkContext, since
@@ -30,6 +32,12 @@ from pyspark.mllib.stat.distribution import
MultivariateGaussian
from pyspark.mllib.util import Saveable, Loader, inherit_doc, JavaLoader,
JavaSaveable
from pyspark.streaming import DStream
+if TYPE_CHECKING:
+ from py4j.java_gateway import JavaObject
+ from pyspark.mllib._typing import VectorLike
+
+T = TypeVar("T")
+
__all__ = [
"BisectingKMeansModel",
"BisectingKMeans",
@@ -67,24 +75,32 @@ class BisectingKMeansModel(JavaModelWrapper):
0.0
"""
- def __init__(self, java_model):
+ def __init__(self, java_model: "JavaObject"):
super(BisectingKMeansModel, self).__init__(java_model)
self.centers = [c.toArray() for c in self.call("clusterCenters")]
- @property
+ @property # type: ignore[misc]
@since("2.0.0")
- def clusterCenters(self):
+ def clusterCenters(self) -> List[np.ndarray]:
"""Get the cluster centers, represented as a list of NumPy
arrays."""
return self.centers
- @property
+ @property # type: ignore[misc]
@since("2.0.0")
- def k(self):
+ def k(self) -> int:
"""Get the number of clusters"""
return self.call("k")
- def predict(self, x):
+ @overload
+ def predict(self, x: "VectorLike") -> int:
+ ...
+
+ @overload
+ def predict(self, x: RDD["VectorLike"]) -> RDD[int]:
+ ...
+
+ def predict(self, x: Union["VectorLike", RDD["VectorLike"]]) -> Union[int,
RDD[int]]:
"""
Find the cluster that each of the points belongs to in this
model.
@@ -111,7 +127,7 @@ class BisectingKMeansModel(JavaModelWrapper):
x = _convert_to_vector(x)
return self.call("predict", x)
- def computeCost(self, x):
+ def computeCost(self, x: Union["VectorLike", RDD["VectorLike"]]) -> float:
"""
Return the Bisecting K-means cost (sum of squared distances of
points to their nearest center) for this model on the given
@@ -159,7 +175,14 @@ class BisectingKMeans:
"""
@classmethod
- def train(self, rdd, k=4, maxIterations=20, minDivisibleClusterSize=1.0,
seed=-1888008604):
+ def train(
+ self,
+ rdd: RDD["VectorLike"],
+ k: int = 4,
+ maxIterations: int = 20,
+ minDivisibleClusterSize: float = 1.0,
+ seed: int = -1888008604,
+ ) -> BisectingKMeansModel:
"""
Runs the bisecting k-means algorithm return the model.
@@ -197,7 +220,7 @@ class BisectingKMeans:
@inherit_doc
-class KMeansModel(Saveable, Loader):
+class KMeansModel(Saveable, Loader["KMeansModel"]):
"""A clustering model derived from the k-means method.
@@ -255,22 +278,30 @@ class KMeansModel(Saveable, Loader):
[array([-1000., -1000.]), array([ 5., 5.]), array([ 1000., 1000.])]
"""
- def __init__(self, centers):
+ def __init__(self, centers: List["VectorLike"]):
self.centers = centers
- @property
+ @property # type: ignore[misc]
@since("1.0.0")
- def clusterCenters(self):
+ def clusterCenters(self) -> List["VectorLike"]:
"""Get the cluster centers, represented as a list of NumPy arrays."""
return self.centers
- @property
+ @property # type: ignore[misc]
@since("1.4.0")
- def k(self):
+ def k(self) -> int:
"""Total number of clusters."""
return len(self.centers)
- def predict(self, x):
+ @overload
+ def predict(self, x: "VectorLike") -> int:
+ ...
+
+ @overload
+ def predict(self, x: RDD["VectorLike"]) -> RDD[int]:
+ ...
+
+ def predict(self, x: Union["VectorLike", RDD["VectorLike"]]) -> Union[int,
RDD[int]]:
"""
Find the cluster that each of the points belongs to in this
model.
@@ -297,13 +328,13 @@ class KMeansModel(Saveable, Loader):
x = _convert_to_vector(x)
for i in range(len(self.centers)):
- distance = x.squared_distance(self.centers[i])
+ distance = x.squared_distance(self.centers[i]) # type:
ignore[attr-defined]
if distance < best_distance:
best = i
best_distance = distance
return best
- def computeCost(self, rdd):
+ def computeCost(self, rdd: RDD["VectorLike"]) -> float:
"""
Return the K-means cost (sum of squared distances of points to
their nearest center) for this model on the given
@@ -324,20 +355,24 @@ class KMeansModel(Saveable, Loader):
return cost
@since("1.4.0")
- def save(self, sc, path):
+ def save(self, sc: SparkContext, path: str) -> None:
"""
Save this model to the given path.
"""
+ assert sc._jvm is not None
+
java_centers = _py2java(sc, [_convert_to_vector(c) for c in
self.centers])
java_model =
sc._jvm.org.apache.spark.mllib.clustering.KMeansModel(java_centers)
java_model.save(sc._jsc.sc(), path)
@classmethod
@since("1.4.0")
- def load(cls, sc, path):
+ def load(cls, sc: SparkContext, path: str) -> "KMeansModel":
"""
Load a model from the given path.
"""
+ assert sc._jvm is not None
+
java_model =
sc._jvm.org.apache.spark.mllib.clustering.KMeansModel.load(sc._jsc.sc(), path)
return KMeansModel(_java2py(sc, java_model.clusterCenters()))
@@ -352,16 +387,16 @@ class KMeans:
@classmethod
def train(
cls,
- rdd,
- k,
- maxIterations=100,
- initializationMode="k-means||",
- seed=None,
- initializationSteps=2,
- epsilon=1e-4,
- initialModel=None,
- distanceMeasure="euclidean",
- ):
+ rdd: RDD["VectorLike"],
+ k: int,
+ maxIterations: int = 100,
+ initializationMode: str = "k-means||",
+ seed: Optional[int] = None,
+ initializationSteps: int = 2,
+ epsilon: float = 1e-4,
+ initialModel: Optional[KMeansModel] = None,
+ distanceMeasure: str = "euclidean",
+ ) -> "KMeansModel":
"""
Train a k-means clustering model.
@@ -428,7 +463,7 @@ class KMeans:
@inherit_doc
-class GaussianMixtureModel(JavaModelWrapper, JavaSaveable, JavaLoader):
+class GaussianMixtureModel(JavaModelWrapper, JavaSaveable,
JavaLoader["GaussianMixtureModel"]):
"""
A clustering model derived from the Gaussian Mixture Model method.
@@ -497,18 +532,18 @@ class GaussianMixtureModel(JavaModelWrapper,
JavaSaveable, JavaLoader):
True
"""
- @property
+ @property # type: ignore[misc]
@since("1.4.0")
- def weights(self):
+ def weights(self) -> np.ndarray:
"""
Weights for each Gaussian distribution in the mixture, where
weights[i] is
the weight for Gaussian i, and weights.sum == 1.
"""
return array(self.call("weights"))
- @property
+ @property # type: ignore[misc]
@since("1.4.0")
- def gaussians(self):
+ def gaussians(self) -> List[MultivariateGaussian]:
"""
Array of MultivariateGaussian where gaussians[i] represents
the Multivariate Gaussian (Normal) Distribution for Gaussian i.
@@ -517,13 +552,21 @@ class GaussianMixtureModel(JavaModelWrapper,
JavaSaveable, JavaLoader):
MultivariateGaussian(gaussian[0], gaussian[1]) for gaussian in
self.call("gaussians")
]
- @property
+ @property # type: ignore[misc]
@since("1.4.0")
- def k(self):
+ def k(self) -> int:
"""Number of gaussians in mixture."""
return len(self.weights)
- def predict(self, x):
+ @overload
+ def predict(self, x: "VectorLike") -> np.int64:
+ ...
+
+ @overload
+ def predict(self, x: RDD["VectorLike"]) -> RDD[int]:
+ ...
+
+ def predict(self, x: Union["VectorLike", RDD["VectorLike"]]) ->
Union[np.int64, RDD[int]]:
"""
Find the cluster to which the point 'x' or each point in RDD 'x'
has maximum membership in this model.
@@ -548,7 +591,17 @@ class GaussianMixtureModel(JavaModelWrapper, JavaSaveable,
JavaLoader):
z = self.predictSoft(x)
return z.argmax()
- def predictSoft(self, x):
+ @overload
+ def predictSoft(self, x: "VectorLike") -> np.ndarray:
+ ...
+
+ @overload
+ def predictSoft(self, x: RDD["VectorLike"]) -> RDD[pyarray.array]:
+ ...
+
+ def predictSoft(
+ self, x: Union["VectorLike", RDD["VectorLike"]]
+ ) -> Union[np.ndarray, RDD[pyarray.array]]:
"""
Find the membership of point 'x' or each point in RDD 'x' to all
mixture components.
@@ -579,7 +632,7 @@ class GaussianMixtureModel(JavaModelWrapper, JavaSaveable,
JavaLoader):
return self.call("predictSoft", _convert_to_vector(x)).toArray()
@classmethod
- def load(cls, sc, path):
+ def load(cls, sc: SparkContext, path: str) -> "GaussianMixtureModel":
"""Load the GaussianMixtureModel from disk.
.. versionadded:: 1.5.0
@@ -590,6 +643,8 @@ class GaussianMixtureModel(JavaModelWrapper, JavaSaveable,
JavaLoader):
path : str
Path to where the model is stored.
"""
+ assert sc._jvm is not None
+
model = cls._load_java(sc, path)
wrapper =
sc._jvm.org.apache.spark.mllib.api.python.GaussianMixtureModelWrapper(model)
return cls(wrapper)
@@ -603,7 +658,15 @@ class GaussianMixture:
"""
@classmethod
- def train(cls, rdd, k, convergenceTol=1e-3, maxIterations=100, seed=None,
initialModel=None):
+ def train(
+ cls,
+ rdd: RDD["VectorLike"],
+ k: int,
+ convergenceTol: float = 1e-3,
+ maxIterations: int = 100,
+ seed: Optional[int] = None,
+ initialModel: Optional[GaussianMixtureModel] = None,
+ ) -> GaussianMixtureModel:
"""
Train a Gaussian Mixture clustering model.
@@ -658,7 +721,9 @@ class GaussianMixture:
return GaussianMixtureModel(java_model)
-class PowerIterationClusteringModel(JavaModelWrapper, JavaSaveable,
JavaLoader):
+class PowerIterationClusteringModel(
+ JavaModelWrapper, JavaSaveable, JavaLoader["PowerIterationClusteringModel"]
+):
"""
Model produced by :py:class:`PowerIterationClustering`.
@@ -711,16 +776,16 @@ class PowerIterationClusteringModel(JavaModelWrapper,
JavaSaveable, JavaLoader):
... pass
"""
- @property
+ @property # type: ignore[misc]
@since("1.5.0")
- def k(self):
+ def k(self) -> int:
"""
Returns the number of clusters.
"""
return self.call("k")
@since("1.5.0")
- def assignments(self):
+ def assignments(self) -> RDD["PowerIterationClustering.Assignment"]:
"""
Returns the cluster assignments of this model.
"""
@@ -728,10 +793,12 @@ class PowerIterationClusteringModel(JavaModelWrapper,
JavaSaveable, JavaLoader):
@classmethod
@since("1.5.0")
- def load(cls, sc, path):
+ def load(cls, sc: SparkContext, path: str) ->
"PowerIterationClusteringModel":
"""
Load a model from the given path.
"""
+ assert sc._jvm is not None
+
model = cls._load_java(sc, path)
wrapper =
sc._jvm.org.apache.spark.mllib.api.python.PowerIterationClusteringModelWrapper(
model
@@ -757,7 +824,13 @@ class PowerIterationClustering:
"""
@classmethod
- def train(cls, rdd, k, maxIterations=100, initMode="random"):
+ def train(
+ cls,
+ rdd: RDD[Tuple[int, int, float]],
+ k: int,
+ maxIterations: int = 100,
+ initMode: str = "random",
+ ) -> PowerIterationClusteringModel:
r"""
Train PowerIterationClusteringModel
@@ -867,18 +940,20 @@ class StreamingKMeansModel(KMeansModel):
1
"""
- def __init__(self, clusterCenters, clusterWeights):
+ def __init__(self, clusterCenters: List["VectorLike"], clusterWeights:
"VectorLike"):
super(StreamingKMeansModel, self).__init__(centers=clusterCenters)
- self._clusterWeights = list(clusterWeights)
+ self._clusterWeights = list(clusterWeights) # type: ignore[arg-type]
- @property
+ @property # type: ignore[misc]
@since("1.5.0")
- def clusterWeights(self):
+ def clusterWeights(self) -> List[np.float64]:
"""Return the cluster weights."""
return self._clusterWeights
@since("1.5.0")
- def update(self, data, decayFactor, timeUnit):
+ def update(
+ self, data: RDD["VectorLike"], decayFactor: float, timeUnit: str
+ ) -> "StreamingKMeansModel":
"""Update the centroids, according to data
.. versionadded:: 1.5.0
@@ -909,7 +984,7 @@ class StreamingKMeansModel(KMeansModel):
decayFactor,
timeUnit,
)
- self.centers = array(updatedModel[0])
+ self.centers = array(updatedModel[0]) # type: ignore[assignment]
self._clusterWeights = list(updatedModel[1])
return self
@@ -938,20 +1013,20 @@ class StreamingKMeans:
(default: "batches")
"""
- def __init__(self, k=2, decayFactor=1.0, timeUnit="batches"):
+ def __init__(self, k: int = 2, decayFactor: float = 1.0, timeUnit: str =
"batches"):
self._k = k
self._decayFactor = decayFactor
if timeUnit not in ["batches", "points"]:
raise ValueError("timeUnit should be 'batches' or 'points', got
%s." % timeUnit)
self._timeUnit = timeUnit
- self._model = None
+ self._model: Optional[StreamingKMeansModel] = None
@since("1.5.0")
- def latestModel(self):
+ def latestModel(self) -> Optional[StreamingKMeansModel]:
"""Return the latest model"""
return self._model
- def _validate(self, dstream):
+ def _validate(self, dstream: Any) -> None:
if self._model is None:
raise ValueError(
"Initial centers should be set either by setInitialCenters "
"or setRandomCenters."
@@ -962,19 +1037,19 @@ class StreamingKMeans:
)
@since("1.5.0")
- def setK(self, k):
+ def setK(self, k: int) -> "StreamingKMeans":
"""Set number of clusters."""
self._k = k
return self
@since("1.5.0")
- def setDecayFactor(self, decayFactor):
+ def setDecayFactor(self, decayFactor: float) -> "StreamingKMeans":
"""Set decay factor."""
self._decayFactor = decayFactor
return self
@since("1.5.0")
- def setHalfLife(self, halfLife, timeUnit):
+ def setHalfLife(self, halfLife: float, timeUnit: str) -> "StreamingKMeans":
"""
Set number of batches after which the centroids of that
particular batch has half the weightage.
@@ -984,7 +1059,9 @@ class StreamingKMeans:
return self
@since("1.5.0")
- def setInitialCenters(self, centers, weights):
+ def setInitialCenters(
+ self, centers: List["VectorLike"], weights: List[float]
+ ) -> "StreamingKMeans":
"""
Set initial centers. Should be set before calling trainOn.
"""
@@ -992,7 +1069,7 @@ class StreamingKMeans:
return self
@since("1.5.0")
- def setRandomCenters(self, dim, weight, seed):
+ def setRandomCenters(self, dim: int, weight: float, seed: int) ->
"StreamingKMeans":
"""
Set the initial centers to be random samples from
a gaussian population with constant weights.
@@ -1000,39 +1077,39 @@ class StreamingKMeans:
rng = random.RandomState(seed)
clusterCenters = rng.randn(self._k, dim)
clusterWeights = tile(weight, self._k)
- self._model = StreamingKMeansModel(clusterCenters, clusterWeights)
+ self._model = StreamingKMeansModel(clusterCenters, clusterWeights) #
type: ignore[arg-type]
return self
@since("1.5.0")
- def trainOn(self, dstream):
+ def trainOn(self, dstream: "DStream[VectorLike]") -> None:
"""Train the model on the incoming dstream."""
self._validate(dstream)
- def update(rdd):
- self._model.update(rdd, self._decayFactor, self._timeUnit)
+ def update(rdd: RDD["VectorLike"]) -> None:
+ self._model.update(rdd, self._decayFactor, self._timeUnit) #
type: ignore[union-attr]
dstream.foreachRDD(update)
@since("1.5.0")
- def predictOn(self, dstream):
+ def predictOn(self, dstream: "DStream[VectorLike]") -> "DStream[int]":
"""
Make predictions on a dstream.
Returns a transformed dstream object
"""
self._validate(dstream)
- return dstream.map(lambda x: self._model.predict(x))
+ return dstream.map(lambda x: self._model.predict(x)) # type:
ignore[union-attr]
@since("1.5.0")
- def predictOnValues(self, dstream):
+ def predictOnValues(self, dstream: "DStream[Tuple[T, VectorLike]]") ->
"DStream[Tuple[T, int]]":
"""
Make predictions on a keyed dstream.
Returns a transformed dstream object.
"""
self._validate(dstream)
- return dstream.mapValues(lambda x: self._model.predict(x))
+ return dstream.mapValues(lambda x: self._model.predict(x)) # type:
ignore[union-attr]
-class LDAModel(JavaModelWrapper, JavaSaveable, Loader):
+class LDAModel(JavaModelWrapper, JavaSaveable, Loader["LDAModel"]):
"""A clustering model derived from the LDA method.
@@ -1089,16 +1166,18 @@ class LDAModel(JavaModelWrapper, JavaSaveable, Loader):
"""
@since("1.5.0")
- def topicsMatrix(self):
+ def topicsMatrix(self) -> np.ndarray:
"""Inferred topics, where each topic is represented by a distribution
over terms."""
return self.call("topicsMatrix").toArray()
@since("1.5.0")
- def vocabSize(self):
+ def vocabSize(self) -> int:
"""Vocabulary size (number of terms or terms in the vocabulary)"""
return self.call("vocabSize")
- def describeTopics(self, maxTermsPerTopic=None):
+ def describeTopics(
+ self, maxTermsPerTopic: Optional[int] = None
+ ) -> List[Tuple[List[int], List[float]]]:
"""Return the topics described by weighted terms.
.. versionadded:: 1.6.0
@@ -1124,7 +1203,7 @@ class LDAModel(JavaModelWrapper, JavaSaveable, Loader):
return topics
@classmethod
- def load(cls, sc, path):
+ def load(cls, sc: SparkContext, path: str) -> "LDAModel":
"""Load the LDAModel from disk.
.. versionadded:: 1.5.0
@@ -1153,15 +1232,15 @@ class LDA:
@classmethod
def train(
cls,
- rdd,
- k=10,
- maxIterations=20,
- docConcentration=-1.0,
- topicConcentration=-1.0,
- seed=None,
- checkpointInterval=10,
- optimizer="em",
- ):
+ rdd: RDD[Tuple[int, "VectorLike"]],
+ k: int = 10,
+ maxIterations: int = 20,
+ docConcentration: float = -1.0,
+ topicConcentration: float = -1.0,
+ seed: Optional[int] = None,
+ checkpointInterval: int = 10,
+ optimizer: str = "em",
+ ) -> LDAModel:
"""Train a LDA model.
.. versionadded:: 1.5.0
@@ -1215,7 +1294,7 @@ class LDA:
return LDAModel(model)
-def _test():
+def _test() -> None:
import doctest
import numpy
import pyspark.mllib.clustering
diff --git a/python/pyspark/mllib/clustering.pyi
b/python/pyspark/mllib/clustering.pyi
deleted file mode 100644
index 8a8401d3565..00000000000
--- a/python/pyspark/mllib/clustering.pyi
+++ /dev/null
@@ -1,188 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from typing import overload
-from typing import List, NamedTuple, Optional, Tuple, TypeVar
-
-import array
-
-from numpy import float64, int64, ndarray
-from py4j.java_gateway import JavaObject
-
-from pyspark.mllib._typing import VectorLike
-from pyspark.context import SparkContext
-from pyspark.rdd import RDD
-from pyspark.mllib.common import JavaModelWrapper
-from pyspark.mllib.stat.distribution import MultivariateGaussian
-from pyspark.mllib.util import Saveable, Loader, JavaLoader, JavaSaveable
-from pyspark.streaming.dstream import DStream
-
-T = TypeVar("T")
-
-class BisectingKMeansModel(JavaModelWrapper):
- centers: List[VectorLike]
- def __init__(self, java_model: JavaObject) -> None: ...
- @property
- def clusterCenters(self) -> List[ndarray]: ...
- @property
- def k(self) -> int: ...
- @overload
- def predict(self, x: VectorLike) -> int: ...
- @overload
- def predict(self, x: RDD[VectorLike]) -> RDD[int]: ...
- @overload
- def computeCost(self, x: VectorLike) -> float: ...
- @overload
- def computeCost(self, x: RDD[VectorLike]) -> float: ...
-
-class BisectingKMeans:
- @classmethod
- def train(
- self,
- rdd: RDD[VectorLike],
- k: int = ...,
- maxIterations: int = ...,
- minDivisibleClusterSize: float = ...,
- seed: int = ...,
- ) -> BisectingKMeansModel: ...
-
-class KMeansModel(Saveable, Loader[KMeansModel]):
- centers: List[VectorLike]
- def __init__(self, centers: List[VectorLike]) -> None: ...
- @property
- def clusterCenters(self) -> List[ndarray]: ...
- @property
- def k(self) -> int: ...
- @overload
- def predict(self, x: VectorLike) -> int: ...
- @overload
- def predict(self, x: RDD[VectorLike]) -> RDD[int]: ...
- def computeCost(self, rdd: RDD[VectorLike]) -> float: ...
- def save(self, sc: SparkContext, path: str) -> None: ...
- @classmethod
- def load(cls, sc: SparkContext, path: str) -> KMeansModel: ...
-
-class KMeans:
- @classmethod
- def train(
- cls,
- rdd: RDD[VectorLike],
- k: int,
- maxIterations: int = ...,
- initializationMode: str = ...,
- seed: Optional[int] = ...,
- initializationSteps: int = ...,
- epsilon: float = ...,
- initialModel: Optional[KMeansModel] = ...,
- ) -> KMeansModel: ...
-
-class GaussianMixtureModel(JavaModelWrapper, JavaSaveable,
JavaLoader[GaussianMixtureModel]):
- @property
- def weights(self) -> ndarray: ...
- @property
- def gaussians(self) -> List[MultivariateGaussian]: ...
- @property
- def k(self) -> int: ...
- @overload
- def predict(self, x: VectorLike) -> int64: ...
- @overload
- def predict(self, x: RDD[VectorLike]) -> RDD[int]: ...
- @overload
- def predictSoft(self, x: VectorLike) -> ndarray: ...
- @overload
- def predictSoft(self, x: RDD[VectorLike]) -> RDD[array.array]: ...
- @classmethod
- def load(cls, sc: SparkContext, path: str) -> GaussianMixtureModel: ...
-
-class GaussianMixture:
- @classmethod
- def train(
- cls,
- rdd: RDD[VectorLike],
- k: int,
- convergenceTol: float = ...,
- maxIterations: int = ...,
- seed: Optional[int] = ...,
- initialModel: Optional[GaussianMixtureModel] = ...,
- ) -> GaussianMixtureModel: ...
-
-class PowerIterationClusteringModel(
- JavaModelWrapper, JavaSaveable, JavaLoader[PowerIterationClusteringModel]
-):
- @property
- def k(self) -> int: ...
- def assignments(self) -> RDD[PowerIterationClustering.Assignment]: ...
- @classmethod
- def load(cls, sc: SparkContext, path: str) ->
PowerIterationClusteringModel: ...
-
-class PowerIterationClustering:
- @classmethod
- def train(
- cls,
- rdd: RDD[Tuple[int, int, float]],
- k: int,
- maxIterations: int = ...,
- initMode: str = ...,
- ) -> PowerIterationClusteringModel: ...
- class Assignment(NamedTuple("Assignment", [("id", int), ("cluster",
int)])): ...
-
-class StreamingKMeansModel(KMeansModel):
- def __init__(self, clusterCenters: List[VectorLike], clusterWeights:
VectorLike) -> None: ...
- @property
- def clusterWeights(self) -> List[float64]: ...
- centers: List[VectorLike]
- def update(
- self, data: RDD[VectorLike], decayFactor: float, timeUnit: str
- ) -> StreamingKMeansModel: ...
-
-class StreamingKMeans:
- def __init__(self, k: int = ..., decayFactor: float = ..., timeUnit: str =
...) -> None: ...
- def latestModel(self) -> StreamingKMeansModel: ...
- def setK(self, k: int) -> StreamingKMeans: ...
- def setDecayFactor(self, decayFactor: float) -> StreamingKMeans: ...
- def setHalfLife(self, halfLife: float, timeUnit: str) -> StreamingKMeans:
...
- def setInitialCenters(
- self, centers: List[VectorLike], weights: List[float]
- ) -> StreamingKMeans: ...
- def setRandomCenters(self, dim: int, weight: float, seed: int) ->
StreamingKMeans: ...
- def trainOn(self, dstream: DStream[VectorLike]) -> None: ...
- def predictOn(self, dstream: DStream[VectorLike]) -> DStream[int]: ...
- def predictOnValues(self, dstream: DStream[Tuple[T, VectorLike]]) ->
DStream[Tuple[T, int]]: ...
-
-class LDAModel(JavaModelWrapper, JavaSaveable, Loader[LDAModel]):
- def topicsMatrix(self) -> ndarray: ...
- def vocabSize(self) -> int: ...
- def describeTopics(
- self, maxTermsPerTopic: Optional[int] = ...
- ) -> List[Tuple[List[int], List[float]]]: ...
- @classmethod
- def load(cls, sc: SparkContext, path: str) -> LDAModel: ...
-
-class LDA:
- @classmethod
- def train(
- cls,
- rdd: RDD[Tuple[int, VectorLike]],
- k: int = ...,
- maxIterations: int = ...,
- docConcentration: float = ...,
- topicConcentration: float = ...,
- seed: Optional[int] = ...,
- checkpointInterval: int = ...,
- optimizer: str = ...,
- ) -> LDAModel: ...
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]