This is an automated email from the ASF dual-hosted git repository.
zero323 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new c17ba3f [SPARK-37417][PYTHON][ML] Inline type hints for
pyspark.ml.linalg.__init__.py
c17ba3f is described below
commit c17ba3f9e9802ae5491cd914a4262cfe4e8e6d20
Author: zero323 <[email protected]>
AuthorDate: Sun Feb 6 10:59:21 2022 +0100
[SPARK-37417][PYTHON][ML] Inline type hints for
pyspark.ml.linalg.__init__.py
### What changes were proposed in this pull request?
Migration of type annotations for `pyspark.ml.linalg.__init__.py` from
stub file to inline hints.
### Why are the changes needed?
As part of ongoing type hint migrations.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Existing tests.
Closes #35380 from zero323/SPARK-37417.
Authored-by: zero323 <[email protected]>
Signed-off-by: zero323 <[email protected]>
---
python/pyspark/ml/_typing.pyi | 7 +-
python/pyspark/ml/linalg/__init__.py | 370 ++++++++++++++++++++++----------
python/pyspark/ml/linalg/__init__.pyi | 243 ---------------------
python/pyspark/mllib/linalg/__init__.py | 14 +-
4 files changed, 267 insertions(+), 367 deletions(-)
diff --git a/python/pyspark/ml/_typing.pyi b/python/pyspark/ml/_typing.pyi
index b51aa96..7862078 100644
--- a/python/pyspark/ml/_typing.pyi
+++ b/python/pyspark/ml/_typing.pyi
@@ -16,12 +16,15 @@
# specific language governing permissions and limitations
# under the License.
-from typing import Any, Dict, TypeVar, Union
+from typing import Any, Dict, List, TypeVar, Tuple, Union
from typing_extensions import Literal
+from numpy import ndarray
+
import pyspark.ml.base
import pyspark.ml.param
import pyspark.ml.util
+from pyspark.ml.linalg import Vector
import pyspark.ml.wrapper
from py4j.java_gateway import JavaObject
@@ -75,3 +78,5 @@ RankingEvaluatorMetricType = Union[
Literal["ndcgAtK"],
Literal["recallAtK"],
]
+
+VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...]]
diff --git a/python/pyspark/ml/linalg/__init__.py
b/python/pyspark/ml/linalg/__init__.py
index 03e63e9..d3d2cbd 100644
--- a/python/pyspark/ml/linalg/__init__.py
+++ b/python/pyspark/ml/linalg/__init__.py
@@ -40,6 +40,22 @@ from pyspark.sql.types import (
BooleanType,
)
+from typing import (
+ Any,
+ Callable,
+ cast,
+ Dict,
+ Iterable,
+ List,
+ Optional,
+ overload,
+ Sequence,
+ Tuple,
+ Type,
+ TYPE_CHECKING,
+ Union,
+)
+
__all__ = [
"Vector",
@@ -52,6 +68,11 @@ __all__ = [
"Matrices",
]
+if TYPE_CHECKING:
+ from pyspark.mllib._typing import NormType
+ from pyspark.ml._typing import VectorLike
+ from scipy.sparse import spmatrix
+
# Check whether we have SciPy. MLlib works without it too, but if we have it,
some methods,
# such as _dot and _serialize_double_vector, start to support scipy.sparse
matrices.
@@ -65,23 +86,23 @@ except BaseException:
_have_scipy = False
-def _convert_to_vector(d):
+def _convert_to_vector(d: Union["VectorLike", "spmatrix", range]) -> "Vector":
if isinstance(d, Vector):
return d
elif type(d) in (array.array, np.array, np.ndarray, list, tuple, range):
return DenseVector(d)
elif _have_scipy and scipy.sparse.issparse(d):
- assert d.shape[1] == 1, "Expected column vector"
+ assert cast("spmatrix", d).shape[1] == 1, "Expected column vector"
# Make sure the converted csc_matrix has sorted indices.
- csc = d.tocsc()
+ csc = cast("spmatrix", d).tocsc()
if not csc.has_sorted_indices:
csc.sort_indices()
- return SparseVector(d.shape[0], csc.indices, csc.data)
+ return SparseVector(cast("spmatrix", d).shape[0], csc.indices,
csc.data)
else:
raise TypeError("Cannot convert type %s into Vector" % type(d))
-def _vector_size(v):
+def _vector_size(v: Union["VectorLike", "spmatrix", range]) -> int:
"""
Returns the size of the vector.
@@ -112,24 +133,24 @@ def _vector_size(v):
else:
raise ValueError("Cannot treat an ndarray of shape %s as a vector"
% str(v.shape))
elif _have_scipy and scipy.sparse.issparse(v):
- assert v.shape[1] == 1, "Expected column vector"
- return v.shape[0]
+ assert cast("spmatrix", v).shape[1] == 1, "Expected column vector"
+ return cast("spmatrix", v).shape[0]
else:
raise TypeError("Cannot treat type %s as a vector" % type(v))
-def _format_float(f, digits=4):
+def _format_float(f: float, digits: int = 4) -> str:
s = str(round(f, digits))
if "." in s:
s = s[: s.index(".") + 1 + digits]
return s
-def _format_float_list(xs):
+def _format_float_list(xs: Iterable[float]) -> List[str]:
return [_format_float(x) for x in xs]
-def _double_to_long_bits(value):
+def _double_to_long_bits(value: float) -> int:
if np.isnan(value):
value = float("nan")
# pack double into 64 bits, then unpack as long int
@@ -142,7 +163,7 @@ class VectorUDT(UserDefinedType):
"""
@classmethod
- def sqlType(cls):
+ def sqlType(cls) -> StructType:
return StructType(
[
StructField("type", ByteType(), False),
@@ -153,37 +174,41 @@ class VectorUDT(UserDefinedType):
)
@classmethod
- def module(cls):
+ def module(cls) -> str:
return "pyspark.ml.linalg"
@classmethod
- def scalaUDT(cls):
+ def scalaUDT(cls) -> str:
return "org.apache.spark.ml.linalg.VectorUDT"
- def serialize(self, obj):
+ def serialize(
+ self, obj: "Vector"
+ ) -> Tuple[int, Optional[int], Optional[List[int]], List[float]]:
if isinstance(obj, SparseVector):
indices = [int(i) for i in obj.indices]
values = [float(v) for v in obj.values]
return (0, obj.size, indices, values)
elif isinstance(obj, DenseVector):
- values = [float(v) for v in obj]
+ values = [float(v) for v in obj] # type: ignore[attr-defined]
return (1, None, None, values)
else:
raise TypeError("cannot serialize %r of type %r" % (obj,
type(obj)))
- def deserialize(self, datum):
+ def deserialize(
+ self, datum: Tuple[int, Optional[int], Optional[List[int]],
List[float]]
+ ) -> "Vector":
assert (
len(datum) == 4
), "VectorUDT.deserialize given row with length %d but requires 4" %
len(datum)
tpe = datum[0]
if tpe == 0:
- return SparseVector(datum[1], datum[2], datum[3])
+ return SparseVector(cast(int, datum[1]), cast(List[int],
datum[2]), datum[3])
elif tpe == 1:
return DenseVector(datum[3])
else:
raise ValueError("do not recognize type %r" % tpe)
- def simpleString(self):
+ def simpleString(self) -> str:
return "vector"
@@ -193,7 +218,7 @@ class MatrixUDT(UserDefinedType):
"""
@classmethod
- def sqlType(cls):
+ def sqlType(cls) -> StructType:
return StructType(
[
StructField("type", ByteType(), False),
@@ -207,14 +232,16 @@ class MatrixUDT(UserDefinedType):
)
@classmethod
- def module(cls):
+ def module(cls) -> str:
return "pyspark.ml.linalg"
@classmethod
- def scalaUDT(cls):
+ def scalaUDT(cls) -> str:
return "org.apache.spark.ml.linalg.MatrixUDT"
- def serialize(self, obj):
+ def serialize(
+ self, obj: "Matrix"
+ ) -> Tuple[int, int, int, Optional[List[int]], Optional[List[int]],
List[float], bool]:
if isinstance(obj, SparseMatrix):
colPtrs = [int(i) for i in obj.colPtrs]
rowIndices = [int(i) for i in obj.rowIndices]
@@ -234,19 +261,22 @@ class MatrixUDT(UserDefinedType):
else:
raise TypeError("cannot serialize type %r" % (type(obj)))
- def deserialize(self, datum):
+ def deserialize(
+ self,
+ datum: Tuple[int, int, int, Optional[List[int]], Optional[List[int]],
List[float], bool],
+ ) -> "Matrix":
assert (
len(datum) == 7
), "MatrixUDT.deserialize given row with length %d but requires 7" %
len(datum)
tpe = datum[0]
if tpe == 0:
- return SparseMatrix(*datum[1:])
+ return SparseMatrix(*datum[1:]) # type: ignore[arg-type]
elif tpe == 1:
return DenseMatrix(datum[1], datum[2], datum[5], datum[6])
else:
raise ValueError("do not recognize type %r" % tpe)
- def simpleString(self):
+ def simpleString(self) -> str:
return "matrix"
@@ -258,7 +288,7 @@ class Vector:
Abstract class for DenseVector and SparseVector
"""
- def toArray(self):
+ def toArray(self) -> np.ndarray:
"""
Convert the vector into an numpy.ndarray
@@ -266,6 +296,9 @@ class Vector:
"""
raise NotImplementedError
+ def __len__(self) -> int:
+ raise NotImplementedError
+
class DenseVector(Vector):
"""
@@ -293,25 +326,26 @@ class DenseVector(Vector):
DenseVector([-1.0, -2.0])
"""
- def __init__(self, ar):
+ def __init__(self, ar: Union[bytes, np.ndarray, Iterable[float]]):
+ ar_: np.ndarray
if isinstance(ar, bytes):
- ar = np.frombuffer(ar, dtype=np.float64)
+ ar_ = np.frombuffer(ar, dtype=np.float64)
elif not isinstance(ar, np.ndarray):
- ar = np.array(ar, dtype=np.float64)
- if ar.dtype != np.float64:
- ar = ar.astype(np.float64)
- self.array = ar
+ ar_ = np.array(ar, dtype=np.float64)
+ else:
+ ar_ = ar.astype(np.float64) if ar.dtype != np.float64 else ar
+ self.array = ar_
- def __reduce__(self):
+ def __reduce__(self) -> Tuple[Type["DenseVector"], Tuple[bytes]]:
return DenseVector, (self.array.tobytes(),)
- def numNonzeros(self):
+ def numNonzeros(self) -> int:
"""
Number of nonzero elements. This scans all active values and count non
zeros
"""
return np.count_nonzero(self.array)
- def norm(self, p):
+ def norm(self, p: "NormType") -> np.float64:
"""
Calculates the norm of a DenseVector.
@@ -325,7 +359,7 @@ class DenseVector(Vector):
"""
return np.linalg.norm(self.array, p)
- def dot(self, other):
+ def dot(self, other: Iterable[float]) -> np.float64:
"""
Compute the dot product of two Vectors. We support
(Numpy array, list, SparseVector, or SciPy sparse)
@@ -359,8 +393,8 @@ class DenseVector(Vector):
assert len(self) == other.shape[0], "dimension mismatch"
return np.dot(self.array, other)
elif _have_scipy and scipy.sparse.issparse(other):
- assert len(self) == other.shape[0], "dimension mismatch"
- return other.transpose().dot(self.toArray())
+ assert len(self) == cast("spmatrix", other).shape[0], "dimension
mismatch"
+ return cast("spmatrix", other).transpose().dot(self.toArray())
else:
assert len(self) == _vector_size(other), "dimension mismatch"
if isinstance(other, SparseVector):
@@ -368,9 +402,9 @@ class DenseVector(Vector):
elif isinstance(other, Vector):
return np.dot(self.toArray(), other.toArray())
else:
- return np.dot(self.toArray(), other)
+ return np.dot(self.toArray(), other) # type:
ignore[call-overload]
- def squared_distance(self, other):
+ def squared_distance(self, other: Iterable[float]) -> np.float64:
"""
Squared distance of two Vectors.
@@ -401,41 +435,49 @@ class DenseVector(Vector):
if isinstance(other, SparseVector):
return other.squared_distance(self)
elif _have_scipy and scipy.sparse.issparse(other):
- return _convert_to_vector(other).squared_distance(self)
+ return _convert_to_vector(other).squared_distance(self) # type:
ignore[attr-defined]
if isinstance(other, Vector):
other = other.toArray()
elif not isinstance(other, np.ndarray):
other = np.array(other)
- diff = self.toArray() - other
+ diff: np.ndarray = self.toArray() - other
return np.dot(diff, diff)
- def toArray(self):
+ def toArray(self) -> np.ndarray:
"""
Returns the underlying numpy.ndarray
"""
return self.array
@property
- def values(self):
+ def values(self) -> np.ndarray:
"""
Returns the underlying numpy.ndarray
"""
return self.array
- def __getitem__(self, item):
+ @overload
+ def __getitem__(self, item: int) -> np.float64:
+ ...
+
+ @overload
+ def __getitem__(self, item: slice) -> np.ndarray:
+ ...
+
+ def __getitem__(self, item: Union[int, slice]) -> Union[np.float64,
np.ndarray]:
return self.array[item]
- def __len__(self):
+ def __len__(self) -> int:
return len(self.array)
- def __str__(self):
+ def __str__(self) -> str:
return "[" + ",".join([str(v) for v in self.array]) + "]"
- def __repr__(self):
+ def __repr__(self) -> str:
return "DenseVector([%s])" % (", ".join(_format_float(i) for i in
self.array))
- def __eq__(self, other):
+ def __eq__(self, other: Any) -> bool:
if isinstance(other, DenseVector):
return np.array_equal(self.array, other.array)
elif isinstance(other, SparseVector):
@@ -444,10 +486,10 @@ class DenseVector(Vector):
return Vectors._equals(list(range(len(self))), self.array,
other.indices, other.values)
return False
- def __ne__(self, other):
+ def __ne__(self, other: Any) -> bool:
return not self == other
- def __hash__(self):
+ def __hash__(self) -> int:
size = len(self)
result = 31 + size
nnz = 0
@@ -461,14 +503,14 @@ class DenseVector(Vector):
i += 1
return result
- def __getattr__(self, item):
+ def __getattr__(self, item: str) -> Any:
return getattr(self.array, item)
- def __neg__(self):
+ def __neg__(self) -> "DenseVector":
return DenseVector(-self.array)
- def _delegate(op):
- def func(self, other):
+ def _delegate(op: str) -> Callable[["DenseVector", Any], "DenseVector"]:
# type: ignore[misc]
+ def func(self: "DenseVector", other: Any) -> "DenseVector":
if isinstance(other, DenseVector):
other = other.array
return DenseVector(getattr(self.array, op)(other))
@@ -495,7 +537,33 @@ class SparseVector(Vector):
alternatively pass SciPy's {scipy.sparse} data types.
"""
- def __init__(self, size, *args):
+ @overload
+ def __init__(self, size: int, __indices: bytes, __values: bytes):
+ ...
+
+ @overload
+ def __init__(self, size: int, *args: Tuple[int, float]):
+ ...
+
+ @overload
+ def __init__(self, size: int, __indices: Iterable[int], __values:
Iterable[float]):
+ ...
+
+ @overload
+ def __init__(self, size: int, __pairs: Iterable[Tuple[int, float]]):
+ ...
+
+ @overload
+ def __init__(self, size: int, __map: Dict[int, float]):
+ ...
+
+ def __init__(
+ self,
+ size: int,
+ *args: Union[
+ bytes, Tuple[int, float], Iterable[float], Iterable[Tuple[int,
float]], Dict[int, float]
+ ],
+ ):
"""
Create a sparse vector, using either a dictionary, a list of
(index, value) pairs, or two separate arrays of indices and
@@ -535,7 +603,7 @@ class SparseVector(Vector):
pairs = args[0]
if type(pairs) == dict:
pairs = pairs.items()
- pairs = sorted(pairs)
+ pairs = cast(Iterable[Tuple[int, float]], sorted(pairs))
self.indices = np.array([p[0] for p in pairs], dtype=np.int32)
""" A list of indices corresponding to active entries. """
self.values = np.array([p[1] for p in pairs], dtype=np.float64)
@@ -570,13 +638,13 @@ class SparseVector(Vector):
)
assert np.min(self.indices) >= 0, "Contains negative index %d" %
(np.min(self.indices))
- def numNonzeros(self):
+ def numNonzeros(self) -> int:
"""
Number of nonzero elements. This scans all active values and count non
zeros.
"""
return np.count_nonzero(self.values)
- def norm(self, p):
+ def norm(self, p: "NormType") -> np.float64:
"""
Calculates the norm of a SparseVector.
@@ -590,10 +658,10 @@ class SparseVector(Vector):
"""
return np.linalg.norm(self.values, p)
- def __reduce__(self):
+ def __reduce__(self) -> Tuple[Type["SparseVector"], Tuple[int, bytes,
bytes]]:
return (SparseVector, (self.size, self.indices.tobytes(),
self.values.tobytes()))
- def dot(self, other):
+ def dot(self, other: Iterable[float]) -> np.float64:
"""
Dot product with a SparseVector or 1- or 2-dimensional Numpy array.
@@ -643,15 +711,15 @@ class SparseVector(Vector):
self_cmind = np.in1d(self.indices, other.indices,
assume_unique=True)
self_values = self.values[self_cmind]
if self_values.size == 0:
- return 0.0
+ return np.float64(0.0)
else:
other_cmind = np.in1d(other.indices, self.indices,
assume_unique=True)
return np.dot(self_values, other.values[other_cmind])
else:
- return self.dot(_convert_to_vector(other))
+ return self.dot(_convert_to_vector(other)) # type:
ignore[arg-type]
- def squared_distance(self, other):
+ def squared_distance(self, other: Iterable[float]) -> np.float64:
"""
Squared distance from a SparseVector or 1-dimensional NumPy array.
@@ -719,9 +787,9 @@ class SparseVector(Vector):
j += 1
return result
else:
- return self.squared_distance(_convert_to_vector(other))
+ return self.squared_distance(_convert_to_vector(other)) # type:
ignore[arg-type]
- def toArray(self):
+ def toArray(self) -> np.ndarray:
"""
Returns a copy of this SparseVector as a 1-dimensional numpy.ndarray.
"""
@@ -729,15 +797,15 @@ class SparseVector(Vector):
arr[self.indices] = self.values
return arr
- def __len__(self):
+ def __len__(self) -> int:
return self.size
- def __str__(self):
+ def __str__(self) -> str:
inds = "[" + ",".join([str(i) for i in self.indices]) + "]"
vals = "[" + ",".join([str(v) for v in self.values]) + "]"
return "(" + ",".join((str(self.size), inds, vals)) + ")"
- def __repr__(self):
+ def __repr__(self) -> str:
inds = self.indices
vals = self.values
entries = ", ".join(
@@ -745,7 +813,7 @@ class SparseVector(Vector):
)
return "SparseVector({0}, {{{1}}})".format(self.size, entries)
- def __eq__(self, other):
+ def __eq__(self, other: Any) -> bool:
if isinstance(other, SparseVector):
return (
other.size == self.size
@@ -758,7 +826,7 @@ class SparseVector(Vector):
return Vectors._equals(self.indices, self.values,
list(range(len(other))), other.array)
return False
- def __getitem__(self, index):
+ def __getitem__(self, index: int) -> np.float64:
inds = self.indices
vals = self.values
if not isinstance(index, int):
@@ -770,18 +838,18 @@ class SparseVector(Vector):
index += self.size
if (inds.size == 0) or (index > inds.item(-1)):
- return 0.0
+ return np.float64(0.0)
insert_index = np.searchsorted(inds, index)
row_ind = inds[insert_index]
if row_ind == index:
return vals[insert_index]
- return 0.0
+ return np.float64(0.0)
- def __ne__(self, other):
+ def __ne__(self, other: Any) -> bool:
return not self.__eq__(other)
- def __hash__(self):
+ def __hash__(self) -> int:
result = 31 + self.size
nnz = 0
i = 0
@@ -809,7 +877,37 @@ class Vectors:
"""
@staticmethod
- def sparse(size, *args):
+ @overload
+ def sparse(size: int, __indices: bytes, __values: bytes) -> SparseVector:
+ ...
+
+ @staticmethod
+ @overload
+ def sparse(size: int, *args: Tuple[int, float]) -> SparseVector:
+ ...
+
+ @staticmethod
+ @overload
+ def sparse(size: int, __indices: Iterable[int], __values: Iterable[float])
-> SparseVector:
+ ...
+
+ @staticmethod
+ @overload
+ def sparse(size: int, __pairs: Iterable[Tuple[int, float]]) ->
SparseVector:
+ ...
+
+ @staticmethod
+ @overload
+ def sparse(size: int, __map: Dict[int, float]) -> SparseVector:
+ ...
+
+ @staticmethod
+ def sparse(
+ size: int,
+ *args: Union[
+ bytes, Tuple[int, float], Iterable[float], Iterable[Tuple[int,
float]], Dict[int, float]
+ ],
+ ) -> SparseVector:
"""
Create a sparse vector, using either a dictionary, a list of
(index, value) pairs, or two separate arrays of indices and
@@ -832,10 +930,25 @@ class Vectors:
>>> Vectors.sparse(4, [1, 3], [1.0, 5.5])
SparseVector(4, {1: 1.0, 3: 5.5})
"""
- return SparseVector(size, *args)
+ return SparseVector(size, *args) # type: ignore[arg-type]
+
+ @overload
+ @staticmethod
+ def dense(*elements: float) -> DenseVector:
+ ...
+
+ @overload
+ @staticmethod
+ def dense(__arr: bytes) -> DenseVector:
+ ...
+
+ @overload
+ @staticmethod
+ def dense(__arr: Iterable[float]) -> DenseVector:
+ ...
@staticmethod
- def dense(*elements):
+ def dense(*elements: Union[float, bytes, np.ndarray, Iterable[float]]) ->
DenseVector:
"""
Create a dense vector of 64-bit floats from a Python list or numbers.
@@ -848,11 +961,11 @@ class Vectors:
"""
if len(elements) == 1 and not isinstance(elements[0], (float, int)):
# it's list, numpy.array or other iterable object.
- elements = elements[0]
- return DenseVector(elements)
+ elements = elements[0] # type: ignore[assignment]
+ return DenseVector(cast(Iterable[float], elements))
@staticmethod
- def squared_distance(v1, v2):
+ def squared_distance(v1: Vector, v2: Vector) -> np.float64:
"""
Squared distance between two vectors.
a and b can be of type SparseVector, DenseVector, np.ndarray
@@ -866,21 +979,26 @@ class Vectors:
51.0
"""
v1, v2 = _convert_to_vector(v1), _convert_to_vector(v2)
- return v1.squared_distance(v2)
+ return v1.squared_distance(v2) # type: ignore[attr-defined]
@staticmethod
- def norm(vector, p):
+ def norm(vector: Vector, p: "NormType") -> np.float64:
"""
Find norm of the given vector.
"""
- return _convert_to_vector(vector).norm(p)
+ return _convert_to_vector(vector).norm(p) # type: ignore[attr-defined]
@staticmethod
- def zeros(size):
+ def zeros(size: int) -> DenseVector:
return DenseVector(np.zeros(size))
@staticmethod
- def _equals(v1_indices, v1_values, v2_indices, v2_values):
+ def _equals(
+ v1_indices: Union[Sequence[int], np.ndarray],
+ v1_values: Union[Sequence[float], np.ndarray],
+ v2_indices: Union[Sequence[int], np.ndarray],
+ v2_values: Union[Sequence[float], np.ndarray],
+ ) -> bool:
"""
Check equality between sparse/dense vectors,
v1_indices and v2_indices assume to be strictly increasing.
@@ -913,19 +1031,19 @@ class Matrix:
Represents a local matrix.
"""
- def __init__(self, numRows, numCols, isTransposed=False):
+ def __init__(self, numRows: int, numCols: int, isTransposed: bool = False):
self.numRows = numRows
self.numCols = numCols
self.isTransposed = isTransposed
- def toArray(self):
+ def toArray(self) -> np.ndarray:
"""
Returns its elements in a numpy.ndarray.
"""
raise NotImplementedError
@staticmethod
- def _convert_to_array(array_like, dtype):
+ def _convert_to_array(array_like: Union[bytes, Iterable[float]], dtype:
Any) -> np.ndarray:
"""
Convert Matrix attributes which are array-like or buffer to array.
"""
@@ -939,13 +1057,19 @@ class DenseMatrix(Matrix):
Column-major dense matrix.
"""
- def __init__(self, numRows, numCols, values, isTransposed=False):
+ def __init__(
+ self,
+ numRows: int,
+ numCols: int,
+ values: Union[bytes, Iterable[float]],
+ isTransposed: bool = False,
+ ):
Matrix.__init__(self, numRows, numCols, isTransposed)
values = self._convert_to_array(values, np.float64)
assert len(values) == numRows * numCols
self.values = values
- def __reduce__(self):
+ def __reduce__(self) -> Tuple[Type["DenseMatrix"], Tuple[int, int, bytes,
int]]:
return DenseMatrix, (
self.numRows,
self.numCols,
@@ -953,7 +1077,7 @@ class DenseMatrix(Matrix):
int(self.isTransposed),
)
- def __str__(self):
+ def __str__(self) -> str:
"""
Pretty printing of a DenseMatrix
@@ -976,7 +1100,7 @@ class DenseMatrix(Matrix):
x = "\n".join([(" " * 6 + line) for line in array_lines[1:]])
return array_lines[0].replace("array", "DenseMatrix") + "\n" + x
- def __repr__(self):
+ def __repr__(self) -> str:
"""
Representation of a DenseMatrix
@@ -995,12 +1119,12 @@ class DenseMatrix(Matrix):
_format_float_list(self.values[:8]) + ["..."] +
_format_float_list(self.values[-8:])
)
- entries = ", ".join(entries)
+ entries = ", ".join(entries) # type: ignore[assignment]
return "DenseMatrix({0}, {1}, [{2}], {3})".format(
self.numRows, self.numCols, entries, self.isTransposed
)
- def toArray(self):
+ def toArray(self) -> np.ndarray:
"""
Return a :py:class:`numpy.ndarray`
@@ -1016,7 +1140,7 @@ class DenseMatrix(Matrix):
else:
return self.values.reshape((self.numRows, self.numCols), order="F")
- def toSparse(self):
+ def toSparse(self) -> "SparseMatrix":
"""Convert to SparseMatrix"""
if self.isTransposed:
values = np.ravel(self.toArray(), order="F")
@@ -1030,7 +1154,7 @@ class DenseMatrix(Matrix):
return SparseMatrix(self.numRows, self.numCols, colPtrs, rowIndices,
values)
- def __getitem__(self, indices):
+ def __getitem__(self, indices: Tuple[int, int]) -> np.float64:
i, j = indices
if i < 0 or i >= self.numRows:
raise IndexError("Row index %d is out of range [0, %d)" % (i,
self.numRows))
@@ -1042,21 +1166,29 @@ class DenseMatrix(Matrix):
else:
return self.values[i + j * self.numRows]
- def __eq__(self, other):
+ def __eq__(self, other: Any) -> bool:
if self.numRows != other.numRows or self.numCols != other.numCols:
return False
if isinstance(other, SparseMatrix):
- return np.all(self.toArray() == other.toArray())
+ return np.all(self.toArray() == other.toArray()).tolist()
self_values = np.ravel(self.toArray(), order="F")
other_values = np.ravel(other.toArray(), order="F")
- return np.all(self_values == other_values)
+ return np.all(self_values == other_values).tolist()
class SparseMatrix(Matrix):
"""Sparse Matrix stored in CSC format."""
- def __init__(self, numRows, numCols, colPtrs, rowIndices, values,
isTransposed=False):
+ def __init__(
+ self,
+ numRows: int,
+ numCols: int,
+ colPtrs: Union[bytes, Iterable[int]],
+ rowIndices: Union[bytes, Iterable[int]],
+ values: Union[bytes, Iterable[float]],
+ isTransposed: bool = False,
+ ):
Matrix.__init__(self, numRows, numCols, isTransposed)
self.colPtrs = self._convert_to_array(colPtrs, np.int32)
self.rowIndices = self._convert_to_array(rowIndices, np.int32)
@@ -1078,7 +1210,7 @@ class SparseMatrix(Matrix):
% (self.rowIndices.size, self.values.size)
)
- def __str__(self):
+ def __str__(self) -> str:
"""
Pretty printing of a SparseMatrix
@@ -1124,7 +1256,7 @@ class SparseMatrix(Matrix):
spstr += "\n.." * 2
return spstr
- def __repr__(self):
+ def __repr__(self) -> str:
"""
Representation of a SparseMatrix
@@ -1149,14 +1281,14 @@ class SparseMatrix(Matrix):
if len(self.colPtrs) > 16:
colPtrs = colPtrs[:8] + ["..."] + colPtrs[-8:]
- values = ", ".join(values)
- rowIndices = ", ".join([str(ind) for ind in rowIndices])
- colPtrs = ", ".join([str(ptr) for ptr in colPtrs])
+ values = ", ".join(values) # type: ignore[assignment]
+ rowIndices = ", ".join([str(ind) for ind in rowIndices]) # type:
ignore[assignment]
+ colPtrs = ", ".join([str(ptr) for ptr in colPtrs]) # type:
ignore[assignment]
return "SparseMatrix({0}, {1}, [{2}], [{3}], [{4}], {5})".format(
self.numRows, self.numCols, colPtrs, rowIndices, values,
self.isTransposed
)
- def __reduce__(self):
+ def __reduce__(self) -> Tuple[Type["SparseMatrix"], Tuple[int, int, bytes,
bytes, bytes, int]]:
return SparseMatrix, (
self.numRows,
self.numCols,
@@ -1166,7 +1298,7 @@ class SparseMatrix(Matrix):
int(self.isTransposed),
)
- def __getitem__(self, indices):
+ def __getitem__(self, indices: Tuple[int, int]) -> np.float64:
i, j = indices
if i < 0 or i >= self.numRows:
raise IndexError("Row index %d is out of range [0, %d)" % (i,
self.numRows))
@@ -1186,9 +1318,9 @@ class SparseMatrix(Matrix):
if ind < colEnd and self.rowIndices[ind] == i:
return self.values[ind]
else:
- return 0.0
+ return np.float64(0.0)
- def toArray(self):
+ def toArray(self) -> np.ndarray:
"""
Return a numpy.ndarray
"""
@@ -1202,32 +1334,38 @@ class SparseMatrix(Matrix):
A[self.rowIndices[startptr:endptr], k] =
self.values[startptr:endptr]
return A
- def toDense(self):
+ def toDense(self) -> "DenseMatrix":
densevals = np.ravel(self.toArray(), order="F")
return DenseMatrix(self.numRows, self.numCols, densevals)
# TODO: More efficient implementation:
- def __eq__(self, other):
- return np.all(self.toArray() == other.toArray())
+ def __eq__(self, other: Any) -> bool:
+ return np.all(self.toArray() == other.toArray()).tolist()
class Matrices:
@staticmethod
- def dense(numRows, numCols, values):
+ def dense(numRows: int, numCols: int, values: Union[bytes,
Iterable[float]]) -> DenseMatrix:
"""
Create a DenseMatrix
"""
return DenseMatrix(numRows, numCols, values)
@staticmethod
- def sparse(numRows, numCols, colPtrs, rowIndices, values):
+ def sparse(
+ numRows: int,
+ numCols: int,
+ colPtrs: Union[bytes, Iterable[int]],
+ rowIndices: Union[bytes, Iterable[int]],
+ values: Union[bytes, Iterable[float]],
+ ) -> SparseMatrix:
"""
Create a SparseMatrix
"""
return SparseMatrix(numRows, numCols, colPtrs, rowIndices, values)
-def _test():
+def _test() -> None:
import doctest
try:
diff --git a/python/pyspark/ml/linalg/__init__.pyi
b/python/pyspark/ml/linalg/__init__.pyi
deleted file mode 100644
index bb09397..0000000
--- a/python/pyspark/ml/linalg/__init__.pyi
+++ /dev/null
@@ -1,243 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from typing import overload
-from typing import Any, Dict, Iterable, List, NoReturn, Optional, Tuple, Type,
Union
-
-from pyspark.ml import linalg as newlinalg # noqa: F401
-from pyspark.sql.types import StructType, UserDefinedType
-
-from numpy import float64, ndarray
-
-class VectorUDT(UserDefinedType):
- @classmethod
- def sqlType(cls) -> StructType: ...
- @classmethod
- def module(cls) -> str: ...
- @classmethod
- def scalaUDT(cls) -> str: ...
- def serialize(
- self, obj: Vector
- ) -> Tuple[int, Optional[int], Optional[List[int]], List[float]]: ...
- def deserialize(self, datum: Any) -> Vector: ...
- def simpleString(self) -> str: ...
-
-class MatrixUDT(UserDefinedType):
- @classmethod
- def sqlType(cls) -> StructType: ...
- @classmethod
- def module(cls) -> str: ...
- @classmethod
- def scalaUDT(cls) -> str: ...
- def serialize(
- self, obj: Matrix
- ) -> Tuple[int, int, int, Optional[List[int]], Optional[List[int]],
List[float], bool]: ...
- def deserialize(self, datum: Any) -> Matrix: ...
- def simpleString(self) -> str: ...
-
-class Vector:
- __UDT__: VectorUDT
- def toArray(self) -> ndarray: ...
-
-class DenseVector(Vector):
- array: ndarray
- @overload
- def __init__(self, *elements: float) -> None: ...
- @overload
- def __init__(self, __arr: bytes) -> None: ...
- @overload
- def __init__(self, __arr: Iterable[float]) -> None: ...
- def __reduce__(self) -> Tuple[Type[DenseVector], bytes]: ...
- def numNonzeros(self) -> int: ...
- def norm(self, p: Union[float, str]) -> float64: ...
- def dot(self, other: Iterable[float]) -> float64: ...
- def squared_distance(self, other: Iterable[float]) -> float64: ...
- def toArray(self) -> ndarray: ...
- @property
- def values(self) -> ndarray: ...
- def __getitem__(self, item: int) -> float64: ...
- def __len__(self) -> int: ...
- def __eq__(self, other: Any) -> bool: ...
- def __ne__(self, other: Any) -> bool: ...
- def __hash__(self) -> int: ...
- def __getattr__(self, item: str) -> Any: ...
- def __neg__(self) -> DenseVector: ...
- def __add__(self, other: Union[float, Iterable[float]]) -> DenseVector: ...
- def __sub__(self, other: Union[float, Iterable[float]]) -> DenseVector: ...
- def __mul__(self, other: Union[float, Iterable[float]]) -> DenseVector: ...
- def __div__(self, other: Union[float, Iterable[float]]) -> DenseVector: ...
- def __truediv__(self, other: Union[float, Iterable[float]]) ->
DenseVector: ...
- def __mod__(self, other: Union[float, Iterable[float]]) -> DenseVector: ...
- def __radd__(self, other: Union[float, Iterable[float]]) -> DenseVector:
...
- def __rsub__(self, other: Union[float, Iterable[float]]) -> DenseVector:
...
- def __rmul__(self, other: Union[float, Iterable[float]]) -> DenseVector:
...
- def __rdiv__(self, other: Union[float, Iterable[float]]) -> DenseVector:
...
- def __rtruediv__(self, other: Union[float, Iterable[float]]) ->
DenseVector: ...
- def __rmod__(self, other: Union[float, Iterable[float]]) -> DenseVector:
...
-
-class SparseVector(Vector):
- size: int
- indices: ndarray
- values: ndarray
- @overload
- def __init__(self, size: int, *args: Tuple[int, float]) -> None: ...
- @overload
- def __init__(self, size: int, __indices: bytes, __values: bytes) -> None:
...
- @overload
- def __init__(self, size: int, __indices: Iterable[int], __values:
Iterable[float]) -> None: ...
- @overload
- def __init__(self, size: int, __pairs: Iterable[Tuple[int, float]]) ->
None: ...
- @overload
- def __init__(self, size: int, __map: Dict[int, float]) -> None: ...
- def numNonzeros(self) -> int: ...
- def norm(self, p: Union[float, str]) -> float64: ...
- def __reduce__(self) -> Tuple[Type[SparseVector], Tuple[int, bytes,
bytes]]: ...
- def dot(self, other: Iterable[float]) -> float64: ...
- def squared_distance(self, other: Iterable[float]) -> float64: ...
- def toArray(self) -> ndarray: ...
- def __len__(self) -> int: ...
- def __eq__(self, other: Any) -> bool: ...
- def __getitem__(self, index: int) -> float64: ...
- def __ne__(self, other: Any) -> bool: ...
- def __hash__(self) -> int: ...
-
-class Vectors:
- @overload
- @staticmethod
- def sparse(size: int, *args: Tuple[int, float]) -> SparseVector: ...
- @overload
- @staticmethod
- def sparse(size: int, __indices: bytes, __values: bytes) -> SparseVector:
...
- @overload
- @staticmethod
- def sparse(size: int, __indices: Iterable[int], __values: Iterable[float])
-> SparseVector: ...
- @overload
- @staticmethod
- def sparse(size: int, __pairs: Iterable[Tuple[int, float]]) ->
SparseVector: ...
- @overload
- @staticmethod
- def sparse(size: int, __map: Dict[int, float]) -> SparseVector: ...
- @overload
- @staticmethod
- def dense(*elements: float) -> DenseVector: ...
- @overload
- @staticmethod
- def dense(__arr: bytes) -> DenseVector: ...
- @overload
- @staticmethod
- def dense(__arr: Iterable[float]) -> DenseVector: ...
- @staticmethod
- def stringify(vector: Vector) -> str: ...
- @staticmethod
- def squared_distance(v1: Vector, v2: Vector) -> float64: ...
- @staticmethod
- def norm(vector: Vector, p: Union[float, str]) -> float64: ...
- @staticmethod
- def zeros(size: int) -> DenseVector: ...
-
-class Matrix:
- __UDT__: MatrixUDT
- numRows: int
- numCols: int
- isTransposed: bool
- def __init__(self, numRows: int, numCols: int, isTransposed: bool = ...)
-> None: ...
- def toArray(self) -> ndarray: ...
-
-class DenseMatrix(Matrix):
- values: Any
- @overload
- def __init__(
- self, numRows: int, numCols: int, values: bytes, isTransposed: bool =
...
- ) -> None: ...
- @overload
- def __init__(
- self,
- numRows: int,
- numCols: int,
- values: Iterable[float],
- isTransposed: bool = ...,
- ) -> None: ...
- def __reduce__(self) -> Tuple[Type[DenseMatrix], Tuple[int, int, bytes,
int]]: ...
- def toArray(self) -> ndarray: ...
- def toSparse(self) -> SparseMatrix: ...
- def __getitem__(self, indices: Tuple[int, int]) -> float64: ...
- def __eq__(self, other: Any) -> bool: ...
-
-class SparseMatrix(Matrix):
- colPtrs: ndarray
- rowIndices: ndarray
- values: ndarray
- @overload
- def __init__(
- self,
- numRows: int,
- numCols: int,
- colPtrs: bytes,
- rowIndices: bytes,
- values: bytes,
- isTransposed: bool = ...,
- ) -> None: ...
- @overload
- def __init__(
- self,
- numRows: int,
- numCols: int,
- colPtrs: Iterable[int],
- rowIndices: Iterable[int],
- values: Iterable[float],
- isTransposed: bool = ...,
- ) -> None: ...
- def __reduce__(
- self,
- ) -> Tuple[Type[SparseMatrix], Tuple[int, int, bytes, bytes, bytes, int]]:
...
- def __getitem__(self, indices: Tuple[int, int]) -> float64: ...
- def toArray(self) -> ndarray: ...
- def toDense(self) -> DenseMatrix: ...
- def __eq__(self, other: Any) -> bool: ...
-
-class Matrices:
- @overload
- @staticmethod
- def dense(
- numRows: int, numCols: int, values: bytes, isTransposed: bool = ...
- ) -> DenseMatrix: ...
- @overload
- @staticmethod
- def dense(
- numRows: int, numCols: int, values: Iterable[float], isTransposed: bool = ...
- ) -> DenseMatrix: ...
- @overload
- @staticmethod
- def sparse(
- numRows: int,
- numCols: int,
- colPtrs: bytes,
- rowIndices: bytes,
- values: bytes,
- isTransposed: bool = ...,
- ) -> SparseMatrix: ...
- @overload
- @staticmethod
- def sparse(
- numRows: int,
- numCols: int,
- colPtrs: Iterable[int],
- rowIndices: Iterable[int],
- values: Iterable[float],
- isTransposed: bool = ...,
- ) -> SparseMatrix: ...
diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py
index b9c391e..dd7fad0 100644
--- a/python/pyspark/mllib/linalg/__init__.py
+++ b/python/pyspark/mllib/linalg/__init__.py
@@ -575,8 +575,8 @@ class DenseVector(Vector):
def __neg__(self) -> "DenseVector":
return DenseVector(-self.array)
- def _delegate(op: str) -> Callable[["DenseVector", Any], Any]: # type: ignore[misc]
- def func(self: "DenseVector", other: Any) -> Any:
+ def _delegate(op: str) -> Callable[["DenseVector", Any], "DenseVector"]: # type: ignore[misc]
+ def func(self: "DenseVector", other: Any) -> "DenseVector":
if isinstance(other, DenseVector):
other = other.array
return DenseVector(getattr(self.array, op)(other))
@@ -768,7 +768,7 @@ class SparseVector(Vector):
raise ValueError("Unable to parse values from %s." % s)
return SparseVector(cast(int, size), indices, values)
- def dot(self, other: Any) -> np.float64:
+ def dot(self, other: Iterable[float]) -> np.float64:
"""
Dot product with a SparseVector or 1- or 2-dimensional Numpy array.
@@ -824,9 +824,9 @@ class SparseVector(Vector):
return np.dot(self_values, other.values[other_cmind])
else:
- return self.dot(_convert_to_vector(other))
+ return self.dot(_convert_to_vector(other)) # type: ignore[arg-type]
- def squared_distance(self, other: Any) -> np.float64:
+ def squared_distance(self, other: Iterable[float]) -> np.float64:
"""
Squared distance from a SparseVector or 1-dimensional NumPy array.
@@ -894,7 +894,7 @@ class SparseVector(Vector):
j += 1
return result
else:
- return self.squared_distance(_convert_to_vector(other))
+ return self.squared_distance(_convert_to_vector(other)) # type: ignore[arg-type]
def toArray(self) -> np.ndarray:
"""
@@ -1140,7 +1140,7 @@ class Vectors:
return v1.squared_distance(v2) # type: ignore[attr-defined]
@staticmethod
- def norm(vector: Vector, p: Union[float, str]) -> np.float64:
+ def norm(vector: Vector, p: "NormType") -> np.float64:
"""
Find norm of the given vector.
"""
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]