This is an automated email from the ASF dual-hosted git repository.
zero323 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new c17ba3f [SPARK-37417][PYTHON][ML] Inline type hints for
pyspark.ml.linalg.__init__.py
c17ba3f is described below
commit c17ba3f9e9802ae5491cd914a4262cfe4e8e6d20
Author: zero323 <[email protected]>
AuthorDate: Sun Feb 6 10:59:21 2022 +0100
[SPARK-37417][PYTHON][ML] Inline type hints for
pyspark.ml.linalg.__init__.py
### What changes were proposed in this pull request?
Migration of type annotations for `pyspark.ml.linalg.__init__.py` from
stub file to inline hints.
### Why are the changes needed?
As part of ongoing type hint migrations.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Existing tests.
Closes #35380 from zero323/SPARK-37417.
Authored-by: zero323 <[email protected]>
Signed-off-by: zero323 <[email protected]>
---
python/pyspark/ml/_typing.pyi | 7 +-
python/pyspark/ml/linalg/__init__.py | 370 ++++++++++++++++++++++----------
python/pyspark/ml/linalg/__init__.pyi | 243 ---------------------
python/pyspark/mllib/linalg/__init__.py | 14 +-
4 files changed, 267 insertions(+), 367 deletions(-)
diff --git a/python/pyspark/ml/_typing.pyi b/python/pyspark/ml/_typing.pyi
index b51aa96..7862078 100644
--- a/python/pyspark/ml/_typing.pyi
+++ b/python/pyspark/ml/_typing.pyi
@@ -16,12 +16,15 @@
# specific language governing permissions and limitations
# under the License.
-from typing import Any, Dict, TypeVar, Union
+from typing import Any, Dict, List, TypeVar, Tuple, Union
from typing_extensions import Literal
+from numpy import ndarray
+
import pyspark.ml.base
import pyspark.ml.param
import pyspark.ml.util
+from pyspark.ml.linalg import Vector
import pyspark.ml.wrapper
from py4j.java_gateway import JavaObject
@@ -75,3 +78,5 @@ RankingEvaluatorMetricType = Union[
Literal["ndcgAtK"],
Literal["recallAtK"],
]
+
+VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...]]
diff --git a/python/pyspark/ml/linalg/__init__.py
b/python/pyspark/ml/linalg/__init__.py
index 03e63e9..d3d2cbd 100644
--- a/python/pyspark/ml/linalg/__init__.py
+++ b/python/pyspark/ml/linalg/__init__.py
@@ -40,6 +40,22 @@ from pyspark.sql.types import (
BooleanType,
)
+from typing import (
+ Any,
+ Callable,
+ cast,
+ Dict,
+ Iterable,
+ List,
+ Optional,
+ overload,
+ Sequence,
+ Tuple,
+ Type,
+ TYPE_CHECKING,
+ Union,
+)
+
__all__ = [
"Vector",
@@ -52,6 +68,11 @@ __all__ = [
"Matrices",
]
+if TYPE_CHECKING:
+ from pyspark.mllib._typing import NormType
+ from pyspark.ml._typing import VectorLike
+ from scipy.sparse import spmatrix
+
# Check whether we have SciPy. MLlib works without it too, but if we have it,
some methods,
# such as _dot and _serialize_double_vector, start to support scipy.sparse
matrices.
@@ -65,23 +86,23 @@ except BaseException:
_have_scipy = False
-def _convert_to_vector(d):
+def _convert_to_vector(d: Union["VectorLike", "spmatrix", range]) -> "Vector":
if isinstance(d, Vector):
return d
elif type(d) in (array.array, np.array, np.ndarray, list, tuple, range):
return DenseVector(d)
elif _have_scipy and scipy.sparse.issparse(d):
- assert d.shape[1] == 1, "Expected column vector"
+ assert cast("spmatrix", d).shape[1] == 1, "Expected column vector"
# Make sure the converted csc_matrix has sorted indices.
- csc = d.tocsc()
+ csc = cast("spmatrix", d).tocsc()
if not csc.has_sorted_indices:
csc.sort_indices()
- return SparseVector(d.shape[0], csc.indices, csc.data)
+ return SparseVector(cast("spmatrix", d).shape[0], csc.indices,
csc.data)
else:
raise TypeError("Cannot convert type %s into Vector" % type(d))
-def _vector_size(v):
+def _vector_size(v: Union["VectorLike", "spmatrix", range]) -> int:
"""
Returns the size of the vector.
@@ -112,24 +133,24 @@ def _vector_size(v):
else:
raise ValueError("Cannot treat an ndarray of shape %s as a vector"
% str(v.shape))
elif _have_scipy and scipy.sparse.issparse(v):
- assert v.shape[1] == 1, "Expected column vector"
- return v.shape[0]
+ assert cast("spmatrix", v).shape[1] == 1, "Expected column vector"
+ return cast("spmatrix", v).shape[0]
else:
raise TypeError("Cannot treat type %s as a vector" % type(v))
-def _format_float(f, digits=4):
+def _format_float(f: float, digits: int = 4) -> str:
s = str(round(f, digits))
if "." in s:
s = s[: s.index(".") + 1 + digits]
return s
-def _format_float_list(xs):
+def _format_float_list(xs: Iterable[float]) -> List[str]:
return [_format_float(x) for x in xs]
-def _double_to_long_bits(value):
+def _double_to_long_bits(value: float) -> int:
if np.isnan(value):
value = float("nan")
# pack double into 64 bits, then unpack as long int
@@ -142,7 +163,7 @@ class VectorUDT(UserDefinedType):
"""
@classmethod
- def sqlType(cls):
+ def sqlType(cls) -> StructType:
return StructType(
[
StructField("type", ByteType(), False),
@@ -153,37 +174,41 @@ class VectorUDT(UserDefinedType):
)
@classmethod
- def module(cls):
+ def module(cls) -> str:
return "pyspark.ml.linalg"
@classmethod
- def scalaUDT(cls):
+ def scalaUDT(cls) -> str:
return "org.apache.spark.ml.linalg.VectorUDT"
- def serialize(self, obj):
+ def serialize(
+ self, obj: "Vector"
+ ) -> Tuple[int, Optional[int], Optional[List[int]], List[float]]:
if isinstance(obj, SparseVector):
indices = [int(i) for i in obj.indices]
values = [float(v) for v in obj.values]
return (0, obj.size, indices, values)
elif isinstance(obj, DenseVector):
- values = [float(v) for v in obj]
+ values = [float(v) for v in obj] # type: ignore[attr-defined]
return (1, None, None, values)
else:
raise TypeError("cannot serialize %r of type %r" % (obj,
type(obj)))
- def deserialize(self, datum):
+ def deserialize(
+ self, datum: Tuple[int, Optional[int], Optional[List[int]],
List[float]]
+ ) -> "Vector":
assert (
len(datum) == 4
), "VectorUDT.deserialize given row with length %d but requires 4" %
len(datum)
tpe = datum[0]
if tpe == 0:
- return SparseVector(datum[1], datum[2], datum[3])
+ return SparseVector(cast(int, datum[1]), cast(List[int],
datum[2]), datum[3])
elif tpe == 1:
return DenseVector(datum[3])
else:
raise ValueError("do not recognize type %r" % tpe)
- def simpleString(self):
+ def simpleString(self) -> str:
return "vector"
@@ -193,7 +218,7 @@ class MatrixUDT(UserDefinedType):
"""
@classmethod
- def sqlType(cls):
+ def sqlType(cls) -> StructType:
return StructType(
[
StructField("type", ByteType(), False),
@@ -207,14 +232,16 @@ class MatrixUDT(UserDefinedType):
)
@classmethod
- def module(cls):
+ def module(cls) -> str:
return "pyspark.ml.linalg"
@classmethod
- def scalaUDT(cls):
+ def scalaUDT(cls) -> str:
return "org.apache.spark.ml.linalg.MatrixUDT"
- def serialize(self, obj):
+ def serialize(
+ self, obj: "Matrix"
+ ) -> Tuple[int, int, int, Optional[List[int]], Optional[List[int]],
List[float], bool]:
if isinstance(obj, SparseMatrix):
colPtrs = [int(i) for i in obj.colPtrs]
rowIndices = [int(i) for i in obj.rowIndices]
@@ -234,19 +261,22 @@ class MatrixUDT(UserDefinedType):
else:
raise TypeError("cannot serialize type %r" % (type(obj)))
- def deserialize(self, datum):
+ def deserialize(
+ self,
+ datum: Tuple[int, int, int, Optional[List[int]], Optional[List[int]],
List[float], bool],
+ ) -> "Matrix":
assert (
len(datum) == 7
), "MatrixUDT.deserialize given row with length %d but requires 7" %
len(datum)
tpe = datum[0]
if tpe == 0:
- return SparseMatrix(*datum[1:])
+ return SparseMatrix(*datum[1:]) # type: ignore[arg-type]
elif tpe == 1:
return DenseMatrix(datum[1], datum[2], datum[5], datum[6])
else:
raise ValueError("do not recognize type %r" % tpe)
- def simpleString(self):
+ def simpleString(self) -> str:
return "matrix"
@@ -258,7 +288,7 @@ class Vector:
Abstract class for DenseVector and SparseVector
"""
- def toArray(self):
+ def toArray(self) -> np.ndarray:
"""
Convert the vector into an numpy.ndarray
@@ -266,6 +296,9 @@ class Vector:
"""
raise NotImplementedError
+ def __len__(self) -> int:
+ raise NotImplementedError
+
class DenseVector(Vector):
"""
@@ -293,25 +326,26 @@ class DenseVector(Vector):
DenseVector([-1.0, -2.0])
"""
- def __init__(self, ar):
+ def __init__(self, ar: Union[bytes, np.ndarray, Iterable[float]]):
+ ar_: np.ndarray
if isinstance(ar, bytes):
- ar = np.frombuffer(ar, dtype=np.float64)
+ ar_ = np.frombuffer(ar, dtype=np.float64)
elif not isinstance(ar, np.ndarray):
- ar = np.array(ar, dtype=np.float64)
- if ar.dtype != np.float64:
- ar = ar.astype(np.float64)
- self.array = ar
+ ar_ = np.array(ar, dtype=np.float64)
+ else:
+ ar_ = ar.astype(np.float64) if ar.dtype != np.float64 else ar
+ self.array = ar_
- def __reduce__(self):
+ def __reduce__(self) -> Tuple[Type["DenseVector"], Tuple[bytes]]:
return DenseVector, (self.array.tobytes(),)
- def numNonzeros(self):
+ def numNonzeros(self) -> int:
"""
Number of nonzero elements. This scans all active values and count non
zeros
"""
return np.count_nonzero(self.array)
- def norm(self, p):
+ def norm(self, p: "NormType") -> np.float64:
"""
Calculates the norm of a DenseVector.
@@ -325,7 +359,7 @@ class DenseVector(Vector):
"""
return np.linalg.norm(self.array, p)
- def dot(self, other):
+ def dot(self, other: Iterable[float]) -> np.float64:
"""
Compute the dot product of two Vectors. We support
(Numpy array, list, SparseVector, or SciPy sparse)
@@ -359,8 +393,8 @@ class DenseVector(Vector):
assert len(self) == other.shape[0], "dimension mismatch"
return np.dot(self.array, other)
elif _have_scipy and scipy.sparse.issparse(other):
- assert len(self) == other.shape[0], "dimension mismatch"
- return other.transpose().dot(self.toArray())
+ assert len(self) == cast("spmatrix", other).shape[0], "dimension
mismatch"
+ return cast("spmatrix", other).transpose().dot(self.toArray())
else:
assert len(self) == _vector_size(other), "dimension mismatch"
if isinstance(other, SparseVector):
@@ -368,9 +402,9 @@ class DenseVector(Vector):
elif isinstance(other, Vector):
return np.dot(self.toArray(), other.toArray())
else:
- return np.dot(self.toArray(), other)
+ return np.dot(self.toArray(), other) # type:
ignore[call-overload]
- def squared_distance(self, other):
+ def squared_distance(self, other: Iterable[float]) -> np.float64:
"""
Squared distance of two Vectors.
@@ -401,41 +435,49 @@ class DenseVector(Vector):
if isinstance(other, SparseVector):
return other.squared_distance(self)
elif _have_scipy and scipy.sparse.issparse(other):
- return _convert_to_vector(other).squared_distance(self)
+ return _convert_to_vector(other).squared_distance(self) # type:
ignore[attr-defined]
if isinstance(other, Vector):
other = other.toArray()
elif not isinstance(other, np.ndarray):
other = np.array(other)
- diff = self.toArray() - other
+ diff: np.ndarray = self.toArray() - other
return np.dot(diff, diff)
- def toArray(self):
+ def toArray(self) -> np.ndarray:
"""
Returns the underlying numpy.ndarray
"""
return self.array
@property
- def values(self):
+ def values(self) -> np.ndarray:
"""
Returns the underlying numpy.ndarray
"""
return self.array
- def __getitem__(self, item):
+ @overload
+ def __getitem__(self, item: int) -> np.float64:
+ ...
+
+ @overload
+ def __getitem__(self, item: slice) -> np.ndarray:
+ ...
+
+ def __getitem__(self, item: Union[int, slice]) -> Union[np.float64,
np.ndarray]:
return self.array[item]
- def __len__(self):
+ def __len__(self) -> int:
return len(self.array)
- def __str__(self):
+ def __str__(self) -> str:
return "[" + ",".join([str(v) for v in self.array]) + "]"
- def __repr__(self):
+ def __repr__(self) -> str:
return "DenseVector([%s])" % (", ".join(_format_float(i) for i in
self.array))
- def __eq__(self, other):
+ def __eq__(self, other: Any) -> bool:
if isinstance(other, DenseVector):
return np.array_equal(self.array, other.array)
elif isinstance(other, SparseVector):
@@ -444,10 +486,10 @@ class DenseVector(Vector):
return Vectors._equals(list(range(len(self))), self.array,
other.indices, other.values)
return False
- def __ne__(self, other):
+ def __ne__(self, other: Any) -> bool:
return not self == other
- def __hash__(self):
+ def __hash__(self) -> int:
size = len(self)
result = 31 + size
nnz = 0
@@ -461,14 +503,14 @@ class DenseVector(Vector):
i += 1
return result
- def __getattr__(self, item):
+ def __getattr__(self, item: str) -> Any:
return getattr(self.array, item)
- def __neg__(self):
+ def __neg__(self) -> "DenseVector":
return DenseVector(-self.array)
- def _delegate(op):
- def func(self, other):
+ def _delegate(op: str) -> Callable[["DenseVector", Any], "DenseVector"]:
# type: ignore[misc]
+ def func(self: "DenseVector", other: Any) -> "DenseVector":
if isinstance(other, DenseVector):
other = other.array
return DenseVector(getattr(self.array, op)(other))
@@ -495,7 +537,33 @@ class SparseVector(Vector):
alternatively pass SciPy's {scipy.sparse} data types.
"""
- def __init__(self, size, *args):
+ @overload
+ def __init__(self, size: int, __indices: bytes, __values: bytes):
+ ...
+
+ @overload
+ def __init__(self, size: int, *args: Tuple[int, float]):
+ ...
+
+ @overload
+ def __init__(self, size: int, __indices: Iterable[int], __values:
Iterable[float]):
+ ...
+
+ @overload
+ def __init__(self, size: int, __pairs: Iterable[Tuple[int, float]]):
+ ...
+
+ @overload
+ def __init__(self, size: int, __map: Dict[int, float]):
+ ...
+
+ def __init__(
+ self,
+ size: int,
+ *args: Union[
+ bytes, Tuple[int, float], Iterable[float], Iterable[Tuple[int,
float]], Dict[int, float]
+ ],
+ ):
"""
Create a sparse vector, using either a dictionary, a list of
(index, value) pairs, or two separate arrays of indices and
@@ -535,7 +603,7 @@ class SparseVector(Vector):
pairs = args[0]
if type(pairs) == dict:
pairs = pairs.items()
- pairs = sorted(pairs)
+ pairs = cast(Iterable[Tuple[int, float]], sorted(pairs))
self.indices = np.array([p[0] for p in pairs], dtype=np.int32)
""" A list of indices corresponding to active entries. """
self.values = np.array([p[1] for p in pairs], dtype=np.float64)
@@ -570,13 +638,13 @@ class SparseVector(Vector):
)
assert np.min(self.indices) >= 0, "Contains negative index %d" %
(np.min(self.indices))
- def numNonzeros(self):
+ def numNonzeros(self) -> int:
"""
Number of nonzero elements. This scans all active values and count non
zeros.
"""
return np.count_nonzero(self.values)
- def norm(self, p):
+ def norm(self, p: "NormType") -> np.float64:
"""
Calculates the norm of a SparseVector.
@@ -590,10 +658,10 @@ class SparseVector(Vector):
"""
return np.linalg.norm(self.values, p)
- def __reduce__(self):
+ def __reduce__(self) -> Tuple[Type["SparseVector"], Tuple[int, bytes,
bytes]]:
return (SparseVector, (self.size, self.indices.tobytes(),
self.values.tobytes()))
- def dot(self, other):
+ def dot(self, other: Iterable[float]) -> np.float64:
"""
Dot product with a SparseVector or 1- or 2-dimensional Numpy array.
@@ -643,15 +711,15 @@ class SparseVector(Vector):
self_cmind = np.in1d(self.indices, other.indices,
assume_unique=True)
self_values = self.values[self_cmind]
if self_values.size == 0:
- return 0.0
+ return np.float64(0.0)
else:
other_cmind = np.in1d(other.indices, self.indices,
assume_unique=True)
return np.dot(self_values, other.values[other_cmind])
else:
- return self.dot(_convert_to_vector(other))
+ return self.dot(_convert_to_vector(other)) # type:
ignore[arg-type]
- def squared_distance(self, other):
+ def squared_distance(self, other: Iterable[float]) -> np.float64:
"""
Squared distance from a SparseVector or 1-dimensional NumPy array.
@@ -719,9 +787,9 @@ class SparseVector(Vector):
j += 1
return result
else:
- return self.squared_distance(_convert_to_vector(other))
+ return self.squared_distance(_convert_to_vector(other)) # type:
ignore[arg-type]
- def toArray(self):
+ def toArray(self) -> np.ndarray:
"""
Returns a copy of this SparseVector as a 1-dimensional numpy.ndarray.
"""
@@ -729,15 +797,15 @@ class SparseVector(Vector):
arr[self.indices] = self.values
return arr
- def __len__(self):
+ def __len__(self) -> int:
return self.size
- def __str__(self):
+ def __str__(self) -> str:
inds = "[" + ",".join([str(i) for i in self.indices]) + "]"
vals = "[" + ",".join([str(v) for v in self.values]) + "]"
return "(" + ",".join((str(self.size), inds, vals)) + ")"
- def __repr__(self):
+ def __repr__(self) -> str:
inds = self.indices
vals = self.values
entries = ", ".join(
@@ -745,7 +813,7 @@ class SparseVector(Vector):
)
return "SparseVector({0}, {{{1}}})".format(self.size, entries)
- def __eq__(self, other):
+ def __eq__(self, other: Any) -> bool:
if isinstance(other, SparseVector):
return (
other.size == self.size
@@ -758,7 +826,7 @@ class SparseVector(Vector):
return Vectors._equals(self.indices, self.values,
list(range(len(other))), other.array)
return False
- def __getitem__(self, index):
+ def __getitem__(self, index: int) -> np.float64:
inds = self.indices
vals = self.values
if not isinstance(index, int):
@@ -770,18 +838,18 @@ class SparseVector(Vector):
index += self.size
if (inds.size == 0) or (index > inds.item(-1)):
- return 0.0
+ return np.float64(0.0)
insert_index = np.searchsorted(inds, index)
row_ind = inds[insert_index]
if row_ind == index:
return vals[insert_index]
- return 0.0
+ return np.float64(0.0)
- def __ne__(self, other):
+ def __ne__(self, other: Any) -> bool:
return not self.__eq__(other)
- def __hash__(self):
+ def __hash__(self) -> int:
result = 31 + self.size
nnz = 0
i = 0
@@ -809,7 +877,37 @@ class Vectors:
"""
@staticmethod
- def sparse(size, *args):
+ @overload
+ def sparse(size: int, __indices: bytes, __values: bytes) -> SparseVector:
+ ...
+
+ @staticmethod
+ @overload
+ def sparse(size: int, *args: Tuple[int, float]) -> SparseVector:
+ ...
+
+ @staticmethod
+ @overload
+ def sparse(size: int, __indices: Iterable[int], __values: Iterable[float])
-> SparseVector:
+ ...
+
+ @staticmethod
+ @overload
+ def sparse(size: int, __pairs: Iterable[Tuple[int, float]]) ->
SparseVector:
+ ...
+
+ @staticmethod
+ @overload
+ def sparse(size: int, __map: Dict[int, float]) -> SparseVector:
+ ...
+
+ @staticmethod
+ def sparse(
+ size: int,
+ *args: Union[
+ bytes, Tuple[int, float], Iterable[float], Iterable[Tuple[int,
float]], Dict[int, float]
+ ],
+ ) -> SparseVector:
"""
Create a sparse vector, using either a dictionary, a list of
(index, value) pairs, or two separate arrays of indices and
@@ -832,10 +930,25 @@ class Vectors:
>>> Vectors.sparse(4, [1, 3], [1.0, 5.5])
SparseVector(4, {1: 1.0, 3: 5.5})
"""
- return SparseVector(size, *args)
+ return SparseVector(size, *args) # type: ignore[arg-type]
+
+ @overload
+ @staticmethod
+ def dense(*elements: float) -> DenseVector:
+ ...
+
+ @overload
+ @staticmethod
+ def dense(__arr: bytes) -> DenseVector:
+ ...
+
+ @overload
+ @staticmethod
+ def dense(__arr: Iterable[float]) -> DenseVector:
+ ...
@staticmethod
- def dense(*elements):
+ def dense(*elements: Union[float, bytes, np.ndarray, Iterable[float]]) ->
DenseVector:
"""
Create a dense vector of 64-bit floats from a Python list or numbers.
@@ -848,11 +961,11 @@ class Vectors:
"""
if len(elements) == 1 and not isinstance(elements[0], (float, int)):
# it's list, numpy.array or other iterable object.
- elements = elements[0]
- return DenseVector(elements)
+ elements = elements[0] # type: ignore[assignment]
+ return DenseVector(cast(Iterable[float], elements))
@staticmethod
- def squared_distance(v1, v2):
+ def squared_distance(v1: Vector, v2: Vector) -> np.float64:
"""
Squared distance between two vectors.
a and b can be of type SparseVector, DenseVector, np.ndarray
@@ -866,21 +979,26 @@ class Vectors:
51.0
"""
v1, v2 = _convert_to_vector(v1), _convert_to_vector(v2)
- return v1.squared_distance(v2)
+ return v1.squared_distance(v2) # type: ignore[attr-defined]
@staticmethod
- def norm(vector, p):
+ def norm(vector: Vector, p: "NormType") -> np.float64:
"""
Find norm of the given vector.
"""
- return _convert_to_vector(vector).norm(p)
+ return _convert_to_vector(vector).norm(p) # type: ignore[attr-defined]
@staticmethod
- def zeros(size):
+ def zeros(size: int) -> DenseVector:
return DenseVector(np.zeros(size))
@staticmethod
- def _equals(v1_indices, v1_values, v2_indices, v2_values):
+ def _equals(
+ v1_indices: Union[Sequence[int], np.ndarray],
+ v1_values: Union[Sequence[float], np.ndarray],
+ v2_indices: Union[Sequence[int], np.ndarray],
+ v2_values: Union[Sequence[float], np.ndarray],
+ ) -> bool:
"""
Check equality between sparse/dense vectors,
v1_indices and v2_indices assume to be strictly increasing.
@@ -913,19 +1031,19 @@ class Matrix:
Represents a local matrix.
"""
- def __init__(self, numRows, numCols, isTransposed=False):
+ def __init__(self, numRows: int, numCols: int, isTransposed: bool = False):
self.numRows = numRows
self.numCols = numCols
self.isTransposed = isTransposed
- def toArray(self):
+ def toArray(self) -> np.ndarray:
"""
Returns its elements in a numpy.ndarray.
"""
raise NotImplementedError
@staticmethod
- def _convert_to_array(array_like, dtype):
+ def _convert_to_array(array_like: Union[bytes, Iterable[float]], dtype:
Any) -> np.ndarray:
"""
Convert Matrix attributes which are array-like or buffer to array.
"""
@@ -939,13 +1057,19 @@ class DenseMatrix(Matrix):
Column-major dense matrix.
"""
- def __init__(self, numRows, numCols, values, isTransposed=False):
+ def __init__(
+ self,
+ numRows: int,
+ numCols: int,
+ values: Union[bytes, Iterable[float]],
+ isTransposed: bool = False,
+ ):
Matrix.__init__(self, numRows, numCols, isTransposed)
values = self._convert_to_array(values, np.float64)
assert len(values) == numRows * numCols
self.values = values
- def __reduce__(self):
+ def __reduce__(self) -> Tuple[Type["DenseMatrix"], Tuple[int, int, bytes,
int]]:
return DenseMatrix, (
self.numRows,
self.numCols,
@@ -953,7 +1077,7 @@ class DenseMatrix(Matrix):
int(self.isTransposed),
)
- def __str__(self):
+ def __str__(self) -> str:
"""
Pretty printing of a DenseMatrix
@@ -976,7 +1100,7 @@ class DenseMatrix(Matrix):
x = "\n".join([(" " * 6 + line) for line in array_lines[1:]])
return array_lines[0].replace("array", "DenseMatrix") + "\n" + x
- def __repr__(self):
+ def __repr__(self) -> str:
"""
Representation of a DenseMatrix
@@ -995,12 +1119,12 @@ class DenseMatrix(Matrix):
_format_float_list(self.values[:8]) + ["..."] +
_format_float_list(self.values[-8:])
)
- entries = ", ".join(entries)
+ entries = ", ".join(entries) # type: ignore[assignment]
return "DenseMatrix({0}, {1}, [{2}], {3})".format(
self.numRows, self.numCols, entries, self.isTransposed
)
- def toArray(self):
+ def toArray(self) -> np.ndarray:
"""
Return a :py:class:`numpy.ndarray`
@@ -1016,7 +1140,7 @@ class DenseMatrix(Matrix):
else:
return self.values.reshape((self.numRows, self.numCols), order="F")
- def toSparse(self):
+ def toSparse(self) -> "SparseMatrix":
"""Convert to SparseMatrix"""
if self.isTransposed:
values = np.ravel(self.toArray(), order="F")
@@ -1030,7 +1154,7 @@ class DenseMatrix(Matrix):
return SparseMatrix(self.numRows, self.numCols, colPtrs, rowIndices,
values)
- def __getitem__(self, indices):
+ def __getitem__(self, indices: Tuple[int, int]) -> np.float64:
i, j = indices
if i < 0 or i >= self.numRows:
raise IndexError("Row index %d is out of range [0, %d)" % (i,
self.numRows))
@@ -1042,21 +1166,29 @@ class DenseMatrix(Matrix):
else:
return self.values[i + j * self.numRows]
- def __eq__(self, other):
+ def __eq__(self, other: Any) -> bool:
if self.numRows != other.numRows or self.numCols != other.numCols:
return False
if isinstance(other, SparseMatrix):
- return np.all(self.toArray() == other.toArray())
+ return np.all(self.toArray() == other.toArray()).tolist()
self_values = np.ravel(self.toArray(), order="F")
other_values = np.ravel(other.toArray(), order="F")
- return np.all(self_values == other_values)
+ return np.all(self_values == other_values).tolist()
class SparseMatrix(Matrix):
"""Sparse Matrix stored in CSC format."""
- def __init__(self, numRows, numCols, colPtrs, rowIndices, values,
isTransposed=False):
+ def __init__(
+ self,
+ numRows: int,
+ numCols: int,
+ colPtrs: Union[bytes, Iterable[int]],
+ rowIndices: Union[bytes, Iterable[int]],
+ values: Union[bytes, Iterable[float]],
+ isTransposed: bool = False,
+ ):
Matrix.__init__(self, numRows, numCols, isTransposed)
self.colPtrs = self._convert_to_array(colPtrs, np.int32)
self.rowIndices = self._convert_to_array(rowIndices, np.int32)
@@ -1078,7 +1210,7 @@ class SparseMatrix(Matrix):
% (self.rowIndices.size, self.values.size)
)
- def __str__(self):
+ def __str__(self) -> str:
"""
Pretty printing of a SparseMatrix
@@ -1124,7 +1256,7 @@ class SparseMatrix(Matrix):
spstr += "\n.." * 2
return spstr
- def __repr__(self):
+ def __repr__(self) -> str:
"""
Representation of a SparseMatrix
@@ -1149,14 +1281,14 @@ class SparseMatrix(Matrix):
if len(self.colPtrs) > 16:
colPtrs = colPtrs[:8] + ["..."] + colPtrs[-8:]
- values = ", ".join(values)
- rowIndices = ", ".join([str(ind) for ind in rowIndices])
- colPtrs = ", ".join([str(ptr) for ptr in colPtrs])
+ values = ", ".join(values) # type: ignore[assignment]
+ rowIndices = ", ".join([str(ind) for ind in rowIndices]) # type:
ignore[assignment]
+ colPtrs = ", ".join([str(ptr) for ptr in colPtrs]) # type:
ignore[assignment]
return "SparseMatrix({0}, {1}, [{2}], [{3}], [{4}], {5})".format(
self.numRows, self.numCols, colPtrs, rowIndices, values,
self.isTransposed
)
- def __reduce__(self):
+ def __reduce__(self) -> Tuple[Type["SparseMatrix"], Tuple[int, int, bytes,
bytes, bytes, int]]:
return SparseMatrix, (
self.numRows,
self.numCols,
@@ -1166,7 +1298,7 @@ class SparseMatrix(Matrix):
int(self.isTransposed),
)
- def __getitem__(self, indices):
+ def __getitem__(self, indices: Tuple[int, int]) -> np.float64:
i, j = indices
if i < 0 or i >= self.numRows:
raise IndexError("Row index %d is out of range [0, %d)" % (i,
self.numRows))
@@ -1186,9 +1318,9 @@ class SparseMatrix(Matrix):
if ind < colEnd and self.rowIndices[ind] == i:
return self.values[ind]
else:
- return 0.0
+ return np.float64(0.0)
- def toArray(self):
+ def toArray(self) -> np.ndarray:
"""
Return a numpy.ndarray
"""
@@ -1202,32 +1334,38 @@ class SparseMatrix(Matrix):
A[self.rowIndices[startptr:endptr], k] =
self.values[startptr:endptr]
return A
- def toDense(self):
+ def toDense(self) -> "DenseMatrix":
densevals = np.ravel(self.toArray(), order="F")
return DenseMatrix(self.numRows, self.numCols, densevals)
# TODO: More efficient implementation:
- def __eq__(self, other):
- return np.all(self.toArray() == other.toArray())
+ def __eq__(self, other: Any) -> bool:
+ return np.all(self.toArray() == other.toArray()).tolist()
class Matrices:
@staticmethod
- def dense(numRows, numCols, values):
+ def dense(numRows: int, numCols: int, values: Union[bytes,
Iterable[float]]) -> DenseMatrix:
"""
Create a DenseMatrix
"""
return DenseMatrix(numRows, numCols, values)
@staticmethod
- def sparse(numRows, numCols, colPtrs, rowIndices, values):
+ def sparse(
+ numRows: int,
+ numCols: int,
+ colPtrs: Union[bytes, Iterable[int]],
+ rowIndices: Union[bytes, Iterable[int]],
+ values: Union[bytes, Iterable[float]],
+ ) -> SparseMatrix:
"""
Create a SparseMatrix
"""
return SparseMatrix(numRows, numCols, colPtrs, rowIndices, values)
-def _test():
+def _test() -> None:
import doctest
try:
diff --git a/python/pyspark/ml/linalg/__init__.pyi
b/python/pyspark/ml/linalg/__init__.pyi
deleted file mode 100644
index bb09397..0000000
--- a/python/pyspark/ml/linalg/__init__.pyi
+++ /dev/null
@@ -1,243 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from typing import overload
-from typing import Any, Dict, Iterable, List, NoReturn, Optional, Tuple, Type,
Union
-
-from pyspark.ml import linalg as newlinalg # noqa: F401
-from pyspark.sql.types import StructType, UserDefinedType
-
-from numpy import float64, ndarray
-
-class VectorUDT(UserDefinedType):
- @classmethod
- def sqlType(cls) -> StructType: ...
- @classmethod
- def module(cls) -> str: ...
- @classmethod
- def scalaUDT(cls) -> str: ...
- def serialize(
- self, obj: Vector
- ) -> Tuple[int, Optional[int], Optional[List[int]], List[float]]: ...
- def deserialize(self, datum: Any) -> Vector: ...
- def simpleString(self) -> str: ...
-
-class MatrixUDT(UserDefinedType):
- @classmethod
- def sqlType(cls) -> StructType: ...
- @classmethod
- def module(cls) -> str: ...
- @classmethod
- def scalaUDT(cls) -> str: ...
- def serialize(
- self, obj: Matrix
- ) -> Tuple[int, int, int, Optional[List[int]], Optional[List[int]],
List[float], bool]: ...
- def deserialize(self, datum: Any) -> Matrix: ...
- def simpleString(self) -> str: ...
-
-class Vector:
- __UDT__: VectorUDT
- def toArray(self) -> ndarray: ...
-
-class DenseVector(Vector):
- array: ndarray
- @overload
- def __init__(self, *elements: float) -> None: ...
- @overload
- def __init__(self, __arr: bytes) -> None: ...
- @overload
- def __init__(self, __arr: Iterable[float]) -> None: ...
- def __reduce__(self) -> Tuple[Type[DenseVector], bytes]: ...
- def numNonzeros(self) -> int: ...
- def norm(self, p: Union[float, str]) -> float64: ...
- def dot(self, other: Iterable[float]) -> float64: ...
- def squared_distance(self, other: Iterable[float]) -> float64: ...
- def toArray(self) -> ndarray: ...
- @property
- def values(self) -> ndarray: ...
- def __getitem__(self, item: int) -> float64: ...
- def __len__(self) -> int: ...
- def __eq__(self, other: Any) -> bool: ...
- def __ne__(self, other: Any) -> bool: ...
- def __hash__(self) -> int: ...
- def __getattr__(self, item: str) -> Any: ...
- def __neg__(self) -> DenseVector: ...
- def __add__(self, other: Union[float, Iterable[float]]) -> DenseVector: ...
- def __sub__(self, other: Union[float, Iterable[float]]) -> DenseVector: ...
- def __mul__(self, other: Union[float, Iterable[float]]) -> DenseVector: ...
- def __div__(self, other: Union[float, Iterable[float]]) -> DenseVector: ...
- def __truediv__(self, other: Union[float, Iterable[float]]) ->
DenseVector: ...
- def __mod__(self, other: Union[float, Iterable[float]]) -> DenseVector: ...
- def __radd__(self, other: Union[float, Iterable[float]]) -> DenseVector:
...
- def __rsub__(self, other: Union[float, Iterable[float]]) -> DenseVector:
...
- def __rmul__(self, other: Union[float, Iterable[float]]) -> DenseVector:
...
- def __rdiv__(self, other: Union[float, Iterable[float]]) -> DenseVector:
...
- def __rtruediv__(self, other: Union[float, Iterable[float]]) ->
DenseVector: ...
- def __rmod__(self, other: Union[float, Iterable[float]]) -> DenseVector:
...
-
-class SparseVector(Vector):
- size: int
- indices: ndarray
- values: ndarray
- @overload
- def __init__(self, size: int, *args: Tuple[int, float]) -> None: ...
- @overload
- def __init__(self, size: int, __indices: bytes, __values: bytes) -> None:
...
- @overload
- def __init__(self, size: int, __indices: Iterable[int], __values:
Iterable[float]) -> None: ...
- @overload
- def __init__(self, size: int, __pairs: Iterable[Tuple[int, float]]) ->
None: ...
- @overload
- def __init__(self, size: int, __map: Dict[int, float]) -> None: ...
- def numNonzeros(self) -> int: ...
- def norm(self, p: Union[float, str]) -> float64: ...
- def __reduce__(self) -> Tuple[Type[SparseVector], Tuple[int, bytes,
bytes]]: ...
- def dot(self, other: Iterable[float]) -> float64: ...
- def squared_distance(self, other: Iterable[float]) -> float64: ...
- def toArray(self) -> ndarray: ...
- def __len__(self) -> int: ...
- def __eq__(self, other: Any) -> bool: ...
- def __getitem__(self, index: int) -> float64: ...
- def __ne__(self, other: Any) -> bool: ...
- def __hash__(self) -> int: ...
-
-class Vectors:
- @overload
- @staticmethod
- def sparse(size: int, *args: Tuple[int, float]) -> SparseVector: ...
- @overload
- @staticmethod
- def sparse(size: int, __indices: bytes, __values: bytes) -> SparseVector:
...
- @overload
- @staticmethod
- def sparse(size: int, __indices: Iterable[int], __values: Iterable[float])
-> SparseVector: ...
- @overload
- @staticmethod
- def sparse(size: int, __pairs: Iterable[Tuple[int, float]]) ->
SparseVector: ...
- @overload
- @staticmethod
- def sparse(size: int, __map: Dict[int, float]) -> SparseVector: ...
- @overload
- @staticmethod
- def dense(*elements: float) -> DenseVector: ...
- @overload
- @staticmethod
- def dense(__arr: bytes) -> DenseVector: ...
- @overload
- @staticmethod
- def dense(__arr: Iterable[float]) -> DenseVector: ...
- @staticmethod
- def stringify(vector: Vector) -> str: ...
- @staticmethod
- def squared_distance(v1: Vector, v2: Vector) -> float64: ...
- @staticmethod
- def norm(vector: Vector, p: Union[float, str]) -> float64: ...
- @staticmethod
- def zeros(size: int) -> DenseVector: ...
-
-class Matrix:
- __UDT__: MatrixUDT
- numRows: int
- numCols: int
- isTransposed: bool
- def __init__(self, numRows: int, numCols: int, isTransposed: bool = ...)
-> None: ...
- def toArray(self) -> ndarray: ...
-
-class DenseMatrix(Matrix):
- values: Any
- @overload
- def __init__(
- self, numRows: int, numCols: int, values: bytes, isTransposed: bool =
...
- ) -> None: ...
- @overload
- def __init__(
- self,
- numRows: int,
- numCols: int,
- values: Iterable[float],
- isTransposed: bool = ...,
- ) -> None: ...
- def __reduce__(self) -> Tuple[Type[DenseMatrix], Tuple[int, int, bytes,
int]]: ...
- def toArray(self) -> ndarray: ...
- def toSparse(self) -> SparseMatrix: ...
- def __getitem__(self, indices: Tuple[int, int]) -> float64: ...
- def __eq__(self, other: Any) -> bool: ...
-
-class SparseMatrix(Matrix):
- colPtrs: ndarray
- rowIndices: ndarray
- values: ndarray
- @overload
- def __init__(
- self,
- numRows: int,
- numCols: int,
- colPtrs: bytes,
- rowIndices: bytes,
- values: bytes,
- isTransposed: bool = ...,
- ) -> None: ...
- @overload
- def __init__(
- self,
- numRows: int,
- numCols: int,
- colPtrs: Iterable[int],
- rowIndices: Iterable[int],
- values: Iterable[float],
- isTransposed: bool = ...,
- ) -> None: ...
- def __reduce__(
- self,
- ) -> Tuple[Type[SparseMatrix], Tuple[int, int, bytes, bytes, bytes, int]]:
...
- def __getitem__(self, indices: Tuple[int, int]) -> float64: ...
- def toArray(self) -> ndarray: ...
- def toDense(self) -> DenseMatrix: ...
- def __eq__(self, other: Any) -> bool: ...
-
-class Matrices:
- @overload
- @staticmethod
- def dense(
- numRows: int, numCols: int, values: bytes, isTransposed: bool = ...
- ) -> DenseMatrix: ...
- @overload
- @staticmethod
- def dense(
- numRows: int, numCols: int, values: Iterable[float], isTransposed: bool = ...
- ) -> DenseMatrix: ...
- @overload
- @staticmethod
- def sparse(
- numRows: int,
- numCols: int,
- colPtrs: bytes,
- rowIndices: bytes,
- values: bytes,
- isTransposed: bool = ...,
- ) -> SparseMatrix: ...
- @overload
- @staticmethod
- def sparse(
- numRows: int,
- numCols: int,
- colPtrs: Iterable[int],
- rowIndices: Iterable[int],
- values: Iterable[float],
- isTransposed: bool = ...,
- ) -> SparseMatrix: ...
diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py
index b9c391e..dd7fad0 100644
--- a/python/pyspark/mllib/linalg/__init__.py
+++ b/python/pyspark/mllib/linalg/__init__.py
@@ -575,8 +575,8 @@ class DenseVector(Vector):
def __neg__(self) -> "DenseVector":
return DenseVector(-self.array)
- def _delegate(op: str) -> Callable[["DenseVector", Any], Any]: # type: ignore[misc]
- def func(self: "DenseVector", other: Any) -> Any:
+ def _delegate(op: str) -> Callable[["DenseVector", Any], "DenseVector"]: # type: ignore[misc]
+ def func(self: "DenseVector", other: Any) -> "DenseVector":
if isinstance(other, DenseVector):
other = other.array
return DenseVector(getattr(self.array, op)(other))
@@ -768,7 +768,7 @@ class SparseVector(Vector):
raise ValueError("Unable to parse values from %s." % s)
return SparseVector(cast(int, size), indices, values)
- def dot(self, other: Any) -> np.float64:
+ def dot(self, other: Iterable[float]) -> np.float64:
"""
Dot product with a SparseVector or 1- or 2-dimensional Numpy array.
@@ -824,9 +824,9 @@ class SparseVector(Vector):
return np.dot(self_values, other.values[other_cmind])
else:
- return self.dot(_convert_to_vector(other))
+ return self.dot(_convert_to_vector(other)) # type: ignore[arg-type]
- def squared_distance(self, other: Any) -> np.float64:
+ def squared_distance(self, other: Iterable[float]) -> np.float64:
"""
Squared distance from a SparseVector or 1-dimensional NumPy array.
@@ -894,7 +894,7 @@ class SparseVector(Vector):
j += 1
return result
else:
- return self.squared_distance(_convert_to_vector(other))
+ return self.squared_distance(_convert_to_vector(other)) # type: ignore[arg-type]
def toArray(self) -> np.ndarray:
"""
@@ -1140,7 +1140,7 @@ class Vectors:
return v1.squared_distance(v2) # type: ignore[attr-defined]
@staticmethod
- def norm(vector: Vector, p: Union[float, str]) -> np.float64:
+ def norm(vector: Vector, p: "NormType") -> np.float64:
"""
Find norm of the given vector.
"""
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]