This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 0a354bc975ad [SPARK-55076][PYTHON] Fix the type hint issue in ml/mllib and add scipy requirement
0a354bc975ad is described below

commit 0a354bc975ad79d0cf407b6e3d3293b4b7dd83b7
Author: Tian Gao <[email protected]>
AuthorDate: Mon Jan 19 08:06:36 2026 +0800

    [SPARK-55076][PYTHON] Fix the type hint issue in ml/mllib and add scipy requirement
    
    ### What changes were proposed in this pull request?
    
    * Pin the scipy version to >=1.8.0, the first minor version to support Python 3.10
    * Install scipy on the lint image so we can catch scipy-related lint failures
    * Add `sparray`, as it is now scipy's preferred sparse array type
    * Expand `VectorLike` to include other vector-like types, simplifying our code
    * Replace some `type(x)` checks with `isinstance()`, which is the recommended idiom and the one mypy understands (see the sketch after this list)
    * Fix a few `numpy` 1 vs 2 related type hints so they pass with both versions
    * Add a few assertions to make mypy happy about attributes
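
    As a minimal sketch of the `isinstance()` point (illustrative only, not code from this patch; the function name is hypothetical): mypy narrows the checked variable's type after an `isinstance()` check, while a `type(x)` equality check narrows nothing and also rejects subclasses.

    ```python
    import numpy as np
    from typing import Union

    def vector_size(v: Union[np.ndarray, list]) -> int:
        # `type(v) == np.ndarray` would not narrow v, so the attribute
        # access below would be flagged on the Union-typed value;
        # isinstance() both narrows and accepts ndarray subclasses.
        if isinstance(v, np.ndarray):
            # mypy knows v is an ndarray here: .ndim and .shape type-check.
            return v.shape[0] if v.ndim else 1
        # On this branch mypy has narrowed v to list.
        return len(v)
    ```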
    
    ### Why are the changes needed?
    
    Currently, a local `mypy` check fails with a lot of scipy/numpy-related
    errors because our lint image does not include those stubs, so CI never sees
    the failures. This is bad because it makes it really hard for people to run
    mypy locally: they will think their environment setup has issues and that
    the `mypy` result is not to be trusted. We want the `mypy` result to be
    consistent between CI and local runs, and clean.
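
    With scipy and `scipy-stubs` installed, a local run along these lines should now match CI (a sketch, not a project script: `mypy.api.run` is mypy's documented programmatic entry point, while the target paths and config discovery here are assumptions):

    ```python
    from mypy import api

    # api.run returns (stdout_report, stderr_report, exit_status);
    # a clean check exits with status 0.
    stdout, stderr, exit_status = api.run(["python/pyspark/ml", "python/pyspark/mllib"])
    print(stdout, end="")
    ```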
    
    ### Does this PR introduce _any_ user-facing change?
    
    It should not. Almost all of the changes are related to type annotations.
    
    ### How was this patch tested?
    
    CI should pass.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No
    
    Closes #53841 from gaogaotiantian/fix-ml-typehint.
    
    Authored-by: Tian Gao <[email protected]>
    Signed-off-by: Ruifeng Zheng <[email protected]>
---
 dev/requirements.txt                       |  2 +-
 dev/spark-test-image/lint/Dockerfile       |  2 ++
 python/pyspark/ml/_typing.pyi              |  8 +++--
 python/pyspark/ml/linalg/__init__.py       | 51 ++++++++++++++++--------------
 python/pyspark/mllib/_typing.pyi           |  8 +++--
 python/pyspark/mllib/linalg/__init__.py    | 50 +++++++++++++++--------------
 python/pyspark/mllib/linalg/distributed.py |  2 +-
 python/pyspark/mllib/regression.py         |  2 +-
 python/pyspark/mllib/stat/_statistics.py   |  3 +-
 python/pyspark/mllib/util.py               |  6 +---
 10 files changed, 74 insertions(+), 60 deletions(-)

diff --git a/dev/requirements.txt b/dev/requirements.txt
index a64f9c4cc50a..7153a8d71dc8 100644
--- a/dev/requirements.txt
+++ b/dev/requirements.txt
@@ -6,7 +6,7 @@ numpy>=1.22
 pyarrow>=18.0.0
 six==1.16.0
 pandas>=2.2.0
-scipy
+scipy>=1.8.0
 plotly<6.0.0
 mlflow>=2.3.1
 scikit-learn
diff --git a/dev/spark-test-image/lint/Dockerfile b/dev/spark-test-image/lint/Dockerfile
index 4984f56fc763..9fd4bcd77e60 100644
--- a/dev/spark-test-image/lint/Dockerfile
+++ b/dev/spark-test-image/lint/Dockerfile
@@ -99,6 +99,8 @@ RUN python3.11 -m pip install \
     'pyarrow>=22.0.0' \
     'pytest-mypy-plugins==1.9.3' \
     'pytest==7.1.3' \
+    'scipy>=1.8.0' \
+    'scipy-stubs' \
     && python3.11 -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu \
     && python3.11 -m pip install torcheval \
     && python3.11 -m pip cache purge
diff --git a/python/pyspark/ml/_typing.pyi b/python/pyspark/ml/_typing.pyi
index a5237dad7521..c24dfe577350 100644
--- a/python/pyspark/ml/_typing.pyi
+++ b/python/pyspark/ml/_typing.pyi
@@ -16,7 +16,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from typing import Any, Dict, List, TypeVar, Tuple, Union
+from typing import Any, Dict, List, TYPE_CHECKING, TypeVar, Tuple, Union
 from typing_extensions import Literal
 
 from numpy import ndarray
@@ -24,10 +24,12 @@ from py4j.java_gateway import JavaObject
 
 import pyspark.ml.base
 import pyspark.ml.param
-import pyspark.ml.util
 from pyspark.ml.linalg import Vector
 import pyspark.ml.wrapper
 
+if TYPE_CHECKING:
+    from scipy.sparse import spmatrix, sparray
+
 ParamMap = Dict[pyspark.ml.param.Param, Any]
 PipelineStage = Union[pyspark.ml.base.Estimator, pyspark.ml.base.Transformer]
 
@@ -81,4 +83,4 @@ RankingEvaluatorMetricType = Union[
     Literal["recallAtK"],
 ]
 
-VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...]]
+VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], "spmatrix", "sparray", range]
diff --git a/python/pyspark/ml/linalg/__init__.py b/python/pyspark/ml/linalg/__init__.py
index cedd3b04564e..ee6fc8d21818 100644
--- a/python/pyspark/ml/linalg/__init__.py
+++ b/python/pyspark/ml/linalg/__init__.py
@@ -70,7 +70,6 @@ __all__ = [
 if TYPE_CHECKING:
     from pyspark.mllib._typing import NormType
     from pyspark.ml._typing import VectorLike
-    from scipy.sparse import spmatrix
 
 
 # Check whether we have SciPy. MLlib works without it too, but if we have it, some methods,
@@ -85,23 +84,25 @@ except BaseException:
     _have_scipy = False
 
 
-def _convert_to_vector(d: Union["VectorLike", "spmatrix", range]) -> "Vector":
+def _convert_to_vector(d: "VectorLike") -> "Vector":
     if isinstance(d, Vector):
         return d
-    elif type(d) in (array.array, np.array, np.ndarray, list, tuple, range):
+    elif isinstance(d, (array.array, np.ndarray, list, tuple, range)):
         return DenseVector(d)
     elif _have_scipy and scipy.sparse.issparse(d):
-        assert cast("spmatrix", d).shape[1] == 1, "Expected column vector"
+        assert hasattr(d, "shape")
+        assert d.shape[1] == 1, "Expected column vector"
         # Make sure the converted csc_matrix has sorted indices.
-        csc = cast("spmatrix", d).tocsc()
+        assert hasattr(d, "tocsc")
+        csc = d.tocsc()
         if not csc.has_sorted_indices:
             csc.sort_indices()
-        return SparseVector(cast("spmatrix", d).shape[0], csc.indices, csc.data)
+        return SparseVector(d.shape[0], csc.indices, csc.data)
     else:
         raise TypeError("Cannot convert type %s into Vector" % type(d))
 
 
-def _vector_size(v: Union["VectorLike", "spmatrix", range]) -> int:
+def _vector_size(v: "VectorLike") -> int:
     """
     Returns the size of the vector.
 
@@ -124,16 +125,17 @@ def _vector_size(v: Union["VectorLike", "spmatrix", range]) -> int:
     """
     if isinstance(v, Vector):
         return len(v)
-    elif type(v) in (array.array, list, tuple, range):
+    elif isinstance(v, (array.array, list, tuple, range)):
         return len(v)
-    elif type(v) == np.ndarray:
+    elif isinstance(v, np.ndarray):
         if v.ndim == 1 or (v.ndim == 2 and v.shape[1] == 1):
             return len(v)
         else:
             raise ValueError("Cannot treat an ndarray of shape %s as a vector" % str(v.shape))
     elif _have_scipy and scipy.sparse.issparse(v):
-        assert cast("spmatrix", v).shape[1] == 1, "Expected column vector"
-        return cast("spmatrix", v).shape[0]
+        assert hasattr(v, "shape")
+        assert v.shape[1] == 1, "Expected column vector"
+        return v.shape[0]
     else:
         raise TypeError("Cannot treat type %s as a vector" % type(v))
 
@@ -337,13 +339,13 @@ class DenseVector(Vector):
     def __reduce__(self) -> Tuple[Type["DenseVector"], Tuple[bytes]]:
         return DenseVector, (self.array.tobytes(),)
 
-    def numNonzeros(self) -> int:
+    def numNonzeros(self) -> Union[int, np.intp]:
         """
         Number of nonzero elements. This scans all active values and count non zeros
         """
         return np.count_nonzero(self.array)
 
-    def norm(self, p: "NormType") -> np.float64:
+    def norm(self, p: "NormType") -> np.floating[Any]:
         """
         Calculates the norm of a DenseVector.
 
@@ -386,15 +388,17 @@ class DenseVector(Vector):
             ...
         AssertionError: dimension mismatch
         """
-        if type(other) == np.ndarray:
+        if isinstance(other, np.ndarray):
             if other.ndim > 1:
                 assert len(self) == other.shape[0], "dimension mismatch"
             return np.dot(self.array, other)
         elif _have_scipy and scipy.sparse.issparse(other):
-            assert len(self) == cast("spmatrix", other).shape[0], "dimension mismatch"
-            return cast("spmatrix", other).transpose().dot(self.toArray())
+            assert hasattr(other, "shape")
+            assert len(self) == other.shape[0], "dimension mismatch"
+            assert hasattr(other, "transpose")
+            return other.transpose().dot(self.toArray())
         else:
-            assert len(self) == _vector_size(other), "dimension mismatch"
+            assert len(self) == _vector_size(other), "dimension mismatch"  # type: ignore[arg-type]
             if isinstance(other, SparseVector):
                 return other.dot(self)
             elif isinstance(other, Vector):
@@ -429,10 +433,11 @@ class DenseVector(Vector):
             ...
         AssertionError: dimension mismatch
         """
-        assert len(self) == _vector_size(other), "dimension mismatch"
+        assert len(self) == _vector_size(other), "dimension mismatch"  # type: ignore[arg-type]
         if isinstance(other, SparseVector):
             return other.squared_distance(self)
         elif _have_scipy and scipy.sparse.issparse(other):
+            assert isinstance(other, scipy.sparse.spmatrix), "other must be a scipy.sparse.spmatrix"
             return _convert_to_vector(other).squared_distance(self)  # type: ignore[attr-defined]
 
         if isinstance(other, Vector):
@@ -636,13 +641,13 @@ class SparseVector(Vector):
             )
             assert np.min(self.indices) >= 0, "Contains negative index %d" % (np.min(self.indices))
 
-    def numNonzeros(self) -> int:
+    def numNonzeros(self) -> Union[int, np.intp]:
         """
         Number of nonzero elements. This scans all active values and count non zeros.
         """
         return np.count_nonzero(self.values)
 
-    def norm(self, p: "NormType") -> np.float64:
+    def norm(self, p: "NormType") -> np.floating[Any]:
         """
         Calculates the norm of a SparseVector.
 
@@ -699,7 +704,7 @@ class SparseVector(Vector):
             assert len(self) == other.shape[0], "dimension mismatch"
             return np.dot(self.values, other[self.indices])
 
-        assert len(self) == _vector_size(other), "dimension mismatch"
+        assert len(self) == _vector_size(other), "dimension mismatch"  # type: ignore[arg-type]
 
         if isinstance(other, DenseVector):
             return np.dot(other.array[self.indices], self.values)
@@ -717,7 +722,7 @@ class SparseVector(Vector):
         else:
             return self.dot(_convert_to_vector(other))  # type: ignore[arg-type]
 
-    def squared_distance(self, other: Iterable[float]) -> np.float64:
+    def squared_distance(self, other: "VectorLike") -> np.float64:
         """
         Squared distance from a SparseVector or 1-dimensional NumPy array.
 
@@ -785,7 +790,7 @@ class SparseVector(Vector):
                 j += 1
             return result
         else:
-            return self.squared_distance(_convert_to_vector(other))  # type: ignore[arg-type]
+            return self.squared_distance(_convert_to_vector(other))
 
     def toArray(self) -> np.ndarray:
         """
diff --git a/python/pyspark/mllib/_typing.pyi b/python/pyspark/mllib/_typing.pyi
index ff90cb639f4c..d34cfc84c7ae 100644
--- a/python/pyspark/mllib/_typing.pyi
+++ b/python/pyspark/mllib/_typing.pyi
@@ -16,7 +16,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from typing import List, Tuple, TypeVar, Union
+from typing import List, Tuple, TYPE_CHECKING, TypeVar, Union
 
 from typing_extensions import Literal
 from numpy import ndarray  # noqa: F401
@@ -24,10 +24,14 @@ from py4j.java_gateway import JavaObject
 
 from pyspark.mllib.linalg import Vector
 
-VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...]]
+if TYPE_CHECKING:
+    from scipy.sparse import spmatrix, sparray
+
 C = TypeVar("C", bound=type)
 JavaObjectOrPickleDump = Union[JavaObject, bytearray, bytes]
 
 CorrMethodType = Union[Literal["spearman"], Literal["pearson"]]
 KolmogorovSmirnovTestDistNameType = Literal["norm"]
 NormType = Union[None, float, Literal["fro"], Literal["nuc"]]
+
+VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], "spmatrix", "sparray", range]
diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py
index 40f0255a91bb..1613e3d2cc8d 100644
--- a/python/pyspark/mllib/linalg/__init__.py
+++ b/python/pyspark/mllib/linalg/__init__.py
@@ -61,7 +61,6 @@ from pyspark.sql.types import (
 
 if TYPE_CHECKING:
     from pyspark.mllib._typing import VectorLike, NormType
-    from scipy.sparse import spmatrix
     from numpy.typing import ArrayLike
 
 
@@ -94,23 +93,25 @@ except BaseException:
     _have_scipy = False
 
 
-def _convert_to_vector(d: Union["VectorLike", "spmatrix", range]) -> "Vector":
+def _convert_to_vector(d: "VectorLike") -> "Vector":
     if isinstance(d, Vector):
         return d
-    elif type(d) in (array.array, np.array, np.ndarray, list, tuple, range):
+    elif isinstance(d, (array.array, np.ndarray, list, tuple, range)):
         return DenseVector(d)
     elif _have_scipy and scipy.sparse.issparse(d):
-        assert cast("spmatrix", d).shape[1] == 1, "Expected column vector"
+        assert hasattr(d, "shape")
+        assert d.shape[1] == 1, "Expected column vector"
         # Make sure the converted csc_matrix has sorted indices.
-        csc = cast("spmatrix", d).tocsc()
+        assert hasattr(d, "tocsc")
+        csc = d.tocsc()
         if not csc.has_sorted_indices:
             csc.sort_indices()
-        return SparseVector(cast("spmatrix", d).shape[0], csc.indices, csc.data)
+        return SparseVector(d.shape[0], csc.indices, csc.data)
     else:
         raise TypeError("Cannot convert type %s into Vector" % type(d))
 
 
-def _vector_size(v: Union["VectorLike", "spmatrix", range]) -> int:
+def _vector_size(v: "VectorLike") -> int:
     """
     Returns the size of the vector.
 
@@ -133,16 +134,17 @@ def _vector_size(v: Union["VectorLike", "spmatrix", range]) -> int:
     """
     if isinstance(v, Vector):
         return len(v)
-    elif type(v) in (array.array, list, tuple, range):
+    elif isinstance(v, (array.array, list, tuple, range)):
         return len(v)
-    elif type(v) == np.ndarray:
+    elif isinstance(v, np.ndarray):
         if v.ndim == 1 or (v.ndim == 2 and v.shape[1] == 1):
             return len(v)
         else:
             raise ValueError("Cannot treat an ndarray of shape %s as a vector" % str(v.shape))
     elif _have_scipy and scipy.sparse.issparse(v):
-        assert cast("spmatrix", v).shape[1] == 1, "Expected column vector"
-        return cast("spmatrix", v).shape[0]
+        assert hasattr(v, "shape")
+        assert v.shape[1] == 1, "Expected column vector"
+        return v.shape[0]
     else:
         raise TypeError("Cannot treat type %s as a vector" % type(v))
 
@@ -390,13 +392,13 @@ class DenseVector(Vector):
     def __reduce__(self) -> Tuple[Type["DenseVector"], Tuple[bytes]]:
         return DenseVector, (self.array.tobytes(),)
 
-    def numNonzeros(self) -> int:
+    def numNonzeros(self) -> Union[int, np.intp]:
         """
         Number of nonzero elements. This scans all active values and count non zeros
         """
         return np.count_nonzero(self.array)
 
-    def norm(self, p: "NormType") -> np.float64:
+    def norm(self, p: "NormType") -> np.floating[Any]:
         """
         Calculates the norm of a DenseVector.
 
@@ -410,7 +412,7 @@ class DenseVector(Vector):
         """
         return np.linalg.norm(self.array, p)
 
-    def dot(self, other: Iterable[float]) -> np.float64:
+    def dot(self, other: "VectorLike") -> np.float64:
         """
         Compute the dot product of two Vectors. We support
         (Numpy array, list, SparseVector, or SciPy sparse)
@@ -444,8 +446,10 @@ class DenseVector(Vector):
                 assert len(self) == other.shape[0], "dimension mismatch"
             return np.dot(self.array, other)
         elif _have_scipy and scipy.sparse.issparse(other):
-            assert len(self) == cast("spmatrix", other).shape[0], "dimension mismatch"
-            return cast("spmatrix", other).transpose().dot(self.toArray())
+            assert hasattr(other, "shape")
+            assert len(self) == other.shape[0], "dimension mismatch"
+            assert hasattr(other, "transpose")
+            return other.transpose().dot(self.toArray())
         else:
             assert len(self) == _vector_size(other), "dimension mismatch"
             if isinstance(other, SparseVector):
@@ -455,7 +459,7 @@ class DenseVector(Vector):
             else:
                 return np.dot(self.toArray(), cast("ArrayLike", other))
 
-    def squared_distance(self, other: Iterable[float]) -> np.float64:
+    def squared_distance(self, other: "VectorLike") -> np.float64:
         """
         Squared distance of two Vectors.
 
@@ -685,13 +689,13 @@ class SparseVector(Vector):
                         % (self.indices[i], self.indices[i + 1])
                     )
 
-    def numNonzeros(self) -> int:
+    def numNonzeros(self) -> Union[int, np.intp]:
         """
         Number of nonzero elements. This scans all active values and count non zeros.
         """
         return np.count_nonzero(self.values)
 
-    def norm(self, p: "NormType") -> np.float64:
+    def norm(self, p: "NormType") -> np.floating[Any]:
         """
         Calculates the norm of a SparseVector.
 
@@ -766,7 +770,7 @@ class SparseVector(Vector):
             raise ValueError("Unable to parse values from %s." % s)
         return SparseVector(cast(int, size), indices, values)
 
-    def dot(self, other: Iterable[float]) -> np.float64:
+    def dot(self, other: "VectorLike") -> np.float64:
         """
         Dot product with a SparseVector or 1- or 2-dimensional Numpy array.
 
@@ -822,9 +826,9 @@ class SparseVector(Vector):
                 return np.dot(self_values, other.values[other_cmind])
 
         else:
-            return self.dot(_convert_to_vector(other))  # type: ignore[arg-type]
+            return self.dot(_convert_to_vector(other))
 
-    def squared_distance(self, other: Iterable[float]) -> np.float64:
+    def squared_distance(self, other: "VectorLike") -> np.float64:
         """
         Squared distance from a SparseVector or 1-dimensional NumPy array.
 
@@ -892,7 +896,7 @@ class SparseVector(Vector):
                 j += 1
             return result
         else:
-            return self.squared_distance(_convert_to_vector(other))  # type: ignore[arg-type]
+            return self.squared_distance(_convert_to_vector(other))
 
     def toArray(self) -> np.ndarray:
         """
diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py
index ecdb4e75ed49..aa98f959798c 100644
--- a/python/pyspark/mllib/linalg/distributed.py
+++ b/python/pyspark/mllib/linalg/distributed.py
@@ -35,7 +35,7 @@ UT = TypeVar("UT", bound="DistributedMatrix")
 VT = TypeVar("VT", bound="Matrix")
 
 if TYPE_CHECKING:
-    from pyspark.ml._typing import VectorLike
+    from pyspark.mllib._typing import VectorLike
 
 __all__ = [
     "BlockMatrix",
diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
index b384e0fa608e..a69f8c00b221 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -84,7 +84,7 @@ class LabeledPoint:
     'label' and 'features' are accessible as class attributes.
     """
 
-    def __init__(self, label: float, features: Iterable[float]):
+    def __init__(self, label: float, features: "VectorLike"):
         self.label = float(label)
         self.features = _convert_to_vector(features)
 
diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py
index c638fb819506..e993ced1a419 100644
--- a/python/pyspark/mllib/stat/_statistics.py
+++ b/python/pyspark/mllib/stat/_statistics.py
@@ -189,7 +189,8 @@ class Statistics:
 
         if not y:
             return cast(
-                JavaObject, callMLlibFunc("corr", x.map(_convert_to_vector), method)
+                JavaObject,
+                callMLlibFunc("corr", cast(RDD[Vector], x).map(_convert_to_vector), method),
             ).toArray()
         else:
             return cast(
diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py
index caa2c9338a95..65b25f8add6d 100644
--- a/python/pyspark/mllib/util.py
+++ b/python/pyspark/mllib/util.py
@@ -145,11 +145,7 @@ class MLUtils:
         if numFeatures <= 0:
             parsed.cache()
             numFeatures = parsed.map(lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1
-        return parsed.map(
-            lambda x: LabeledPoint(
-                x[0], Vectors.sparse(numFeatures, x[1], x[2])  # type: ignore[arg-type]
-            )
-        )
+        return parsed.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2])))
 
     @staticmethod
     def saveAsLibSVMFile(data: RDD["LabeledPoint"], dir: str) -> None:


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
