This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 0a354bc975ad [SPARK-55076][PYTHON] Fix the type hint issue in ml/mllib and add scipy requirement
0a354bc975ad is described below
commit 0a354bc975ad79d0cf407b6e3d3293b4b7dd83b7
Author: Tian Gao <[email protected]>
AuthorDate: Mon Jan 19 08:06:36 2026 +0800
[SPARK-55076][PYTHON] Fix the type hint issue in ml/mllib and add scipy requirement
### What changes were proposed in this pull request?
* Pin the scipy version to >=1.8.0, the first minor version to support Python 3.10
* Install scipy on the lint image so we can catch scipy-related lint failures
* Add `sparray`, as that is now the preferred sparse type in scipy
* Expand `VectorLike` to include other vector-like types to simplify our code
* Replace some `type(x)` checks with `isinstance()`, which is the recommended
  idiom and one that mypy understands (see the sketch after this list)
* Fix a few `numpy` 1 vs. 2 type hints so they pass under both versions
* Add a few assertions to make mypy happy about attribute access
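
To illustrate the `isinstance()` point, here is a minimal sketch (not part of
this patch; the function name is hypothetical) of why mypy narrows types for
`isinstance()` but not for a `type(x) in (...)` membership test:

```python
from typing import Union

import numpy as np


def vector_size(v: Union[np.ndarray, list, tuple, range]) -> int:
    # A `type(v) in (list, tuple, range)` test would leave `v` as the full
    # Union for mypy, so ndarray-only attributes would be flagged below.
    if isinstance(v, np.ndarray):
        # mypy narrows `v` to np.ndarray here, so .ndim and .shape type-check.
        if v.ndim == 2 and v.shape[1] == 1:
            return v.shape[0]
        return len(v)
    return len(v)
```

As a side effect, the old membership tuples also listed `np.array`, which is
a function rather than a type, so `type(d)` could never match it; the
`isinstance()` form drops it. Likewise, a minimal sketch of what the widened
`VectorLike` now admits, assuming scipy>=1.8 and pyspark are installed
(`_convert_to_vector` is a private helper):

```python
import numpy as np
from scipy.sparse import csc_array  # the sparse-array (sparray) API added in scipy 1.8

from pyspark.ml.linalg import _convert_to_vector

col = csc_array(np.array([[1.0], [0.0], [3.0]]))  # shape (3, 1) column vector
v = _convert_to_vector(col)
print(v)  # (3,[0,2],[1.0,3.0])
```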
### Why are the changes needed?
Currently, a local `mypy` check fails with many scipy/numpy-related errors
because our lint image does not include those stubs. This is bad because it
makes it really hard for people to run the mypy check locally - they will
assume their environment setup has issues and that the `mypy` results are not
to be trusted. We want the `mypy` results to be consistent between CI and
local runs, and clean.
### Does this PR introduce _any_ user-facing change?
It should not. Almost all changes are type-annotation related.
### How was this patch tested?
CI should pass.
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #53841 from gaogaotiantian/fix-ml-typehint.
Authored-by: Tian Gao <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
 dev/requirements.txt                       |  2 +-
 dev/spark-test-image/lint/Dockerfile       |  2 ++
 python/pyspark/ml/_typing.pyi              |  8 +++--
 python/pyspark/ml/linalg/__init__.py       | 51 ++++++++++++++++--------------
 python/pyspark/mllib/_typing.pyi           |  8 +++--
 python/pyspark/mllib/linalg/__init__.py    | 50 +++++++++++++++--------------
 python/pyspark/mllib/linalg/distributed.py |  2 +-
 python/pyspark/mllib/regression.py         |  2 +-
 python/pyspark/mllib/stat/_statistics.py   |  3 +-
 python/pyspark/mllib/util.py               |  6 +---
 10 files changed, 74 insertions(+), 60 deletions(-)
diff --git a/dev/requirements.txt b/dev/requirements.txt
index a64f9c4cc50a..7153a8d71dc8 100644
--- a/dev/requirements.txt
+++ b/dev/requirements.txt
@@ -6,7 +6,7 @@ numpy>=1.22
pyarrow>=18.0.0
six==1.16.0
pandas>=2.2.0
-scipy
+scipy>=1.8.0
plotly<6.0.0
mlflow>=2.3.1
scikit-learn
diff --git a/dev/spark-test-image/lint/Dockerfile b/dev/spark-test-image/lint/Dockerfile
index 4984f56fc763..9fd4bcd77e60 100644
--- a/dev/spark-test-image/lint/Dockerfile
+++ b/dev/spark-test-image/lint/Dockerfile
@@ -99,6 +99,8 @@ RUN python3.11 -m pip install \
'pyarrow>=22.0.0' \
'pytest-mypy-plugins==1.9.3' \
'pytest==7.1.3' \
+ 'scipy>=1.8.0' \
+ 'scipy-stubs' \
&& python3.11 -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu \
&& python3.11 -m pip install torcheval \
&& python3.11 -m pip cache purge
diff --git a/python/pyspark/ml/_typing.pyi b/python/pyspark/ml/_typing.pyi
index a5237dad7521..c24dfe577350 100644
--- a/python/pyspark/ml/_typing.pyi
+++ b/python/pyspark/ml/_typing.pyi
@@ -16,7 +16,7 @@
# specific language governing permissions and limitations
# under the License.
-from typing import Any, Dict, List, TypeVar, Tuple, Union
+from typing import Any, Dict, List, TYPE_CHECKING, TypeVar, Tuple, Union
from typing_extensions import Literal
from numpy import ndarray
@@ -24,10 +24,12 @@ from py4j.java_gateway import JavaObject
import pyspark.ml.base
import pyspark.ml.param
-import pyspark.ml.util
from pyspark.ml.linalg import Vector
import pyspark.ml.wrapper
+if TYPE_CHECKING:
+ from scipy.sparse import spmatrix, sparray
+
ParamMap = Dict[pyspark.ml.param.Param, Any]
PipelineStage = Union[pyspark.ml.base.Estimator, pyspark.ml.base.Transformer]
@@ -81,4 +83,4 @@ RankingEvaluatorMetricType = Union[
Literal["recallAtK"],
]
-VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...]]
+VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], "spmatrix", "sparray", range]
diff --git a/python/pyspark/ml/linalg/__init__.py b/python/pyspark/ml/linalg/__init__.py
index cedd3b04564e..ee6fc8d21818 100644
--- a/python/pyspark/ml/linalg/__init__.py
+++ b/python/pyspark/ml/linalg/__init__.py
@@ -70,7 +70,6 @@ __all__ = [
if TYPE_CHECKING:
from pyspark.mllib._typing import NormType
from pyspark.ml._typing import VectorLike
- from scipy.sparse import spmatrix
# Check whether we have SciPy. MLlib works without it too, but if we have it, some methods,
@@ -85,23 +84,25 @@ except BaseException:
_have_scipy = False
-def _convert_to_vector(d: Union["VectorLike", "spmatrix", range]) -> "Vector":
+def _convert_to_vector(d: "VectorLike") -> "Vector":
if isinstance(d, Vector):
return d
- elif type(d) in (array.array, np.array, np.ndarray, list, tuple, range):
+ elif isinstance(d, (array.array, np.ndarray, list, tuple, range)):
return DenseVector(d)
elif _have_scipy and scipy.sparse.issparse(d):
- assert cast("spmatrix", d).shape[1] == 1, "Expected column vector"
+ assert hasattr(d, "shape")
+ assert d.shape[1] == 1, "Expected column vector"
# Make sure the converted csc_matrix has sorted indices.
- csc = cast("spmatrix", d).tocsc()
+ assert hasattr(d, "tocsc")
+ csc = d.tocsc()
if not csc.has_sorted_indices:
csc.sort_indices()
- return SparseVector(cast("spmatrix", d).shape[0], csc.indices, csc.data)
+ return SparseVector(d.shape[0], csc.indices, csc.data)
else:
raise TypeError("Cannot convert type %s into Vector" % type(d))
-def _vector_size(v: Union["VectorLike", "spmatrix", range]) -> int:
+def _vector_size(v: "VectorLike") -> int:
"""
Returns the size of the vector.
@@ -124,16 +125,17 @@ def _vector_size(v: Union["VectorLike", "spmatrix", range]) -> int:
"""
if isinstance(v, Vector):
return len(v)
- elif type(v) in (array.array, list, tuple, range):
+ elif isinstance(v, (array.array, list, tuple, range)):
return len(v)
- elif type(v) == np.ndarray:
+ elif isinstance(v, np.ndarray):
if v.ndim == 1 or (v.ndim == 2 and v.shape[1] == 1):
return len(v)
else:
raise ValueError("Cannot treat an ndarray of shape %s as a vector" % str(v.shape))
elif _have_scipy and scipy.sparse.issparse(v):
- assert cast("spmatrix", v).shape[1] == 1, "Expected column vector"
- return cast("spmatrix", v).shape[0]
+ assert hasattr(v, "shape")
+ assert v.shape[1] == 1, "Expected column vector"
+ return v.shape[0]
else:
raise TypeError("Cannot treat type %s as a vector" % type(v))
@@ -337,13 +339,13 @@ class DenseVector(Vector):
def __reduce__(self) -> Tuple[Type["DenseVector"], Tuple[bytes]]:
return DenseVector, (self.array.tobytes(),)
- def numNonzeros(self) -> int:
+ def numNonzeros(self) -> Union[int, np.intp]:
"""
Number of nonzero elements. This scans all active values and count non zeros
"""
return np.count_nonzero(self.array)
- def norm(self, p: "NormType") -> np.float64:
+ def norm(self, p: "NormType") -> np.floating[Any]:
"""
Calculates the norm of a DenseVector.
@@ -386,15 +388,17 @@ class DenseVector(Vector):
...
AssertionError: dimension mismatch
"""
- if type(other) == np.ndarray:
+ if isinstance(other, np.ndarray):
if other.ndim > 1:
assert len(self) == other.shape[0], "dimension mismatch"
return np.dot(self.array, other)
elif _have_scipy and scipy.sparse.issparse(other):
- assert len(self) == cast("spmatrix", other).shape[0], "dimension mismatch"
- return cast("spmatrix", other).transpose().dot(self.toArray())
+ assert hasattr(other, "shape")
+ assert len(self) == other.shape[0], "dimension mismatch"
+ assert hasattr(other, "transpose")
+ return other.transpose().dot(self.toArray())
else:
- assert len(self) == _vector_size(other), "dimension mismatch"
+ assert len(self) == _vector_size(other), "dimension mismatch"  # type: ignore[arg-type]
if isinstance(other, SparseVector):
return other.dot(self)
elif isinstance(other, Vector):
@@ -429,10 +433,11 @@ class DenseVector(Vector):
...
AssertionError: dimension mismatch
"""
- assert len(self) == _vector_size(other), "dimension mismatch"
+ assert len(self) == _vector_size(other), "dimension mismatch"  # type: ignore[arg-type]
if isinstance(other, SparseVector):
return other.squared_distance(self)
elif _have_scipy and scipy.sparse.issparse(other):
+ assert isinstance(other, scipy.sparse.spmatrix), "other must be a scipy.sparse.spmatrix"
return _convert_to_vector(other).squared_distance(self)  # type: ignore[attr-defined]
if isinstance(other, Vector):
@@ -636,13 +641,13 @@ class SparseVector(Vector):
)
assert np.min(self.indices) >= 0, "Contains negative index %d" % (np.min(self.indices))
- def numNonzeros(self) -> int:
+ def numNonzeros(self) -> Union[int, np.intp]:
"""
Number of nonzero elements. This scans all active values and count non zeros.
"""
return np.count_nonzero(self.values)
- def norm(self, p: "NormType") -> np.float64:
+ def norm(self, p: "NormType") -> np.floating[Any]:
"""
Calculates the norm of a SparseVector.
@@ -699,7 +704,7 @@ class SparseVector(Vector):
assert len(self) == other.shape[0], "dimension mismatch"
return np.dot(self.values, other[self.indices])
- assert len(self) == _vector_size(other), "dimension mismatch"
+ assert len(self) == _vector_size(other), "dimension mismatch"  # type: ignore[arg-type]
if isinstance(other, DenseVector):
return np.dot(other.array[self.indices], self.values)
@@ -717,7 +722,7 @@ class SparseVector(Vector):
else:
return self.dot(_convert_to_vector(other))  # type: ignore[arg-type]
- def squared_distance(self, other: Iterable[float]) -> np.float64:
+ def squared_distance(self, other: "VectorLike") -> np.float64:
"""
Squared distance from a SparseVector or 1-dimensional NumPy array.
@@ -785,7 +790,7 @@ class SparseVector(Vector):
j += 1
return result
else:
- return self.squared_distance(_convert_to_vector(other))  # type: ignore[arg-type]
+ return self.squared_distance(_convert_to_vector(other))
def toArray(self) -> np.ndarray:
"""
diff --git a/python/pyspark/mllib/_typing.pyi b/python/pyspark/mllib/_typing.pyi
index ff90cb639f4c..d34cfc84c7ae 100644
--- a/python/pyspark/mllib/_typing.pyi
+++ b/python/pyspark/mllib/_typing.pyi
@@ -16,7 +16,7 @@
# specific language governing permissions and limitations
# under the License.
-from typing import List, Tuple, TypeVar, Union
+from typing import List, Tuple, TYPE_CHECKING, TypeVar, Union
from typing_extensions import Literal
from numpy import ndarray # noqa: F401
@@ -24,10 +24,14 @@ from py4j.java_gateway import JavaObject
from pyspark.mllib.linalg import Vector
-VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...]]
+if TYPE_CHECKING:
+ from scipy.sparse import spmatrix, sparray
+
C = TypeVar("C", bound=type)
JavaObjectOrPickleDump = Union[JavaObject, bytearray, bytes]
CorrMethodType = Union[Literal["spearman"], Literal["pearson"]]
KolmogorovSmirnovTestDistNameType = Literal["norm"]
NormType = Union[None, float, Literal["fro"], Literal["nuc"]]
+
+VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], "spmatrix", "sparray", range]
diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py
index 40f0255a91bb..1613e3d2cc8d 100644
--- a/python/pyspark/mllib/linalg/__init__.py
+++ b/python/pyspark/mllib/linalg/__init__.py
@@ -61,7 +61,6 @@ from pyspark.sql.types import (
if TYPE_CHECKING:
from pyspark.mllib._typing import VectorLike, NormType
- from scipy.sparse import spmatrix
from numpy.typing import ArrayLike
@@ -94,23 +93,25 @@ except BaseException:
_have_scipy = False
-def _convert_to_vector(d: Union["VectorLike", "spmatrix", range]) -> "Vector":
+def _convert_to_vector(d: "VectorLike") -> "Vector":
if isinstance(d, Vector):
return d
- elif type(d) in (array.array, np.array, np.ndarray, list, tuple, range):
+ elif isinstance(d, (array.array, np.ndarray, list, tuple, range)):
return DenseVector(d)
elif _have_scipy and scipy.sparse.issparse(d):
- assert cast("spmatrix", d).shape[1] == 1, "Expected column vector"
+ assert hasattr(d, "shape")
+ assert d.shape[1] == 1, "Expected column vector"
# Make sure the converted csc_matrix has sorted indices.
- csc = cast("spmatrix", d).tocsc()
+ assert hasattr(d, "tocsc")
+ csc = d.tocsc()
if not csc.has_sorted_indices:
csc.sort_indices()
- return SparseVector(cast("spmatrix", d).shape[0], csc.indices, csc.data)
+ return SparseVector(d.shape[0], csc.indices, csc.data)
else:
raise TypeError("Cannot convert type %s into Vector" % type(d))
-def _vector_size(v: Union["VectorLike", "spmatrix", range]) -> int:
+def _vector_size(v: "VectorLike") -> int:
"""
Returns the size of the vector.
@@ -133,16 +134,17 @@ def _vector_size(v: Union["VectorLike", "spmatrix", range]) -> int:
"""
if isinstance(v, Vector):
return len(v)
- elif type(v) in (array.array, list, tuple, range):
+ elif isinstance(v, (array.array, list, tuple, range)):
return len(v)
- elif type(v) == np.ndarray:
+ elif isinstance(v, np.ndarray):
if v.ndim == 1 or (v.ndim == 2 and v.shape[1] == 1):
return len(v)
else:
raise ValueError("Cannot treat an ndarray of shape %s as a vector" % str(v.shape))
elif _have_scipy and scipy.sparse.issparse(v):
- assert cast("spmatrix", v).shape[1] == 1, "Expected column vector"
- return cast("spmatrix", v).shape[0]
+ assert hasattr(v, "shape")
+ assert v.shape[1] == 1, "Expected column vector"
+ return v.shape[0]
else:
raise TypeError("Cannot treat type %s as a vector" % type(v))
@@ -390,13 +392,13 @@ class DenseVector(Vector):
def __reduce__(self) -> Tuple[Type["DenseVector"], Tuple[bytes]]:
return DenseVector, (self.array.tobytes(),)
- def numNonzeros(self) -> int:
+ def numNonzeros(self) -> Union[int, np.intp]:
"""
Number of nonzero elements. This scans all active values and count non zeros
"""
return np.count_nonzero(self.array)
- def norm(self, p: "NormType") -> np.float64:
+ def norm(self, p: "NormType") -> np.floating[Any]:
"""
Calculates the norm of a DenseVector.
@@ -410,7 +412,7 @@ class DenseVector(Vector):
"""
return np.linalg.norm(self.array, p)
- def dot(self, other: Iterable[float]) -> np.float64:
+ def dot(self, other: "VectorLike") -> np.float64:
"""
Compute the dot product of two Vectors. We support
(Numpy array, list, SparseVector, or SciPy sparse)
@@ -444,8 +446,10 @@ class DenseVector(Vector):
assert len(self) == other.shape[0], "dimension mismatch"
return np.dot(self.array, other)
elif _have_scipy and scipy.sparse.issparse(other):
- assert len(self) == cast("spmatrix", other).shape[0], "dimension mismatch"
- return cast("spmatrix", other).transpose().dot(self.toArray())
+ assert hasattr(other, "shape")
+ assert len(self) == other.shape[0], "dimension mismatch"
+ assert hasattr(other, "transpose")
+ return other.transpose().dot(self.toArray())
else:
assert len(self) == _vector_size(other), "dimension mismatch"
if isinstance(other, SparseVector):
@@ -455,7 +459,7 @@ class DenseVector(Vector):
else:
return np.dot(self.toArray(), cast("ArrayLike", other))
- def squared_distance(self, other: Iterable[float]) -> np.float64:
+ def squared_distance(self, other: "VectorLike") -> np.float64:
"""
Squared distance of two Vectors.
@@ -685,13 +689,13 @@ class SparseVector(Vector):
% (self.indices[i], self.indices[i + 1])
)
- def numNonzeros(self) -> int:
+ def numNonzeros(self) -> Union[int, np.intp]:
"""
Number of nonzero elements. This scans all active values and count non zeros.
"""
return np.count_nonzero(self.values)
- def norm(self, p: "NormType") -> np.float64:
+ def norm(self, p: "NormType") -> np.floating[Any]:
"""
Calculates the norm of a SparseVector.
@@ -766,7 +770,7 @@ class SparseVector(Vector):
raise ValueError("Unable to parse values from %s." % s)
return SparseVector(cast(int, size), indices, values)
- def dot(self, other: Iterable[float]) -> np.float64:
+ def dot(self, other: "VectorLike") -> np.float64:
"""
Dot product with a SparseVector or 1- or 2-dimensional Numpy array.
@@ -822,9 +826,9 @@ class SparseVector(Vector):
return np.dot(self_values, other.values[other_cmind])
else:
- return self.dot(_convert_to_vector(other))  # type: ignore[arg-type]
+ return self.dot(_convert_to_vector(other))
- def squared_distance(self, other: Iterable[float]) -> np.float64:
+ def squared_distance(self, other: "VectorLike") -> np.float64:
"""
Squared distance from a SparseVector or 1-dimensional NumPy array.
@@ -892,7 +896,7 @@ class SparseVector(Vector):
j += 1
return result
else:
- return self.squared_distance(_convert_to_vector(other))  # type: ignore[arg-type]
+ return self.squared_distance(_convert_to_vector(other))
def toArray(self) -> np.ndarray:
"""
diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py
index ecdb4e75ed49..aa98f959798c 100644
--- a/python/pyspark/mllib/linalg/distributed.py
+++ b/python/pyspark/mllib/linalg/distributed.py
@@ -35,7 +35,7 @@ UT = TypeVar("UT", bound="DistributedMatrix")
VT = TypeVar("VT", bound="Matrix")
if TYPE_CHECKING:
- from pyspark.ml._typing import VectorLike
+ from pyspark.mllib._typing import VectorLike
__all__ = [
"BlockMatrix",
diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
index b384e0fa608e..a69f8c00b221 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -84,7 +84,7 @@ class LabeledPoint:
'label' and 'features' are accessible as class attributes.
"""
- def __init__(self, label: float, features: Iterable[float]):
+ def __init__(self, label: float, features: "VectorLike"):
self.label = float(label)
self.features = _convert_to_vector(features)
diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py
index c638fb819506..e993ced1a419 100644
--- a/python/pyspark/mllib/stat/_statistics.py
+++ b/python/pyspark/mllib/stat/_statistics.py
@@ -189,7 +189,8 @@ class Statistics:
if not y:
return cast(
- JavaObject, callMLlibFunc("corr", x.map(_convert_to_vector), method)
+ JavaObject,
+ callMLlibFunc("corr", cast(RDD[Vector], x).map(_convert_to_vector), method),
).toArray()
else:
return cast(
diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py
index caa2c9338a95..65b25f8add6d 100644
--- a/python/pyspark/mllib/util.py
+++ b/python/pyspark/mllib/util.py
@@ -145,11 +145,7 @@ class MLUtils:
if numFeatures <= 0:
parsed.cache()
numFeatures = parsed.map(lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1
- return parsed.map(
- lambda x: LabeledPoint(
- x[0], Vectors.sparse(numFeatures, x[1], x[2])  # type: ignore[arg-type]
- )
- )
+ return parsed.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2])))
@staticmethod
def saveAsLibSVMFile(data: RDD["LabeledPoint"], dir: str) -> None:
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]