This is an automated email from the ASF dual-hosted git repository. zero323 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new dc607911 [SPARK-36654][PYTHON] Drop type ignores from numpy imports dc607911 is described below commit dc607911a91c515f23d8192f389e7e54e785f94d Author: zero323 <mszymkiew...@gmail.com> AuthorDate: Fri Oct 22 00:43:21 2021 +0200 [SPARK-36654][PYTHON] Drop type ignores from numpy imports ### What changes were proposed in this pull request? This PR removes `type: ignore[import]` annotations from numpy imports. Additionally, the minimum version of numpy required for the mypy tests is explicitly stated in the GitHub workflow files. ### Why are the changes needed? Since version 1.20 numpy is PEP 561 compatible, so these ignores are no longer necessary. ### Does this PR introduce _any_ user-facing change? This change targets primarily our development process and should be relatively transparent to the end users: - If `pyspark` is installed as a package in the current environment both current master and this PR yield the same results ([test_pr_package.sh.out.txt](https://github.com/apache/spark/files/7332501/test_pr_package.sh.out.txt) and [test_master_package.sh.out.txt](https://github.com/apache/spark/files/7332499/test_master_package.sh.out.txt) respectively) - If `pyspark` is added manually to `MYPYPATH` both this PR and current master yield a large number of errors ([test_pr_mypypath.sh.out.txt](https://github.com/apache/spark/files/7332500/test_pr_mypypath.sh.out.txt) and [test_master_mypypath.sh.out.txt](https://github.com/apache/spark/files/7332498/test_master_mypypath.sh.out.txt) respectively) These errors are primarily, but not exclusively, related to `pyspark.pandas` hints (a notable exception is `cloudpickle` ‒ to avoid this we could bring back the stub file that was removed during the initial type hints migration). In this case, users can silence the errors by providing an appropriate `mypy.ini` file. 
This behavior was tested using simple test files ```python from pyspark import SparkContext sc = SparkContext.getOrCreate() reveal_type(sc.parallelize([1, 2, 3])) ``` and ```python from pyspark.ml.linalg import DenseVector reveal_type(DenseVector([1, 2, 3]).toArray() + 1) ``` ### How was this patch tested? Existing tests and manual verification of the behavior in isolated environments. Closes #33900 from zero323/SPARK-36654. Authored-by: zero323 <mszymkiew...@gmail.com> Signed-off-by: zero323 <mszymkiew...@gmail.com> --- .github/workflows/build_and_test.yml | 6 +++--- examples/src/main/python/sql/arrow.py | 2 +- python/mypy.ini | 3 --- python/pyspark/ml/clustering.pyi | 2 +- python/pyspark/ml/image.pyi | 2 +- python/pyspark/ml/linalg/__init__.pyi | 2 +- python/pyspark/mllib/classification.pyi | 2 +- python/pyspark/mllib/clustering.pyi | 2 +- python/pyspark/mllib/linalg/__init__.pyi | 2 +- python/pyspark/mllib/regression.pyi | 2 +- python/pyspark/mllib/stat/KernelDensity.pyi | 2 +- python/pyspark/mllib/stat/_statistics.pyi | 2 +- python/pyspark/rdd.pyi | 2 +- python/pyspark/sql/pandas/_typing/protocols/frame.pyi | 2 +- python/pyspark/sql/pandas/_typing/protocols/series.pyi | 2 +- 15 files changed, 16 insertions(+), 19 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 96451ac..f586d55 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -197,7 +197,7 @@ jobs: - name: Install Python packages (Python 3.8) if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) run: | - python3.8 -m pip install numpy 'pyarrow<5.0.0' pandas scipy xmlrunner + python3.8 -m pip install 'numpy>=1.20.0' 'pyarrow<5.0.0' pandas scipy xmlrunner python3.8 -m pip list # Run the tests. - name: Run tests @@ -460,8 +460,8 @@ jobs: # See also https://github.com/sphinx-doc/sphinx/issues/7551. # Jinja2 3.0.0+ causes error when building with Sphinx. 
# See also https://issues.apache.org/jira/browse/SPARK-35375. - python3.9 -m pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme ipython nbsphinx numpydoc 'jinja2<3.0.0' - python3.9 -m pip install sphinx_plotly_directive 'pyarrow<5.0.0' pandas 'plotly>=4.8' + python3.9 -m pip install 'sphinx<3.1.0' mkdocs pydata_sphinx_theme ipython nbsphinx numpydoc 'jinja2<3.0.0' + python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' 'pyarrow<5.0.0' pandas 'plotly>=4.8' apt-get update -y apt-get install -y ruby ruby-dev Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')" diff --git a/examples/src/main/python/sql/arrow.py b/examples/src/main/python/sql/arrow.py index a0eba0f..1a0480a 100644 --- a/examples/src/main/python/sql/arrow.py +++ b/examples/src/main/python/sql/arrow.py @@ -32,7 +32,7 @@ require_minimum_pyarrow_version() def dataframe_with_arrow_example(spark): - import numpy as np # type: ignore[import] + import numpy as np import pandas as pd # type: ignore[import] # Enable Arrow-based columnar data transfers diff --git a/python/mypy.ini b/python/mypy.ini index 03c9729..eb29109 100644 --- a/python/mypy.ini +++ b/python/mypy.ini @@ -116,9 +116,6 @@ ignore_errors = True [mypy-py4j.*] ignore_missing_imports = True -[mypy-numpy] -ignore_missing_imports = True - [mypy-scipy.*] ignore_missing_imports = True diff --git a/python/pyspark/ml/clustering.pyi b/python/pyspark/ml/clustering.pyi index e2a2d7e..e899b60 100644 --- a/python/pyspark/ml/clustering.pyi +++ b/python/pyspark/ml/clustering.pyi @@ -43,7 +43,7 @@ from pyspark.ml.param import Param from pyspark.ml.stat import MultivariateGaussian from pyspark.sql.dataframe import DataFrame -from numpy import ndarray # type: ignore[import] +from numpy import ndarray class ClusteringSummary(JavaWrapper): @property diff --git a/python/pyspark/ml/image.pyi b/python/pyspark/ml/image.pyi index 9ff3a88..206490a 100644 --- 
a/python/pyspark/ml/image.pyi +++ b/python/pyspark/ml/image.pyi @@ -20,7 +20,7 @@ from typing import Dict, List from pyspark.sql.types import Row, StructType -from numpy import ndarray # type: ignore[import] +from numpy import ndarray class _ImageSchema: def __init__(self) -> None: ... diff --git a/python/pyspark/ml/linalg/__init__.pyi b/python/pyspark/ml/linalg/__init__.pyi index a302825..46bd812 100644 --- a/python/pyspark/ml/linalg/__init__.pyi +++ b/python/pyspark/ml/linalg/__init__.pyi @@ -22,7 +22,7 @@ from typing import Any, Dict, Iterable, List, NoReturn, Optional, Tuple, Type, U from pyspark.ml import linalg as newlinalg # noqa: F401 from pyspark.sql.types import StructType, UserDefinedType -from numpy import float64, ndarray # type: ignore[import] +from numpy import float64, ndarray class VectorUDT(UserDefinedType): @classmethod diff --git a/python/pyspark/mllib/classification.pyi b/python/pyspark/mllib/classification.pyi index 89229e2..ba88f6d 100644 --- a/python/pyspark/mllib/classification.pyi +++ b/python/pyspark/mllib/classification.pyi @@ -27,7 +27,7 @@ from pyspark.mllib.regression import LabeledPoint, LinearModel, StreamingLinearA from pyspark.mllib.util import Saveable, Loader from pyspark.streaming.dstream import DStream -from numpy import float64, ndarray # type: ignore[import] +from numpy import float64, ndarray class LinearClassificationModel(LinearModel): def __init__(self, weights: Vector, intercept: float) -> None: ... 
diff --git a/python/pyspark/mllib/clustering.pyi b/python/pyspark/mllib/clustering.pyi index 7b3673a..e52e5e3 100644 --- a/python/pyspark/mllib/clustering.pyi +++ b/python/pyspark/mllib/clustering.pyi @@ -21,7 +21,7 @@ from typing import List, NamedTuple, Optional, Tuple, TypeVar import array -from numpy import float64, int64, ndarray # type: ignore[import] +from numpy import float64, int64, ndarray from py4j.java_gateway import JavaObject # type: ignore[import] from pyspark.mllib._typing import VectorLike diff --git a/python/pyspark/mllib/linalg/__init__.pyi b/python/pyspark/mllib/linalg/__init__.pyi index 60d16b2..dddc40b 100644 --- a/python/pyspark/mllib/linalg/__init__.pyi +++ b/python/pyspark/mllib/linalg/__init__.pyi @@ -31,7 +31,7 @@ from typing import ( ) from pyspark.ml import linalg as newlinalg from pyspark.sql.types import StructType, UserDefinedType -from numpy import float64, ndarray # type: ignore[import] +from numpy import float64, ndarray QT = TypeVar("QT") RT = TypeVar("RT") diff --git a/python/pyspark/mllib/regression.pyi b/python/pyspark/mllib/regression.pyi index 0283378..72d0f5a 100644 --- a/python/pyspark/mllib/regression.pyi +++ b/python/pyspark/mllib/regression.pyi @@ -24,7 +24,7 @@ from pyspark.context import SparkContext from pyspark.mllib.linalg import Vector from pyspark.mllib.util import Saveable, Loader from pyspark.streaming.dstream import DStream -from numpy import ndarray # type: ignore[import] +from numpy import ndarray K = TypeVar("K") diff --git a/python/pyspark/mllib/stat/KernelDensity.pyi b/python/pyspark/mllib/stat/KernelDensity.pyi index efc70c9..eac6007 100644 --- a/python/pyspark/mllib/stat/KernelDensity.pyi +++ b/python/pyspark/mllib/stat/KernelDensity.pyi @@ -18,7 +18,7 @@ from typing import Iterable from pyspark.rdd import RDD -from numpy import ndarray # type: ignore[import] +from numpy import ndarray class KernelDensity: def __init__(self) -> None: ... 
diff --git a/python/pyspark/mllib/stat/_statistics.pyi b/python/pyspark/mllib/stat/_statistics.pyi index 3834d51..15c8837 100644 --- a/python/pyspark/mllib/stat/_statistics.pyi +++ b/python/pyspark/mllib/stat/_statistics.pyi @@ -19,7 +19,7 @@ from typing import List, Optional, overload, Union from typing_extensions import Literal -from numpy import ndarray # type: ignore[import] +from numpy import ndarray from pyspark.mllib.common import JavaModelWrapper from pyspark.mllib.linalg import Vector, Matrix diff --git a/python/pyspark/rdd.pyi b/python/pyspark/rdd.pyi index 317a0db..a810a2c 100644 --- a/python/pyspark/rdd.pyi +++ b/python/pyspark/rdd.pyi @@ -33,7 +33,7 @@ from typing import ( ) from typing_extensions import Literal -from numpy import int32, int64, float32, float64, ndarray # type: ignore[import] +from numpy import int32, int64, float32, float64, ndarray from pyspark._typing import SupportsOrdering from pyspark.sql.pandas._typing import ( diff --git a/python/pyspark/sql/pandas/_typing/protocols/frame.pyi b/python/pyspark/sql/pandas/_typing/protocols/frame.pyi index 6f450df..3456cfb 100644 --- a/python/pyspark/sql/pandas/_typing/protocols/frame.pyi +++ b/python/pyspark/sql/pandas/_typing/protocols/frame.pyi @@ -22,7 +22,7 @@ # - Add Protocol as a base class # - Replace imports with Any -import numpy as np # type: ignore[import] +import numpy as np from typing import Any, Hashable, IO, Iterable, List, Optional, Sequence, Tuple, Union from typing_extensions import Protocol from .series import SeriesLike diff --git a/python/pyspark/sql/pandas/_typing/protocols/series.pyi b/python/pyspark/sql/pandas/_typing/protocols/series.pyi index f2de2e8..5ca8967 100644 --- a/python/pyspark/sql/pandas/_typing/protocols/series.pyi +++ b/python/pyspark/sql/pandas/_typing/protocols/series.pyi @@ -22,7 +22,7 @@ # - Add Protocol as a base class # - Replace imports with Any -import numpy as np # type: ignore[import] +import numpy as np from typing import Any, Callable, Hashable, 
IO, Optional from typing_extensions import Protocol --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org