This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new a2198a1e2542 [SPARK-55108][PYTHON] Use the latest pandas-stubs for type check
a2198a1e2542 is described below
commit a2198a1e2542daf68de5f1be318036cf3204a1be
Author: Tian Gao <[email protected]>
AuthorDate: Wed Jan 21 15:06:49 2026 +0800
[SPARK-55108][PYTHON] Use the latest pandas-stubs for type check
### What changes were proposed in this pull request?
Upgrade the `pandas-stubs` version (basically, stop pinning it) and fix all of the resulting mypy errors.
### Why are the changes needed?
`pandas-stubs` is released in sync with `pandas` - in theory we should use the same version for both. The numpy stubs are smarter in that they actually depend on a specific numpy version; `pandas-stubs` does not do that. In any case, we should definitely not pin it to a 1.x version.
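As a side note, a hedged sanity check (not part of this patch; the version-prefix comparison below is just an assumption about how one might verify the pairing) to confirm the installed stubs roughly track the installed pandas before running mypy:

```python
# Hypothetical check, not part of this patch: pandas-stubs releases are
# versioned after the pandas release they target, so the first two version
# components should normally match the installed pandas.
import importlib.metadata as metadata

pandas_version = metadata.version("pandas")        # e.g. "2.2.3"
stubs_version = metadata.version("pandas-stubs")   # e.g. "2.2.3.250527"
if pandas_version.split(".")[:2] != stubs_version.split(".")[:2]:
    print(f"warning: pandas {pandas_version} vs pandas-stubs {stubs_version}")
```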
Some of the ignore comments may only be needed because we are on an old mypy version (we have to upgrade one dependency at a time). I did not really "fix" all of the typing issues - I explicitly ignored a lot of them, and some of the errors exist because our own type hints are wrong or inaccurate.
One step at a time; I think we are moving in the right direction.
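For context, a minimal sketch of the two patterns this change leans on when mypy flags a line; the toy DataFrame is illustrative, but both the `pd.Index(...)` wrapping and the `# type: ignore[assignment]` error code do appear in the diff below:

```python
import pandas as pd

df = pd.DataFrame({"a": [1], "b": [2]})

# Pattern 1: adjust the code so the stubs accept it - DataFrame.columns is
# typed as an Index in pandas-stubs, so wrap plain lists in pd.Index().
df.columns = pd.Index(["x", "y"])

# Pattern 2: when the stubs are stricter than the runtime behavior, suppress
# only the named error code so unrelated future errors still surface.
df.columns = ["x", "y"]  # type: ignore[assignment]
```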
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Let's check CI.
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #53877 from gaogaotiantian/upgrade-pandas-stubs.
Authored-by: Tian Gao <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
dev/requirements.txt | 2 +-
dev/spark-test-image/lint/Dockerfile | 2 +-
python/pyspark/instrumentation_utils.py | 2 +-
python/pyspark/ml/functions.py | 2 +-
python/pyspark/pandas/accessors.py | 2 +-
python/pyspark/pandas/base.py | 2 +-
python/pyspark/pandas/categorical.py | 2 +-
python/pyspark/pandas/data_type_ops/base.py | 2 +-
python/pyspark/pandas/data_type_ops/boolean_ops.py | 2 +-
.../pandas/data_type_ops/categorical_ops.py | 6 +--
python/pyspark/pandas/data_type_ops/null_ops.py | 2 +-
python/pyspark/pandas/data_type_ops/num_ops.py | 2 +-
python/pyspark/pandas/frame.py | 46 +++++++++++-----------
python/pyspark/pandas/generic.py | 12 +++---
python/pyspark/pandas/groupby.py | 8 ++--
python/pyspark/pandas/indexes/base.py | 14 +++----
python/pyspark/pandas/indexes/category.py | 2 +-
python/pyspark/pandas/indexes/datetimes.py | 2 +-
python/pyspark/pandas/indexes/multi.py | 2 +-
python/pyspark/pandas/indexes/timedelta.py | 2 +-
python/pyspark/pandas/indexing.py | 6 +--
python/pyspark/pandas/internal.py | 4 +-
python/pyspark/pandas/namespace.py | 19 +++++----
python/pyspark/pandas/plot/core.py | 2 +-
python/pyspark/pandas/plot/matplotlib.py | 6 +--
python/pyspark/pandas/resample.py | 15 ++++---
python/pyspark/pandas/series.py | 19 +++++----
python/pyspark/pandas/testing.py | 14 +++----
python/pyspark/pandas/typedef/typehints.py | 2 +-
python/pyspark/pandas/usage_logging/__init__.py | 4 +-
python/pyspark/pandas/utils.py | 7 ++--
python/pyspark/sql/connect/session.py | 5 +--
python/pyspark/sql/pandas/conversion.py | 14 +++----
python/pyspark/sql/pandas/types.py | 30 +++++---------
python/pyspark/sql/plot/core.py | 2 +-
35 files changed, 128 insertions(+), 137 deletions(-)
diff --git a/dev/requirements.txt b/dev/requirements.txt
index 7153a8d71dc8..0cfbd650d9f0 100644
--- a/dev/requirements.txt
+++ b/dev/requirements.txt
@@ -26,7 +26,7 @@ ruff==0.14.8
mypy==1.8.0
pytest-mypy-plugins==1.9.3
# See SPARK-38680.
-pandas-stubs<1.2.0.54
+pandas-stubs>=2.2.0
scipy-stubs; python_version>='3.10'
types-PyYAML
diff --git a/dev/spark-test-image/lint/Dockerfile
b/dev/spark-test-image/lint/Dockerfile
index 9fd4bcd77e60..c76eb82b32b5 100644
--- a/dev/spark-test-image/lint/Dockerfile
+++ b/dev/spark-test-image/lint/Dockerfile
@@ -94,7 +94,7 @@ RUN python3.11 -m pip install \
'numpy==2.0.2' \
'numpydoc' \
'pandas' \
- 'pandas-stubs==1.2.0.53' \
+ 'pandas-stubs' \
'plotly>=4.8' \
'pyarrow>=22.0.0' \
'pytest-mypy-plugins==1.9.3' \
diff --git a/python/pyspark/instrumentation_utils.py
b/python/pyspark/instrumentation_utils.py
index 8fe579ac14f9..ac971c7543fe 100644
--- a/python/pyspark/instrumentation_utils.py
+++ b/python/pyspark/instrumentation_utils.py
@@ -124,7 +124,7 @@ def _attach(
logger_module: Union[str, ModuleType],
modules: List[ModuleType],
classes: List[Type[Any]],
- missings: List[Tuple[Type[Any], Type[Any]]],
+ missings: List[Tuple[Union[ModuleType, Type[Any]], Type[Any]]],
) -> None:
if isinstance(logger_module, str):
logger_module = importlib.import_module(logger_module)
diff --git a/python/pyspark/ml/functions.py b/python/pyspark/ml/functions.py
index de5539afd4a1..c72e343b73f3 100644
--- a/python/pyspark/ml/functions.py
+++ b/python/pyspark/ml/functions.py
@@ -241,7 +241,7 @@ def _validate_and_transform_single_input(
# tensor columns
if len(batch.columns) == 1:
# one tensor column and one expected input, vstack rows
- single_input = np.vstack(batch.iloc[:, 0])
+ single_input = np.vstack(batch.iloc[:, 0]) # type: ignore[call-overload]
else:
raise ValueError(
"Multiple input columns found, but model expected a single "
diff --git a/python/pyspark/pandas/accessors.py
b/python/pyspark/pandas/accessors.py
index a3af94e0e808..359c2db7cb7d 100644
--- a/python/pyspark/pandas/accessors.py
+++ b/python/pyspark/pandas/accessors.py
@@ -579,7 +579,7 @@ class PandasOnSparkFrameMethods:
return original_func(o, *args, **kwargs)
def apply_func(pdf: pd.DataFrame) -> pd.DataFrame:
- return new_func(pdf).to_frame()
+ return new_func(pdf).to_frame() # type: ignore[operator]
def pandas_series_func(
f: Callable[[pd.DataFrame], pd.DataFrame], return_type: DataType
diff --git a/python/pyspark/pandas/base.py b/python/pyspark/pandas/base.py
index a16a8902f9f0..cd6ae554cb02 100644
--- a/python/pyspark/pandas/base.py
+++ b/python/pyspark/pandas/base.py
@@ -26,7 +26,7 @@ from typing import Any, Callable, Optional, Sequence, Tuple,
Union, cast, TYPE_C
import numpy as np
import pandas as pd
-from pandas.api.types import is_list_like, CategoricalDtype # type: ignore[attr-defined]
+from pandas.api.types import is_list_like, CategoricalDtype
from pyspark.sql import functions as F, Column, Window
from pyspark.sql.types import LongType, BooleanType, NumericType
diff --git a/python/pyspark/pandas/categorical.py
b/python/pyspark/pandas/categorical.py
index c7e6ab873f6b..024912ebc2eb 100644
--- a/python/pyspark/pandas/categorical.py
+++ b/python/pyspark/pandas/categorical.py
@@ -17,7 +17,7 @@
from typing import Any, Callable, List, Optional, Union, TYPE_CHECKING, cast
import pandas as pd
-from pandas.api.types import ( # type: ignore[attr-defined]
+from pandas.api.types import (
CategoricalDtype,
is_dict_like,
is_list_like,
diff --git a/python/pyspark/pandas/data_type_ops/base.py
b/python/pyspark/pandas/data_type_ops/base.py
index bb2ef8b09aea..7be658b79192 100644
--- a/python/pyspark/pandas/data_type_ops/base.py
+++ b/python/pyspark/pandas/data_type_ops/base.py
@@ -116,7 +116,7 @@ def _should_return_all_false(left: IndexOpsLike, right:
Any) -> bool:
based on incompatible dtypes: non-numeric vs. numeric (including bools).
"""
from pyspark.pandas.base import IndexOpsMixin
- from pandas.api.types import is_list_like # type: ignore[attr-defined]
+ from pandas.api.types import is_list_like
def are_both_numeric(left_dtype: Dtype, right_dtype: Dtype) -> bool:
return is_numeric_dtype(left_dtype) and is_numeric_dtype(right_dtype)
diff --git a/python/pyspark/pandas/data_type_ops/boolean_ops.py
b/python/pyspark/pandas/data_type_ops/boolean_ops.py
index d9a24dee0802..a88b1b7a5d24 100644
--- a/python/pyspark/pandas/data_type_ops/boolean_ops.py
+++ b/python/pyspark/pandas/data_type_ops/boolean_ops.py
@@ -19,7 +19,7 @@ import numbers
from typing import Any, Union
import pandas as pd
-from pandas.api.types import CategoricalDtype, is_integer_dtype # type: ignore[attr-defined]
+from pandas.api.types import CategoricalDtype, is_integer_dtype
from pandas.core.dtypes.common import is_numeric_dtype
from pyspark.pandas.base import column_op, IndexOpsMixin
diff --git a/python/pyspark/pandas/data_type_ops/categorical_ops.py
b/python/pyspark/pandas/data_type_ops/categorical_ops.py
index 824666b5819b..3a977f418641 100644
--- a/python/pyspark/pandas/data_type_ops/categorical_ops.py
+++ b/python/pyspark/pandas/data_type_ops/categorical_ops.py
@@ -16,11 +16,11 @@
#
from itertools import chain
-from typing import cast, Any, Union
+from typing import cast, Any, Sequence, Union
import pandas as pd
import numpy as np
-from pandas.api.types import is_list_like, CategoricalDtype # type: ignore[attr-defined]
+from pandas.api.types import is_list_like, CategoricalDtype
from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex
from pyspark.pandas.base import IndexOpsMixin
@@ -43,7 +43,7 @@ class CategoricalOps(DataTypeOps):
"""Restore column when to_pandas."""
return pd.Series(
pd.Categorical.from_codes(
- col.replace(np.nan, -1).astype(int),
+ cast(Sequence[int], col.replace(np.nan, -1).astype(int)),
categories=cast(CategoricalDtype, self.dtype).categories,
ordered=cast(CategoricalDtype, self.dtype).ordered,
)
diff --git a/python/pyspark/pandas/data_type_ops/null_ops.py
b/python/pyspark/pandas/data_type_ops/null_ops.py
index 329a3790df6b..1c3296011b61 100644
--- a/python/pyspark/pandas/data_type_ops/null_ops.py
+++ b/python/pyspark/pandas/data_type_ops/null_ops.py
@@ -17,7 +17,7 @@
from typing import Any, Union
-from pandas.api.types import CategoricalDtype, is_list_like # type: ignore[attr-defined]
+from pandas.api.types import CategoricalDtype, is_list_like
from pyspark.pandas._typing import Dtype, IndexOpsLike
from pyspark.pandas.data_type_ops.base import (
diff --git a/python/pyspark/pandas/data_type_ops/num_ops.py
b/python/pyspark/pandas/data_type_ops/num_ops.py
index 499c921fc1e7..31cdd7e8cc07 100644
--- a/python/pyspark/pandas/data_type_ops/num_ops.py
+++ b/python/pyspark/pandas/data_type_ops/num_ops.py
@@ -21,7 +21,7 @@ from typing import Any, Union, Callable, cast
import numpy as np
import pandas as pd
-from pandas.api.types import ( # type: ignore[attr-defined]
+from pandas.api.types import (
is_bool_dtype,
is_integer_dtype,
is_float_dtype,
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index e7ec1ea28b65..1d0c0fc638b1 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -38,6 +38,7 @@ from typing import (
Iterable,
Iterator,
List,
+ Literal,
Optional,
Sequence,
Tuple,
@@ -52,20 +53,20 @@ import datetime
import numpy as np
import pandas as pd
-from pandas.api.types import ( # type: ignore[attr-defined]
+from pandas.api.types import (
is_bool_dtype,
is_list_like,
is_dict_like,
is_scalar,
)
-from pandas.tseries.frequencies import DateOffset, to_offset
+from pandas.tseries.frequencies import DateOffset, to_offset # type: ignore[attr-defined]
if TYPE_CHECKING:
from pandas.io.formats.style import Styler
-from pandas.core.dtypes.common import infer_dtype_from_object
-from pandas.core.accessor import CachedAccessor
-from pandas.core.dtypes.inference import is_sequence
+from pandas.core.dtypes.common import infer_dtype_from_object # type: ignore[attr-defined]
+from pandas.core.accessor import CachedAccessor # type: ignore[attr-defined]
+from pandas.core.dtypes.inference import is_sequence # type: ignore[attr-defined]
from pyspark.errors import PySparkValueError
from pyspark import StorageLevel
@@ -3193,7 +3194,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
should_retain_index = should_infer_schema
def apply_func(pdf: pd.DataFrame) -> pd.DataFrame:
- pdf_or_pser = pdf.apply(func, axis=axis, args=args, **kwds) # type: ignore[arg-type]
+ pdf_or_pser = pdf.apply(func, axis=axis, args=args, **kwds)
if isinstance(pdf_or_pser, pd.Series):
return pdf_or_pser.to_frame()
else:
@@ -3211,7 +3212,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
)
limit = get_option("compute.shortcut_limit")
pdf = self_applied.head(limit + 1)._to_internal_pandas()
- applied = pdf.apply(func, axis=axis, args=args, **kwds) # type: ignore[arg-type]
+ applied = pdf.apply(func, axis=axis, args=args, **kwds)
psser_or_psdf = ps.from_pandas(applied)
if len(pdf) <= limit:
return psser_or_psdf
@@ -3458,7 +3459,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
)
limit = get_option("compute.shortcut_limit")
pdf = self.head(limit + 1)._to_internal_pandas()
- transformed = pdf.transform(func, axis, *args, **kwargs) # type: ignore[arg-type]
+ transformed = pdf.transform(func, axis, *args, **kwargs)
psdf: DataFrame = DataFrame(transformed)
if len(pdf) <= limit:
return psdf
@@ -5780,7 +5781,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
2 0.0 0.0 1.0
"""
return DataFrame(
- pd.DataFrame.from_records(data, index, exclude, columns, coerce_float, nrows)
+ pd.DataFrame.from_records(data, index, exclude, columns, coerce_float, nrows) # type: ignore[arg-type]
)
def to_records(
@@ -6970,8 +6971,8 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
psdf_columns = psdf.columns
if isinstance(psdf_columns, pd.MultiIndex):
- psdf.columns = psdf_columns.set_levels(
- psdf_columns.levels[-1].astype( # type: ignore[index]
+ psdf.columns = psdf_columns.set_levels( # type: ignore[call-overload]
+ psdf_columns.levels[-1].astype(
spark_type_to_pandas_dtype(self._psser_for(columns).spark.data_type)
),
level=-1,
@@ -7496,7 +7497,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
return DataFrame(internal)
else:
psdf = self.copy()
- psdf.columns = psdf.columns.droplevel(level) # type: ignore[arg-type]
+ psdf.columns = psdf.columns.droplevel(level)
return psdf
def drop(
@@ -10367,9 +10368,10 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
labels.append(label)
if isinstance(columns, pd.Index):
- column_label_names = [
- name if is_name_like_tuple(name) else (name,) for name in columns.names
- ]
+ column_label_names = cast(
+ list[tuple[Any]],
+ [name if is_name_like_tuple(name) else (name,) for name in columns.names],
+ )
internal = self._internal.with_new_columns(
scols_or_pssers, column_labels=labels,
column_label_names=column_label_names
)
@@ -11283,7 +11285,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
)
else:
# axis=None case - return single boolean value
- return bool(self.any(axis=1, bool_only=bool_only, skipna=skipna).any()) # type: ignore
+ return bool(self.any(axis=1, bool_only=bool_only, skipna=skipna).any())
def _bool_column_labels(self, column_labels: List[Label]) -> List[Label]:
"""
@@ -11291,7 +11293,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
"""
# Rely on dtype rather than spark type because columns that consist of
bools and
# Nones should be excluded if bool_only is True
- return [label for label in column_labels if is_bool_dtype(self._psser_for(label))]
+ return [label for label in column_labels if is_bool_dtype(self._psser_for(label))] # type: ignore[arg-type]
def _result_aggregated(
self, column_labels: List[Label], scols: Sequence[PySparkColumn]
@@ -12404,7 +12406,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
buf=buf,
max_cols=max_cols,
memory_usage=False,
- show_counts=show_counts, # type: ignore
+ show_counts=show_counts,
)
finally:
del self._data
@@ -13416,8 +13418,8 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
row_2 10 20 30 40
"""
return DataFrame(
- pd.DataFrame.from_dict(
- data, orient=orient, dtype=dtype, columns=columns # type: ignore[arg-type]
+ pd.DataFrame.from_dict( # type: ignore[call-overload]
+ data, orient=orient, dtype=dtype, columns=columns
)
)
@@ -13445,8 +13447,8 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
def resample(
self,
rule: str,
- closed: Optional[str] = None,
- label: Optional[str] = None,
+ closed: Optional[Literal["left", "right"]] = None,
+ label: Optional[Literal["left", "right"]] = None,
on: Optional["Series"] = None,
) -> "DataFrameResampler":
"""
diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py
index 14c2ec410589..6d01d486e64a 100644
--- a/python/pyspark/pandas/generic.py
+++ b/python/pyspark/pandas/generic.py
@@ -37,7 +37,7 @@ import warnings
import numpy as np
import pandas as pd
-from pandas.api.types import is_list_like # type: ignore[attr-defined]
+from pandas.api.types import is_list_like
from pyspark.sql import Column, functions as F
from pyspark.sql.internal import InternalFunction as SF
@@ -946,7 +946,7 @@ class Frame(object, metaclass=ABCMeta):
psdf_or_ser = self
pdf = psdf_or_ser._to_pandas()
if isinstance(self, ps.Series):
- pdf = pdf.to_frame()
+ pdf = pdf.to_frame() # type: ignore[operator]
# To make the format consistent and readable by `read_json`,
convert it to pandas' and
# use 'records' orient for now.
return pdf.to_json(orient="records")
@@ -2665,7 +2665,7 @@ class Frame(object, metaclass=ABCMeta):
with sql_conf({SPARK_CONF_ARROW_ENABLED: False}):
# Disable Arrow to keep row ordering.
- first_valid_row = (
+ first_valid_row_df = (
self._internal.spark_frame.filter(cond)
.select(self._internal.index_spark_columns)
.limit(1)
@@ -2673,10 +2673,10 @@ class Frame(object, metaclass=ABCMeta):
)
# For Empty Series or DataFrame, returns None.
- if len(first_valid_row) == 0:
+ if len(first_valid_row_df) == 0:
return None
- first_valid_row = first_valid_row.iloc[0]
+ first_valid_row = first_valid_row_df.iloc[0]
if len(first_valid_row) == 1:
return first_valid_row.iloc[0]
else:
@@ -3085,7 +3085,7 @@ class Frame(object, metaclass=ABCMeta):
# If DataFrame has only a single value, use pandas API directly.
if has_single_value:
result = self._to_internal_pandas().squeeze(axis)
- return ps.Series(result) if isinstance(result, pd.Series) else result
+ return ps.Series(result) if isinstance(result, pd.Series) else result # type: ignore[return-value]
elif axis == 0:
return self
else:
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index 24a6dbe2e40f..bdc6a66448e0 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -44,8 +44,8 @@ from typing import (
import warnings
import pandas as pd
-from pandas.api.types import is_number, is_hashable, is_list_like # type: ignore[attr-defined]
-from pandas.core.common import _builtin_table # type: ignore[attr-defined]
+from pandas.api.types import is_number, is_hashable, is_list_like
+from pandas.core.common import _builtin_table # type: ignore[import-untyped]
from pyspark.sql import Column, DataFrame as SparkDataFrame, Window, functions
as F
from pyspark.sql.internal import InternalFunction as SF
@@ -2179,7 +2179,7 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
if is_series_groupby:
def pandas_filter(pdf: pd.DataFrame) -> pd.DataFrame:
- return pd.DataFrame(pdf.groupby(groupkey_names)[pdf.columns[-1]].filter(func))
+ return pd.DataFrame(pdf.groupby(groupkey_names)[pdf.columns[-1]].filter(func)) # type: ignore[arg-type]
else:
f = _builtin_table.get(func, func)
@@ -2260,7 +2260,7 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
)
# Just positionally map the column names to given schema's.
- pdf.columns = return_schema.names
+ pdf.columns = pd.Index(return_schema.names)
return pdf
diff --git a/python/pyspark/pandas/indexes/base.py
b/python/pyspark/pandas/indexes/base.py
index a7890e26f995..420f4d5e1f50 100644
--- a/python/pyspark/pandas/indexes/base.py
+++ b/python/pyspark/pandas/indexes/base.py
@@ -32,7 +32,7 @@ import warnings
import pandas as pd
import numpy as np
-from pandas.api.types import ( # type: ignore[attr-defined]
+from pandas.api.types import (
is_list_like,
is_bool_dtype,
is_integer_dtype,
@@ -40,9 +40,9 @@ from pandas.api.types import ( # type: ignore[attr-defined]
is_numeric_dtype,
is_object_dtype,
)
-from pandas.core.accessor import CachedAccessor
-from pandas.io.formats.printing import pprint_thing
-from pandas.api.types import CategoricalDtype, is_hashable # type: ignore[attr-defined]
+from pandas.core.accessor import CachedAccessor # type: ignore[attr-defined]
+from pandas.io.formats.printing import pprint_thing # type: ignore[import-untyped]
+from pandas.api.types import CategoricalDtype, is_hashable
from pandas._libs import lib
from pyspark.sql.column import Column
@@ -255,9 +255,7 @@ class Index(IndexOpsMixin):
)
return DataFrame(internal).index
- spark: "SparkIndexOpsMethods" = CachedAccessor( # type: ignore[assignment]
- "spark", SparkIndexMethods
- )
+ spark: "SparkIndexOpsMethods" = CachedAccessor("spark", SparkIndexMethods)
# This method is used via `DataFrame.info` API internally.
def _summary(self, name: Optional[str] = None) -> str:
@@ -2241,7 +2239,7 @@ class Index(IndexOpsMixin):
raise ValueError("index must be monotonic increasing or
decreasing")
result = sdf.toPandas().iloc[0, 0]
- return result if result is not None else np.nan
+ return result if result is not None else np.nan # type: ignore[return-value]
def _index_fields_for_union_like(
self: "Index", other: "Index", func_name: str
diff --git a/python/pyspark/pandas/indexes/category.py
b/python/pyspark/pandas/indexes/category.py
index 85dbe7654efa..da22c029633b 100644
--- a/python/pyspark/pandas/indexes/category.py
+++ b/python/pyspark/pandas/indexes/category.py
@@ -17,7 +17,7 @@
from typing import Any, Callable, List, Optional, Union, cast, no_type_check
import pandas as pd
-from pandas.api.types import is_hashable, CategoricalDtype # type: ignore[attr-defined]
+from pandas.api.types import is_hashable, CategoricalDtype
from pyspark import pandas as ps
from pyspark.pandas.indexes.base import Index
diff --git a/python/pyspark/pandas/indexes/datetimes.py
b/python/pyspark/pandas/indexes/datetimes.py
index 25a09f9e2d20..ed7862da1009 100644
--- a/python/pyspark/pandas/indexes/datetimes.py
+++ b/python/pyspark/pandas/indexes/datetimes.py
@@ -20,7 +20,7 @@ from functools import partial
from typing import Any, Optional, Union, cast, no_type_check
import pandas as pd
-from pandas.api.types import is_hashable # type: ignore[attr-defined]
+from pandas.api.types import is_hashable
from pandas.tseries.offsets import DateOffset
from pyspark._globals import _NoValue
diff --git a/python/pyspark/pandas/indexes/multi.py
b/python/pyspark/pandas/indexes/multi.py
index b5aae890d50a..56640f5c8802 100644
--- a/python/pyspark/pandas/indexes/multi.py
+++ b/python/pyspark/pandas/indexes/multi.py
@@ -19,7 +19,7 @@ from functools import partial, reduce
from typing import Any, Callable, Iterator, List, Optional, Tuple, Union,
cast, no_type_check
import pandas as pd
-from pandas.api.types import is_hashable, is_list_like # type: ignore[attr-defined]
+from pandas.api.types import is_hashable, is_list_like
from pyspark.sql import functions as F, Column as PySparkColumn, Window
from pyspark.sql.types import DataType
diff --git a/python/pyspark/pandas/indexes/timedelta.py
b/python/pyspark/pandas/indexes/timedelta.py
index 3457ebb5bc58..8ffebe2acf0b 100644
--- a/python/pyspark/pandas/indexes/timedelta.py
+++ b/python/pyspark/pandas/indexes/timedelta.py
@@ -19,7 +19,7 @@ from typing import cast, no_type_check, Any
from functools import partial
import pandas as pd
-from pandas.api.types import is_hashable # type: ignore[attr-defined]
+from pandas.api.types import is_hashable
import numpy as np
from pyspark import pandas as ps
diff --git a/python/pyspark/pandas/indexing.py
b/python/pyspark/pandas/indexing.py
index 5d9d1bc7eea6..9212dc96ec3d 100644
--- a/python/pyspark/pandas/indexing.py
+++ b/python/pyspark/pandas/indexing.py
@@ -24,7 +24,7 @@ from functools import reduce
from typing import Any, Optional, List, Tuple, TYPE_CHECKING, Union, cast,
Sized
import pandas as pd
-from pandas.api.types import is_list_like # type: ignore[attr-defined]
+from pandas.api.types import is_list_like
import numpy as np
from pyspark.sql import functions as F, Column as PySparkColumn
@@ -174,7 +174,7 @@ class AtIndexer(IndexerLike):
values = pdf.iloc[:, 0].values
return (
- values if (len(row_sel) < self._internal.index_level or len(values) > 1) else values[0]
+ values if (len(row_sel) < self._internal.index_level or len(values) > 1) else values[0] # type: ignore[return-value]
)
@@ -575,7 +575,7 @@ class LocIndexerLike(IndexerLike, metaclass=ABCMeta):
if length == 0:
raise KeyError(name_like_string(key))
elif length == 1:
- return pdf_or_pser.iloc[0]
+ return pdf_or_pser.iloc[0] # type: ignore[return-value]
else:
return psdf_or_psser
else:
diff --git a/python/pyspark/pandas/internal.py
b/python/pyspark/pandas/internal.py
index db6c0ebd3d09..fab553f71fc7 100644
--- a/python/pyspark/pandas/internal.py
+++ b/python/pyspark/pandas/internal.py
@@ -1562,12 +1562,12 @@ class InternalFrame:
pdf = pdf.copy()
data_columns = [name_like_string(col) for col in pdf.columns]
- pdf.columns = data_columns
+ pdf.columns = pd.Index(data_columns)
if retain_index:
index_nlevels = pdf.index.nlevels
index_columns = [SPARK_INDEX_NAME_FORMAT(i) for i in
range(index_nlevels)]
- pdf.index.names = index_columns
+ pdf.index.names = index_columns # type: ignore[assignment]
reset_index = pdf.reset_index()
else:
index_nlevels = 0
diff --git a/python/pyspark/pandas/namespace.py
b/python/pyspark/pandas/namespace.py
index 094a16e1a12c..0170e4533424 100644
--- a/python/pyspark/pandas/namespace.py
+++ b/python/pyspark/pandas/namespace.py
@@ -23,11 +23,13 @@ from typing import (
Callable,
Dict,
List,
+ Literal,
Optional,
Set,
Sized,
Tuple,
Type,
+ TYPE_CHECKING,
Union,
cast,
no_type_check,
@@ -42,7 +44,7 @@ import warnings
import numpy as np
import pandas as pd
-from pandas.api.types import ( # type: ignore[attr-defined]
+from pandas.api.types import (
is_datetime64_dtype,
is_list_like,
)
@@ -100,6 +102,9 @@ from pyspark.pandas.spark.utils import
as_nullable_spark_type, force_decimal_pre
from pyspark.pandas.indexes import Index, DatetimeIndex, TimedeltaIndex
from pyspark.pandas.indexes.multi import MultiIndex
+if TYPE_CHECKING:
+ from pandas._typing import HTMLFlavors
+
__all__ = [
"from_pandas",
"range",
@@ -1137,7 +1142,7 @@ def read_excel(
sn: Union[str, int, List[Union[str, int]], None],
nr: Optional[int] = None,
) -> pd.DataFrame:
- return pd.read_excel(
+ return pd.read_excel( # type: ignore[call-overload, misc]
io=BytesIO(io_or_bin) if isinstance(io_or_bin, (bytes, bytearray))
else io_or_bin,
sheet_name=sn,
header=header,
@@ -1154,7 +1159,7 @@ def read_excel(
na_values=na_values,
keep_default_na=keep_default_na,
verbose=verbose,
- parse_dates=parse_dates, # type: ignore[arg-type]
+ parse_dates=parse_dates,
date_parser=date_parser,
thousands=thousands,
comment=comment,
@@ -1260,7 +1265,7 @@ def read_excel(
def read_html(
io: Union[str, Any],
match: str = ".+",
- flavor: Optional[str] = None,
+ flavor: Optional["HTMLFlavors"] = None,
header: Optional[Union[int, List[int]]] = None,
index_col: Optional[Union[int, List[int]]] = None,
skiprows: Optional[Union[int, List[int], slice]] = None,
@@ -1917,7 +1922,7 @@ def date_range(
return cast(
DatetimeIndex,
ps.from_pandas(
- pd.date_range(
+ pd.date_range( # type: ignore[call-overload]
start=start,
end=end,
periods=periods,
@@ -2032,7 +2037,7 @@ def timedelta_range(
periods: Optional[int] = None,
freq: Optional[Union[str, DateOffset]] = None,
name: Optional[str] = None,
- closed: Optional[str] = None,
+ closed: Optional[Literal["left", "right"]] = None,
) -> TimedeltaIndex:
"""
Return a fixed frequency TimedeltaIndex, with day as the default frequency.
@@ -2101,7 +2106,7 @@ def timedelta_range(
return cast(
TimedeltaIndex,
ps.from_pandas(
- pd.timedelta_range(
+ pd.timedelta_range( # type: ignore[call-overload]
start=start,
end=end,
periods=periods,
diff --git a/python/pyspark/pandas/plot/core.py
b/python/pyspark/pandas/plot/core.py
index 2dfb6e7e2980..8c50d7401918 100644
--- a/python/pyspark/pandas/plot/core.py
+++ b/python/pyspark/pandas/plot/core.py
@@ -20,7 +20,7 @@ import math
import pandas as pd
import numpy as np
-from pandas.core.base import PandasObject
+from pandas.core.base import PandasObject # type: ignore[attr-defined]
from pandas.core.dtypes.inference import is_integer
from pyspark.sql import functions as F, Column
diff --git a/python/pyspark/pandas/plot/matplotlib.py
b/python/pyspark/pandas/plot/matplotlib.py
index 7dff6adbdea7..616cf1de340e 100644
--- a/python/pyspark/pandas/plot/matplotlib.py
+++ b/python/pyspark/pandas/plot/matplotlib.py
@@ -24,8 +24,8 @@ import numpy as np
from matplotlib.axes._base import _process_plot_format # type:
ignore[attr-defined]
from matplotlib.figure import Figure
from pandas.core.dtypes.inference import is_list_like
-from pandas.io.formats.printing import pprint_thing
-from pandas.plotting._matplotlib import ( # type: ignore[attr-defined]
+from pandas.io.formats.printing import pprint_thing # type: ignore[import-untyped]
+from pandas.plotting._matplotlib import ( # type: ignore[import-untyped]
BarPlot as PandasBarPlot,
BoxPlot as PandasBoxPlot,
HistPlot as PandasHistPlot,
@@ -37,7 +37,7 @@ from pandas.plotting._matplotlib import ( # type:
ignore[attr-defined]
KdePlot as PandasKdePlot,
)
from pandas.plotting._core import PlotAccessor
-from pandas.plotting._matplotlib.core import MPLPlot as PandasMPLPlot
+from pandas.plotting._matplotlib.core import MPLPlot as PandasMPLPlot # type: ignore[import-untyped]
from pyspark.pandas.plot import (
TopNPlotBase,
diff --git a/python/pyspark/pandas/resample.py
b/python/pyspark/pandas/resample.py
index 152bf90e60cf..6fd6382ca4a6 100644
--- a/python/pyspark/pandas/resample.py
+++ b/python/pyspark/pandas/resample.py
@@ -24,6 +24,7 @@ from typing import (
Any,
Generic,
List,
+ Literal,
Optional,
)
@@ -82,8 +83,8 @@ class Resampler(Generic[FrameLike], metaclass=ABCMeta):
psdf: DataFrame,
resamplekey: Optional[Series],
rule: str,
- closed: Optional[str] = None,
- label: Optional[str] = None,
+ closed: Optional[Literal["left", "right"]] = None,
+ label: Optional[Literal["left", "right"]] = None,
agg_columns: List[Series] = [],
):
self._psdf = psdf
@@ -96,6 +97,7 @@ class Resampler(Generic[FrameLike], metaclass=ABCMeta):
if not getattr(self._offset, "n") > 0:
raise ValueError("rule offset must be positive")
+ self._closed: Literal["left", "right"]
if closed is None:
self._closed = "right" if self._offset.rule_code in ["A-DEC", "M",
"ME"] else "left"
elif closed in ["left", "right"]:
@@ -103,6 +105,7 @@ class Resampler(Generic[FrameLike], metaclass=ABCMeta):
else:
raise ValueError("invalid closed: '{}'".format(closed))
+ self._label: Literal["left", "right"]
if label is None:
self._label = "right" if self._offset.rule_code in ["A-DEC", "M",
"ME"] else "left"
elif label in ["left", "right"]:
@@ -704,8 +707,8 @@ class DataFrameResampler(Resampler[DataFrame]):
psdf: DataFrame,
resamplekey: Optional[Series],
rule: str,
- closed: Optional[str] = None,
- label: Optional[str] = None,
+ closed: Optional[Literal["left", "right"]] = None,
+ label: Optional[Literal["left", "right"]] = None,
agg_columns: List[Series] = [],
):
super().__init__(
@@ -735,8 +738,8 @@ class SeriesResampler(Resampler[Series]):
psser: Series,
resamplekey: Optional[Series],
rule: str,
- closed: Optional[str] = None,
- label: Optional[str] = None,
+ closed: Optional[Literal["left", "right"]] = None,
+ label: Optional[Literal["left", "right"]] = None,
agg_columns: List[Series] = [],
):
super().__init__(
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index e5c8890b646c..6407749c14fc 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -32,6 +32,7 @@ from typing import (
IO,
Iterable,
List,
+ Literal,
Optional,
Sequence,
Tuple,
@@ -46,14 +47,14 @@ from typing import (
import numpy as np
import pandas as pd
-from pandas.core.accessor import CachedAccessor
-from pandas.io.formats.printing import pprint_thing
-from pandas.api.types import ( # type: ignore[attr-defined]
+from pandas.core.accessor import CachedAccessor # type: ignore[attr-defined]
+from pandas.io.formats.printing import pprint_thing # type: ignore[import-untyped]
+from pandas.api.types import (
is_list_like,
is_hashable,
CategoricalDtype,
)
-from pandas.tseries.frequencies import DateOffset
+from pandas.tseries.frequencies import DateOffset # type: ignore[attr-defined]
from pyspark.sql import (
functions as F,
@@ -513,9 +514,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
)
return first_series(DataFrame(internal))
- spark: "SparkIndexOpsMethods" = CachedAccessor( # type: ignore[assignment]
- "spark", SparkSeriesMethods
- )
+ spark: "SparkIndexOpsMethods" = CachedAccessor("spark", SparkSeriesMethods)
@property
def dtypes(self) -> Dtype:
@@ -7160,8 +7159,8 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
def resample(
self,
rule: str_type,
- closed: Optional[str_type] = None,
- label: Optional[str_type] = None,
+ closed: Optional[Literal["left", "right"]] = None,
+ label: Optional[Literal["left", "right"]] = None,
on: Optional["Series"] = None,
) -> "SeriesResampler":
"""
@@ -7363,7 +7362,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
)
)
return rest + footer
- return pser.to_string(name=self.name, dtype=self.dtype)
+ return pser.to_string(name=self.name, dtype=self.dtype) # type: ignore[call-overload]
def __dir__(self) -> Iterable[str_type]:
if not isinstance(self.spark.data_type, StructType):
diff --git a/python/pyspark/pandas/testing.py b/python/pyspark/pandas/testing.py
index 49ec6081338a..dccd119ed1eb 100644
--- a/python/pyspark/pandas/testing.py
+++ b/python/pyspark/pandas/testing.py
@@ -146,8 +146,8 @@ def assert_frame_equal(
left,
right,
check_dtype=check_dtype,
- check_index_type=check_index_type, # type: ignore[arg-type]
- check_column_type=check_column_type, # type: ignore[arg-type]
+ check_index_type=check_index_type,
+ check_column_type=check_column_type,
check_frame_type=check_frame_type,
check_names=check_names,
by_blocks=by_blocks,
@@ -238,11 +238,11 @@ def assert_series_equal(
if isinstance(right, ps.Series):
right = right.to_pandas()
- pd.testing.assert_series_equal( # type: ignore[call-arg]
+ pd.testing.assert_series_equal( # type: ignore[call-overload]
left,
right,
check_dtype=check_dtype,
- check_index_type=check_index_type, # type: ignore[arg-type]
+ check_index_type=check_index_type,
check_series_type=check_series_type,
check_names=check_names,
check_exact=check_exact,
@@ -251,8 +251,8 @@ def assert_series_equal(
check_category_order=check_category_order,
check_freq=check_freq,
check_flags=check_flags,
- rtol=rtol, # type: ignore[arg-type]
- atol=atol, # type: ignore[arg-type]
+ rtol=rtol,
+ atol=atol,
obj=obj,
check_index=check_index,
check_like=check_like,
@@ -314,7 +314,7 @@ def assert_index_equal(
if isinstance(right, ps.Index):
right = right.to_pandas()
- pd.testing.assert_index_equal( # type: ignore[call-arg]
+ pd.testing.assert_index_equal(
left,
right,
exact=exact,
diff --git a/python/pyspark/pandas/typedef/typehints.py
b/python/pyspark/pandas/typedef/typehints.py
index 8d2fd4a3c0af..a8e4ae0e4aaf 100644
--- a/python/pyspark/pandas/typedef/typehints.py
+++ b/python/pyspark/pandas/typedef/typehints.py
@@ -28,7 +28,7 @@ from typing import Any, Callable, Generic, List, Tuple,
Union, Type, get_type_hi
import numpy as np
import pandas as pd
-from pandas.api.types import CategoricalDtype, pandas_dtype # type: ignore[attr-defined]
+from pandas.api.types import CategoricalDtype, pandas_dtype
from pandas.api.extensions import ExtensionDtype
diff --git a/python/pyspark/pandas/usage_logging/__init__.py
b/python/pyspark/pandas/usage_logging/__init__.py
index 4478b6c85f66..29a858c199d3 100644
--- a/python/pyspark/pandas/usage_logging/__init__.py
+++ b/python/pyspark/pandas/usage_logging/__init__.py
@@ -117,7 +117,7 @@ def attach(logger_module: Union[str, ModuleType]) -> None:
sql_formatter._CAPTURE_SCOPES = 4
modules.append(sql_formatter)
- missings = [
+ missings: list[tuple[Union[type, ModuleType], type]] = [
(pd, MissingPandasLikeGeneralFunctions),
(pd.DataFrame, MissingPandasLikeDataFrame),
(pd.Series, MissingPandasLikeSeries),
@@ -132,7 +132,7 @@ def attach(logger_module: Union[str, ModuleType]) -> None:
(pd.core.window.RollingGroupby, MissingPandasLikeRollingGroupby),
(pd.core.window.ExponentialMovingWindow,
MissingPandasLikeExponentialMoving),
(
- pd.core.window.ExponentialMovingWindowGroupby, # type: ignore[attr-defined]
+ pd.core.window.ExponentialMovingWindowGroupby,
MissingPandasLikeExponentialMovingGroupby,
),
]
diff --git a/python/pyspark/pandas/utils.py b/python/pyspark/pandas/utils.py
index 6a0561fad9c5..6cd531043bbc 100644
--- a/python/pyspark/pandas/utils.py
+++ b/python/pyspark/pandas/utils.py
@@ -29,6 +29,7 @@ from typing import (
Dict,
Iterator,
List,
+ Literal,
Optional,
Tuple,
Union,
@@ -40,7 +41,7 @@ from typing import (
import warnings
import pandas as pd
-from pandas.api.types import is_list_like # type: ignore[attr-defined]
+from pandas.api.types import is_list_like
from pyspark.sql import functions as F, Column, DataFrame as PySparkDataFrame,
SparkSession
from pyspark.sql.types import DoubleType
@@ -745,14 +746,14 @@ def is_name_like_value(
return True
-def validate_axis(axis: Optional[Axis] = 0, none_axis: int = 0) -> int:
+def validate_axis(axis: Optional[Axis] = 0, none_axis: Literal[0, 1] = 0) -> Literal[0, 1]:
"""Check the given axis is valid."""
# convert to numeric axis
axis = cast(Dict[Optional[Axis], int], {None: none_axis, "index": 0,
"columns": 1}).get(
axis, axis
)
if axis in (none_axis, 0, 1):
- return cast(int, axis)
+ return axis # type: ignore[return-value]
else:
raise ValueError("No axis named {0}".format(axis))
diff --git a/python/pyspark/sql/connect/session.py
b/python/pyspark/sql/connect/session.py
index b432ea224045..2288677763d3 100644
--- a/python/pyspark/sql/connect/session.py
+++ b/python/pyspark/sql/connect/session.py
@@ -46,10 +46,7 @@ from typing import (
import numpy as np
import pandas as pd
import pyarrow as pa
-from pandas.api.types import ( # type: ignore[attr-defined]
- is_datetime64_dtype,
- is_timedelta64_dtype,
-)
+from pandas.api.types import is_datetime64_dtype, is_timedelta64_dtype
import urllib
from pyspark.sql.connect.dataframe import DataFrame
diff --git a/python/pyspark/sql/pandas/conversion.py
b/python/pyspark/sql/pandas/conversion.py
index ea41e70c37d2..333f9803df3a 100644
--- a/python/pyspark/sql/pandas/conversion.py
+++ b/python/pyspark/sql/pandas/conversion.py
@@ -163,7 +163,7 @@ def _convert_arrow_table_to_pandas(
)
# Restore original column names (including duplicates)
- pdf.columns = schema.names
+ pdf.columns = pd.Index(schema.names)
return pdf
@@ -294,9 +294,7 @@ class PandasConversionMixin:
# Below is toPandas without Arrow optimization.
rows = self.collect()
if len(rows) > 0:
- pdf = pd.DataFrame.from_records(
- rows, index=range(len(rows)), columns=self.columns # type: ignore[arg-type]
- )
+ pdf = pd.DataFrame.from_records(rows, index=range(len(rows)), columns=self.columns)
else:
pdf = pd.DataFrame(columns=self.columns)
@@ -698,7 +696,7 @@ class SparkConversionMixin:
conv = _converter(data_type)
if conv is not None:
- return lambda pser: pser.apply(conv) # type: ignore[return-value]
+ return lambda pser: pser.apply(conv)
else:
return lambda pser: pser
@@ -744,7 +742,7 @@ class SparkConversionMixin:
# Convert pandas.DataFrame to list of numpy records
np_records = pdf.set_axis(
- [f"col_{i}" for i in range(len(pdf.columns))], axis="columns" #
type: ignore[arg-type]
+ [f"col_{i}" for i in range(len(pdf.columns))], axis="columns"
).to_records(index=False)
# Check if any columns need to be fixed for Spark to infer properly
@@ -825,9 +823,7 @@ class SparkConversionMixin:
require_minimum_pyarrow_version()
import pandas as pd
- from pandas.api.types import ( # type: ignore[attr-defined]
- is_datetime64_dtype,
- )
+ from pandas.api.types import is_datetime64_dtype
import pyarrow as pa
# Create the Spark schema from list of names passed in with Arrow types
diff --git a/python/pyspark/sql/pandas/types.py
b/python/pyspark/sql/pandas/types.py
index 5ccbd37f0d24..5d1e1aae8be9 100644
--- a/python/pyspark/sql/pandas/types.py
+++ b/python/pyspark/sql/pandas/types.py
@@ -708,9 +708,7 @@ def _check_series_convert_timestamps_internal(
require_minimum_pandas_version()
import pandas as pd
- from pandas.api.types import ( # type: ignore[attr-defined]
- is_datetime64_dtype,
- )
+ from pandas.api.types import is_datetime64_dtype
# TODO: handle nested timestamps, such as ArrayType(TimestampType())?
if is_datetime64_dtype(s.dtype):
@@ -776,9 +774,7 @@ def _check_series_convert_timestamps_localize(
require_minimum_pandas_version()
import pandas as pd
- from pandas.api.types import ( # type: ignore[attr-defined]
- is_datetime64_dtype,
- )
+ from pandas.api.types import is_datetime64_dtype
from_tz = from_timezone or _get_local_timezone()
to_tz = to_timezone or _get_local_timezone()
@@ -848,7 +844,7 @@ def _convert_map_items_to_dict(s: "PandasSeriesLike") ->
"PandasSeriesLike":
:param s: pandas.Series of lists of (key, value) pairs
:return: pandas.Series of dictionaries
"""
- return cast("PandasSeriesLike", s.apply(lambda m: None if m is None else
{k: v for k, v in m}))
+ return s.apply(lambda m: None if m is None else {k: v for k, v in m})
def _convert_dict_to_map_items(s: "PandasSeriesLike") -> "PandasSeriesLike":
@@ -858,7 +854,7 @@ def _convert_dict_to_map_items(s: "PandasSeriesLike") ->
"PandasSeriesLike":
:param s: pandas.Series of dictionaries
:return: pandas.Series of lists of (key, value) pairs
"""
- return cast("PandasSeriesLike", s.apply(lambda d: list(d.items()) if d is
not None else None))
+ return s.apply(lambda d: list(d.items()) if d is not None else None)
def _to_corrected_pandas_type(dt: DataType) -> Optional[Any]:
@@ -982,7 +978,7 @@ def _create_converter_to_pandas(
def correct_dtype(pser: pd.Series) -> pd.Series:
if pser.isnull().any():
- return pser.astype(nullable_type, copy=False)
+ return pser.astype(nullable_type, copy=False) # type: ignore[arg-type]
else:
return pser.astype(pandas_type, copy=False)
@@ -1279,9 +1275,7 @@ def _create_converter_to_pandas(
conv = _converter(data_type, struct_in_pandas, ndarray_as_list)
if conv is not None:
- return lambda pser: pser.apply( # type: ignore[return-value]
- lambda x: conv(x) if x is not None else None
- )
+ return lambda pser: pser.apply(lambda x: conv(x) if x is not None else None)
else:
return lambda pser: pser
@@ -1344,10 +1338,8 @@ def _create_converter_from_pandas(
# lambda x: Decimal(x))).cast(pa.decimal128(1))
def convert_int_to_decimal(pser: pd.Series) -> pd.Series:
- if pd.api.types.is_integer_dtype(pser): # type: ignore[attr-defined]
- return pser.apply( # type: ignore[return-value]
- lambda x: Decimal(x) if pd.notna(x) else None
- )
+ if pd.api.types.is_integer_dtype(pser):
+ return pser.apply(lambda x: Decimal(x) if pd.notna(x) else None)
else:
return pser
@@ -1581,9 +1573,7 @@ def _create_converter_from_pandas(
conv = _converter(data_type)
if conv is not None:
- return lambda pser: pser.apply( # type: ignore[return-value]
- lambda x: conv(x) if x is not None else None
- )
+ return lambda pser: pser.apply(lambda x: conv(x) if x is not None else None)
else:
return lambda pser: pser
@@ -1677,5 +1667,5 @@ def convert_pandas_using_numpy_type(
),
):
np_type = _to_numpy_type(field.dataType)
- df[field.name] = df[field.name].astype(np_type)
+ df[field.name] = df[field.name].astype(np_type) # type: ignore[arg-type]
return df
diff --git a/python/pyspark/sql/plot/core.py b/python/pyspark/sql/plot/core.py
index 526f4897d390..cd0d5921370d 100644
--- a/python/pyspark/sql/plot/core.py
+++ b/python/pyspark/sql/plot/core.py
@@ -709,7 +709,7 @@ class PySparkHistogramPlotBase:
for i, input_column_name in enumerate(colnames):
pdf = result[result["__group_id"] == i]
pdf = pdf[["count"]]
- pdf.columns = [input_column_name]
+ pdf.columns = [input_column_name] # type: ignore[assignment]
output_series.append(pdf[input_column_name])
return output_series
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]