This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 2d6541e7044 [SPARK-38755][PYTHON][TEST] Add file to address missing
pandas general functions
2d6541e7044 is described below
commit 2d6541e704493b1b67507563326113285a90d762
Author: itholic <[email protected]>
AuthorDate: Fri Apr 8 11:12:23 2022 +0900
[SPARK-38755][PYTHON][TEST] Add file to address missing pandas general
functions
### What changes were proposed in this pull request?
This PR proposes to add
`python/pyspark/pandas/missing/general_functions.py` to track the missing
[pandas general
functions](https://pandas.pydata.org/docs/reference/general_functions.html) API.
### Why are the changes needed?
We have scripts in the `missing` directory to track & address the missing
pandas APIs, but there is none for general functions yet.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
The existing tests should cover this change.
Closes #36034 from itholic/SPARK-38755.
Authored-by: itholic <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../reference/pyspark.pandas/general_functions.rst | 14 ++++++--
python/pyspark/pandas/__init__.py | 13 ++++++-
python/pyspark/pandas/missing/general_functions.py | 41 ++++++++++++++++++++++
python/pyspark/pandas/tests/test_namespace.py | 17 +++++++++
python/pyspark/pandas/usage_logging/__init__.py | 4 ++-
5 files changed, 84 insertions(+), 5 deletions(-)
diff --git a/python/docs/source/reference/pyspark.pandas/general_functions.rst
b/python/docs/source/reference/pyspark.pandas/general_functions.rst
index 1358a99a97e..a67c0a872e0 100644
--- a/python/docs/source/reference/pyspark.pandas/general_functions.rst
+++ b/python/docs/source/reference/pyspark.pandas/general_functions.rst
@@ -41,6 +41,7 @@ Data manipulations and SQL
melt
merge
+ merge_asof
get_dummies
concat
sql
@@ -52,18 +53,25 @@ Top-level missing data
.. autosummary::
:toctree: api/
- to_numeric
isna
isnull
notna
notnull
-Top-level dealing with datetimelike
+Top-level dealing with numeric data
-----------------------------------
+
+.. autosummary::
+ :toctree: api/
+
+ to_numeric
+
+Top-level dealing with datetimelike data
+----------------------------------------
.. autosummary::
:toctree: api/
to_datetime
date_range
to_timedelta
- timedelta_range
\ No newline at end of file
+ timedelta_range
diff --git a/python/pyspark/pandas/__init__.py
b/python/pyspark/pandas/__init__.py
index a11c496e2ca..56a4f80a13c 100644
--- a/python/pyspark/pandas/__init__.py
+++ b/python/pyspark/pandas/__init__.py
@@ -22,9 +22,11 @@
import os
import sys
-from distutils.version import LooseVersion
import warnings
+from distutils.version import LooseVersion
+from typing import Any
+from pyspark.pandas.missing.general_functions import
_MissingPandasLikeGeneralFunctions
from pyspark.sql.pandas.utils import require_minimum_pandas_version,
require_minimum_pyarrow_version
try:
@@ -151,3 +153,12 @@ _auto_patch_pandas()
from pyspark.pandas.config import get_option, options, option_context,
reset_option, set_option
from pyspark.pandas.namespace import * # noqa: F403
from pyspark.pandas.sql_formatter import sql
+
+
+def __getattr__(key: str) -> Any:
+ if key.startswith("__"):
+ raise AttributeError(key)
+ if hasattr(_MissingPandasLikeGeneralFunctions, key):
+ return getattr(_MissingPandasLikeGeneralFunctions, key)
+ else:
+ raise AttributeError("module 'pyspark.pandas' has no attribute '%s'" %
(key))
diff --git a/python/pyspark/pandas/missing/general_functions.py
b/python/pyspark/pandas/missing/general_functions.py
new file mode 100644
index 00000000000..2fd5b877cc9
--- /dev/null
+++ b/python/pyspark/pandas/missing/general_functions.py
@@ -0,0 +1,41 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from pyspark.pandas.missing import unsupported_function
+
+
+def _unsupported_function(method_name, deprecated=False, reason=""):
+ return unsupported_function(
+ class_name="pd", method_name=method_name, deprecated=deprecated,
reason=reason
+ )
+
+
+class _MissingPandasLikeGeneralFunctions:
+
+ pivot = _unsupported_function("pivot")
+ pivot_table = _unsupported_function("pivot_table")
+ crosstab = _unsupported_function("crosstab")
+ cut = _unsupported_function("cut")
+ qcut = _unsupported_function("qcut")
+ merge_ordered = _unsupported_function("merge_ordered")
+ factorize = _unsupported_function("factorize")
+ unique = _unsupported_function("unique")
+ wide_to_long = _unsupported_function("wide_to_long")
+ bdate_range = _unsupported_function("bdate_range")
+ period_range = _unsupported_function("period_range")
+ infer_freq = _unsupported_function("infer_freq")
+ interval_range = _unsupported_function("interval_range")
+ eval = _unsupported_function("eval")
diff --git a/python/pyspark/pandas/tests/test_namespace.py
b/python/pyspark/pandas/tests/test_namespace.py
index b5290257089..8c5adb9bae5 100644
--- a/python/pyspark/pandas/tests/test_namespace.py
+++ b/python/pyspark/pandas/tests/test_namespace.py
@@ -16,13 +16,16 @@
#
import itertools
+import inspect
import pandas as pd
import numpy as np
from pyspark import pandas as ps
+from pyspark.pandas.exceptions import PandasNotImplementedError
from pyspark.pandas.namespace import _get_index_map, read_delta
from pyspark.pandas.utils import spark_column_equals
+from pyspark.pandas.missing.general_functions import
_MissingPandasLikeGeneralFunctions
from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.testing.sqlutils import SQLTestUtils
@@ -554,6 +557,20 @@ class NamespaceTest(PandasOnSparkTestCase, SQLTestUtils):
lambda: ps.to_numeric(psser, errors="ignore"),
)
+ def test_missing(self):
+ missing_functions = inspect.getmembers(
+ _MissingPandasLikeGeneralFunctions, inspect.isfunction
+ )
+ unsupported_functions = [
+ name for (name, type_) in missing_functions if type_.__name__ ==
"unsupported_function"
+ ]
+ for name in unsupported_functions:
+ with self.assertRaisesRegex(
+ PandasNotImplementedError,
+ "The method.*pd.*{}.*not implemented yet.".format(name),
+ ):
+ getattr(ps, name)()
+
if __name__ == "__main__":
import unittest
diff --git a/python/pyspark/pandas/usage_logging/__init__.py
b/python/pyspark/pandas/usage_logging/__init__.py
index a6f1470b9f4..7f082623c03 100644
--- a/python/pyspark/pandas/usage_logging/__init__.py
+++ b/python/pyspark/pandas/usage_logging/__init__.py
@@ -31,6 +31,7 @@ from pyspark.pandas.indexes.datetimes import DatetimeIndex
from pyspark.pandas.indexes.multi import MultiIndex
from pyspark.pandas.indexes.numeric import Float64Index, Int64Index
from pyspark.pandas.missing.frame import _MissingPandasLikeDataFrame
+from pyspark.pandas.missing.general_functions import
_MissingPandasLikeGeneralFunctions
from pyspark.pandas.missing.groupby import (
MissingPandasLikeDataFrameGroupBy,
MissingPandasLikeSeriesGroupBy,
@@ -109,6 +110,7 @@ def attach(logger_module: Union[str, ModuleType]) -> None:
modules.append(sql_formatter)
missings = [
+ (pd, _MissingPandasLikeGeneralFunctions),
(pd.DataFrame, _MissingPandasLikeDataFrame),
(pd.Series, MissingPandasLikeSeries),
(pd.Index, MissingPandasLikeIndex),
@@ -122,4 +124,4 @@ def attach(logger_module: Union[str, ModuleType]) -> None:
(pd.core.window.RollingGroupby, MissingPandasLikeRollingGroupby),
]
- _attach(logger_module, modules, classes, missings)
+ _attach(logger_module, modules, classes, missings) # type:
ignore[arg-type]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]