[spark] branch master updated: [SPARK-38755][PYTHON][TEST] Add file to address missing pandas general functions

gurwls223 Thu, 07 Apr 2022 19:12:46 -0700

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 2d6541e7044 [SPARK-38755][PYTHON][TEST] Add file to address missing 
pandas general functions
2d6541e7044 is described below

commit 2d6541e704493b1b67507563326113285a90d762
Author: itholic <[email protected]>
AuthorDate: Fri Apr 8 11:12:23 2022 +0900

    [SPARK-38755][PYTHON][TEST] Add file to address missing pandas general 
functions
    
    ### What changes were proposed in this pull request?
    
    This PR proposes to add `python/pyspark/pandas/missing/general_functions.py` to track the missing [pandas general functions](https://pandas.pydata.org/docs/reference/general_functions.html) APIs.
    
    ### Why are the changes needed?
    
    We have scripts in the `missing` directory to track and address the missing pandas APIs, but there was no such script for the general functions.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
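    The existing tests should cover this change; a `test_missing` case is also added to `python/pyspark/pandas/tests/test_namespace.py`.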
    
    Closes #36034 from itholic/SPARK-38755.
    
    Authored-by: itholic <[email protected]>
    Signed-off-by: Hyukjin Kwon <[email protected]>
---
 .../reference/pyspark.pandas/general_functions.rst | 14 ++++++--
 python/pyspark/pandas/__init__.py                  | 13 ++++++-
 python/pyspark/pandas/missing/general_functions.py | 41 ++++++++++++++++++++++
 python/pyspark/pandas/tests/test_namespace.py      | 17 +++++++++
 python/pyspark/pandas/usage_logging/__init__.py    |  4 ++-
 5 files changed, 84 insertions(+), 5 deletions(-)

diff --git a/python/docs/source/reference/pyspark.pandas/general_functions.rst 
b/python/docs/source/reference/pyspark.pandas/general_functions.rst
index 1358a99a97e..a67c0a872e0 100644
--- a/python/docs/source/reference/pyspark.pandas/general_functions.rst
+++ b/python/docs/source/reference/pyspark.pandas/general_functions.rst
@@ -41,6 +41,7 @@ Data manipulations and SQL
 
    melt
    merge
+   merge_asof
    get_dummies
    concat
    sql
@@ -52,18 +53,25 @@ Top-level missing data
 .. autosummary::
    :toctree: api/
 
-   to_numeric
    isna
    isnull
    notna
    notnull
 
-Top-level dealing with datetimelike
+Top-level dealing with numeric data
 -----------------------------------
+
+.. autosummary::
+   :toctree: api/
+
+   to_numeric
+
+Top-level dealing with datetimelike data
+----------------------------------------
 .. autosummary::
    :toctree: api/
 
    to_datetime
    date_range
    to_timedelta
-   timedelta_range
\ No newline at end of file
+   timedelta_range
diff --git a/python/pyspark/pandas/__init__.py 
b/python/pyspark/pandas/__init__.py
index a11c496e2ca..56a4f80a13c 100644
--- a/python/pyspark/pandas/__init__.py
+++ b/python/pyspark/pandas/__init__.py
@@ -22,9 +22,11 @@
 
 import os
 import sys
-from distutils.version import LooseVersion
 import warnings
+from distutils.version import LooseVersion
+from typing import Any
 
+from pyspark.pandas.missing.general_functions import 
_MissingPandasLikeGeneralFunctions
 from pyspark.sql.pandas.utils import require_minimum_pandas_version, 
require_minimum_pyarrow_version
 
 try:
@@ -151,3 +153,12 @@ _auto_patch_pandas()
 from pyspark.pandas.config import get_option, options, option_context, 
reset_option, set_option
 from pyspark.pandas.namespace import *  # noqa: F403
 from pyspark.pandas.sql_formatter import sql
+
+
+def __getattr__(key: str) -> Any:
+    if key.startswith("__"):
+        raise AttributeError(key)
+    if hasattr(_MissingPandasLikeGeneralFunctions, key):
+        return getattr(_MissingPandasLikeGeneralFunctions, key)
+    else:
+        raise AttributeError("module 'pyspark.pandas' has no attribute '%s'" % 
(key))
diff --git a/python/pyspark/pandas/missing/general_functions.py 
b/python/pyspark/pandas/missing/general_functions.py
new file mode 100644
index 00000000000..2fd5b877cc9
--- /dev/null
+++ b/python/pyspark/pandas/missing/general_functions.py
@@ -0,0 +1,41 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from pyspark.pandas.missing import unsupported_function
+
+
+def _unsupported_function(method_name, deprecated=False, reason=""):
+    return unsupported_function(
+        class_name="pd", method_name=method_name, deprecated=deprecated, 
reason=reason
+    )
+
+
+class _MissingPandasLikeGeneralFunctions:
+
+    pivot = _unsupported_function("pivot")
+    pivot_table = _unsupported_function("pivot_table")
+    crosstab = _unsupported_function("crosstab")
+    cut = _unsupported_function("cut")
+    qcut = _unsupported_function("qcut")
+    merge_ordered = _unsupported_function("merge_ordered")
+    factorize = _unsupported_function("factorize")
+    unique = _unsupported_function("unique")
+    wide_to_long = _unsupported_function("wide_to_long")
+    bdate_range = _unsupported_function("bdate_range")
+    period_range = _unsupported_function("period_range")
+    infer_freq = _unsupported_function("infer_freq")
+    interval_range = _unsupported_function("interval_range")
+    eval = _unsupported_function("eval")
diff --git a/python/pyspark/pandas/tests/test_namespace.py 
b/python/pyspark/pandas/tests/test_namespace.py
index b5290257089..8c5adb9bae5 100644
--- a/python/pyspark/pandas/tests/test_namespace.py
+++ b/python/pyspark/pandas/tests/test_namespace.py
@@ -16,13 +16,16 @@
 #
 
 import itertools
+import inspect
 
 import pandas as pd
 import numpy as np
 
 from pyspark import pandas as ps
+from pyspark.pandas.exceptions import PandasNotImplementedError
 from pyspark.pandas.namespace import _get_index_map, read_delta
 from pyspark.pandas.utils import spark_column_equals
+from pyspark.pandas.missing.general_functions import 
_MissingPandasLikeGeneralFunctions
 from pyspark.testing.pandasutils import PandasOnSparkTestCase
 from pyspark.testing.sqlutils import SQLTestUtils
 
@@ -554,6 +557,20 @@ class NamespaceTest(PandasOnSparkTestCase, SQLTestUtils):
             lambda: ps.to_numeric(psser, errors="ignore"),
         )
 
+    def test_missing(self):
+        missing_functions = inspect.getmembers(
+            _MissingPandasLikeGeneralFunctions, inspect.isfunction
+        )
+        unsupported_functions = [
+            name for (name, type_) in missing_functions if type_.__name__ == 
"unsupported_function"
+        ]
+        for name in unsupported_functions:
+            with self.assertRaisesRegex(
+                PandasNotImplementedError,
+                "The method.*pd.*{}.*not implemented yet.".format(name),
+            ):
+                getattr(ps, name)()
+
 
 if __name__ == "__main__":
     import unittest
diff --git a/python/pyspark/pandas/usage_logging/__init__.py 
b/python/pyspark/pandas/usage_logging/__init__.py
index a6f1470b9f4..7f082623c03 100644
--- a/python/pyspark/pandas/usage_logging/__init__.py
+++ b/python/pyspark/pandas/usage_logging/__init__.py
@@ -31,6 +31,7 @@ from pyspark.pandas.indexes.datetimes import DatetimeIndex
 from pyspark.pandas.indexes.multi import MultiIndex
 from pyspark.pandas.indexes.numeric import Float64Index, Int64Index
 from pyspark.pandas.missing.frame import _MissingPandasLikeDataFrame
+from pyspark.pandas.missing.general_functions import 
_MissingPandasLikeGeneralFunctions
 from pyspark.pandas.missing.groupby import (
     MissingPandasLikeDataFrameGroupBy,
     MissingPandasLikeSeriesGroupBy,
@@ -109,6 +110,7 @@ def attach(logger_module: Union[str, ModuleType]) -> None:
     modules.append(sql_formatter)
 
     missings = [
+        (pd, _MissingPandasLikeGeneralFunctions),
         (pd.DataFrame, _MissingPandasLikeDataFrame),
         (pd.Series, MissingPandasLikeSeries),
         (pd.Index, MissingPandasLikeIndex),
@@ -122,4 +124,4 @@ def attach(logger_module: Union[str, ModuleType]) -> None:
         (pd.core.window.RollingGroupby, MissingPandasLikeRollingGroupby),
     ]
 
-    _attach(logger_module, modules, classes, missings)
+    _attach(logger_module, modules, classes, missings)  # type: 
ignore[arg-type]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[spark] branch master updated: [SPARK-38755][PYTHON][TEST] Add file to address missing pandas general functions

Reply via email to