[GitHub] [spark] HyukjinKwon commented on a diff in pull request #36509: [SPARK-38961][PYTHON][DOCS] Enhance to automatically generate the pandas API support list

GitBox Wed, 11 May 2022 03:32:58 -0700


HyukjinKwon commented on code in PR #36509:
URL: https://github.com/apache/spark/pull/36509#discussion_r870139820



##########
python/docs/source/user_guide/pandas_on_spark/supported_status_rst_generator.py:
##########
@@ -0,0 +1,225 @@
+from enum import Enum, unique
+from inspect import getmembers, isclass, isfunction, signature
+from typing import Callable, Dict, List, Set, TextIO
+
+import pandas as pd
+import pandas.core.groupby as pdg
+import pandas.core.window as pdw
+import pyspark.pandas as ps
+import pyspark.pandas.groupby as psg
+import pyspark.pandas.window as psw
+
+MAX_MISSING_PARAMS_SIZE = 5
+COMMON_PARAMETER_SET = {"kwargs", "args", "cls"}
+FILE_PATH_PREFIX = "./user_guide/pandas_on_spark"
+HEADER_DOC_FILE = f"{FILE_PATH_PREFIX}/supported_pandas_api_header.txt"
+TARGET_DOC_FILE = f"{FILE_PATH_PREFIX}/supported_pandas_api.rst"
+MODULE_GROUP_MATCH = [(pd, ps), (pdw, psw), (pdg, psg)]
+
+
+@unique
+class Implemented(Enum):
+    IMPLEMENTED = "Y"
+    NOT_IMPLEMENTED = "N"
+    PARTIALLY_IMPLEMENTED = "P"
+
+
+class SupportedStatus:
+    def __init__(self, implemented: str, missing: str = ""):
+        self.implemented = implemented
+        self.missing = missing
+
+
+class SuppportedStatusRSTGenerator:
+    def __init__(self):
+        self.all_supported_status = {}
+
+    def execute(self) -> None:
+        for pd_module_group, ps_module_group in MODULE_GROUP_MATCH:
+            pd_modules = self.get_pd_modules(pd_module_group)
+            self.update_all_supported_status(
+                pd_modules, pd_module_group, ps_module_group
+            )
+        self.write_rst()
+
+    def create_supported_by_module(
+        self, module_name: str, pd_module_group, ps_module_group
+    ) -> Dict[str, SupportedStatus]:
+        pd_module = (
+            getattr(pd_module_group, module_name) if module_name else 
pd_module_group
+        )
+        try:
+            ps_module = (
+                getattr(ps_module_group, module_name)
+                if module_name
+                else ps_module_group
+            )
+        except AttributeError:
+            # module not implemented
+            return {}
+
+        pd_funcs = dict(
+            [m for m in getmembers(pd_module, isfunction) if not 
m[0].startswith("_")]
+        )
+        if not pd_funcs:
+            return {}
+
+        ps_funcs = dict(
+            [m for m in getmembers(ps_module, isfunction) if not 
m[0].startswith("_")]
+        )
+
+        return self.organize_by_implementation_status(
+            module_name, pd_funcs, ps_funcs, pd_module_group, ps_module_group
+        )
+
+    def organize_by_implementation_status(
+        self,
+        module_name: str,
+        pd_funcs: Dict[str, Callable],
+        ps_funcs: Dict[str, Callable],
+        pd_module_group,
+        ps_module_group,
+    ) -> Dict[str, SupportedStatus]:
+        pd_dict = {}
+        for pd_func_name, pd_func in pd_funcs.items():
+            ps_func = ps_funcs.get(pd_func_name)
+            if ps_func:
+                missing_set = (
+                    set(signature(pd_func).parameters)
+                    - set(signature(ps_func).parameters)
+                    - COMMON_PARAMETER_SET
+                )
+                if missing_set:
+                    # partially implemented
+                    pd_dict[pd_func_name] = SupportedStatus(
+                        Implemented.PARTIALLY_IMPLEMENTED.value,
+                        self.transform_missing(
+                            module_name,
+                            pd_func_name,
+                            missing_set,
+                            pd_module_group.__name__,
+                            ps_module_group.__name__,
+                        ),
+                    )
+                else:
+                    # implemented including it's whole parameter
+                    pd_dict[pd_func_name] = SupportedStatus(
+                        Implemented.IMPLEMENTED.value
+                    )
+            else:
+                # not implemented yet
+                pd_dict[pd_func_name] = SupportedStatus(
+                    Implemented.NOT_IMPLEMENTED.value
+                )
+        return pd_dict
+
+    def transform_missing(
+        self,
+        module_name: str,
+        pd_func_name: str,
+        missing_set: Set[str],
+        pd_module_path: str,
+        ps_module_path: str,
+    ) -> str:
+        missing_str = " , ".join(
+            list(
+                map(lambda x: f"``{x}``", 
sorted(missing_set)[:MAX_MISSING_PARAMS_SIZE])
+            )
+        )
+        if len(missing_set) > MAX_MISSING_PARAMS_SIZE:
+            module_dot_func = (
+                f"{module_name}.{pd_func_name}" if module_name else 
pd_func_name
+            )
+            additional_str = (
+                " and more. See the "
+                f"`{pd_module_path}.{module_dot_func} "
+                "<https://pandas.pydata.org/docs/reference/api/"
+                f"{pd_module_path}.{module_dot_func}.html>`__ and "
+                f"`{ps_module_path}.{module_dot_func} "
+                "<https://spark.apache.org/docs/latest/api/python/reference/pyspark.pandas/api/"
+                f"{ps_module_path}.{module_dot_func}.html>`__ for detail."
+            )
+            missing_str += additional_str
+        return missing_str
+
+    def get_pd_modules(self, pd_module_group) -> List[str]:
+        return sorted(
+            [
+                m[0]
+                for m in getmembers(pd_module_group, isclass)
+                if not m[0].startswith("_")
+            ]
+        )
+
+    def update_all_supported_status(
+        self, pd_modules: List[str], pd_module_group, ps_module_group
+    ) -> None:
+        pd_modules += [""]  # for General Function APIs
+        for module_name in pd_modules:
+            supported_status = self.create_supported_by_module(
+                module_name, pd_module_group, ps_module_group
+            )
+            if supported_status:
+                self.all_supported_status[
+                    (module_name, ps_module_group.__name__)
+                ] = supported_status
+
+    def draw_table(

Review Comment:
   Should this method be named `write_table` or `generate_table` instead of `draw_table`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [spark] HyukjinKwon commented on a diff in pull request #36509: [SPARK-38961][PYTHON][DOCS] Enhance to automatically generate the pandas API support list

Reply via email to