This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new ef27b9b15687 [SPARK-46099][PS][DOCS] Refactor "Supported pandas API"
generation script
ef27b9b15687 is described below
commit ef27b9b15687dad416b6353409b1b44bc1451885
Author: Haejoon Lee <[email protected]>
AuthorDate: Mon Nov 27 09:00:11 2023 +0900
[SPARK-46099][PS][DOCS] Refactor "Supported pandas API" generation script
### What changes were proposed in this pull request?
This PR proposes to refactor the script used to generate the [Supported
pandas
API](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/supported_pandas_api.html)
documentation. The script has been restructured for better readability and
maintainability. The refactoring includes:
- Simplifying complex functions and breaking them into smaller, more
manageable pieces.
- Improving variable and function naming for clarity.
- Adding comprehensive docstrings in the NumPy docstring style.
- Streamlining the flow of the script to enhance logical coherence.
### Why are the changes needed?
The previous version of the script was hard to understand and maintain due
to its complexity and lack of documentation. This refactoring makes the script
more accessible to new contributors and easier to modify or extend in the
future. It also ensures that the script adheres to best practices in Python
coding, making it a more reliable tool for generating accurate and up-to-date
documentation.
### Does this PR introduce _any_ user-facing change?
No user-facing changes. This PR only affects the internal documentation
generation process.
### How was this patch tested?
Tested by generating the documentation manually and verifying that the
output remains consistent with the previous version.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #44010 from itholic/refactor_doc_gen_script.
Authored-by: Haejoon Lee <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/pandas/supported_api_gen.py | 188 +++++++++++++++++++----------
1 file changed, 124 insertions(+), 64 deletions(-)
diff --git a/python/pyspark/pandas/supported_api_gen.py
b/python/pyspark/pandas/supported_api_gen.py
index 27d5cd4b37f9..1f893520d2ce 100644
--- a/python/pyspark/pandas/supported_api_gen.py
+++ b/python/pyspark/pandas/supported_api_gen.py
@@ -33,13 +33,11 @@ import pandas.core.window as pdw
from pyspark.loose_version import LooseVersion
from pyspark.pandas.exceptions import PandasNotImplementedError
+# Constants
MAX_MISSING_PARAMS_SIZE = 5
-COMMON_PARAMETER_SET = {
- "kwargs",
- "args",
- "cls",
-} # These are not counted as missing parameters.
+COMMON_PARAMETER_SET = {"kwargs", "args", "cls"}
MODULE_GROUP_MATCH = [(pd, ps), (pdw, psw), (pdg, psg)]
+PANDAS_LATEST_VERSION = "2.1.3"
RST_HEADER = """
=====================
@@ -73,6 +71,10 @@ The API list is updated based on the `latest pandas official
API reference
@unique
class Implemented(Enum):
+ """
+ Enumeration of implementation statuses.
+ """
+
IMPLEMENTED = "Y"
NOT_IMPLEMENTED = "N"
PARTIALLY_IMPLEMENTED = "P"
@@ -80,7 +82,7 @@ class Implemented(Enum):
class SupportedStatus(NamedTuple):
"""
- Defines a supported status for specific pandas API
+ Defines a supported status for specific pandas API.
"""
implemented: str
@@ -89,47 +91,108 @@ class SupportedStatus(NamedTuple):
def generate_supported_api(output_rst_file_path: str) -> None:
"""
- Generate supported APIs status dictionary.
+ Generate the supported APIs status dictionary and write it to an RST file.
Parameters
----------
output_rst_file_path : str
The path to the document file in RST format.
+ """
+ _check_pandas_version()
+ all_supported_status = _collect_supported_status()
+ _write_rst(output_rst_file_path, all_supported_status)
+
- Write supported APIs documentation.
+def _check_pandas_version() -> None:
"""
- pandas_latest_version = "2.1.3"
- if LooseVersion(pd.__version__) != LooseVersion(pandas_latest_version):
+ Check if the installed pandas version matches the expected version.
+ """
+ if LooseVersion(pd.__version__) != LooseVersion(PANDAS_LATEST_VERSION):
msg = (
- "Warning: Latest version of pandas (%s) is required to generate
the documentation; "
- "however, your version was %s" % (pandas_latest_version,
pd.__version__)
+ f"Warning: pandas {PANDAS_LATEST_VERSION} is required; your
version is {pd.__version__}"
)
warnings.warn(msg, UserWarning)
raise ImportError(msg)
+
+def _collect_supported_status() -> Dict[Tuple[str, str], Dict[str,
SupportedStatus]]:
+ """
+ Collect the supported status across multiple module paths.
+ """
all_supported_status: Dict[Tuple[str, str], Dict[str, SupportedStatus]] =
{}
for pd_module_group, ps_module_group in MODULE_GROUP_MATCH:
pd_modules = _get_pd_modules(pd_module_group)
_update_all_supported_status(
all_supported_status, pd_modules, pd_module_group, ps_module_group
)
- _write_rst(output_rst_file_path, all_supported_status)
+ return all_supported_status
+
+
+def _get_pd_modules(pd_module_group: Any) -> List[str]:
+ """
+ Get sorted list of pandas member names from a pandas module.
+
+ Parameters
+ ----------
+ pd_module_group : Any
+ Importable pandas module.
+
+ Returns
+ -------
+ List[str]
+ Sorted list of member names.
+ """
+ return sorted(m[0] for m in getmembers(pd_module_group, isclass) if not
m[0].startswith("_"))
+
+
+def _update_all_supported_status(
+ all_supported_status: Dict[Tuple[str, str], Dict[str, SupportedStatus]],
+ pd_modules: List[str],
+ pd_module_group: Any,
+ ps_module_group: Any,
+) -> None:
+ """
+ Update the supported status dictionary with status from multiple modules.
+
+ Parameters
+ ----------
+ all_supported_status : Dict[Tuple[str, str], Dict[str, SupportedStatus]]
+ The dictionary to update with supported statuses.
+ pd_modules : List[str]
+ List of module names in pandas.
+ pd_module_group : Any
+ Importable pandas module group.
+ ps_module_group : Any
+ Corresponding pyspark.pandas module group.
+ """
+ pd_modules.append("") # Include General Function APIs
+ for module_name in pd_modules:
+ supported_status = _create_supported_by_module(
+ module_name, pd_module_group, ps_module_group
+ )
+ if supported_status:
+ all_supported_status[(module_name, ps_module_group.__name__)] =
supported_status
def _create_supported_by_module(
module_name: str, pd_module_group: Any, ps_module_group: Any
) -> Dict[str, SupportedStatus]:
"""
- Retrieves supported status of pandas module
+ Create a dictionary of supported status for a specific pandas module.
Parameters
----------
module_name : str
- Class name that exists in the path of the module.
+ Name of the module in pandas.
pd_module_group : Any
- Specific path of importable pandas module.
- ps_module_group: Any
- Specific path of importable pyspark.pandas module.
+ Importable pandas module.
+ ps_module_group : Any
+ Corresponding pyspark.pandas module.
+
+ Returns
+ -------
+ Dict[str, SupportedStatus]
+ Dictionary of supported status for the module.
"""
pd_module = getattr(pd_module_group, module_name) if module_name else
pd_module_group
try:
@@ -157,7 +220,7 @@ def _organize_by_implementation_status(
ps_module_group: Any,
) -> Dict[str, SupportedStatus]:
"""
- Check the implementation status and parameters of both modules.
+ Organize functions by implementation status between pandas and
pyspark.pandas.
Parameters
----------
@@ -171,6 +234,11 @@ def _organize_by_implementation_status(
Specific path of importable pandas module.
ps_module_group: Any
Specific path of importable pyspark.pandas module.
+
+ Returns
+ -------
+ Dict[str, SupportedStatus]
+ Dictionary of implementation status.
"""
pd_dict = {}
for pd_func_name, pd_func in pd_funcs.items():
@@ -214,7 +282,7 @@ def _transform_missing(
ps_module_path: str,
) -> str:
"""
- Transform missing parameters into table information string.
+ Transform missing parameters into a formatted string for table display.
Parameters
----------
@@ -229,6 +297,11 @@ def _transform_missing(
ps_module_path : str
Path string of pyspark.pandas module.
+ Returns
+ -------
+ str
+ Formatted string representing missing parameters.
+
Examples
--------
>>> _transform_missing("DataFrame", "add", {"axis", "fill_value", "level"},
@@ -251,47 +324,6 @@ def _transform_missing(
return missing_str
-def _get_pd_modules(pd_module_group: Any) -> List[str]:
- """
- Returns sorted pandas member list from pandas module path.
-
- Parameters
- ----------
- pd_module_group : Any
- Specific path of importable pandas module.
- """
- return sorted([m[0] for m in getmembers(pd_module_group, isclass) if not
m[0].startswith("_")])
-
-
-def _update_all_supported_status(
- all_supported_status: Dict[Tuple[str, str], Dict[str, SupportedStatus]],
- pd_modules: List[str],
- pd_module_group: Any,
- ps_module_group: Any,
-) -> None:
- """
- Updates supported status across multiple module paths.
-
- Parameters
- ----------
- all_supported_status: Dict[Tuple[str, str], Dict[str, SupportedStatus]]
- Data that stores the supported status across multiple module paths.
- pd_modules: List[str]
- Name list of pandas modules.
- pd_module_group : Any
- Specific path of importable pandas module.
- ps_module_group: Any
- Specific path of importable pyspark.pandas module.
- """
- pd_modules += [""] # for General Function APIs
- for module_name in pd_modules:
- supported_status = _create_supported_by_module(
- module_name, pd_module_group, ps_module_group
- )
- if supported_status:
- all_supported_status[(module_name, ps_module_group.__name__)] =
supported_status
-
-
def _write_table(
module_name: str,
module_path: str,
@@ -299,7 +331,18 @@ def _write_table(
w_fd: TextIO,
) -> None:
"""
- Write table by using Sphinx list-table directive.
+ Write the support status in a table format using Sphinx list-table
directive.
+
+ Parameters
+ ----------
+ module_name : str
+ The name of the module whose support status is being documented.
+ module_path : str
+ The import path of the module in the documentation.
+ supported_status : Dict[str, SupportedStatus]
+ A dictionary mapping each function name to its support status.
+ w_fd : TextIO
+ An open file descriptor where the table will be written.
"""
lines = []
if module_name:
@@ -336,7 +379,17 @@ def _write_table(
def _escape_func_str(func_str: str) -> str:
"""
- Transforms which affecting rst data format.
+ Escape function names to conform to RST format.
+
+ Parameters
+ ----------
+ func_str : str
+ Function name to escape.
+
+ Returns
+ -------
+ str
+ Escaped function name.
"""
# TODO: Take into account that this function can create links incorrectly
# We can create alias links or links to parent methods
@@ -351,7 +404,14 @@ def _write_rst(
all_supported_status: Dict[Tuple[str, str], Dict[str, SupportedStatus]],
) -> None:
"""
- Writes the documentation to the target file path.
+ Write the final RST file with the collected support status.
+
+ Parameters
+ ----------
+ output_rst_file_path : str
+ Path to the output RST file.
+ all_supported_status : Dict
+ Collected support status data.
"""
with open(output_rst_file_path, "w") as w_fd:
w_fd.write(RST_HEADER)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]