(spark) branch master updated: [SPARK-40373][PS] Implement ps.show_versions

gurwls223 Sun, 10 May 2026 14:52:37 -0700

This is an automated email from the ASF dual-hosted git repository.

HyukjinKwon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 35f02b163018 [SPARK-40373][PS] Implement ps.show_versions
35f02b163018 is described below

commit 35f02b163018317e36ab227334863274ca02057a
Author: Devin Petersohn <[email protected]>
AuthorDate: Mon May 11 06:52:15 2026 +0900

    [SPARK-40373][PS] Implement ps.show_versions
    
    ### What changes were proposed in this pull request?
    
    Implement `ps.show_versions()` in pandas API on Spark.
    
    ### Why are the changes needed?
    
    Missing API coverage.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes.
    
    ### How was this patch tested?
    
    Unit tests.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    Generated-by: Claude Code (claude-opus-4-7)
    
    Closes #55772 from devin-petersohn/devin/show-versions.
    
    Authored-by: Devin Petersohn <[email protected]>
    Signed-off-by: Hyukjin Kwon <[email protected]>
---
 .../reference/pyspark.pandas/general_functions.rst |   7 ++
 python/pyspark/pandas/__init__.py                  |   1 +
 python/pyspark/pandas/namespace.py                 | 129 +++++++++++++++++++++
 python/pyspark/pandas/tests/test_namespace.py      |  41 +++++++
 4 files changed, 178 insertions(+)

diff --git a/python/docs/source/reference/pyspark.pandas/general_functions.rst b/python/docs/source/reference/pyspark.pandas/general_functions.rst
index 9de322b2968b..f9fcd36e2fca 100644
--- a/python/docs/source/reference/pyspark.pandas/general_functions.rst
+++ b/python/docs/source/reference/pyspark.pandas/general_functions.rst
@@ -64,3 +64,10 @@ Top-level dealing with datetimelike data
    date_range
    to_timedelta
    timedelta_range
+
+Utility functions
+-----------------
+.. autosummary::
+   :toctree: api/
+
+   show_versions
diff --git a/python/pyspark/pandas/__init__.py b/python/pyspark/pandas/__init__.py
index ac749c195a1e..bb05ec3bf857 100644
--- a/python/pyspark/pandas/__init__.py
+++ b/python/pyspark/pandas/__init__.py
@@ -75,6 +75,7 @@ __all__ = [  # noqa: F405
     "options",
     "option_context",
     "NamedAgg",
+    "show_versions",
 ]
 
 
diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py
index 7be107ec55ca..15dd70c0234d 100644
--- a/python/pyspark/pandas/namespace.py
+++ b/python/pyspark/pandas/namespace.py
@@ -141,6 +141,7 @@ __all__ = [
     "broadcast",
     "read_orc",
     "json_normalize",
+    "show_versions",
 ]
 
 
@@ -3879,6 +3880,134 @@ def json_normalize(
     return ps.DataFrame(internal)
 
 
+def _get_sys_info() -> Dict[str, Any]:
+    """Collect interpreter, operating-system and locale details in a dictionary."""
+    import locale
+    import os
+    import platform
+    import struct
+    import sys as _sys
+
+    try:
+        lang_code, charset = locale.getlocale()
+    except (TypeError, ValueError):
+        lang_code = charset = None
+    host = platform.uname()
+    info: Dict[str, Any] = {
+        "python": platform.python_version(),
+        "python-bits": struct.calcsize("P") * 8,  # pointer size in bits
+        "OS": host.system,
+        "OS-release": host.release,
+        "Version": host.version,
+        "machine": host.machine,
+        "processor": host.processor,
+        "byteorder": _sys.byteorder,
+    }
+    info.update({name: os.environ.get(name) for name in ("LC_ALL", "LANG")})
+    info["LOCALE"] = {"language-code": lang_code, "encoding": charset}
+    return info
+
+
+def _get_dependency_info() -> Dict[str, Optional[str]]:
+    """Map each tracked dependency's module name to its installed version."""
+    from importlib import import_module
+
+    import pyspark
+
+    # pyspark was imported just above, so its version is read from the module directly.
+    versions: Dict[str, Optional[str]] = {"pyspark": pyspark.__version__}
+    optional_modules = (
+        "pandas",
+        "numpy",
+        "pyarrow",
+        "grpc",
+        "google.protobuf",
+        "matplotlib",
+        "IPython",
+        "sphinx",
+        "plotly",
+        "tabulate",
+        "scipy",
+        "mlflow",
+    )
+
+    for name in optional_modules:
+        try:
+            module = import_module(name)
+        except ImportError:
+            # Module cannot be imported: record None.
+            versions[name] = None
+        except Exception:
+            # Dependency conflicts may cause non-ImportError failures.
+            versions[name] = "N/A"
+        else:
+            # A module without a __version__ attribute is also recorded as None.
+            versions[name] = getattr(module, "__version__", None)
+    return versions
+
+
+def show_versions(as_json: Union[str, bool] = False) -> None:
+    """
+    Provide useful information, important for bug reports.
+
+    It comprises info about the host operating system, pyspark.pandas version,
+    and versions of other related installed packages.
+
+    .. versionadded:: 4.3.0
+
+    Parameters
+    ----------
+    as_json : str or bool, default False
+        * If False, outputs info in a human readable form to the console.
+        * If str, it will be considered as a path to a file.
+          Info will be written to that file in JSON format.
+        * If True, outputs info in JSON format to the console.
+
+    Examples
+    --------
+    >>> ps.show_versions()  # doctest: +SKIP
+    INSTALLED VERSIONS
+    ------------------
+    python         : 3.10.6
+    python-bits    : 64
+    ...
+    pyspark        : 4.3.0.dev0
+    pandas         : 2.2.0
+    numpy          : 1.24.3
+    pyarrow        : 15.0.0
+    ...
+    """
+    sys_info = _get_sys_info()
+    deps = _get_dependency_info()
+
+    if as_json:
+        import sys as _sys
+
+        j = {"system": sys_info, "dependencies": deps}
+        # True selects the console; any other truthy value is treated as a file path.
+        if as_json is True:
+            _sys.stdout.write(json.dumps(j, indent=2))
+        else:
+            assert isinstance(as_json, str)
+            with open(as_json, "w", encoding="utf-8") as f:
+                json.dump(j, f, indent=2)
+        return
+
+    locale_info = sys_info["LOCALE"]
+    sys_info["LOCALE"] = "{language_code}.{encoding}".format(
+        language_code=locale_info["language-code"], encoding=locale_info["encoding"]
+    )
+
+    maxlen = max(len(x) for x in deps)  # column width: longest dependency name
+    print("\nINSTALLED VERSIONS")
+    print("------------------")
+    for k, v in sys_info.items():
+        print(f"{k:<{maxlen}}: {v}")
+    print("")
+    for k, v in deps.items():
+        print(f"{k:<{maxlen}}: {v}")
+
+
 def _get_index_map(
     sdf: PySparkDataFrame, index_col: Optional[Union[str, List[str]]] = None
 ) -> Tuple[Optional[List[PySparkColumn]], Optional[List[Label]]]:
diff --git a/python/pyspark/pandas/tests/test_namespace.py b/python/pyspark/pandas/tests/test_namespace.py
index dffaf87f7937..beb2bab06e0e 100644
--- a/python/pyspark/pandas/tests/test_namespace.py
+++ b/python/pyspark/pandas/tests/test_namespace.py
@@ -17,10 +17,16 @@
 
 import itertools
 import inspect
+import io
+import json
+import os
+import tempfile
+from contextlib import redirect_stdout
 
 import pandas as pd
 import numpy as np
 
+import pyspark
 from pyspark.loose_version import LooseVersion
 from pyspark import pandas as ps
 from pyspark.pandas.exceptions import PandasNotImplementedError
@@ -678,6 +684,41 @@ class NamespaceTestsMixin:
         data = []
         self.assert_eq(pd.json_normalize(data), ps.json_normalize(data))
 
+    def test_show_versions(self):
+        # Default: prints human-readable output containing pyspark, pandas, and numpy versions.
+        buf = io.StringIO()
+        with redirect_stdout(buf):
+            ps.show_versions()
+        output = buf.getvalue()
+        self.assertIn("INSTALLED VERSIONS", output)
+        self.assertIn("pyspark", output)
+        self.assertIn(pyspark.__version__, output)
+        self.assertIn("pandas", output)
+        self.assertIn(pd.__version__, output)
+        self.assertIn("numpy", output)
+        self.assertIn(np.__version__, output)
+        self.assertIn("python", output)
+
+        # as_json=True: writes JSON to stdout with system and dependencies sections.
+        buf = io.StringIO()
+        with redirect_stdout(buf):
+            ps.show_versions(as_json=True)
+        parsed = json.loads(buf.getvalue())
+        self.assertIn("system", parsed)
+        self.assertIn("dependencies", parsed)
+        self.assertEqual(parsed["dependencies"]["pyspark"], pyspark.__version__)
+        self.assertEqual(parsed["dependencies"]["pandas"], pd.__version__)
+
+        # as_json=<path>: writes JSON to a file.
+        with tempfile.TemporaryDirectory() as tmp:
+            path = os.path.join(tmp, "versions.json")
+            ps.show_versions(as_json=path)
+            with open(path, "r", encoding="utf-8") as f:
+                file_parsed = json.load(f)
+        self.assertIn("system", file_parsed)
+        self.assertIn("dependencies", file_parsed)
+        self.assertEqual(file_parsed["dependencies"]["pyspark"], pyspark.__version__)
+
     def test_missing(self):
         missing_functions = inspect.getmembers(
             MissingPandasLikeGeneralFunctions, inspect.isfunction


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(spark) branch master updated: [SPARK-40373][PS] Implement ps.show_versions

Reply via email to