This is an automated email from the ASF dual-hosted git repository.
HyukjinKwon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 35f02b163018 [SPARK-40373][PS] Implement ps.show_versions
35f02b163018 is described below
commit 35f02b163018317e36ab227334863274ca02057a
Author: Devin Petersohn <[email protected]>
AuthorDate: Mon May 11 06:52:15 2026 +0900
[SPARK-40373][PS] Implement ps.show_versions
### What changes were proposed in this pull request?
Implement `ps.show_versions()` in pandas API on Spark.
### Why are the changes needed?
Missing API coverage.
### Does this PR introduce _any_ user-facing change?
Yes.
### How was this patch tested?
Unit tests.
### Was this patch authored or co-authored using generative AI tooling?
Generated-by: Claude Code (claude-opus-4-7)
Closes #55772 from devin-petersohn/devin/show-versions.
Authored-by: Devin Petersohn <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../reference/pyspark.pandas/general_functions.rst | 7 ++
python/pyspark/pandas/__init__.py | 1 +
python/pyspark/pandas/namespace.py | 129 +++++++++++++++++++++
python/pyspark/pandas/tests/test_namespace.py | 41 +++++++
4 files changed, 178 insertions(+)
diff --git a/python/docs/source/reference/pyspark.pandas/general_functions.rst b/python/docs/source/reference/pyspark.pandas/general_functions.rst
index 9de322b2968b..f9fcd36e2fca 100644
--- a/python/docs/source/reference/pyspark.pandas/general_functions.rst
+++ b/python/docs/source/reference/pyspark.pandas/general_functions.rst
@@ -64,3 +64,10 @@ Top-level dealing with datetimelike data
date_range
to_timedelta
timedelta_range
+
+Utility functions
+-----------------
+.. autosummary::
+   :toctree: api/
+
+   show_versions
diff --git a/python/pyspark/pandas/__init__.py b/python/pyspark/pandas/__init__.py
index ac749c195a1e..bb05ec3bf857 100644
--- a/python/pyspark/pandas/__init__.py
+++ b/python/pyspark/pandas/__init__.py
@@ -75,6 +75,7 @@ __all__ = [ # noqa: F405
"options",
"option_context",
"NamedAgg",
+ "show_versions",
]
diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py
index 7be107ec55ca..15dd70c0234d 100644
--- a/python/pyspark/pandas/namespace.py
+++ b/python/pyspark/pandas/namespace.py
@@ -141,6 +141,7 @@ __all__ = [
"broadcast",
"read_orc",
"json_normalize",
+ "show_versions",
]
@@ -3879,6 +3880,134 @@ def json_normalize(
return ps.DataFrame(internal)
+def _get_sys_info() -> Dict[str, Any]:
+    """
+    Collect interpreter, operating-system and locale details.
+
+    Returns
+    -------
+    dict
+        Maps a label (``"python"``, ``"OS"``, ``"LANG"``, ...) to its value.
+        The ``"LOCALE"`` entry is itself a dict holding ``"language-code"``
+        and ``"encoding"``; both are None when the locale cannot be determined.
+    """
+    import locale
+    import os
+    import platform
+    import struct
+    import sys as _sys
+
+    # A TypeError/ValueError from the locale lookup is reported as "unknown"
+    # rather than aborting the whole report.
+    try:
+        lang, enc = locale.getlocale()
+    except (TypeError, ValueError):
+        lang = enc = None
+
+    uname = platform.uname()
+
+    # Insertion order is the display order used by callers.
+    info: Dict[str, Any] = {
+        "python": platform.python_version(),
+        # Pointer size in bytes times 8 gives the interpreter's bitness.
+        "python-bits": struct.calcsize("P") * 8,
+    }
+    info.update(
+        {
+            "OS": uname.system,
+            "OS-release": uname.release,
+            "Version": uname.version,
+            "machine": uname.machine,
+            "processor": uname.processor,
+        }
+    )
+    info["byteorder"] = _sys.byteorder
+    for env_var in ("LC_ALL", "LANG"):
+        info[env_var] = os.environ.get(env_var)
+    info["LOCALE"] = {"language-code": lang, "encoding": enc}
+    return info
+
+
+def _get_dependency_info() -> Dict[str, Optional[str]]:
+    """
+    Report the version of PySpark and of a fixed list of related packages.
+
+    Returns
+    -------
+    dict
+        Maps each module name to one of:
+
+        * the module's ``__version__`` attribute,
+        * None when the module cannot be imported or has no ``__version__``,
+        * ``"N/A"`` when importing it fails with something other than
+          ``ImportError``.
+    """
+    import importlib
+
+    import pyspark
+
+    optional_modules = (
+        "pandas",
+        "numpy",
+        "pyarrow",
+        "grpc",
+        "google.protobuf",
+        "matplotlib",
+        "IPython",
+        "sphinx",
+        "plotly",
+        "tabulate",
+        "scipy",
+        "mlflow",
+    )
+
+    def _probe(name: str) -> Optional[str]:
+        """Import ``name`` and return its version marker."""
+        try:
+            module = importlib.import_module(name)
+        except ImportError:
+            return None
+        except Exception:
+            # Dependency conflicts may cause non-ImportError failures.
+            return "N/A"
+        return getattr(module, "__version__", None)
+
+    # pyspark is already imported, so read its version directly and list it first.
+    versions: Dict[str, Optional[str]] = {"pyspark": pyspark.__version__}
+    for name in optional_modules:
+        versions[name] = _probe(name)
+    return versions
+
+
+def show_versions(as_json: Union[str, bool] = False) -> None:
+    """
+    Provide useful information, important for bug reports.
+
+    It comprises info about the host operating system, the pyspark version,
+    and the versions of other related installed packages.
+
+    .. versionadded:: 4.3.0
+
+    Parameters
+    ----------
+    as_json : str or bool, default False
+        * If False, outputs info in a human readable form to the console.
+        * If str, it will be considered as a path to a file.
+          Info will be written to that file in JSON format.
+        * If True, outputs info in JSON format to the console.
+
+    Raises
+    ------
+    TypeError
+        If ``as_json`` is truthy but is neither ``True`` nor a str.
+
+    Examples
+    --------
+    >>> ps.show_versions()  # doctest: +SKIP
+    INSTALLED VERSIONS
+    ------------------
+    python         : 3.10.6
+    python-bits    : 64
+    ...
+    pyspark        : 4.3.0.dev0
+    pandas         : 2.2.0
+    numpy          : 1.24.3
+    pyarrow        : 15.0.0
+    ...
+    """
+    # Validate up front with a real exception: an ``assert`` would be stripped
+    # under ``python -O``, and failing early avoids importing every dependency
+    # only to reject the argument afterwards.
+    if as_json and as_json is not True and not isinstance(as_json, str):
+        raise TypeError(
+            f"as_json must be a bool or a str file path, got {type(as_json).__name__}"
+        )
+
+    sys_info = _get_sys_info()
+    deps = _get_dependency_info()
+
+    if as_json:
+        import sys as _sys
+
+        report = {"system": sys_info, "dependencies": deps}
+
+        if as_json is True:
+            # write(), not writelines(): the payload is one string, and
+            # writelines() would iterate it character by character.
+            _sys.stdout.write(json.dumps(report, indent=2))
+        else:
+            with open(as_json, "w", encoding="utf-8") as f:
+                json.dump(report, f, indent=2)
+        return
+
+    # Flatten the nested locale entry into "<language-code>.<encoding>" for display.
+    locale_info = sys_info["LOCALE"]
+    sys_info["LOCALE"] = f"{locale_info['language-code']}.{locale_info['encoding']}"
+
+    # Size the label column from both sections so the colons stay aligned even
+    # if a system label ever becomes longer than every dependency name.
+    maxlen = max(len(label) for label in (*sys_info, *deps))
+    print("\nINSTALLED VERSIONS")
+    print("------------------")
+    for k, v in sys_info.items():
+        print(f"{k:<{maxlen}}: {v}")
+    print("")
+    for k, v in deps.items():
+        print(f"{k:<{maxlen}}: {v}")
+
+
def _get_index_map(
sdf: PySparkDataFrame, index_col: Optional[Union[str, List[str]]] = None
) -> Tuple[Optional[List[PySparkColumn]], Optional[List[Label]]]:
diff --git a/python/pyspark/pandas/tests/test_namespace.py b/python/pyspark/pandas/tests/test_namespace.py
index dffaf87f7937..beb2bab06e0e 100644
--- a/python/pyspark/pandas/tests/test_namespace.py
+++ b/python/pyspark/pandas/tests/test_namespace.py
@@ -17,10 +17,16 @@
import itertools
import inspect
+import io
+import json
+import os
+import tempfile
+from contextlib import redirect_stdout
import pandas as pd
import numpy as np
+import pyspark
from pyspark.loose_version import LooseVersion
from pyspark import pandas as ps
from pyspark.pandas.exceptions import PandasNotImplementedError
@@ -678,6 +684,41 @@ class NamespaceTestsMixin:
data = []
self.assert_eq(pd.json_normalize(data), ps.json_normalize(data))
+    def test_show_versions(self):
+        """Exercise the console, JSON-to-stdout and JSON-to-file modes of ps.show_versions."""
+        # Default: prints human-readable output containing pyspark, pandas, and numpy versions.
+        captured = io.StringIO()
+        with redirect_stdout(captured):
+            ps.show_versions()
+        text = captured.getvalue()
+        expected_fragments = (
+            "INSTALLED VERSIONS",
+            "pyspark",
+            pyspark.__version__,
+            "pandas",
+            pd.__version__,
+            "numpy",
+            np.__version__,
+            "python",
+        )
+        for fragment in expected_fragments:
+            self.assertIn(fragment, text)
+
+        # as_json=True: writes JSON to stdout with system and dependencies sections.
+        captured = io.StringIO()
+        with redirect_stdout(captured):
+            ps.show_versions(as_json=True)
+        stdout_doc = json.loads(captured.getvalue())
+        for section in ("system", "dependencies"):
+            self.assertIn(section, stdout_doc)
+        stdout_deps = stdout_doc["dependencies"]
+        self.assertEqual(stdout_deps["pyspark"], pyspark.__version__)
+        self.assertEqual(stdout_deps["pandas"], pd.__version__)
+
+        # as_json=<path>: writes JSON to a file.
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            target = os.path.join(tmp_dir, "versions.json")
+            ps.show_versions(as_json=target)
+            with open(target, "r", encoding="utf-8") as fh:
+                file_doc = json.load(fh)
+        for section in ("system", "dependencies"):
+            self.assertIn(section, file_doc)
+        self.assertEqual(file_doc["dependencies"]["pyspark"], pyspark.__version__)
+
def test_missing(self):
missing_functions = inspect.getmembers(
MissingPandasLikeGeneralFunctions, inspect.isfunction
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]