This is an automated email from the ASF dual-hosted git repository.

rok pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 0a2a0c75f9 GH-49452: [Python] Reintroduce docstring injection for stubfiles (#49453)
0a2a0c75f9 is described below

commit 0a2a0c75f9e8106d0ad7e53d2e3183a89c043620
Author: Rok Mihevc <[email protected]>
AuthorDate: Fri Mar 13 11:49:26 2026 +0100

    GH-49452: [Python] Reintroduce docstring injection for stubfiles (#49453)
    
    ### Rationale for this change
    
    Warning: should not be merged before https://github.com/apache/arrow/pull/49259.
    See https://github.com/apache/arrow/issues/49452 and https://github.com/apache/arrow/pull/48618
    
    ### What changes are included in this PR?
    
    Adds a wheel build-time script to populate stubfiles with runtime docstrings.
    
    ### Are these changes tested?
    
    Not yet.
    
    ### Are there any user-facing changes?
    
    Users will get docstrings.
    * GitHub Issue: #49452
    
    Lead-authored-by: Rok Mihevc <[email protected]>
    Co-authored-by: Raúl Cumplido <[email protected]>
    Signed-off-by: Rok Mihevc <[email protected]>
---
 ci/scripts/python_test_type_annotations.sh   |  4 +-
 ci/scripts/python_wheel_macos_build.sh       |  1 +
 ci/scripts/python_wheel_validate_contents.py | 72 +++++++++++++++++----
 ci/scripts/python_wheel_windows_build.bat    |  1 +
 ci/scripts/python_wheel_xlinux_build.sh      |  1 +
 python/CMakeLists.txt                        | 33 ++++++++++
 python/pyproject.toml                        |  6 +-
 python/scripts/update_stub_docstrings.py     | 96 +++++++++++++++++++++-------
 8 files changed, 176 insertions(+), 38 deletions(-)

diff --git a/ci/scripts/python_test_type_annotations.sh b/ci/scripts/python_test_type_annotations.sh
index c1a051b1e5..092bedf3f5 100755
--- a/ci/scripts/python_test_type_annotations.sh
+++ b/ci/scripts/python_test_type_annotations.sh
@@ -34,5 +34,5 @@ pip install mypy pyright ty
 # Run type checkers
 cd "${pyarrow_dir}"
 mypy
-pyright
-ty check
+pyright --stats
+ty check --verbose --output-format concise
diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh
index 1571cd57f2..31395e26c2 100755
--- a/ci/scripts/python_wheel_macos_build.sh
+++ b/ci/scripts/python_wheel_macos_build.sh
@@ -147,6 +147,7 @@ popd
 
 echo "=== (${PYTHON_VERSION}) Building wheel ==="
 export PYARROW_BUNDLE_ARROW_CPP=ON
+export PYARROW_REQUIRE_STUB_DOCSTRINGS=ON
 export PYARROW_WITH_ACERO=${ARROW_ACERO}
 export PYARROW_WITH_AZURE=${ARROW_AZURE}
 export PYARROW_WITH_DATASET=${ARROW_DATASET}
diff --git a/ci/scripts/python_wheel_validate_contents.py b/ci/scripts/python_wheel_validate_contents.py
index 153a70eb40..8388f6ebf3 100644
--- a/ci/scripts/python_wheel_validate_contents.py
+++ b/ci/scripts/python_wheel_validate_contents.py
@@ -16,29 +16,79 @@
 # under the License.
 
 import argparse
+import ast
 from pathlib import Path
 import re
 import zipfile
 
 
+def _count_docstrings(source):
+    """Count docstrings in module, function, and class bodies."""
+    tree = ast.parse(source)
+    count = 0
+    for node in ast.walk(tree):
+        if isinstance(node, (ast.Module, ast.FunctionDef,
+                             ast.AsyncFunctionDef, ast.ClassDef)):
+            if (node.body
+                    and isinstance(node.body[0], ast.Expr)
+                    and isinstance(node.body[0].value, ast.Constant)
+                    and isinstance(node.body[0].value.value, str)):
+                count += 1
+    return count
+
+
 def validate_wheel(path):
     p = Path(path)
     wheels = list(p.glob('*.whl'))
     error_msg = f"{len(wheels)} wheels found but only 1 expected ({wheels})"
     assert len(wheels) == 1, error_msg
-    f = zipfile.ZipFile(wheels[0])
-    outliers = [
-        info.filename for info in f.filelist if not re.match(
-            r'(pyarrow/|pyarrow-[-.\w\d]+\.dist-info/|pyarrow\.libs/)', info.filename
+    with zipfile.ZipFile(wheels[0]) as wheel_zip:
+        outliers = [
+            info.filename for info in wheel_zip.filelist if not re.match(
+                r'(pyarrow/|pyarrow-[-.\w\d]+\.dist-info/|pyarrow\.libs/)', info.filename
+            )
+        ]
+        assert not outliers, f"Unexpected contents in wheel: {sorted(outliers)}"
+        for filename in ('LICENSE.txt', 'NOTICE.txt'):
+            assert any(
+                info.filename.split("/")[-1] == filename for info in wheel_zip.filelist
+            ), f"{filename} is missing from the wheel."
+
+        assert any(
+            info.filename == "pyarrow/py.typed" for info in wheel_zip.filelist
+        ), "pyarrow/py.typed is missing from the wheel."
+
+        source_root = Path(__file__).resolve().parents[2]
+        stubs_dir = source_root / "python" / "pyarrow-stubs" / "pyarrow"
+        assert stubs_dir.exists(), f"Stub source directory not found: {stubs_dir}"
+
+        expected_stub_files = {
+            f"pyarrow/{stub_file.relative_to(stubs_dir).as_posix()}"
+            for stub_file in stubs_dir.rglob("*.pyi")
+        }
+
+        wheel_stub_files = {
+            info.filename
+            for info in wheel_zip.filelist
+            if info.filename.startswith("pyarrow/") and info.filename.endswith(".pyi")
+        }
+
+        assert wheel_stub_files == expected_stub_files, (
+            "Wheel .pyi files differ from python/pyarrow-stubs/pyarrow.\n"
+            f"Missing in wheel: {sorted(expected_stub_files - wheel_stub_files)}\n"
+            f"Unexpected in wheel: {sorted(wheel_stub_files - expected_stub_files)}"
+        )
+
+        wheel_docstring_count = sum(
+            _count_docstrings(wheel_zip.read(wsf).decode("utf-8"))
+            for wsf in wheel_stub_files
         )
-    ]
-    assert not outliers, f"Unexpected contents in wheel: {sorted(outliers)}"
-    for filename in ('LICENSE.txt', 'NOTICE.txt'):
-        assert any(info.filename.split("/")[-1] == filename
-                   for info in f.filelist), \
-            f"{filename} is missing from the wheel."
+
+        print(f"Found {wheel_docstring_count} docstring(s) in wheel stubs.")
+        assert wheel_docstring_count, "No docstrings found in wheel stub files."
+
     print(f"The wheel: {wheels[0]} seems valid.")
-    # TODO(GH-32609): Validate some docstrings were generated and added.
+
 
 def main():
     parser = argparse.ArgumentParser()
diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat
index 14e3e5a629..e094d82861 100644
--- a/ci/scripts/python_wheel_windows_build.bat
+++ b/ci/scripts/python_wheel_windows_build.bat
@@ -116,6 +116,7 @@ popd
 
 echo "=== (%PYTHON%) Building wheel ==="
 set PYARROW_BUNDLE_ARROW_CPP=ON
+set PYARROW_REQUIRE_STUB_DOCSTRINGS=ON
 set PYARROW_WITH_ACERO=%ARROW_ACERO%
 set PYARROW_WITH_AZURE=%ARROW_AZURE%
 set PYARROW_WITH_DATASET=%ARROW_DATASET%
diff --git a/ci/scripts/python_wheel_xlinux_build.sh b/ci/scripts/python_wheel_xlinux_build.sh
index 960fe5bad6..223bd0b1cb 100755
--- a/ci/scripts/python_wheel_xlinux_build.sh
+++ b/ci/scripts/python_wheel_xlinux_build.sh
@@ -155,6 +155,7 @@ check_arrow_visibility
 
 echo "=== (${PYTHON_VERSION}) Building wheel ==="
 export PYARROW_BUNDLE_ARROW_CPP=ON
+export PYARROW_REQUIRE_STUB_DOCSTRINGS=ON
 export PYARROW_WITH_ACERO=${ARROW_ACERO}
 export PYARROW_WITH_AZURE=${ARROW_AZURE}
 export PYARROW_WITH_DATASET=${ARROW_DATASET}
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 0630e0cff7..6395b3e1e7 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -1025,3 +1025,36 @@ if(PYARROW_BUILD_PARQUET)
     target_link_libraries(_parquet_encryption PRIVATE arrow_python_parquet_encryption)
   endif()
 endif()
+
+#
+# Type stubs with docstring injection
+#
+# Stubs live in pyarrow-stubs/pyarrow/ during development but are installed
+# alongside the package so type checkers can find them (PEP 561).
+set(PYARROW_STUBS_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/pyarrow-stubs/pyarrow")
+if(EXISTS "${PYARROW_STUBS_SOURCE_DIR}")
+  install(DIRECTORY "${PYARROW_STUBS_SOURCE_DIR}/"
+          DESTINATION "."
+          FILES_MATCHING
+          PATTERN "*.pyi")
+
+  if(PYARROW_REQUIRE_STUB_DOCSTRINGS)
+    install(CODE "
+      execute_process(
+        COMMAND \"${Python3_EXECUTABLE}\"
+                \"${CMAKE_CURRENT_SOURCE_DIR}/scripts/update_stub_docstrings.py\"
+                \"${CMAKE_INSTALL_PREFIX}\"
+                \"${CMAKE_CURRENT_SOURCE_DIR}\"
+        RESULT_VARIABLE _pyarrow_stub_docstrings_result
+      )
+      if(NOT _pyarrow_stub_docstrings_result EQUAL 0)
+        message(FATAL_ERROR \"Stub docstring injection failed (exit code: \${_pyarrow_stub_docstrings_result})\")
+      endif()
+    ")
+  endif()
+else()
+  if(PYARROW_REQUIRE_STUB_DOCSTRINGS)
+    message(FATAL_ERROR "PyArrow stub source directory not found at ${PYARROW_STUBS_SOURCE_DIR}; "
+                        "cannot build wheel without .pyi files.")
+  endif()
+endif()
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 14aa37ed04..a6bba335b8 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -85,7 +85,7 @@ exclude = [
 [tool.scikit-build]
 cmake.build-type = "Release"
 metadata.version.provider = "scikit_build_core.metadata.setuptools_scm"
-sdist.include = ["pyarrow/_generated_version.py", "cmake_modules/"]
+sdist.include = ["pyarrow/_generated_version.py", "cmake_modules/", "pyarrow-stubs/"]
 wheel.packages = ["pyarrow"]
 wheel.install-dir = "pyarrow"
 
@@ -94,6 +94,7 @@ PYARROW_BUNDLE_ARROW_CPP = {env = "PYARROW_BUNDLE_ARROW_CPP", default = "OFF"}
 PYARROW_BUNDLE_CYTHON_CPP = {env = "PYARROW_BUNDLE_CYTHON_CPP", default = "OFF"}
 PYARROW_GENERATE_COVERAGE = {env = "PYARROW_GENERATE_COVERAGE", default = "OFF"}
 PYARROW_CXXFLAGS = {env = "PYARROW_CXXFLAGS", default = ""}
+PYARROW_REQUIRE_STUB_DOCSTRINGS = {env = "PYARROW_REQUIRE_STUB_DOCSTRINGS", default = "OFF"}
 
 [tool.setuptools_scm]
 root = '..'
@@ -129,6 +130,9 @@ stubPath = "pyarrow-stubs"
 typeCheckingMode = "basic"
 
 # TODO: Enable type checking once stubs are merged
+[tool.ty.environment]
+extra-paths = ["pyarrow-stubs"]
+
 [tool.ty.src]
 include = ["pyarrow-stubs"]
 exclude = [
diff --git a/python/scripts/update_stub_docstrings.py b/python/scripts/update_stub_docstrings.py
index 5fd24014a0..44bd19bfdc 100644
--- a/python/scripts/update_stub_docstrings.py
+++ b/python/scripts/update_stub_docstrings.py
@@ -18,14 +18,17 @@
 """
 Extract docstrings from pyarrow runtime and insert them into stub files.
 
-Usage (from python/ directory with pyarrow built):
-    python scripts/update_stub_docstrings.py pyarrow-stubs
+Usage:
+    python scripts/update_stub_docstrings.py <install_prefix> <source_dir>
 """
 
 import argparse
 import importlib
 import inspect
+import os
+import shutil
 import sys
+import tempfile
 from pathlib import Path
 from textwrap import indent
 
@@ -178,7 +181,7 @@ def add_docstrings_to_stubs(stubs_dir):
 
     pyarrow = importlib.import_module("pyarrow")
 
-    for stub_file in stubs_dir.rglob('*.pyi'):
+    for stub_file in sorted(stubs_dir.rglob('*.pyi')):
         if stub_file.name == "_stubs_typing.pyi":
             continue
 
@@ -186,43 +189,88 @@ def add_docstrings_to_stubs(stubs_dir):
         if module_name in LIB_MODULES:
             namespace = "lib"
         elif stub_file.parent.name in ("parquet", "interchange"):
-            namespace = f"{stub_file.parent.name}.{module_name}"
+            namespace = (stub_file.parent.name if module_name == "__init__"
+                         else f"{stub_file.parent.name}.{module_name}")
         elif module_name == "__init__":
             namespace = ""
         else:
             namespace = module_name
 
         print(f"  {stub_file.name} -> {namespace or '(root)'}")
-        tree = libcst.parse_module(stub_file.read_text())
+        tree = libcst.parse_module(stub_file.read_text(encoding="utf-8"))
         modified = tree.visit(DocstringInserter(pyarrow, namespace))
-        stub_file.write_text(modified.code)
+        stub_file.write_text(modified.code, encoding="utf-8")
 
 
-def add_docstrings_from_build(stubs_dir, build_lib):
+def _link_or_copy(source, destination):
+    # Prefer symlinks (faster, no disk use) but fall back to copying when the
+    # filesystem doesn't support them (e.g. Docker volumes, network mounts).
+    if sys.platform != "win32":
+        try:
+            os.symlink(source, destination)
+            return
+        except OSError:
+            pass
+
+    if source.is_dir():
+        shutil.copytree(source, destination, symlinks=(sys.platform != "win32"))
+    else:
+        shutil.copy2(source, destination)
+
+
+def _create_importable_pyarrow(pyarrow_pkg, source_dir, install_pyarrow_dir):
     """
-    Entry point for setup.py: update docstrings using pyarrow from build directory.
+    Assemble an importable pyarrow package inside a temporary directory.
 
-    During the build process, pyarrow is not installed in the system Python.
-    We need to temporarily add the build directory to sys.path so we can
-    import pyarrow and extract docstrings from it.
+    During wheel builds the .py sources and compiled binary artifacts live in
+    separate trees (source checkout vs CMake install prefix). This function
+    symlinks (or copies) both into pyarrow_pkg folder so that a plain
+    ``import pyarrow`` works and docstrings can be extracted at build time.
     """
-    stubs_dir, build_lib = Path(stubs_dir), Path(build_lib)
+    source_pyarrow = source_dir / "pyarrow"
+    if not source_pyarrow.exists():
+        raise FileNotFoundError(f"PyArrow source package not found: {source_pyarrow}")
+
+    for source_path in sorted(source_pyarrow.iterdir()):
+        if source_path.suffix == ".py":
+            _link_or_copy(source_path, pyarrow_pkg / source_path.name)
+        elif source_path.is_dir() and not source_path.name.startswith((".", "__")):
+            _link_or_copy(source_path, pyarrow_pkg / source_path.name)
+
+    for artifact in sorted(install_pyarrow_dir.iterdir()):
+        if not artifact.is_file() or artifact.suffix == ".pyi":
+            continue
 
-    sys.path.insert(0, str(build_lib))
-    try:
-        add_docstrings_to_stubs(stubs_dir)
-    finally:
-        sys.path.pop(0)
+        destination = pyarrow_pkg / artifact.name
+        if not destination.exists():
+            _link_or_copy(artifact, destination)
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("stubs_dir", type=Path, help="Path to pyarrow-stubs folder")
+    parser.add_argument("install_prefix", type=Path,
+                        help="CMAKE_INSTALL_PREFIX used by wheel build")
+    parser.add_argument("source_dir", type=Path,
+                        help="PyArrow source directory")
     args = parser.parse_args()
 
-    # Add the directory containing this script's parent (python/) to sys.path
-    # so pyarrow can be imported when running from the python/ directory
-    script_dir = Path(__file__).resolve().parent
-    python_dir = script_dir.parent
-    sys.path.insert(0, str(python_dir))
-    add_docstrings_to_stubs(args.stubs_dir.resolve())
+    install_prefix = args.install_prefix.resolve()
+    source_dir = args.source_dir.resolve()
+    install_pyarrow_dir = install_prefix / "pyarrow"
+    if not install_pyarrow_dir.exists():
+        install_pyarrow_dir = install_prefix
+
+    if not any(install_pyarrow_dir.rglob("*.pyi")):
+        print("No .pyi files found in install tree, skipping docstring injection")
+        sys.exit(0)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        pyarrow_pkg = Path(tmpdir) / "pyarrow"
+        pyarrow_pkg.mkdir()
+        _create_importable_pyarrow(pyarrow_pkg, source_dir, install_pyarrow_dir)
+
+        sys.path.insert(0, tmpdir)
+        try:
+            add_docstrings_to_stubs(install_pyarrow_dir)
+        finally:
+            sys.path.pop(0)

Reply via email to