This is an automated email from the ASF dual-hosted git repository.

potiuk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git


The following commit(s) were added to refs/heads/main by this push:
     new 4cf176e6a4f Add Git Sparse Checkout to Git Dag Bundle (#67047)
4cf176e6a4f is described below

commit 4cf176e6a4fd29062e8e8f00f40c8d5cdc2471d1
Author: Jens Scheffler <[email protected]>
AuthorDate: Sun May 17 22:47:43 2026 +0200

    Add Git Sparse Checkout to Git Dag Bundle (#67047)
    
    * Add Git Sparse Checkout to Git Dag Bundle
    
    * CoPilot Feedback
    
    Co-authored-by: Copilot Autofix powered by AI <[email protected]>
    
    * Add test from CoPilot Feedback
    
    ---------
    
    Co-authored-by: Copilot Autofix powered by AI <[email protected]>
---
 providers/git/docs/bundles/index.rst               |  7 ++-
 .../git/src/airflow/providers/git/bundles/git.py   | 16 ++++-
 providers/git/tests/unit/git/bundles/test_git.py   | 69 +++++++++++++++++++++-
 3 files changed, 86 insertions(+), 6 deletions(-)

diff --git a/providers/git/docs/bundles/index.rst b/providers/git/docs/bundles/index.rst
index f9e9efe9a17..d551235bb86 100644
--- a/providers/git/docs/bundles/index.rst
+++ b/providers/git/docs/bundles/index.rst
@@ -34,9 +34,10 @@ Example of using the GitDagBundle:
          "kwargs": {
              "subdir": "dags",
              "tracking_ref": "main",
-             "refresh_interval": 3600
-             "submodules": False,
-             "prune_dotgit_folder": True
+             "refresh_interval": 3600,
+             "submodules": false,
+             "prune_dotgit_folder": true,
+             "sparse_dirs": ["dags", "includes"]
          }
      }
     ]'
diff --git a/providers/git/src/airflow/providers/git/bundles/git.py b/providers/git/src/airflow/providers/git/bundles/git.py
index 33d3c9979d5..2f03ed48cf1 100644
--- a/providers/git/src/airflow/providers/git/bundles/git.py
+++ b/providers/git/src/airflow/providers/git/bundles/git.py
@@ -52,6 +52,11 @@ class GitDagBundle(BaseDagBundle):
         to share the object directory via hard links, but if you have a lot of current versions
         running, or an especially large git repo leaving this as True will save some disk space
         at the expense of `git` operations not working in the bundle that Tasks run from.
+    :param sparse_dirs: List of directories to include when cloning the repository. Needs git version 2.25 or higher.
+
+        The sparse checkout will only produce the files and subfolders of the list of provided directories
+        into the working tree. The "cone" mode is used, which means that effective and fast filtering can be made.
+        See https://git-scm.com/docs/git-sparse-checkout for more information on the sparse checkout feature.
     """
 
     supports_versioning = True
@@ -65,6 +70,7 @@ class GitDagBundle(BaseDagBundle):
         repo_url: str | None = None,
         submodules: bool = False,
         prune_dotgit_folder: bool = True,
+        sparse_dirs: list[str] | None = None,
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
@@ -78,7 +84,7 @@ class GitDagBundle(BaseDagBundle):
         self.git_conn_id = git_conn_id
         self.repo_url = repo_url
         self.submodules = submodules
-
+        self.sparse_dirs = sparse_dirs
         # Force prune to False if submodules are used, otherwise git links break
         if self.submodules:
             self.prune_dotgit_folder = False
@@ -93,6 +99,7 @@ class GitDagBundle(BaseDagBundle):
             versions_path=self.versions_dir,
             git_conn_id=self.git_conn_id,
             submodules=self.submodules,
+            sparse_dirs=self.sparse_dirs,
         )
 
         self._log.debug("bundle configured")
@@ -247,7 +254,14 @@ class GitDagBundle(BaseDagBundle):
                 Repo.clone_from(
                     url=self.bare_repo_path,
                     to_path=self.repo_path,
+                    multi_options=["--sparse", "--no-checkout"] if self.sparse_dirs else None,
                 )
+                if self.sparse_dirs:
+                    self._log.info("Setting up sparse checkout")
+                    repo = Repo(self.repo_path)
+                    repo.git.sparse_checkout("init", "--cone")
+                    repo.git.sparse_checkout("set", *self.sparse_dirs)
+                    repo.git.checkout(self.tracking_ref)
             else:
                 self._log.debug("repo exists", repo_path=self.repo_path)
             self.repo = Repo(self.repo_path)
diff --git a/providers/git/tests/unit/git/bundles/test_git.py b/providers/git/tests/unit/git/bundles/test_git.py
index 6297ab675d9..43808ffdf09 100644
--- a/providers/git/tests/unit/git/bundles/test_git.py
+++ b/providers/git/tests/unit/git/bundles/test_git.py
@@ -21,6 +21,7 @@ import json
 import os
 import re
 import types
+from pathlib import Path
 from unittest import mock
 from unittest.mock import patch
 
@@ -54,8 +55,8 @@ CONN_NO_REPO_URL = "my_git_conn_no_repo_url"
 
 
 @pytest.fixture
-def git_repo(tmp_path_factory):
-    directory = tmp_path_factory.mktemp("repo")
+def git_repo(tmp_path_factory) -> tuple[Path, Repo]:
+    directory: Path = tmp_path_factory.mktemp("repo")
     repo = Repo.init(directory)
     repo.git.symbolic_ref("HEAD", f"refs/heads/{GIT_DEFAULT_BRANCH}")
     file_path = directory / "test_dag.py"
@@ -839,6 +840,70 @@ class TestGitDagBundle:
         assert str(bundle.path).endswith(subdir)
         assert {"some_new_file.py"} == files_in_repo
 
+    @mock.patch("airflow.providers.git.bundles.git.GitHook")
+    def test_sparse_checkout(self, mock_githook, git_repo):
+        repo_path, repo = git_repo
+        mock_githook.return_value.repo_url = repo_path
+
+        subdir = "some/subdir"
+        subdir_path = repo_path / subdir
+        subdir_path.mkdir(parents=True)
+        file_path = subdir_path / "some_relevant_file.py"
+        with open(file_path, "w") as f:
+            f.write("hello world")
+        otherdir = "other/dir"
+        otherdir_path = repo_path / otherdir
+        otherdir_path.mkdir(parents=True)
+        otherfile_path = otherdir_path / "some_other_file.py"
+        with open(otherfile_path, "w") as f:
+            f.write("hello world")
+
+        repo.index.add([file_path, otherfile_path])
+        repo.index.commit("Other commit")
+
+        bundle = GitDagBundle(
+            name="test-sparse",
+            git_conn_id=CONN_HTTPS,
+            tracking_ref=GIT_DEFAULT_BRANCH,
+            sparse_dirs=[subdir],
+        )
+        bundle.initialize()
+
+        files_in_repo = {f.name for f in bundle.path.glob("**/*.py") if f.is_file()}
+        assert "some_other_file.py" not in files_in_repo
+        assert "some_relevant_file.py" in files_in_repo
+
+    @mock.patch("airflow.providers.git.bundles.git.GitHook")
+    def test_sparse_checkout_with_version_prunes_dotgit(self, mock_githook, git_repo):
+        repo_path, repo = git_repo
+        mock_githook.return_value.repo_url = repo_path
+        subdir = "some/subdir"
+        subdir_path = repo_path / subdir
+        subdir_path.mkdir(parents=True)
+        file_path = subdir_path / "some_relevant_file.py"
+        with open(file_path, "w") as f:
+            f.write("hello world")
+        otherdir = "other/dir"
+        otherdir_path = repo_path / otherdir
+        otherdir_path.mkdir(parents=True)
+        otherfile_path = otherdir_path / "some_other_file.py"
+        with open(otherfile_path, "w") as f:
+            f.write("hello world")
+        repo.index.add([file_path, otherfile_path])
+        commit = repo.index.commit("Other commit")
+        bundle = GitDagBundle(
+            name="test-sparse-version",
+            git_conn_id=CONN_HTTPS,
+            tracking_ref=GIT_DEFAULT_BRANCH,
+            version=commit.hexsha,
+            sparse_dirs=[subdir],
+        )
+        bundle.initialize()
+        files_in_repo = {f.name for f in bundle.path.glob("**/*.py") if f.is_file()}
+        assert "some_other_file.py" not in files_in_repo
+        assert "some_relevant_file.py" in files_in_repo
+        assert not (bundle.path / ".git").exists()
+
     def test_raises_when_no_repo_url(self):
         bundle = GitDagBundle(
             name="test",

Reply via email to