This is an automated email from the ASF dual-hosted git repository.
potiuk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/main by this push:
new 4cf176e6a4f Add Git Sparse Checkout to Git Dag Bundle (#67047)
4cf176e6a4f is described below
commit 4cf176e6a4fd29062e8e8f00f40c8d5cdc2471d1
Author: Jens Scheffler <[email protected]>
AuthorDate: Sun May 17 22:47:43 2026 +0200
Add Git Sparse Checkout to Git Dag Bundle (#67047)
* Add Git Sparse Checkout to Git Dag Bundle
* CoPilot Feedback
Co-authored-by: Copilot Autofix powered by AI <[email protected]>
* Add test from CoPilot Feedback
---------
Co-authored-by: Copilot Autofix powered by AI <[email protected]>
---
providers/git/docs/bundles/index.rst | 7 ++-
.../git/src/airflow/providers/git/bundles/git.py | 16 ++++-
providers/git/tests/unit/git/bundles/test_git.py | 69 +++++++++++++++++++++-
3 files changed, 86 insertions(+), 6 deletions(-)
diff --git a/providers/git/docs/bundles/index.rst b/providers/git/docs/bundles/index.rst
index f9e9efe9a17..d551235bb86 100644
--- a/providers/git/docs/bundles/index.rst
+++ b/providers/git/docs/bundles/index.rst
@@ -34,9 +34,10 @@ Example of using the GitDagBundle:
"kwargs": {
"subdir": "dags",
"tracking_ref": "main",
- "refresh_interval": 3600
- "submodules": False,
- "prune_dotgit_folder": True
+ "refresh_interval": 3600,
+ "submodules": false,
+ "prune_dotgit_folder": true,
+ "sparse_dirs": ["dags", "includes"]
}
}
]'
diff --git a/providers/git/src/airflow/providers/git/bundles/git.py b/providers/git/src/airflow/providers/git/bundles/git.py
index 33d3c9979d5..2f03ed48cf1 100644
--- a/providers/git/src/airflow/providers/git/bundles/git.py
+++ b/providers/git/src/airflow/providers/git/bundles/git.py
@@ -52,6 +52,11 @@ class GitDagBundle(BaseDagBundle):
to share the object directory via hard links, but if you have a lot of current versions
running, or an especially large git repo leaving this as True will save some disk space
at the expense of `git` operations not working in the bundle that Tasks run from.
+ :param sparse_dirs: List of directories to include when cloning the repository. Needs git version 2.25 or higher.
+
+ The sparse checkout will only produce the files and subfolders of the list of provided directories
+ into the working tree. The "cone" mode is used, which means that effective and fast filtering can be made.
+ See https://git-scm.com/docs/git-sparse-checkout for more information on the sparse checkout feature.
"""
supports_versioning = True
@@ -65,6 +70,7 @@ class GitDagBundle(BaseDagBundle):
repo_url: str | None = None,
submodules: bool = False,
prune_dotgit_folder: bool = True,
+ sparse_dirs: list[str] | None = None,
**kwargs,
) -> None:
super().__init__(**kwargs)
@@ -78,7 +84,7 @@ class GitDagBundle(BaseDagBundle):
self.git_conn_id = git_conn_id
self.repo_url = repo_url
self.submodules = submodules
-
+ self.sparse_dirs = sparse_dirs
# Force prune to False if submodules are used, otherwise git links break
if self.submodules:
self.prune_dotgit_folder = False
@@ -93,6 +99,7 @@ class GitDagBundle(BaseDagBundle):
versions_path=self.versions_dir,
git_conn_id=self.git_conn_id,
submodules=self.submodules,
+ sparse_dirs=self.sparse_dirs,
)
self._log.debug("bundle configured")
@@ -247,7 +254,14 @@ class GitDagBundle(BaseDagBundle):
Repo.clone_from(
url=self.bare_repo_path,
to_path=self.repo_path,
+ multi_options=["--sparse", "--no-checkout"] if self.sparse_dirs else None,
)
+ if self.sparse_dirs:
+ self._log.info("Setting up sparse checkout")
+ repo = Repo(self.repo_path)
+ repo.git.sparse_checkout("init", "--cone")
+ repo.git.sparse_checkout("set", *self.sparse_dirs)
+ repo.git.checkout(self.tracking_ref)
else:
self._log.debug("repo exists", repo_path=self.repo_path)
self.repo = Repo(self.repo_path)
diff --git a/providers/git/tests/unit/git/bundles/test_git.py b/providers/git/tests/unit/git/bundles/test_git.py
index 6297ab675d9..43808ffdf09 100644
--- a/providers/git/tests/unit/git/bundles/test_git.py
+++ b/providers/git/tests/unit/git/bundles/test_git.py
@@ -21,6 +21,7 @@ import json
import os
import re
import types
+from pathlib import Path
from unittest import mock
from unittest.mock import patch
@@ -54,8 +55,8 @@ CONN_NO_REPO_URL = "my_git_conn_no_repo_url"
@pytest.fixture
-def git_repo(tmp_path_factory):
- directory = tmp_path_factory.mktemp("repo")
+def git_repo(tmp_path_factory) -> tuple[Path, Repo]:
+ directory: Path = tmp_path_factory.mktemp("repo")
repo = Repo.init(directory)
repo.git.symbolic_ref("HEAD", f"refs/heads/{GIT_DEFAULT_BRANCH}")
file_path = directory / "test_dag.py"
@@ -839,6 +840,70 @@ class TestGitDagBundle:
assert str(bundle.path).endswith(subdir)
assert {"some_new_file.py"} == files_in_repo
+ @mock.patch("airflow.providers.git.bundles.git.GitHook")
+ def test_sparse_checkout(self, mock_githook, git_repo):
+ repo_path, repo = git_repo
+ mock_githook.return_value.repo_url = repo_path
+
+ subdir = "some/subdir"
+ subdir_path = repo_path / subdir
+ subdir_path.mkdir(parents=True)
+ file_path = subdir_path / "some_relevant_file.py"
+ with open(file_path, "w") as f:
+ f.write("hello world")
+ otherdir = "other/dir"
+ otherdir_path = repo_path / otherdir
+ otherdir_path.mkdir(parents=True)
+ otherfile_path = otherdir_path / "some_other_file.py"
+ with open(otherfile_path, "w") as f:
+ f.write("hello world")
+
+ repo.index.add([file_path, otherfile_path])
+ repo.index.commit("Other commit")
+
+ bundle = GitDagBundle(
+ name="test-sparse",
+ git_conn_id=CONN_HTTPS,
+ tracking_ref=GIT_DEFAULT_BRANCH,
+ sparse_dirs=[subdir],
+ )
+ bundle.initialize()
+
+ files_in_repo = {f.name for f in bundle.path.glob("**/*.py") if f.is_file()}
+ assert "some_other_file.py" not in files_in_repo
+ assert "some_relevant_file.py" in files_in_repo
+
+ @mock.patch("airflow.providers.git.bundles.git.GitHook")
+ def test_sparse_checkout_with_version_prunes_dotgit(self, mock_githook, git_repo):
+ repo_path, repo = git_repo
+ mock_githook.return_value.repo_url = repo_path
+ subdir = "some/subdir"
+ subdir_path = repo_path / subdir
+ subdir_path.mkdir(parents=True)
+ file_path = subdir_path / "some_relevant_file.py"
+ with open(file_path, "w") as f:
+ f.write("hello world")
+ otherdir = "other/dir"
+ otherdir_path = repo_path / otherdir
+ otherdir_path.mkdir(parents=True)
+ otherfile_path = otherdir_path / "some_other_file.py"
+ with open(otherfile_path, "w") as f:
+ f.write("hello world")
+ repo.index.add([file_path, otherfile_path])
+ commit = repo.index.commit("Other commit")
+ bundle = GitDagBundle(
+ name="test-sparse-version",
+ git_conn_id=CONN_HTTPS,
+ tracking_ref=GIT_DEFAULT_BRANCH,
+ version=commit.hexsha,
+ sparse_dirs=[subdir],
+ )
+ bundle.initialize()
+ files_in_repo = {f.name for f in bundle.path.glob("**/*.py") if f.is_file()}
+ assert "some_other_file.py" not in files_in_repo
+ assert "some_relevant_file.py" in files_in_repo
+ assert not (bundle.path / ".git").exists()
+
def test_raises_when_no_repo_url(self):
bundle = GitDagBundle(
name="test",