This is an automated email from the ASF dual-hosted git repository.
gopidesu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow-site-archive.git
The following commit(s) were added to refs/heads/main by this push:
new d7ac68fb67 Add S3/Github/S3 sync scripts and workflows (#1)
d7ac68fb67 is described below
commit d7ac68fb67d0aa3efecfa7918d572052356cc529
Author: GPK <[email protected]>
AuthorDate: Thu May 1 14:40:01 2025 +0100
Add S3/Github/S3 sync scripts and workflows (#1)
* Add s3 to github sync script and workflow
* Add bulk sync scripts and last-commit sync scripts
* Add bulk sync scripts and last-commit sync scripts
---
.github/workflows/github-to-s3.yml | 83 +++++++++++++++++++++
.github/workflows/s3-to-github.yml | 86 ++++++++++++++++++++++
.gitignore | 4 ++
README.md | 31 ++++++++
scripts/__init__.py | 0
scripts/github_to_s3.py | 143 +++++++++++++++++++++++++++++++++++++
scripts/s3_to_github.py | 89 +++++++++++++++++++++++
scripts/transfer_utils.py | 95 ++++++++++++++++++++++++
8 files changed, 531 insertions(+)
diff --git a/.github/workflows/github-to-s3.yml
b/.github/workflows/github-to-s3.yml
new file mode 100644
index 0000000000..7d44688a70
--- /dev/null
+++ b/.github/workflows/github-to-s3.yml
@@ -0,0 +1,83 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+---
+name: Sync GitHub to S3
+on: # yamllint disable-line rule:truthy
+ workflow_dispatch:
+ inputs:
+ destination-location:
+ description: "The destination location in S3"
+ required: false
+ default: "s3://staging-docs-airflow-apache-org/docs/"
+ type: string
+ local-path:
+ description: "The location of the local folder to sync"
+ required: false
+ default: "./docs-archive"
+ type: string
+ document-folder:
+ description: "Provide any specific package document folder to sync"
+ required: false
+ default: "NO_DOCS"
+ type: string
+ sync-type:
+ description: "Perform a full sync or just sync the last commit"
+ required: false
+ default: "last_commit"
+ type: choice
+ options:
+ - last_commit
+ - full_sync
+jobs:
+ github-to-s3:
+ name: GitHub to S3
+ runs-on: ubuntu-latest
+ steps:
+ - name: Setup Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: '3.11'
+
+ - name: Install AWS CLI v2
+ run: |
+ curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o
/tmp/awscliv2.zip
+ unzip -q /tmp/awscliv2.zip -d /tmp
+ rm /tmp/awscliv2.zip
+ sudo /tmp/aws/install --update
+ rm -rf /tmp/aws/
+
+ - name: Configure AWS credentials
+ uses:
aws-actions/configure-aws-credentials@010d0da01d0b5a38af31e9c3470dbfdabdecca3a
# v4.0.1
+ with:
+ aws-access-key-id: ${{ secrets.DOCS_AWS_ACCESS_KEY_ID }}
+ aws-secret-access-key: ${{ secrets.DOCS_AWS_SECRET_ACCESS_KEY }}
+ aws-region: us-east-2
+
+ - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )"
+ uses: actions/checkout@v4
+ with:
+ lfs: true
+ fetch-depth: 1
+
+ - name: "Syncing ${{ github.ref }} ( ${{ github.sha }} )"
+ env:
+ COMMIT_SHA: ${{ github.sha }}
+ run: |
+ python3 -m pip install uv
+ uv run ./scripts/github_to_s3.py --bucket-path
${{inputs.destination-location}} --local-path ${{inputs.local-path}} \
+ --document-folder ${{inputs.document-folder}} --commit-sha
${COMMIT_SHA} --sync-type ${{ inputs.sync-type }}
\ No newline at end of file
diff --git a/.github/workflows/s3-to-github.yml
b/.github/workflows/s3-to-github.yml
new file mode 100644
index 0000000000..2648876f59
--- /dev/null
+++ b/.github/workflows/s3-to-github.yml
@@ -0,0 +1,86 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+---
+name: Sync S3 to GitHub
+on: # yamllint disable-line rule:truthy
+ workflow_dispatch:
+ inputs:
+ source-location:
+        description: "The source location in S3"
+ required: false
+ default: "s3://staging-docs-airflow-apache-org/docs/"
+ type: string
+ local-destination:
+ description: "The local destination location"
+ required: false
+ default: "./docs-archive"
+ type: string
+ document-folder:
+ description: "Provide any specific package document folder to sync"
+ required: false
+ default: "NO_DOCS"
+ type: string
+jobs:
+ s3-to-github:
+ name: S3 to GitHub
+ runs-on: ubuntu-latest
+ steps:
+ - name: Setup Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: '3.11'
+
+ - name: Install AWS CLI v2
+ run: |
+ curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o
/tmp/awscliv2.zip
+ unzip -q /tmp/awscliv2.zip -d /tmp
+ rm /tmp/awscliv2.zip
+ sudo /tmp/aws/install --update
+ rm -rf /tmp/aws/
+
+ - name: Configure AWS credentials
+ uses:
aws-actions/configure-aws-credentials@010d0da01d0b5a38af31e9c3470dbfdabdecca3a
# v4.0.1
+ with:
+ aws-access-key-id: ${{ secrets.DOCS_AWS_ACCESS_KEY_ID }}
+ aws-secret-access-key: ${{ secrets.DOCS_AWS_SECRET_ACCESS_KEY }}
+ aws-region: us-east-2
+
+ - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )"
+ uses: actions/checkout@v4
+ with:
+ lfs: true
+ fetch-depth: 1
+
+ - name: "Check space available"
+ run: df -h
+
+ - name: Syncing
+ run: |
+ python3 -m pip install uv
+ uv run ./scripts/s3_to_github.py --bucket-path
${{inputs.source-location}} --local-path ${{inputs.local-destination}}
--document-folder ${{inputs.document-folder}}
+
+      - name: Committing changes
+ run: |
+ echo "Running git config"
+ git config user.name "GitHub Actions"
+ git config user.email "[email protected]"
+ echo "Running git add"
+ git add .
+ echo "Running git commit"
+ git commit -m "Sync S3 to GitHub" || echo "No changes to commit"
+ git push --force origin main
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000..e42e468883
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+__pycache__/
+.idea/*
+*.iml
+.venv/*
\ No newline at end of file
diff --git a/README.md b/README.md
index 6e965a13c5..c91b05afdc 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,32 @@
# airflow-site-archive
+
+### Documentation Syncing Process
+### S3 To GitHub
+**Sync S3 to Github**: Use the `scripts/s3_to_github.py` script to download
the latest documentation from S3 to your ./docs-archive folder.
+It has the following command line arguments:
+- `--bucket-path`: The S3 bucket path where the documentation is stored.
+- `--local-path`: The local path where the documentation will be downloaded.
+- `--document-folder`: The folder in the S3 bucket where the documentation is
stored (optional; if a particular
+  folder needs to be synced, provide the folder name, e.g.
`apache-airflow-providers-amazon`).
+```bash
+uv run ./scripts/s3_to_github.py --bucket-path
s3://staging-docs-airflow-apache-org/docs/ --local-path ./docs-archive
+```
+
+
+### GitHub To S3
+**Sync Github to S3**: Use the `scripts/github_to_s3.py` script to upload the
latest documentation from your ./docs-archive folder to S3.
+It has two modes:
+1. **Last commit**: Syncs only last commit changes to S3.
+2. **Full sync**: Syncs all files under `./docs-archive` to S3.
+It has the following command line arguments:
+
+- `--bucket-path`: The S3 bucket path where the documentation will be stored.
+- `--local-path`: The local path where the documentation is stored.
+- `--document-folder`: The folder in the local path where the documentation is
stored (optional; if a particular
+  folder needs to be synced, provide the folder name, e.g.
`apache-airflow-providers-amazon`).
+- `--sync-type`: The type of sync to perform. Can be either `last_commit` or
`full_sync`.
+- `--commit-sha`: The commit sha to sync to S3. This is only required if the
sync type is `last_commit`.
+
+```bash
+uv run ./scripts/github_to_s3.py --bucket-path
s3://staging-docs-airflow-apache-org/docs/ --local-path ./docs-archive
--sync-type last_commit
+```
\ No newline at end of file
diff --git a/scripts/__init__.py b/scripts/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/scripts/github_to_s3.py b/scripts/github_to_s3.py
new file mode 100644
index 0000000000..2a1f7d133d
--- /dev/null
+++ b/scripts/github_to_s3.py
@@ -0,0 +1,143 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# /// script
+# requires-python = ">=3.11"
+# dependencies = [
+# "rich",
+# "boto3",
+# ]
+# ///
+
+import argparse
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+from rich.console import Console
+
+from transfer_utils import CommonTransferUtils
+
+console = Console(width=200, color_system="standard")
+
+class GithubToS3(CommonTransferUtils):
+ def __init__(self, bucket, local_path):
+ super().__init__(bucket, local_path)
+
+ @staticmethod
+ def fetch_last_commit_files(commit_sha, diff_filter="ACM"):
+ console.print(f"[blue] Fetching files from last commit {commit_sha}
[/]")
+ cmd = [
+ "git",
+ "diff-tree",
+ "--no-commit-id",
+ "--name-only",
+ "-r",
+ commit_sha,
+ f"--diff-filter={diff_filter}"
+ ]
+ result = subprocess.run(cmd, check=False, capture_output=True,
text=True)
+
+ if result.returncode != 0:
+ console.print(
+ f"[warning] Error when running diff-tree command
[/]\n{result.stdout}\n{result.stderr}"
+ )
+ return []
+ return result.stdout.splitlines() if result.stdout else []
+
+ def sync_last_commit_files(self, commit_sha: str):
+ '''
+ There are two parts here.
+ 1. When any file gets removed under docs folder, we will remove from
target location
+ 2. When any file gets added/changed/modified under docs folder, we
will copy from source to target location
+ '''
+ # Fetching `d` excludes deleted files
+ # Fetching `D` includes deleted files
+
+ files_cp_required = self.fetch_last_commit_files(commit_sha,
diff_filter="d")
+ files_del_required = self.fetch_last_commit_files(commit_sha,
diff_filter="D")
+
+ files_cp_required_under_docs = [f for f in files_cp_required if
f.startswith("docs-archive/")]
+ files_del_required_required_under_docs = [f for f in
files_del_required if f.startswith("docs-archive/")]
+ copy_files_pool_args = []
+ delete_files_pool_args = []
+
+ for file in files_cp_required_under_docs:
+ destination_prefix = file.replace("docs-archive/", "docs/")
+ dest = f"s3://{self.bucket_name}/{destination_prefix}"
+ copy_files_pool_args.append((file, dest))
+
+ for file in files_del_required_required_under_docs:
+ destination_prefix = file.replace("docs-archive/", "docs/")
+ dest = f"s3://{self.bucket_name}/{destination_prefix}"
+ delete_files_pool_args.append(dest)
+
+ self.run_with_pool(self.remove, delete_files_pool_args)
+ self.run_with_pool(self.copy, copy_files_pool_args)
+
+ def full_sync(self):
+ console.print(f"[blue] Syncing all files from {self.local_path} to
{self.bucket_name} [/]")
+ list_of_folders = os.listdir(self.local_path)
+ pool_args = []
+ for folder in list_of_folders:
+ source = os.path.join(self.local_path, folder)
+ dest = f"s3://{self.bucket_name}/{self.prefix}".rstrip("/") + "/"
+ folder
+ pool_args.append((source, dest))
+
+ self.run_with_pool(self.sync, pool_args)
+
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Sync GitHub to S3")
+ parser.add_argument("--bucket-path", required=True, help="S3 bucket name
with path")
+ parser.add_argument("--local-path", required=True, help="local path to
sync")
+ parser.add_argument("--document-folder", help="Document folder to sync",
default="")
+ parser.add_argument("--commit-sha", help="Commit SHA to sync", default="")
+ parser.add_argument("--sync-type", help="Sync type", default="last_commit")
+
+ args = parser.parse_args()
+
+ syncer = GithubToS3(bucket=args.bucket_path, local_path=args.local_path)
+ syncer.check_bucket()
+
+ document_folder = args.document_folder
+
+ if document_folder and document_folder != "NO_DOCS":
+ full_local_path = Path(f"{args.local_path}/{document_folder}")
+ if full_local_path.exists():
+ console.print(f"[blue] Document folder {document_folder} exists in
bucket {args.bucket_path}.[/]")
+
+ destination =
f"s3://{syncer.bucket_name}/{syncer.prefix}".rstrip("/") + "/" + document_folder
+ syncer.sync(source=full_local_path, destination=destination)
+ sys.exit(0)
+ else:
+ console.print(f"[red] Document folder {document_folder} does not
exist in github {args.local_path}.[/]")
+ sys.exit(1)
+
+ if args.sync_type == "last_commit" and args.commit_sha:
+ console.print(f"[blue] Syncing last commit {args.commit_sha} from
{args.local_path} [/]")
+ syncer.sync_last_commit_files(args.commit_sha)
+ sys.exit(0)
+
+ if args.sync_type == "full_sync":
+ syncer.full_sync()
+ sys.exit(0)
+
+ console.print(f"[red] Invalid sync type {args.sync_type} [/]")
+
diff --git a/scripts/s3_to_github.py b/scripts/s3_to_github.py
new file mode 100644
index 0000000000..0e92634842
--- /dev/null
+++ b/scripts/s3_to_github.py
@@ -0,0 +1,89 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# /// script
+# requires-python = ">=3.11"
+# dependencies = [
+# "rich",
+# "boto3",
+# ]
+# ///
+
+from rich.console import Console
+
+from transfer_utils import CommonTransferUtils
+
+console = Console(width=200, color_system="standard")
+
+import argparse
+import sys
+
+
+class S3TOGithub(CommonTransferUtils):
+
+ def __init__(self, bucket, local_path):
+ super().__init__(bucket, local_path)
+
+ def verify_document_folder(self, document_folder):
+ response= self.s3_client.list_objects_v2(
+ Bucket=self.bucket_name,
+ Prefix=self.prefix.rstrip("/") + "/" + document_folder,
+ )
+ return response["KeyCount"] > 0
+
+ def sync_to_s3(self):
+ console.print("[blue] Syncing files from S3 to GitHub...[/]")
+ prefixes = self.get_list_of_folders()
+ pool_args = []
+ for pref in prefixes:
+ source_bucket_path = f"s3://{self.bucket_name}/{pref}"
+
+ # we want to store the files in the github under docs-archive/
+ destination = self.local_path + pref.replace("docs/", "")
+ pool_args.append((source_bucket_path, destination))
+
+ self.run_with_pool(self.sync, pool_args)
+
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Sync S3 to GitHub")
+ parser.add_argument("--bucket-path", required=True, help="S3 bucket name
with path")
+ parser.add_argument("--local-path", required=True, help="local path to
sync")
+ parser.add_argument("--document-folder", help="Document folder to sync",
default="")
+
+ args = parser.parse_args()
+
+ syncer = S3TOGithub(bucket=args.bucket_path, local_path=args.local_path)
+ syncer.check_bucket()
+ document_folder = args.document_folder
+
+ if document_folder and document_folder != "NO_DOCS":
+ if syncer.verify_document_folder(document_folder):
+ console.print(f"[blue] Document folder {document_folder} exists in
bucket {args.bucket_path}.[/]")
+ source_prefix = syncer.prefix.rstrip("/") + "/" + document_folder
+ source =
f"s3://{syncer.bucket_name}/{syncer.prefix}{document_folder}"
+ syncer.sync(source=source, destination=args.local_path)
+ sys.exit(0)
+ else:
+ console.print(f"[red] Document folder {document_folder} does not
exist in bucket {args.bucket_path}.[/]")
+ sys.exit(1)
+
+ syncer.sync_to_s3()
+
+
+
diff --git a/scripts/transfer_utils.py b/scripts/transfer_utils.py
new file mode 100644
index 0000000000..5dc058967a
--- /dev/null
+++ b/scripts/transfer_utils.py
@@ -0,0 +1,95 @@
+import subprocess
+import sys
+from functools import cached_property
+from multiprocessing import Pool
+
+import boto3
+import urllib3
+from rich.console import Console
+
+console = Console(width=200, color_system="standard")
+
+class CommonTransferUtils:
+ s3_client = boto3.client('s3')
+
+ def __init__(self, bucket, local_path):
+ self.bucket = bucket
+ self.local_path = local_path.rstrip("/") + "/"
+
+ @cached_property
+ def bucket_name(self):
+ try:
+ bucket = urllib3.util.parse_url(self.bucket).netloc
+ return bucket
+ except Exception as e:
+ console.print(f"[red] Error: {e}[/]")
+ sys.exit(1)
+
+ @cached_property
+ def prefix(self):
+ try:
+ pref = urllib3.util.parse_url(self.bucket).path
+ if pref.startswith('/'):
+ pref = pref[1:]
+ return pref
+ except Exception as e:
+ console.print(f"[red] Error: {e}[/]")
+ sys.exit(1)
+
+ def check_bucket(self):
+ try:
+ response = self.s3_client.head_bucket(Bucket=self.bucket_name)
+ if response['ResponseMetadata']['HTTPStatusCode'] == 200:
+ console.print(f"[blue] Bucket {self.bucket} exists.[/]")
+ except Exception as e:
+ console.print(f"[red] Error: {e}[/]")
+ sys.exit(1)
+
+ def get_list_of_folders(self):
+ folders = []
+ try:
+ response = self.s3_client.list_objects_v2(
+ Bucket=self.bucket_name,
+ Prefix=self.prefix,
+ Delimiter='/'
+ )
+ if 'CommonPrefixes' in response:
+ for cur_prefix in response['CommonPrefixes']:
+ folders.append(cur_prefix['Prefix'])
+ return folders
+ except Exception as e:
+ console.print(f"[yellow] Error: {e}[/]")
+ return []
+
+ def sync(self, source, destination):
+
+ console.print(f"[blue] Syncing {source} to {destination} [/]")
+
+ subprocess.run(
+ ["aws", "s3", "sync", "--delete", source, destination],
capture_output=True, text=True, check=True
+ )
+ console.print(f"[blue] Sync completed for {source} to {destination}
[/]")
+
+ @staticmethod
+ def run_with_pool(func, args):
+
+ with Pool(processes=4) as pool:
+ pool.starmap(func, args)
+
+ @staticmethod
+ def copy(source, destination):
+ console.print(f"[blue] Copying {source} to {destination} [/]")
+
+ subprocess.run(
+ ["aws", "s3", "cp", source, destination], capture_output=True,
text=True, check=True
+ )
+ console.print(f"[blue] Copy completed for {source} to {destination}
[/]")
+
+ @staticmethod
+ def remove(file_to_delete):
+ console.print(f"[blue] Deleting {file_to_delete} [/]")
+
+ subprocess.run(
+ ["aws", "s3", "rm", file_to_delete], capture_output=True,
text=True, check=True
+ )
+ console.print(f"[blue] Delete completed for {file_to_delete} [/]")