This is an automated email from the ASF dual-hosted git repository.

gopidesu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow-site-archive.git


The following commit(s) were added to refs/heads/main by this push:
     new d7ac68fb67 Add S3/Github/S3 sync scripts and workflows (#1)
d7ac68fb67 is described below

commit d7ac68fb67d0aa3efecfa7918d572052356cc529
Author: GPK <[email protected]>
AuthorDate: Thu May 1 14:40:01 2025 +0100

    Add S3/Github/S3 sync scripts and workflows (#1)
    
    * Add s3 to github sync script and workflow
    
    * Add bulk sync scripts and last-commit sync scripts
    
    * Add bulk sync scripts and last-commit sync scripts
---
 .github/workflows/github-to-s3.yml |  83 +++++++++++++++++++++
 .github/workflows/s3-to-github.yml |  86 ++++++++++++++++++++++
 .gitignore                         |   4 ++
 README.md                          |  31 ++++++++
 scripts/__init__.py                |   0
 scripts/github_to_s3.py            | 143 +++++++++++++++++++++++++++++++++++++
 scripts/s3_to_github.py            |  89 +++++++++++++++++++++++
 scripts/transfer_utils.py          |  95 ++++++++++++++++++++++++
 8 files changed, 531 insertions(+)

diff --git a/.github/workflows/github-to-s3.yml 
b/.github/workflows/github-to-s3.yml
new file mode 100644
index 0000000000..7d44688a70
--- /dev/null
+++ b/.github/workflows/github-to-s3.yml
@@ -0,0 +1,83 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+---
+name: Sync GitHub to S3
+on: # yamllint disable-line rule:truthy
+  workflow_dispatch:
+    inputs:
+      destination-location:
+        description: "The destination location in S3"
+        required: false
+        default: "s3://staging-docs-airflow-apache-org/docs/"
+        type: string
+      local-path:
+        description: "The location of the local folder to sync"
+        required: false
+        default: "./docs-archive"
+        type: string
+      document-folder:
+        description: "Provide any specific package document folder to sync"
+        required: false
+        default: "NO_DOCS"
+        type: string
+      sync-type:
+        description: "Perform a full sync or just sync the last commit"
+        required: false
+        default: "last_commit"
+        type: choice
+        options:
+          - last_commit
+          - full_sync
+jobs:
+  github-to-s3:
+    name: GitHub to S3
+    runs-on: ubuntu-latest
+    steps:
+      - name: Setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+
+      - name: Install AWS CLI v2
+        run: |
+          curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o 
/tmp/awscliv2.zip
+          unzip -q /tmp/awscliv2.zip -d /tmp
+          rm /tmp/awscliv2.zip
+          sudo /tmp/aws/install --update
+          rm -rf /tmp/aws/
+
+      - name: Configure AWS credentials
+        uses: 
aws-actions/configure-aws-credentials@010d0da01d0b5a38af31e9c3470dbfdabdecca3a  
# v4.0.1
+        with:
+          aws-access-key-id: ${{ secrets.DOCS_AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.DOCS_AWS_SECRET_ACCESS_KEY }}
+          aws-region: us-east-2
+
+      - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )"
+        uses: actions/checkout@v4
+        with:
+          lfs: true
+          fetch-depth: 1
+
+      - name: "Syncing ${{ github.ref }} ( ${{ github.sha }} )"
+        env:
+          COMMIT_SHA: ${{ github.sha }}
+        run: |
+          python3 -m pip install uv
+          uv run ./scripts/github_to_s3.py --bucket-path 
${{inputs.destination-location}} --local-path ${{inputs.local-path}} \
+          --document-folder ${{inputs.document-folder}} --commit-sha 
${COMMIT_SHA} --sync-type ${{ inputs.sync-type }}
\ No newline at end of file
diff --git a/.github/workflows/s3-to-github.yml 
b/.github/workflows/s3-to-github.yml
new file mode 100644
index 0000000000..2648876f59
--- /dev/null
+++ b/.github/workflows/s3-to-github.yml
@@ -0,0 +1,86 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+---
+name: Sync S3 to GitHub
+on: # yamllint disable-line rule:truthy
+  workflow_dispatch:
+    inputs:
+      source-location:
+        description: "The source location in S3"
+        required: false
+        default: "s3://staging-docs-airflow-apache-org/docs/"
+        type: string
+      local-destination:
+        description: "The local destination location"
+        required: false
+        default: "./docs-archive"
+        type: string
+      document-folder:
+        description: "Provide any specific package document folder to sync"
+        required: false
+        default: "NO_DOCS"
+        type: string
+jobs:
+  s3-to-github:
+    name: S3 to GitHub
+    runs-on: ubuntu-latest
+    steps:
+      - name: Setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+
+      - name: Install AWS CLI v2
+        run: |
+          curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o 
/tmp/awscliv2.zip
+          unzip -q /tmp/awscliv2.zip -d /tmp
+          rm /tmp/awscliv2.zip
+          sudo /tmp/aws/install --update
+          rm -rf /tmp/aws/
+
+      - name: Configure AWS credentials
+        uses: 
aws-actions/configure-aws-credentials@010d0da01d0b5a38af31e9c3470dbfdabdecca3a  
# v4.0.1
+        with:
+          aws-access-key-id: ${{ secrets.DOCS_AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.DOCS_AWS_SECRET_ACCESS_KEY }}
+          aws-region: us-east-2
+
+      - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )"
+        uses: actions/checkout@v4
+        with:
+          lfs: true
+          fetch-depth: 1
+
+      - name: "Check space available"
+        run: df -h
+
+      - name: Syncing
+        run: |
+          python3 -m pip install uv
+          uv run ./scripts/s3_to_github.py --bucket-path 
${{inputs.source-location}} --local-path ${{inputs.local-destination}} 
--document-folder ${{inputs.document-folder}}
+
+      - name: Committing changes
+        run: |
+          echo "Running git config"
+          git config user.name "GitHub Actions"
+          git config user.email "[email protected]"
+          echo "Running git add"
+          git add .
+          echo "Running git commit"
+          git commit -m "Sync S3 to GitHub" || echo "No changes to commit"
+          git push --force origin main
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000..e42e468883
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+__pycache__/
+.idea/*
+*.iml
+.venv/*
\ No newline at end of file
diff --git a/README.md b/README.md
index 6e965a13c5..c91b05afdc 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,32 @@
 # airflow-site-archive
+
+### Documentation Syncing Process
+### S3 To GitHub
+**Sync S3 to Github**: Use the `scripts/s3_to_github.py` script to download 
the latest documentation from S3 to your ./docs-archive folder.
+It has the following command line arguments:
+- `--bucket-path`: The S3 bucket path where the documentation is stored.
+- `--local-path`: The local path where the documentation will be downloaded.
+- `--document-folder`: The folder in the S3 bucket where the documentation is 
stored (This is optional if any particular 
+                      folder needs to be synced, provide the folder name ex: 
`apache-airflow-providers-amazon`).
+```bash
+uv run ./scripts/s3_to_github.py --bucket-path 
s3://staging-docs-airflow-apache-org/docs/ --local-path ./docs-archive
+```
+
+
+### GitHub To S3
+**Sync Github to S3**: Use the `scripts/github_to_s3.py` script to upload the 
latest documentation from your ./docs-archive folder to S3.
+It has two modes:
+1. **Last commit**: Syncs only last commit changes to S3.
+2. **Full sync**: Syncs all files under `./docs-archive` to S3.
+It has the following command line arguments:
+
+- `--bucket-path`: The S3 bucket path where the documentation will be stored.
+- `--local-path`: The local path where the documentation is stored.
+- `--document-folder`: The folder in the local path where the documentation is 
stored (This is optional if any particular 
+                      folder needs to be synced, provide the folder name ex: 
`apache-airflow-providers-amazon`).
+- `--sync-type`: The type of sync to perform. Can be either `last_commit` or 
`full_sync`.
+- `--commit-sha`: The commit sha to sync to S3. This is only required if the 
sync type is `last_commit`.
+
+```bash
+uv run ./scripts/github_to_s3.py --bucket-path 
s3://staging-docs-airflow-apache-org/docs/ --local-path ./docs-archive 
--sync-type last_commit
+```
\ No newline at end of file
diff --git a/scripts/__init__.py b/scripts/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/scripts/github_to_s3.py b/scripts/github_to_s3.py
new file mode 100644
index 0000000000..2a1f7d133d
--- /dev/null
+++ b/scripts/github_to_s3.py
@@ -0,0 +1,143 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# /// script
+# requires-python = ">=3.11"
+# dependencies = [
+#     "rich",
+#     "boto3",
+# ]
+# ///
+
+import argparse
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+from rich.console import Console
+
+from transfer_utils import CommonTransferUtils
+
+console = Console(width=200, color_system="standard")
+
+class GithubToS3(CommonTransferUtils):
+    def __init__(self, bucket, local_path):
+        super().__init__(bucket, local_path)
+
+    @staticmethod
+    def fetch_last_commit_files(commit_sha, diff_filter="ACM"):
+        console.print(f"[blue] Fetching files from last commit {commit_sha} 
[/]")
+        cmd = [
+            "git",
+            "diff-tree",
+            "--no-commit-id",
+            "--name-only",
+            "-r",
+            commit_sha,
+            f"--diff-filter={diff_filter}"
+        ]
+        result = subprocess.run(cmd, check=False, capture_output=True, 
text=True)
+
+        if result.returncode != 0:
+            console.print(
+                f"[warning] Error when running diff-tree command 
[/]\n{result.stdout}\n{result.stderr}"
+            )
+            return []
+        return result.stdout.splitlines() if result.stdout else []
+
+    def sync_last_commit_files(self, commit_sha: str):
+        '''
+        There are two parts here.
+        1. When any file gets removed under docs folder, we will remove from 
target location
+        2. When any file gets added/changed/modified under docs folder, we 
will copy from source to target location
+        '''
+        # Fetching `d` excludes deleted files
+        # Fetching `D` includes deleted files
+
+        files_cp_required = self.fetch_last_commit_files(commit_sha, 
diff_filter="d")
+        files_del_required = self.fetch_last_commit_files(commit_sha, 
diff_filter="D")
+
+        files_cp_required_under_docs = [f for f in files_cp_required if 
f.startswith("docs-archive/")]
+        files_del_required_required_under_docs = [f for f in 
files_del_required if f.startswith("docs-archive/")]
+        copy_files_pool_args = []
+        delete_files_pool_args = []
+
+        for file in files_cp_required_under_docs:
+            destination_prefix = file.replace("docs-archive/", "docs/")
+            dest = f"s3://{self.bucket_name}/{destination_prefix}"
+            copy_files_pool_args.append((file, dest))
+
+        for file in files_del_required_required_under_docs:
+            destination_prefix = file.replace("docs-archive/", "docs/")
+            dest = f"s3://{self.bucket_name}/{destination_prefix}"
+            delete_files_pool_args.append(dest)
+
+        self.run_with_pool(self.remove, delete_files_pool_args)
+        self.run_with_pool(self.copy, copy_files_pool_args)
+
+    def full_sync(self):
+        console.print(f"[blue] Syncing all files from {self.local_path} to 
{self.bucket_name} [/]")
+        list_of_folders = os.listdir(self.local_path)
+        pool_args = []
+        for folder in list_of_folders:
+            source = os.path.join(self.local_path, folder)
+            dest = f"s3://{self.bucket_name}/{self.prefix}".rstrip("/") + "/" 
+ folder
+            pool_args.append((source, dest))
+
+        self.run_with_pool(self.sync, pool_args)
+
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Sync GitHub to S3")
+    parser.add_argument("--bucket-path", required=True, help="S3 bucket name 
with path")
+    parser.add_argument("--local-path", required=True, help="local path to 
sync")
+    parser.add_argument("--document-folder", help="Document folder to sync", 
default="")
+    parser.add_argument("--commit-sha", help="Commit SHA to sync", default="")
+    parser.add_argument("--sync-type", help="Sync type", default="last_commit")
+
+    args = parser.parse_args()
+
+    syncer = GithubToS3(bucket=args.bucket_path, local_path=args.local_path)
+    syncer.check_bucket()
+
+    document_folder = args.document_folder
+
+    if document_folder and document_folder != "NO_DOCS":
+        full_local_path = Path(f"{args.local_path}/{document_folder}")
+        if full_local_path.exists():
+            console.print(f"[blue] Document folder {document_folder} exists in 
bucket {args.bucket_path}.[/]")
+
+            destination = 
f"s3://{syncer.bucket_name}/{syncer.prefix}".rstrip("/") + "/" + document_folder
+            syncer.sync(source=full_local_path, destination=destination)
+            sys.exit(0)
+        else:
+            console.print(f"[red] Document folder {document_folder} does not 
exist in github {args.local_path}.[/]")
+            sys.exit(1)
+
+    if args.sync_type == "last_commit" and args.commit_sha:
+        console.print(f"[blue] Syncing last commit {args.commit_sha} from 
{args.local_path} [/]")
+        syncer.sync_last_commit_files(args.commit_sha)
+        sys.exit(0)
+
+    if args.sync_type == "full_sync":
+        syncer.full_sync()
+        sys.exit(0)
+
+    console.print(f"[red] Invalid sync type {args.sync_type} [/]")
+
diff --git a/scripts/s3_to_github.py b/scripts/s3_to_github.py
new file mode 100644
index 0000000000..0e92634842
--- /dev/null
+++ b/scripts/s3_to_github.py
@@ -0,0 +1,89 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# /// script
+# requires-python = ">=3.11"
+# dependencies = [
+#     "rich",
+#     "boto3",
+# ]
+# ///
+
+from rich.console import Console
+
+from transfer_utils import CommonTransferUtils
+
+console = Console(width=200, color_system="standard")
+
+import argparse
+import sys
+
+
+class S3TOGithub(CommonTransferUtils):
+
+    def __init__(self, bucket, local_path):
+        super().__init__(bucket, local_path)
+
+    def verify_document_folder(self, document_folder):
+        response= self.s3_client.list_objects_v2(
+            Bucket=self.bucket_name,
+            Prefix=self.prefix.rstrip("/") + "/" + document_folder,
+        )
+        return response["KeyCount"] > 0
+
+    def sync_to_s3(self):
+        console.print("[blue] Syncing files from S3 to GitHub...[/]")
+        prefixes = self.get_list_of_folders()
+        pool_args = []
+        for pref in prefixes:
+            source_bucket_path = f"s3://{self.bucket_name}/{pref}"
+
+            # we want to store the files in the github under docs-archive/
+            destination = self.local_path + pref.replace("docs/", "")
+            pool_args.append((source_bucket_path, destination))
+
+        self.run_with_pool(self.sync, pool_args)
+
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Sync S3 to GitHub")
+    parser.add_argument("--bucket-path", required=True, help="S3 bucket name 
with path")
+    parser.add_argument("--local-path", required=True, help="local path to 
sync")
+    parser.add_argument("--document-folder", help="Document folder to sync", 
default="")
+
+    args = parser.parse_args()
+
+    syncer = S3TOGithub(bucket=args.bucket_path, local_path=args.local_path)
+    syncer.check_bucket()
+    document_folder = args.document_folder
+
+    if document_folder and document_folder != "NO_DOCS":
+        if syncer.verify_document_folder(document_folder):
+            console.print(f"[blue] Document folder {document_folder} exists in 
bucket {args.bucket_path}.[/]")
+            source_prefix = syncer.prefix.rstrip("/") + "/" + document_folder
+            source = 
f"s3://{syncer.bucket_name}/{syncer.prefix}{document_folder}"
+            syncer.sync(source=source, destination=args.local_path)
+            sys.exit(0)
+        else:
+            console.print(f"[red] Document folder {document_folder} does not 
exist in bucket {args.bucket_path}.[/]")
+            sys.exit(1)
+
+    syncer.sync_to_s3()
+
+
+
diff --git a/scripts/transfer_utils.py b/scripts/transfer_utils.py
new file mode 100644
index 0000000000..5dc058967a
--- /dev/null
+++ b/scripts/transfer_utils.py
@@ -0,0 +1,95 @@
+import subprocess
+import sys
+from functools import cached_property
+from multiprocessing import Pool
+
+import boto3
+import urllib3
+from rich.console import Console
+
+console = Console(width=200, color_system="standard")
+
+class CommonTransferUtils:
+    s3_client = boto3.client('s3')
+
+    def __init__(self, bucket, local_path):
+        self.bucket = bucket
+        self.local_path = local_path.rstrip("/") + "/"
+
+    @cached_property
+    def bucket_name(self):
+        try:
+            bucket = urllib3.util.parse_url(self.bucket).netloc
+            return bucket
+        except Exception as e:
+            console.print(f"[red] Error: {e}[/]")
+            sys.exit(1)
+
+    @cached_property
+    def prefix(self):
+        try:
+            pref = urllib3.util.parse_url(self.bucket).path
+            if pref.startswith('/'):
+                pref = pref[1:]
+            return pref
+        except Exception as e:
+            console.print(f"[red] Error: {e}[/]")
+            sys.exit(1)
+
+    def check_bucket(self):
+        try:
+            response = self.s3_client.head_bucket(Bucket=self.bucket_name)
+            if response['ResponseMetadata']['HTTPStatusCode'] == 200:
+                console.print(f"[blue] Bucket {self.bucket} exists.[/]")
+        except Exception as e:
+            console.print(f"[red] Error: {e}[/]")
+            sys.exit(1)
+
+    def get_list_of_folders(self):
+        folders = []
+        try:
+            response = self.s3_client.list_objects_v2(
+                Bucket=self.bucket_name,
+                Prefix=self.prefix,
+                Delimiter='/'
+            )
+            if 'CommonPrefixes' in response:
+                for cur_prefix in response['CommonPrefixes']:
+                    folders.append(cur_prefix['Prefix'])
+            return folders
+        except Exception as e:
+            console.print(f"[yellow] Error: {e}[/]")
+            return []
+
+    def sync(self, source, destination):
+
+        console.print(f"[blue] Syncing {source} to {destination} [/]")
+
+        subprocess.run(
+            ["aws", "s3", "sync", "--delete", source, destination], 
capture_output=True, text=True, check=True
+        )
+        console.print(f"[blue] Sync completed for {source} to {destination} 
[/]")
+
+    @staticmethod
+    def run_with_pool(func, args):
+
+        with Pool(processes=4) as pool:
+            pool.starmap(func, args)
+
+    @staticmethod
+    def copy(source, destination):
+        console.print(f"[blue] Copying {source} to {destination} [/]")
+
+        subprocess.run(
+            ["aws", "s3", "cp", source, destination], capture_output=True, 
text=True, check=True
+        )
+        console.print(f"[blue] Copy completed for {source} to {destination} 
[/]")
+
+    @staticmethod
+    def remove(file_to_delete):
+        console.print(f"[blue] Deleting {file_to_delete} [/]")
+
+        subprocess.run(
+            ["aws", "s3", "rm", file_to_delete], capture_output=True, 
text=True, check=True
+        )
+        console.print(f"[blue] Delete completed for {file_to_delete} [/]")

Reply via email to