This is an automated email from the ASF dual-hosted git repository.
tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm-site.git
The following commit(s) were added to refs/heads/main by this push:
new 6beeb405bd Download and replace 3rdparty urls (#52)
6beeb405bd is described below
commit 6beeb405bdeba697d522aaaa478073b26e473d0e
Author: Siyuan Feng <[email protected]>
AuthorDate: Fri Jun 6 18:42:28 2025 +0800
Download and replace 3rdparty urls (#52)
---
Gemfile | 2 +-
scripts/download_3rdparty_embeds.py | 314 ++++++++++++++++++++++++++++++++++++
scripts/task_deploy_asf_site.sh | 1 +
tvm | 1 -
4 files changed, 316 insertions(+), 2 deletions(-)
diff --git a/Gemfile b/Gemfile
index dea4159254..8cbb793e03 100644
--- a/Gemfile
+++ b/Gemfile
@@ -7,7 +7,7 @@ source "https://rubygems.org"
#
# This will help ensure the proper Jekyll version is running.
# Happy Jekylling!
-gem "jekyll", "~> 4.1.1"
+gem "jekyll", "~> 4.4.1"
# This is the default theme for new Jekyll sites. You may change this to anything you like.
gem "minima", "~> 2.5"
# If you want to use GitHub Pages, remove the "gem "jekyll"" above and
diff --git a/scripts/download_3rdparty_embeds.py b/scripts/download_3rdparty_embeds.py
new file mode 100644
index 0000000000..c8284b9af7
--- /dev/null
+++ b/scripts/download_3rdparty_embeds.py
@@ -0,0 +1,314 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# pylint: disable=redefined-outer-name, missing-module-docstring
+import argparse
+import hashlib
+import os
+import re
+from html.parser import HTMLParser
+from typing import Callable, Dict, List, Optional, Set, Tuple
+from urllib.parse import urlparse
+
+import requests
+
+# NOTE: This script is invoked by the Makefile via `make htmldeploy`.
+# It is not run every time the docs are built on CI; it runs only during
+# the deployment stage, not during the docs build itself. Resources can
+# also be downloaded manually before running this script, to avoid the
+# overhead of re-downloading them every time the docs are built.
+
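+# Typical invocation (as wired into scripts/task_deploy_asf_site.sh below):
+#
+#   python3 scripts/download_3rdparty_embeds.py -v
+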
+# Base URL of the deployed site and the directory containing the generated HTML.
+BASE_URL = "https://tvm.apache.org"
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+HTML_DIR = os.path.join(SCRIPT_DIR, "../_site/")
+
+
+class ExternalURLParser(HTMLParser):
+ """HTML Parser to find external URLs in HTML content."""
+
+ def __init__(self):
+ super().__init__()
+ self.external_urls: List[str] = []
+ self.base_domain = urlparse(BASE_URL).netloc
+ # Tags and their attributes that might contain external resources
+ self.tags_to_check = {
+ "img": "src",
+ "script": "src",
+ "iframe": "src",
+ "video": "src",
+ "audio": "src",
+ "link": "href",
+ "source": "src",
+ "embed": "src",
+ }
+
+    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
+ """Handle HTML start tags to find external URLs."""
+ if tag not in self.tags_to_check:
+ return
+
+ attr_name = self.tags_to_check[tag]
+ for name, value in attrs:
+ if name != attr_name or not value:
+ continue
+
+ if value.startswith(("http://", "https://")):
+ domain = urlparse(value).netloc
+ if domain and domain != self.base_domain:
+ self.external_urls.append(value)
+
+
+def detect_html_external_urls(html_content: str) -> List[str]:
+ """
+ Detect third-party embedded resources in HTML content.
+
+ Parameters
+ ----------
+ html_content : str
+ The HTML content to analyze
+
+ Returns
+ -------
+ List[str]
+ List of external URLs found in the HTML content
+ """
+ parser = ExternalURLParser()
+ parser.feed(html_content)
+ return parser.external_urls
+
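+# Illustrative usage (not part of the script's flow; example.com is a
+# placeholder domain):
+#
+#   >>> detect_html_external_urls('<img src="https://example.com/x.png">')
+#   ['https://example.com/x.png']
+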
+
+def detect_css_external_urls(css_content: str) -> List[str]:
+ """
+ Detect external URLs in CSS content.
+
+ Parameters
+ ----------
+ css_content : str
+ The CSS content to analyze
+
+ Returns
+ -------
+ List[str]
+ List of external URLs found in the CSS content
+ """
+ external_urls: List[str] = []
+ # Regex to find URLs in CSS
+ url_pattern = re.compile(r'url\(["\']?(.*?)["\']?\)')
+ matches = url_pattern.findall(css_content)
+ for match in matches:
+        if match.startswith(("http://", "https://")) and not match.startswith(BASE_URL):
+ external_urls.append(match)
+ return external_urls
+
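+# Illustrative usage (cdn.example.com is a placeholder domain):
+#
+#   >>> detect_css_external_urls("body { background: url('https://cdn.example.com/bg.png'); }")
+#   ['https://cdn.example.com/bg.png']
+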
+
+def all_files_in_dir(path: str) -> List[str]:
+ """
+ Get a list of all files in a directory and its subdirectories.
+
+ Parameters
+ ----------
+ path : str
+ The root directory path to search
+
+ Returns
+ -------
+ List[str]
+ List of full paths to all files found
+ """
+    return [os.path.join(root, file) for root, _, files in os.walk(path) for file in files]
+
+
+def detect_urls(files: List[str], verbose: bool = False) -> List[str]:
+ """
+ Detect external URLs in the given HTML and CSS files.
+
+ Parameters
+ ----------
+ files : List[str]
+ List of file paths to check for external URLs
+ verbose : bool, optional
+ Whether to print verbose output, by default False
+
+ Returns
+ -------
+ List[str]
+ List of external URLs found in the files
+ """
+
+ external_urls: Set[str] = set()
+ for file in files:
+        f_detect: Optional[Callable[[str], List[str]]] = None
+ if file.endswith(".html"):
+ f_detect = detect_html_external_urls
+ elif file.endswith(".css"):
+ f_detect = detect_css_external_urls
+ else:
+ continue
+ with open(file, "r", encoding="utf-8") as f:
+ content = f.read()
+ urls = f_detect(content)
+ if verbose:
+ print(f"Processing {file}")
+ exist_urls, new_urls = 0, 0
+ for url in urls:
+ if url in external_urls:
+ exist_urls += 1
+ else:
+ new_urls += 1
+ if verbose:
+ print(f"Found new {url}")
+ print(f"Found {exist_urls} existing resources and {new_urls} new
resources")
+ external_urls.update(urls)
+ if verbose:
+ print(f"Total {len(external_urls)} external resources")
+ print("External resources:")
+ print("\n".join(external_urls))
+
+ return list(external_urls)
+
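+# Note that URLs are collected into a set, so a resource embedded by many
+# pages is reported (and later downloaded) only once.
+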
+
+def download_external_urls(
+ external_urls: List[str], verbose: bool = False
+) -> Tuple[Dict[str, str], List[str]]:
+ """
+ Download external URLs and save them to docs/_static/downloads.
+
+ Parameters
+ ----------
+ external_urls : List[str]
+ List of external URLs to download
+ verbose : bool, optional
+ Whether to print verbose output, by default False
+
+ Returns
+ -------
+ Tuple[Dict[str, str], List[str]]
+ A tuple containing:
+ - Dictionary mapping original URLs to their downloaded file paths
+ - List of paths to all downloaded files (including source maps)
+ """
+ download_dir = os.path.join(HTML_DIR, "_static/downloads")
+ os.makedirs(download_dir, exist_ok=True)
+ used_file_names: Set[str] = set()
+ downloaded_files: List[str] = []
+ remap_urls: Dict[str, str] = {}
+ for url in external_urls:
+ query = urlparse(url).query
+ if url.startswith("https://fonts.googleapis.com/css2"):
+ file_name = f"{hashlib.md5(url.encode()).hexdigest()}.css"
+ elif query:
+ raise ValueError(f"Unsupported URL with query: {url}")
+ else:
+ file_name = urlparse(url).path.split("/")[-1]
+ if verbose:
+ print(f"remapping {url} to {file_name}")
+ if file_name in used_file_names:
+ raise ValueError(f"File name {file_name} already exists")
+ used_file_names.add(file_name)
+ response = requests.get(url, timeout=30)
+ body = response.content
+ with open(os.path.join(download_dir, file_name), "wb") as f:
+ f.write(body)
+ remap_urls[url] = os.path.join(download_dir, file_name)
+ downloaded_files.append(os.path.join(download_dir, file_name))
+
+ # Also download the sourceMappingURL
+ if not url.startswith("https://fonts.googleapis.com/css2"):
+ map_file_name = f"{file_name}.map"
+ response = requests.get(f"{url}.map", timeout=30)
+ if response.status_code == 200:
+ body = response.content
+                with open(os.path.join(download_dir, map_file_name), "wb") as f:
+ f.write(body)
+ if verbose:
+ print(f"Downloaded {map_file_name} for {url}")
+                downloaded_files.append(os.path.join(download_dir, map_file_name))
+
+ return remap_urls, downloaded_files
+
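+# Illustrative filename mapping (hash values depend on the exact URL;
+# cdn.example.com is a placeholder domain):
+#
+#   https://fonts.googleapis.com/css2?family=...  ->  <md5(url)>.css
+#   https://cdn.example.com/lib/widget.js         ->  widget.js (+ widget.js.map if served)
+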
+
+def replace_urls_in_files(remap_urls: Dict[str, str], verbose: bool = False):
+ """
+ Replace external URLs with their downloaded versions in HTML/CSS files.
+
+ Parameters
+ ----------
+ remap_urls : Dict[str, str]
+ Dictionary mapping original URLs to their downloaded file paths
+ verbose : bool, optional
+ Whether to print verbose output, by default False
+ """
+ for root, _, files in os.walk(HTML_DIR):
+ for file in files:
+ if not (file.endswith(".html") or file.endswith(".css")):
+ continue
+
+ file_path = os.path.join(root, file)
+ if verbose:
+ print(f"Processing {file_path}")
+
+ # Calculate relative path from current file to _static/downloads
+ rel_path = os.path.relpath(
+                os.path.join(HTML_DIR, "_static/downloads"), os.path.dirname(file_path)
+ )
+
+ with open(file_path, "r", encoding="utf-8") as f:
+ content = f.read()
+
+ new_content = content
+ for original_url, new_path in remap_urls.items():
+                relative_url = os.path.join(rel_path, os.path.basename(new_path))
+ new_content = new_content.replace(original_url, relative_url)
+
+ if new_content != content:
+ with open(file_path, "w", encoding="utf-8") as f:
+ f.write(new_content)
+ if verbose:
+ print(f"Updated {file_path}")
+
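+# Illustrative rewrite: for a page at _site/docs/index.html, a resource saved
+# to _site/_static/downloads/widget.js is referenced as
+# ../_static/downloads/widget.js after replacement.
+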
+
+def download_and_replace_urls(files: Optional[List[str]] = None, verbose: bool = False):
+ """
+ Download external URLs found in files and replace them with local copies.
+ Recursively processes any new external URLs found in downloaded content.
+
+ Parameters
+ ----------
+ files : Optional[List[str]], optional
+        List of files to check for external URLs. If None, checks all files under HTML_DIR.
+ verbose : bool, optional
+ Whether to print verbose output, by default False
+ """
+ if files is None:
+ files = all_files_in_dir(HTML_DIR)
+ remap_urls = {}
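+    # Fixed-point loop: files downloaded in one round (e.g. a Google Fonts
+    # stylesheet that itself references font files) may embed further external
+    # URLs, so each round rescans the newly downloaded files until none remain.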
+ while True:
+ external_urls = detect_urls(files, verbose=verbose)
+ if not external_urls:
+ break
+        round_remap_urls, files = download_external_urls(external_urls, verbose=verbose)
+ remap_urls.update(round_remap_urls)
+
+ replace_urls_in_files(remap_urls, verbose=verbose)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-v", "--verbose", action="store_true")
+    args = parser.parse_args()
+ download_and_replace_urls(verbose=args.verbose)
diff --git a/scripts/task_deploy_asf_site.sh b/scripts/task_deploy_asf_site.sh
index d7ff59e7dc..22c58c4f74 100755
--- a/scripts/task_deploy_asf_site.sh
+++ b/scripts/task_deploy_asf_site.sh
@@ -5,6 +5,7 @@ set -u
echo "Start to generate and deploy site ..."
bundle exec jekyll b
+python3 scripts/download_3rdparty_embeds.py -v
cp .gitignore .gitignore.bak
cp .asf.yaml .asf.yaml.bak
diff --git a/tvm b/tvm
deleted file mode 160000
index bb48a45bcf..0000000000
--- a/tvm
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit bb48a45bcfc7d8a40dadca0ab7f589f59fdec374