imbajin commented on code in PR #452:
URL:
https://github.com/apache/incubator-hugegraph-doc/pull/452#discussion_r2791586905
##########
dist/validate-links.sh:
##########
@@ -1,63 +1,249 @@
#!/bin/bash
-# Configuration
CONTENT_DIR="content"
EXIT_CODE=0
+VERBOSE="${VERBOSE:-0}"
+
+log_verbose() {
+ [[ "$VERBOSE" == "1" ]] && echo "Info: $*"
+}
+
+
+ASSET_EXTENSIONS_REGEX='png|jpg|jpeg|svg|gif|webp|avif|ico|xml|yaml|yml|json|css|js|pdf|zip|tar.gz|woff|woff2|ttf|eot|mp4|webm'
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" || exit 1
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" || exit 1
+CONTENT_ROOT="$(cd "$REPO_ROOT/$CONTENT_DIR" && pwd)" || exit 1
+
+if [[ ! -d "$CONTENT_ROOT" ]]; then
+ echo "Error: content directory not found. Run from repository root."
+ exit 1
+fi
+
+normalize_link() {
+ local link="$1"
+
+ # Decode common URL-encoded characters explicitly
+ link="${link//%20/ }" # space
+ link="${link//%23/#}" # hash
+ link="${link//%2F/\/}" # forward slash
+
+ # Generic percent-decoding for remaining cases
+ link="${link//%/\\x}"
+ link="$(printf '%b' "$link")"
+
+ link="${link%%#*}"
+ link="${link%%\?*}"
+
+ if [[ "$link" != "/" ]]; then
+ link="${link%/}"
+ fi
+
+ printf "%s" "$link"
+}
+
+canonicalize_path() {
+ local path="$1"
+ local result=()
+ local part
+
+ IFS='/' read -r -a parts <<< "$path"
+
+ for part in "${parts[@]}"; do
+ if [[ -z "$part" || "$part" == "." ]]; then
+ continue
+ elif [[ "$part" == ".." ]]; then
+ if [[ ${#result[@]} -gt 0 ]]; then
+ unset 'result[-1]'
+ fi
+ else
+ result+=("$part")
+ fi
+ done
+
+ if [[ ${#result[@]} -eq 0 ]]; then
+ printf "/"
+ else
+ ( IFS='/'; printf "/%s" "${result[*]}" )
+ fi
+}
+
+resolve_real_path() {
+ local path="$1"
+
+ if command -v python3 >/dev/null 2>&1; then
+ # Use Python to compute realpath which resolves symlinks AND normalizes paths
+ # Python's os.path.realpath is tolerant of non-existent final targets
+ python3 - <<'PY' "$path"
+import os
+import sys
+p = sys.argv[1]
+print(os.path.realpath(p))
+PY
+ else
+ # Fallback: Normalize without symlink resolution if Python3 unavailable
+ # Note: This won't resolve symlinks, only normalize .. and . components
+ canonicalize_path "$path"
+ fi
+}
+
+check_internal_link() {
+ local link="$1"
+ local file="$2"
+ local line_no="$3"
+ local clean_link
+ local target_path
+ local location
+
+ clean_link="$(normalize_link "$link")"
+
+ [[ -z "$clean_link" || "$clean_link" == "#" ]] && return 0
+
+ if [[ "$clean_link" == "{{"* ]]; then
+ log_verbose "Skipping Hugo shortcode link: $link ($file:$line_no)"
+ return 0
+ fi
+
+ local clean_lower="${clean_link,,}"
+
+ if [[ "$clean_lower" == http://* || "$clean_lower" == https://* ||
"$clean_lower" == "//"* ]]; then
+ log_verbose "Skipping external link: $link ($file:$line_no)"
+ return 0
+ fi
+
+ case "$clean_lower" in
+ mailto:*|tel:*|javascript:*|data:*)
+ return 0
+ ;;
+ esac
+
+ if [[ "$clean_link" == /docs/* ]]; then
+ target_path="$CONTENT_ROOT/en${clean_link}"
+ elif [[ "$clean_link" == /cn/docs/* ]]; then
+ target_path="$CONTENT_ROOT${clean_link}"
+ elif [[ "$clean_link" == /* ]]; then
+ # Skip validation for ambiguous absolute paths (Hugo runtime URLs)
+ location="$file"
+ [[ -n "$line_no" ]] && location="$file:$line_no"
+ echo "Warning: Skipping validation for ambiguous absolute path"
+ echo " File: $location"
+ echo " Link: $link"
+ echo " Reason: Hugo runtime URL (not directly mappable to filesystem)"
+ return 0
+ else
+ local file_dir
+ file_dir="$(cd "$(dirname "$file")" && pwd)"
+ target_path="$file_dir/$clean_link"
+ fi
+
+ target_path="$(canonicalize_path "$target_path")"
+ target_path="$(resolve_real_path "$target_path")"
+
+ case "$target_path" in
+ "$CONTENT_ROOT"/*) ;;
+ *)
+ location="$file"
+ [[ -n "$line_no" ]] && location="$file:$line_no"
+ echo "Error: Link resolves outside content directory"
+ echo " File: $location"
+ echo " Link: $link"
+ EXIT_CODE=1
+ return
+ ;;
+ esac
+
+ if [[ "$clean_lower" =~ \.(${ASSET_EXTENSIONS_REGEX})$ ]]; then
+ if [[ -f "$target_path" ]]; then
+ return 0
+ else
+ location="$file"
+ [[ -n "$line_no" ]] && location="$file:$line_no"
+ echo "Error: Broken link"
+ echo " File: $location"
+ echo " Link: $link"
+ echo " Target: $target_path"
+ EXIT_CODE=1
+ return
+ fi
+ fi
+
+ if [[ -f "$target_path" || -f "$target_path.md" || -f
"$target_path/_index.md" || -f "$target_path/README.md" ]]; then
+ return 0
+ fi
+
+ location="$file"
+ [[ -n "$line_no" ]] && location="$file:$line_no"
+
+ echo "Error: Broken link"
+ echo " File: $location"
+ echo " Link: $link"
+ echo " Target: $target_path"
+ EXIT_CODE=1
+}
+
echo "Starting link validation..."
-# Find all markdown files and verify links
while read -r FILE; do
- # Extract internal links starting with /docs/ or /cn/docs/
- # We look for [text](url) pattern where url starts with /docs/ or /cn/docs/
- # Using grep to find all matching links in the file
- while read -r MATCH; do
- if [ -z "$MATCH" ]; then continue; fi
+ declare -A CODE_LINES
Review Comment:
P0: This script now depends on Bash 4+ (`declare -A` here and `${var,,}` in
`check_internal_link` above), but the default `/bin/bash` on macOS is 3.2. In
that environment the script logs syntax errors and still exits 0, which can
produce a false "Link validation passed" result. Please either add an explicit
version guard (`BASH_VERSINFO >= 4`, otherwise exit 1) or remove the
Bash-4-only syntax so that failure is deterministic.
##########
dist/validate-links.sh:
##########
@@ -1,63 +1,249 @@
#!/bin/bash
-# Configuration
CONTENT_DIR="content"
EXIT_CODE=0
+VERBOSE="${VERBOSE:-0}"
+
+log_verbose() {
+ [[ "$VERBOSE" == "1" ]] && echo "Info: $*"
+}
+
+
+ASSET_EXTENSIONS_REGEX='png|jpg|jpeg|svg|gif|webp|avif|ico|xml|yaml|yml|json|css|js|pdf|zip|tar.gz|woff|woff2|ttf|eot|mp4|webm'
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" || exit 1
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" || exit 1
+CONTENT_ROOT="$(cd "$REPO_ROOT/$CONTENT_DIR" && pwd)" || exit 1
+
+if [[ ! -d "$CONTENT_ROOT" ]]; then
+ echo "Error: content directory not found. Run from repository root."
+ exit 1
+fi
+
+normalize_link() {
+ local link="$1"
+
+ # Decode common URL-encoded characters explicitly
+ link="${link//%20/ }" # space
+ link="${link//%23/#}" # hash
+ link="${link//%2F/\/}" # forward slash
+
+ # Generic percent-decoding for remaining cases
+ link="${link//%/\\x}"
+ link="$(printf '%b' "$link")"
+
+ link="${link%%#*}"
+ link="${link%%\?*}"
+
+ if [[ "$link" != "/" ]]; then
+ link="${link%/}"
+ fi
+
+ printf "%s" "$link"
+}
+
+canonicalize_path() {
+ local path="$1"
+ local result=()
+ local part
+
+ IFS='/' read -r -a parts <<< "$path"
+
+ for part in "${parts[@]}"; do
+ if [[ -z "$part" || "$part" == "." ]]; then
+ continue
+ elif [[ "$part" == ".." ]]; then
+ if [[ ${#result[@]} -gt 0 ]]; then
+ unset 'result[-1]'
+ fi
+ else
+ result+=("$part")
+ fi
+ done
+
+ if [[ ${#result[@]} -eq 0 ]]; then
+ printf "/"
+ else
+ ( IFS='/'; printf "/%s" "${result[*]}" )
+ fi
+}
+
+resolve_real_path() {
+ local path="$1"
+
+ if command -v python3 >/dev/null 2>&1; then
+ # Use Python to compute realpath which resolves symlinks AND normalizes paths
+ # Python's os.path.realpath is tolerant of non-existent final targets
+ python3 - <<'PY' "$path"
+import os
+import sys
+p = sys.argv[1]
+print(os.path.realpath(p))
+PY
+ else
+ # Fallback: Normalize without symlink resolution if Python3 unavailable
+ # Note: This won't resolve symlinks, only normalize .. and . components
+ canonicalize_path "$path"
+ fi
+}
+
+check_internal_link() {
+ local link="$1"
+ local file="$2"
+ local line_no="$3"
+ local clean_link
+ local target_path
+ local location
+
+ clean_link="$(normalize_link "$link")"
+
+ [[ -z "$clean_link" || "$clean_link" == "#" ]] && return 0
+
+ if [[ "$clean_link" == "{{"* ]]; then
+ log_verbose "Skipping Hugo shortcode link: $link ($file:$line_no)"
+ return 0
+ fi
+
+ local clean_lower="${clean_link,,}"
+
+ if [[ "$clean_lower" == http://* || "$clean_lower" == https://* ||
"$clean_lower" == "//"* ]]; then
+ log_verbose "Skipping external link: $link ($file:$line_no)"
+ return 0
+ fi
+
+ case "$clean_lower" in
+ mailto:*|tel:*|javascript:*|data:*)
+ return 0
+ ;;
+ esac
+
+ if [[ "$clean_link" == /docs/* ]]; then
+ target_path="$CONTENT_ROOT/en${clean_link}"
+ elif [[ "$clean_link" == /cn/docs/* ]]; then
+ target_path="$CONTENT_ROOT${clean_link}"
+ elif [[ "$clean_link" == /* ]]; then
Review Comment:
P1: This branch skips validation for every absolute path except `/docs/*`
and `/cn/docs/*`. That leaves real internal links (for example
`/community/maturity/`, `/clients/gremlin-console.html`, `/blog/...`)
completely unchecked, so broken links can still pass CI. Please consider
resolving known site-root paths to `content/` or `static/`, and failing unknown
absolute paths unless they are explicitly allowlisted.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]