Re: [PR] Harden documentation link validation to prevent false CI passes [incubator-hugegraph-doc]

via GitHub Tue, 10 Feb 2026 22:14:28 -0800


imbajin commented on code in PR #452:
URL: https://github.com/apache/incubator-hugegraph-doc/pull/452#discussion_r2791586905



##########
dist/validate-links.sh:
##########
@@ -1,63 +1,249 @@
 #!/bin/bash
 
-# Configuration
 CONTENT_DIR="content"
 EXIT_CODE=0
 
+VERBOSE="${VERBOSE:-0}"
+
+log_verbose() {
+    [[ "$VERBOSE" == "1" ]] && echo "Info: $*"
+}
+
+
+ASSET_EXTENSIONS_REGEX='png|jpg|jpeg|svg|gif|webp|avif|ico|xml|yaml|yml|json|css|js|pdf|zip|tar.gz|woff|woff2|ttf|eot|mp4|webm'
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" || exit 1
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" || exit 1
+CONTENT_ROOT="$(cd "$REPO_ROOT/$CONTENT_DIR" && pwd)" || exit 1
+
+if [[ ! -d "$CONTENT_ROOT" ]]; then
+    echo "Error: content directory not found. Run from repository root."
+    exit 1
+fi
+
+normalize_link() {
+    local link="$1"
+
+    # Decode common URL-encoded characters explicitly
+    link="${link//%20/ }"   # space
+    link="${link//%23/#}"   # hash
+    link="${link//%2F/\/}"  # forward slash
+
+    # Generic percent-decoding for remaining cases
+    link="${link//%/\\x}"
+    link="$(printf '%b' "$link")"
+
+    link="${link%%#*}"
+    link="${link%%\?*}"
+
+    if [[ "$link" != "/" ]]; then
+        link="${link%/}"
+    fi
+
+    printf "%s" "$link"
+}
+
+canonicalize_path() {
+    local path="$1"
+    local result=()
+    local part
+
+    IFS='/' read -r -a parts <<< "$path"
+
+    for part in "${parts[@]}"; do
+        if [[ -z "$part" || "$part" == "." ]]; then
+            continue
+        elif [[ "$part" == ".." ]]; then
+            if [[ ${#result[@]} -gt 0 ]]; then
+                unset 'result[-1]'
+            fi
+        else
+            result+=("$part")
+        fi
+    done
+
+    if [[ ${#result[@]} -eq 0 ]]; then
+        printf "/"
+    else
+        ( IFS='/'; printf "/%s" "${result[*]}" )
+    fi
+}
+
+resolve_real_path() {
+    local path="$1"
+
+    if command -v python3 >/dev/null 2>&1; then
+        # Use Python to compute realpath which resolves symlinks AND normalizes paths
+        # Python's os.path.realpath is tolerant of non-existent final targets
+        python3 - <<'PY' "$path"
+import os
+import sys
+p = sys.argv[1]
+print(os.path.realpath(p))
+PY
+    else
+        # Fallback: Normalize without symlink resolution if Python3 unavailable
+        # Note: This won't resolve symlinks, only normalize .. and . components
+        canonicalize_path "$path"
+    fi
+}
+
+check_internal_link() {
+    local link="$1"
+    local file="$2"
+    local line_no="$3"
+    local clean_link
+    local target_path
+    local location
+
+    clean_link="$(normalize_link "$link")"
+
+    [[ -z "$clean_link" || "$clean_link" == "#" ]] && return 0
+
+    if [[ "$clean_link" == "{{"* ]]; then
+        log_verbose "Skipping Hugo shortcode link: $link ($file:$line_no)"
+        return 0
+    fi
+
+    local clean_lower="${clean_link,,}"
+
+    if [[ "$clean_lower" == http://* || "$clean_lower" == https://* || "$clean_lower" == "//"* ]]; then
+        log_verbose "Skipping external link: $link ($file:$line_no)"
+        return 0
+    fi
+
+    case "$clean_lower" in
+        mailto:*|tel:*|javascript:*|data:*)
+            return 0
+            ;;
+    esac
+
+    if [[ "$clean_link" == /docs/* ]]; then
+        target_path="$CONTENT_ROOT/en${clean_link}"
+    elif [[ "$clean_link" == /cn/docs/* ]]; then
+        target_path="$CONTENT_ROOT${clean_link}"
+    elif [[ "$clean_link" == /* ]]; then
+        # Skip validation for ambiguous absolute paths (Hugo runtime URLs)
+        location="$file"
+        [[ -n "$line_no" ]] && location="$file:$line_no"
+        echo "Warning: Skipping validation for ambiguous absolute path"
+        echo "  File: $location"
+        echo "  Link: $link"
+        echo "  Reason: Hugo runtime URL (not directly mappable to filesystem)"
+        return 0
+    else
+        local file_dir
+        file_dir="$(cd "$(dirname "$file")" && pwd)"
+        target_path="$file_dir/$clean_link"
+    fi
+
+    target_path="$(canonicalize_path "$target_path")"
+    target_path="$(resolve_real_path "$target_path")"
+
+    case "$target_path" in
+        "$CONTENT_ROOT"/*) ;;
+        *)
+            location="$file"
+            [[ -n "$line_no" ]] && location="$file:$line_no"
+            echo "Error: Link resolves outside content directory"
+            echo "  File: $location"
+            echo "  Link: $link"
+            EXIT_CODE=1
+            return
+            ;;
+    esac
+
+    if [[ "$clean_lower" =~ \.(${ASSET_EXTENSIONS_REGEX})$ ]]; then
+        if [[ -f "$target_path" ]]; then
+            return 0
+        else
+            location="$file"
+            [[ -n "$line_no" ]] && location="$file:$line_no"
+            echo "Error: Broken link"
+            echo "  File: $location"
+            echo "  Link: $link"
+            echo "  Target: $target_path"
+            EXIT_CODE=1
+            return
+        fi
+    fi
+
+    if [[ -f "$target_path" || -f "$target_path.md" || -f "$target_path/_index.md" || -f "$target_path/README.md" ]]; then
+        return 0
+    fi
+
+    location="$file"
+    [[ -n "$line_no" ]] && location="$file:$line_no"
+
+    echo "Error: Broken link"
+    echo "  File: $location"
+    echo "  Link: $link"
+    echo "  Target: $target_path"
+    EXIT_CODE=1
+}
+
 echo "Starting link validation..."
 
-# Find all markdown files and verify links
 while read -r FILE; do
-    # Extract internal links starting with /docs/ or /cn/docs/
-    # We look for [text](url) pattern where url starts with /docs/ or /cn/docs/
-    # Using grep to find all matching links in the file
-    while read -r MATCH; do
-        if [ -z "$MATCH" ]; then continue; fi
+    declare -A CODE_LINES

Review Comment:
   P0: This script now depends on Bash 4+ (`declare -A` here and `${var,,}`
   below), but the default `/bin/bash` on macOS is 3.2. In that environment the
   script logs errors and still exits 0, which can produce a false "Link
   validation passed" result. Please either add an explicit version guard near
   the top of the script (for example `(( BASH_VERSINFO[0] >= 4 )) || exit 1`,
   with an error message on stderr) or remove the Bash-4-only syntax so that
   failure is deterministic.



##########
dist/validate-links.sh:
##########
@@ -1,63 +1,249 @@
 #!/bin/bash
 
-# Configuration
 CONTENT_DIR="content"
 EXIT_CODE=0
 
+VERBOSE="${VERBOSE:-0}"
+
+log_verbose() {
+    [[ "$VERBOSE" == "1" ]] && echo "Info: $*"
+}
+
+
+ASSET_EXTENSIONS_REGEX='png|jpg|jpeg|svg|gif|webp|avif|ico|xml|yaml|yml|json|css|js|pdf|zip|tar.gz|woff|woff2|ttf|eot|mp4|webm'
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" || exit 1
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" || exit 1
+CONTENT_ROOT="$(cd "$REPO_ROOT/$CONTENT_DIR" && pwd)" || exit 1
+
+if [[ ! -d "$CONTENT_ROOT" ]]; then
+    echo "Error: content directory not found. Run from repository root."
+    exit 1
+fi
+
+normalize_link() {
+    local link="$1"
+
+    # Decode common URL-encoded characters explicitly
+    link="${link//%20/ }"   # space
+    link="${link//%23/#}"   # hash
+    link="${link//%2F/\/}"  # forward slash
+
+    # Generic percent-decoding for remaining cases
+    link="${link//%/\\x}"
+    link="$(printf '%b' "$link")"
+
+    link="${link%%#*}"
+    link="${link%%\?*}"
+
+    if [[ "$link" != "/" ]]; then
+        link="${link%/}"
+    fi
+
+    printf "%s" "$link"
+}
+
+canonicalize_path() {
+    local path="$1"
+    local result=()
+    local part
+
+    IFS='/' read -r -a parts <<< "$path"
+
+    for part in "${parts[@]}"; do
+        if [[ -z "$part" || "$part" == "." ]]; then
+            continue
+        elif [[ "$part" == ".." ]]; then
+            if [[ ${#result[@]} -gt 0 ]]; then
+                unset 'result[-1]'
+            fi
+        else
+            result+=("$part")
+        fi
+    done
+
+    if [[ ${#result[@]} -eq 0 ]]; then
+        printf "/"
+    else
+        ( IFS='/'; printf "/%s" "${result[*]}" )
+    fi
+}
+
+resolve_real_path() {
+    local path="$1"
+
+    if command -v python3 >/dev/null 2>&1; then
+        # Use Python to compute realpath which resolves symlinks AND normalizes paths
+        # Python's os.path.realpath is tolerant of non-existent final targets
+        python3 - <<'PY' "$path"
+import os
+import sys
+p = sys.argv[1]
+print(os.path.realpath(p))
+PY
+    else
+        # Fallback: Normalize without symlink resolution if Python3 unavailable
+        # Note: This won't resolve symlinks, only normalize .. and . components
+        canonicalize_path "$path"
+    fi
+}
+
+check_internal_link() {
+    local link="$1"
+    local file="$2"
+    local line_no="$3"
+    local clean_link
+    local target_path
+    local location
+
+    clean_link="$(normalize_link "$link")"
+
+    [[ -z "$clean_link" || "$clean_link" == "#" ]] && return 0
+
+    if [[ "$clean_link" == "{{"* ]]; then
+        log_verbose "Skipping Hugo shortcode link: $link ($file:$line_no)"
+        return 0
+    fi
+
+    local clean_lower="${clean_link,,}"
+
+    if [[ "$clean_lower" == http://* || "$clean_lower" == https://* || "$clean_lower" == "//"* ]]; then
+        log_verbose "Skipping external link: $link ($file:$line_no)"
+        return 0
+    fi
+
+    case "$clean_lower" in
+        mailto:*|tel:*|javascript:*|data:*)
+            return 0
+            ;;
+    esac
+
+    if [[ "$clean_link" == /docs/* ]]; then
+        target_path="$CONTENT_ROOT/en${clean_link}"
+    elif [[ "$clean_link" == /cn/docs/* ]]; then
+        target_path="$CONTENT_ROOT${clean_link}"
+    elif [[ "$clean_link" == /* ]]; then

Review Comment:
   P1: This branch skips validation for every absolute path except `/docs/*` 
and `/cn/docs/*`. That leaves real internal links (for example 
`/community/maturity/`, `/clients/gremlin-console.html`, `/blog/...`) 
completely unchecked, so broken links can still pass CI. Please consider 
resolving known site-root paths to `content/` or `static/`, and failing unknown 
absolute paths unless they are explicitly allowlisted.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Harden documentation link validation to prevent false CI passes [incubator-hugegraph-doc]

Reply via email to