Re: [PR] Harden documentation link validation to prevent false CI passes [incubator-hugegraph-doc]

via GitHub Wed, 11 Feb 2026 06:12:34 -0800


imbajin commented on code in PR #452:
URL: 
https://github.com/apache/incubator-hugegraph-doc/pull/452#discussion_r2793512035



##########
dist/validate-links.sh:
##########
@@ -1,63 +1,283 @@
 #!/bin/bash
+set -o errexit
+set -o pipefail
 
-# Configuration
 CONTENT_DIR="content"
 EXIT_CODE=0
 
-echo "Starting link validation..."
+VERBOSE="${VERBOSE:-0}"
+
+log_verbose() {
+    if [[ "$VERBOSE" == "1" ]]; then
+        echo "Info: $*"
+    fi
+}
+
+ASSET_EXTENSIONS_REGEX='png|jpg|jpeg|svg|gif|webp|avif|ico|xml|yaml|yml|json|css|js|pdf|zip|tar\.gz|woff|woff2|ttf|eot|mp4|webm'
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" || exit 1
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" || exit 1
+CONTENT_ROOT="$(cd "$REPO_ROOT/$CONTENT_DIR" && pwd)" || exit 1
+
+if [[ ! -d "$CONTENT_ROOT" ]]; then
+    echo "Error: content directory not found. Run from repository root."
+    exit 1
+fi
+
+normalize_link() {
+    local link="$1"
+
+    # Decode common URL-encoded characters explicitly
+    link="${link//%20/ }"   # space
+    link="${link//%23/#}"   # hash
+    link="${link//%2F/\/}"  # forward slash
+
+    # Generic percent-decoding for remaining cases
+    link="${link//%/\\x}"
+    link="$(printf '%b' "$link")"
+
+    link="${link%%#*}"
+    link="${link%%\?*}"
+
+    if [[ "$link" != "/" ]]; then
+        link="${link%/}"
+    fi
+
+    printf "%s" "$link"
+}
+
+canonicalize_path() {
+    local path="$1"
+    local result=()
+    local part
+    local parts
+
+    # Bash 3.2 compatible: use here-string
+    IFS='/' read -r -a parts <<< "$path"
+
+    for part in "${parts[@]}"; do
+        if [[ -z "$part" || "$part" == "." ]]; then
+            continue
+        elif [[ "$part" == ".." ]]; then
+            # Bash 3.2 compatible: calculate last index instead of using -1
+            if [[ ${#result[@]} -gt 0 ]]; then
+                local last_idx=$((${#result[@]} - 1))
+                unset "result[$last_idx]"
+            fi
+        else
+            result+=("$part")
+        fi
+    done
+
+    if [[ ${#result[@]} -eq 0 ]]; then
+        printf "/"
+    else
+        ( IFS='/'; printf "/%s" "${result[*]}" )
+    fi
+}
+
+resolve_real_path() {
+    local path="$1"
+
+    if command -v python3 >/dev/null 2>&1; then
+        # Use Python to compute realpath which resolves symlinks AND 
normalizes paths
+        # Python's os.path.realpath is tolerant of non-existent final targets
+        python3 - <<'PY' "$path"
+import os
+import sys
+p = sys.argv[1]
+print(os.path.realpath(p))
+PY
+    else
+        # Fallback: Normalize without symlink resolution if Python3 unavailable
+        # Note: This won't resolve symlinks, only normalize .. and . components
+        canonicalize_path "$path"
+    fi
+}
+
+check_internal_link() {
+    local link="$1"
+    local file="$2"
+    local line_no="$3"
+    local clean_link
+    local target_path
+    local location
+
+    clean_link="$(normalize_link "$link")"
+
+    [[ -z "$clean_link" || "$clean_link" == "#" ]] && return 0
+
+    if [[ "$clean_link" == "{{"* ]]; then
+        log_verbose "Skipping Hugo shortcode link: $link ($file:$line_no)"
+        return 0
+    fi
+
+    local clean_lower
+    clean_lower="$(printf "%s" "$clean_link" | tr '[:upper:]' '[:lower:]')"
+
+    if [[ "$clean_lower" == http://* || "$clean_lower" == https://* || 
"$clean_lower" == "//"* ]]; then
+        log_verbose "Skipping external link: $link ($file:$line_no)"
+        return 0
+    fi
+
+    case "$clean_lower" in
+        mailto:*|tel:*|javascript:*|data:*)
+            return 0
+            ;;
+    esac
+
+    if [[ "$clean_link" == /docs/* ]]; then
+        target_path="$CONTENT_ROOT/en${clean_link}"
+
+    elif [[ "$clean_link" == /cn/docs/* ]]; then
+        target_path="$CONTENT_ROOT${clean_link}"
+
+    elif [[ "$clean_link" == /community/* ]]; then
+        target_path="$CONTENT_ROOT/en${clean_link}"
 
-# Find all markdown files and verify links
-while read -r FILE; do
-    # Extract internal links starting with /docs/ or /cn/docs/
-    # We look for [text](url) pattern where url starts with /docs/ or /cn/docs/
-    # Using grep to find all matching links in the file
-    while read -r MATCH; do
-        if [ -z "$MATCH" ]; then continue; fi
-
-        # Extract URL from ](url)
-        LINK=${MATCH#*](}
-        LINK=${LINK%)}
-
-        # Remove anchor and query parameters
-        CLEAN_LINK=$(echo "$LINK" | cut -d'#' -f1 | cut -d'?' -f1)
-        CLEAN_LINK=${CLEAN_LINK%/}
-
-        # Determine target file path based on language prefix
-        if [[ "$CLEAN_LINK" == /docs/* ]]; then
-            TARGET_PATH="content/en${CLEAN_LINK}"
-        elif [[ "$CLEAN_LINK" == /cn/docs/* ]]; then
-            TARGET_PATH="content${CLEAN_LINK}"
+    elif [[ "$clean_link" == /blog/* ]]; then
+        target_path="$CONTENT_ROOT/en${clean_link}"
+
+    elif [[ "$clean_link" == /language/* ]]; then
+        target_path="$CONTENT_ROOT/en${clean_link}"
+
+    elif [[ "$clean_link" == /clients/* ]]; then
+        target_path="$REPO_ROOT/static${clean_link}"
+
+    elif [[ "$clean_link" == /* ]]; then
+        location="$file"
+        [[ -n "$line_no" ]] && location="$file:$line_no"
+
+        echo "Error: Unsupported absolute internal path (cannot validate 
deterministically)"
+        echo "  File: $location"
+        echo "  Link: $link"
+
+        EXIT_CODE=1
+        return
+
+    else
+        local file_dir
+        file_dir="$(cd "$(dirname "$file")" && pwd)"
+        target_path="$file_dir/$clean_link"
+    fi
+
+    target_path="$(canonicalize_path "$target_path")"
+    target_path="$(resolve_real_path "$target_path")"
+
+    case "$target_path" in
+        "$CONTENT_ROOT"/*) ;;
+        "$REPO_ROOT/static"/*) ;;
+        *)
+            location="$file"
+            [[ -n "$line_no" ]] && location="$file:$line_no"
+            echo "Error: Link resolves outside content directory"
+            echo "  File: $location"
+            echo "  Link: $link"
+            EXIT_CODE=1
+            return
+            ;;
+    esac
+
+    if [[ "$clean_lower" =~ \.(${ASSET_EXTENSIONS_REGEX})$ ]]; then
+        if [[ -f "$target_path" ]]; then
+            return 0
         else
+            location="$file"
+            [[ -n "$line_no" ]] && location="$file:$line_no"
+            echo "Error: Broken link"
+            echo "  File: $location"
+            echo "  Link: $link"
+            echo "  Target: $target_path"
+            EXIT_CODE=1
+            return
+        fi
+    fi
+
+    if [[ -f "$target_path" || -f "$target_path.md" || -f 
"$target_path/_index.md" || -f "$target_path/README.md" ]]; then
+        return 0
+    fi
+
+    location="$file"
+    [[ -n "$line_no" ]] && location="$file:$line_no"
+
+    echo "Error: Broken link"
+    echo "  File: $location"
+    echo "  Link: $link"
+    echo "  Target: $target_path"
+    EXIT_CODE=1
+}
+
+echo "Starting link validation..."
+
+while IFS= read -r FILE; do
+
+    CODE_LINES=""
+    in_fence=false
+    line_no=0
+
+    while IFS= read -r line || [[ -n "$line" ]]; do
+        ((++line_no))
+        # NOTE:
+        # Code fence detection is heuristic and does not validate proper 
pairing.
+        # The logic simply toggles state when encountering ``` or ~~~ markers.
+        # If a Markdown file contains an unclosed fence or mismatched fence 
types,
+        # all subsequent lines may be treated as code and skipped from 
validation.
+        # This behavior is intentional to keep the validator lightweight and
+        # avoids implementing a full Markdown parser. Such cases require 
manual review.
+        if [[ "$line" =~ ^[[:space:]]*(\`\`\`|~~~) ]]; then
+            # NOTE:
+            # Code fence detection assumes fences are properly paired.
+            # If a Markdown file contains an unclosed or mismatched fence,
+            # subsequent content may be treated as code and skipped.
+            # This script does not attempt full Markdown validation.
+
+            if $in_fence; then
+                in_fence=false
+            else
+                in_fence=true
+            fi
+            CODE_LINES="$CODE_LINES $line_no "
             continue
         fi
 
-        # Check for file existence variations
-        FOUND=false
-
-        # Check 1: As .md file
-        if [[ -f "${TARGET_PATH}.md" ]]; then
-            FOUND=true
-        # Check 2: Exact file (if extension was included)
-        elif [[ -f "$TARGET_PATH" ]]; then
-            FOUND=true
-        # Check 3: Directory index
-        elif [[ -f "${TARGET_PATH}/_index.md" ]]; then
-            FOUND=true
-        # Check 4: Directory README (legacy)
-        elif [[ -f "${TARGET_PATH}/README.md" ]]; then
-            FOUND=true
+        if $in_fence; then
+            CODE_LINES="$CODE_LINES $line_no "
+            continue
         fi
 
-        if [ "$FOUND" = false ]; then
-            echo "Error: Broken link in $FILE"
-            echo "  Link: $LINK"
-            echo "  Target: $TARGET_PATH (and variants)"
-            EXIT_CODE=1
+        # NOTE:
+        # Inline code detection is heuristic and intentionally simplistic.
+        # The logic assumes backticks are properly paired within a single line
+        # after removing escaped backticks. Malformed Markdown, complex inline
+        # constructs, or unusual escaping patterns may cause false positives
+        # or false negatives. This validator does not implement a full Markdown
+        # parser and therefore cannot guarantee perfect inline code detection.
+        escaped_line="${line//\\\`/}"
+        inline_count=$(printf "%s\n" "$escaped_line" | grep -o "\`" || true)
+        inline_count=$(printf "%s\n" "$inline_count" | wc -l)
+        if (( inline_count % 2 == 1 )); then

Review Comment:
   ‼️ **Potential false negative in inline-code detection**: lines without 
backticks are currently treated as if they had an odd backtick count, because 
when `grep -o` matches nothing, `printf "%s\n" "" | wc -l` still reports `1`. 
As a result, non-code lines are added to `CODE_LINES` and skipped, which can 
allow broken links to pass CI undetected. Could we pipe `grep -o` directly 
into `wc -l` and normalize its output before the parity check?
   
   ```suggestion
           escaped_line="${line//\\\`/}"
           inline_count=$(printf "%s\n" "$escaped_line" | grep -o "\`" | wc -l 
|| true)
           inline_count="${inline_count//[[:space:]]/}"
           if (( inline_count % 2 == 1 )); then
               CODE_LINES="$CODE_LINES $line_no "
           fi
   ```



##########
dist/validate-links.sh:
##########
@@ -1,63 +1,283 @@
 #!/bin/bash
+set -o errexit
+set -o pipefail
 
-# Configuration
 CONTENT_DIR="content"
 EXIT_CODE=0
 
-echo "Starting link validation..."
+VERBOSE="${VERBOSE:-0}"
+
+log_verbose() {
+    if [[ "$VERBOSE" == "1" ]]; then
+        echo "Info: $*"
+    fi
+}
+
+ASSET_EXTENSIONS_REGEX='png|jpg|jpeg|svg|gif|webp|avif|ico|xml|yaml|yml|json|css|js|pdf|zip|tar\.gz|woff|woff2|ttf|eot|mp4|webm'
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" || exit 1
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" || exit 1
+CONTENT_ROOT="$(cd "$REPO_ROOT/$CONTENT_DIR" && pwd)" || exit 1
+
+if [[ ! -d "$CONTENT_ROOT" ]]; then
+    echo "Error: content directory not found. Run from repository root."
+    exit 1
+fi
+
+normalize_link() {
+    local link="$1"
+
+    # Decode common URL-encoded characters explicitly
+    link="${link//%20/ }"   # space
+    link="${link//%23/#}"   # hash
+    link="${link//%2F/\/}"  # forward slash
+
+    # Generic percent-decoding for remaining cases
+    link="${link//%/\\x}"
+    link="$(printf '%b' "$link")"
+
+    link="${link%%#*}"
+    link="${link%%\?*}"
+
+    if [[ "$link" != "/" ]]; then
+        link="${link%/}"
+    fi
+
+    printf "%s" "$link"
+}
+
+canonicalize_path() {
+    local path="$1"
+    local result=()
+    local part
+    local parts
+
+    # Bash 3.2 compatible: use here-string
+    IFS='/' read -r -a parts <<< "$path"
+
+    for part in "${parts[@]}"; do
+        if [[ -z "$part" || "$part" == "." ]]; then
+            continue
+        elif [[ "$part" == ".." ]]; then
+            # Bash 3.2 compatible: calculate last index instead of using -1
+            if [[ ${#result[@]} -gt 0 ]]; then
+                local last_idx=$((${#result[@]} - 1))
+                unset "result[$last_idx]"
+            fi
+        else
+            result+=("$part")
+        fi
+    done
+
+    if [[ ${#result[@]} -eq 0 ]]; then
+        printf "/"
+    else
+        ( IFS='/'; printf "/%s" "${result[*]}" )
+    fi
+}
+
+resolve_real_path() {
+    local path="$1"
+
+    if command -v python3 >/dev/null 2>&1; then
+        # Use Python to compute realpath which resolves symlinks AND 
normalizes paths
+        # Python's os.path.realpath is tolerant of non-existent final targets
+        python3 - <<'PY' "$path"
+import os
+import sys
+p = sys.argv[1]
+print(os.path.realpath(p))
+PY
+    else
+        # Fallback: Normalize without symlink resolution if Python3 unavailable
+        # Note: This won't resolve symlinks, only normalize .. and . components
+        canonicalize_path "$path"
+    fi
+}
+
+check_internal_link() {
+    local link="$1"
+    local file="$2"
+    local line_no="$3"
+    local clean_link
+    local target_path
+    local location
+
+    clean_link="$(normalize_link "$link")"
+
+    [[ -z "$clean_link" || "$clean_link" == "#" ]] && return 0
+
+    if [[ "$clean_link" == "{{"* ]]; then
+        log_verbose "Skipping Hugo shortcode link: $link ($file:$line_no)"
+        return 0
+    fi
+
+    local clean_lower
+    clean_lower="$(printf "%s" "$clean_link" | tr '[:upper:]' '[:lower:]')"
+
+    if [[ "$clean_lower" == http://* || "$clean_lower" == https://* || 
"$clean_lower" == "//"* ]]; then
+        log_verbose "Skipping external link: $link ($file:$line_no)"
+        return 0
+    fi
+
+    case "$clean_lower" in
+        mailto:*|tel:*|javascript:*|data:*)
+            return 0
+            ;;
+    esac
+
+    if [[ "$clean_link" == /docs/* ]]; then
+        target_path="$CONTENT_ROOT/en${clean_link}"
+
+    elif [[ "$clean_link" == /cn/docs/* ]]; then
+        target_path="$CONTENT_ROOT${clean_link}"
+
+    elif [[ "$clean_link" == /community/* ]]; then
+        target_path="$CONTENT_ROOT/en${clean_link}"
 
-# Find all markdown files and verify links
-while read -r FILE; do
-    # Extract internal links starting with /docs/ or /cn/docs/
-    # We look for [text](url) pattern where url starts with /docs/ or /cn/docs/
-    # Using grep to find all matching links in the file
-    while read -r MATCH; do
-        if [ -z "$MATCH" ]; then continue; fi
-
-        # Extract URL from ](url)
-        LINK=${MATCH#*](}
-        LINK=${LINK%)}
-
-        # Remove anchor and query parameters
-        CLEAN_LINK=$(echo "$LINK" | cut -d'#' -f1 | cut -d'?' -f1)
-        CLEAN_LINK=${CLEAN_LINK%/}
-
-        # Determine target file path based on language prefix
-        if [[ "$CLEAN_LINK" == /docs/* ]]; then
-            TARGET_PATH="content/en${CLEAN_LINK}"
-        elif [[ "$CLEAN_LINK" == /cn/docs/* ]]; then
-            TARGET_PATH="content${CLEAN_LINK}"
+    elif [[ "$clean_link" == /blog/* ]]; then

Review Comment:
   ‼️ **`/blog/*` and `/cn/blog/*` are permalink-based routes** and usually do 
not map 1:1 to `content/...` filesystem paths. Valid `/blog/*` links can be 
reported as broken, and `/cn/blog/*` links fall through to the "unsupported 
absolute internal path" error, even when Hugo renders them correctly. I suggest 
skipping deterministic file checks for these routes to avoid false positives.
   
   ```suggestion
       elif [[ "$clean_link" == /blog/* || "$clean_link" == /cn/blog/* ]]; then
           # Blog URLs are permalink-based and don't map 1:1 to content file 
paths.
           # Skip deterministic filesystem validation for these routes.
           log_verbose "Skipping permalink-based blog link: $link 
($file:$line_no)"
           return 0
   ```



##########
content/en/docs/SUMMARY.md:
##########
@@ -1,69 +1,67 @@
 # HugeGraph Docs
 
-* [Download](download.md)
+* [Download](download/download)
 
 ## Quickstart
-* [Install HugeGraph-Server](quickstart/hugegraph-server.md)
-* [Load data with HugeGraph-Loader](quickstart/hugegraph-loader.md)
-* [Visual with HugeGraph-Hubble](quickstart/hugegraph-hubble.md)
-* [Develop with HugeGraph-Client](quickstart/hugegraph-client.md)
-* [Manage with HugeGraph-Tools](quickstart/hugegraph-tools.md)
-* [Analysis with HugeGraph-Computer](quickstart/hugegraph-computer.md)
-* [Display with HugeGraph-Studio](quickstart/hugegraph-studio.md)
+* [Install HugeGraph-Server](quickstart/hugegraph/hugegraph-server)
+* [Load data with HugeGraph-Loader](quickstart/toolchain/hugegraph-loader)
+* [Visual with HugeGraph-Hubble](quickstart/toolchain/hugegraph-hubble)
+* [Develop with HugeGraph-Client](quickstart/client/hugegraph-client)
+* [Manage with HugeGraph-Tools](quickstart/toolchain/hugegraph-tools)
+* [Analysis with HugeGraph-Computer](quickstart/computing/hugegraph-computer)
+* [Display with HugeGraph-Studio](quickstart/hugegraph-studio)

Review Comment:
   ⚠️ This link currently resolves to a 404 in the rendered docs 
(`/docs/quickstart/hugegraph-studio/`); the CN summary entry has the same 
issue. Since this PR updates `SUMMARY.md` links for correctness, could we also 
update or remove this entry to keep navigation valid?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Harden documentation link validation to prevent false CI passes [incubator-hugegraph-doc]

Reply via email to