imbajin commented on code in PR #452:
URL:
https://github.com/apache/incubator-hugegraph-doc/pull/452#discussion_r2786433953
##########
dist/validate-links.sh:
##########
@@ -1,63 +1,213 @@
#!/bin/bash
-# Configuration
CONTENT_DIR="content"
EXIT_CODE=0
-echo "Starting link validation..."
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+CONTENT_ROOT="$(cd "$REPO_ROOT/$CONTENT_DIR" && pwd)"
-# Find all markdown files and verify links
-while read -r FILE; do
- # Extract internal links starting with /docs/ or /cn/docs/
- # We look for [text](url) pattern where url starts with /docs/ or /cn/docs/
- # Using grep to find all matching links in the file
- while read -r MATCH; do
- if [ -z "$MATCH" ]; then continue; fi
+if [[ ! -d "$CONTENT_ROOT" ]]; then
+ echo "Error: content directory not found. Run from repository root."
+ exit 1
+fi
+
+normalize_link() {
+ local link="$1"
Review Comment:
**⚠️ Important: URL decoding might not handle all edge cases**
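The current generic decoding approach (`${link//%/\\x}` followed by
`printf '%b'`) may not handle all URL-encoded characters correctly: it
rewrites every `%`, so a literal `%` that is not followed by two hex digits
is mangled, and backslash sequences already present in the link are also
interpreted by `%b`. Consider handling common cases explicitly before the
generic decode: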
```suggestion
# Decode common URL-encoded characters explicitly
link="${link//%20/ }" # space
link="${link//%23/#}" # hash
link="${link//%2F/\/}" # forward slash
# Then do generic decoding for remaining cases
link="${link//%/\\x}"
link="$(printf '%b' "$link")"
```
##########
dist/validate-links.sh:
##########
@@ -1,63 +1,213 @@
#!/bin/bash
-# Configuration
CONTENT_DIR="content"
EXIT_CODE=0
-echo "Starting link validation..."
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+CONTENT_ROOT="$(cd "$REPO_ROOT/$CONTENT_DIR" && pwd)"
-# Find all markdown files and verify links
-while read -r FILE; do
- # Extract internal links starting with /docs/ or /cn/docs/
- # We look for [text](url) pattern where url starts with /docs/ or /cn/docs/
- # Using grep to find all matching links in the file
- while read -r MATCH; do
- if [ -z "$MATCH" ]; then continue; fi
+if [[ ! -d "$CONTENT_ROOT" ]]; then
+ echo "Error: content directory not found. Run from repository root."
+ exit 1
+fi
+
+normalize_link() {
+ local link="$1"
+
+ link="${link//%/\\x}"
+ link="$(printf '%b' "$link")"
- # Extract URL from ](url)
- LINK=${MATCH#*](}
- LINK=${LINK%)}
+ link="${link%%#*}"
+ link="${link%%\?*}"
- # Remove anchor and query parameters
- CLEAN_LINK=$(echo "$LINK" | cut -d'#' -f1 | cut -d'?' -f1)
- CLEAN_LINK=${CLEAN_LINK%/}
+ if [[ "$link" != "/" ]]; then
+ link="${link%/}"
+ fi
- # Determine target file path based on language prefix
- if [[ "$CLEAN_LINK" == /docs/* ]]; then
- TARGET_PATH="content/en${CLEAN_LINK}"
- elif [[ "$CLEAN_LINK" == /cn/docs/* ]]; then
- TARGET_PATH="content${CLEAN_LINK}"
+ printf "%s" "$link"
+}
+
+canonicalize_path() {
+ local path="$1"
+ local result=()
+ local part
+
+ IFS='/' read -r -a parts <<< "$path"
+
+ for part in "${parts[@]}"; do
+ if [[ -z "$part" || "$part" == "." ]]; then
+ continue
+ elif [[ "$part" == ".." ]]; then
+ if [[ ${#result[@]} -gt 0 ]]; then
+ unset 'result[-1]'
+ fi
else
+ result+=("$part")
+ fi
+ done
+
+ if [[ ${#result[@]} -eq 0 ]]; then
+ printf "/"
+ else
+ ( IFS='/'; printf "/%s" "${result[*]}" )
+ fi
+}
+
+resolve_real_path() {
+ local path="$1"
+
+ if command -v python3 >/dev/null 2>&1; then
+ # Use python to compute realpath which is tolerant of non existing final target
+ python3 - <<'PY' "$path"
+import os
+import sys
+p = sys.argv[1]
+# os.path.realpath resolves symlinks for existing components and otherwise returns a normalized path
+print(os.path.realpath(p))
+PY
+ else
+ # Fallback to the safe canonicalize_path output if python3 is not available
+ canonicalize_path "$path"
+ fi
+}
+
+check_internal_link() {
+ local link="$1"
+ local file="$2"
+ local line_no="$3"
+ local clean_link
+ local target_path
+ local location
+
+ clean_link="$(normalize_link "$link")"
+
+ [[ -z "$clean_link" || "$clean_link" == "#" ]] && return 0
+
+ if [[ "$clean_link" == "{{"* ]]; then
+ return 0
+ fi
+
+ local clean_lower="${clean_link,,}"
+
+ if [[ "$clean_lower" == http://* || "$clean_lower" == https://* ||
"$clean_lower" == "//"* ]]; then
+ return 0
+ fi
+
+ case "$clean_lower" in
+ mailto:*|tel:*|javascript:*|data:*)
+ return 0
+ ;;
+ esac
+
+ if [[ "$clean_link" == /docs/* ]]; then
+ target_path="$CONTENT_ROOT/en${clean_link}"
+ elif [[ "$clean_link" == /cn/docs/* ]]; then
+ target_path="$CONTENT_ROOT${clean_link}"
+ elif [[ "$clean_link" == /* ]]; then
+ target_path="$CONTENT_ROOT/en${clean_link}"
+ else
+ local file_dir
+ file_dir="$(cd "$(dirname "$file")" && pwd)"
+ target_path="$file_dir/$clean_link"
+ fi
+
+ target_path="$(canonicalize_path "$target_path")"
+ target_path="$(resolve_real_path "$target_path")"
+
+ case "$target_path" in
+ "$CONTENT_ROOT"/*) ;;
+ *)
+ location="$file"
+ [[ -n "$line_no" ]] && location="$file:$line_no"
+ echo "Error: Link resolves outside content directory"
+ echo " File: $location"
+ echo " Link: $link"
+ EXIT_CODE=1
+ return
+ ;;
+ esac
Review Comment:
**⚠️ Important: Asset file extension whitelist may be incomplete**
Current list:
`.png|.jpg|.jpeg|.svg|.gif|.xml|.yaml|.yml|.json|.css|.js|.pdf|.zip|.tar.gz`
Consider adding commonly used formats:
- Modern image formats: `.webp`, `.avif`, `.ico`
- Web fonts: `.woff`, `.woff2`, `.ttf`, `.eot`
- Other assets: `.mp4`, `.webm` (if videos are used)
Alternatively, make this list configurable via a variable at the top of the
script for easier maintenance.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]