This is an automated email from the ASF dual-hosted git repository.
lhotari pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/pulsar-site.git
The following commit(s) were added to refs/heads/main by this push:
new a80cca003d9 chore(seo): add API sitemap and reorganize robots.txt by library
a80cca003d9 is described below
commit a80cca003d9367c5c76d429193db38f744855c02
Author: Lari Hotari <[email protected]>
AuthorDate: Mon May 4 10:17:13 2026 +0300
chore(seo): add API sitemap and reorganize robots.txt by library
Restructure robots.txt to disallow specific outdated API doc versions per
library (admin, client, pulsar-functions) and add a supplementary
api-sitemap.xml so crawlers can discover the latest API doc index pages.
The sitemap is generated by scripts/generate-api-sitemap.js, which respects
robots.txt as the source of truth and picks the latest non-disallowed
version per library.
---
scripts/generate-api-sitemap.js | 135 ++++++++++++++++++++++++++++++++++++++++
static/api-sitemap.xml | 27 ++++++++
static/robots.txt | 66 +++++++++-----------
3 files changed, 193 insertions(+), 35 deletions(-)
diff --git a/scripts/generate-api-sitemap.js b/scripts/generate-api-sitemap.js
new file mode 100644
index 00000000000..b99271a755c
--- /dev/null
+++ b/scripts/generate-api-sitemap.js
@@ -0,0 +1,135 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Generates static/api-sitemap.xml from the contents of static/api/.
+// One entry per API library (admin, client, cpp, js, pulsar-functions,
+// python) — the latest non-disallowed version. Crawlers walk deeper from
+// each index page. Versions matched by a Disallow line in static/robots.txt
+// are excluded before "latest" is picked, so robots.txt remains the single
+// source of truth for which API versions are excluded from indexing.
+//
+// Run manually after adding a new API version:
+// node scripts/generate-api-sitemap.js
+// Then add a `Sitemap: https://pulsar.apache.org/api-sitemap.xml` line to
+// static/robots.txt if not already present, and commit both files.
+
+"use strict";
+
+const fs = require("fs");
+const path = require("path");
+
+// Origin prepended to every path written into the sitemap.
+const SITE_URL = "https://pulsar.apache.org";
+// Absolute path of the static/ directory, resolved relative to this script.
+const STATIC_DIR = path.resolve(__dirname, "../static");
+// The API docs tree, robots.txt and the generated sitemap all live directly
+// under static/.
+const [API_DIR, ROBOTS_FILE, OUTPUT_FILE] = [
+  "api",
+  "robots.txt",
+  "api-sitemap.xml",
+].map((name) => path.join(STATIC_DIR, name));
+
+// Parses robots.txt text into anchored RegExps, one per usable Disallow rule.
+//
+// robotsTxt: full text of a robots.txt file (LF or CRLF line endings).
+// Returns:   array of RegExp, each anchored at the start of the URL path.
+//
+// Pattern translation: '*' matches any sequence of characters, a '$' in the
+// final position anchors the end of the path, and every other character
+// (including a '$' elsewhere) is matched literally.
+//
+// Disallow lines are collected regardless of which User-agent group they
+// appear in. A bare "Disallow: /" is skipped instead of being turned into a
+// match-everything pattern, and an empty "Disallow:" yields no pattern.
+function parseDisallowPatterns(robotsTxt) {
+  const patterns = [];
+  for (const line of robotsTxt.split(/\r?\n/)) {
+    // Drop '#' comments first so "Disallow: /x/ # note" is still recognised.
+    const rule = line.replace(/#.*$/, "");
+    const match = rule.match(/^\s*Disallow:\s*(\S+)\s*$/i);
+    if (!match) continue;
+    const raw = match[1];
+    if (raw === "/") continue;
+    const chars = [...raw];
+    const lastIndex = chars.length - 1;
+    const regex = chars
+      .map((ch, i) => {
+        if (ch === "*") return ".*";
+        // '$' is only an end-of-path anchor when it terminates the pattern.
+        if (ch === "$" && i === lastIndex) return "$";
+        return ch.replace(/[.+?^${}()|[\]\\]/g, "\\$&");
+      })
+      .join("");
+    patterns.push(new RegExp("^" + regex));
+  }
+  return patterns;
+}
+
+// Reports whether urlPath is matched by at least one of the given RegExps.
+//
+// urlPath:  path to check, e.g. "/api/client/4.2.x/".
+// patterns: array of RegExp to test against the path.
+// Returns:  true on the first match, false if nothing matches (or the list
+//           is empty).
+function isDisallowed(urlPath, patterns) {
+  for (const pattern of patterns) {
+    if (pattern.test(urlPath)) return true;
+  }
+  return false;
+}
+
+// Matches version-shaped names: "<major>.<minor>." optionally followed by a
+// numeric patch ("2.10.1") or by a final "x" ("4.2.x").
+const VERSION_RE = /^(\d+)\.(\d+)\.(\d+|x$)?/;
+
+// Sort key [major, minor, patch], negated so that an ascending sort puts the
+// newest version first. Within one major.minor line a trailing ".x" ranks
+// above every numbered patch, and an unrecognised patch ranks below them.
+// Names that are not version-shaped get an all-Infinity key and sort last.
+function versionSortKey(name) {
+  const m = name.match(VERSION_RE);
+  if (!m) return [Infinity, Infinity, Infinity];
+  let patch;
+  if (m[3] === "x") patch = Infinity;
+  else if (m[3] === undefined) patch = -1;
+  else patch = Number(m[3]);
+  return [-Number(m[1]), -Number(m[2]), -patch];
+}
+
+// Array.prototype.sort comparator: newest version first, comparing major,
+// minor and patch numerically; names with equal keys fall back to
+// localeCompare so the order is deterministic.
+function compareVersions(a, b) {
+  const keyA = versionSortKey(a);
+  const keyB = versionSortKey(b);
+  for (let i = 0; i < keyA.length; i++) {
+    // Ordered comparison rather than subtraction: Infinity - Infinity is NaN.
+    if (keyA[i] < keyB[i]) return -1;
+    if (keyA[i] > keyB[i]) return 1;
+  }
+  return a.localeCompare(b);
+}
+
+// Scans static/api/ and returns {libName: [versionDirName, ...]}.
+// Only directories are considered, and names starting with "." are ignored
+// at both the library and the version level. Returns an empty object when
+// the API directory does not exist.
+function collectLibraryVersions() {
+  const isVisibleDir = (entry) =>
+    entry.isDirectory() && !entry.name.startsWith(".");
+  const listDirNames = (dir) =>
+    fs
+      .readdirSync(dir, {withFileTypes: true})
+      .filter(isVisibleDir)
+      .map((entry) => entry.name);
+
+  const versionsByLib = {};
+  if (fs.existsSync(API_DIR)) {
+    for (const libName of listDirNames(API_DIR)) {
+      versionsByLib[libName] = listDirNames(path.join(API_DIR, libName));
+    }
+  }
+  return versionsByLib;
+}
+
+// Entity-escapes the characters that may not appear raw in XML text
+// (&, <, >, " and '), as the sitemap protocol requires for data values.
+function escapeXml(text) {
+  const entities = {
+    "&": "&amp;",
+    "<": "&lt;",
+    ">": "&gt;",
+    '"': "&quot;",
+    "'": "&apos;",
+  };
+  return text.replace(/[&<>"']/g, (ch) => entities[ch]);
+}
+
+// Renders the sitemap XML document for the given site-relative paths.
+//
+// urls:    array of paths such as "/api/client/4.2.x/"; SITE_URL is
+//          prepended to each one.
+// Returns: the complete XML text, ending with a newline. Every entry is
+//          emitted with a fixed <changefreq> of "weekly".
+function buildSitemap(urls) {
+  const items = urls
+    .map((p) =>
+      [
+        " <url>",
+        ` <loc>${escapeXml(SITE_URL + p)}</loc>`,
+        " <changefreq>weekly</changefreq>",
+        " </url>",
+      ].join("\n"),
+    )
+    .join("\n");
+  return [
+    '<?xml version="1.0" encoding="UTF-8"?>',
+    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
+    items,
+    "</urlset>",
+    "", // trailing newline
+  ].join("\n");
+}
+
+// Entry point: reads robots.txt, picks for each library under static/api/
+// the first version (in compareVersions order) that no Disallow pattern
+// matches, writes the sitemap to OUTPUT_FILE and logs a one-line summary.
+// Libraries whose versions are all disallowed are left out of the sitemap
+// and listed in the log message.
+function main() {
+  const patterns = parseDisallowPatterns(fs.readFileSync(ROBOTS_FILE, "utf8"));
+  const versionsByLib = collectLibraryVersions();
+
+  const urls = [];
+  const skipped = [];
+  Object.keys(versionsByLib)
+    .sort()
+    .forEach((lib) => {
+      const allowed = versionsByLib[lib].filter(
+        (version) => !isDisallowed(`/api/${lib}/${version}/`, patterns),
+      );
+      if (allowed.length === 0) {
+        skipped.push(lib);
+        return;
+      }
+      // After sorting, the preferred version is the first element.
+      allowed.sort(compareVersions);
+      urls.push(`/api/${lib}/${allowed[0]}/`);
+    });
+
+  fs.writeFileSync(OUTPUT_FILE, buildSitemap(urls));
+
+  const rel = path.relative(process.cwd(), OUTPUT_FILE);
+  const skippedNote = skipped.length
+    ? ` (no allowed version for: ${skipped.join(", ")})`
+    : "";
+  console.log(
+    `[generate-api-sitemap] wrote ${urls.length} URL(s) to ${rel}${skippedNote}`,
+  );
+}
+
+main();
diff --git a/static/api-sitemap.xml b/static/api-sitemap.xml
new file mode 100644
index 00000000000..92d81efe237
--- /dev/null
+++ b/static/api-sitemap.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+ <url>
+ <loc>https://pulsar.apache.org/api/admin/4.2.x/</loc>
+ <changefreq>weekly</changefreq>
+ </url>
+ <url>
+ <loc>https://pulsar.apache.org/api/client/4.2.x/</loc>
+ <changefreq>weekly</changefreq>
+ </url>
+ <url>
+ <loc>https://pulsar.apache.org/api/cpp/4.1.x/</loc>
+ <changefreq>weekly</changefreq>
+ </url>
+ <url>
+ <loc>https://pulsar.apache.org/api/js/1.17.x/</loc>
+ <changefreq>weekly</changefreq>
+ </url>
+ <url>
+ <loc>https://pulsar.apache.org/api/pulsar-functions/4.2.x/</loc>
+ <changefreq>weekly</changefreq>
+ </url>
+ <url>
+ <loc>https://pulsar.apache.org/api/python/3.11.x/</loc>
+ <changefreq>weekly</changefreq>
+ </url>
+</urlset>
diff --git a/static/robots.txt b/static/robots.txt
index a20af0ccac7..c128cc78152 100644
--- a/static/robots.txt
+++ b/static/robots.txt
@@ -1,38 +1,34 @@
User-agent: *
-Disallow: /api/*/2.2.0/
-Disallow: /api/*/2.2.1/
-Disallow: /api/*/2.3.0/
-Disallow: /api/*/2.3.1/
-Disallow: /api/*/2.3.2/
-Disallow: /api/*/2.4.0/
-Disallow: /api/*/2.4.1/
-Disallow: /api/*/2.4.2/
-Disallow: /api/*/2.5.0/
-Disallow: /api/*/2.5.1/
-Disallow: /api/*/2.5.2/
-Disallow: /api/*/2.6.0/
-Disallow: /api/*/2.6.1/
-Disallow: /api/*/2.6.2/
-Disallow: /api/*/2.6.3/
-Disallow: /api/*/2.6.4/
-Disallow: /api/*/2.7.0/
-Disallow: /api/*/2.7.1/
-Disallow: /api/*/2.7.2/
-Disallow: /api/*/2.7.3/
-Disallow: /api/*/2.7.4/
-Disallow: /api/*/2.7.5/
-Disallow: /api/*/2.8.0/
-Disallow: /api/*/2.8.1/
-Disallow: /api/*/2.8.2/
-Disallow: /api/*/2.8.3/
-Disallow: /api/*/2.8.x/
-Disallow: /api/*/2.9.0/
-Disallow: /api/*/2.9.1/
-Disallow: /api/*/2.9.2/
-Disallow: /api/*/2.9.3/
-Disallow: /api/*/2.9.x/
-Disallow: /api/*/2.10.0/
-Disallow: /api/*/2.10.1/
-Disallow: /api/*/2.10.2/
+
+Disallow: /api/client/2.8.x/
+Disallow: /api/client/2.9.x/
+Disallow: /api/client/2.10.x/
+Disallow: /api/client/2.11.x/
+Disallow: /api/client/3.0.x/
+Disallow: /api/client/3.1.x/
+Disallow: /api/client/3.2.x/
+Disallow: /api/client/3.3.x/
+Disallow: /api/client/4.1.x/
+
+Disallow: /api/admin/2.8.x/
+Disallow: /api/admin/2.9.x/
+Disallow: /api/admin/2.10.x/
+Disallow: /api/admin/2.11.x/
+Disallow: /api/admin/3.0.x/
+Disallow: /api/admin/3.1.x/
+Disallow: /api/admin/3.2.x/
+Disallow: /api/admin/3.3.x/
+Disallow: /api/admin/4.1.x/
+
+Disallow: /api/pulsar-functions/2.8.x/
+Disallow: /api/pulsar-functions/2.9.x/
+Disallow: /api/pulsar-functions/2.10.x/
+Disallow: /api/pulsar-functions/2.11.x/
+Disallow: /api/pulsar-functions/3.0.x/
+Disallow: /api/pulsar-functions/3.1.x/
+Disallow: /api/pulsar-functions/3.2.x/
+Disallow: /api/pulsar-functions/3.3.x/
+Disallow: /api/pulsar-functions/4.1.x/
Sitemap: https://pulsar.apache.org/sitemap.xml
+Sitemap: https://pulsar.apache.org/api-sitemap.xml