This is an automated email from the ASF dual-hosted git repository.
houston pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/main by this push:
new ad7ceb7ad71 SOLR-16262: Add a script to add canonical links to Solr
6-8 ref guide pages (#3319)
ad7ceb7ad71 is described below
commit ad7ceb7ad71eb4279c3b530fb2ba53e5ee3e1ec2
Author: Houston Putman <[email protected]>
AuthorDate: Tue Apr 15 11:52:23 2025 -0500
SOLR-16262: Add a script to add canonical links to Solr 6-8 ref guide pages
(#3319)
---
.../scripts/refguide/gen-refguide-redirects.py | 7 -
dev-tools/scripts/refguide/htaccess.txt | 21 ++-
dev-tools/scripts/refguide/mappings.csv | 16 ++
dev-tools/scripts/refguide/old-guide.txt | 15 ++
.../scripts/refguide/refguide-add-canonical-url.py | 189 +++++++++++++++++++++
5 files changed, 235 insertions(+), 13 deletions(-)
diff --git a/dev-tools/scripts/refguide/gen-refguide-redirects.py
b/dev-tools/scripts/refguide/gen-refguide-redirects.py
index e7ab1cb1ce0..fac79229702 100755
--- a/dev-tools/scripts/refguide/gen-refguide-redirects.py
+++ b/dev-tools/scripts/refguide/gen-refguide-redirects.py
@@ -129,13 +129,6 @@ def main():
print("# Paths we could not map")
for key in failed:
print("# %s: %s" % (key, failed[key]))
-
- print("""
-
-# Do not index old reference guide pages on search engines, except for pages
that don't exist in 9+
-<If "%%{REQUEST_URI} =~ m#/guide/(6|7|8)_.*# && %%{REQUEST_URI} !~
m#/guide/8_11/%s$#">
- Header set X-Robots-Tag "noindex,nofollow,noarchive"
-</If>""" % old_version_pages_regex)
else:
out("Regex mappings:")
pprint(regex_new)
diff --git a/dev-tools/scripts/refguide/htaccess.txt
b/dev-tools/scripts/refguide/htaccess.txt
index 419467d09dd..ba9ae2e9f0b 100644
--- a/dev-tools/scripts/refguide/htaccess.txt
+++ b/dev-tools/scripts/refguide/htaccess.txt
@@ -8,6 +8,7 @@ RedirectMatch 301
^/guide/(major-changes-in-solr-7|major-changes-in-solr-8|solr-
# Page renames between 8.x and 9.0
RewriteRule ^guide/9_0/solr-tutorial.html
/guide/solr/latest/getting-started/solr-tutorial.html [R=301,NE,L]
RewriteRule ^guide/a-quick-overview.html
/guide/solr/latest/getting-started/introduction.html [R=301,NE,L]
+RewriteRule ^guide/a-step-closer.html
/guide/solr/latest/configuration-guide/configuration-files.html [R=301,NE,L]
RewriteRule ^guide/about-filters.html
/guide/solr/latest/indexing-guide/filters.html [R=301,NE,L]
RewriteRule ^guide/about-tokenizers.html
/guide/solr/latest/indexing-guide/tokenizers.html [R=301,NE,L]
RewriteRule ^guide/aws-solrcloud-tutorial.html
/guide/solr/latest/getting-started/tutorial-aws.html [R=301,NE,L]
@@ -32,6 +33,7 @@ RewriteRule ^guide/files-screen.html
/guide/solr/latest/configuration-guide/conf
RewriteRule ^guide/filter-descriptions.html
/guide/solr/latest/indexing-guide/filters.html [R=301,NE,L]
RewriteRule ^guide/format-of-solr-xml.html
/guide/solr/latest/configuration-guide/configuring-solr-xml.html [R=301,NE,L]
RewriteRule ^guide/further-assistance.html
https://solr.apache.org/community.html [R=301,NE,L]
+RewriteRule ^guide/getting-assistance.html
https://solr.apache.org/community.html [R=301,NE,L]
RewriteRule ^guide/getting-started-with-solrcloud.html
/guide/solr/latest/getting-started/tutorial-solrcloud.html [R=301,NE,L]
RewriteRule ^guide/getting-started.html
/guide/solr/latest/getting-started/introduction.html [R=301,NE,L]
RewriteRule ^guide/how-solrcloud-works.html
/guide/solr/latest/deployment-guide/cluster-types.html#solrcloud-mode
[R=301,NE,L]
@@ -45,10 +47,12 @@ RewriteRule
^guide/introduction-to-scaling-and-distribution.html /guide/solr/lat
RewriteRule ^guide/introduction-to-solr-indexing.html
/guide/solr/latest/getting-started/solr-indexing.html [R=301,NE,L]
RewriteRule ^guide/java-properties.html
/guide/solr/latest/deployment-guide/jvm-settings.html [R=301,NE,L]
RewriteRule ^guide/legacy-scaling-and-distribution.html
/guide/solr/latest/deployment-guide/cluster-types.html#user-managed-mode
[R=301,NE,L]
+RewriteRule ^guide/lib-directives-in-solrconfig.html
/guide/solr/latest/configuration-guide/libs.html [R=301,NE,L]
RewriteRule ^guide/local-parameters-in-queries.html
/guide/solr/latest/query-guide/local-params.html [R=301,NE,L]
RewriteRule ^guide/logging.html
/guide/solr/latest/deployment-guide/configuring-logging.html [R=301,NE,L]
RewriteRule ^guide/major-changes-from-solr-5-to-solr-6.html
/guide/solr/latest/upgrade-notes/major-changes-in-solr-6.html [R=301,NE,L]
RewriteRule ^guide/making-and-restoring-backups.html
/guide/solr/latest/deployment-guide/backup-restore.html [R=301,NE,L]
+RewriteRule ^guide/managing-solr.html
/guide/solr/latest/deployment-guide/solr-control-script-reference.html
[R=301,NE,L]
RewriteRule ^guide/merging-indexes.html
/guide/solr/latest/configuration-guide/coreadmin-api.html [R=301,NE,L]
RewriteRule ^guide/monitoring-solr-with-prometheus-and-grafana.html
/guide/solr/latest/deployment-guide/monitoring-with-prometheus-and-grafana.html
[R=301,NE,L]
RewriteRule ^guide/monitoring-solr.html
/guide/solr/latest/deployment-guide/configuring-logging.html [R=301,NE,L]
@@ -61,11 +65,14 @@ RewriteRule ^guide/parallel-sql-interface.html
/guide/solr/latest/query-guide/sq
RewriteRule ^guide/parameter-reference.html
/guide/solr/latest/configuration-guide/configuring-solr-xml.html [R=301,NE,L]
RewriteRule ^guide/query-settings-in-solrconfig.html
/guide/solr/latest/configuration-guide/caches-warming.html [R=301,NE,L]
RewriteRule ^guide/query-syntax-and-parsing.html
/guide/solr/latest/query-guide/query-syntax-and-parsers.html [R=301,NE,L]
+RewriteRule ^guide/read-and-write-side-fault-tolerance.html
/guide/solr/latest/deployment-guide/solrcloud-recoveries-and-write-tolerance.html
[R=301,NE,L]
RewriteRule ^guide/replication-screen.html
/guide/solr/latest/deployment-guide/user-managed-index-replication.html
[R=301,NE,L]
RewriteRule ^guide/requestdispatcher-in-solrconfig.html
/guide/solr/latest/configuration-guide/requestdispatcher.html [R=301,NE,L]
RewriteRule ^guide/requesthandlers-and-searchcomponents-in-solrconfig.html
/guide/solr/latest/configuration-guide/requesthandlers-searchcomponents.html
[R=301,NE,L]
+RewriteRule ^guide/resource-and-plugin-loading.html
/guide/solr/latest/configuration-guide/resource-loading.html [R=301,NE,L]
RewriteRule ^guide/running-solr-on-hdfs.html
/guide/solr/latest/deployment-guide/solr-on-hdfs.html [R=301,NE,L]
RewriteRule ^guide/running-your-analyzer.html
/guide/solr/latest/indexing-guide/analysis-screen.html [R=301,NE,L]
+RewriteRule ^guide/running-solr.html
/guide/solr/latest/deployment-guide/installing-solr.html [R=301,NE,L]
RewriteRule ^guide/schema-factory-definition-in-solrconfig.html
/guide/solr/latest/configuration-guide/schema-factory.html [R=301,NE,L]
RewriteRule ^guide/searching.html
/guide/solr/latest/query-guide/query-syntax-and-parsers.html [R=301,NE,L]
RewriteRule ^guide/segments-info.html
/guide/solr/latest/configuration-guide/index-segments-merging.html [R=301,NE,L]
@@ -85,6 +92,11 @@ RewriteRule
^guide/solrcloud-configuration-and-parameters.html /guide/solr/lates
RewriteRule ^guide/solrcloud-query-routing-and-read-tolerance.html
/guide/solr/latest/deployment-guide/solrcloud-distributed-requests.html
[R=301,NE,L]
RewriteRule ^guide/solrcloud-resilience.html
/guide/solr/latest/deployment-guide/solrcloud-recoveries-and-write-tolerance.html
[R=301,NE,L]
RewriteRule ^guide/solrcloud.html
/guide/solr/latest/deployment-guide/cluster-types.html#solrcloud-mode
[R=301,NE,L]
+RewriteRule ^guide/statistical-programming.html
/guide/solr/latest/query-guide/math-expressions.html [R=301,NE,L]
+RewriteRule ^guide/stream-decorators.html
/guide/solr/latest/query-guide/stream-decorator-reference.html [R=301,NE,L]
+RewriteRule ^guide/stream-evaluators.html
/guide/solr/latest/query-guide/stream-evaluator-reference.html [R=301,NE,L]
+RewriteRule ^guide/stream-sources.html
/guide/solr/latest/query-guide/stream-source-reference.html [R=301,NE,L]
+RewriteRule ^guide/time-routed-aliases.html
https://solr.apache.org/guide/solr/latest/deployment-guide/aliases.html#time-routed-aliases
[R=301,NE,L]
RewriteRule ^guide/the-dismax-query-parser.html
/guide/solr/latest/query-guide/dismax-query-parser.html [R=301,NE,L]
RewriteRule ^guide/the-extended-dismax-query-parser.html
/guide/solr/latest/query-guide/edismax-query-parser.html [R=301,NE,L]
RewriteRule ^guide/the-query-elevation-component.html
/guide/solr/latest/query-guide/query-elevation-component.html [R=301,NE,L]
@@ -95,9 +107,11 @@ RewriteRule ^guide/the-term-vector-component.html
/guide/solr/latest/query-guide
RewriteRule ^guide/the-terms-component.html
/guide/solr/latest/query-guide/terms-component.html [R=301,NE,L]
RewriteRule ^guide/the-well-configured-solr-instance.html
/guide/solr/latest/configuration-guide/configuration-files.html [R=301,NE,L]
RewriteRule ^guide/transforming-result-documents.html
/guide/solr/latest/query-guide/document-transformers.html [R=301,NE,L]
+RewriteRule ^guide/uima-integration.html
https://solr.apache.org/guide/7_4/uima-integration.html [R=301,NE,L]
RewriteRule ^guide/understanding-analyzers-tokenizers-and-filters.html
/guide/solr/latest/indexing-guide/document-analysis.html [R=301,NE,L]
RewriteRule ^guide/updatehandlers-in-solrconfig.html
/guide/solr/latest/configuration-guide/commits-transaction-logs.html
[R=301,NE,L]
RewriteRule ^guide/updating-parts-of-documents.html
/guide/solr/latest/indexing-guide/partial-document-updates.html [R=301,NE,L]
+RewriteRule ^guide/upgrading-solr.html
/guide/solr/latest/deployment-guide/upgrading-a-solr-cluster.html [R=301,NE,L]
RewriteRule ^guide/uploading-data-with-index-handlers.html
/guide/solr/latest/indexing-guide/indexing-with-update-handlers.html
[R=301,NE,L]
RewriteRule ^guide/uploading-data-with-solr-cell-using-apache-tika.html
/guide/solr/latest/indexing-guide/indexing-with-tika.html [R=301,NE,L]
RewriteRule ^guide/using-javascript.html
/guide/solr/latest/deployment-guide/javascript.html [R=301,NE,L]
@@ -107,6 +121,7 @@ RewriteRule ^guide/using-solr-from-ruby.html
/guide/solr/latest/deployment-guide
RewriteRule ^guide/using-solrj.html
/guide/solr/latest/deployment-guide/solrj.html [R=301,NE,L]
RewriteRule ^guide/using-the-solr-administration-user-interface.html
/guide/solr/latest/getting-started/solr-admin-ui.html [R=301,NE,L]
RewriteRule ^guide/using-zookeeper-to-manage-configuration-files.html
/guide/solr/latest/deployment-guide/zookeeper-file-management.html [R=301,NE,L]
+RewriteRule ^guide/vectorization.html
/guide/solr/latest/query-guide/math-expressions.html [R=301,NE,L]
RewriteRule ^guide/working-with-currencies-and-exchange-rates.html
/guide/solr/latest/indexing-guide/currencies-exchange-rates.html [R=301,NE,L]
RewriteRule ^guide/working-with-dates.html
/guide/solr/latest/indexing-guide/date-formatting-math.html [R=301,NE,L]
RewriteRule ^guide/working-with-enum-fields.html
/guide/solr/latest/indexing-guide/enum-fields.html [R=301,NE,L]
@@ -118,9 +133,3 @@ RewriteRule ^guide/real-time-get.html
/guide/solr/latest/configuration-guide/rea
# Removed pages redirected to latest 8.x guide
RedirectMatch 301
^/guide/(adding-custom-plugins-in-solrcloud-mode|blob-store-api|blockjoin-faceting|cdcr-api|cdcr-architecture|cdcr-config|cdcr-operations|colocating-collections|cross-data-center-replication-cdcr|dataimport-screen|errata|metrics-history|migrate-to-policy-rule|putting-the-pieces-together|rule-based-replica-placement|solrcloud-autoscaling-api|solrcloud-autoscaling-auto-add-replicas|solrcloud-autoscaling-fault-tolerance|solrcloud-autoscaling-listeners|solrcloud-autoscaling
[...]
# Paths we could not map
-
-
-# Do not index old reference guide pages on search engines, except for pages
that don't exist in 9+
-<If "%{REQUEST_URI} =~ m#/guide/(6|7|8)_.*# && %{REQUEST_URI} !~
m#/guide/8_11/(adding-custom-plugins-in-solrcloud-mode|blob-store-api|blockjoin-faceting|cdcr-api|cdcr-architecture|cdcr-config|cdcr-operations|colocating-collections|cross-data-center-replication-cdcr|dataimport-screen|errata|metrics-history|migrate-to-policy-rule|putting-the-pieces-together|rule-based-replica-placement|solrcloud-autoscaling-api|solrcloud-autoscaling-auto-add-replicas|solrcloud-autoscaling-fault-tolerance|
[...]
- Header set X-Robots-Tag "noindex,nofollow,noarchive"
-</If>
diff --git a/dev-tools/scripts/refguide/mappings.csv
b/dev-tools/scripts/refguide/mappings.csv
index 910207af4fb..94f6754b1ef 100644
--- a/dev-tools/scripts/refguide/mappings.csv
+++ b/dev-tools/scripts/refguide/mappings.csv
@@ -19,6 +19,7 @@ indexconfig-in-solrconfig.html;index-segments-merging.html
indexing-and-basic-data-operations.html;indexing-with-update-handlers.html
initparams-in-solrconfig.html;initparams.html
introduction-to-solr-indexing.html;solr-indexing.html
+lib-directives-in-solrconfig.html;libs.html
local-parameters-in-queries.html;local-params.html
major-changes-from-solr-5-to-solr-6.html;major-changes-in-solr-6.html
making-and-restoring-backups.html;backup-restore.html
@@ -105,6 +106,21 @@
the-well-configured-solr-instance.adoc;configuration-files.html
solrcloud.adoc;cluster-types.html#solrcloud-mode
how-to-contribute.adoc;https://solr.apache.org/community.html#how-to-contribute
deployment-and-operations.adoc;installing-solr.html
+vectorization.adoc;math-expressions.html
+time-routed-aliases.adoc;https://solr.apache.org/guide/solr/latest/deployment-guide/aliases.html#time-routed-aliases
+resource-and-plugin-loading.adoc;resource-loading.html
+getting-assistance.adoc;https://solr.apache.org/community.html
+statistical-programming.adoc;math-expressions.html
+lib-directives-in-solrconfig.adoc;libs.html
+uima-integration.adoc;https://solr.apache.org/guide/7_4/uima-integration.html
+upgrading-solr.adoc;upgrading-a-solr-cluster.html
+stream-decorators.adoc;stream-decorator-reference.html
+stream-sources.adoc;stream-source-reference.html
+stream-evaluators.adoc;stream-evaluator-reference.html
+a-step-closer.adoc;configuration-files.html
+read-and-write-side-fault-tolerance.adoc;solrcloud-recoveries-and-write-tolerance.html
+managing-solr.adoc;solr-control-script-reference.html
+running-solr.adoc;installing-solr.html
# A bit uncertain of these
parallel-sql-interface.html;sql-query.html
diff --git a/dev-tools/scripts/refguide/old-guide.txt
b/dev-tools/scripts/refguide/old-guide.txt
index cf1478100d6..7edc6f5fe1a 100644
--- a/dev-tools/scripts/refguide/old-guide.txt
+++ b/dev-tools/scripts/refguide/old-guide.txt
@@ -1,4 +1,5 @@
a-quick-overview.adoc
+a-step-closer.adoc
about-filters.adoc
about-this-guide.adoc
about-tokenizers.adoc
@@ -77,6 +78,7 @@ filter-descriptions.adoc
format-of-solr-xml.adoc
function-queries.adoc
further-assistance.adoc
+getting-assistance.adoc
getting-started-with-solrcloud.adoc
getting-started.adoc
graph-traversal.adoc
@@ -107,6 +109,7 @@ kerberos-authentication-plugin.adoc
language-analysis.adoc
learning-to-rank.adoc
legacy-scaling-and-distribution.adoc
+lib-directives-in-solrconfig.adoc
libs.adoc
loading.adoc
local-parameters-in-queries.adoc
@@ -119,6 +122,7 @@ major-changes-in-solr-7.adoc
major-changes-in-solr-8.adoc
making-and-restoring-backups.adoc
managed-resources.adoc
+managing-solr.adoc
math-expressions.adoc
math-start.adoc
matrix-math.adoc
@@ -153,6 +157,7 @@ query-re-ranking.adoc
query-screen.adoc
query-settings-in-solrconfig.adoc
query-syntax-and-parsing.adoc
+read-and-write-side-fault-tolerance.adoc
realtime-get.adoc
regression.adoc
reindexing.adoc
@@ -162,6 +167,7 @@ replication-screen.adoc
request-parameters-api.adoc
requestdispatcher-in-solrconfig.adoc
requesthandlers-and-searchcomponents-in-solrconfig.adoc
+resource-and-plugin-loading.adoc
resource-loading.adoc
response-writers.adoc
result-grouping.adoc
@@ -169,6 +175,7 @@ rule-based-authorization-plugin.adoc
rule-based-replica-placement.adoc
running-solr-on-hdfs.adoc
running-your-analyzer.adoc
+running-solr.adoc
scalar-math.adoc
schema-api.adoc
schema-browser-screen.adoc
@@ -218,16 +225,21 @@ solrcloud.adoc
spatial-search.adoc
spell-checking.adoc
statistics.adoc
+statistical-programming.adoc
stream-api.adoc
stream-decorator-reference.adoc
+stream-decorators.adoc
stream-evaluator-reference.adoc
+stream-evaluators.adoc
stream-screen.adoc
stream-source-reference.adoc
+stream-sources.adoc
streaming-expressions.adoc
suggester.adoc
suggestions-screen.adoc
taking-solr-to-production.adoc
term-vectors.adoc
+time-routed-aliases.adoc
the-dismax-query-parser.adoc
the-extended-dismax-query-parser.adoc
the-query-elevation-component.adoc
@@ -243,11 +255,13 @@ tokenizers.adoc
transform.adoc
transforming-and-indexing-custom-json.adoc
transforming-result-documents.adoc
+uima-integration.adoc
understanding-analyzers-tokenizers-and-filters.adoc
update-request-processors.adoc
updatehandlers-in-solrconfig.adoc
updating-parts-of-documents.adoc
upgrading-a-solr-cluster.adoc
+upgrading-solr.adoc
uploading-data-with-index-handlers.adoc
uploading-data-with-solr-cell-using-apache-tika.adoc
uploading-structured-data-store-data-with-the-data-import-handler.adoc
@@ -261,6 +275,7 @@ using-zookeeper-to-manage-configuration-files.adoc
v2-api.adoc
variables.adoc
vector-math.adoc
+vectorization.adoc
velocity-response-writer.adoc
velocity-search-ui.adoc
visualization.adoc
diff --git a/dev-tools/scripts/refguide/refguide-add-canonical-url.py
b/dev-tools/scripts/refguide/refguide-add-canonical-url.py
new file mode 100755
index 00000000000..d0ed8c7e941
--- /dev/null
+++ b/dev-tools/scripts/refguide/refguide-add-canonical-url.py
@@ -0,0 +1,189 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script processes all static html files in the versioned (N_M) folders
+of Solr's old reference guide and adds a <link rel="canonical"> element
+after the <title> of each page. The canonical URL points to the matching
+page in the latest guide, or to the 8.11 guide for pages removed in 9.0.
+Pages without a mapping are skipped, as they are their own canonical URL.
+"""
+
+import os
+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import urlparse
+import re
+import argparse
+
+
+def lines_from_file(filename):
+ with open(filename, 'r') as fp:
+ lines = []
+ for line in fp.readlines():
+ if line.startswith("#") or len(line.strip()) == 0:
+ continue
+ lines.append(line.replace(".adoc", ".html").strip())
+ return lines
+
+def generate_canonical_mapping(conf):
+ new = {}
+ name_map = {}
+
+ print("Reading config")
+ old = lines_from_file(conf.old)
+ for line in lines_from_file(conf.new):
+ (path, file) = line.split("/")
+ new[file] = line
+ for line in lines_from_file(conf.mapping):
+ (frm, to) = line.split(";")
+ name_map[frm] = to
+
+ # Files in src/old-pages as of 2022-02-04
+ old_pages = ["configuration-apis.html", "configuration-guide.html",
"controlling-results.html", "deployment-guide.html", "enhancing-queries.html",
"field-types.html", "fields-and-schema-design.html", "getting-started.html",
"indexing-data-operations.html", "installation-deployment.html",
"monitoring-solr.html", "query-guide.html", "scaling-solr.html",
"schema-indexing-guide.html", "solr-concepts.html", "solr-schema.html",
"solrcloud-clusters.html", "user-managed-clusters.html"]
+
+ result = {}
+ old_guide = []
+ failed = {}
+ regex_new = {}
+ print("Converting...")
+ for frm in old:
+ if frm in new:
+ (subpath, name) = new[frm].split("/")
+ if subpath not in regex_new:
+ regex_new[subpath] = []
+ regex_new[subpath].append(name.split(".html")[0])
+ elif frm in name_map:
+ new_name = name_map[frm]
+ new_name_without_anchor = new_name
+ anchor = ""
+ anchor_index = new_name.find("#")
+ if anchor_index > 0:
+ new_name_without_anchor = new_name[:anchor_index]
+ anchor = new_name[anchor_index:]
+ if new_name_without_anchor.startswith("https://"):
+ result[frm] = new_name
+ elif new_name_without_anchor in new:
+ result[frm] = new[new_name_without_anchor] + anchor
+ elif new_name_without_anchor.startswith("/guide/"):
+ result[frm] = new_name[7:]
+ elif new_name_without_anchor == "_8_11":
+ old_guide.append(frm.split(".html")[0])
+ else:
+ failed[frm] = "Mapped value %s not in new guide" %
new_name_without_anchor
+ elif frm in old_pages:
+ failed[frm] = "Not yet mapped (in src/old-pages)"
+ else:
+ failed[frm] = "404"
+
+ mappings = {
+ "index.html": "https://solr.apache.org/guide/solr/latest/index.html",
+
+ }
+ # Add direct mappings from old to new files
+ for key in regex_new:
+ for file in regex_new[key]:
+ mappings[file + ".html"] =
f"https://solr.apache.org/guide/solr/latest/{key}/{file}.html"
+
+ # Add mappings for renamed files
+ for key in result:
+ if result[key].startswith("https://"):
+ mappings[key] = result[key]
+ else:
+ mappings[key] =
f"https://solr.apache.org/guide/solr/latest/{result[key]}"
+
+ # Add mappings for files removed in 9.0, they will be canonical to 8.11
+ for file in old_guide:
+ mappings[file + ".html"] =
f"https://solr.apache.org/guide/8_11/{file}.html"
+
+ for (key, value) in mappings.items():
+ print(key, value)
+ return mappings
+
+def extract_filename_from_path(html_file_path):
+ """Extract filename from path."""
+ match = re.search(r'/([^\/]+)$', html_file_path)
+ return match.group(1) if match else None
+
+def process_html_file(html_file_path, url, mappings):
+    """Process an HTML file to add a canonical link right after its <title>."""
+ with open(html_file_path, "r", encoding="utf-8") as f:
+ lines = f.readlines()
+
+ file_name = extract_filename_from_path(html_file_path)
+ if file_name and file_name in mappings:
+ canonical_url = mappings[file_name]
+ else:
+ canonical_url = url
+ if canonical_url == url:
+ print(f"Skipped {html_file_path}, filename {file_name}, it is the
canonical url: {url}")
+ return
+ canonical_link_html = f"<link rel=\"canonical\"
href=\"{canonical_url}\">\n"
+
+ new_lines = []
+ found_title = False
+ for line in lines:
+ soup = BeautifulSoup(line, "html.parser")
+ title = soup.find("title")
+ canon_link = soup.find("link", attrs={'rel': 'canonical'})
+
+ if title and not found_title:
+ new_lines.append(line)
+ new_lines.append(canonical_link_html)
+ found_title = True
+ elif not (found_title and canon_link):
+ # Skip any other canonical url we find
+ new_lines.append(line)
+
+ if found_title:
+ with open(html_file_path, "w", encoding="utf-8") as f:
+ f.writelines(new_lines)
+ print(f"Updated {html_file_path} to canonical url: {canonical_url}")
+
+def main():
+ parser = argparse.ArgumentParser(description='Process HTML files to add
Canonical URLs to old ref guide pages')
+ parser.add_argument('--old', required=True, help='Old pagenames file, one
.adoc filename per line')
+ parser.add_argument('--new', required=True, help='New pagenames file, one
.adoc filename per line')
+ parser.add_argument('--mapping', required=True, help='Semicolon separated
from-to file names (adoc)')
+ parser.add_argument('--folder', help='Folder of svn checkout
(https://svn.apache.org/repos/infra/sites/solr/guide/)')
+ args = parser.parse_args()
+ mappings = generate_canonical_mapping(args)
+
+ base_dir = args.folder
+
+ # Iterate over the folder structure
+ folders = [name for name in os.listdir(base_dir) if re.match(r'\d+_\d+',
name)]
+ if not folders:
+ print(f"No versioned directories 'N_M' found in {base_dir}, exiting.")
+ return
+ for root_dir in folders:
+ print(f"\nProcessing directory {root_dir}")
+ print(f"=================================")
+ full_path = os.path.join(base_dir, root_dir)
+ if not os.path.exists(full_path):
+ print(f"Directory {full_path} not found, skipping.")
+ continue
+
+ # Process each HTML file in the directory
+ for filename in os.listdir(full_path):
+ if filename.endswith(".html"):
+ html_file_path = os.path.join(full_path, filename)
+ url = f"https://solr.apache.org/guide/{root_dir}/{filename}"
+ process_html_file(html_file_path, url, mappings)
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file