This is an automated email from the ASF dual-hosted git repository.

houston pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/main by this push:
     new ad7ceb7ad71 SOLR-16262: Add a script to add canonical links to Solr 
6-8 ref guide pages (#3319)
ad7ceb7ad71 is described below

commit ad7ceb7ad71eb4279c3b530fb2ba53e5ee3e1ec2
Author: Houston Putman <[email protected]>
AuthorDate: Tue Apr 15 11:52:23 2025 -0500

    SOLR-16262: Add a script to add canonical links to Solr 6-8 ref guide pages 
(#3319)
---
 .../scripts/refguide/gen-refguide-redirects.py     |   7 -
 dev-tools/scripts/refguide/htaccess.txt            |  21 ++-
 dev-tools/scripts/refguide/mappings.csv            |  16 ++
 dev-tools/scripts/refguide/old-guide.txt           |  15 ++
 .../scripts/refguide/refguide-add-canonical-url.py | 189 +++++++++++++++++++++
 5 files changed, 235 insertions(+), 13 deletions(-)

diff --git a/dev-tools/scripts/refguide/gen-refguide-redirects.py 
b/dev-tools/scripts/refguide/gen-refguide-redirects.py
index e7ab1cb1ce0..fac79229702 100755
--- a/dev-tools/scripts/refguide/gen-refguide-redirects.py
+++ b/dev-tools/scripts/refguide/gen-refguide-redirects.py
@@ -129,13 +129,6 @@ def main():
         print("# Paths we could not map")
         for key in failed:
             print("# %s: %s" % (key, failed[key]))
-
-        print("""
-
-# Do not index old reference guide pages on search engines, except for pages 
that don't exist in 9+
-<If "%%{REQUEST_URI} =~ m#/guide/(6|7|8)_.*# && %%{REQUEST_URI} !~ 
m#/guide/8_11/%s$#">
-  Header set X-Robots-Tag "noindex,nofollow,noarchive"
-</If>""" % old_version_pages_regex)
     else:
         out("Regex mappings:")
         pprint(regex_new)
diff --git a/dev-tools/scripts/refguide/htaccess.txt 
b/dev-tools/scripts/refguide/htaccess.txt
index 419467d09dd..ba9ae2e9f0b 100644
--- a/dev-tools/scripts/refguide/htaccess.txt
+++ b/dev-tools/scripts/refguide/htaccess.txt
@@ -8,6 +8,7 @@ RedirectMatch 301 
^/guide/(major-changes-in-solr-7|major-changes-in-solr-8|solr-
 # Page renames between 8.x and 9.0
 RewriteRule ^guide/9_0/solr-tutorial.html 
/guide/solr/latest/getting-started/solr-tutorial.html [R=301,NE,L]
 RewriteRule ^guide/a-quick-overview.html 
/guide/solr/latest/getting-started/introduction.html [R=301,NE,L]
+RewriteRule ^guide/a-step-closer.html 
/guide/solr/latest/configuration-guide/configuration-files.html [R=301,NE,L]
 RewriteRule ^guide/about-filters.html 
/guide/solr/latest/indexing-guide/filters.html [R=301,NE,L]
 RewriteRule ^guide/about-tokenizers.html 
/guide/solr/latest/indexing-guide/tokenizers.html [R=301,NE,L]
 RewriteRule ^guide/aws-solrcloud-tutorial.html 
/guide/solr/latest/getting-started/tutorial-aws.html [R=301,NE,L]
@@ -32,6 +33,7 @@ RewriteRule ^guide/files-screen.html 
/guide/solr/latest/configuration-guide/conf
 RewriteRule ^guide/filter-descriptions.html 
/guide/solr/latest/indexing-guide/filters.html [R=301,NE,L]
 RewriteRule ^guide/format-of-solr-xml.html 
/guide/solr/latest/configuration-guide/configuring-solr-xml.html [R=301,NE,L]
 RewriteRule ^guide/further-assistance.html 
https://solr.apache.org/community.html [R=301,NE,L]
+RewriteRule ^guide/getting-assistance.html 
https://solr.apache.org/community.html [R=301,NE,L]
 RewriteRule ^guide/getting-started-with-solrcloud.html 
/guide/solr/latest/getting-started/tutorial-solrcloud.html [R=301,NE,L]
 RewriteRule ^guide/getting-started.html 
/guide/solr/latest/getting-started/introduction.html [R=301,NE,L]
 RewriteRule ^guide/how-solrcloud-works.html 
/guide/solr/latest/deployment-guide/cluster-types.html#solrcloud-mode 
[R=301,NE,L]
@@ -45,10 +47,12 @@ RewriteRule 
^guide/introduction-to-scaling-and-distribution.html /guide/solr/lat
 RewriteRule ^guide/introduction-to-solr-indexing.html 
/guide/solr/latest/getting-started/solr-indexing.html [R=301,NE,L]
 RewriteRule ^guide/java-properties.html 
/guide/solr/latest/deployment-guide/jvm-settings.html [R=301,NE,L]
 RewriteRule ^guide/legacy-scaling-and-distribution.html 
/guide/solr/latest/deployment-guide/cluster-types.html#user-managed-mode 
[R=301,NE,L]
+RewriteRule ^guide/lib-directives-in-solrconfig.html 
/guide/solr/latest/configuration-guide/libs.html [R=301,NE,L]
 RewriteRule ^guide/local-parameters-in-queries.html 
/guide/solr/latest/query-guide/local-params.html [R=301,NE,L]
 RewriteRule ^guide/logging.html 
/guide/solr/latest/deployment-guide/configuring-logging.html [R=301,NE,L]
 RewriteRule ^guide/major-changes-from-solr-5-to-solr-6.html 
/guide/solr/latest/upgrade-notes/major-changes-in-solr-6.html [R=301,NE,L]
 RewriteRule ^guide/making-and-restoring-backups.html 
/guide/solr/latest/deployment-guide/backup-restore.html [R=301,NE,L]
+RewriteRule ^guide/managing-solr.html 
/guide/solr/latest/deployment-guide/solr-control-script-reference.html 
[R=301,NE,L]
 RewriteRule ^guide/merging-indexes.html 
/guide/solr/latest/configuration-guide/coreadmin-api.html [R=301,NE,L]
 RewriteRule ^guide/monitoring-solr-with-prometheus-and-grafana.html 
/guide/solr/latest/deployment-guide/monitoring-with-prometheus-and-grafana.html 
[R=301,NE,L]
 RewriteRule ^guide/monitoring-solr.html 
/guide/solr/latest/deployment-guide/configuring-logging.html [R=301,NE,L]
@@ -61,11 +65,14 @@ RewriteRule ^guide/parallel-sql-interface.html 
/guide/solr/latest/query-guide/sq
 RewriteRule ^guide/parameter-reference.html 
/guide/solr/latest/configuration-guide/configuring-solr-xml.html [R=301,NE,L]
 RewriteRule ^guide/query-settings-in-solrconfig.html 
/guide/solr/latest/configuration-guide/caches-warming.html [R=301,NE,L]
 RewriteRule ^guide/query-syntax-and-parsing.html 
/guide/solr/latest/query-guide/query-syntax-and-parsers.html [R=301,NE,L]
+RewriteRule ^guide/read-and-write-side-fault-tolerance.html 
/guide/solr/latest/deployment-guide/solrcloud-recoveries-and-write-tolerance.html
 [R=301,NE,L]
 RewriteRule ^guide/replication-screen.html 
/guide/solr/latest/deployment-guide/user-managed-index-replication.html 
[R=301,NE,L]
 RewriteRule ^guide/requestdispatcher-in-solrconfig.html 
/guide/solr/latest/configuration-guide/requestdispatcher.html [R=301,NE,L]
 RewriteRule ^guide/requesthandlers-and-searchcomponents-in-solrconfig.html 
/guide/solr/latest/configuration-guide/requesthandlers-searchcomponents.html 
[R=301,NE,L]
+RewriteRule ^guide/resource-and-plugin-loading.html 
/guide/solr/latest/configuration-guide/resource-loading.html [R=301,NE,L]
 RewriteRule ^guide/running-solr-on-hdfs.html 
/guide/solr/latest/deployment-guide/solr-on-hdfs.html [R=301,NE,L]
 RewriteRule ^guide/running-your-analyzer.html 
/guide/solr/latest/indexing-guide/analysis-screen.html [R=301,NE,L]
+RewriteRule ^guide/running-solr.html 
/guide/solr/latest/deployment-guide/installing-solr.html [R=301,NE,L]
 RewriteRule ^guide/schema-factory-definition-in-solrconfig.html 
/guide/solr/latest/configuration-guide/schema-factory.html [R=301,NE,L]
 RewriteRule ^guide/searching.html 
/guide/solr/latest/query-guide/query-syntax-and-parsers.html [R=301,NE,L]
 RewriteRule ^guide/segments-info.html 
/guide/solr/latest/configuration-guide/index-segments-merging.html [R=301,NE,L]
@@ -85,6 +92,11 @@ RewriteRule 
^guide/solrcloud-configuration-and-parameters.html /guide/solr/lates
 RewriteRule ^guide/solrcloud-query-routing-and-read-tolerance.html 
/guide/solr/latest/deployment-guide/solrcloud-distributed-requests.html 
[R=301,NE,L]
 RewriteRule ^guide/solrcloud-resilience.html 
/guide/solr/latest/deployment-guide/solrcloud-recoveries-and-write-tolerance.html
 [R=301,NE,L]
 RewriteRule ^guide/solrcloud.html 
/guide/solr/latest/deployment-guide/cluster-types.html#solrcloud-mode 
[R=301,NE,L]
+RewriteRule ^guide/statistical-programming.html 
/guide/solr/latest/query-guide/math-expressions.html [R=301,NE,L]
+RewriteRule ^guide/stream-decorators.html 
/guide/solr/latest/query-guide/stream-decorator-reference.html [R=301,NE,L]
+RewriteRule ^guide/stream-evaluators.html 
/guide/solr/latest/query-guide/stream-evaluator-reference.html [R=301,NE,L]
+RewriteRule ^guide/stream-sources.html 
/guide/solr/latest/query-guide/stream-source-reference.html [R=301,NE,L]
+RewriteRule ^guide/time-routed-aliases.html 
https://solr.apache.org/guide/solr/latest/deployment-guide/aliases.html#time-routed-aliases
 [R=301,NE,L]
 RewriteRule ^guide/the-dismax-query-parser.html 
/guide/solr/latest/query-guide/dismax-query-parser.html [R=301,NE,L]
 RewriteRule ^guide/the-extended-dismax-query-parser.html 
/guide/solr/latest/query-guide/edismax-query-parser.html [R=301,NE,L]
 RewriteRule ^guide/the-query-elevation-component.html 
/guide/solr/latest/query-guide/query-elevation-component.html [R=301,NE,L]
@@ -95,9 +107,11 @@ RewriteRule ^guide/the-term-vector-component.html 
/guide/solr/latest/query-guide
 RewriteRule ^guide/the-terms-component.html 
/guide/solr/latest/query-guide/terms-component.html [R=301,NE,L]
 RewriteRule ^guide/the-well-configured-solr-instance.html 
/guide/solr/latest/configuration-guide/configuration-files.html [R=301,NE,L]
 RewriteRule ^guide/transforming-result-documents.html 
/guide/solr/latest/query-guide/document-transformers.html [R=301,NE,L]
+RewriteRule ^guide/uima-integration.html 
https://solr.apache.org/guide/7_4/uima-integration.html [R=301,NE,L]
 RewriteRule ^guide/understanding-analyzers-tokenizers-and-filters.html 
/guide/solr/latest/indexing-guide/document-analysis.html [R=301,NE,L]
 RewriteRule ^guide/updatehandlers-in-solrconfig.html 
/guide/solr/latest/configuration-guide/commits-transaction-logs.html 
[R=301,NE,L]
 RewriteRule ^guide/updating-parts-of-documents.html 
/guide/solr/latest/indexing-guide/partial-document-updates.html [R=301,NE,L]
+RewriteRule ^guide/upgrading-solr.html 
/guide/solr/latest/deployment-guide/upgrading-a-solr-cluster.html [R=301,NE,L]
 RewriteRule ^guide/uploading-data-with-index-handlers.html 
/guide/solr/latest/indexing-guide/indexing-with-update-handlers.html 
[R=301,NE,L]
 RewriteRule ^guide/uploading-data-with-solr-cell-using-apache-tika.html 
/guide/solr/latest/indexing-guide/indexing-with-tika.html [R=301,NE,L]
 RewriteRule ^guide/using-javascript.html 
/guide/solr/latest/deployment-guide/javascript.html [R=301,NE,L]
@@ -107,6 +121,7 @@ RewriteRule ^guide/using-solr-from-ruby.html 
/guide/solr/latest/deployment-guide
 RewriteRule ^guide/using-solrj.html 
/guide/solr/latest/deployment-guide/solrj.html [R=301,NE,L]
 RewriteRule ^guide/using-the-solr-administration-user-interface.html 
/guide/solr/latest/getting-started/solr-admin-ui.html [R=301,NE,L]
 RewriteRule ^guide/using-zookeeper-to-manage-configuration-files.html 
/guide/solr/latest/deployment-guide/zookeeper-file-management.html [R=301,NE,L]
+RewriteRule ^guide/vectorization.html 
/guide/solr/latest/query-guide/math-expressions.html [R=301,NE,L]
 RewriteRule ^guide/working-with-currencies-and-exchange-rates.html 
/guide/solr/latest/indexing-guide/currencies-exchange-rates.html [R=301,NE,L]
 RewriteRule ^guide/working-with-dates.html 
/guide/solr/latest/indexing-guide/date-formatting-math.html [R=301,NE,L]
 RewriteRule ^guide/working-with-enum-fields.html 
/guide/solr/latest/indexing-guide/enum-fields.html [R=301,NE,L]
@@ -118,9 +133,3 @@ RewriteRule ^guide/real-time-get.html 
/guide/solr/latest/configuration-guide/rea
 # Removed pages redirected to latest 8.x guide
 RedirectMatch 301 
^/guide/(adding-custom-plugins-in-solrcloud-mode|blob-store-api|blockjoin-faceting|cdcr-api|cdcr-architecture|cdcr-config|cdcr-operations|colocating-collections|cross-data-center-replication-cdcr|dataimport-screen|errata|metrics-history|migrate-to-policy-rule|putting-the-pieces-together|rule-based-replica-placement|solrcloud-autoscaling-api|solrcloud-autoscaling-auto-add-replicas|solrcloud-autoscaling-fault-tolerance|solrcloud-autoscaling-listeners|solrcloud-autoscaling
 [...]
 # Paths we could not map
-
-
-# Do not index old reference guide pages on search engines, except for pages 
that don't exist in 9+
-<If "%{REQUEST_URI} =~ m#/guide/(6|7|8)_.*# && %{REQUEST_URI} !~ 
m#/guide/8_11/(adding-custom-plugins-in-solrcloud-mode|blob-store-api|blockjoin-faceting|cdcr-api|cdcr-architecture|cdcr-config|cdcr-operations|colocating-collections|cross-data-center-replication-cdcr|dataimport-screen|errata|metrics-history|migrate-to-policy-rule|putting-the-pieces-together|rule-based-replica-placement|solrcloud-autoscaling-api|solrcloud-autoscaling-auto-add-replicas|solrcloud-autoscaling-fault-tolerance|
 [...]
-  Header set X-Robots-Tag "noindex,nofollow,noarchive"
-</If>
diff --git a/dev-tools/scripts/refguide/mappings.csv 
b/dev-tools/scripts/refguide/mappings.csv
index 910207af4fb..94f6754b1ef 100644
--- a/dev-tools/scripts/refguide/mappings.csv
+++ b/dev-tools/scripts/refguide/mappings.csv
@@ -19,6 +19,7 @@ indexconfig-in-solrconfig.html;index-segments-merging.html
 indexing-and-basic-data-operations.html;indexing-with-update-handlers.html
 initparams-in-solrconfig.html;initparams.html
 introduction-to-solr-indexing.html;solr-indexing.html
+lib-directives-in-solrconfig.html;libs.html
 local-parameters-in-queries.html;local-params.html
 major-changes-from-solr-5-to-solr-6.html;major-changes-in-solr-6.html
 making-and-restoring-backups.html;backup-restore.html
@@ -105,6 +106,21 @@ 
the-well-configured-solr-instance.adoc;configuration-files.html
 solrcloud.adoc;cluster-types.html#solrcloud-mode
 how-to-contribute.adoc;https://solr.apache.org/community.html#how-to-contribute
 deployment-and-operations.adoc;installing-solr.html
+vectorization.adoc;math-expressions.html
+time-routed-aliases.adoc;https://solr.apache.org/guide/solr/latest/deployment-guide/aliases.html#time-routed-aliases
+resource-and-plugin-loading.adoc;resource-loading.html
+getting-assistance.adoc;https://solr.apache.org/community.html
+statistical-programming.adoc;math-expressions.html
+lib-directives-in-solrconfig.adoc;libs.html
+uima-integration.adoc;https://solr.apache.org/guide/7_4/uima-integration.html
+upgrading-solr.adoc;upgrading-a-solr-cluster.html
+stream-decorators.adoc;stream-decorator-reference.html
+stream-sources.adoc;stream-source-reference.html
+stream-evaluators.adoc;stream-evaluator-reference.html
+a-step-closer.adoc;configuration-files.html
+read-and-write-side-fault-tolerance.adoc;solrcloud-recoveries-and-write-tolerance.html
+managing-solr.adoc;solr-control-script-reference.html
+running-solr.adoc;installing-solr.html
 
 # A bit uncertain of these
 parallel-sql-interface.html;sql-query.html
diff --git a/dev-tools/scripts/refguide/old-guide.txt 
b/dev-tools/scripts/refguide/old-guide.txt
index cf1478100d6..7edc6f5fe1a 100644
--- a/dev-tools/scripts/refguide/old-guide.txt
+++ b/dev-tools/scripts/refguide/old-guide.txt
@@ -1,4 +1,5 @@
 a-quick-overview.adoc
+a-step-closer.adoc
 about-filters.adoc
 about-this-guide.adoc
 about-tokenizers.adoc
@@ -77,6 +78,7 @@ filter-descriptions.adoc
 format-of-solr-xml.adoc
 function-queries.adoc
 further-assistance.adoc
+getting-assistance.adoc
 getting-started-with-solrcloud.adoc
 getting-started.adoc
 graph-traversal.adoc
@@ -107,6 +109,7 @@ kerberos-authentication-plugin.adoc
 language-analysis.adoc
 learning-to-rank.adoc
 legacy-scaling-and-distribution.adoc
+lib-directives-in-solrconfig.adoc
 libs.adoc
 loading.adoc
 local-parameters-in-queries.adoc
@@ -119,6 +122,7 @@ major-changes-in-solr-7.adoc
 major-changes-in-solr-8.adoc
 making-and-restoring-backups.adoc
 managed-resources.adoc
+managing-solr.adoc
 math-expressions.adoc
 math-start.adoc
 matrix-math.adoc
@@ -153,6 +157,7 @@ query-re-ranking.adoc
 query-screen.adoc
 query-settings-in-solrconfig.adoc
 query-syntax-and-parsing.adoc
+read-and-write-side-fault-tolerance.adoc
 realtime-get.adoc
 regression.adoc
 reindexing.adoc
@@ -162,6 +167,7 @@ replication-screen.adoc
 request-parameters-api.adoc
 requestdispatcher-in-solrconfig.adoc
 requesthandlers-and-searchcomponents-in-solrconfig.adoc
+resource-and-plugin-loading.adoc
 resource-loading.adoc
 response-writers.adoc
 result-grouping.adoc
@@ -169,6 +175,7 @@ rule-based-authorization-plugin.adoc
 rule-based-replica-placement.adoc
 running-solr-on-hdfs.adoc
 running-your-analyzer.adoc
+running-solr.adoc
 scalar-math.adoc
 schema-api.adoc
 schema-browser-screen.adoc
@@ -218,16 +225,21 @@ solrcloud.adoc
 spatial-search.adoc
 spell-checking.adoc
 statistics.adoc
+statistical-programming.adoc
 stream-api.adoc
 stream-decorator-reference.adoc
+stream-decorators.adoc
 stream-evaluator-reference.adoc
+stream-evaluators.adoc
 stream-screen.adoc
 stream-source-reference.adoc
+stream-sources.adoc
 streaming-expressions.adoc
 suggester.adoc
 suggestions-screen.adoc
 taking-solr-to-production.adoc
 term-vectors.adoc
+time-routed-aliases.adoc
 the-dismax-query-parser.adoc
 the-extended-dismax-query-parser.adoc
 the-query-elevation-component.adoc
@@ -243,11 +255,13 @@ tokenizers.adoc
 transform.adoc
 transforming-and-indexing-custom-json.adoc
 transforming-result-documents.adoc
+uima-integration.adoc
 understanding-analyzers-tokenizers-and-filters.adoc
 update-request-processors.adoc
 updatehandlers-in-solrconfig.adoc
 updating-parts-of-documents.adoc
 upgrading-a-solr-cluster.adoc
+upgrading-solr.adoc
 uploading-data-with-index-handlers.adoc
 uploading-data-with-solr-cell-using-apache-tika.adoc
 uploading-structured-data-store-data-with-the-data-import-handler.adoc
@@ -261,6 +275,7 @@ using-zookeeper-to-manage-configuration-files.adoc
 v2-api.adoc
 variables.adoc
 vector-math.adoc
+vectorization.adoc
 velocity-response-writer.adoc
 velocity-search-ui.adoc
 visualization.adoc
diff --git a/dev-tools/scripts/refguide/refguide-add-canonical-url.py 
b/dev-tools/scripts/refguide/refguide-add-canonical-url.py
new file mode 100755
index 00000000000..d0ed8c7e941
--- /dev/null
+++ b/dev-tools/scripts/refguide/refguide-add-canonical-url.py
@@ -0,0 +1,189 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script processes the static html files of Solr's old reference guide
+(the versioned N_M folders) and adds a <link rel="canonical"> tag after the
+<title> of each page, pointing to the equivalent page in the latest guide,
+or to the 8.11 guide for pages that were removed in 9.0. Pages that have no
+mapping, or that already are the canonical url, are left untouched.
+"""
+
+import os
+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import urlparse
+import re
+import argparse
+
+
+def lines_from_file(filename):
+    with open(filename, 'r') as fp:
+        lines = []
+        for line in fp.readlines():
+            if line.startswith("#") or len(line.strip()) == 0:
+                continue
+            lines.append(line.replace(".adoc", ".html").strip())
+        return lines
+
+def generate_canonical_mapping(conf):
+    new = {}
+    name_map = {}
+
+    print("Reading config")
+    old = lines_from_file(conf.old)
+    for line in lines_from_file(conf.new):
+        (path, file) = line.split("/")
+        new[file] = line
+    for line in lines_from_file(conf.mapping):
+        (frm, to) = line.split(";")
+        name_map[frm] = to
+
+    # Files in src/old-pages as of 2022-02-04
+    old_pages = ["configuration-apis.html", "configuration-guide.html", 
"controlling-results.html", "deployment-guide.html", "enhancing-queries.html", 
"field-types.html", "fields-and-schema-design.html", "getting-started.html", 
"indexing-data-operations.html", "installation-deployment.html", 
"monitoring-solr.html", "query-guide.html", "scaling-solr.html", 
"schema-indexing-guide.html", "solr-concepts.html", "solr-schema.html", 
"solrcloud-clusters.html", "user-managed-clusters.html"]
+
+    result = {}
+    old_guide = []
+    failed = {}
+    regex_new = {}
+    print("Converting...")
+    for frm in old:
+        if frm in new:
+            (subpath, name) = new[frm].split("/")
+            if subpath not in regex_new:
+                regex_new[subpath] = []
+            regex_new[subpath].append(name.split(".html")[0])
+        elif frm in name_map:
+            new_name = name_map[frm]
+            new_name_without_anchor = new_name
+            anchor = ""
+            anchor_index = new_name.find("#")
+            if anchor_index > 0:
+                new_name_without_anchor = new_name[:anchor_index]
+                anchor = new_name[anchor_index:]
+            if new_name_without_anchor.startswith("https://"):
+                result[frm] = new_name
+            elif new_name_without_anchor in new:
+                result[frm] = new[new_name_without_anchor] + anchor
+            elif new_name_without_anchor.startswith("/guide/"):
+                result[frm] = new_name[7:]
+            elif new_name_without_anchor == "_8_11":
+                old_guide.append(frm.split(".html")[0])
+            else:
+                failed[frm] = "Mapped value %s not in new guide" % 
new_name_without_anchor
+        elif frm in old_pages:
+            failed[frm] = "Not yet mapped (in src/old-pages)"
+        else:
+            failed[frm] = "404"
+
+    mappings = {
+        "index.html": "https://solr.apache.org/guide/solr/latest/index.html",
+
+    }
+    # Add direct mappings from old to new files
+    for key in regex_new:
+        for file in regex_new[key]:
+            mappings[file + ".html"] = f"https://solr.apache.org/guide/solr/latest/{key}/{file}.html"
+
+    # Add mappings for renamed files
+    for key in result:
+        if result[key].startswith("https://"):
+            mappings[key] = result[key]
+        else:
+            mappings[key] = f"https://solr.apache.org/guide/solr/latest/{result[key]}"
+
+    # Add mappings for files removed in 9.0, they will be canonical to 8.11
+    for file in old_guide:
+        mappings[file + ".html"] = f"https://solr.apache.org/guide/8_11/{file}.html"
+
+    for (key, value) in mappings.items():
+        print(key, value)
+    return mappings
+
+def extract_filename_from_path(html_file_path):
+    """Extract filename from path."""
+    match = re.search(r'/([^\/]+)$', html_file_path)
+    return match.group(1) if match else None
+
+def process_html_file(html_file_path, url, mappings):
+    """Process an HTML file to add a canonical link right after its <title>."""
+    with open(html_file_path, "r", encoding="utf-8") as f:
+        lines = f.readlines()
+
+    file_name = extract_filename_from_path(html_file_path)
+    if file_name and file_name in mappings:
+        canonical_url = mappings[file_name]
+    else:
+        canonical_url = url
+    if canonical_url == url:
+        print(f"Skipped {html_file_path}, filename {file_name}, it is the 
canonical url: {url}")
+        return
+    canonical_link_html = f"<link rel=\"canonical\" 
href=\"{canonical_url}\">\n"
+
+    new_lines = []
+    found_title = False
+    for line in lines:
+        soup = BeautifulSoup(line, "html.parser")
+        title = soup.find("title")
+        canon_link = soup.find("link", attrs={'rel': 'canonical'})
+
+        if title and not found_title:
+            new_lines.append(line)
+            new_lines.append(canonical_link_html)
+            found_title = True
+        elif not (found_title and canon_link):
+            # Skip any other canonical url we find
+            new_lines.append(line)
+
+    if found_title:
+        with open(html_file_path, "w", encoding="utf-8") as f:
+            f.writelines(new_lines)
+        print(f"Updated {html_file_path} to canonical url: {canonical_url}")
+
+def main():
+    parser = argparse.ArgumentParser(description='Process HTML files to add 
Canonical URLs to old ref guide pages')
+    parser.add_argument('--old', required=True, help='Old pagenames file, one 
.adoc filename per line')
+    parser.add_argument('--new', required=True, help='New pagenames file, one 
.adoc filename per line')
+    parser.add_argument('--mapping', required=True, help='Semicolon separated 
from-to file names (adoc)')
+    parser.add_argument('--folder', help='Folder of svn checkout 
(https://svn.apache.org/repos/infra/sites/solr/guide/)')
+    args = parser.parse_args()
+    mappings = generate_canonical_mapping(args)
+
+    base_dir = args.folder
+
+    # Iterate over the folder structure
+    folders = [name for name in os.listdir(base_dir) if re.match(r'\d+_\d+', 
name)]
+    if not folders:
+        print(f"No versioned directories 'N_M' found in {base_dir}, exiting.")
+        return
+    for root_dir in folders:
+        print(f"\nProcessing directory {root_dir}")
+        print(f"=================================")
+        full_path = os.path.join(base_dir, root_dir)
+        if not os.path.exists(full_path):
+            print(f"Directory {full_path} not found, skipping.")
+            continue
+
+        # Process each HTML file in the directory
+        for filename in os.listdir(full_path):
+            if filename.endswith(".html"):
+                html_file_path = os.path.join(full_path, filename)
+                url = f"https://solr.apache.org/guide/{root_dir}/{filename}"
+                process_html_file(html_file_path, url, mappings)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

Reply via email to