This is an automated email from the ASF dual-hosted git repository.
houston pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/main by this push:
new ee97f7cefbe SOLR-16262: Canonical URL script should add noindex robots
meta tags
ee97f7cefbe is described below
commit ee97f7cefbec8db442f694f343a52e1f51070247
Author: Houston Putman <[email protected]>
AuthorDate: Tue Apr 22 11:11:42 2025 -0500
SOLR-16262: Canonical URL script should add noindex robots meta tags
---
.../scripts/refguide/refguide-add-canonical-url.py | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)
diff --git a/dev-tools/scripts/refguide/refguide-add-canonical-url.py b/dev-tools/scripts/refguide/refguide-add-canonical-url.py
index d0ed8c7e941..bd40f64c231 100755
--- a/dev-tools/scripts/refguide/refguide-add-canonical-url.py
+++ b/dev-tools/scripts/refguide/refguide-add-canonical-url.py
@@ -16,11 +16,14 @@
# limitations under the License.
"""
-This script processes all static html files for Solr's refernce guide
-and downloads external JS and CSS files to local folders js/ and css/ for
-each version. It also updates the HTML files to reference the local files.
-Context is that ASF policy for web sites changed to not allow external
-references to JS and CSS files, and these sites were generated long ago.
+This script processes all static html files for Solr's reference guide
+and adds canonical URLs for old pages (Solr 6 - Solr 8; Solr 9+ should not
+be affected). Since Google doesn't always respect the canonical URL
+directive, the meta tag for robots "noindex" is also added to ensure these
+outdated pages do not show up on Google search results.
+This script uses the same logic as the htaccess generation script to
+determine which version of each page is the "last" one, so that the last
+version can be indexed by Google as the most recent information.
"""
import os
@@ -30,6 +33,7 @@ from urllib.parse import urlparse
import re
import argparse
+robots_no_index_html = "<meta name=\"robots\" content=\"noindex\">"
def lines_from_file(filename):
with open(filename, 'r') as fp:
@@ -144,6 +148,7 @@ def process_html_file(html_file_path, url, mappings):
if title and not found_title:
new_lines.append(line)
new_lines.append(canonical_link_html)
+ new_lines.append(robots_no_index_html)
found_title = True
elif not (found_title and canon_link):
# Skip any other canonical url we find
@@ -186,4 +191,4 @@ def main():
process_html_file(html_file_path, url, mappings)
if __name__ == "__main__":
- main()
\ No newline at end of file
+ main()