This is an automated email from the ASF dual-hosted git repository.

houston pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/main by this push:
     new ee97f7cefbe SOLR-16262: Canonical URL script should add noindex robots meta tags
ee97f7cefbe is described below

commit ee97f7cefbec8db442f694f343a52e1f51070247
Author: Houston Putman <[email protected]>
AuthorDate: Tue Apr 22 11:11:42 2025 -0500

    SOLR-16262: Canonical URL script should add noindex robots meta tags
---
 .../scripts/refguide/refguide-add-canonical-url.py      | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/dev-tools/scripts/refguide/refguide-add-canonical-url.py b/dev-tools/scripts/refguide/refguide-add-canonical-url.py
index d0ed8c7e941..bd40f64c231 100755
--- a/dev-tools/scripts/refguide/refguide-add-canonical-url.py
+++ b/dev-tools/scripts/refguide/refguide-add-canonical-url.py
@@ -16,11 +16,14 @@
 # limitations under the License.
 
 """
-This script processes all static html files for Solr's refernce guide
-and downloads external JS and CSS files to local folders js/ and css/ for
-each version. It also updates the HTML files to reference the local files.
-Context is that ASF policy for web sites changed to not allow external
-references to JS and CSS files, and these sites were generated long ago.
+This script processes all static html files for Solr's reference guide
+and adds canonical URLs for old pages (Solr 6 - Solr 8, Solr 9+ should not
+be affected). Since Google doesn't always respect the canonical URL
+directive, the meta tag for robots "noindex" is also added to ensure these
+outdated pages do not show up on Google search results.
+This script uses the same logic as the htaccess generation script to
+determine which pages are the "last" versions of that page, so that it can
+be indexed by google as the most recent information.
 """
 
 import os
@@ -30,6 +33,7 @@ from urllib.parse import urlparse
 import re
 import argparse
 
+robots_no_index_html = "<meta name=\"robots\" content=\"noindex\">"
 
 def lines_from_file(filename):
     with open(filename, 'r') as fp:
@@ -144,6 +148,7 @@ def process_html_file(html_file_path, url, mappings):
         if title and not found_title:
             new_lines.append(line)
             new_lines.append(canonical_link_html)
+            new_lines.append(robots_no_index_html)
             found_title = True
         elif not (found_title and canon_link):
             # Skip any other canonical url we find
@@ -186,4 +191,4 @@ def main():
                 process_html_file(html_file_path, url, mappings)
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()

Reply via email to