cbaines pushed a commit to branch master
in repository maintenance.

commit ef18d820bbdfc257793824229e0ae22032d4986e
Author: Christopher Baines <[email protected]>
AuthorDate: Mon Nov 4 10:40:57 2024 +0000

    hydra: bayfront: Try to stop bots from crawling git.qa.guix.gnu.org.
    
    As branches pass through this repository, there's no value in indexing
    it anyway, and the requests cause cgit to time out.
    
    * hydra/bayfront.scm (%git.qa.guix.gnu.org-cgit-configuration-nginx):
    Add robots.txt and block specific bots.
---
 hydra/bayfront.scm | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/hydra/bayfront.scm b/hydra/bayfront.scm
index b9ffe0e4..ce9ac949 100644
--- a/hydra/bayfront.scm
+++ b/hydra/bayfront.scm
@@ -1021,6 +1021,10 @@ access_log   /var/log/nginx/git.qa.access.log;"))
            "access_log /var/log/nginx/git.qa.access.log;"))
     (locations
      (list
+      (nginx-location-configuration
+       (uri "/robots.txt")
+       (body '("add_header  Content-Type  text/plain;"
+               "return 200 \"User-agent: *\\nDisallow: /\\n\";")))
       (let ((base
              (git-http-nginx-location-configuration
               (git-http-configuration
@@ -1034,7 +1038,12 @@ access_log   /var/log/nginx/git.qa.access.log;"))
             "fastcgi_param GIT_CONFIG_SYSTEM \"/etc/gitconfig\";"))))
       (nginx-location-configuration
        (uri "@cgit")
-       (body '("fastcgi_param SCRIPT_FILENAME $document_root/lib/cgit/cgit.cgi;"
+       (body '(;; This can maybe be removed if the robots.txt starts
+               ;; to be effective
+               "if ($http_user_agent ~ (Bytespider|ClaudeBot) ) {
+  return 403;
+}"
+               "fastcgi_param SCRIPT_FILENAME $document_root/lib/cgit/cgit.cgi;"
                "fastcgi_param PATH_INFO $uri;"
                "fastcgi_param QUERY_STRING $args;"
                "fastcgi_param HTTP_HOST $server_name;"

Reply via email to