cbaines pushed a commit to branch master
in repository maintenance.
commit ef18d820bbdfc257793824229e0ae22032d4986e
Author: Christopher Baines <[email protected]>
AuthorDate: Mon Nov 4 10:40:57 2024 +0000
hydra: bayfront: Try to stop bots from crawling git.qa.guix.gnu.org.
As branches pass through this repository, there's no value in indexing
it anyway, and the requests cause cgit to time out.
* hydra/bayfront.scm (%git.qa.guix.gnu.org-cgit-configuration-nginx):
Add robots.txt and block specific bots.
---
hydra/bayfront.scm | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/hydra/bayfront.scm b/hydra/bayfront.scm
index b9ffe0e4..ce9ac949 100644
--- a/hydra/bayfront.scm
+++ b/hydra/bayfront.scm
@@ -1021,6 +1021,10 @@ access_log /var/log/nginx/git.qa.access.log;"))
"access_log /var/log/nginx/git.qa.access.log;"))
(locations
(list
+ (nginx-location-configuration
+ (uri "/robots.txt")
+ (body '("add_header Content-Type text/plain;"
+ "return 200 \"User-agent: *\\nDisallow: /\\n\";")))
(let ((base
(git-http-nginx-location-configuration
(git-http-configuration
@@ -1034,7 +1038,12 @@ access_log /var/log/nginx/git.qa.access.log;"))
"fastcgi_param GIT_CONFIG_SYSTEM \"/etc/gitconfig\";"))))
(nginx-location-configuration
(uri "@cgit")
- (body '("fastcgi_param SCRIPT_FILENAME
$document_root/lib/cgit/cgit.cgi;"
+ (body '(;; This can maybe be removed if the robots.txt starts
+ ;; to be effective
+ "if ($http_user_agent ~ (Bytespider|ClaudeBot) ) {
+ return 403;
+}"
+ "fastcgi_param SCRIPT_FILENAME
$document_root/lib/cgit/cgit.cgi;"
"fastcgi_param PATH_INFO $uri;"
"fastcgi_param QUERY_STRING $args;"
"fastcgi_param HTTP_HOST $server_name;"