This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new b481f912c NUTCH-3083 Add RobotRulesParser to bin/nutch
b481f912c is described below
commit b481f912cee9ddf985886491b1e1ce695af4d23d
Author: Sebastian Nagel <[email protected]>
AuthorDate: Sun Oct 27 12:42:14 2024 +0100
NUTCH-3083 Add RobotRulesParser to bin/nutch
Add command *robotsparser* to bin/nutch, invoking the main method
of org.apache.nutch.protocol.RobotRulesParser
---
src/bin/nutch | 3 +++
1 file changed, 3 insertions(+)
diff --git a/src/bin/nutch b/src/bin/nutch
index 0b55388c6..257059deb 100755
--- a/src/bin/nutch
+++ b/src/bin/nutch
@@ -86,6 +86,7 @@ if [ $# = 0 ]; then
echo " indexchecker check the indexing filters for a given url"
echo " filterchecker check url filters for a given url"
echo " normalizerchecker check url normalizers for a given url"
+ echo " robotsparser parse a robots.txt file and check whether urls are allowed or not"
echo " domainstats calculate domain statistics from crawldb"
echo " protocolstats calculate protocol status code stats from crawldb"
echo " crawlcomplete calculate crawl completion stats from crawldb"
@@ -268,6 +269,8 @@ elif [ "$COMMAND" = "filterchecker" ] ; then
CLASS=org.apache.nutch.net.URLFilterChecker
elif [ "$COMMAND" = "normalizerchecker" ] ; then
CLASS=org.apache.nutch.net.URLNormalizerChecker
+elif [ "$COMMAND" = "robotsparser" ] ; then
+ CLASS=org.apache.nutch.protocol.RobotRulesParser
elif [ "$COMMAND" = "domainstats" ] ; then
CLASS=org.apache.nutch.util.DomainStatistics
elif [ "$COMMAND" = "protocolstats" ] ; then