This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new b481f912c NUTCH-3083 Add RobotRulesParser to bin/nutch
b481f912c is described below

commit b481f912cee9ddf985886491b1e1ce695af4d23d
Author: Sebastian Nagel <[email protected]>
AuthorDate: Sun Oct 27 12:42:14 2024 +0100

    NUTCH-3083 Add RobotRulesParser to bin/nutch
    
    Add command *robotsparser* to bin/nutch, invoking the main method
    of org.apache.nutch.protocol.RobotRulesParser
---
 src/bin/nutch | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/bin/nutch b/src/bin/nutch
index 0b55388c6..257059deb 100755
--- a/src/bin/nutch
+++ b/src/bin/nutch
@@ -86,6 +86,7 @@ if [ $# = 0 ]; then
   echo "  indexchecker      check the indexing filters for a given url"
   echo "  filterchecker     check url filters for a given url"
   echo "  normalizerchecker check url normalizers for a given url"
+  echo "  robotsparser      parse a robots.txt file and check whether urls are allowed or not"
   echo "  domainstats       calculate domain statistics from crawldb"
   echo "  protocolstats     calculate protocol status code stats from crawldb"
   echo "  crawlcomplete     calculate crawl completion stats from crawldb"
@@ -268,6 +269,8 @@ elif [ "$COMMAND" = "filterchecker" ] ; then
   CLASS=org.apache.nutch.net.URLFilterChecker
 elif [ "$COMMAND" = "normalizerchecker" ] ; then
   CLASS=org.apache.nutch.net.URLNormalizerChecker
+elif [ "$COMMAND" = "robotsparser" ] ; then
+  CLASS=org.apache.nutch.protocol.RobotRulesParser
 elif [ "$COMMAND" = "domainstats" ] ; then 
   CLASS=org.apache.nutch.util.DomainStatistics
 elif [ "$COMMAND" = "protocolstats" ] ; then

Reply via email to