This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new cf4f805fa NUTCH-3113 Group commands in bin/nutch command-line help thematically
cf4f805fa is described below
commit cf4f805fa37aac9b71641202842087e081b6cd92
Author: Sebastian Nagel <[email protected]>
AuthorDate: Thu Mar 27 19:43:05 2025 +0100
NUTCH-3113 Group commands in bin/nutch command-line help thematically
---
src/bin/nutch | 67 +++++++++++++++++++++++++++++++++++++++--------------------
1 file changed, 44 insertions(+), 23 deletions(-)
diff --git a/src/bin/nutch b/src/bin/nutch
index 257059deb..bc602a45b 100755
--- a/src/bin/nutch
+++ b/src/bin/nutch
@@ -64,46 +64,67 @@ if [ $# = 0 ]; then
echo "nutch 1.21-SNAPSHOT"
echo "Usage: nutch COMMAND [-Dproperty=value]... [command-specific args]..."
echo "where COMMAND is one of:"
- echo " readdb read / dump crawl db"
- echo " mergedb merge crawldb-s, with optional filtering"
- echo " readlinkdb read / dump link db"
+ echo " (Crawl commands)"
echo " inject inject new urls into the database"
echo " generate generate new segments to fetch from crawl db"
- echo " freegen generate new segments to fetch from text files"
echo " fetch fetch a segment's pages"
echo " parse parse a segment's pages"
+ echo " updatedb update crawl db from segments after fetching"
+ echo ""
+ echo " (CrawlDb commands)"
+ echo " readdb read / dump crawl db"
+ echo " mergedb merge crawldb-s, with optional filtering"
+ echo " dedup deduplicate entries in the crawldb and assign them a special status"
+ echo " domainstats calculate domain statistics from crawldb"
+ echo " protocolstats calculate protocol status code stats from crawldb"
+ echo " crawlcomplete calculate crawl completion stats from crawldb"
+ echo ""
+ echo " (Segment tools)"
+ echo " freegen generate a new segment to fetch from a URL text file"
echo " readseg read / dump segment data"
echo " mergesegs merge several segments, with optional filtering and slicing"
- echo " updatedb update crawl db from segments after fetching"
+ echo ""
+ echo " (HostDb commands)"
+ echo " updatehostdb update the host db with records from the crawl db"
+ echo " readhostdb read / dump host db"
+ echo " sitemap perform Sitemap processing"
+ echo ""
+ echo " (LinkDb commands)"
+ echo " readlinkdb read / dump link db"
echo " invertlinks create a linkdb from parsed segments"
echo " mergelinkdb merge linkdb-s, with optional filtering"
+ echo ""
+ echo " (Index commands)"
echo " index run the plugin-based indexer on parsed segments and linkdb"
- echo " dedup deduplicate entries in the crawldb and give them a special status"
- echo " dump exports crawled data from segments into files"
- echo " commoncrawldump exports crawled data from segments into common crawl data format encoded as CBOR"
- echo " clean remove HTTP 301 and 404 documents and duplicates from indexing backends configured via plugins"
+ echo " clean remove HTTP 301 and 404 documents and duplicates from indexing backends"
+ echo ""
+ echo " (Webgraph commands)"
+ echo " webgraph generate a web graph from existing segments"
+ echo " linkrank run a link analysis program on the generated web graph"
+ echo " scoreupdater updates the crawldb with linkrank scores"
+ echo " nodedumper dumps the web graph's node scores"
+ echo ""
+ echo " (Debugging and validation tools)"
echo " parsechecker check the parser for a given url"
echo " indexchecker check the indexing filters for a given url"
echo " filterchecker check url filters for a given url"
echo " normalizerchecker check url normalizers for a given url"
echo " robotsparser parse a robots.txt file and check whether urls are allowed or not"
- echo " domainstats calculate domain statistics from crawldb"
- echo " protocolstats calculate protocol status code stats from crawldb"
- echo " crawlcomplete calculate crawl completion stats from crawldb"
- echo " webgraph generate a web graph from existing segments"
- echo " linkrank run a link analysis program on the generated web graph"
- echo " scoreupdater updates the crawldb with linkrank scores"
- echo " nodedumper dumps the web graph's node scores"
echo " plugin load a plugin and run one of its classes main()"
echo " junit runs the given JUnit test"
- echo " startserver runs the Nutch Server on localhost:8081"
- echo " warc exports crawled data from segments at the WARC format"
- echo " updatehostdb update the host db with records from the crawl db"
- echo " readhostdb read / dump host db"
- echo " sitemap perform Sitemap processing"
echo " showproperties print Nutch/Hadoop configuration properties to stdout"
- echo " or"
- echo " CLASSNAME run the class named CLASSNAME"
+ echo ""
+ echo " (Data export)"
+ echo " dump exports crawled data from segments into files"
+ echo " commoncrawldump exports crawled data from segments into common crawl data format encoded as CBOR"
+ echo " warc exports crawled data from segments at the WARC format"
+ echo ""
+ echo " (Nutch Server)"
+ echo " startserver runs the Nutch Server on localhost:8081"
+ echo ""
+ echo " (or)"
+ echo " CLASSNAME run the main of the class named CLASSNAME"
+ echo ""
echo "Most commands print help when invoked w/o parameters."
exit 1
fi