This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new cf4f805fa NUTCH-3113 Group commands in bin/nutch command-line help thematically
cf4f805fa is described below
commit cf4f805fa37aac9b71641202842087e081b6cd92
Author: Sebastian Nagel <[email protected]>
AuthorDate: Thu Mar 27 19:43:05 2025 +0100
NUTCH-3113 Group commands in bin/nutch command-line help thematically
---
src/bin/nutch | 67 +++++++++++++++++++++++++++++++++++++++--------------------
1 file changed, 44 insertions(+), 23 deletions(-)
diff --git a/src/bin/nutch b/src/bin/nutch
index 257059deb..bc602a45b 100755
--- a/src/bin/nutch
+++ b/src/bin/nutch
@@ -64,46 +64,67 @@ if [ $# = 0 ]; then
echo "nutch 1.21-SNAPSHOT"
echo "Usage: nutch COMMAND [-Dproperty=value]... [command-specific args]..."
echo "where COMMAND is one of:"
- echo " readdb read / dump crawl db"
- echo " mergedb merge crawldb-s, with optional filtering"
- echo " readlinkdb read / dump link db"
+ echo " (Crawl commands)"
echo " inject inject new urls into the database"
echo " generate generate new segments to fetch from crawl db"
- echo " freegen generate new segments to fetch from text files"
echo " fetch fetch a segment's pages"
echo " parse parse a segment's pages"
+ echo " updatedb update crawl db from segments after fetching"
+ echo ""
+ echo " (CrawlDb commands)"
+ echo " readdb read / dump crawl db"
+ echo " mergedb merge crawldb-s, with optional filtering"
+ echo " dedup deduplicate entries in the crawldb and assign them a special status"
+ echo " domainstats calculate domain statistics from crawldb"
+ echo " protocolstats calculate protocol status code stats from crawldb"
+ echo " crawlcomplete calculate crawl completion stats from crawldb"
+ echo ""
+ echo " (Segment tools)"
+ echo " freegen generate a new segment to fetch from a URL text file"
echo " readseg read / dump segment data"
echo " mergesegs merge several segments, with optional filtering and slicing"
- echo " updatedb update crawl db from segments after fetching"
+ echo ""
+ echo " (HostDb commands)"
+ echo " updatehostdb update the host db with records from the crawl db"
+ echo " readhostdb read / dump host db"
+ echo " sitemap perform Sitemap processing"
+ echo ""
+ echo " (LinkDb commands)"
+ echo " readlinkdb read / dump link db"
echo " invertlinks create a linkdb from parsed segments"
echo " mergelinkdb merge linkdb-s, with optional filtering"
+ echo ""
+ echo " (Index commands)"
echo " index run the plugin-based indexer on parsed segments and linkdb"
- echo " dedup deduplicate entries in the crawldb and give them a special status"
- echo " dump exports crawled data from segments into files"
- echo " commoncrawldump exports crawled data from segments into common crawl data format encoded as CBOR"
- echo " clean remove HTTP 301 and 404 documents and duplicates from indexing backends configured via plugins"
+ echo " clean remove HTTP 301 and 404 documents and duplicates from indexing backends"
+ echo ""
+ echo " (Webgraph commands)"
+ echo " webgraph generate a web graph from existing segments"
+ echo " linkrank run a link analysis program on the generated web graph"
+ echo " scoreupdater updates the crawldb with linkrank scores"
+ echo " nodedumper dumps the web graph's node scores"
+ echo ""
+ echo " (Debugging and validation tools)"
echo " parsechecker check the parser for a given url"
echo " indexchecker check the indexing filters for a given url"
echo " filterchecker check url filters for a given url"
echo " normalizerchecker check url normalizers for a given url"
echo " robotsparser parse a robots.txt file and check whether urls are allowed or not"
- echo " domainstats calculate domain statistics from crawldb"
- echo " protocolstats calculate protocol status code stats from crawldb"
- echo " crawlcomplete calculate crawl completion stats from crawldb"
- echo " webgraph generate a web graph from existing segments"
- echo " linkrank run a link analysis program on the generated web graph"
- echo " scoreupdater updates the crawldb with linkrank scores"
- echo " nodedumper dumps the web graph's node scores"
echo " plugin load a plugin and run one of its classes main()"
echo " junit runs the given JUnit test"
- echo " startserver runs the Nutch Server on localhost:8081"
- echo " warc exports crawled data from segments at the WARC format"
- echo " updatehostdb update the host db with records from the crawl db"
- echo " readhostdb read / dump host db"
- echo " sitemap perform Sitemap processing"
echo " showproperties print Nutch/Hadoop configuration properties to stdout"
- echo " or"
- echo " CLASSNAME run the class named CLASSNAME"
+ echo ""
+ echo " (Data export)"
+ echo " dump exports crawled data from segments into files"
+ echo " commoncrawldump exports crawled data from segments into common crawl data format encoded as CBOR"
+ echo " warc exports crawled data from segments at the WARC format"
+ echo ""
+ echo " (Nutch Server)"
+ echo " startserver runs the Nutch Server on localhost:8081"
+ echo ""
+ echo " (or)"
+ echo " CLASSNAME run the main of the class named CLASSNAME"
+ echo ""
echo "Most commands print help when invoked w/o parameters."
exit 1
fi