I prefer and encourage use of individual commands because: a) you learn how 
the tools actually work and get a better understanding and b) the crawl 
command does not expose all options of separate commands.

> I created a script based on:
> http://wiki.apache.org/nutch/NutchTutorial#A3.2_Using_Individual_Commands_for_Whole-Web_Crawling
> 
> The script is similar to what happens in the old runbot.sh script, but
> it isn't working for me.  The part that does the "s1" barely gets
> anything, but then the "s2" fails completely (it tries to make the
> same segment?).
> 
> The plain "crawl" command seems to work, but how practical is it to
> use that instead of making this script?  Is it better to have all the
> parts broken down like this, instead of passing more parts to the
> "crawl" command and letting that handle things?
> 
> Any help would be greatly appreciated!
> 
> Script:
> 
> # runbot script to run the Nutch bot for crawling and re-crawling.
> # Usage: bin/runbot [safe]
> #        If executed in 'safe' mode, it doesn't delete the temporary
> #        directories generated during crawl. This might be helpful for
> #        analysis and recovery in case a crawl fails.
> #
> # Author: Susam Pal
> 
> #Set up some local vars -- like NUTCH_HOME & CATALINA_HOME
> . /common/setenv.sh
> 
> CRAWL_DIR=$NUTCH_HOME/crawl
> SEGMENTS_DIR=$CRAWL_DIR/segments
> CRAWL_DB=$CRAWL_DIR/crawldb
> LINK_DB=$CRAWL_DIR/linkdb
> 
> depth=5
> threads=5
> adddays=5
> topN=15 #Comment this statement if you don't want to set topN value
> 
> # Arguments for rm and mv
> RMARGS="-rf"
> MVARGS="--verbose"
> 
> # Parse arguments
> if [ "$1" == "safe" ]
> then
>   safe=yes
> fi
> 
> if [ -z "$NUTCH_HOME" ]
> then
>   NUTCH_HOME=/cdda/nutch
>   echo runbot: $0 could not find environment variable NUTCH_HOME
>   echo runbot: NUTCH_HOME=$NUTCH_HOME has been set by the script
> else
>   echo runbot: $0 found environment variable NUTCH_HOME=$NUTCH_HOME
> fi
> 
> if [ -z "$CATALINA_HOME" ]
> then
>   CATALINA_HOME=/cdda/apache-tomcat
>   echo runbot: $0 could not find environment variable NUTCH_HOME
>   echo runbot: CATALINA_HOME=$CATALINA_HOME has been set by the script
> else
>   echo runbot: $0 found environment variable CATALINA_HOME=$CATALINA_HOME
> fi
> 
> if [ -n "$topN" ]
> then
>   topN="-topN $topN"
> else
>   topN=""
> fi
> 
> echo "crawl.sh: STARTING"
> echo $NUTCH_HOME/bin/nutch inject $CRAWL_DB $NUTCH_HOME/urls
> echo ""
> $NUTCH_HOME/bin/nutch inject $CRAWL_DB $NUTCH_HOME/urls
> 
> echo $NUTCH_HOME/bin/nutch generate $NUTCH_HOME/crawl/crawldb
> $SEGMENTS_DIR -numFetchers 5
> echo ""
> $NUTCH_HOME/bin/nutch generate $CRAWL_DB $SEGMENTS_DIR -numFetchers 5
> 
> s1=`ls -rd $SEGMENTS_DIR/* | tail -1`
> echo Segments: $s1
> echo ""
> 
> echo $NUTCH_HOME/bin/nutch fetch $s1 -threads $threads
> echo ""
> $NUTCH_HOME/bin/nutch fetch $s1 -threads $threads
> 
> echo $NUTCH_HOME/bin/nutch parse $s1
> echo ""
> $NUTCH_HOME/bin/nutch parse $s1
> 
> echo $NUTCH_HOME/bin/nutch updatedb $CRAWL_DB $s1
> echo ""
> $NUTCH_HOME/bin/nutch updatedb $CRAWL_DB $s1
> 
> echo $NUTCH_HOME/bin/nutch generate crawldb $SEGMENTS_DIR -numFetchers 5
> echo ""
> $NUTCH_HOME/bin/nutch generate crawldb $SEGMENTS_DIR -numFetchers 5
> s2=`ls -rd $SEGMENTS_DIR/* | tail -1`
> echo Segments: $s2
> echo ""
> 
> echo $NUTCH_HOME/bin/nutch fetch $s2 -threads $threads
> echo ""
> $NUTCH_HOME/bin/nutch fetch $s2 -threads $threads
> 
> echo $NUTCH_HOME/bin/nutch parse $s2
> echo ""
> $NUTCH_HOME/bin/nutch parse $s2
> 
> echo $NUTCH_HOME/bin/nutch updatedb $CRAWL_DB $s2
> echo ""
> $NUTCH_HOME/bin/nutch updatedb $CRAWL_DB $s2
> 
> echo $NUTCH_HOME/bin/nutch invertlinks $LINK_DB -dir $SEGMENTS_DIR
> echo ""
> $NUTCH_HOME/bin/nutch invertlinks $LINK_DB -dir $SEGMENTS_DIR
> 
> echo $NUTCH_HOME/bin/nutch solrindex http://localhost:8080/nutchsolr/
> $CRAWLDB $LINK_DB -dir $SEGMENTS_DIR/*
> echo ""
> $NUTCH_HOME/bin/nutch solrindex http://localhost:8080/nutchsolr/
> $CRAWLDB $LINK_DB -dir $SEGMENTS_DIR/*
> 
> echo "cleaning $SEGMENTS_DIR"
> #rm -rf $SEGMENTS_DIR/*
> 
> echo "crawl.sh: FINISHED: Crawl completed!"
> echo ""
> 
> 
> 
> -- Chris

Reply via email to