Author: snagel
Date: Tue Nov 11 16:20:01 2014
New Revision: 1638203
URL: http://svn.apache.org/r1638203
Log:
NUTCH-1883 bin/crawl: if generate returns 1 (no new segments created), break the loop and do not exit with error
Modified:
nutch/branches/2.x/src/bin/crawl
nutch/trunk/src/bin/crawl
Modified: nutch/branches/2.x/src/bin/crawl
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/crawl?rev=1638203&r1=1638202&r2=1638203&view=diff
==============================================================================
--- nutch/branches/2.x/src/bin/crawl (original)
+++ nutch/branches/2.x/src/bin/crawl Tue Nov 11 16:20:01 2014
@@ -137,7 +137,22 @@ do
batchId=`date +%s`-$RANDOM
echo "Generating a new fetchlist"
- __bin_nutch generate $commonOptions -topN $sizeFetchlist -noNorm -noFilter -adddays $addDays -crawlId "$CRAWL_ID" -batchId $batchId
+ generate_args=($commonOptions -topN $sizeFetchlist -noNorm -noFilter -adddays $addDays -crawlId "$CRAWL_ID" -batchId $batchId)
+ echo "$bin/nutch generate ${generate_args[@]}"
+ $bin/nutch generate "${generate_args[@]}"
+ RETCODE=$?
+ if [ $RETCODE -eq 0 ]; then
+ : # ok: no error
+ elif [ $RETCODE -eq 1 ]; then
+ echo "Generate returned 1 (no new segments created)"
+ echo "Escaping loop: no more URLs to fetch now"
+ break
+ else
+ echo "Error running:"
+ echo " $bin/nutch generate ${generate_args[@]}"
+ echo "Failed with exit value $RETCODE."
+ exit $RETCODE
+ fi
echo "Fetching : "
__bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch $batchId -crawlId "$CRAWL_ID" -threads 50
Modified: nutch/trunk/src/bin/crawl
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/bin/crawl?rev=1638203&r1=1638202&r2=1638203&view=diff
==============================================================================
--- nutch/trunk/src/bin/crawl (original)
+++ nutch/trunk/src/bin/crawl Tue Nov 11 16:20:01 2014
@@ -133,7 +133,22 @@ do
echo `date` ": Iteration $a of $LIMIT"
echo "Generating a new segment"
- __bin_nutch generate $commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter
+ generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter)
+ echo "$bin/nutch generate ${generate_args[@]}"
+ $bin/nutch generate "${generate_args[@]}"
+ RETCODE=$?
+ if [ $RETCODE -eq 0 ]; then
+ : # ok: no error
+ elif [ $RETCODE -eq 1 ]; then
+ echo "Generate returned 1 (no new segments created)"
+ echo "Escaping loop: no more URLs to fetch now"
+ break
+ else
+ echo "Error running:"
+ echo " $bin/nutch generate ${generate_args[@]}"
+ echo "Failed with exit value $RETCODE."
+ exit $RETCODE
+ fi
# capture the name of the segment
# call hadoop in distributed mode
@@ -168,7 +183,7 @@ do
__bin_nutch invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
echo "Dedup on crawldb"
- $bin/nutch dedup $CRAWL_PATH/crawldb
+ __bin_nutch dedup "$CRAWL_PATH"/crawldb
if [ -n "$SOLRURL" ]; then
echo "Indexing $SEGMENT on SOLR index -> $SOLRURL"