crawl

snagel Mon, 27 Oct 2014 14:40:12 -0700

Author: snagel
Date: Mon Oct 27 21:38:50 2014
New Revision: 1634694

URL: http://svn.apache.org/r1634694
Log:
NUTCH-1883 bin/crawl: use function to run bin/nutch and check exit value


Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/bin/crawl
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/bin/crawl

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1634694&r1=1634693&r2=1634694&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon Oct 27 21:38:50 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development 2.3-SNAPSHOT
 
+* NUTCH-1883 bin/crawl: use function to run bin/nutch and check exit value (snagel)
+
 * NUTCH-1882 ant eclipse target to add output path to src/test (snagel)
 
 * NUTCH-1827 Port NUTCH-1467 and NUTCH-1561 to 2.x (snagel)

Modified: nutch/branches/2.x/src/bin/crawl
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/crawl?rev=1634694&r1=1634693&r2=1634694&view=diff
==============================================================================
--- nutch/branches/2.x/src/bin/crawl (original)
+++ nutch/branches/2.x/src/bin/crawl Mon Oct 27 21:38:50 2014
@@ -30,7 +30,7 @@ elif [ "$#" -eq 4 ]; then
      LIMIT="$4"
 else
     echo "Unknown # of arguments $#"
-    echo "Usage: crawl <seedDir> <crawlDir> [<solrUrl>] <numberOfRounds>"
+    echo "Usage: crawl <seedDir> <crawlID> [<solrUrl>] <numberOfRounds>"
     exit -1;
 fi
 
@@ -40,7 +40,7 @@ if [ "$SEEDDIR" = "" ]; then
 fi
 
 if [ "$CRAWL_ID" = "" ]; then
-    echo "Missing crawlDir : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>"
+    echo "Missing crawlID : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>"
     exit -1;
 fi
 
@@ -98,16 +98,30 @@ if [ $mode = "distributed" ]; then
  fi
 fi
 
-# initial injection
-"$bin/nutch" inject "$SEEDDIR" -crawlId "$CRAWL_ID"
-RETCODE=$?
 
-if [ $RETCODE -ne 0 ] 
-  then exit $RETCODE 
-fi
+function __bin_nutch {
+    # run $bin/nutch, exit if exit value indicates error
+
+    echo "$bin/nutch $@" ;# echo command and arguments
+    "$bin/nutch" "$@"
+
+    RETCODE=$?
+    if [ $RETCODE -ne 0 ]
+    then
+        echo "Error running:"
+        echo "  $bin/nutch $@"
+        echo "Failed with exit value $RETCODE."
+        exit $RETCODE
+    fi
+}
 
 
 
+# initial injection
+echo "Injecting seed URLs"
+__bin_nutch inject "$SEEDDIR" -crawlId "$CRAWL_ID"
+
+
 # main loop : rounds of generate - fetch - parse - update
 for ((a=1; a <= LIMIT ; a++))
 do
@@ -123,58 +137,28 @@ do
   batchId=`date +%s`-$RANDOM
 
   echo "Generating a new fetchlist"
-  "$bin/nutch" generate $commonOptions -topN $sizeFetchlist -noNorm -noFilter 
-adddays $addDays -crawlId "$CRAWL_ID" -batchId $batchId
-  RETCODE=$?
-
-  if [ $RETCODE -ne 0 ] 
-    then exit $RETCODE 
-  fi
+  __bin_nutch generate $commonOptions -topN $sizeFetchlist -noNorm -noFilter -adddays $addDays -crawlId "$CRAWL_ID" -batchId $batchId
 
   echo "Fetching : "
-  "$bin/nutch" fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch 
$batchId -crawlId "$CRAWL_ID" -threads 50
-  RETCODE=$?
-
-  if [ $RETCODE -ne 0 ] 
-    then exit $RETCODE 
-  fi
+  __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch $batchId -crawlId "$CRAWL_ID" -threads 50
 
   # parsing the batch
   echo "Parsing : "
   # enable the skipping of records for the parsing so that a dodgy document 
   # so that it does not fail the full task
   skipRecordsOptions="-D mapred.skip.attempts.to.start.skipping=2 -D mapred.skip.map.max.skip.records=1"
-  "$bin/nutch" parse $commonOptions $skipRecordsOptions $batchId -crawlId 
"$CRAWL_ID"
-  RETCODE=$?
-
-  if [ $RETCODE -ne 0 ] 
-    then exit $RETCODE 
-  fi
+  __bin_nutch parse $commonOptions $skipRecordsOptions $batchId -crawlId "$CRAWL_ID"
 
   # updatedb with this batch
   echo "CrawlDB update for $CRAWL_ID"
-  "$bin/nutch" updatedb $commonOptions $batchId -crawlId "$CRAWL_ID"
-  RETCODE=$?
-
-  if [ $RETCODE -ne 0 ] 
-    then exit $RETCODE 
-  fi
+  __bin_nutch updatedb $commonOptions $batchId -crawlId "$CRAWL_ID"
 
   if [ -n "$SOLRURL" ]; then
     echo "Indexing $CRAWL_ID on SOLR index -> $SOLRURL"
-    "$bin/nutch" index $commonOptions -D solr.server.url=$SOLRURL -all 
-crawlId "$CRAWL_ID"
-    RETCODE=$?
-
-    if [ $RETCODE -ne 0 ] 
-      then exit $RETCODE 
-    fi
+    __bin_nutch index $commonOptions -D solr.server.url=$SOLRURL -all -crawlId "$CRAWL_ID"
 
     echo "SOLR dedup -> $SOLRURL"
-    "$bin/nutch" solrdedup $commonOptions $SOLRURL
-    RETCODE=$?
-
-    if [ $RETCODE -ne 0 ] 
-      then exit $RETCODE 
-    fi
+    __bin_nutch solrdedup $commonOptions $SOLRURL
   else
       echo "Skipping indexing tasks: no SOLR url provided."
   fi

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1634694&r1=1634693&r2=1634694&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Oct 27 21:38:50 2014
@@ -2,7 +2,9 @@ Nutch Change Log
 
 Nutch Current Development 1.10-SNAPSHOT
 
-* NUTCH-NUTCH-1865 Enable use of SNAPSHOT's with Nutch Ivy dependency 
management (lewismc)
+* NUTCH-1883 bin/crawl: use function to run bin/nutch and check exit value (snagel)
+
+* NUTCH-1865 Enable use of SNAPSHOT's with Nutch Ivy dependency management (lewismc)
 
 * NUTCH-1882 ant eclipse target to add output path to src/test (snagel)
 

Modified: nutch/trunk/src/bin/crawl
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/bin/crawl?rev=1634694&r1=1634693&r2=1634694&view=diff
==============================================================================
--- nutch/trunk/src/bin/crawl (original)
+++ nutch/trunk/src/bin/crawl Mon Oct 27 21:38:50 2014
@@ -97,13 +97,28 @@ if [ $mode = "distributed" ]; then
  fi
 fi
 
-# initial injection
-"$bin/nutch" inject "$CRAWL_PATH"/crawldb "$SEEDDIR"
-RETCODE=$?
 
-if [ $RETCODE -ne 0 ] 
-  then exit $RETCODE 
-fi
+function __bin_nutch {
+    # run $bin/nutch, exit if exit value indicates error
+
+    echo "$bin/nutch $@" ;# echo command and arguments
+    "$bin/nutch" "$@"
+
+    RETCODE=$?
+    if [ $RETCODE -ne 0 ]
+    then
+        echo "Error running:"
+        echo "  $bin/nutch $@"
+        echo "Failed with exit value $RETCODE."
+        exit $RETCODE
+    fi
+}
+
+
+
+# initial injection
+echo "Injecting seed URLs"
+__bin_nutch inject "$CRAWL_PATH"/crawldb "$SEEDDIR"
 
 
 # main loop : rounds of generate - fetch - parse - update
@@ -118,12 +133,7 @@ do
   echo `date` ": Iteration $a of $LIMIT"
 
   echo "Generating a new segment"
-  "$bin/nutch" generate $commonOptions "$CRAWL_PATH"/crawldb 
"$CRAWL_PATH"/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter
-  RETCODE=$?
-  
-  if [ $RETCODE -ne 0 ] 
-  then exit $RETCODE 
-  fi
+  __bin_nutch generate $commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter
 
   # capture the name of the segment
   # call hadoop in distributed mode
@@ -139,67 +149,33 @@ do
 
   # fetching the segment
   echo "Fetching : $SEGMENT"
-  "$bin/nutch" fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch 
"$CRAWL_PATH"/segments/$SEGMENT -noParsing -threads $numThreads
-  RETCODE=$?
-
-  if [ $RETCODE -ne 0 ] 
-  then exit $RETCODE 
-  fi
+  __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch "$CRAWL_PATH"/segments/$SEGMENT -noParsing -threads $numThreads
 
   # parsing the segment
   echo "Parsing : $SEGMENT"
   # enable the skipping of records for the parsing so that a dodgy document 
   # so that it does not fail the full task
   skipRecordsOptions="-D mapred.skip.attempts.to.start.skipping=2 -D mapred.skip.map.max.skip.records=1"
-  "$bin/nutch" parse $commonOptions $skipRecordsOptions 
"$CRAWL_PATH"/segments/$SEGMENT
-  RETCODE=$?
-
-  if [ $RETCODE -ne 0 ] 
-  then exit $RETCODE 
-  fi
+  __bin_nutch parse $commonOptions $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT
 
   # updatedb with this segment
   echo "CrawlDB update"
-  "$bin/nutch" updatedb $commonOptions "$CRAWL_PATH"/crawldb  
"$CRAWL_PATH"/segments/$SEGMENT
-  RETCODE=$?
-
-  if [ $RETCODE -ne 0 ] 
-  then exit $RETCODE 
-  fi
+  __bin_nutch updatedb $commonOptions "$CRAWL_PATH"/crawldb  "$CRAWL_PATH"/segments/$SEGMENT
 
 # note that the link inversion - indexing routine can be done within the main 
loop 
 # on a per segment basis
   echo "Link inversion"
-  "$bin/nutch" invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
-  RETCODE=$?
-
-  if [ $RETCODE -ne 0 ]   
-  then exit $RETCODE 
-  fi
+  __bin_nutch invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
 
   echo "Dedup on crawldb"
   $bin/nutch dedup $CRAWL_PATH/crawldb
-  RETCODE=$?
-  
-  if [ $RETCODE -ne 0 ] 
-  then exit $RETCODE 
-  fi
 
   if [ -n "$SOLRURL" ]; then
       echo "Indexing $SEGMENT on SOLR index -> $SOLRURL"
-      "$bin/nutch" index -D solr.server.url=$SOLRURL "$CRAWL_PATH"/crawldb 
-linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
-      RETCODE=$?
-      if [ $RETCODE -ne 0 ]; then 
-         exit $RETCODE
-      fi
+      __bin_nutch index -D solr.server.url=$SOLRURL "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
   
       echo "Cleanup on SOLR index -> $SOLRURL"
-      "$bin/nutch" clean -D solr.server.url=$SOLRURL "$CRAWL_PATH"/crawldb
-      RETCODE=$?
-
-      if [ $RETCODE -ne 0 ]; then 
-         exit $RETCODE
-      fi
+      __bin_nutch clean -D solr.server.url=$SOLRURL "$CRAWL_PATH"/crawldb
   else
       echo "Skipping indexing: no SOLR url provided."
   fi
@@ -209,39 +185,20 @@ do
   # and should be uncommented based on your requirements
   #######################################################
   #echo "Building WebGraph within $CRAWL_PATH on all segments in 
$CRAWL_PATH/segments/"
-  #"$bin/nutch" webgraph $commonOptions -filter -normalize -segmentDir 
"$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH"
-
-  #if [ $? -ne 0 ]
-  # then exit $?
-  #fi
+  #__bin_nutch webgraph $commonOptions -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH"
 
   #echo "Running Loops Job on WebGraph within $CRAWL_PATH"
-  #"$bin/nutch" org.apache.nutch.scoring.webgraph.Loops $commonOptions 
-webgraphdb "$CRAWL_PATH"
-
-  #if [ $? -ne 0 ]
-  # then exit $?
-  #fi
+  #__bin_nutch org.apache.nutch.scoring.webgraph.Loops $commonOptions -webgraphdb "$CRAWL_PATH"
 
   #echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH"
-  #"$bin/nutch" linkrank $commonOptions -webgraphdb "$CRAWL_PATH"
-
-  #if [ $? -ne 0 ]
-  # then exit $?
-  #fi
+  #__bin_nutch linkrank $commonOptions -webgraphdb "$CRAWL_PATH"
 
   #echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and  WebGraph 
within $CRAWL_PATH"
-  #"$bin/nutch" scoreupdater $commonOptions -crawldb "$CRAWL_PATH"/crawldb 
-webgraphdb "$CRAWL_PATH"
-
-  #if [ $? -ne 0 ]
-  # then exit $?
-  #fi
+  #__bin_nutch scoreupdater $commonOptions -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH"
 
   #echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output 
to $CRAWL_PATH/dump/scores"
-  #"$bin/nutch" nodedumper $commonOptions -scores -topn 1000 -webgraphdb 
"$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores
+  #__bin_nutch nodedumper $commonOptions -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores
 
-  #if [ $? -ne 0 ]
-  # then exit $?
-  #fi
 done
 
 exit 0

svn commit: r1634694 - in /nutch: branches/2.x/CHANGES.txt branches/2.x/src/bin/crawl trunk/CHANGES.txt trunk/src/bin/crawl

Reply via email to