Author: jnioche
Date: Tue Jul 10 15:16:04 2012
New Revision: 1359720

URL: http://svn.apache.org/viewvc?rev=1359720&view=rev
Log:
NUTCH-1087 Deprecate crawl command and replace with example script

Added:
    nutch/trunk/src/bin/crawl
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/bin/nutch

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1359720&r1=1359719&r2=1359720&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Jul 10 15:16:04 2012
@@ -1,6 +1,9 @@
 Nutch Change Log
 
 (trunk) Current Development:
+
+* NUTCH-1087 Deprecate crawl command and replace with example script (jnioche)
+
 * NUTCH-1306 Add option to not commit and clarify existing solr.commit.size 
(ferdy)
 
 * NUTCH-1405 Allow to overwrite CrawlDatum's with injected entries (markus)

Added: nutch/trunk/src/bin/crawl
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/bin/crawl?rev=1359720&view=auto
==============================================================================
--- nutch/trunk/src/bin/crawl (added)
+++ nutch/trunk/src/bin/crawl Tue Jul 10 15:16:04 2012
@@ -0,0 +1,185 @@
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The Crawl command script : crawl <seedDir> <crawlDir> <solrURL> <numberOfRounds>
#
# UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK
# INVERSION AND INDEXING FOR EACH SEGMENT

# Canonical usage string, shared by every argument check below.
USAGE="crawl <seedDir> <crawlDir> <solrURL> <numberOfRounds>"

SEEDDIR="$1"       # directory containing the seed URL list(s)
CRAWL_PATH="$2"    # root directory for crawldb / linkdb / segments
SOLRURL="$3"       # URL of the Solr instance to index into
LIMIT="$4"         # number of generate-fetch-parse-update rounds

# All four arguments are mandatory: report the first missing one and stop.
# NOTE: 'exit 1' rather than the original 'exit -1' — negative exit codes
# are not portable (bash silently maps -1 to 255).
if [ "$SEEDDIR" = "" ]; then
    echo "Missing seedDir : $USAGE"
    exit 1
fi

if [ "$CRAWL_PATH" = "" ]; then
    echo "Missing crawlDir : $USAGE"
    exit 1
fi

if [ "$SOLRURL" = "" ]; then
    echo "Missing SOLRURL : $USAGE"
    exit 1
fi

if [ "$LIMIT" = "" ]; then
    echo "Missing numberOfRounds : $USAGE"
    exit 1
fi
+
#############################################
# MODIFY THE PARAMETERS BELOW TO YOUR NEEDS #
#############################################

# number of slave nodes (1 for a local / pseudo-distributed setup)
numSlaves=1

# total number of available reduce tasks;
# sets the Hadoop parameter "mapred.reduce.tasks"
# (arithmetic expansion instead of spawning 'expr')
numTasks=$(( numSlaves * 2 ))

# number of urls to fetch in one iteration (50K per slave node)
sizeFetchlist=$(( numSlaves * 50000 ))

# time limit for fetching (passed below as fetcher.timelimit.mins)
timeLimitFetch=180

# num threads for fetching
numThreads=50

#############################################

# determine the run mode from the presence of a Nutch job file:
# a ../nutch-*.job next to this script means deploy (distributed) mode.
# A loop is used because '[ -f ../nutch-*.job ]' breaks with a test error
# as soon as the glob matches more than one file.
mode=local
for jobfile in ../nutch-*.job; do
    if [ -f "$jobfile" ]; then
        mode=distributed
        break
    fi
done

# absolute path of the directory containing this script
bin=$(dirname "$0")
bin=$(cd "$bin"; pwd)

# note that some of the options listed here could be set in the
# corresponding hadoop site xml param file
commonOptions="-D mapred.reduce.tasks=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapred.reduce.tasks.speculative.execution=false -D mapred.map.tasks.speculative.execution=false -D mapred.compress.map.output=true"
+
# check that the hadoop executable can be found on the path
# before attempting a distributed run
if [ "$mode" = "distributed" ]; then
    if ! command -v hadoop > /dev/null 2>&1; then
        echo "Can't find Hadoop executable. Add HADOOP_HOME/bin to the path or run in local mode."
        exit 1
    fi
fi

# initial injection of the seed URLs into the crawldb
"$bin/nutch" inject "$CRAWL_PATH/crawldb" "$SEEDDIR"
rv=$?

# Propagate the injector's real exit status. The original pattern
# 'if [ $? -ne 0 ]; then exit $?' always exited 0, because by the time
# 'exit $?' runs, $? holds the (successful) status of the '[' test itself.
if [ "$rv" -ne 0 ]; then
    exit "$rv"
fi
+
+
# main loop : rounds of generate - fetch - parse - update
for ((a=1; a <= LIMIT; a++))
do
  # a file named .STOP in the working directory aborts the crawl cleanly
  if [ -e ".STOP" ]; then
    echo "STOP file found - escaping loop"
    break
  fi

  echo "$(date) : Iteration $a of $LIMIT"

  echo "Generating a new segment"
  "$bin/nutch" generate $commonOptions "$CRAWL_PATH/crawldb" "$CRAWL_PATH/segments" -topN "$sizeFetchlist" -numFetchers "$numSlaves" -noFilter
  rv=$?
  # capture the status immediately: the original 'exit $?' inside
  # 'if [ $? -ne 0 ]' always exited 0 because '[' resets $?
  if [ "$rv" -ne 0 ]; then exit "$rv"; fi

  # capture the name of the newest segment:
  # list the segments directory (via hadoop fs in distributed mode);
  # segment names are timestamps (20...), so a numeric sort finds the latest.
  # The pattern is quoted so the shell cannot glob-expand it.
  if [ "$mode" = "local" ]; then
    SEGMENT=$(ls "$CRAWL_PATH/segments/" | grep -E "20[0-9]+" | sort -n | tail -n 1)
  else
    SEGMENT=$(hadoop fs -ls "$CRAWL_PATH/segments/" | grep segments | sed -e "s/\//\\n/g" | grep -E "20[0-9]+" | sort -n | tail -n 1)
  fi

  echo "Operating on segment : $SEGMENT"

  # fetch the segment (parsing is done as a separate step below, hence -noParsing)
  echo "Fetching : $SEGMENT"
  "$bin/nutch" fetch $commonOptions -D fetcher.timelimit.mins="$timeLimitFetch" "$CRAWL_PATH/segments/$SEGMENT" -noParsing -threads "$numThreads"
  rv=$?
  if [ "$rv" -ne 0 ]; then exit "$rv"; fi

  # parsing the segment
  echo "Parsing : $SEGMENT"
  # enable record skipping so that a single dodgy document
  # does not fail the whole parsing task
  skipRecordsOptions="-D mapred.skip.attempts.to.start.skipping=2 -D mapred.skip.map.max.skip.records=1"
  "$bin/nutch" parse $commonOptions $skipRecordsOptions "$CRAWL_PATH/segments/$SEGMENT"
  rv=$?
  if [ "$rv" -ne 0 ]; then exit "$rv"; fi

  # update the crawldb with the results of this segment
  echo "CrawlDB update"
  "$bin/nutch" updatedb $commonOptions "$CRAWL_PATH/crawldb" "$CRAWL_PATH/segments/$SEGMENT"
  rv=$?
  if [ "$rv" -ne 0 ]; then exit "$rv"; fi

  # note that the link inversion - indexing routine can be done within the
  # main loop on a per segment basis
  echo "Link inversion"
  "$bin/nutch" invertlinks "$CRAWL_PATH/linkdb" "$CRAWL_PATH/segments/$SEGMENT"
  rv=$?
  if [ "$rv" -ne 0 ]; then exit "$rv"; fi

  echo "Indexing $SEGMENT on SOLR index -> $SOLRURL"
  # pass the full segment path — a bare "$SEGMENT" (as originally written)
  # only resolves when the script happens to run from the segments directory
  "$bin/nutch" solrindex "$SOLRURL" "$CRAWL_PATH/crawldb" -linkdb "$CRAWL_PATH/linkdb" "$CRAWL_PATH/segments/$SEGMENT"
  rv=$?
  if [ "$rv" -ne 0 ]; then exit "$rv"; fi

  # remove duplicate documents from the Solr index
  echo "SOLR dedup -> $SOLRURL"
  "$bin/nutch" solrdedup "$SOLRURL"
  rv=$?
  if [ "$rv" -ne 0 ]; then exit "$rv"; fi

done

exit 0
+

Modified: nutch/trunk/src/bin/nutch
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/bin/nutch?rev=1359720&r1=1359719&r2=1359720&view=diff
==============================================================================
--- nutch/trunk/src/bin/nutch (original)
+++ nutch/trunk/src/bin/nutch Tue Jul 10 15:16:04 2012
@@ -47,7 +47,7 @@ done
 if [ $# = 0 ]; then
   echo "Usage: nutch COMMAND"
   echo "where COMMAND is one of:"
-  echo "  crawl             one-step crawler for intranets"
+  echo "  crawl             one-step crawler for intranets (DEPRECATED - USE 
CRAWL SCRIPT INSTEAD)"
   echo "  readdb            read / dump crawl db"
   echo "  mergedb           merge crawldb-s, with optional filtering"
   echo "  readlinkdb        read / dump link db"


Reply via email to