I created a script based on:
http://wiki.apache.org/nutch/NutchTutorial#A3.2_Using_Individual_Commands_for_Whole-Web_Crawling

The script is similar to what happens in the old runbot.sh script, but
it isn't working for me.  The part that does the "s1" barely gets
anything, but then the "s2" fails completely (it tries to make the
same segment?).

The plain "crawl" command seems to work, but how practical is it to
use that instead of making this script?  Is it better to have all the
parts broken down like this, instead of passing more parts to the
"crawl" command and letting that handle things?

Any help would be greatly appreciated!

Script:

# runbot script to run the Nutch bot for crawling and re-crawling.
# Usage: bin/runbot [safe]
#        If executed in 'safe' mode, it doesn't delete the temporary
#        directories generated during crawl. This might be helpful for
#        analysis and recovery in case a crawl fails.
#
# Author: Susam Pal

# Set up some local vars -- like NUTCH_HOME & CATALINA_HOME
. /common/setenv.sh

# Derived crawl-layout paths (everything lives under $NUTCH_HOME/crawl).
CRAWL_DIR="$NUTCH_HOME/crawl"
SEGMENTS_DIR="$CRAWL_DIR/segments"
CRAWL_DB="$CRAWL_DIR/crawldb"
LINK_DB="$CRAWL_DIR/linkdb"

# Crawl tuning knobs.
depth=5     # NOTE(review): unused -- the generate/fetch cycle is unrolled by hand below
threads=5   # fetcher threads passed to 'nutch fetch'
adddays=5   # NOTE(review): unused in this script
topN=15     # Comment this statement if you don't want to set topN value
            # NOTE(review): $topN is formatted below but never passed to 'nutch generate'

# Arguments for rm and mv
RMARGS="-rf"       # NOTE(review): unused -- the cleanup at the bottom is commented out
MVARGS="--verbose" # NOTE(review): unused

# Parse arguments.
# Use '=' (POSIX) rather than the bash-only '==' inside [ ].
if [ "$1" = "safe" ]
then
  safe=yes
fi

# Default NUTCH_HOME if the environment (or setenv.sh) did not provide it.
if [ -z "$NUTCH_HOME" ]
then
  NUTCH_HOME=/cdda/nutch
  echo "runbot: $0 could not find environment variable NUTCH_HOME"
  echo "runbot: NUTCH_HOME=$NUTCH_HOME has been set by the script"
else
  echo "runbot: $0 found environment variable NUTCH_HOME=$NUTCH_HOME"
fi

# Default CATALINA_HOME likewise.
if [ -z "$CATALINA_HOME" ]
then
  CATALINA_HOME=/cdda/apache-tomcat
  # Bug fix: this message previously said NUTCH_HOME (copy/paste typo).
  echo "runbot: $0 could not find environment variable CATALINA_HOME"
  echo "runbot: CATALINA_HOME=$CATALINA_HOME has been set by the script"
else
  echo "runbot: $0 found environment variable CATALINA_HOME=$CATALINA_HOME"
fi

# Turn $topN into a ready-to-splice '-topN <n>' option string, or the empty
# string when topN is unset/empty.
# NOTE(review): the resulting option is never actually appended to the
# 'generate' commands below -- confirm whether that is intentional.
if [ -n "$topN" ]; then
  topN="-topN $topN"
else
  topN=""
fi

echo "crawl.sh: STARTING"

# Inject the seed URLs into the crawl db.
echo "$NUTCH_HOME/bin/nutch inject $CRAWL_DB $NUTCH_HOME/urls"
echo ""
$NUTCH_HOME/bin/nutch inject $CRAWL_DB $NUTCH_HOME/urls

# Generate the first fetch list.
# Bug fix: the echo was split across two lines, so the shell tried to run
# "$SEGMENTS_DIR -numFetchers 5" as its own command.  It is now one line and
# uses $CRAWL_DB consistently instead of spelling the path out by hand.
echo "$NUTCH_HOME/bin/nutch generate $CRAWL_DB $SEGMENTS_DIR -numFetchers 5"
echo ""
$NUTCH_HOME/bin/nutch generate $CRAWL_DB $SEGMENTS_DIR -numFetchers 5

# Pick the segment just generated.
# Bug fix: 'ls -rd ... | tail -1' reverses the sort and then takes the last
# entry, i.e. the OLDEST segment.  Drop -r so tail -1 yields the newest.
s1=$(ls -d "$SEGMENTS_DIR"/* | tail -1)
echo "Segments: $s1"
echo ""

# Fetch, parse, and fold the results back into the crawl db.
echo "$NUTCH_HOME/bin/nutch fetch $s1 -threads $threads"
echo ""
$NUTCH_HOME/bin/nutch fetch $s1 -threads $threads

echo "$NUTCH_HOME/bin/nutch parse $s1"
echo ""
$NUTCH_HOME/bin/nutch parse $s1

echo "$NUTCH_HOME/bin/nutch updatedb $CRAWL_DB $s1"
echo ""
$NUTCH_HOME/bin/nutch updatedb $CRAWL_DB $s1

# Second crawl round.
# Bug fix: 'generate' was given the relative path 'crawldb' instead of
# $CRAWL_DB, so it read a different (empty/stale) db than the one updatedb
# just wrote -- the likely cause of the second round failing.
echo "$NUTCH_HOME/bin/nutch generate $CRAWL_DB $SEGMENTS_DIR -numFetchers 5"
echo ""
$NUTCH_HOME/bin/nutch generate $CRAWL_DB $SEGMENTS_DIR -numFetchers 5

# Bug fix: drop -r so tail -1 selects the NEWEST segment; with -r it picked
# the oldest one, i.e. $s2 ended up identical to $s1 ("same segment" error).
s2=$(ls -d "$SEGMENTS_DIR"/* | tail -1)
echo "Segments: $s2"
echo ""

echo "$NUTCH_HOME/bin/nutch fetch $s2 -threads $threads"
echo ""
$NUTCH_HOME/bin/nutch fetch $s2 -threads $threads

echo "$NUTCH_HOME/bin/nutch parse $s2"
echo ""
$NUTCH_HOME/bin/nutch parse $s2

echo "$NUTCH_HOME/bin/nutch updatedb $CRAWL_DB $s2"
echo ""
$NUTCH_HOME/bin/nutch updatedb $CRAWL_DB $s2

# Build the link database from all segments.
echo "$NUTCH_HOME/bin/nutch invertlinks $LINK_DB -dir $SEGMENTS_DIR"
echo ""
$NUTCH_HOME/bin/nutch invertlinks $LINK_DB -dir $SEGMENTS_DIR

# Push the crawl into Solr.
# Bug fixes: the command was split across two lines (so the second half ran
# as a separate, bogus command); $CRAWLDB was a typo for $CRAWL_DB (it
# expanded to nothing); and '-dir' takes the segments directory itself, not
# a glob of its children.
echo "$NUTCH_HOME/bin/nutch solrindex http://localhost:8080/nutchsolr/ $CRAWL_DB $LINK_DB -dir $SEGMENTS_DIR"
echo ""
$NUTCH_HOME/bin/nutch solrindex http://localhost:8080/nutchsolr/ $CRAWL_DB $LINK_DB -dir $SEGMENTS_DIR

# Cleanup is disabled (see 'safe' mode in the header); segments are kept for
# post-mortem analysis.
echo "cleaning $SEGMENTS_DIR"
#rm -rf $SEGMENTS_DIR/*

echo "crawl.sh: FINISHED: Crawl completed!"
echo ""



-- Chris

Reply via email to