Hi,
For about six months now, I have been using Nutch to index web pages
on my company's intranet. After some experimentation, I settled on
Susam Pal's "runbot" script to do the crawling and re-crawling
(included below).
To work around some other problems, I tried a recent nightly build
with the script below. Unfortunately, it throws a Java exception, so I
am wondering whether there is an updated script for the current
nightly builds.
Is there a recent nightly build that is known to be stable?
Is there a crawl/re-crawl script I should use?
I am running on a single Linux box with CentOS (kernel 2.6.18).
Thanks for any help!
Lee
-----
#! /bin/bash
# runbot script to run the Nutch bot for crawling and re-crawling.
# Usage: bin/runbot [safe]
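# Example invocation (these paths are illustrative, not from the original):
#   NUTCH_HOME=/usr/local/nutch CATALINA_HOME=/usr/local/tomcat bin/runbot safe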
# If executed in 'safe' mode, it doesn't delete the temporary
# directories generated during crawl. This might be helpful for
# analysis and recovery in case a crawl fails.
#
# Author: Susam Pal
depth=8     # number of generate/fetch/update rounds
threads=20  # number of fetcher threads
adddays=5   # passed to 'generate' so URLs come due for re-fetch sooner
topN=200    # comment this line out if you don't want to set a topN value
# Arguments for rm and mv
RMARGS="-rf"
MVARGS="--verbose"
# Parse arguments
if [ "$1" == "safe" ]
then
safe=yes
fi
if [ -z "$NUTCH_HOME" ]
then
NUTCH_HOME=.
echo runbot: $0 could not find environment variable NUTCH_HOME
echo runbot: NUTCH_HOME=$NUTCH_HOME has been set by the script
else
echo runbot: $0 found environment variable NUTCH_HOME=$NUTCH_HOME
fi
if [ -z "$CATALINA_HOME" ]
then
CATALINA_HOME=/opt/apache-tomcat-6.0.10
echo runbot: $0 could not find environment variable NUTCH_HOME
echo runbot: CATALINA_HOME=$CATALINA_HOME has been set by the script
else
echo runbot: $0 found environment variable
CATALINA_HOME=$CATALINA_HOME
fi
if [ -n "$topN" ]
then
  topN="-topN $topN"
else
  topN=""
fi
steps=8
echo "----- Inject (Step 1 of $steps) -----"
$NUTCH_HOME/bin/nutch inject crawl/crawldb urls
echo "----- Generate, Fetch, Parse, Update (Step 2 of $steps) -----"
for((i=0; i < $depth; i++))
do
echo "--- Beginning crawl at depth `expr $i + 1` of $depth ---"
  $NUTCH_HOME/bin/nutch generate crawl/crawldb crawl/segments $topN \
    -adddays $adddays
  if [ $? -ne 0 ]
  then
    echo "runbot: Stopping at depth `expr $i + 1`. No more URLs to fetch."
    break
  fi
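  # Segment directories are named by timestamp, so the lexically last
  # entry is the one 'generate' just created.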
  segment=`ls -d crawl/segments/* | tail -1`
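  # 'fetch' downloads the pages in the segment (and parses them by default,
  # which is why there is no separate parse step in this script).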
  $NUTCH_HOME/bin/nutch fetch $segment -threads $threads
  if [ $? -ne 0 ]
  then
    echo "runbot: fetch $segment at depth `expr $i + 1` failed."
    echo "runbot: Deleting segment $segment."
    rm $RMARGS $segment
    continue
  fi
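  # 'updatedb' merges fetch results and newly discovered links into the crawldb.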
  $NUTCH_HOME/bin/nutch updatedb crawl/crawldb $segment
done
echo "----- Merge Segments (Step 3 of $steps) -----"
$NUTCH_HOME/bin/nutch mergesegs crawl/MERGEDsegments crawl/segments/*
if [ "$safe" != "yes" ]
then
rm $RMARGS crawl/segments
else
rm $RMARGS crawl/BACKUPsegments
mv $MVARGS crawl/segments crawl/BACKUPsegments
fi
mv $MVARGS crawl/MERGEDsegments crawl/segments
echo "----- Invert Links (Step 4 of $steps) -----"
$NUTCH_HOME/bin/nutch invertlinks crawl/linkdb crawl/segments/*
echo "----- Index (Step 5 of $steps) -----"
$NUTCH_HOME/bin/nutch index crawl/NEWindexes crawl/crawldb crawl/linkdb \
  crawl/segments/*
echo "----- Dedup (Step 6 of $steps) -----"
$NUTCH_HOME/bin/nutch dedup crawl/NEWindexes
echo "----- Merge Indexes (Step 7 of $steps) -----"
$NUTCH_HOME/bin/nutch merge crawl/NEWindex crawl/NEWindexes
echo "----- Loading New Index (Step 8 of $steps) -----"
${CATALINA_HOME}/bin/shutdown.sh
if [ "$safe" != "yes" ]
then
rm $RMARGS crawl/NEWindexes
rm $RMARGS crawl/index
else
rm $RMARGS crawl/BACKUPindexes
rm $RMARGS crawl/BACKUPindex
mv $MVARGS crawl/NEWindexes crawl/BACKUPindexes
mv $MVARGS crawl/index crawl/BACKUPindex
fi
mv $MVARGS crawl/NEWindex crawl/index
${CATALINA_HOME}/bin/startup.sh
echo "runbot: FINISHED: Crawl completed!"
echo ""