Thanks for your answer. I think topN caused this problem, because when Nutch
fetches a URL it also fetches the links that exist in that page, and I think the
maximum number of links it will fetch from a page equals topN. So if Nutch has
already fetched a number of URLs equal to topN, I think it will not fetch another
URL from sites.txt. Please give me an example of how topN works; I don't know
very much about it.
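This is the part of my script where topN is used. With topN=10 set, I assume the
generate command in the loop should expand to something like this (just a sketch;
the real values come from the variables set in the script below):

topN=10
# further down the script turns this into: topN="-topN 10"
$NUTCH_HOME/bin/nutch generate $NUTCH_HOME/bin/crawl1/crawldb \
    $NUTCH_HOME/bin/crawl1/segments -topN 10 -adddays 0

Here is the full script: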
# deepcrawler script to run the Nutch bot for crawling and re-crawling.
# Usage: bin/deepcrawler [safe]
# If executed in 'safe' mode, it doesn't delete the temporary
# directories generated during crawl. This might be helpful for
# analysis and recovery in case a crawl fails.
#
# Author: Susam Pal
# set host
export HOST=127.0.0.1
# set depth
depth=4
# set threads
threads=5
adddays=0
# set topN
# Comment out this statement if you don't want to set a topN value
topN=10
# Arguments for rm and mv
RMARGS="-rf"
MVARGS="--verbose"
# Parse arguments
if [ "$1" == "safe" ]
then
safe=yes
fi
if [ -z "$NUTCH_HOME" ]
then
# set nutchHome
export NUTCH_HOME=/search-engine/nutch/runtime/local
# set javaHome
export JAVA_HOME=/opt/jdk1.6.0_25/
echo deepcrawler: $0 could not find environment variable NUTCH_HOME
echo "host is $HOST"
echo deepcrawler: NUTCH_HOME=$NUTCH_HOME has been set by the script
else
echo deepcrawler: $0 found environment variable NUTCH_HOME=$NUTCH_HOME
fi
if [ -z "$CATALINA_HOME" ]
then
CATALINA_HOME=/home/ganjyar/Development/apache-tomcat-6.0.33
echo deepcrawler: $0 could not find environment variable CATALINA_HOME
echo deepcrawler: CATALINA_HOME=$CATALINA_HOME has been set by the script
else
echo deepcrawler: $0 found environment variable CATALINA_HOME=$CATALINA_HOME
fi
if [ -n "$topN" ]
then
topN="-topN $topN"
else
topN=""
fi
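# At this point $topN is either "-topN 10" (from the value set above) or empty,
# and it is passed straight to the generate command in the loop below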
steps=10
echo "----- Inject (Step 1 of $steps) -----"
$NUTCH_HOME/bin/nutch inject $NUTCH_HOME/bin/crawl1/crawldb \
    $NUTCH_HOME/bin/urls/sites.txt
echo "----- Generate, Fetch, Parse, Update (Step 2 of $steps) -----"
for((i=0; i < $depth; i++))
do
echo "--- Beginning crawl at depth `expr $i + 1` of $depth ---"
$NUTCH_HOME/bin/nutch generate $NUTCH_HOME/bin/crawl1/crawldb \
    $NUTCH_HOME/bin/crawl1/segments $topN \
-adddays $adddays
if [ $? -ne 0 ]
then
echo "deepcrawler: Stopping at depth $depth. No more URLs to fetch."
break
fi
segment1=`ls -d $NUTCH_HOME/bin/crawl1/segments/* | tail -1`
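# fetch downloads the pages for the URLs in the newly generated segment,
# using $threads parallel fetcher threads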
$NUTCH_HOME/bin/nutch fetch $segment1 -threads $threads
if [ $? -ne 0 ]
then
echo "deepcrawler: fetch $segment1 at depth `expr $i + 1` failed."
echo "deepcrawler: Deleting segment $segment1."
rm $RMARGS $segment1
continue
fi
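# parse extracts text and outlinks from the fetched segment; updatedb merges
# the fetch results and newly discovered links back into the crawldb so the
# next generate round can select them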
$NUTCH_HOME/bin/nutch parse $segment1
$NUTCH_HOME/bin/nutch updatedb $NUTCH_HOME/bin/crawl1/crawldb $segment1
done
#echo "----- Generate, Fetch, Parse, Update (Step 3 of $steps) -----"
#for((i=0; i < $depth; i++))
#do
# echo "--- Beginning crawl at depth `expr $i + 1` of $depth ---"
# sh nutch generate crawl1/crawldb crawl1/segments topN 1000 \
# -adddays $adddays
# if [ $? -ne 0 ]
# then
# echo "deepcrawler: Stopping at depth $depth. No more URLs to fetch."
# break
# fi
# segment2=`ls -d crawl1/segments/* | tail -1`
# sh nutch fetch $segment2
# if [ $? -ne 0 ]
# then
# echo "deepcrawler: fetch $segment2 at depth `expr $i + 1` failed."
# echo "deepcrawler: Deleting segment $segment2."
# rm $RMARGS $segment2
# continue
# fi
#sh nutch parse $segment2
# sh nutch updatedb crawl1/crawldb $segment2
#done
#echo "----- Generate, Fetch, Parse, Update (Step 4 of $steps) -----"
#for((i=0; i < $depth; i++))
#do
# echo "--- Beginning crawl at depth `expr $i + 1` of $depth ---"
# sh nutch generate crawl1/crawldb crawl1/segments topN 1000 \
# -adddays $adddays
# if [ $? -ne 0 ]
# then
# echo "deepcrawler: Stopping at depth $depth. No more URLs to fetch."
# break
# fi
# segment3=`ls -d crawl1/segments/* | tail -1`
# sh nutch fetch $segment3
# if [ $? -ne 0 ]
# then
# echo "deepcrawler: fetch $segment3 at depth `expr $i + 1` failed."
# echo "deepcrawler: Deleting segment $segment3."
# rm $RMARGS $segment3
# continue
# fi
#sh nutch parse $segment3
# sh nutch updatedb crawl1/crawldb $segment3
#done
echo "----- Merge Segments (Step 5 of $steps) -----"
$NUTCH_HOME/bin/nutch mergesegs $NUTCH_HOME/bin/crawl1/MERGEDsegments \
    $NUTCH_HOME/bin/crawl1/segments/*
if [ "$safe" != "yes" ]
then
rm $RMARGS $NUTCH_HOME/bin/crawl1/segments
else
rm $RMARGS $NUTCH_HOME/bin/crawl1/BACKUPsegments
mv $MVARGS $NUTCH_HOME/bin/crawl1/segments \
    $NUTCH_HOME/bin/crawl1/BACKUPsegments
fi
mv $MVARGS $NUTCH_HOME/bin/crawl1/MERGEDsegments \
    $NUTCH_HOME/bin/crawl1/segments
echo "----- Invert Links (Step 6 of $steps) -----"
$NUTCH_HOME/bin/nutch invertlinks $NUTCH_HOME/bin/crawl1/linkdb \
    $NUTCH_HOME/bin/crawl1/segments/*
echo "----- Index (Step 7 of $steps) -----"
#sh nutch index crawl1/NEWindexes crawl1/crawldb crawl1/linkdb \
# crawl1/segments/*
echo "----- Dedup (Step 8 of $steps) -----"
#sh nutch dedup crawl1/NEWindexes
echo "----- Merge Indexes (Step 9 of $steps) -----"
#sh nutch merge crawl1/NEWindex crawl1/NEWindexes
echo "----- Loading New Index (Step 10 of $steps) -----"
#${CATALINA_HOME}/bin/shutdown.sh
if [ "$safe" != "yes" ]
then
rm $RMARGS $NUTCH_HOME/bin/crawl1/NEWindexes
rm $RMARGS $NUTCH_HOME/bin/crawl1/index
else
rm $RMARGS $NUTCH_HOME/bin/crawl1/BACKUPindexes
rm $RMARGS $NUTCH_HOME/bin/crawl1/BACKUPindex
mv $MVARGS $NUTCH_HOME/bin/crawl1/NEWindexes \
    $NUTCH_HOME/bin/crawl1/BACKUPindexes
mv $MVARGS $NUTCH_HOME/bin/crawl1/index $NUTCH_HOME/bin/crawl1/BACKUPindex
fi
#mv $MVARGS crawl1/NEWindex crawl1/index
#sh catalina startup.sh
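# solrindex pushes the crawled data (crawldb, linkdb and all segments) to the
# Solr instance at http://$HOST:8983/solr/ for searching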
$NUTCH_HOME/bin/nutch solrindex http://$HOST:8983/solr/ \
    $NUTCH_HOME/bin/crawl1/crawldb $NUTCH_HOME/bin/crawl1/linkdb \
    $NUTCH_HOME/bin/crawl1/segments/*
echo "deepcrawler: FINISHED: Crawl completed!"
echo ""