Thanks for your answer. I think topN caused this problem, because when Nutch
fetches a URL it also fetches the links that exist in that page, and I think the
maximum number of links it will fetch from a page equals topN. So if Nutch has
already fetched a number of URLs equal to topN, I think it will not fetch another
URL from sites.txt. Please give me an example of how topN works; I don't know
very much about it.
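This is the part of my script where topN is used. With topN=10 set, I assume the
generate command in the loop should expand to something like this (just a sketch;
the real values come from the variables set in the script below):

topN=10
# further down the script turns this into: topN="-topN 10"
$NUTCH_HOME/bin/nutch generate $NUTCH_HOME/bin/crawl1/crawldb \
    $NUTCH_HOME/bin/crawl1/segments -topN 10 -adddays 0

Here is the full script: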
# deepcrawler script to run the Nutch bot for crawling and re-crawling.
# Usage: bin/deepcrawler [safe]
# If executed in 'safe' mode, it doesn't delete the temporary
# directories generated during crawl. This might be helpful for
# analysis and recovery in case a crawl fails.
#
# Author: Susam Pal
# set host
export HOST=127.0.0.1
# set depth
depth=4
# set threads
threads=5
adddays=0
# set topN
# Comment out this statement if you don't want to set a topN value
topN=10
# Arguments for rm and mv
RMARGS="-rf"
MVARGS="--verbose"
# Parse arguments
if [ "$1" == "safe" ]
then
safe=yes
fi
if [ -z "$NUTCH_HOME" ]
then
# set nutchHome
export NUTCH_HOME=/search-engine/nutch/runtime/local
# set javaHome
export JAVA_HOME=/opt/jdk1.6.0_25/
echo deepcrawler: $0 could not find environment variable NUTCH_HOME
echo "host is $HOST"
echo deepcrawler: NUTCH_HOME=$NUTCH_HOME has been set by the script
else
echo deepcrawler: $0 found environment variable NUTCH_HOME=$NUTCH_HOME
fi
if [ -z "$CATALINA_HOME" ]
then
CATALINA_HOME=/home/ganjyar/Development/apache-tomcat-6.0.33
echo deepcrawler: $0 could not find environment variable CATALINA_HOME
echo deepcrawler: CATALINA_HOME=$CATALINA_HOME has been set by the script
else
echo deepcrawler: $0 found environment variable CATALINA_HOME=$CATALINA_HOME
fi
if [ -n "$topN" ]
then
topN="-topN $topN"
else
topN=""
fi
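# At this point $topN is either "-topN 10" (from the value set above) or empty,
# and it is passed straight to the generate command in the loop below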
steps=10
echo "----- Inject (Step 1 of $steps) -----"
$NUTCH_HOME/bin/nutch inject $NUTCH_HOME/bin/crawl1/crawldb \
    $NUTCH_HOME/bin/urls/sites.txt
echo "----- Generate, Fetch, Parse, Update (Step 2 of $steps) -----"
for((i=0; i < $depth; i++))
do
echo "--- Beginning crawl at depth `expr $i + 1` of $depth ---"
$NUTCH_HOME/bin/nutch generate $NUTCH_HOME/bin/crawl1/crawldb \
    $NUTCH_HOME/bin/crawl1/segments $topN \
-adddays $adddays
if [ $? -ne 0 ]
then
echo "deepcrawler: Stopping at depth $depth. No more URLs to fetch."
break
fi
segment1=`ls -d $NUTCH_HOME/bin/crawl1/segments/* | tail -1`
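# fetch downloads the pages for the URLs in the newly generated segment,
# using $threads parallel fetcher threads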
$NUTCH_HOME/bin/nutch fetch $segment1 -threads $threads
if [ $? -ne 0 ]
then
echo "deepcrawler: fetch $segment1 at depth `expr $i + 1` failed."
echo "deepcrawler: Deleting segment $segment1."
rm $RMARGS $segment1
continue
fi
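# parse extracts text and outlinks from the fetched segment; updatedb merges
# the fetch results and newly discovered links back into the crawldb so the
# next generate round can select them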
$NUTCH_HOME/bin/nutch parse $segment1
$NUTCH_HOME/bin/nutch updatedb $NUTCH_HOME/bin/crawl1/crawldb $segment1
done
#echo "----- Generate, Fetch, Parse, Update (Step 3 of $steps) -----"
#for((i=0; i < $depth; i++))
#do
# echo "--- Beginning crawl at depth `expr $i + 1` of $depth ---"
# sh nutch generate crawl1/crawldb crawl1/segments topN 1000 \
# -adddays $adddays
# if [ $? -ne 0 ]
# then
# echo "deepcrawler: Stopping at depth $depth. No more URLs to fetch."
# break
# fi
# segment2=`ls -d crawl1/segments/* | tail -1`
# sh nutch fetch $segment2
# if [ $? -ne 0 ]
# then
# echo "deepcrawler: fetch $segment2 at depth `expr $i + 1` failed."
# echo "deepcrawler: Deleting segment $segment2."
# rm $RMARGS $segment2
# continue
# fi
#sh nutch parse $segment2
# sh nutch updatedb crawl1/crawldb $segment2
#done
#echo "----- Generate, Fetch, Parse, Update (Step 4 of $steps) -----"
#for((i=0; i < $depth; i++))
#do
# echo "--- Beginning crawl at depth `expr $i + 1` of $depth ---"
# sh nutch generate crawl1/crawldb crawl1/segments topN 1000 \
# -adddays $adddays
# if [ $? -ne 0 ]
# then
# echo "deepcrawler: Stopping at depth $depth. No more URLs to fetch."
# break
# fi
# segment3=`ls -d crawl1/segments/* | tail -1`
# sh nutch fetch $segment3
# if [ $? -ne 0 ]
# then
# echo "deepcrawler: fetch $segment3 at depth `expr $i + 1` failed."
# echo "deepcrawler: Deleting segment $segment3."
# rm $RMARGS $segment3
# continue
# fi
#sh nutch parse $segment3
# sh nutch updatedb crawl1/crawldb $segment3
#done
echo "----- Merge Segments (Step 5 of $steps) -----"
$NUTCH_HOME/bin/nutch mergesegs $NUTCH_HOME/bin/crawl1/MERGEDsegments \
    $NUTCH_HOME/bin/crawl1/segments/*
if [ "$safe" != "yes" ]
then
rm $RMARGS $NUTCH_HOME/bin/crawl1/segments
else
rm $RMARGS $NUTCH_HOME/bin/crawl1/BACKUPsegments
mv $MVARGS $NUTCH_HOME/bin/crawl1/segments \
    $NUTCH_HOME/bin/crawl1/BACKUPsegments
fi
mv $MVARGS $NUTCH_HOME/bin/crawl1/MERGEDsegments \
    $NUTCH_HOME/bin/crawl1/segments
echo "----- Invert Links (Step 6 of $steps) -----"
$NUTCH_HOME/bin/nutch invertlinks $NUTCH_HOME/bin/crawl1/linkdb \
    $NUTCH_HOME/bin/crawl1/segments/*
echo "----- Index (Step 7 of $steps) -----"
#sh nutch index crawl1/NEWindexes crawl1/crawldb crawl1/linkdb \
# crawl1/segments/*
echo "----- Dedup (Step 8 of $steps) -----"
#sh nutch dedup crawl1/NEWindexes
echo "----- Merge Indexes (Step 9 of $steps) -----"
#sh nutch merge crawl1/NEWindex crawl1/NEWindexes
echo "----- Loading New Index (Step 10 of $steps) -----"
#${CATALINA_HOME}/bin/shutdown.sh
if [ "$safe" != "yes" ]
then
rm $RMARGS $NUTCH_HOME/bin/crawl1/NEWindexes
rm $RMARGS $NUTCH_HOME/bin/crawl1/index
else
rm $RMARGS $NUTCH_HOME/bin/crawl1/BACKUPindexes
rm $RMARGS $NUTCH_HOME/bin/crawl1/BACKUPindex
mv $MVARGS $NUTCH_HOME/bin/crawl1/NEWindexes \
    $NUTCH_HOME/bin/crawl1/BACKUPindexes
mv $MVARGS $NUTCH_HOME/bin/crawl1/index $NUTCH_HOME/bin/crawl1/BACKUPindex
fi
#mv $MVARGS crawl1/NEWindex crawl1/index
#sh catalina startup.sh
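# solrindex pushes the crawled data (crawldb, linkdb and all segments) to the
# Solr instance at http://$HOST:8983/solr/ for searching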
$NUTCH_HOME/bin/nutch solrindex http://$HOST:8983/solr/ \
    $NUTCH_HOME/bin/crawl1/crawldb $NUTCH_HOME/bin/crawl1/linkdb \
    $NUTCH_HOME/bin/crawl1/segments/*
echo "deepcrawler: FINISHED: Crawl completed!"
echo ""