Dear Wiki user, You have subscribed to a wiki page or wiki category on "Nutch Wiki" for change notification.
The "Tutorial on incremental crawling" page has been changed by Gabriele Kahlout.
http://wiki.apache.org/nutch/Tutorial%20on%20incremental%20crawling?action=diff&rev1=2&rev2=3
--------------------------------------------------
  == 1. Abridged script using Solr ==
  == 2. Unabridged script with explanations and using nutch index ==
+ {{{
- {{{#!/bin/sh
+ #!/bin/sh
+
  #
  # Created by Gabriele Kahlout on 27.03.11.
+ # The following script crawls the whole-web incrementally; Specifying a list of urls to crawl, nutch will continuously fetch $it_size urls from a specified list of urls, index and merge them with our whole-web index, so that they can be immediately searched, until all urls have been fetched.
  #
+ # TO USE:
- # The following script crawls the whole-web incrementally; Specifying a list of urls to crawl, nutch will continuously fetch $it_size urls from a
- # specified list of urls, index and merge them with our whole-web index, so that they can be immediately searched, until all urls have been fetched.
- #
- # Usage: ./whole-web-crawling-incremental [it_seedsDir-path urls-to-fetch-per-iteration depth]
- #
- # Getting Started:
  # 1. $ mv whole-web-crawling-incremental $NUTCH_HOME/whole-web-crawling-incremental
  # 2. $ cd $NUTCH_HOME
  # 3. $ chmod +x whole-web-crawling-incremental
  # 4. $ ./whole-web-crawling-incremental
- #
+
+ # Usage: ./whole-web-crawling-incremental [it_seedsDir-path urls-to-fetch-per-iteration depth]
  # Start
+
  function echoThenRun () { # echo and then run the command
  echo $1
  $1
  echo
  }
+
  echoThenRun "rm -r crawl" # fresh crawl
+
  if [[ ! -d "build" ]]
  then
- echoThenRun "ant"
+ echoThenRun "ant"
  fi
+
  seedsDir="seeds"
  if [[ $1 != "" ]]
  then
- seedsDir=$1
+ seedsDir=$1
  fi
+
  it_size=10
  if [[ $2 != "" ]]
  then
- it_size=$2
+ it_size=$2
  fi
+
  indexedPlus1=1 #indexedPlus1 urls+1 because of tail. Never printed out
  it_seedsDir="$seedsDir/it_seeds"
  rm -r $it_seedsDir
  mkdir $it_seedsDir
+
  allUrls=`cat $seedsDir/*url* | wc -l | sed -e "s/^ *//"`
  echo $allUrls" urls to crawl"
+
  it_crawldb="crawl/crawldb"
+
  depth=1
  if [[ $3 != "" ]]
  then
- depth=$3
+ depth=$3
  fi
+
  while [[ $indexedPlus1 -le $allUrls ]] #repeat generate-fetch-updatedb-invertlinks-index-merge loop until all urls are fetched
  do
- rm $it_seedsDir/urls
+ rm $it_seedsDir/urls
- tail -n+$indexedPlus1 $seedsDir/*url* | head -n$it_size > $it_seedsDir/urls
+ tail -n+$indexedPlus1 $seedsDir/*url* | head -n$it_size > $it_seedsDir/urls
- echo
+ echo
- echoThenRun "bin/nutch inject $it_crawldb $it_seedsDir"
+ echoThenRun "bin/nutch inject $it_crawldb $it_seedsDir"
- i=0
+ i=0
+
+ while [[ $i -lt $depth ]] # depth-first
+ do
+ echo
+ echo "generate-fetch-updatedb-invertlinks-index-merge iteration "$i":"
+
+ echo
+ cmd="bin/nutch generate $it_crawldb crawl/segments -topN $it_size"
+ echo $cmd
+ output=`$cmd`
+ echo $output
+ if [[ $output == *'0 records selected for fetching'* ]] #all the urls of this iteration have been fetched
+ then
+ break;
+ fi
+ s1=`ls -d crawl/segments/2* | tail -1`
- while [[ $i -lt $depth ]] # depth-first
- do
- echo
- echo "generate-fetch-updatedb-invertlinks-index-merge iteration "$i":"
- echo
- cmd="bin/nutch generate $it_crawldb crawl/segments -topN $it_size"
- echo $cmd
- output=`$cmd`
- echo $output
- if [[ $output == *'0 records selected for fetching'* ]] #all the urls of this iteration have been fetched
- then
- break;
- fi
- s1=`ls -d crawl/segments/2* | tail -1`
- echoThenRun "bin/nutch fetch $s1"
+ echoThenRun "bin/nutch fetch $s1"
+
- echoThenRun "bin/nutch updatedb $it_crawldb $s1"
+ echoThenRun "bin/nutch updatedb $it_crawldb $s1"
+
- echoThenRun "bin/nutch invertlinks crawl/linkdb -dir crawl/segments"
+ echoThenRun "bin/nutch invertlinks crawl/linkdb -dir crawl/segments"
+
+
- # echoThenRun "bin/nutch solrindex http://localhost:8080/solr/ $it_crawldb crawl/linkdb crawl/segments/*"
+ # echoThenRun "bin/nutch solrindex http://localhost:8080/solr/ $it_crawldb crawl/linkdb crawl/segments/*"
- # if you have solr setup you can use it by uncommenting the above command and commenting the following nutch index and merge step.
+ # if you have solr setup you can use it by uncommenting the above command and commenting the following nutch index and merge step.
+
- # start nutch index and merge step
+ # start nutch index and merge step
- new_indexes="crawl/new_indexes"
+ new_indexes="crawl/new_indexes"
- rm -r $new_indexes $temp_indexes
+ rm -r $new_indexes $temp_indexes
- echoThenRun "bin/nutch index $new_indexes $it_crawldb crawl/linkdb crawl/segments/*"
+ echoThenRun "bin/nutch index $new_indexes $it_crawldb crawl/linkdb crawl/segments/*"
- indexes="crawl/indexes"
+ indexes="crawl/indexes"
- temp_indexes="crawl/temp_indexes"
+ temp_indexes="crawl/temp_indexes"
+
- # solrindex also merged, with nutch index we've to do it:
+ # solrindex also merged, with nutch index we've to do it:
- echoThenRun "bin/nutch merge $temp_indexes/part-1 $indexes $new_indexes" # work-around for https://issues.apache.org/jira/browse/NUTCH-971 (Patch available)
+ echoThenRun "bin/nutch merge $temp_indexes/part-1 $indexes $new_indexes" # work-around for https://issues.apache.org/jira/browse/NUTCH-971 (Patch available)
+
- rm -r $indexes $new_indexes
+ rm -r $indexes $new_indexes
- mv $temp_indexes $indexes
+ mv $temp_indexes $indexes
+
- # end nutch index and merge step
+ # end nutch index and merge step
+
- # you can now search the index with http://localhost:8080/solr/admin/ (if setup) or http://code.google.com/p/luke/ . The index is stored in crawl/indexes, while if Solr is used then in $NUTCH_HOME/solr/data/index.
+ # you can now search the index with http://localhost:8080/solr/admin/ (if setup) or http://code.google.com/p/luke/ . The index is stored in crawl/indexes, while if Solr is used then in $NUTCH_HOME/solr/data/index.
- ((i++))
+ ((i++))
- ((indexedPlus1+=$it_size)) # maybe should readdb crawl/crawldb -stats number of actually fetched, but (! going to fetch a page) --> infinite loop
+ ((indexedPlus1+=$it_size)) # maybe should readdb crawl/crawldb -stats number of actually fetched, but (! going to fetch a page) --> infinite loop
- done
+ done
+
- echoThenRun "bin/nutch readdb $it_crawldb -stats"
+ echoThenRun "bin/nutch readdb $it_crawldb -stats"
+
- allcrawldb="crawl/allcrawldb"
+ allcrawldb="crawl/allcrawldb"
- temp_crawldb="crawl/temp_crawldb"
+ temp_crawldb="crawl/temp_crawldb"
- merge_dbs="$it_crawldb $allcrawldb"
+ merge_dbs="$it_crawldb $allcrawldb"
+
- # work-around for https://issues.apache.org/jira/browse/NUTCH-972 (Patch available)
+ # work-around for https://issues.apache.org/jira/browse/NUTCH-972 (Patch available)
- if [[ ! -d $allcrawldb ]]
+ if [[ ! -d $allcrawldb ]]
- then
+ then
- merge_dbs="$it_crawldb"
+ merge_dbs="$it_crawldb"
- fi
+ fi
+
- echoThenRun "bin/nutch mergedb $temp_crawldb $merge_dbs"
+ echoThenRun "bin/nutch mergedb $temp_crawldb $merge_dbs"
+
- rm -r $allcrawldb $it_crawldb crawl/segments crawl/linkdb
+ rm -r $allcrawldb $it_crawldb crawl/segments crawl/linkdb
- mv $temp_crawldb $allcrawldb
+ mv $temp_crawldb $allcrawldb
  done
+
  echo
  crawl_dump="$allcrawldb/dump"
+
  rm -r $crawl_dump $it_seedsDir
  echoThenRun "bin/nutch readdb $allcrawldb -dump $crawl_dump" # you can inspect the dump with $ vim $crawl_dump
  bin/nutch readdb $allcrawldb -stats

