Dear Wiki user,

You have subscribed to a wiki page or wiki category on "Nutch Wiki" for change notification.
The "Tutorial on incremental crawling" page has been changed by Gabriele Kahlout. http://wiki.apache.org/nutch/Tutorial%20on%20incremental%20crawling?action=diff&rev1=10&rev2=11 -------------------------------------------------- The following scripts crawl the whole-web incrementally; - Input: - a list of urls to crawl + Input: a list of urls to crawl - Output: - Nutch will continuously fetch $it_size urls from the input list, index and merge them with the whole-web index (so that they can be immediately searched) until all urls have been fetched. + Output: Nutch will continuously fetch $it_size urls from the input list, index and merge them with the whole-web index (so that they can be immediately searched) until all urls have been fetched. + Tested with Nutch-1.2 release (see [[Incremental Crawling Scripts Test|tests output]]); If you don't have Nutch up and running, follow [[Tutorial]] - Tested with Nutch-1.2 release [[Incremental Crawling Scripts Test|Output]]. Please report any bug you find on the mailing list and to [[Gabriele Kahlout|me]]. - If you don't have Nutch up and running, follow [[Tutorial]]. - - Script Editions: + === Script Editions: === + 1. Abridged using Solr (tersest) - - 1. Abridged script using Solr; - 2. Unabridged script with explanations and using nutch index. + 1. Unabridged with explanations and using nutch index (beginner) + 1. TODO: Unabridged with explanations, using solr and Hadoop fs (most advanced) + + Please report any bug you find on the mailing list and to [[Gabriele Kahlout|me]]. == 1. Abridged script using Solr == - - {{{ - #!/bin/sh + {{{#!/bin/sh # # Created by Gabriele Kahlout on 27.03.11. @@ -54, +50 @@ while [[ $indexedPlus1 -le $allUrls ]] do - rm $it_seedsDir/urls + rm $it_seedsDir/urls - tail -n+$indexedPlus1 $seedsDir/*url* | head -n$it_size > $it_seedsDir/urls + tail -n+$indexedPlus1 $seedsDir/*url* | head -n$it_size > $it_seedsDir/urls - + - bin/nutch inject $it_crawldb $it_seedsDir + bin/nutch inject $it_crawldb $it_seedsDir - i=0 - + i=0 + - while [[ $i -lt $depth ]] + while [[ $i -lt $depth ]] - do + do - cmd="bin/nutch generate $it_crawldb crawl/segments -topN $it_size" + cmd="bin/nutch generate $it_crawldb crawl/segments -topN $it_size" - output=`$cmd` + output=`$cmd` - if [[ $output == *'0 records selected for fetching'* ]] + if [[ $output == *'0 records selected for fetching'* ]] - then - break; - fi + then + break; + fi - s1=`ls -d crawl/segments/2* | tail -1` + s1=`ls -d crawl/segments/2* | tail -1` - bin/nutch fetch $s1 + bin/nutch fetch $s1 - bin/nutch updatedb $it_crawldb $s1 + bin/nutch updatedb $it_crawldb $s1 - bin/nutch invertlinks crawl/linkdb -dir crawl/segments + bin/nutch invertlinks crawl/linkdb -dir crawl/segments - bin/nutch solrindex http://localhost:8080/solr/ $it_crawldb crawl/linkdb crawl/segments/* + bin/nutch solrindex http://localhost:8080/solr/ $it_crawldb crawl/linkdb crawl/segments/* - - ((i++)) - ((indexedPlus1+=$it_size)) - done + + ((i++)) + ((indexedPlus1+=$it_size)) + done done rm -r $it_seedsDir - }}} - - == 2. Unabridged script with explanations and using nutch index == - {{{ - - #!/bin/sh + {{{#!/bin/sh # # Created by Gabriele Kahlout on 27.03.11. @@ -115, +106 @@ if [[ ! -d "build" ]] then - echoThenRun "ant" + echoThenRun "ant" fi seedsDir="seeds" if [[ $1 != "" ]] then - seedsDir=$1 + seedsDir=$1 fi it_size=10 if [[ $2 != "" ]] then - it_size=$2 + it_size=$2 fi indexedPlus1=1 #indexedPlus1 urls+1 because of tail. 
@@ -143, +134 @@
depth=1
if [[ $3 != "" ]]
then
    depth=$3
fi

while [[ $indexedPlus1 -le $allUrls ]] # repeat the generate-fetch-updatedb-invertlinks-index-merge loop until all urls have been fetched
do
    rm $it_seedsDir/urls
    tail -n+$indexedPlus1 $seedsDir/*url* | head -n$it_size > $it_seedsDir/urls
    echo
    echoThenRun "bin/nutch inject $it_crawldb $it_seedsDir"
    i=0

    while [[ $i -lt $depth ]] # depth-first
    do
        echo
        echo "generate-fetch-updatedb-invertlinks-index-merge iteration "$i":"

        echo
        cmd="bin/nutch generate $it_crawldb crawl/segments -topN $it_size"
        echo $cmd
        output=`$cmd`
        echo $output
        if [[ $output == *'0 records selected for fetching'* ]] # all the urls of this iteration have been fetched
        then
            break;
        fi
        s1=`ls -d crawl/segments/2* | tail -1`
        echoThenRun "bin/nutch fetch $s1"
        echoThenRun "bin/nutch updatedb $it_crawldb $s1"
        echoThenRun "bin/nutch invertlinks crawl/linkdb -dir crawl/segments"

        # echoThenRun "bin/nutch solrindex http://localhost:8080/solr/ $it_crawldb crawl/linkdb crawl/segments/*"
        # if you have Solr set up, you can use it by uncommenting the above command and commenting out the following nutch index and merge step.

        # start nutch index and merge step
        new_indexes="crawl/new_indexes"
        indexes="crawl/indexes"
        temp_indexes="crawl/temp_indexes"
        rm -r $new_indexes $temp_indexes
        echoThenRun "bin/nutch index $new_indexes $it_crawldb crawl/linkdb crawl/segments/*"

        # solrindex also merges; with nutch index we have to do it ourselves:
        echoThenRun "bin/nutch merge $temp_indexes/part-1 $indexes $new_indexes" # work-around for https://issues.apache.org/jira/browse/NUTCH-971 (Patch available)
        rm -r $indexes $new_indexes
        mv $temp_indexes $indexes
        # end nutch index and merge step
        # you can now search the index with http://localhost:8080/solr/admin/ (if set up) or http://code.google.com/p/luke/ . The index is stored in crawl/indexes; if Solr is used, it is stored in $NUTCH_HOME/solr/data/index instead.
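        # e.g., a quick smoke test of a Solr-built index (standard Solr select handler; url assumes the default setup above):
        #   curl 'http://localhost:8080/solr/select?q=*:*&rows=1'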

        ((i++))
        ((indexedPlus1+=$it_size)) # maybe this should use readdb crawl/crawldb -stats to count the urls actually fetched, but a page that is never going to be fetched would then cause an infinite loop
    done

    echoThenRun "bin/nutch readdb $it_crawldb -stats"
    allcrawldb="crawl/allcrawldb"
    temp_crawldb="crawl/temp_crawldb"
    merge_dbs="$it_crawldb $allcrawldb"
    # work-around for https://issues.apache.org/jira/browse/NUTCH-972 (Patch available)
    if [[ ! -d $allcrawldb ]]
    then
        merge_dbs="$it_crawldb"
    fi
    echoThenRun "bin/nutch mergedb $temp_crawldb $merge_dbs"
    rm -r $allcrawldb $it_crawldb crawl/segments crawl/linkdb
    mv $temp_crawldb $allcrawldb
done
echo
@@ -224, +215 @@
rm -r $crawl_dump $it_seedsDir
echoThenRun "bin/nutch readdb $allcrawldb -dump $crawl_dump" # you can inspect the dump with $ vim $crawl_dump
bin/nutch readdb $allcrawldb -stats
}}}
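A typical run of either script might then look like the sketch below. The file name crawl-incrementally.sh and the example url are hypothetical; the sketch assumes you are in $NUTCH_HOME with Nutch built (and, for the Solr edition, Solr listening on localhost:8080):

{{{#!/bin/sh
# hypothetical usage sketch; file names here are examples, not part of the tutorial
cd $NUTCH_HOME
mkdir seeds
echo 'http://www.example.com/' > seeds/urls   # one url per line; picked up via $seedsDir/*url*
# arguments: <seedsDir> <it_size> <depth>; defaults are seeds, 10 and 1
sh crawl-incrementally.sh seeds 10 1
}}}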

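The echoThenRun helper used throughout the unabridged script is defined in a part of the page this diff does not show; a minimal definition consistent with how it is called here would be:

{{{#!/bin/sh
# hedged sketch only -- the actual helper on the wiki page may differ
echoThenRun() {
    echo "$1"   # print the command about to be run
    $1          # then run it
}
}}}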
