Dear Wiki user,

You have subscribed to a wiki page or wiki category on "Nutch Wiki" for change notification.
The "Tutorial on incremental crawling" page has been changed by Gabriele Kahlout. http://wiki.apache.org/nutch/Tutorial%20on%20incremental%20crawling?action=diff&rev1=10&rev2=11 -------------------------------------------------- The following scripts crawl the whole-web incrementally; - Input: - a list of urls to crawl + Input: a list of urls to crawl - Output: - Nutch will continuously fetch $it_size urls from the input list, index and merge them with the whole-web index (so that they can be immediately searched) until all urls have been fetched. + Output: Nutch will continuously fetch $it_size urls from the input list, index and merge them with the whole-web index (so that they can be immediately searched) until all urls have been fetched. + Tested with Nutch-1.2 release (see [[Incremental Crawling Scripts Test|tests output]]); If you don't have Nutch up and running, follow [[Tutorial]] - Tested with Nutch-1.2 release [[Incremental Crawling Scripts Test|Output]]. Please report any bug you find on the mailing list and to [[Gabriele Kahlout|me]]. - If you don't have Nutch up and running, follow [[Tutorial]]. - - Script Editions: + === Script Editions: === + 1. Abridged using Solr (tersest) - - 1. Abridged script using Solr; - 2. Unabridged script with explanations and using nutch index. + 1. Unabridged with explanations and using nutch index (beginner) + 1. TODO: Unabridged with explanations, using solr and Hadoop fs (most advanced) + + Please report any bug you find on the mailing list and to [[Gabriele Kahlout|me]]. == 1. Abridged script using Solr == - - {{{ - #!/bin/sh + {{{#!/bin/sh # # Created by Gabriele Kahlout on 27.03.11. @@ -54, +50 @@ while [[ $indexedPlus1 -le $allUrls ]] do - rm $it_seedsDir/urls + rm $it_seedsDir/urls - tail -n+$indexedPlus1 $seedsDir/*url* | head -n$it_size > $it_seedsDir/urls + tail -n+$indexedPlus1 $seedsDir/*url* | head -n$it_size > $it_seedsDir/urls - + - bin/nutch inject $it_crawldb $it_seedsDir + bin/nutch inject $it_crawldb $it_seedsDir - i=0 - + i=0 + - while [[ $i -lt $depth ]] + while [[ $i -lt $depth ]] - do + do - cmd="bin/nutch generate $it_crawldb crawl/segments -topN $it_size" + cmd="bin/nutch generate $it_crawldb crawl/segments -topN $it_size" - output=`$cmd` + output=`$cmd` - if [[ $output == *'0 records selected for fetching'* ]] + if [[ $output == *'0 records selected for fetching'* ]] - then - break; - fi + then + break; + fi - s1=`ls -d crawl/segments/2* | tail -1` + s1=`ls -d crawl/segments/2* | tail -1` - bin/nutch fetch $s1 + bin/nutch fetch $s1 - bin/nutch updatedb $it_crawldb $s1 + bin/nutch updatedb $it_crawldb $s1 - bin/nutch invertlinks crawl/linkdb -dir crawl/segments + bin/nutch invertlinks crawl/linkdb -dir crawl/segments - bin/nutch solrindex http://localhost:8080/solr/ $it_crawldb crawl/linkdb crawl/segments/* + bin/nutch solrindex http://localhost:8080/solr/ $it_crawldb crawl/linkdb crawl/segments/* - - ((i++)) - ((indexedPlus1+=$it_size)) - done + + ((i++)) + ((indexedPlus1+=$it_size)) + done done rm -r $it_seedsDir - }}} - - == 2. Unabridged script with explanations and using nutch index == - {{{ - - #!/bin/sh + {{{#!/bin/sh # # Created by Gabriele Kahlout on 27.03.11. @@ -115, +106 @@ if [[ ! -d "build" ]] then - echoThenRun "ant" + echoThenRun "ant" fi seedsDir="seeds" if [[ $1 != "" ]] then - seedsDir=$1 + seedsDir=$1 fi it_size=10 if [[ $2 != "" ]] then - it_size=$2 + it_size=$2 fi indexedPlus1=1 #indexedPlus1 urls+1 because of tail. 
@@ -143, +134 @@
depth=1
if [[ $3 != "" ]]
then
    depth=$3
fi

while [[ $indexedPlus1 -le $allUrls ]] # repeat the generate-fetch-updatedb-invertlinks-index-merge loop until all urls have been fetched
do
    rm $it_seedsDir/urls
    tail -n+$indexedPlus1 $seedsDir/*url* | head -n$it_size > $it_seedsDir/urls
    echo
    echoThenRun "bin/nutch inject $it_crawldb $it_seedsDir"
    i=0

    while [[ $i -lt $depth ]] # depth-first
    do
        echo
        echo "generate-fetch-updatedb-invertlinks-index-merge iteration "$i":"

        echo
        cmd="bin/nutch generate $it_crawldb crawl/segments -topN $it_size"
        echo $cmd
        output=`$cmd`
        echo $output
        if [[ $output == *'0 records selected for fetching'* ]] # all the urls of this iteration have been fetched
        then
            break;
        fi
        s1=`ls -d crawl/segments/2* | tail -1`
        echoThenRun "bin/nutch fetch $s1"
        echoThenRun "bin/nutch updatedb $it_crawldb $s1"
        echoThenRun "bin/nutch invertlinks crawl/linkdb -dir crawl/segments"

        # echoThenRun "bin/nutch solrindex http://localhost:8080/solr/ $it_crawldb crawl/linkdb crawl/segments/*"
        # if you have Solr set up, you can use it by uncommenting the above command and commenting out the following nutch index and merge step.

        # start nutch index and merge step
        new_indexes="crawl/new_indexes"
        indexes="crawl/indexes"
        temp_indexes="crawl/temp_indexes"
        rm -r $new_indexes $temp_indexes
        echoThenRun "bin/nutch index $new_indexes $it_crawldb crawl/linkdb crawl/segments/*"

        # solrindex also merges; with nutch index we have to do it ourselves:
        echoThenRun "bin/nutch merge $temp_indexes/part-1 $indexes $new_indexes" # work-around for https://issues.apache.org/jira/browse/NUTCH-971 (Patch available)
        rm -r $indexes $new_indexes
        mv $temp_indexes $indexes
        # end nutch index and merge step
        # you can now search the index with http://localhost:8080/solr/admin/ (if set up) or http://code.google.com/p/luke/ . The index is stored in crawl/indexes; if Solr is used, it is stored in $NUTCH_HOME/solr/data/index instead.
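        # e.g., a quick smoke test of a Solr-built index (standard Solr select handler; url assumes the default setup above):
        #   curl 'http://localhost:8080/solr/select?q=*:*&rows=1'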

        ((i++))
        ((indexedPlus1+=$it_size)) # maybe this should use readdb crawl/crawldb -stats to count the urls actually fetched, but a page that is never going to be fetched would then cause an infinite loop
    done

    echoThenRun "bin/nutch readdb $it_crawldb -stats"
    allcrawldb="crawl/allcrawldb"
    temp_crawldb="crawl/temp_crawldb"
    merge_dbs="$it_crawldb $allcrawldb"
    # work-around for https://issues.apache.org/jira/browse/NUTCH-972 (Patch available)
    if [[ ! -d $allcrawldb ]]
    then
        merge_dbs="$it_crawldb"
    fi
    echoThenRun "bin/nutch mergedb $temp_crawldb $merge_dbs"
    rm -r $allcrawldb $it_crawldb crawl/segments crawl/linkdb
    mv $temp_crawldb $allcrawldb
done
echo
@@ -224, +215 @@
rm -r $crawl_dump $it_seedsDir
echoThenRun "bin/nutch readdb $allcrawldb -dump $crawl_dump" # you can inspect the dump with $ vim $crawl_dump
bin/nutch readdb $allcrawldb -stats
}}}
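A typical run of either script might then look like the sketch below. The file name crawl-incrementally.sh and the example url are hypothetical; the sketch assumes you are in $NUTCH_HOME with Nutch built (and, for the Solr edition, Solr listening on localhost:8080):

{{{#!/bin/sh
# hypothetical usage sketch; file names here are examples, not part of the tutorial
cd $NUTCH_HOME
mkdir seeds
echo 'http://www.example.com/' > seeds/urls   # one url per line; picked up via $seedsDir/*url*
# arguments: <seedsDir> <it_size> <depth>; defaults are seeds, 10 and 1
sh crawl-incrementally.sh seeds 10 1
}}}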

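The echoThenRun helper used throughout the unabridged script is defined in a part of the page this diff does not show; a minimal definition consistent with how it is called here would be:

{{{#!/bin/sh
# hedged sketch only -- the actual helper on the wiki page may differ
echoThenRun() {
    echo "$1"   # print the command about to be run
    $1          # then run it
}
}}}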
