This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 7ebd35d  NUTCH-2495: Use -deleteGone instead of clean job in crawl script while indexing
     new c50575a  Merge pull request #517 from sebastian-nagel/NUTCH-2495-bin-crawl-delete-while-indexing
7ebd35d is described below

commit 7ebd35dc96b8d40846103a8c343edecec1763595
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Mon Apr 27 10:24:18 2020 +0200

    NUTCH-2495: Use -deleteGone instead of clean job in crawl script while indexing
---
 src/bin/crawl | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/bin/crawl b/src/bin/crawl
index 331ee65..9b77ce4 100755
--- a/src/bin/crawl
+++ b/src/bin/crawl
@@ -370,8 +370,8 @@ do
   echo "CrawlDB update"
   __bin_nutch updatedb "${commonOptions[@]}" "$CRAWL_PATH"/crawldb  "$CRAWL_PATH"/segments/$SEGMENT
 
-# note that the link inversion - indexing routine can be done within the main loop
-# on a per segment basis
+  # note that the link inversion - indexing routine can be done within the main loop
+  # on a per segment basis
   echo "Link inversion"
   __bin_nutch invertlinks "${commonOptions[@]}" "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
 
@@ -380,10 +380,7 @@ do
 
   if $INDEXFLAG; then
       echo "Indexing $SEGMENT to index"
-      __bin_nutch index "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
-
-      echo "Cleaning up index if possible"
-      __bin_nutch clean "${commonOptions[@]}" "$CRAWL_PATH"/crawldb
+      __bin_nutch index "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT -deleteGone
   else
       echo "Skipping indexing ..."
   fi

Reply via email to