This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 7ebd35d NUTCH-2495: Use -deleteGone instead of clean job in crawl script while indexing
new c50575a Merge pull request #517 from sebastian-nagel/NUTCH-2495-bin-crawl-delete-while-indexing
7ebd35d is described below
commit 7ebd35dc96b8d40846103a8c343edecec1763595
Author: Sebastian Nagel <[email protected]>
AuthorDate: Mon Apr 27 10:24:18 2020 +0200
NUTCH-2495: Use -deleteGone instead of clean job in crawl script while indexing
---
src/bin/crawl | 9 +++------
1 file changed, 3 insertions(+), 6 deletions(-)
diff --git a/src/bin/crawl b/src/bin/crawl
index 331ee65..9b77ce4 100755
--- a/src/bin/crawl
+++ b/src/bin/crawl
@@ -370,8 +370,8 @@ do
echo "CrawlDB update"
__bin_nutch updatedb "${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments/$SEGMENT
-# note that the link inversion - indexing routine can be done within the main loop
-# on a per segment basis
+ # note that the link inversion - indexing routine can be done within the main loop
+ # on a per segment basis
echo "Link inversion"
__bin_nutch invertlinks "${commonOptions[@]}" "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
@@ -380,10 +380,7 @@ do
if $INDEXFLAG; then
echo "Indexing $SEGMENT to index"
- __bin_nutch index "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
-
- echo "Cleaning up index if possible"
- __bin_nutch clean "${commonOptions[@]}" "$CRAWL_PATH"/crawldb
+ __bin_nutch index "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT -deleteGone
else
echo "Skipping indexing ..."
fi