This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 7fba6df NUTCH-2496 Speed up link inversion step in crawling script
new 5c98446 Merge pull request #527 from sebastian-nagel/NUTCH-2496
7fba6df is described below
commit 7fba6df55d0db81a05958983ba704823c2dff07e
Author: Sebastian Nagel <[email protected]>
AuthorDate: Fri May 15 19:17:00 2020 +0200
NUTCH-2496 Speed up link inversion step in crawling script
- disable URL filtering and normalizing when calling invertlinks
in bin/crawl
- add note that the steps invertlinks, dedup, index could also
be done outside the loop over all segments created in the loop
iterations
- move webgraph construction (commented out anyway) outside the
loop because it's done over all available segments
---
src/bin/crawl | 50 +++++++++++++++++++++++++++-----------------------
1 file changed, 27 insertions(+), 23 deletions(-)
diff --git a/src/bin/crawl b/src/bin/crawl
index 9b77ce4..23a2940 100755
--- a/src/bin/crawl
+++ b/src/bin/crawl
@@ -370,10 +370,19 @@ do
echo "CrawlDB update"
__bin_nutch updatedb "${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments/$SEGMENT
- # note that the link inversion - indexing routine can be done within the main loop
- # on a per segment basis
+ echo "HostDB update"
+ if $HOSTDBUPDATE; then
+ __update_hostdb
+ fi
+
+ # Note that all steps below in this loop (link inversion, deduplication, indexing)
+ # can be done
+ # - either inside the loop on a per segment basis
+ # - or after the loop over all segments created in all loop iterations
+ # (both invertlinks and index accept multiple segments as input)
+ # The latter is more efficient but the index is then updated later.
echo "Link inversion"
- __bin_nutch invertlinks "${commonOptions[@]}" "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
+ __bin_nutch invertlinks "${commonOptions[@]}" "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT -noNormalize -nofilter
echo "Dedup on crawldb"
__bin_nutch dedup "${commonOptions[@]}" "$CRAWL_PATH"/crawldb
@@ -385,30 +394,25 @@ do
echo "Skipping indexing ..."
fi
- echo "HostDB update"
- if $HOSTDBUPDATE; then
- __update_hostdb
- fi
-
- #######################################################
- # The following commands fall into WebGraph territory
- # and should be uncommented based on your requirements
- #######################################################
- #echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/"
- #__bin_nutch webgraph "${commonOptions[@]}" -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH"
+done
- #echo "Running Loops Job on WebGraph within $CRAWL_PATH"
- #__bin_nutch org.apache.nutch.scoring.webgraph.Loops "${commonOptions[@]}" -webgraphdb "$CRAWL_PATH"
+#######################################################
+# The following commands fall into WebGraph territory
+# and should be uncommented based on your requirements
+#######################################################
+#echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/"
+#__bin_nutch webgraph "${commonOptions[@]}" -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH"
- #echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH"
- #__bin_nutch linkrank "${commonOptions[@]}" -webgraphdb "$CRAWL_PATH"
+#echo "Running Loops Job on WebGraph within $CRAWL_PATH"
+#__bin_nutch org.apache.nutch.scoring.webgraph.Loops "${commonOptions[@]}" -webgraphdb "$CRAWL_PATH"
- #echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and WebGraph within $CRAWL_PATH"
- #__bin_nutch scoreupdater "${commonOptions[@]}" -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH"
+#echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH"
+#__bin_nutch linkrank "${commonOptions[@]}" -webgraphdb "$CRAWL_PATH"
- #echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores"
- #__bin_nutch nodedumper "${commonOptions[@]}" -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores
+#echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and WebGraph within $CRAWL_PATH"
+#__bin_nutch scoreupdater "${commonOptions[@]}" -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH"
-done
+#echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores"
+#__bin_nutch nodedumper "${commonOptions[@]}" -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores
exit 0