This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push: new 7fba6df NUTCH-2496 Speed up link inversion step in crawling script new 5c98446 Merge pull request #527 from sebastian-nagel/NUTCH-2496 7fba6df is described below commit 7fba6df55d0db81a05958983ba704823c2dff07e Author: Sebastian Nagel <sna...@apache.org> AuthorDate: Fri May 15 19:17:00 2020 +0200 NUTCH-2496 Speed up link inversion step in crawling script - disable URL filtering and normalizing when calling invertlinks in bin/crawl - add note that the steps invertlinks, dedup, index could also be done outside the loop over all segments created in the loop iterations - move webgraph construction (commented out anyway) outside the loop because it's done over all available segments --- src/bin/crawl | 50 +++++++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/src/bin/crawl b/src/bin/crawl index 9b77ce4..23a2940 100755 --- a/src/bin/crawl +++ b/src/bin/crawl @@ -370,10 +370,19 @@ do echo "CrawlDB update" __bin_nutch updatedb "${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments/$SEGMENT - # note that the link inversion - indexing routine can be done within the main loop - # on a per segment basis + echo "HostDB update" + if $HOSTDBUPDATE; then + __update_hostdb + fi + + # Note that all steps below in this loop (link inversion, deduplication, indexing) + # can be done + # - either inside the loop on a per segment basis + # - or after the loop over all segments created in all loop iterations + # (both invertlinks and index accept multiple segments as input) + # The latter is more efficient but the index is then updated later. 
echo "Link inversion" - __bin_nutch invertlinks "${commonOptions[@]}" "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT + __bin_nutch invertlinks "${commonOptions[@]}" "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT -noNormalize -noFilter echo "Dedup on crawldb" __bin_nutch dedup "${commonOptions[@]}" "$CRAWL_PATH"/crawldb @@ -385,30 +394,25 @@ do echo "Skipping indexing ..." fi - echo "HostDB update" - if $HOSTDBUPDATE; then - __update_hostdb - fi - - ####################################################### - # The following commands fall into WebGraph territory - # and should be uncommented based on your requirements ####################################################### - #echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/" - #__bin_nutch webgraph "${commonOptions[@]}" -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH" +done - #echo "Running Loops Job on WebGraph within $CRAWL_PATH" - #__bin_nutch org.apache.nutch.scoring.webgraph.Loops "${commonOptions[@]}" -webgraphdb "$CRAWL_PATH" +####################################################### +# The following commands fall into WebGraph territory +# and should be uncommented based on your requirements +####################################################### +#echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/" +#__bin_nutch webgraph "${commonOptions[@]}" -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH" - #echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH" - #__bin_nutch linkrank "${commonOptions[@]}" -webgraphdb "$CRAWL_PATH" +#echo "Running Loops Job on WebGraph within $CRAWL_PATH" +#__bin_nutch org.apache.nutch.scoring.webgraph.Loops "${commonOptions[@]}" -webgraphdb "$CRAWL_PATH" - #echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and WebGraph within $CRAWL_PATH" - #__bin_nutch scoreupdater "${commonOptions[@]}" -crawldb "$CRAWL_PATH"/crawldb -webgraphdb
"$CRAWL_PATH" +#echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH" +#__bin_nutch linkrank "${commonOptions[@]}" -webgraphdb "$CRAWL_PATH" - #echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores" - #__bin_nutch nodedumper "${commonOptions[@]}" -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores +#echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and WebGraph within $CRAWL_PATH" +#__bin_nutch scoreupdater "${commonOptions[@]}" -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH" -done +#echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores" +#__bin_nutch nodedumper "${commonOptions[@]}" -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores exit 0