This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 5c5f415 NUTCH-2510 bin/crawl: optionally update HostDb and use in generate
  - make creation/update of HostDb optional (--hostdbupdate)
  - pass HostDb as argument to Generator (--hostdbgenerate)
5c5f415 is described below
commit 5c5f415b7e1ecb09b5d7f634ecafa02ffeb4255d
Author: Semyon Semyonov <[email protected]>
AuthorDate: Wed Feb 14 14:55:25 2018 +0100
NUTCH-2510 bin/crawl: optionally update HostDb and use in generate
- make creation/update of HostDb optional (--hostdbupdate)
- pass HostDb as argument to Generator (--hostdbgenerate)
---
src/bin/crawl | 28 ++++++++++++++++++++++++++--
1 file changed, 26 insertions(+), 2 deletions(-)
diff --git a/src/bin/crawl b/src/bin/crawl
index dc32367..ff5e456 100755
--- a/src/bin/crawl
+++ b/src/bin/crawl
@@ -30,6 +30,10 @@
# specified second is used by default. [default: -1]
# -s <seed_dir> Path to seeds file(s)
# -sm <sitemap_dir> Path to sitemap URL file(s)
+#
+# --hostdbupdate Boolean indicator if we call hostdbupdate or not
+# --hostdbgenerate Boolean indicator if we use hostdb in generate or not
+#
# --num-slaves <num_slaves> Number of slave nodes [default: 1]
# Note: This can only be set when running in distribution mode
# --num-tasks <num_tasks> Number of reducer tasks [default: 2]
@@ -79,6 +83,8 @@ function __print_usage {
echo -e " \t\t\t\t\tspecified second is used by default. [default: -1]"
echo -e " -s <seed_dir>\t\t\t\tPath to seeds file(s)"
echo -e " -sm <sitemap_dir>\t\t\tPath to sitemap URL file(s)"
+ echo -e " --hostdbupdate\t\t\t\tBoolean flag showing if we either update or not update hostdb for each round"
+ echo -e " --hostdbgenerate\t\t\tBoolean flag showing if we use hostdb in generate or not"
echo -e " --num-slaves <num_slaves>\t\tNumber of slave nodes [default: 1]"
echo -e " \t\t\t\t\tNote: This can only be set when running in distribution mode"
echo -e " --num-tasks <num_tasks>\t\tNumber of reducer tasks [default: 2]"
@@ -96,6 +102,8 @@ function __print_usage {
# default values
INDEXFLAG=false
+HOSTDBUPDATE=false
+HOSTDBGENERATE=false
JAVA_PROPERTIES=""
WAIT=-1 # don't wait if there are no URLs to fetch
SEEDDIR=""
@@ -153,6 +161,14 @@ do
SITEMAPS_FROM_HOSTDB_FREQUENCY="${2}"
shift 2
;;
+ --hostdbupdate)
+ HOSTDBUPDATE=true
+ shift
+ ;;
+ --hostdbgenerate)
+ HOSTDBGENERATE=true
+ shift
+ ;;
*)
break
;;
@@ -219,7 +235,7 @@ function __bin_nutch {
fi
}
-#check if directory exists locally or on hdfs
+# check if directory exists locally or on hdfs
function __directory_exists {
if [[ "$mode" == local && -d "$1" ]]; then
return 0
@@ -279,7 +295,12 @@ do
fi
echo "Generating a new segment"
- generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_SLAVES -noFilter)
+ if [[ "$HOSTDBGENERATE" == "true" ]] && __directory_exists "$CRAWL_PATH"/hostdb; then
+ generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_SLAVES -noFilter -hostdb "$CRAWL_PATH"/hostdb)
+ else
+ generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_SLAVES -noFilter)
+ fi
+
echo "$bin/nutch generate ${generate_args[@]}"
$bin/nutch generate "${generate_args[@]}"
RETCODE=$?
@@ -348,7 +369,10 @@ do
echo "Skipping indexing ..."
fi
+ echo "HostDB update"
+ if $HOSTDBUPDATE; then
__update_hostdb
+ fi
#######################################################
# The following commands fall into WebGraph territory