This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 5c5f415  NUTCH-2510 bin/crawl: optionally update HostDb and use in generate - make creation/update of HostDb optional (--hostdbupdate) - pass HostDb as argument to Generator (--hostdbgenerate)
5c5f415 is described below

commit 5c5f415b7e1ecb09b5d7f634ecafa02ffeb4255d
Author: Semyon Semyonov <[email protected]>
AuthorDate: Wed Feb 14 14:55:25 2018 +0100

    NUTCH-2510 bin/crawl: optionally update HostDb and use in generate
    - make creation/update of HostDb optional (--hostdbupdate)
    - pass HostDb as argument to Generator (--hostdbgenerate)
---
 src/bin/crawl | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/bin/crawl b/src/bin/crawl
index dc32367..ff5e456 100755
--- a/src/bin/crawl
+++ b/src/bin/crawl
@@ -30,6 +30,10 @@
 #                                         specified second is used by default. [default: -1]
 #   -s <seed_dir>                         Path to seeds file(s)
 #   -sm <sitemap_dir>                     Path to sitemap URL file(s)
+#
+#   --hostdbupdate                        Boolean indicator if we call hostdbupdate or not
+#   --hostdbgenerate                      Boolean indicator if we use hostdb in generate or not
+#
 #   --num-slaves <num_slaves>             Number of slave nodes [default: 1]
 #                                         Note: This can only be set when running in distribution mode
 #   --num-tasks <num_tasks>               Number of reducer tasks [default: 2]
@@ -79,6 +83,8 @@ function __print_usage {
   echo -e "  \t\t\t\t\tspecified second is used by default. [default: -1]"
   echo -e "  -s <seed_dir>\t\t\t\tPath to seeds file(s)"
   echo -e "  -sm <sitemap_dir>\t\t\tPath to sitemap URL file(s)"
+  echo -e "  --hostdbupdate\t\t\t\tBoolean flag showing if we either update or not update hostdb for each round"
+  echo -e "  --hostdbgenerate\t\t\tBoolean flag showing if we use hostdb in generate or not"
   echo -e "  --num-slaves <num_slaves>\t\tNumber of slave nodes [default: 1]"
   echo -e "  \t\t\t\t\tNote: This can only be set when running in distribution mode"
   echo -e "  --num-tasks <num_tasks>\t\tNumber of reducer tasks [default: 2]"
@@ -96,6 +102,8 @@ function __print_usage {
 
 # default values
 INDEXFLAG=false
+HOSTDBUPDATE=false
+HOSTDBGENERATE=false
 JAVA_PROPERTIES=""
 WAIT=-1 # don't wait if there are no URLs to fetch
 SEEDDIR=""
@@ -153,6 +161,14 @@ do
             SITEMAPS_FROM_HOSTDB_FREQUENCY="${2}"
             shift 2
             ;;
+        --hostdbupdate)
+            HOSTDBUPDATE=true
+            shift
+            ;;
+        --hostdbgenerate)
+            HOSTDBGENERATE=true
+            shift
+            ;;
         *)
             break
             ;;
@@ -219,7 +235,7 @@ function __bin_nutch {
     fi
 }
 
-#check if directory exists locally or on hdfs
+# check if directory exists locally or on hdfs
 function __directory_exists {
   if [[ "$mode" == local  &&  -d "$1" ]]; then
     return 0
@@ -279,7 +295,12 @@ do
   fi
 
   echo "Generating a new segment"
-  generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_SLAVES -noFilter)
+  if [[ "$HOSTDBGENERATE" == "true" ]] && __directory_exists "$CRAWL_PATH"/hostdb; then
+   generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_SLAVES -noFilter -hostdb "$CRAWL_PATH"/hostdb)
+  else
+   generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_SLAVES -noFilter)
+  fi
+
   echo "$bin/nutch generate ${generate_args[@]}"
   $bin/nutch generate "${generate_args[@]}"
   RETCODE=$?
@@ -348,7 +369,10 @@ do
       echo "Skipping indexing ..."
   fi
 
+  echo "HostDB update"
+  if $HOSTDBUPDATE; then
   __update_hostdb
+  fi
 
   #######################################################
   # The following commands fall into WebGraph territory

Reply via email to