This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

The following commit(s) were added to refs/heads/master by this push:
     new b5e794e  NUTCH-2501 allow to set Java heap size when using crawl script in distributed mode - bin/crawl - add hint how to set map and reduce task memory via -D ... options - use -D options for all steps (Nutch tools), fixes NUTCH-2379 - fix quoting of -D options, eg. -D plugin.includes='protocol-xyz|parse-xyz' - use -D options for all steps (Nutch tools) - bin/nutch - document that environment variables are only used in local mode
     new fccc634  Merge pull request #513 from sebastian-nagel/NUTCH-2501-java-heap-size-distr-mode
b5e794e is described below

commit b5e794e575a563ee472ad36835604b0647f7c2bd
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Thu Apr 23 11:55:46 2020 +0200

    NUTCH-2501 allow to set Java heap size when using crawl script in distributed mode
    - bin/crawl
      - add hint how to set map and reduce task memory via -D ... options
      - use -D options for all steps (Nutch tools), fixes NUTCH-2379
      - fix quoting of -D options, eg. -D plugin.includes='protocol-xyz|parse-xyz'
      - use -D options for all steps (Nutch tools)
    - bin/nutch
      - document that environment variables are only used in local mode
---
 src/bin/crawl | 61 ++++++++++++++++++++++++++++++++++-------------------------
 src/bin/nutch | 11 +++++++++--
 2 files changed, 44 insertions(+), 28 deletions(-)

diff --git a/src/bin/crawl b/src/bin/crawl
index 2e85bad..8690929 100755
--- a/src/bin/crawl
+++ b/src/bin/crawl
@@ -23,7 +23,13 @@
 #
 # Options:
 #   -i|--index                          Indexes crawl results into a configured indexer
-#   -D                                  A Java property to pass to Nutch calls
+#   -D <propery>=<value>                A Nutch or Hadoop property to pass to Nutch calls overwriting
+#                                       properties defined in configuration files, e.g.
+#                                       increase content limit to 2MB:
+#                                         -Dhttp.content.limit=2097152
+#                                       (in distributed mode) configure memory of map and reduce tasks:
+#                                         -D mapreduce.map.memory.mb=4608 -D mapreduce.map.java.opts=-Xmx4096m
+#                                         -D mapreduce.reduce.memory.mb=4608 -D mapreduce.reduce.java.opts=-Xmx4096m
 #   -w|--wait <NUMBER[SUFFIX]>          Time to wait before generating a new segment when no URLs
 #                                       are scheduled for fetching. Suffix can be: s for second,
 #                                       m for minute, h for hour and d for day. If no suffix is
@@ -42,9 +48,6 @@
 #     --time-limit-fetch <time_limit_fetch> Number of minutes allocated to the fetching [default: 180]
 #     --num-threads <num_threads>           Number of threads for fetching / sitemap processing [default: 50]
 #
-#
-# UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND
-# INDEXING FOR EACH SEGMENT
 
 function __to_seconds() {
   NUMBER=$(echo $1 | tr -dc '0-9')
@@ -77,7 +80,13 @@ function __print_usage {
   echo -e ""
   echo -e "Options:"
   echo -e "  -i|--index\t\t\t\tIndexes crawl results into a configured indexer"
-  echo -e "  -D\t\t\t\t\tA Java property to pass to Nutch calls"
+  echo -e "  -D\t\t\t\t\tA Nutch or Hadoop property to pass to Nutch calls overwriting"
+  echo -e "  \t\t\t\t\tproperties defined in configuration files, e.g."
+  echo -e "  \t\t\t\t\tincrease content limit to 2MB:"
+  echo -e "  \t\t\t\t\t  -Dhttp.content.limit=2097152"
+  echo -e "  \t\t\t\t\t(distributed mode only) configure memory of map and reduce tasks:"
+  echo -e "  \t\t\t\t\t  -Dmapreduce.map.memory.mb=4608 -Dmapreduce.map.java.opts=-Xmx4096m"
+  echo -e "  \t\t\t\t\t  -Dmapreduce.reduce.memory.mb=4608 -Dmapreduce.reduce.java.opts=-Xmx4096m"
   echo -e "  -w|--wait <NUMBER[SUFFIX]>\t\tTime to wait before generating a new segment when no URLs"
   echo -e "  \t\t\t\t\tare scheduled for fetching. Suffix can be: s for second,"
   echo -e "  \t\t\t\t\tm for minute, h for hour and d for day. If no suffix is"
@@ -106,7 +115,7 @@ function __print_usage {
 INDEXFLAG=false
 HOSTDBUPDATE=false
 HOSTDBGENERATE=false
-JAVA_PROPERTIES=""
+HADOOP_PROPERTIES=()
 WAIT=-1 # don't wait if there are no URLs to fetch
 SEEDDIR=""
 NUM_FETCHERS=1
@@ -124,7 +133,7 @@ do
         shift
         ;;
     -D)
-        JAVA_PROPERTIES="-D${2} ${JAVA_PROPERTIES}"
+        HADOOP_PROPERTIES=("${HADOOP_PROPERTIES[@]}" -D"${2}")
         shift 2
         ;;
     -s)
@@ -218,7 +227,7 @@ fi
 
 # note that some of the options listed here could be set in the
 # corresponding hadoop site xml param file
-commonOptions="-D mapreduce.job.reduces=$NUM_TASKS -D mapreduce.reduce.speculative=false -D mapreduce.map.speculative=false -D mapreduce.map.output.compress=true"
+commonOptions=("${HADOOP_PROPERTIES[@]}" -Dmapreduce.job.reduces=$NUM_TASKS -Dmapreduce.reduce.speculative=false -Dmapreduce.map.speculative=false -Dmapreduce.map.output.compress=true)
 
 # check that hadoop can be found on the path
 if [ $mode = "distributed" ]; then
@@ -259,20 +268,20 @@ function __directory_exists {
 function __update_hostdb {
   if __directory_exists "$CRAWL_PATH"/crawldb; then
     echo "Updating HostDB"
-    __bin_nutch updatehostdb -crawldb "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb
+    __bin_nutch updatehostdb "${commonOptions[@]}" -crawldb "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb
   fi
 }
 
 # initial injection
 if [[ ! -z $SEEDDIR ]]; then
   echo "Injecting seed URLs"
-  __bin_nutch inject "$CRAWL_PATH"/crawldb "$SEEDDIR"
+  __bin_nutch inject "${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$SEEDDIR"
 fi
 
 # sitemap processing based on sitemap definition file(s)
 if [[ ! -z $SITEMAPDIR ]]; then
   echo "Processing sitemaps defined in $SITEMAPDIR"
-  __bin_nutch sitemap "$CRAWL_PATH/crawldb" -sitemapUrls "$SITEMAPDIR" -threads $NUM_THREADS
+  __bin_nutch sitemap "${commonOptions[@]}" "$CRAWL_PATH/crawldb" -sitemapUrls "$SITEMAPDIR" -threads $NUM_THREADS
 fi
 
 # main loop : rounds of generate - fetch - parse - update
@@ -300,15 +309,15 @@ do
     # sitemap processing based on HostDB
     if __directory_exists "$CRAWL_PATH"/hostdb; then
       echo "Processing sitemaps based on hosts in HostDB"
-      __bin_nutch sitemap "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb -threads $NUM_THREADS
+      __bin_nutch sitemap "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb -threads $NUM_THREADS
     fi
   fi
 
   echo "Generating a new segment"
   if [[ "$HOSTDBGENERATE" == "true" ]] && __directory_exists "$CRAWL_PATH"/hostdb; then
-    generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_FETCHERS -noFilter -hostdb "$CRAWL_PATH"/hostdb)
+    generate_args=("${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_FETCHERS -noFilter -hostdb "$CRAWL_PATH"/hostdb)
   else
-    generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_FETCHERS -noFilter)
+    generate_args=("${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_FETCHERS -noFilter)
   fi
 
   echo "$bin/nutch generate ${generate_args[@]}"
@@ -348,33 +357,33 @@ do
 
   # fetching the segment
   echo "Fetching : $SEGMENT"
-  __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$TIME_LIMIT_FETCH "$CRAWL_PATH"/segments/$SEGMENT -threads $NUM_THREADS
+  __bin_nutch fetch "${commonOptions[@]}" -D fetcher.timelimit.mins=$TIME_LIMIT_FETCH "$CRAWL_PATH"/segments/$SEGMENT -threads $NUM_THREADS
 
   # parsing the segment
echo "Parsing : $SEGMENT" # enable the skipping of records for the parsing so that a dodgy document # so that it does not fail the full task skipRecordsOptions="-D mapreduce.task.skip.start.attempts=2 -D mapreduce.map.skip.maxrecords=1" - __bin_nutch parse $commonOptions $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT + __bin_nutch parse "${commonOptions[@]}" $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT # updatedb with this segment echo "CrawlDB update" - __bin_nutch updatedb $commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments/$SEGMENT + __bin_nutch updatedb "${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments/$SEGMENT # note that the link inversion - indexing routine can be done within the main loop # on a per segment basis echo "Link inversion" - __bin_nutch invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT + __bin_nutch invertlinks "${commonOptions[@]}" "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT echo "Dedup on crawldb" - __bin_nutch dedup "$CRAWL_PATH"/crawldb + __bin_nutch dedup "${commonOptions[@]}" "$CRAWL_PATH"/crawldb if $INDEXFLAG; then echo "Indexing $SEGMENT to index" - __bin_nutch index $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT + __bin_nutch index "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT echo "Cleaning up index if possible" - __bin_nutch clean $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb + __bin_nutch clean "${commonOptions[@]}" "$CRAWL_PATH"/crawldb else echo "Skipping indexing ..." fi @@ -389,19 +398,19 @@ do # and should be uncommented based on your requirements ####################################################### #echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/" - #__bin_nutch webgraph $commonOptions -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH" + #__bin_nutch webgraph "${commonOptions[@]}" -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH" #echo "Running Loops Job on WebGraph within $CRAWL_PATH" - #__bin_nutch org.apache.nutch.scoring.webgraph.Loops $commonOptions -webgraphdb "$CRAWL_PATH" + #__bin_nutch org.apache.nutch.scoring.webgraph.Loops "${commonOptions[@]}" -webgraphdb "$CRAWL_PATH" #echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH" - #__bin_nutch linkrank $commonOptions -webgraphdb "$CRAWL_PATH" + #__bin_nutch linkrank "${commonOptions[@]}" -webgraphdb "$CRAWL_PATH" #echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and WebGraph within $CRAWL_PATH" - #__bin_nutch scoreupdater $commonOptions -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH" + #__bin_nutch scoreupdater "${commonOptions[@]}" -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH" #echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores" - #__bin_nutch nodedumper $commonOptions -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores + #__bin_nutch nodedumper "${commonOptions[@]}" -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores done diff --git a/src/bin/nutch b/src/bin/nutch index e79b391..3a25738 100755 --- a/src/bin/nutch +++ b/src/bin/nutch @@ -17,7 +17,7 @@ # # The Nutch command script # -# Environment Variables +# Environment Variables (local mode only) # # NUTCH_JAVA_HOME The java implementation to use. Overrides JAVA_HOME. 
# @@ -34,6 +34,13 @@ # NUTCH_CONF_DIR Path(s) to configuration files (default: $NUTCH_HOME/conf). # Multiple paths must be separated by a colon ':'. # +# Note: environment variables are only used in local mode. When running Nutch +# on a Hadoop cluster (distributed mode), the corresponding settings +# are configured by Hadoop configuration properties set globally for the +# cluster or per Nutch job. For the complete list of properties, see +# https://hadoop.apache.org/docs/stable3/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml +# https://hadoop.apache.org/docs/stable3/hadoop-yarn/hadoop-yarn-common/yarn-default.xml +# cygwin=false case "`uname`" in CYGWIN*) cygwin=true;; @@ -54,7 +61,7 @@ done # if no args specified, show usage if [ $# = 0 ]; then echo "nutch 1.17-SNAPSHOT" - echo "Usage: nutch COMMAND" + echo "Usage: nutch COMMAND [-Dproperty=value]... [command-specific args]..." echo "where COMMAND is one of:" echo " readdb read / dump crawl db" echo " mergedb merge crawldb-s, with optional filtering"
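
As a minimal usage sketch of what this change enables (the seed directory urls/, crawl directory crawl/, number of rounds, and memory values below are illustrative placeholders, not part of the commit), a crawl in distributed mode can now be started with the task memory settings and a quoted plugin list passed directly on the command line:

    # placeholders: urls/ (seeds), crawl/ (crawl dir), 5 rounds; memory values are examples
    bin/crawl -i \
      -D mapreduce.map.memory.mb=4608 -D mapreduce.map.java.opts=-Xmx4096m \
      -D mapreduce.reduce.memory.mb=4608 -D mapreduce.reduce.java.opts=-Xmx4096m \
      -D plugin.includes='protocol-xyz|parse-xyz' \
      -s urls/ crawl/ 5

The same -D <property>=<value> form is accepted by the individual tools when run through bin/nutch on a Hadoop cluster, e.g. (again with placeholder paths):

    bin/nutch inject -D mapreduce.map.memory.mb=4608 -D mapreduce.map.java.opts=-Xmx4096m crawl/crawldb urls/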