[
https://issues.apache.org/jira/browse/NUTCH-2492?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16316163#comment-16316163
]
ASF GitHub Bot commented on NUTCH-2492:
---------------------------------------
lewismc closed pull request #271: NUTCH-2492: Add more configuration parameters
to crawl script
URL: https://github.com/apache/nutch/pull/271
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):
diff --git a/src/bin/crawl b/src/bin/crawl
index c92e5b46f..f34908740 100755
--- a/src/bin/crawl
+++ b/src/bin/crawl
@@ -15,26 +15,32 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
-# Usage: crawl [-i|--index] [-D "key=value"] [-w|--wait] [-s <SeedDir>] [-sm
<SitemapDir>] <CrawlDir> <NumRounds>
-# -i|--index Indexes crawl results into a configured indexer
-# -w|--wait NUMBER[SUFFIX] Time to wait before generating a new
segment when no URLs
-# are scheduled for fetching. Suffix can be: s for second,
-# m for minute, h for hour and d for day. If no suffix is
-# specified second is used by default.
-# -D A Java property to pass to Nutch calls
-# -s Path to seeds file(s)
-# -sm Path to sitemap URL file(s)
-# CrawlDir Directory where the crawl/link/segments dirs are saved
-# NumRounds The number of rounds to run this crawl for
+# Usage: crawl [options] <crawl_dir> <num_rounds>
+#
+# Arguments:
+# <crawl_dir> Directory where the
crawl/host/link/segments dirs are saved
+# <num_rounds> The number of rounds to run this
crawl for
+#
+# Options:
+# -i|--index Indexes crawl results into a
configured indexer
+# -D A Java property to pass to Nutch
calls
+# -w|--wait <NUMBER[SUFFIX]> Time to wait before generating a new
segment when no URLs
+# are scheduled for fetching. Suffix
can be: s for second,
+# m for minute, h for hour and d for
day. If no suffix is
+# specified second is used by default.
[default: -1]
+# -s <seed_dir> Path to seeds file(s)
+# -sm <sitemap_dir> Path to sitemap URL file(s)
+# --num-slaves <num_slaves> Number of slave nodes [default: 1]
+# Note: This can only be set when
running in distribution mode
+# --num-tasks <num_tasks> Number of reducer tasks [default: 2]
+# --size-fetchlist <size_fetchlist> Number of URLs to fetch in one
iteration [default: 50000]
+# --time-limit-fetch <time_limit_fetch> Number of minutes allocated to the
fetching [default: 180]
+# --num-threads <num_threads> Number of threads for fetching /
sitemap processing [default: 50]
#
#
# UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK
INVERSION AND
# INDEXING FOR EACH SEGMENT
-INDEXFLAG=false
-JAVA_PROPERTIES=""
-WAIT=-1 # don't wait if there are no URLs to fetch
-
function __to_seconds() {
NUMBER=$(echo $1 | tr -dc '0-9')
MODIFIER=$(echo $1 | tr -dc '[^s|h|m|d]]')
@@ -57,7 +63,16 @@ function __to_seconds() {
echo $SECONDS
}
+# default values
+INDEXFLAG=false
+JAVA_PROPERTIES=""
+WAIT=-1 # don't wait if there are no URLs to fetch
SEEDDIR=""
+NUM_SLAVES=1
+NUM_TASKS=2 # 2 x NUM_SLAVES
+SIZE_FETCHLIST=50000 # 25K x NUM_TASKS
+TIME_LIMIT_FETCH=180
+NUM_THREADS=50
while [[ $# > 0 ]]
do
@@ -82,6 +97,26 @@ do
WAIT="${2}"
shift 2
;;
+ --num-slaves)
+ NUM_SLAVES="${2}"
+ shift 2
+ ;;
+ --num-tasks)
+ NUM_TASKS="${2}"
+ shift 2
+ ;;
+ --size-fetchlist)
+ SIZE_FETCHLIST="${2}"
+ shift 2
+ ;;
+ --time-limit-fetch)
+ TIME_LIMIT_FETCH="${2}"
+ shift 2
+ ;;
+ --num-threads)
+ NUM_THREADS="${2}"
+ shift 2
+ ;;
*)
break
;;
@@ -89,17 +124,28 @@ do
done
if [[ $# != 2 ]]; then
- echo "Usage: crawl [-i|--index] [-D \"key=value\"] [-w|--wait] [-s
<SeedDir>] [-sm <SitemapDir>] <CrawlDir> <NumRounds>"
- echo -e "\t-i|--index\tIndexes crawl results into a configured indexer"
- echo -e "\t-D\t\tA Java property to pass to Nutch calls"
- echo -e "\t-w|--wait\tNUMBER[SUFFIX] Time to wait before generating a new
segment when no URLs"
- echo -e "\t\t\tare scheduled for fetching. Suffix can be: s for second,"
- echo -e "\t\t\tm for minute, h for hour and d for day. If no suffix is"
- echo -e "\t\t\tspecified second is used by default."
- echo -e "\t-s SeedDir\tPath to seeds file(s)"
- echo -e "\t-sm SitemapDir\tPath to sitemap URL file(s)"
- echo -e "\tCrawlDir\tDirectory where the crawl/host/link/segments dirs are
saved"
- echo -e "\tNumRounds\tThe number of rounds to run this crawl for"
+ echo "Usage: crawl [options] <crawl_dir> <num_rounds>"
+ echo -e ""
+ echo -e "Arguments:"
+ echo -e " <crawl_dir>\t\t\t\tDirectory where the crawl/host/link/segments
dirs are saved"
+ echo -e " <num_rounds>\t\t\t\tThe number of rounds to run this crawl for"
+ echo -e ""
+ echo -e "Options:"
+ echo -e " -i|--index\t\t\t\tIndexes crawl results into a configured
indexer"
+ echo -e " -D\t\t\t\t\tA Java property to pass to Nutch calls"
+ echo -e " -w|--wait <NUMBER[SUFFIX]>\t\tTime to wait before generating a
new segment when no URLs"
+ echo -e " \t\t\t\t\tare scheduled for fetching. Suffix can be: s for
second,"
+ echo -e " \t\t\t\t\tm for minute, h for hour and d for day. If no suffix
is"
+ echo -e " \t\t\t\t\tspecified second is used by default. [default: -1]"
+ echo -e " -s <seed_dir>\t\t\t\tPath to seeds file(s)"
+ echo -e " -sm <sitemap_dir>\t\t\tPath to sitemap URL file(s)"
+ echo -e " --num-slaves <num_slaves>\t\tNumber of slave nodes [default: 1]"
+ echo -e " \t\t\t\t\tNote: This can only be set when running in
distribution mode"
+ echo -e " --num-tasks <num_tasks>\t\tNumber of reducer tasks [default: 2]"
+ echo -e " --size-fetchlist <size_fetchlist>\tNumber of URLs to fetch in
one iteration [default: 50000]"
+ echo -e " --time-limit-fetch <time_limit_fetch>\tNumber of minutes
allocated to the fetching [default: 180]"
+ echo -e " --num-threads <num_threads>\t\tNumber of threads for fetching /
sitemap processing [default: 50]"
+
exit 1
fi
@@ -112,29 +158,6 @@ if [ "$WAIT" != "-1" ]; then
echo "Time to wait (--wait) = $WAIT sec."
fi
-#############################################
-# MODIFY THE PARAMETERS BELOW TO YOUR NEEDS #
-#############################################
-
-# set the number of slaves nodes
-numSlaves=1
-
-# and the total number of available tasks
-# sets Hadoop parameter "mapreduce.job.reduces"
-numTasks=`expr $numSlaves \* 2`
-
-# number of urls to fetch in one iteration
-# 250K per task?
-sizeFetchlist=`expr $numSlaves \* 50000`
-
-# time limit for feching
-timeLimitFetch=180
-
-# num threads for fetching
-numThreads=50
-
-#############################################
-
bin="`dirname "$0"`"
bin="`cd "$bin"; pwd`"
@@ -143,10 +166,13 @@ mode=local
if [ -f "${bin}"/../*nutch*.job ]; then
mode=distributed
fi
+if [[ "$mode" = "local" ]]; then
+ NUM_SLAVES=1
+fi
# note that some of the options listed here could be set in the
# corresponding hadoop site xml param file
-commonOptions="-D mapreduce.job.reduces=$numTasks -D
mapred.child.java.opts=-Xmx1000m -D mapreduce.reduce.speculative=false -D
mapreduce.map.speculative=false -D mapreduce.map.output.compress=true"
+commonOptions="-D mapreduce.job.reduces=$NUM_TASKS -D
mapred.child.java.opts=-Xmx1000m -D mapreduce.reduce.speculative=false -D
mapreduce.map.speculative=false -D mapreduce.map.output.compress=true"
# check that hadoop can be found on the path
if [ $mode = "distributed" ]; then
@@ -174,7 +200,7 @@ function __bin_nutch {
}
# initial injection
-if [[ ! -z $SEEDDIR ]]; then
+if [[ ! -z $SEEDDIR ]]; then
echo "Injecting seed URLs"
__bin_nutch inject "$CRAWL_PATH"/crawldb "$SEEDDIR"
fi
@@ -182,7 +208,7 @@ fi
# sitemap processing based on sitemap definition file(s)
if [[ ! -z $SITEMAPDIR ]]; then
echo "Processing sitemaps defined in $SITEMAPDIR"
- __bin_nutch sitemap "$CRAWL_PATH/crawldb" -sitemapUrls "$SITEMAPDIR"
-threads $numThreads
+ __bin_nutch sitemap "$CRAWL_PATH/crawldb" -sitemapUrls "$SITEMAPDIR"
-threads $NUM_THREADS
fi
# main loop : rounds of generate - fetch - parse - update
@@ -212,11 +238,11 @@ do
# sitemap processing based on HostDB
if [[ -d "$CRAWL_PATH"/hostdb ]]; then
echo "Processing sitemaps based on hosts in HostDB"
- __bin_nutch sitemap "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb
-threads $numThreads
+ __bin_nutch sitemap "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb
-threads $NUM_THREADS
fi
echo "Generating a new segment"
- generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments
-topN $sizeFetchlist -numFetchers $numSlaves -noFilter)
+ generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments
-topN $SIZE_FETCHLIST -numFetchers $NUM_SLAVES -noFilter)
echo "$bin/nutch generate ${generate_args[@]}"
$bin/nutch generate "${generate_args[@]}"
RETCODE=$?
@@ -254,7 +280,7 @@ do
# fetching the segment
echo "Fetching : $SEGMENT"
- __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch
"$CRAWL_PATH"/segments/$SEGMENT -noParsing -threads $numThreads
+ __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$TIME_LIMIT_FETCH
"$CRAWL_PATH"/segments/$SEGMENT -noParsing -threads $NUM_THREADS
# parsing the segment
echo "Parsing : $SEGMENT"
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> Add more configuration parameters to crawl script
> --------------------------------------------------
>
> Key: NUTCH-2492
> URL: https://issues.apache.org/jira/browse/NUTCH-2492
> Project: Nutch
> Issue Type: New Feature
> Reporter: Moreno Feltscher
> Assignee: Moreno Feltscher
>
> Instead of having to copy and adjust the crawl script in order to specify the
> following configuration options allow the user to pass them in using
> arguments:
> - numSlaves
> - numTasks
> - sizeFetchlist
> - timeLimitFetch
> - numThreads
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)