Repository: nutch
Updated Branches:
  refs/heads/NUTCH-2292 9173fd4d6 -> 9f3ba3eda
Reproduced runtime/local build without breaking backward compatibility

Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/020f581a
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/020f581a
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/020f581a

Branch: refs/heads/NUTCH-2292
Commit: 020f581a2cc735f26d6a423e87da7f7462ed3a35
Parents: 9173fd4
Author: Thamme Gowda <[email protected]>
Authored: Sun Jul 10 17:49:51 2016 -0700
Committer: Thamme Gowda <[email protected]>
Committed: Sun Jul 10 17:49:51 2016 -0700

----------------------------------------------------------------------
 bin/crawl             | 281 +++++++++++++++++++++++++++++++++++++++
 bin/nutch             | 324 +++++++++++++++++++++++++++++++++++++++++++++
 nutch-core/pom.xml    |  30 ++++-
 nutch-plugins/pom.xml |  37 +++++-
 pom.xml               |  51 ++++++-
 src/bin/crawl         | 281 ---------------------------------------
 src/bin/nutch         | 324 ---------------------------------------------
 7 files changed, 717 insertions(+), 611 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/020f581a/bin/crawl
----------------------------------------------------------------------
diff --git a/bin/crawl b/bin/crawl
new file mode 100755
index 0000000..567d35e
--- /dev/null
+++ b/bin/crawl
@@ -0,0 +1,281 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Usage: crawl [-i|--index] [-D "key=value"] [-w|--wait] <Seed Dir> <Crawl Dir> <Num Rounds>
+#        -i|--index      Indexes crawl results into a configured indexer
+#        -w|--wait       NUMBER[SUFFIX] Time to wait before generating a new segment when no URLs
+#                        are scheduled for fetching. Suffix can be: s for second,
+#                        m for minute, h for hour and d for day. If no suffix is
+#                        specified second is used by default.
+#        -D              A Java property to pass to Nutch calls
+#        Seed Dir        Directory in which to look for a seeds file
+#        Crawl Dir       Directory where the crawl/link/segments dirs are saved
+#        Num Rounds      The number of rounds to run this crawl for
+#
+#
+# UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND
+# INDEXING FOR EACH SEGMENT
+
+INDEXFLAG=false
+JAVA_PROPERTIES=""
+WAIT=-1 # don't wait if there are no URLs to fetch
+
+function __to_seconds() {
+  NUMBER=$(echo "$1" | tr -dc '0-9')
+  MODIFIER=$(echo "$1" | tr -dc 'smhdSMHD')
+
+  case $MODIFIER in
+      m|M)
+        SECONDS=`expr $NUMBER \* 60`
+        ;;
+      h|H)
+        SECONDS=`expr $NUMBER \* 3600`
+        ;;
+      d|D)
+        SECONDS=`expr $NUMBER \* 86400`
+        ;;
+      s|S|*)
+        SECONDS=$NUMBER
+        ;;
+  esac
+
+  echo $SECONDS
+}
+
+while [[ $# -gt 0 ]]
+do
+    case $1 in
+        -i|--index)
+            INDEXFLAG=true
+            shift
+            ;;
+        -D)
+            JAVA_PROPERTIES="-D${2} ${JAVA_PROPERTIES}"
+            shift 2
+            ;;
+        -w|--wait)
+            WAIT="${2}"
+            shift 2
+            ;;
+        *)
+            break
+            ;;
+    esac
+done
+
+if [[ $# != 3 ]]; then
+    echo "Usage: crawl [-i|--index] [-D \"key=value\"] [-w|--wait] <Seed Dir> <Crawl Dir> <Num Rounds>"
+    echo -e "\t-i|--index\tIndexes crawl results into a configured indexer"
+    echo -e "\t-D\t\tA Java property to pass to Nutch calls"
+    echo -e "\t-w|--wait\tNUMBER[SUFFIX] Time to wait before generating a new segment when no URLs"
+    echo -e "\t\t\tare scheduled for fetching. Suffix can be: s for second,"
+    echo -e "\t\t\tm for minute, h for hour and d for day. If no suffix is"
+    echo -e "\t\t\tspecified second is used by default."
+    echo -e "\tSeed Dir\tDirectory in which to look for a seeds file"
+    echo -e "\tCrawl Dir\tDirectory where the crawl/link/segments dirs are saved"
+    echo -e "\tNum Rounds\tThe number of rounds to run this crawl for"
+    exit 1
+fi
+
+SEEDDIR="$1"
+CRAWL_PATH="$2"
+LIMIT="$3"
+
+# convert wait time to seconds for compatibility reasons
+if [ "$WAIT" != "-1" ]; then
+  WAIT=$( __to_seconds "$WAIT" )
+  echo "Time to wait (--wait) = $WAIT sec."
+fi
+
+#############################################
+# MODIFY THE PARAMETERS BELOW TO YOUR NEEDS #
+#############################################
+
+# set the number of slave nodes
+numSlaves=1
+
+# and the total number of available tasks
+# sets Hadoop parameter "mapreduce.job.reduces"
+numTasks=`expr $numSlaves \* 2`
+
+# number of urls to fetch in one iteration
+# 250K per task?
+sizeFetchlist=`expr $numSlaves \* 50000`
+
+# time limit for fetching
+timeLimitFetch=180
+
+# num threads for fetching
+numThreads=50
+
+#############################################
+
+bin="`dirname "$0"`"
+bin="`cd "$bin"; pwd`"
+
+# determine the mode based on the presence of a job file
+mode=local
+if [ -f "${bin}"/../*nutch*.job ]; then
+    mode=distributed
+fi
+
+# note that some of the options listed here could be set in the
+# corresponding hadoop site xml param file
+commonOptions="-D mapreduce.job.reduces=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapreduce.reduce.speculative=false -D mapreduce.map.speculative=false -D mapreduce.map.output.compress=true"
+
+# check that hadoop can be found on the path
+if [ $mode = "distributed" ]; then
+ if [ $(which hadoop | wc -l ) -eq 0 ]; then
+    echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
+    exit -1;
+ fi
+fi
+
+
+function __bin_nutch {
+    # run $bin/nutch, exit if exit value indicates error
+
+    echo "$bin/nutch $@" ;# echo command and arguments
+    "$bin/nutch" "$@"
+
+    RETCODE=$?
+    if [ $RETCODE -ne 0 ]
+    then
+        echo "Error running:"
+        echo "  $bin/nutch $@"
+        echo "Failed with exit value $RETCODE."
+        exit $RETCODE
+    fi
+}
+
+
+
+# initial injection
+echo "Injecting seed URLs"
+__bin_nutch inject "$CRAWL_PATH"/crawldb "$SEEDDIR"
+
+# main loop : rounds of generate - fetch - parse - update
+for ((a=1; ; a++))
+do
+  if [ -e ".STOP" ]
+  then
+   echo "STOP file found - escaping loop"
+   break
+  fi
+
+  if [ $LIMIT -ne -1 ]; then
+    if [ $a -gt $LIMIT ]; then
+      echo `date` ": Finished loop with $LIMIT iterations"
+      break
+    fi
+    echo `date` ": Iteration $a of $LIMIT"
+  else
+    echo `date` ": Iteration $a"
+  fi
+
+  echo "Generating a new segment"
+  generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter)
+  echo "$bin/nutch generate ${generate_args[@]}"
+  $bin/nutch generate "${generate_args[@]}"
+  RETCODE=$?
+  if [ $RETCODE -eq 0 ]; then
+      : # ok: no error
+  elif [ $RETCODE -eq 1 ]; then
+    echo "Generate returned 1 (no new segments created)"
+
+    if [ "$WAIT" -ne -1 ]; then
+      echo "Waiting for $WAIT sec. ..."
+      sleep $WAIT
+      continue
+    else
+      echo "Escaping loop: no more URLs to fetch now"
+      break
+    fi
+  else
+    echo "Error running:"
+    echo "  $bin/nutch generate ${generate_args[@]}"
+    echo "Failed with exit value $RETCODE."
+    exit $RETCODE
+  fi
+
+  # capture the name of the segment
+  # call hadoop in distributed mode
+  # or use ls
+
+  if [ $mode = "local" ]; then
+    SEGMENT=`ls "$CRAWL_PATH"/segments/ | sort -n | tail -n 1`
+  else
+    SEGMENT=`hadoop fs -ls "$CRAWL_PATH"/segments/ | grep segments | sed -e "s/\//\\n/g" | egrep 20[0-9]+ | sort -n | tail -n 1`
+  fi
+
+  echo "Operating on segment : $SEGMENT"
+
+  # fetching the segment
+  echo "Fetching : $SEGMENT"
+  __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch "$CRAWL_PATH"/segments/$SEGMENT -noParsing -threads $numThreads
+
+  # parsing the segment
+  echo "Parsing : $SEGMENT"
+  # enable the skipping of records during parsing so that a dodgy document
+  # does not fail the whole task
+  skipRecordsOptions="-D mapreduce.task.skip.start.attempts=2 -D mapreduce.map.skip.maxrecords=1"
+  __bin_nutch parse $commonOptions $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT
+
+  # updatedb with this segment
+  echo "CrawlDB update"
+  __bin_nutch updatedb $commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments/$SEGMENT
+
+# note that the link inversion - indexing routine can be done within the main loop
+# on a per segment basis
+  echo "Link inversion"
+  __bin_nutch invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
+
+  echo "Dedup on crawldb"
+  __bin_nutch dedup "$CRAWL_PATH"/crawldb
+
+  if $INDEXFLAG; then
+      echo "Indexing $SEGMENT to index"
+      __bin_nutch index $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
+
+      echo "Cleaning up index if possible"
+      __bin_nutch clean $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb
+  else
+      echo "Skipping indexing ..."
+ fi + + ####################################################### + # The following commands fall into WebGraph territory + # and should be uncommented based on your requirements + ####################################################### + #echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/" + #__bin_nutch webgraph $commonOptions -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH" + + #echo "Running Loops Job on WebGraph within $CRAWL_PATH" + #__bin_nutch org.apache.nutch.scoring.webgraph.Loops $commonOptions -webgraphdb "$CRAWL_PATH" + + #echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH" + #__bin_nutch linkrank $commonOptions -webgraphdb "$CRAWL_PATH" + + #echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and WebGraph within $CRAWL_PATH" + #__bin_nutch scoreupdater $commonOptions -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH" + + #echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores" + #__bin_nutch nodedumper $commonOptions -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores + +done + +exit 0 http://git-wip-us.apache.org/repos/asf/nutch/blob/020f581a/bin/nutch ---------------------------------------------------------------------- diff --git a/bin/nutch b/bin/nutch new file mode 100755 index 0000000..1649069 --- /dev/null +++ b/bin/nutch @@ -0,0 +1,324 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# The Nutch command script +# +# Environment Variables +# +# NUTCH_JAVA_HOME The java implementation to use. Overrides JAVA_HOME. +# +# NUTCH_HEAPSIZE The maximum amount of heap to use, in MB. +# Default is 1000. +# +# NUTCH_OPTS Extra Java runtime options. +# Multiple options must be separated by white space. +# +# NUTCH_LOG_DIR Log directory (default: $NUTCH_HOME/logs) +# +# NUTCH_LOGFILE Log file (default: hadoop.log) +# +# NUTCH_CONF_DIR Path(s) to configuration files (default: $NUTCH_HOME/conf). +# Multiple paths must be separated by a colon ':'. 
+#
+cygwin=false
+case "`uname`" in
+CYGWIN*) cygwin=true;;
+esac
+
+# resolve links - $0 may be a softlink
+THIS="$0"
+while [ -h "$THIS" ]; do
+  ls=`ls -ld "$THIS"`
+  link=`expr "$ls" : '.*-> \(.*\)$'`
+  if expr "$link" : '.*/.*' > /dev/null; then
+    THIS="$link"
+  else
+    THIS=`dirname "$THIS"`/"$link"
+  fi
+done
+
+# if no args specified, show usage
+if [ $# = 0 ]; then
+  echo "nutch 1.12"
+  echo "Usage: nutch COMMAND"
+  echo "where COMMAND is one of:"
+  echo "  readdb             read / dump crawl db"
+  echo "  mergedb            merge crawldb-s, with optional filtering"
+  echo "  readlinkdb         read / dump link db"
+  echo "  inject             inject new urls into the database"
+  echo "  generate           generate new segments to fetch from crawl db"
+  echo "  freegen            generate new segments to fetch from text files"
+  echo "  fetch              fetch a segment's pages"
+  echo "  parse              parse a segment's pages"
+  echo "  readseg            read / dump segment data"
+  echo "  mergesegs          merge several segments, with optional filtering and slicing"
+  echo "  updatedb           update crawl db from segments after fetching"
+  echo "  invertlinks        create a linkdb from parsed segments"
+  echo "  mergelinkdb        merge linkdb-s, with optional filtering"
+  echo "  index              run the plugin-based indexer on parsed segments and linkdb"
+  echo "  dedup              deduplicate entries in the crawldb and give them a special status"
+  echo "  dump               exports crawled data from segments into files"
+  echo "  commoncrawldump    exports crawled data from segments into common crawl data format encoded as CBOR"
+  echo "  solrindex          run the solr indexer on parsed segments and linkdb - DEPRECATED use the index command instead"
+  echo "  solrdedup          remove duplicates from solr - DEPRECATED use the dedup command instead"
+  echo "  solrclean          remove HTTP 301 and 404 documents from solr - DEPRECATED use the clean command instead"
+  echo "  clean              remove HTTP 301 and 404 documents and duplicates from indexing backends configured via plugins"
+  echo "  parsechecker       check the parser for a given url"
+  echo "  indexchecker       check the indexing filters for a given url"
+  echo "  filterchecker      check url filters for a given url"
+  echo "  normalizerchecker  check url normalizers for a given url"
+  echo "  domainstats        calculate domain statistics from crawldb"
+  echo "  protocolstats      calculate protocol status code stats from crawldb"
+  echo "  crawlcomplete      calculate crawl completion stats from crawldb"
+  echo "  webgraph           generate a web graph from existing segments"
+  echo "  linkrank           run a link analysis program on the generated web graph"
+  echo "  scoreupdater       updates the crawldb with linkrank scores"
+  echo "  nodedumper         dumps the web graph's node scores"
+  echo "  plugin             load a plugin and run one of its classes main()"
+  echo "  junit              runs the given JUnit test"
+  echo "  startserver        runs the Nutch Server on localhost:8081"
+  echo "  webapp             run a local Nutch Web Application on localhost:8080"
+  echo "  warc               exports crawled data from segments in the WARC format"
+  echo "  updatehostdb       update the host db with records from the crawl db"
+  echo "  readhostdb         read / dump host db"
+  echo " or"
+  echo "  CLASSNAME          run the class named CLASSNAME"
+  echo "Most commands print help when invoked w/o parameters."
+  exit 1
+fi
+
+# get arguments
+COMMAND=$1
+shift
+
+# some directories
+THIS_DIR="`dirname "$THIS"`"
+NUTCH_HOME="`cd "$THIS_DIR/.." ; pwd`"
+
+# some Java parameters
+if [ "$NUTCH_JAVA_HOME" != "" ]; then
+  #echo "run java in $NUTCH_JAVA_HOME"
+  JAVA_HOME="$NUTCH_JAVA_HOME"
+fi
+
+if [ "$JAVA_HOME" = "" ]; then
+  echo "Error: JAVA_HOME is not set."
+ exit 1 +fi + +local=true + +# NUTCH_JOB +if [ -f "${NUTCH_HOME}"/*nutch*.job ]; then + local=false + for f in "$NUTCH_HOME"/*nutch*.job; do + NUTCH_JOB="$f" + done + # cygwin path translation + if $cygwin; then + NUTCH_JOB="`cygpath -p -w "$NUTCH_JOB"`" + fi +fi + +JAVA="$JAVA_HOME/bin/java" +JAVA_HEAP_MAX=-Xmx1000m + +# check envvars which might override default args +if [ "$NUTCH_HEAPSIZE" != "" ]; then + #echo "run with heapsize $NUTCH_HEAPSIZE" + JAVA_HEAP_MAX="-Xmx""$NUTCH_HEAPSIZE""m" + #echo $JAVA_HEAP_MAX +fi + +# CLASSPATH initially contains $NUTCH_CONF_DIR, or defaults to $NUTCH_HOME/conf +CLASSPATH="${NUTCH_CONF_DIR:=$NUTCH_HOME/conf}" +CLASSPATH="${CLASSPATH}:$JAVA_HOME/lib/tools.jar" + +# so that filenames w/ spaces are handled correctly in loops below +IFS= + +# add libs to CLASSPATH +if $local; then + for f in "$NUTCH_HOME"/lib/*.jar; do + CLASSPATH="${CLASSPATH}:$f"; + done + # local runtime + # add plugins to classpath + if [ -d "$NUTCH_HOME/plugins" ]; then + CLASSPATH="${NUTCH_HOME}:${CLASSPATH}" + fi +fi + +# cygwin path translation +if $cygwin; then + CLASSPATH="`cygpath -p -w "$CLASSPATH"`" +fi + +# setup 'java.library.path' for native-hadoop code if necessary +# used only in local mode +JAVA_LIBRARY_PATH='' +if [ -d "${NUTCH_HOME}/lib/native" ]; then + + JAVA_PLATFORM=`"${JAVA}" -classpath "$CLASSPATH" org.apache.hadoop.util.PlatformName | sed -e 's/ /_/g'` + + if [ -d "${NUTCH_HOME}/lib/native" ]; then + if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then + JAVA_LIBRARY_PATH="${JAVA_LIBRARY_PATH}:${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}" + else + JAVA_LIBRARY_PATH="${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}" + fi + fi +fi + +if [ $cygwin = true -a "X${JAVA_LIBRARY_PATH}" != "X" ]; then + JAVA_LIBRARY_PATH="`cygpath -p -w "$JAVA_LIBRARY_PATH"`" +fi + +# restore ordinary behaviour +unset IFS + +# default log directory & file +if [ "$NUTCH_LOG_DIR" = "" ]; then + NUTCH_LOG_DIR="$NUTCH_HOME/logs" +fi +if [ "$NUTCH_LOGFILE" = "" ]; then + NUTCH_LOGFILE='hadoop.log' +fi + +#Fix log path under cygwin +if $cygwin; then + NUTCH_LOG_DIR="`cygpath -p -w "$NUTCH_LOG_DIR"`" +fi + +NUTCH_OPTS=($NUTCH_OPTS -Dhadoop.log.dir="$NUTCH_LOG_DIR") +NUTCH_OPTS=("${NUTCH_OPTS[@]}" -Dhadoop.log.file="$NUTCH_LOGFILE") + +if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then + NUTCH_OPTS=("${NUTCH_OPTS[@]}" -Djava.library.path="$JAVA_LIBRARY_PATH") +fi + +# figure out which class to run +if [ "$COMMAND" = "crawl" ] ; then + echo "Command $COMMAND is deprecated, please use bin/crawl instead" + exit -1 +elif [ "$COMMAND" = "inject" ] ; then + CLASS=org.apache.nutch.crawl.Injector +elif [ "$COMMAND" = "generate" ] ; then + CLASS=org.apache.nutch.crawl.Generator +elif [ "$COMMAND" = "freegen" ] ; then + CLASS=org.apache.nutch.tools.FreeGenerator +elif [ "$COMMAND" = "fetch" ] ; then + CLASS=org.apache.nutch.fetcher.Fetcher +elif [ "$COMMAND" = "parse" ] ; then + CLASS=org.apache.nutch.parse.ParseSegment +elif [ "$COMMAND" = "readdb" ] ; then + CLASS=org.apache.nutch.crawl.CrawlDbReader +elif [ "$COMMAND" = "mergedb" ] ; then + CLASS=org.apache.nutch.crawl.CrawlDbMerger +elif [ "$COMMAND" = "readlinkdb" ] ; then + CLASS=org.apache.nutch.crawl.LinkDbReader +elif [ "$COMMAND" = "readseg" ] ; then + CLASS=org.apache.nutch.segment.SegmentReader +elif [ "$COMMAND" = "mergesegs" ] ; then + CLASS=org.apache.nutch.segment.SegmentMerger +elif [ "$COMMAND" = "updatedb" ] ; then + CLASS=org.apache.nutch.crawl.CrawlDb +elif [ "$COMMAND" = "invertlinks" ] ; then + CLASS=org.apache.nutch.crawl.LinkDb +elif [ "$COMMAND" = 
"mergelinkdb" ] ; then + CLASS=org.apache.nutch.crawl.LinkDbMerger +elif [ "$COMMAND" = "dump" ] ; then + CLASS=org.apache.nutch.tools.FileDumper +elif [ "$COMMAND" = "commoncrawldump" ] ; then + CLASS=org.apache.nutch.tools.CommonCrawlDataDumper +elif [ "$COMMAND" = "solrindex" ] ; then + CLASS="org.apache.nutch.indexer.IndexingJob -D solr.server.url=$1" + shift +elif [ "$COMMAND" = "index" ] ; then + CLASS=org.apache.nutch.indexer.IndexingJob +elif [ "$COMMAND" = "solrdedup" ] ; then + echo "Command $COMMAND is deprecated, please use dedup instead" + exit -1 +elif [ "$COMMAND" = "dedup" ] ; then + CLASS=org.apache.nutch.crawl.DeduplicationJob +elif [ "$COMMAND" = "solrclean" ] ; then + CLASS="org.apache.nutch.indexer.CleaningJob -D solr.server.url=$2 $1" + shift; shift +elif [ "$COMMAND" = "clean" ] ; then + CLASS=org.apache.nutch.indexer.CleaningJob +elif [ "$COMMAND" = "parsechecker" ] ; then + CLASS=org.apache.nutch.parse.ParserChecker +elif [ "$COMMAND" = "indexchecker" ] ; then + CLASS=org.apache.nutch.indexer.IndexingFiltersChecker +elif [ "$COMMAND" = "filterchecker" ] ; then + CLASS=org.apache.nutch.net.URLFilterChecker +elif [ "$COMMAND" = "normalizerchecker" ] ; then + CLASS=org.apache.nutch.net.URLNormalizerChecker +elif [ "$COMMAND" = "domainstats" ] ; then + CLASS=org.apache.nutch.util.domain.DomainStatistics +elif [ "$COMMAND" = "protocolstats" ] ; then + CLASS=org.apache.nutch.util.ProtocolStatusStatistics +elif [ "$COMMAND" = "crawlcomplete" ] ; then + CLASS=org.apache.nutch.util.CrawlCompletionStats +elif [ "$COMMAND" = "webgraph" ] ; then + CLASS=org.apache.nutch.scoring.webgraph.WebGraph +elif [ "$COMMAND" = "linkrank" ] ; then + CLASS=org.apache.nutch.scoring.webgraph.LinkRank +elif [ "$COMMAND" = "scoreupdater" ] ; then + CLASS=org.apache.nutch.scoring.webgraph.ScoreUpdater +elif [ "$COMMAND" = "nodedumper" ] ; then + CLASS=org.apache.nutch.scoring.webgraph.NodeDumper +elif [ "$COMMAND" = "plugin" ] ; then + CLASS=org.apache.nutch.plugin.PluginRepository +elif [ "$COMMAND" = "junit" ] ; then + CLASSPATH="$CLASSPATH:$NUTCH_HOME/test/classes/" + if $local; then + for f in "$NUTCH_HOME"/test/lib/*.jar; do + CLASSPATH="${CLASSPATH}:$f"; + done + fi + CLASS=org.junit.runner.JUnitCore +elif [ "$COMMAND" = "startserver" ] ; then + CLASS=org.apache.nutch.service.NutchServer +elif [ "$COMMAND" = "webapp" ] ; then + CLASS=org.apache.nutch.webui.NutchUiServer +elif [ "$COMMAND" = "warc" ] ; then + CLASS=org.apache.nutch.tools.warc.WARCExporter +elif [ "$COMMAND" = "updatehostdb" ] ; then + CLASS=org.apache.nutch.hostdb.UpdateHostDb +elif [ "$COMMAND" = "readhostdb" ] ; then + CLASS=org.apache.nutch.hostdb.ReadHostDb +else + CLASS=$COMMAND +fi + +# distributed mode +EXEC_CALL=(hadoop jar "$NUTCH_JOB") + +if $local; then + EXEC_CALL=("$JAVA" $JAVA_HEAP_MAX "${NUTCH_OPTS[@]}" -classpath "$CLASSPATH") +else + # check that hadoop can be found on the path + if [ $(which hadoop | wc -l ) -eq 0 ]; then + echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode." 
+ exit -1; + fi +fi + +# run it +exec "${EXEC_CALL[@]}" $CLASS "$@" + http://git-wip-us.apache.org/repos/asf/nutch/blob/020f581a/nutch-core/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-core/pom.xml b/nutch-core/pom.xml index e358f50..ad5c3af 100644 --- a/nutch-core/pom.xml +++ b/nutch-core/pom.xml @@ -113,7 +113,8 @@ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <slf4j.version>1.7.12</slf4j.version> <junit.version>4.12</junit.version> - <libs.dir>${project.parent.basedir}${file.separator}${libs.subdir}</libs.dir> + <dir.root>${project.parent.basedir}</dir.root> + <libs.dir>${dir.local}${file.separator}lib</libs.dir> </properties> <dependencies> @@ -468,7 +469,32 @@ </execution> </executions> </plugin> + <plugin> + <artifactId>maven-resources-plugin</artifactId> + <version>3.0.1</version> + <executions> + <execution> + <id>copy-resources</id> + <phase>package</phase> + <goals> + <goal>copy-resources</goal> + </goals> + <configuration> + <outputDirectory>${libs.dir}</outputDirectory> + <resources> + <resource> + <directory>${project.build.directory}</directory> + <include>${build.finalName}.jar</include> + </resource> + <resource> + <directory>${project.basedir}</directory> + <include>plugin.xml</include> + </resource> + </resources> + </configuration> + </execution> + </executions> + </plugin> </plugins> </build> - </project> http://git-wip-us.apache.org/repos/asf/nutch/blob/020f581a/nutch-plugins/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/pom.xml b/nutch-plugins/pom.xml index e6a6abd..fa7adb7 100644 --- a/nutch-plugins/pom.xml +++ b/nutch-plugins/pom.xml @@ -32,6 +32,7 @@ <url>http://nutch.apache.org</url> <modules> + <!--<module>indexer-solr</module>--> <module>creativecommons</module> <module>feed</module> <module>headings</module> @@ -101,7 +102,9 @@ </modules> <properties> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> - <libs.dir>..${file.separator}..${file.separator}${libs.subdir}</libs.dir> <!-- Note : one additional level is for the child modules--> + <!-- Note : an additional level is for the child modules (defined ahead in hierarchy)--> + <dir.root>..${file.separator}..${file.separator}</dir.root> + <libs.dir>${dir.local.plugins}${file.separator}${project.artifactId}</libs.dir> </properties> <dependencies> <dependency> @@ -118,5 +121,35 @@ <type>test-jar</type> </dependency> </dependencies> - + <build> + <finalName>${project.artifactId}</finalName> + <plugins> + <plugin> + <artifactId>maven-resources-plugin</artifactId> + <version>3.0.1</version> + <executions> + <execution> + <id>copy-resources</id> + <phase>package</phase> + <goals> + <goal>copy-resources</goal> + </goals> + <configuration> + <outputDirectory>${libs.dir}</outputDirectory> + <resources> + <resource> + <directory>${project.build.directory}</directory> + <include>${build.finalName}.jar</include> + </resource> + <resource> + <directory>${project.basedir}</directory> + <include>plugin.xml</include> + </resource> + </resources> + </configuration> + </execution> + </executions> + </plugin> + </plugins> + </build> </project> http://git-wip-us.apache.org/repos/asf/nutch/blob/020f581a/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 268ab2d..18e22c7 100644 --- a/pom.xml +++ b/pom.xml @@ -10,8 +10,10 @@ <packaging>pom</packaging> <properties> - 
<libs.subdir>runtime${file.separator}local${file.separator}lib</libs.subdir> - <libs.dir>${project.basedir}${file.separator}${libs.subdir}</libs.dir> + <dir.root>${project.basedir}</dir.root> + <dir.local>${dir.root}${file.separator}runtime${file.separator}local</dir.local> + <dir.local.libs>${dir.local}${file.separator}libs</dir.local.libs> + <dir.local.plugins>${dir.local}${file.separator}plugins</dir.local.plugins> <junit.version>4.12</junit.version> </properties> <modules> @@ -37,6 +39,14 @@ <directory>runtime</directory> <followSymlinks>false</followSymlinks> </fileset> + <fileset> + <directory>${dir.local.libs}</directory> + <includes> + <include>**/*.jar</include> + <include>**/*.xml</include> + </includes> + <followSymlinks>false</followSymlinks> + </fileset> </filesets> </configuration> </plugin> @@ -61,6 +71,43 @@ </execution> </executions> </plugin> + <plugin> + <artifactId>maven-resources-plugin</artifactId> + <version>3.0.1</version> + <executions> + <execution> + <id>copy-scripts</id> + <phase>package</phase> + <goals> + <goal>copy-resources</goal> + </goals> + <configuration> + <outputDirectory>${dir.local}${file.separator}bin</outputDirectory> + <resources> + <resource> + <directory>bin</directory> + <!-- This plugin doesn't preserve permissions, so the scripts aren't executable--> + </resource> + </resources> + </configuration> + </execution> + <execution> + <id>copy-conf</id> + <phase>package</phase> + <goals> + <goal>copy-resources</goal> + </goals> + <configuration> + <outputDirectory>${dir.local}${file.separator}conf</outputDirectory> + <resources> + <resource> + <directory>conf</directory> + </resource> + </resources> + </configuration> + </execution> + </executions> + </plugin> </plugins> </build> <dependencies> http://git-wip-us.apache.org/repos/asf/nutch/blob/020f581a/src/bin/crawl ---------------------------------------------------------------------- diff --git a/src/bin/crawl b/src/bin/crawl deleted file mode 100755 index 567d35e..0000000 --- a/src/bin/crawl +++ /dev/null @@ -1,281 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Usage: crawl [-i|--index] [-D "key=value"] [-w|--wait] <Seed Dir> <Crawl Dir> <Num Rounds> -# -i|--index Indexes crawl results into a configured indexer -# -w|--wait NUMBER[SUFFIX] Time to wait before generating a new segment when no URLs -# are scheduled for fetching. Suffix can be: s for second, -# m for minute, h for hour and d for day. If no suffix is -# specified second is used by default. 
-# -D A Java property to pass to Nutch calls -# Seed Dir Directory in which to look for a seeds file -# Crawl Dir Directory where the crawl/link/segments dirs are saved -# Num Rounds The number of rounds to run this crawl for -# -# -# UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND -# INDEXING FOR EACH SEGMENT - -INDEXFLAG=false -JAVA_PROPERTIES="" -WAIT=-1 # don't wait if there are no URLs to fetch - -function __to_seconds() { - NUMBER=$(echo $1 | tr -dc '0-9') - MODIFIER=$(echo $1 | tr -dc '[^s|h|m|d]]') - - case $MODIFIER in - m|M) - SECONDS=`expr $NUMBER \* 60` - ;; - h|H) - SECONDS=`expr $NUMBER \* 120` - ;; - d|D) - SECONDS=`expr $NUMBER \* 86400` - ;; - s|S|*) - SECONDS=$NUMBER - ;; - esac - - echo $SECONDS -} - -while [[ $# > 0 ]] -do - case $1 in - -i|--index) - INDEXFLAG=true - shift - ;; - -D) - JAVA_PROPERTIES="-D${2} ${JAVA_PROPERTIES}" - shift 2 - ;; - -w|--wait) - WAIT="${2}" - shift 2 - ;; - *) - break - ;; - esac -done - -if [[ $# != 3 ]]; then - echo "Usage: crawl [-i|--index] [-D \"key=value\"] [-w|--wait] <Seed Dir> <Crawl Dir> <Num Rounds>" - echo -e "\t-i|--index\tIndexes crawl results into a configured indexer" - echo -e "\t-D\t\tA Java property to pass to Nutch calls" - echo -e "\t-w|--wait\tNUMBER[SUFFIX] Time to wait before generating a new segment when no URLs" - echo -e "\t\t\tare scheduled for fetching. Suffix can be: s for second," - echo -e "\t\t\tm for minute, h for hour and d for day. If no suffix is" - echo -e "\t\t\tspecified second is used by default." - echo -e "\tSeed Dir\tDirectory in which to look for a seeds file" - echo -e "\tCrawl Dir\tDirectory where the crawl/link/segments dirs are saved" - echo -e "\tNum Rounds\tThe number of rounds to run this crawl for" - exit 1 -fi - -SEEDDIR="$1" -CRAWL_PATH="$2" -LIMIT="$3" - -# convert wait time to seconds for compatibility reasons -if [ "$WAIT" != "-1" ]; then - WAIT=$( __to_seconds "$WAIT" ) - echo "Time to wait (--wait) = $WAIT sec." -fi - -############################################# -# MODIFY THE PARAMETERS BELOW TO YOUR NEEDS # -############################################# - -# set the number of slaves nodes -numSlaves=1 - -# and the total number of available tasks -# sets Hadoop parameter "mapreduce.job.reduces" -numTasks=`expr $numSlaves \* 2` - -# number of urls to fetch in one iteration -# 250K per task? -sizeFetchlist=`expr $numSlaves \* 50000` - -# time limit for feching -timeLimitFetch=180 - -# num threads for fetching -numThreads=50 - -############################################# - -bin="`dirname "$0"`" -bin="`cd "$bin"; pwd`" - -# determines whether mode based on presence of job file -mode=local -if [ -f "${bin}"/../*nutch*.job ]; then - mode=distributed -fi - -# note that some of the options listed here could be set in the -# corresponding hadoop site xml param file -commonOptions="-D mapreduce.job.reduces=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapreduce.reduce.speculative=false -D mapreduce.map.speculative=false -D mapreduce.map.output.compress=true" - - # check that hadoop can be found on the path -if [ $mode = "distributed" ]; then - if [ $(which hadoop | wc -l ) -eq 0 ]; then - echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode." - exit -1; - fi -fi - - -function __bin_nutch { - # run $bin/nutch, exit if exit value indicates error - - echo "$bin/nutch $@" ;# echo command and arguments - "$bin/nutch" "$@" - - RETCODE=$? 
- if [ $RETCODE -ne 0 ] - then - echo "Error running:" - echo " $bin/nutch $@" - echo "Failed with exit value $RETCODE." - exit $RETCODE - fi -} - - - -# initial injection -echo "Injecting seed URLs" -__bin_nutch inject "$CRAWL_PATH"/crawldb "$SEEDDIR" - -# main loop : rounds of generate - fetch - parse - update -for ((a=1; ; a++)) -do - if [ -e ".STOP" ] - then - echo "STOP file found - escaping loop" - break - fi - - if [ $LIMIT -ne -1 ]; then - if [ $a -gt $LIMIT ]; then - echo `date` ": Finished loop with $LIMIT iterations" - break - fi - echo `date` ": Iteration $a of $LIMIT" - else - echo `date` ": Iteration $a" - fi - - echo "Generating a new segment" - generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter) - echo "$bin/nutch generate ${generate_args[@]}" - $bin/nutch generate "${generate_args[@]}" - RETCODE=$? - if [ $RETCODE -eq 0 ]; then - : # ok: no error - elif [ $RETCODE -eq 1 ]; then - echo "Generate returned 1 (no new segments created)" - - if [ "$WAIT" -ne -1 ]; then - echo "Waiting for $WAIT sec. ..." - sleep $WAIT - continue - else - echo "Escaping loop: no more URLs to fetch now" - break - fi - else - echo "Error running:" - echo " $bin/nutch generate ${generate_args[@]}" - echo "Failed with exit value $RETCODE." - exit $RETCODE - fi - - # capture the name of the segment - # call hadoop in distributed mode - # or use ls - - if [ $mode = "local" ]; then - SEGMENT=`ls "$CRAWL_PATH"/segments/ | sort -n | tail -n 1` - else - SEGMENT=`hadoop fs -ls "$CRAWL_PATH"/segments/ | grep segments | sed -e "s/\//\\n/g" | egrep 20[0-9]+ | sort -n | tail -n 1` - fi - - echo "Operating on segment : $SEGMENT" - - # fetching the segment - echo "Fetching : $SEGMENT" - __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch "$CRAWL_PATH"/segments/$SEGMENT -noParsing -threads $numThreads - - # parsing the segment - echo "Parsing : $SEGMENT" - # enable the skipping of records for the parsing so that a dodgy document - # so that it does not fail the full task - skipRecordsOptions="-D mapreduce.task.skip.start.attempts=2 -D mapreduce.map.skip.maxrecords=1" - __bin_nutch parse $commonOptions $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT - - # updatedb with this segment - echo "CrawlDB update" - __bin_nutch updatedb $commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments/$SEGMENT - -# note that the link inversion - indexing routine can be done within the main loop -# on a per segment basis - echo "Link inversion" - __bin_nutch invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT - - echo "Dedup on crawldb" - __bin_nutch dedup "$CRAWL_PATH"/crawldb - - if $INDEXFLAG; then - echo "Indexing $SEGMENT to index" - __bin_nutch index $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT - - echo "Cleaning up index if possible" - __bin_nutch clean $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb - else - echo "Skipping indexing ..." 
- fi - - ####################################################### - # The following commands fall into WebGraph territory - # and should be uncommented based on your requirements - ####################################################### - #echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/" - #__bin_nutch webgraph $commonOptions -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH" - - #echo "Running Loops Job on WebGraph within $CRAWL_PATH" - #__bin_nutch org.apache.nutch.scoring.webgraph.Loops $commonOptions -webgraphdb "$CRAWL_PATH" - - #echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH" - #__bin_nutch linkrank $commonOptions -webgraphdb "$CRAWL_PATH" - - #echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and WebGraph within $CRAWL_PATH" - #__bin_nutch scoreupdater $commonOptions -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH" - - #echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores" - #__bin_nutch nodedumper $commonOptions -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores - -done - -exit 0 http://git-wip-us.apache.org/repos/asf/nutch/blob/020f581a/src/bin/nutch ---------------------------------------------------------------------- diff --git a/src/bin/nutch b/src/bin/nutch deleted file mode 100755 index 1649069..0000000 --- a/src/bin/nutch +++ /dev/null @@ -1,324 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# The Nutch command script -# -# Environment Variables -# -# NUTCH_JAVA_HOME The java implementation to use. Overrides JAVA_HOME. -# -# NUTCH_HEAPSIZE The maximum amount of heap to use, in MB. -# Default is 1000. -# -# NUTCH_OPTS Extra Java runtime options. -# Multiple options must be separated by white space. -# -# NUTCH_LOG_DIR Log directory (default: $NUTCH_HOME/logs) -# -# NUTCH_LOGFILE Log file (default: hadoop.log) -# -# NUTCH_CONF_DIR Path(s) to configuration files (default: $NUTCH_HOME/conf). -# Multiple paths must be separated by a colon ':'. 
-# -cygwin=false -case "`uname`" in -CYGWIN*) cygwin=true;; -esac - -# resolve links - $0 may be a softlink -THIS="$0" -while [ -h "$THIS" ]; do - ls=`ls -ld "$THIS"` - link=`expr "$ls" : '.*-> \(.*\)$'` - if expr "$link" : '.*/.*' > /dev/null; then - THIS="$link" - else - THIS=`dirname "$THIS"`/"$link" - fi -done - -# if no args specified, show usage -if [ $# = 0 ]; then - echo "nutch 1.12" - echo "Usage: nutch COMMAND" - echo "where COMMAND is one of:" - echo " readdb read / dump crawl db" - echo " mergedb merge crawldb-s, with optional filtering" - echo " readlinkdb read / dump link db" - echo " inject inject new urls into the database" - echo " generate generate new segments to fetch from crawl db" - echo " freegen generate new segments to fetch from text files" - echo " fetch fetch a segment's pages" - echo " parse parse a segment's pages" - echo " readseg read / dump segment data" - echo " mergesegs merge several segments, with optional filtering and slicing" - echo " updatedb update crawl db from segments after fetching" - echo " invertlinks create a linkdb from parsed segments" - echo " mergelinkdb merge linkdb-s, with optional filtering" - echo " index run the plugin-based indexer on parsed segments and linkdb" - echo " dedup deduplicate entries in the crawldb and give them a special status" - echo " dump exports crawled data from segments into files" - echo " commoncrawldump exports crawled data from segments into common crawl data format encoded as CBOR" - echo " solrindex run the solr indexer on parsed segments and linkdb - DEPRECATED use the index command instead" - echo " solrdedup remove duplicates from solr - DEPRECATED use the dedup command instead" - echo " solrclean remove HTTP 301 and 404 documents from solr - DEPRECATED use the clean command instead" - echo " clean remove HTTP 301 and 404 documents and duplicates from indexing backends configured via plugins" - echo " parsechecker check the parser for a given url" - echo " indexchecker check the indexing filters for a given url" - echo " filterchecker check url filters for a given url" - echo " normalizerchecker check url normalizers for a given url" - echo " domainstats calculate domain statistics from crawldb" - echo " protocolstats calculate protocol status code stats from crawldb" - echo " crawlcomplete calculate crawl completion stats from crawldb" - echo " webgraph generate a web graph from existing segments" - echo " linkrank run a link analysis program on the generated web graph" - echo " scoreupdater updates the crawldb with linkrank scores" - echo " nodedumper dumps the web graph's node scores" - echo " plugin load a plugin and run one of its classes main()" - echo " junit runs the given JUnit test" - echo " startserver runs the Nutch Server on localhost:8081" - echo " webapp run a local Nutch Web Application on locahost:8080" - echo " warc exports crawled data from segments at the WARC format" - echo " updatehostdb update the host db with records from the crawl db" - echo " readhostdb read / dump host db" - echo " or" - echo " CLASSNAME run the class named CLASSNAME" - echo "Most commands print help when invoked w/o parameters." - exit 1 -fi - -# get arguments -COMMAND=$1 -shift - -# some directories -THIS_DIR="`dirname "$THIS"`" -NUTCH_HOME="`cd "$THIS_DIR/.." ; pwd`" - -# some Java parameters -if [ "$NUTCH_JAVA_HOME" != "" ]; then - #echo "run java in $NUTCH_JAVA_HOME" - JAVA_HOME="$NUTCH_JAVA_HOME" -fi - -if [ "$JAVA_HOME" = "" ]; then - echo "Error: JAVA_HOME is not set." 
- exit 1 -fi - -local=true - -# NUTCH_JOB -if [ -f "${NUTCH_HOME}"/*nutch*.job ]; then - local=false - for f in "$NUTCH_HOME"/*nutch*.job; do - NUTCH_JOB="$f" - done - # cygwin path translation - if $cygwin; then - NUTCH_JOB="`cygpath -p -w "$NUTCH_JOB"`" - fi -fi - -JAVA="$JAVA_HOME/bin/java" -JAVA_HEAP_MAX=-Xmx1000m - -# check envvars which might override default args -if [ "$NUTCH_HEAPSIZE" != "" ]; then - #echo "run with heapsize $NUTCH_HEAPSIZE" - JAVA_HEAP_MAX="-Xmx""$NUTCH_HEAPSIZE""m" - #echo $JAVA_HEAP_MAX -fi - -# CLASSPATH initially contains $NUTCH_CONF_DIR, or defaults to $NUTCH_HOME/conf -CLASSPATH="${NUTCH_CONF_DIR:=$NUTCH_HOME/conf}" -CLASSPATH="${CLASSPATH}:$JAVA_HOME/lib/tools.jar" - -# so that filenames w/ spaces are handled correctly in loops below -IFS= - -# add libs to CLASSPATH -if $local; then - for f in "$NUTCH_HOME"/lib/*.jar; do - CLASSPATH="${CLASSPATH}:$f"; - done - # local runtime - # add plugins to classpath - if [ -d "$NUTCH_HOME/plugins" ]; then - CLASSPATH="${NUTCH_HOME}:${CLASSPATH}" - fi -fi - -# cygwin path translation -if $cygwin; then - CLASSPATH="`cygpath -p -w "$CLASSPATH"`" -fi - -# setup 'java.library.path' for native-hadoop code if necessary -# used only in local mode -JAVA_LIBRARY_PATH='' -if [ -d "${NUTCH_HOME}/lib/native" ]; then - - JAVA_PLATFORM=`"${JAVA}" -classpath "$CLASSPATH" org.apache.hadoop.util.PlatformName | sed -e 's/ /_/g'` - - if [ -d "${NUTCH_HOME}/lib/native" ]; then - if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then - JAVA_LIBRARY_PATH="${JAVA_LIBRARY_PATH}:${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}" - else - JAVA_LIBRARY_PATH="${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}" - fi - fi -fi - -if [ $cygwin = true -a "X${JAVA_LIBRARY_PATH}" != "X" ]; then - JAVA_LIBRARY_PATH="`cygpath -p -w "$JAVA_LIBRARY_PATH"`" -fi - -# restore ordinary behaviour -unset IFS - -# default log directory & file -if [ "$NUTCH_LOG_DIR" = "" ]; then - NUTCH_LOG_DIR="$NUTCH_HOME/logs" -fi -if [ "$NUTCH_LOGFILE" = "" ]; then - NUTCH_LOGFILE='hadoop.log' -fi - -#Fix log path under cygwin -if $cygwin; then - NUTCH_LOG_DIR="`cygpath -p -w "$NUTCH_LOG_DIR"`" -fi - -NUTCH_OPTS=($NUTCH_OPTS -Dhadoop.log.dir="$NUTCH_LOG_DIR") -NUTCH_OPTS=("${NUTCH_OPTS[@]}" -Dhadoop.log.file="$NUTCH_LOGFILE") - -if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then - NUTCH_OPTS=("${NUTCH_OPTS[@]}" -Djava.library.path="$JAVA_LIBRARY_PATH") -fi - -# figure out which class to run -if [ "$COMMAND" = "crawl" ] ; then - echo "Command $COMMAND is deprecated, please use bin/crawl instead" - exit -1 -elif [ "$COMMAND" = "inject" ] ; then - CLASS=org.apache.nutch.crawl.Injector -elif [ "$COMMAND" = "generate" ] ; then - CLASS=org.apache.nutch.crawl.Generator -elif [ "$COMMAND" = "freegen" ] ; then - CLASS=org.apache.nutch.tools.FreeGenerator -elif [ "$COMMAND" = "fetch" ] ; then - CLASS=org.apache.nutch.fetcher.Fetcher -elif [ "$COMMAND" = "parse" ] ; then - CLASS=org.apache.nutch.parse.ParseSegment -elif [ "$COMMAND" = "readdb" ] ; then - CLASS=org.apache.nutch.crawl.CrawlDbReader -elif [ "$COMMAND" = "mergedb" ] ; then - CLASS=org.apache.nutch.crawl.CrawlDbMerger -elif [ "$COMMAND" = "readlinkdb" ] ; then - CLASS=org.apache.nutch.crawl.LinkDbReader -elif [ "$COMMAND" = "readseg" ] ; then - CLASS=org.apache.nutch.segment.SegmentReader -elif [ "$COMMAND" = "mergesegs" ] ; then - CLASS=org.apache.nutch.segment.SegmentMerger -elif [ "$COMMAND" = "updatedb" ] ; then - CLASS=org.apache.nutch.crawl.CrawlDb -elif [ "$COMMAND" = "invertlinks" ] ; then - CLASS=org.apache.nutch.crawl.LinkDb -elif [ "$COMMAND" = 
"mergelinkdb" ] ; then - CLASS=org.apache.nutch.crawl.LinkDbMerger -elif [ "$COMMAND" = "dump" ] ; then - CLASS=org.apache.nutch.tools.FileDumper -elif [ "$COMMAND" = "commoncrawldump" ] ; then - CLASS=org.apache.nutch.tools.CommonCrawlDataDumper -elif [ "$COMMAND" = "solrindex" ] ; then - CLASS="org.apache.nutch.indexer.IndexingJob -D solr.server.url=$1" - shift -elif [ "$COMMAND" = "index" ] ; then - CLASS=org.apache.nutch.indexer.IndexingJob -elif [ "$COMMAND" = "solrdedup" ] ; then - echo "Command $COMMAND is deprecated, please use dedup instead" - exit -1 -elif [ "$COMMAND" = "dedup" ] ; then - CLASS=org.apache.nutch.crawl.DeduplicationJob -elif [ "$COMMAND" = "solrclean" ] ; then - CLASS="org.apache.nutch.indexer.CleaningJob -D solr.server.url=$2 $1" - shift; shift -elif [ "$COMMAND" = "clean" ] ; then - CLASS=org.apache.nutch.indexer.CleaningJob -elif [ "$COMMAND" = "parsechecker" ] ; then - CLASS=org.apache.nutch.parse.ParserChecker -elif [ "$COMMAND" = "indexchecker" ] ; then - CLASS=org.apache.nutch.indexer.IndexingFiltersChecker -elif [ "$COMMAND" = "filterchecker" ] ; then - CLASS=org.apache.nutch.net.URLFilterChecker -elif [ "$COMMAND" = "normalizerchecker" ] ; then - CLASS=org.apache.nutch.net.URLNormalizerChecker -elif [ "$COMMAND" = "domainstats" ] ; then - CLASS=org.apache.nutch.util.domain.DomainStatistics -elif [ "$COMMAND" = "protocolstats" ] ; then - CLASS=org.apache.nutch.util.ProtocolStatusStatistics -elif [ "$COMMAND" = "crawlcomplete" ] ; then - CLASS=org.apache.nutch.util.CrawlCompletionStats -elif [ "$COMMAND" = "webgraph" ] ; then - CLASS=org.apache.nutch.scoring.webgraph.WebGraph -elif [ "$COMMAND" = "linkrank" ] ; then - CLASS=org.apache.nutch.scoring.webgraph.LinkRank -elif [ "$COMMAND" = "scoreupdater" ] ; then - CLASS=org.apache.nutch.scoring.webgraph.ScoreUpdater -elif [ "$COMMAND" = "nodedumper" ] ; then - CLASS=org.apache.nutch.scoring.webgraph.NodeDumper -elif [ "$COMMAND" = "plugin" ] ; then - CLASS=org.apache.nutch.plugin.PluginRepository -elif [ "$COMMAND" = "junit" ] ; then - CLASSPATH="$CLASSPATH:$NUTCH_HOME/test/classes/" - if $local; then - for f in "$NUTCH_HOME"/test/lib/*.jar; do - CLASSPATH="${CLASSPATH}:$f"; - done - fi - CLASS=org.junit.runner.JUnitCore -elif [ "$COMMAND" = "startserver" ] ; then - CLASS=org.apache.nutch.service.NutchServer -elif [ "$COMMAND" = "webapp" ] ; then - CLASS=org.apache.nutch.webui.NutchUiServer -elif [ "$COMMAND" = "warc" ] ; then - CLASS=org.apache.nutch.tools.warc.WARCExporter -elif [ "$COMMAND" = "updatehostdb" ] ; then - CLASS=org.apache.nutch.hostdb.UpdateHostDb -elif [ "$COMMAND" = "readhostdb" ] ; then - CLASS=org.apache.nutch.hostdb.ReadHostDb -else - CLASS=$COMMAND -fi - -# distributed mode -EXEC_CALL=(hadoop jar "$NUTCH_JOB") - -if $local; then - EXEC_CALL=("$JAVA" $JAVA_HEAP_MAX "${NUTCH_OPTS[@]}" -classpath "$CLASSPATH") -else - # check that hadoop can be found on the path - if [ $(which hadoop | wc -l ) -eq 0 ]; then - echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode." - exit -1; - fi -fi - -# run it -exec "${EXEC_CALL[@]}" $CLASS "$@" -

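The -w/--wait option of bin/crawl takes NUMBER[SUFFIX], which __to_seconds
normalizes before the main loop sleeps between empty generate rounds. A
sanity sketch of the conversions implied by its case statement (the echoed
line comes from the script itself; directories as in the example above):

    bin/crawl -w 30m -i urls testcrawl 10
    # prints: Time to wait (--wait) = 1800 sec.
    #
    # implied conversions:  45 -> 45    45s -> 45    30m -> 1800
    #                       2h -> 7200  1d  -> 86400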